Recently while building Exadata Simulator for Training my students I had issue in Connecting from Exadata Compute Node( DB Server) to Storage Cell.
Configuration:
DB server/Compute Node : 1 = exadbserver1
Storage Cell: 2 = exacell01 and exacell02
[root@exadbserver1 trace]# /grid/stage/ext/bin/kfod disk=all op=disks
--------------------------------------------------------------------------------
Disk Size Path User Group
================================================================================
1: 448 Mb o/192.168.56.151/DATA_CD_DISK01_stocell1
2: 448 Mb o/192.168.56.151/DATA_CD_DISK02_stocell1
3: 448 Mb o/192.168.56.151/DATA_CD_DISK03_stocell1
4: 448 Mb o/192.168.56.151/DATA_CD_DISK04_stocell1
5: 448 Mb o/192.168.56.151/DATA_CD_DISK05_stocell1
6: 448 Mb o/192.168.56.151/DATA_CD_DISK06_stocell1
7: 448 Mb o/192.168.56.151/DATA_CD_DISK07_stocell1
8: 448 Mb o/192.168.56.151/DATA_CD_DISK08_stocell1
9: 448 Mb o/192.168.56.151/DATA_CD_DISK09_stocell1
10: 448 Mb o/192.168.56.151/DATA_CD_DISK10_stocell1
11: 448 Mb o/192.168.56.151/DATA_CD_DISK11_stocell1
12: 448 Mb o/192.168.56.151/DATA_CD_DISK12_stocell1
[root@exacell02 ~]# cellcli
CellCLI: Release 11.2.3.2.1 - Production on Mon Jan 04 08:29:48 GMT 2016
Copyright (c) 2007, 2012, Oracle. All rights reserved.
Cell Efficiency Ratio: 1
CellCLI> list cell detail
name: stocell2
bbuTempThreshold: 60
bbuChargeThreshold: 800
bmcType: absent
cellVersion: OSS_11.2.3.2.1_LINUX.X64_130109
cpuCount: 2
diagHistoryDays: 7
fanCount: 1/1
fanStatus: normal
flashCacheMode: WriteThrough
id: ba486e52-db76-4ac8-b08a-01048f6bfb5d
interconnectCount: 2
interconnect1: eth1
iormBoost: 0.0
ipaddress1: 192.168.56.152/24
kernelVersion: 2.6.18-371.el5xen
makeModel: Fake hardware
metricHistoryDays: 7
offloadEfficiency: 1.0
powerCount: 1/1
powerStatus: normal
releaseVersion: 11.2.3.2.1
releaseTrackingBug: 14522699
status: online
temperatureReading: 0.0
temperatureStatus: normal
upTime: 0 days, 0:34
cellsrvStatus: running
msStatus: running
rsStatus: running
CellCLI> list griddisk
DATA1_CD_DISK01_stocell2 active
DATA1_CD_DISK02_stocell2 active
DATA1_CD_DISK03_stocell2 active
DATA1_CD_DISK04_stocell2 active
DATA1_CD_DISK05_stocell2 active
DATA1_CD_DISK06_stocell2 active
DATA1_CD_DISK07_stocell2 active
DATA1_CD_DISK08_stocell2 active
DATA1_CD_DISK09_stocell2 active
DATA1_CD_DISK10_stocell2 active
DATA1_CD_DISK11_stocell2 active
DATA1_CD_DISK12_stocell2 active
Configuration:
DB server/Compute Node : 1 = exadbserver1
Storage Cell: 2 = exacell01 and exacell02
[root@exadbserver1 trace]# /grid/stage/ext/bin/kfod disk=all op=disks
--------------------------------------------------------------------------------
Disk Size Path User Group
================================================================================
1: 448 Mb o/192.168.56.151/DATA_CD_DISK01_stocell1
2: 448 Mb o/192.168.56.151/DATA_CD_DISK02_stocell1
3: 448 Mb o/192.168.56.151/DATA_CD_DISK03_stocell1
4: 448 Mb o/192.168.56.151/DATA_CD_DISK04_stocell1
5: 448 Mb o/192.168.56.151/DATA_CD_DISK05_stocell1
6: 448 Mb o/192.168.56.151/DATA_CD_DISK06_stocell1
7: 448 Mb o/192.168.56.151/DATA_CD_DISK07_stocell1
8: 448 Mb o/192.168.56.151/DATA_CD_DISK08_stocell1
9: 448 Mb o/192.168.56.151/DATA_CD_DISK09_stocell1
10: 448 Mb o/192.168.56.151/DATA_CD_DISK10_stocell1
11: 448 Mb o/192.168.56.151/DATA_CD_DISK11_stocell1
12: 448 Mb o/192.168.56.151/DATA_CD_DISK12_stocell1
But no disk were visible from 2nd storage cell, running on 192.168.56.152
checked the logs , discovered that its timing out on storage cell02 from logs
[root@texadbserver1 trace]# cat ora_26784_47984932300480.trc
Trace file /u01/oracle/app/oracle/oradiag_root/diag/clients/user_root/host_482230158_80/trace/ora_26784_47984932300480.trc
connect: sosstcpopen failed. boxname = 192.168.56.152, port = 5042
OS system dependent operation:connect_error failed with status: 115
OS failure message: Operation now in progress
failure occurred at: sosstcpconne
Connect retry: sleeping for 1 seconds, connect attempt 2 out of maximum 7 attempts
connect: sosstcpopen failed. boxname = 192.168.56.152, port = 5042
OS system dependent operation:connect_error failed with status: 115
OS failure message: Operation now in progress
failure occurred at: sosstcpconne
Connect retry: sleeping for 1 seconds, connect attempt 2 out of maximum 7 attempts
Troubleshooting
Checked RDS is enabled on exacell02 ( cloned from exacell01, this shouldn't be problem but checked in case , it not restarted).
[root@exacell02 sysconfig]# lsmod |grep rds
rds_rdma 106561 0
rds_tcp 48097 0
rds 155561 144 rds_rdma,rds_tcp
rdma_cm 73429 2 rds_rdma,ib_iser
ib_core 108097 7 rds_rdma,ib_iser,rdma_cm,ib_cm,iw_cm,ib_sa,ib_mad
All looks good
[root@exacell02 ~]# cellcli
CellCLI: Release 11.2.3.2.1 - Production on Mon Jan 04 08:29:48 GMT 2016
Copyright (c) 2007, 2012, Oracle. All rights reserved.
Cell Efficiency Ratio: 1
CellCLI> list cell detail
name: stocell2
bbuTempThreshold: 60
bbuChargeThreshold: 800
bmcType: absent
cellVersion: OSS_11.2.3.2.1_LINUX.X64_130109
cpuCount: 2
diagHistoryDays: 7
fanCount: 1/1
fanStatus: normal
flashCacheMode: WriteThrough
id: ba486e52-db76-4ac8-b08a-01048f6bfb5d
interconnectCount: 2
interconnect1: eth1
iormBoost: 0.0
ipaddress1: 192.168.56.152/24
kernelVersion: 2.6.18-371.el5xen
makeModel: Fake hardware
metricHistoryDays: 7
offloadEfficiency: 1.0
powerCount: 1/1
powerStatus: normal
releaseVersion: 11.2.3.2.1
releaseTrackingBug: 14522699
status: online
temperatureReading: 0.0
temperatureStatus: normal
upTime: 0 days, 0:34
cellsrvStatus: running
msStatus: running
rsStatus: running
All looks Good
CellCLI> list griddisk
DATA1_CD_DISK01_stocell2 active
DATA1_CD_DISK02_stocell2 active
DATA1_CD_DISK03_stocell2 active
DATA1_CD_DISK04_stocell2 active
DATA1_CD_DISK05_stocell2 active
DATA1_CD_DISK06_stocell2 active
DATA1_CD_DISK07_stocell2 active
DATA1_CD_DISK08_stocell2 active
DATA1_CD_DISK09_stocell2 active
DATA1_CD_DISK10_stocell2 active
DATA1_CD_DISK11_stocell2 active
DATA1_CD_DISK12_stocell2 active
all 12 disk are also active.
Till now i didn't notice the issue. until i run strace on kfod again. and noticed gettimeofday function and realized i missed to setup the ntp and time difference in both server(cell1/dbserver1 and cell2) is almost 12 hours
[root@exacell02 ~]# date
Mon Jan 4 08:37:20 GMT 2016
[root@exacell01 sysconfig]# date
Mon Jan 4 20:23:55 GMT 2016
[root@exadbserver1 trace]# date
Mon Jan 4 20:24:09 GMT 2016
strace /grid/stage/ext/bin/kfod disk=all op=disks
gettimeofday({1451938797, 257309}, NULL) = 0
sendmsg(8, {msg_name(16)={sa_family=AF_INET, sin_port=htons(14825), sin_addr=inet_addr("192.168.56.151")}, msg_iov(3)=[{"\4\3\2\1\3033\0\0\0\0\0\0MRON\0\3\0\0\0\0\0\0<\205\360g\0\0\0\0"..., 76}, {"\1W\3249\377\177\0\0\34\0\0\0\223+\0\0XZ\3249\377\177\0\0\1\0\0\0", 28}, {"\4\3\2\1\2\0\0\0\2607(\5\0\0\0\0\263>\0\5\211\323\2247,\200\0\0", 28}], msg_controllen=0, msg_flags=0}, 0) = 132
gettimeofday({1451938797, 257422}, NULL) = 0
sendmsg(8, {msg_name(16)={sa_family=AF_INET, sin_port=htons(40542), sin_addr=inet_addr("192.168.56.151")}, msg_iov(3)=[{"\4\3\2\1\3033\0\0\0\0\0\0MRON\0\3\0\0\0\0\0\0<\205\360g\0\0\0\0"..., 76}, {"\1W\3249\377\177\0\0\34\0\0\0\223+\0\0XZ\3249\377\177\0\0\1\0\0\0", 28}, {"\4\3\2\1\2\0\0\0\312\305\6(\0\0\0\0n\237*P\212\323\2247\26\200\0\0", 28}], msg_controllen=0, msg_flags=0}, 0) = 132
times({tms_utime=7, tms_stime=6, tms_cutime=0, tms_cstime=0}) = 430767563
gettimeofday({1451938797, 257621}, NULL) = 0
gettimeofday({1451938797, 257641}, NULL) = 0
socket(PF_NETLINK, SOCK_RAW, 0) = 14
bind(14, {sa_family=AF_NETLINK, pid=0, groups=00000000}, 12) = 0
getsockname(14, {sa_family=AF_NETLINK, pid=29303, groups=00000000}, [4294967308]) = 0
sendto(14, "\24\0\0\0\26\0\1\3\355\323\212V\0\0\0\0\0\0\0\0", 20, 0, {sa_family=AF_NETLINK, pid=0, groups=00000000}, 12) = 20
recvmsg(14, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{"<\0\0\0\24\0\2\0\355\323\212Vwr\0\0\2\10\200\376\1\0\0\0\10\0\1\0\177\0\0\1"..., 4096}], msg_controllen=0, msg_flags=0}, 0) = 468
recvmsg(14, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{"\24\0\0\0\3\0\2\0\355\323\212Vwr\0\0\0\0\0\0\1\0\0\0\10\0\1\0\177\0\0\1"..., 4096}], msg_controllen=0, msg_flags=0}, 0) = 20
close(14) = 0
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 14
fcntl(14, F_SETFL, O_RDONLY|O_NONBLOCK) = 0
connect(14, {sa_family=AF_INET, sin_port=htons(5042), sin_addr=inet_addr("192.168.56.151")}, 16) = -1 EINPROGRESS (Operation now in progress)
poll([{fd=14, events=POLLOUT}], 1, 2000) = 1 ([{fd=14, revents=POLLOUT}])