Recently, while building an Exadata simulator for training my students, I had an issue connecting from the Exadata compute node (DB server) to a storage cell.
Configuration:
DB server/Compute Node : 1 = exadbserver1
Storage Cell: 2 = exacell01 and exacell02
[root@exadbserver1 trace]# /grid/stage/ext/bin/kfod disk=all op=disks
--------------------------------------------------------------------------------
Disk Size Path User Group
================================================================================
1: 448 Mb o/192.168.56.151/DATA_CD_DISK01_stocell1  
2: 448 Mb o/192.168.56.151/DATA_CD_DISK02_stocell1  
3: 448 Mb o/192.168.56.151/DATA_CD_DISK03_stocell1  
4: 448 Mb o/192.168.56.151/DATA_CD_DISK04_stocell1  
5: 448 Mb o/192.168.56.151/DATA_CD_DISK05_stocell1  
6: 448 Mb o/192.168.56.151/DATA_CD_DISK06_stocell1  
7: 448 Mb o/192.168.56.151/DATA_CD_DISK07_stocell1  
8: 448 Mb o/192.168.56.151/DATA_CD_DISK08_stocell1  
9: 448 Mb o/192.168.56.151/DATA_CD_DISK09_stocell1  
10: 448 Mb o/192.168.56.151/DATA_CD_DISK10_stocell1  
11: 448 Mb o/192.168.56.151/DATA_CD_DISK11_stocell1  
12: 448 Mb o/192.168.56.151/DATA_CD_DISK12_stocell1  
[root@exacell02 ~]# cellcli
CellCLI: Release 11.2.3.2.1 - Production on Mon Jan 04 08:29:48 GMT 2016
Copyright (c) 2007, 2012, Oracle. All rights reserved.
Cell Efficiency Ratio: 1
CellCLI> list cell detail
name: stocell2
bbuTempThreshold: 60
bbuChargeThreshold: 800
bmcType: absent
cellVersion: OSS_11.2.3.2.1_LINUX.X64_130109
cpuCount: 2
diagHistoryDays: 7
fanCount: 1/1
fanStatus: normal
flashCacheMode: WriteThrough
id: ba486e52-db76-4ac8-b08a-01048f6bfb5d
interconnectCount: 2
interconnect1: eth1
iormBoost: 0.0
ipaddress1: 192.168.56.152/24
kernelVersion: 2.6.18-371.el5xen
makeModel: Fake hardware
metricHistoryDays: 7
offloadEfficiency: 1.0
powerCount: 1/1
powerStatus: normal
releaseVersion: 11.2.3.2.1
releaseTrackingBug: 14522699
status: online
temperatureReading: 0.0
temperatureStatus: normal
upTime: 0 days, 0:34
cellsrvStatus: running
msStatus: running
rsStatus: running
CellCLI> list griddisk
DATA1_CD_DISK01_stocell2 active
DATA1_CD_DISK02_stocell2 active
DATA1_CD_DISK03_stocell2 active
DATA1_CD_DISK04_stocell2 active
DATA1_CD_DISK05_stocell2 active
DATA1_CD_DISK06_stocell2 active
DATA1_CD_DISK07_stocell2 active
DATA1_CD_DISK08_stocell2 active
DATA1_CD_DISK09_stocell2 active
DATA1_CD_DISK10_stocell2 active
DATA1_CD_DISK11_stocell2 active
DATA1_CD_DISK12_stocell2 active
Configuration:
DB server/Compute Node : 1 = exadbserver1
Storage Cell: 2 = exacell01 and exacell02
[root@exadbserver1 trace]# /grid/stage/ext/bin/kfod disk=all op=disks
--------------------------------------------------------------------------------
Disk Size Path User Group
================================================================================
1: 448 Mb o/192.168.56.151/DATA_CD_DISK01_stocell1
2: 448 Mb o/192.168.56.151/DATA_CD_DISK02_stocell1
3: 448 Mb o/192.168.56.151/DATA_CD_DISK03_stocell1
4: 448 Mb o/192.168.56.151/DATA_CD_DISK04_stocell1
5: 448 Mb o/192.168.56.151/DATA_CD_DISK05_stocell1
6: 448 Mb o/192.168.56.151/DATA_CD_DISK06_stocell1
7: 448 Mb o/192.168.56.151/DATA_CD_DISK07_stocell1
8: 448 Mb o/192.168.56.151/DATA_CD_DISK08_stocell1
9: 448 Mb o/192.168.56.151/DATA_CD_DISK09_stocell1
10: 448 Mb o/192.168.56.151/DATA_CD_DISK10_stocell1
11: 448 Mb o/192.168.56.151/DATA_CD_DISK11_stocell1
12: 448 Mb o/192.168.56.151/DATA_CD_DISK12_stocell1
But no disks were visible from the 2nd storage cell, running on 192.168.56.152.
I checked the logs and discovered that the connection to storage cell02 was timing out:
[root@texadbserver1 trace]# cat ora_26784_47984932300480.trc
Trace file /u01/oracle/app/oracle/oradiag_root/diag/clients/user_root/host_482230158_80/trace/ora_26784_47984932300480.trc
connect: sosstcpopen failed. boxname = 192.168.56.152, port = 5042
OS system dependent operation:connect_error failed with status: 115
OS failure message: Operation now in progress
failure occurred at: sosstcpconne
Connect retry: sleeping for 1 seconds, connect attempt 2 out of maximum 7 attempts
connect: sosstcpopen failed. boxname = 192.168.56.152, port = 5042
OS system dependent operation:connect_error failed with status: 115
OS failure message: Operation now in progress
failure occurred at: sosstcpconne
Connect retry: sleeping for 1 seconds, connect attempt 2 out of maximum 7 attempts
Troubleshooting
Checked that RDS is enabled on exacell02 (it was cloned from exacell01, so this shouldn't be a problem, but I checked in case it had not been restarted).
[root@exacell02 sysconfig]# lsmod |grep rds
rds_rdma              106561  0
rds_tcp                48097  0
rds                   155561  144 rds_rdma,rds_tcp
rdma_cm                73429  2 rds_rdma,ib_iser
ib_core               108097  7 rds_rdma,ib_iser,rdma_cm,ib_cm,iw_cm,ib_sa,ib_mad
All looks good
[root@exacell02 ~]# cellcli
CellCLI: Release 11.2.3.2.1 - Production on Mon Jan 04 08:29:48 GMT 2016
Copyright (c) 2007, 2012, Oracle. All rights reserved.
Cell Efficiency Ratio: 1
CellCLI> list cell detail
name: stocell2
bbuTempThreshold: 60
bbuChargeThreshold: 800
bmcType: absent
cellVersion: OSS_11.2.3.2.1_LINUX.X64_130109
cpuCount: 2
diagHistoryDays: 7
fanCount: 1/1
fanStatus: normal
flashCacheMode: WriteThrough
id: ba486e52-db76-4ac8-b08a-01048f6bfb5d
interconnectCount: 2
interconnect1: eth1
iormBoost: 0.0
ipaddress1: 192.168.56.152/24
kernelVersion: 2.6.18-371.el5xen
makeModel: Fake hardware
metricHistoryDays: 7
offloadEfficiency: 1.0
powerCount: 1/1
powerStatus: normal
releaseVersion: 11.2.3.2.1
releaseTrackingBug: 14522699
status: online
temperatureReading: 0.0
temperatureStatus: normal
upTime: 0 days, 0:34
cellsrvStatus: running
msStatus: running
rsStatus: running
All looks Good 
CellCLI> list griddisk
DATA1_CD_DISK01_stocell2 active
DATA1_CD_DISK02_stocell2 active
DATA1_CD_DISK03_stocell2 active
DATA1_CD_DISK04_stocell2 active
DATA1_CD_DISK05_stocell2 active
DATA1_CD_DISK06_stocell2 active
DATA1_CD_DISK07_stocell2 active
DATA1_CD_DISK08_stocell2 active
DATA1_CD_DISK09_stocell2 active
DATA1_CD_DISK10_stocell2 active
DATA1_CD_DISK11_stocell2 active
DATA1_CD_DISK12_stocell2 active
All 12 disks are also active.
Up to this point I hadn't spotted the issue. Then I ran strace on kfod again, noticed the gettimeofday calls, and realized I had forgotten to set up NTP: the time difference between the servers (cell1/dbserver1 vs. cell2) is almost 12 hours.
[root@exacell02 ~]# date
Mon Jan  4 08:37:20 GMT 2016
[root@exacell01 sysconfig]# date
Mon Jan  4 20:23:55 GMT 2016
[root@exadbserver1 trace]# date
Mon Jan  4 20:24:09 GMT 2016
strace /grid/stage/ext/bin/kfod disk=all op=disks
gettimeofday({1451938797, 257309}, NULL) = 0
sendmsg(8, {msg_name(16)={sa_family=AF_INET, sin_port=htons(14825), sin_addr=inet_addr("192.168.56.151")}, msg_iov(3)=[{"\4\3\2\1\3033\0\0\0\0\0\0MRON\0\3\0\0\0\0\0\0<\205\360g\0\0\0\0"..., 76}, {"\1W\3249\377\177\0\0\34\0\0\0\223+\0\0XZ\3249\377\177\0\0\1\0\0\0", 28}, {"\4\3\2\1\2\0\0\0\2607(\5\0\0\0\0\263>\0\5\211\323\2247,\200\0\0", 28}], msg_controllen=0, msg_flags=0}, 0) = 132
gettimeofday({1451938797, 257422}, NULL) = 0
sendmsg(8, {msg_name(16)={sa_family=AF_INET, sin_port=htons(40542), sin_addr=inet_addr("192.168.56.151")}, msg_iov(3)=[{"\4\3\2\1\3033\0\0\0\0\0\0MRON\0\3\0\0\0\0\0\0<\205\360g\0\0\0\0"..., 76}, {"\1W\3249\377\177\0\0\34\0\0\0\223+\0\0XZ\3249\377\177\0\0\1\0\0\0", 28}, {"\4\3\2\1\2\0\0\0\312\305\6(\0\0\0\0n\237*P\212\323\2247\26\200\0\0", 28}], msg_controllen=0, msg_flags=0}, 0) = 132
times({tms_utime=7, tms_stime=6, tms_cutime=0, tms_cstime=0}) = 430767563
gettimeofday({1451938797, 257621}, NULL) = 0
gettimeofday({1451938797, 257641}, NULL) = 0
socket(PF_NETLINK, SOCK_RAW, 0)         = 14
bind(14, {sa_family=AF_NETLINK, pid=0, groups=00000000}, 12) = 0
getsockname(14, {sa_family=AF_NETLINK, pid=29303, groups=00000000}, [4294967308]) = 0
sendto(14, "\24\0\0\0\26\0\1\3\355\323\212V\0\0\0\0\0\0\0\0", 20, 0, {sa_family=AF_NETLINK, pid=0, groups=00000000}, 12) = 20
recvmsg(14, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{"<\0\0\0\24\0\2\0\355\323\212Vwr\0\0\2\10\200\376\1\0\0\0\10\0\1\0\177\0\0\1"..., 4096}], msg_controllen=0, msg_flags=0}, 0) = 468
recvmsg(14, {msg_name(12)={sa_family=AF_NETLINK, pid=0, groups=00000000}, msg_iov(1)=[{"\24\0\0\0\3\0\2\0\355\323\212Vwr\0\0\0\0\0\0\1\0\0\0\10\0\1\0\177\0\0\1"..., 4096}], msg_controllen=0, msg_flags=0}, 0) = 20
close(14)                               = 0
socket(PF_INET, SOCK_STREAM, IPPROTO_IP) = 14
fcntl(14, F_SETFL, O_RDONLY|O_NONBLOCK) = 0
connect(14, {sa_family=AF_INET, sin_port=htons(5042), sin_addr=inet_addr("192.168.56.151")}, 16) = -1 EINPROGRESS (Operation now in progress)
poll([{fd=14, events=POLLOUT}], 1, 2000) = 1 ([{fd=14, revents=POLLOUT}])
 
This comment has been removed by the author.
ReplyDeleteThis comment has been removed by the author.
ReplyDeleteI would be very thankful if 123essay.org you continue with quality what you are serving right now with your blog...I really enjoyed it...and i really appreciate to you for this....its always pleasure to read so....Thanks for sharing!!!
ReplyDelete