#!/bin/csh -fv
#######################################################################
#
# OnDemand Monitoring Script
#
#######################################################################
# /opt/bin/odmonitor.sh
# written by Kevin P. Inscoe (kevin@inscoe.org)
# http://www.inscoe.org/ondemand
# July 29, 1999
#
# Runs a fixed series of health checks (mounted filesystems, arssockd,
# ADSM optical drive, LightFax print queue and daemons, PSF, ESCON link,
# NFS subsystems, Rimage PC, exported filesystems, ars_load daemon),
# reports each failure on stdout and through $call_ito, then sleeps
# $sleeptime seconds and repeats until the hour reaches $lasthour.
#
# This is a csh script: the -f flag skips .cshrc and -v echoes every
# command line as it is read.

# Global Variables
set host = `/bin/hostname`
set logfile = "/var/log/ars_system.log"
set ADMIN = "admin"
# NOTE(review): the ADSM credentials are still hard-coded here; consider
# moving them to a protected file like $PASSWD_FILE.
set adsm_id = "archive"
set adsm_pwd = "unixrules"
set sleeptime = 1800 # seconds between passes (30 minutes)
# last hour of the day (24 hour) to run
@ lasthour = 23
set tmp = "/tmp/odmonitor.tmp"
set pidfile = "/etc/odmonitor.pid"
set fstab = "/opt/monitor/fstab"
set call_ito = "/opt/bin/ito_msg_gen"
set PASSWD_FILE = "/usr/lpp/ars/config/admin.passwd"
set camsprod = "155.90.123.30"
@ q_threshold = 20 # threshold for high number of fax queues
set checkproc = "/opt/bin/checkproc.sh"
setenv no_proxy "localhost, $camsprod"
set rimage_ip = "198.206.145.200"
set exports = "/arspdd"
set check_ars_load_daemon = "1" # possible answers are "0" = no and "1" = yes
set restart_ars_load = "/opt/bin/start_arsload.sh"

# Announce ourselves
/bin/echo "+++++++++++++++++++++++++++++++++++++++++++"
/bin/echo $0 "Starting at `/usr/bin/date`"

# Stuff the pidfile
# NOTE(review): this records the numerically lowest PPID that ps reports,
# which is presumably meant to be this script's own pid - confirm, and
# consider writing $$ instead.
/bin/ps -o "%P" | /bin/grep -v PPID | /bin/sort -n | /bin/head -1 | /bin/cut -f2 -d' ' > $pidfile

top:

# cleanup from last time
/bin/rm -f $tmp

#
# monitoring routines go here
#

mounted:
# check mounted filesystems

### does fstab exist?
set msg = "cannot read fstab file $fstab"
if ( ! -r $fstab ) then
    /bin/echo $0 ": error:" $msg
    /bin/echo $msg | $call_ito critical odmonitor AIX OS
    goto arssockd
endif

### process fstab
foreach m ( `/bin/cat $fstab` )
    set msg = "$m is not mounted"
    /usr/sbin/mount | /bin/grep $m >& /dev/null
    # Save grep's exit status at once: any later command (even a "set")
    # overwrites $status.
    set rc = $status
    if ( $rc != 0 ) then
        /bin/echo $0 ": error:" $msg
        /bin/echo $msg | $call_ito critical odmonitor AIX OS
    endif
end

arssockd:
# check that arssockd is working by running a query against the System Log
set msg = "cannot read passwd.admin file $PASSWD_FILE"
if ( ! -r $PASSWD_FILE ) then
    /bin/echo $0 ": error:" $msg
    /bin/echo $msg | $call_ito critical odmonitor AIX OS
    goto cams
endif

# The admin password is the first line of $PASSWD_FILE. It is deliberately
# not echoed, so it does not end up in the monitor's output.
# NOTE(review): assumes that first line is exactly the password arsquery
# expects for $ADMIN - confirm.
set PASSWD = `/bin/head -1 $PASSWD_FILE`

/bin/rm -f $tmp
/usr/lpp/ars/bin/arsquery -h $host -u $ADMIN -p "$PASSWD" -v -o $tmp -q SystemLog -f "System Log"

# arsquery is considered to have failed if it left no output file behind
set msg = "cannot connect to arssockd"
if ( ! -e $tmp ) then
    /bin/echo $0 ": error:" $msg
    /bin/echo $msg | $call_ito critical odmonitor AIX OS
endif
/bin/rm -f $tmp

cams:
# Test cams server
#/usr/local/bin/lynx $camsprod

adsm:
# cleanup from last time
/bin/rm -f $tmp

# Test adsm connectivity to optical
set msg = "cannot connect to optical library or drive rop0 appears to be down"
/usr/bin/dsmadmc -id=$adsm_id -pa=$adsm_pwd query drive > $tmp
# Field 5 of the rop0 line must read "Yes". The value is captured and
# compared quoted so that empty or multi-word output still forms a valid
# expression (and raises the alert).
set drive_state = `/bin/grep rop0 $tmp | /bin/awk '{ print $5 }'`
if ( "$drive_state" != "Yes" ) then
    /bin/echo $0 ": error:" $msg
    /bin/echo $msg | $call_ito critical odmonitor AIX OS
endif
/bin/rm -f $tmp

fax:
# check print queue
set msg = "LightFax queue is currently down"
# third field of the third line of the lpstat output
set q_state = `/bin/lpstat -vLightFax | /bin/head -3 | /bin/tail -1 | /bin/awk '{ print $3 }'`
if ( "$q_state" == "DOWN" ) then
    /bin/echo $0 ": error:" $msg
    /bin/echo $msg | $call_ito critical odmonitor AIX OS
endif

# check print queue for unacceptable queue depth
set msg = "LightFax queue has greater then $q_threshold faxes in it"
@ num = 0
# count the lpstat lines left after dropping header, separator and queue lines
@ num = `/bin/lpstat -vLightFax | /bin/grep -v "Status" | /bin/grep -v "-" | /bin/grep -v "LightFa" | /bin/wc -l`
if ( $num > $q_threshold ) then
    /bin/echo $0 ": error:" $msg
    /bin/echo $msg | $call_ito critical odmonitor fax APPL
endif

daemon:
# check Lightning fax daemons are running
# $checkproc is expected to print 0 when the named process is absent.
foreach lfproc ( LFapi lfrs LFAIXdriver LFserver LFslave LFapifbk LFrcp LFprint )
    set msg = "$lfproc process is down"
    set running = `$checkproc $lfproc`
    if ( "$running" == 0 ) then
        /bin/echo $0 ": error:" $msg
        /bin/echo $msg | $call_ito critical odmonitor AIX OS
    endif
end

PSF:
# Check PSF availability
set msg = "PSF queue LightFax appears to be down"
set psf_state = `/usr/lpp/psf/bin/psfstat -n -P LightFax | /bin/awk '{ print $4 }'`
if ( "$psf_state" != "ok" ) then
    /bin/echo $0 ": error:" $msg
    /bin/echo $msg | $call_ito critical odmonitor AIX OS
endif

escon:
# Check ESCON connection
# The address found on en4 selects which mainframe address/device pair
# this host should be able to reach.
set en4_addr = `/usr/sbin/ifconfig en4 | /bin/tail -1 | /bin/awk '{ print $2 }'`
if ( "$en4_addr" == "155.90.2.1" ) then
    set mfip = "192.168.202.1"
    set mfdev = "ATTPRD1"
else
    set mfip = "192.168.202.2"
    set mfdev = "ATTPRD2"
endif

set msg = "ESCON interface $mfip is down - contact NSC L2 - restart device $mfdev"
# Extract the packet-loss percentage from the last line of ping's output.
set loss = `/usr/sbin/ping -q -c 10 $mfip | /bin/tail -1 | /bin/awk -F, '{ print $3 }' | /bin/awk -F% '{ print $1 }'`
# No summary at all means ping itself failed: treat that as total loss.
if ( $#loss == 0 ) set loss = 100
if ( $loss > 10 ) then
    /bin/echo $0 ": error:" $msg
    /bin/echo $msg | $call_ito critical odmonitor network OS
endif

nfs:
# Check that nfs services are running
# Field 4 of a subsystem's lssrc line must read "active".
foreach svc ( biod nfsd rpc.mountd rpc.statd rpc.lockd )
    set msg = "$svc is down"
    set svc_state = `/bin/lssrc -g nfs | /bin/grep $svc | /bin/awk '{ print $4 }'`
    if ( "$svc_state" != "active" ) then
        /bin/echo $0 ": error:" $msg
        /bin/echo $msg | $call_ito critical odmonitor AIX OS
    endif
end

rimage:
# rimage connectivity
set msg = "Cannot ping Rimage PC at $rimage_ip"
set loss = `/usr/sbin/ping -q -c 10 $rimage_ip | /bin/tail -1 | /bin/awk -F, '{ print $3 }' | /bin/awk -F% '{ print $1 }'`
# No summary at all means ping itself failed: treat that as total loss.
if ( $#loss == 0 ) set loss = 100
if ( $loss > 10 ) then
    /bin/echo $0 ": error:" $msg
    /bin/echo $msg | $call_ito critical odmonitor AIX OS
endif

export:
# check exported filesystems
foreach e ( $exports )
    set msg = "$e is not exported"
    @ num = 0
    @ num = `/usr/sbin/exportfs | /bin/grep $e | /bin/wc -l`
    if ( $num == 0 ) then
        /bin/echo $0 ": error:" $msg
        /bin/echo $msg | $call_ito critical odmonitor AIX OS
    endif
end

arsload:
# Check ars_load daemon if check_ars_load_daemon = "1"
if ( $check_ars_load_daemon == "1" ) then
    set msg = "ars_load deamon was down, restarted ok"
    set running = `$checkproc ars_load`
    # Restart only when checkproc positively reports zero processes.
    if ( "$running" == 0 ) then
        $restart_ars_load
        # give the daemon time to come up before re-checking
        /bin/sleep 30
        set running = `$checkproc ars_load`
        if ( "$running" > 0 ) then
            /bin/echo $0 ": error:" $msg
            /bin/echo $msg | $call_ito warning odmonitor AIX OS
        else
            set msg = "ars_load deamon is down - cannot restart - contact oncall"
            /bin/echo $0 ": error:" $msg
            /bin/echo $msg | $call_ito critical odmonitor AIX OS
        endif
    endif
endif

loop_check:
# check to see if we are still in our window...
# awk's %d strips the leading zero so "08"/"09" are not parsed as octal.
@ hour = `/usr/bin/date '+%H' | /bin/awk '{ printf("%d\n", $1) }'`
echo "hour=" $hour
echo "lasthour=" $lasthour
# ">=" rather than "==" so a pass that lands after $lasthour still exits.
if ( $hour >= $lasthour ) goto bail

# go to sleep
/bin/echo "sleeping..."
/bin/sleep $sleeptime
goto top

bail:
#
# our tour of duty is over..time to bail
#
/bin/echo $0 "hour=" $hour "lastrun completed...exiting at `/usr/bin/date`"
exit 0