#!/bin/sh
#
# @(#) cpg, 1.3.6 quick check to intuit performance problems
#
#   24 Feb 1993   Initial release
#   03 Mar 1993   Fix some bugs
#   11 Mar 1993   Give advice more carefully about nfsd processes
#   15 Mar 1993   Add default route check
#   25 Mar 1993   Various cleanups
#   26 Mar 1993   More echo and other minor fixes
#   27 Oct 1993   Fix nfsd check for Solaris 2.x
#   02 Nov 1993   Threads/Processes confusion
#   24 Jan 1994   grep'ing the system file for maxusers oddity on 2.3
#   07 Feb 1994   egrep -v pcnfs in network check
#
# Richard Pettit, SMCC Senior Technical Consultant
#
# Richard.Pettit@Sun.COM
#
# This script implements the flowchart in the Configuration and Capacity
# Planning for Sun Servers document.  It does a pretty good job of
# intuiting performance problems.
#
# Use: cpg [-c]
#
# Flags:
#
#   -c   Continuous mode (repeat the whole set of checks until interrupted)
#
# Exit status: 0 after a completed run, 1 on any start-up error.
#
# The script is deliberately kept to classic Bourne shell syntax
# (backticks, expr, [ ]) because it targets SunOS 4.1 and Solaris 2.x.

prog=`basename $0`

# Test for the binary first so that a missing /bin/uname is never executed.
if [ ! -f /bin/uname ] || [ "`/bin/uname -s`" != "SunOS" ]; then
    echo "$prog": This script can only run on SunOS >&2
    exit 1
fi

tty -s || {
    echo "$prog": Not a tty >&2
    exit 1
}

if [ -f core ]; then
    echo "$prog": Core file must be removed from this directory before running. >&2
    exit 1
fi

# page out and swap out threshold values
po_thresh=8
so_thresh=8

# udp overflow threshold
uo_thresh=100

# Scratch files live in a private directory.  mkdir fails if the path
# already exists, so a pre-planted file or symlink under /tmp cannot be
# followed (mktemp is not available on the target releases).
tmp_dir=/tmp/cpg.$$
(umask 077 && mkdir $tmp_dir) || {
    echo "$prog": Cannot create $tmp_dir >&2
    exit 1
}
# Remove the scratch directory on every exit path; the signal traps turn
# an interrupt (the normal way out of -c mode) into a regular exit.
trap "/bin/rm -rf $tmp_dir" 0
trap 'exit 1' 1 2 15

# tmp file
awk_file=$tmp_dir/p.awk

# is -c specified ?
once=1

# only spew about the nfsd stuff once
disclaimer=0

# whooo are you? ooh ooh ooh ooh
release=`/bin/uname -r`
nodename=`/bin/uname -n`

while getopts c c 2> /dev/null ; do
    case $c in
    c )
        once=0
        ;;
    * )
        echo Use: "$prog" '[-c]' >&2
        exit 1
        ;;
    esac
done
# Drop the parsed options in one go; shifting inside the loop would
# desynchronise getopts' OPTIND from the argument list.
shift `expr $OPTIND - 1`

# Set up functions and variables for each OS.
#   ECHO          echo that understands \n, \t and \c escapes
#   psax / psaxc  full and command-name-only process listings
#   awk           the awk implementation that works on this release
#   ps_pid_col    column of the psax output that holds the PID
#   sol2          1 on Solaris 2.x (SunOS 5.x), 0 on SunOS 4.1
case $release in
4.1* )
    PATH=/bin:/usr/bin:/usr/ucb:/usr/etc
    if [ -f /usr/5bin/echo ]; then
        ECHO() { /usr/5bin/echo "$@"; }
    else
        # No System V echo: strip the backslash escapes instead of
        # printing them literally.
        ECHO() { /bin/echo "$@" | sed 's/\\.//g'; }
    fi
    psax() { /bin/ps ax; }
    psaxc() { /bin/ps axc; }
    awk() { /bin/awk "$@"; }
    ps_pid_col=1
    sol2=0
    ;;
5.* )
    PATH=/bin:/usr/ccs/bin
    ECHO() { echo "$@"; }
    psax() { /usr/bin/ps -eaf; }
    psaxc() { /usr/bin/ps -ea; }
    awk() { /usr/bin/nawk "$@"; }    # awk dumps core on 5.1
    ps_pid_col=2                     # ps -f prints UID first, PID second
    sol2=1
    ;;
* )
    echo "$prog": Unknown release: $release >&2
    exit 1
    ;;
esac

ECHO \\n
what $0 | tail -1
ECHO \\n

# eat_4_lines: read and discard the next four lines of standard input.
# no args
eat_4_lines() {
    (line ; line ; line ; line) > /dev/null
}

# get_cpu_times: sample "vmstat 1 10", skip the first four lines of output
# and average the last six columns of the remaining eight lines.
# Sets: avg_intfaults avg_sysfaults avg_cswitch avg_usr avg_sys avg_idle
# no args
get_cpu_times() {
    ECHO 'Getting cpu times...\c'
    f=$tmp_dir/cpu
    vmstat 1 10 | (
        eat_4_lines
        t_intfaults=0
        t_sysfaults=0
        t_cswitch=0
        t_usr=0
        t_sys=0
        t_idle=0
        for i in 1 2 3 4 5 6 7 8 ; do           # for each of the rem. lines
            set -- `line`                       # use shell to split the line
            n=`expr $# - 6`                     # keep only the last six
            shift $n                            # fields of the line
            t_intfaults=`expr $1 + $t_intfaults`    # accumulate interrupt faults
            shift
            t_sysfaults=`expr $1 + $t_sysfaults`    # accumulate syscall faults
            shift
            t_cswitch=`expr $1 + $t_cswitch`        # accumulate context switches
            shift
            t_usr=`expr $1 + $t_usr`                # accumulate user
            shift
            t_sys=`expr $1 + $t_sys`                # accumulate system
            shift
            t_idle=`expr $1 + $t_idle`              # accumulate idle
        done
        avg_intfaults=`expr $t_intfaults / 8`
        avg_sysfaults=`expr $t_sysfaults / 8`
        avg_cswitch=`expr $t_cswitch / 8`
        avg_usr=`expr $t_usr / 8`
        avg_sys=`expr $t_sys / 8`
        avg_idle=`expr $t_idle / 8`
        ECHO $avg_intfaults $avg_sysfaults $avg_cswitch $avg_usr $avg_sys $avg_idle
    ) > $f
    # The averages are computed in a subshell, so they come back via a file.
    read avg_intfaults avg_sysfaults avg_cswitch avg_usr avg_sys avg_idle < $f
    /bin/rm -f $f
    ECHO ok
}

# check_paging_swapping: sample "vmstat -S 1 10", skip the first four lines
# and sum fields 7 (swap out) and 9 (page out) of the remaining eight.
# Prints advice when either sum exceeds so_thresh / po_thresh.
# no args
check_paging_swapping() {
    ECHO 'Checking paging/swapping...\c'
    vmstat -S 1 10 | (
        eat_4_lines
        po=0
        so=0
        for i in 1 2 3 4 5 6 7 8 ; do   # for each of the rem. lines
            set -- `line`               # use shell to extract args
            so=`expr $so + $7`          # swap out value
            po=`expr $po + $9`          # page out value
        done
        if [ $po -gt $po_thresh ] || [ $so -gt $so_thresh ]; then
            ECHO '\n\tAdd memory.'
            ECHO '\tRearrange process load.'
            ECHO '\tAnalyze process behaviour.'
            ECHO '\tUse tmpfs or mmap().'
        else
            ECHO ok
        fi
    )
}

# generate_awk_file: write the disk-balance awk program to $awk_file.
#
# NOTE(review): the copy of this program that survived in the source was
# damaged -- the input-parsing rule and the start of the END block were
# missing.  The comparison logic and every message string below are taken
# from the surviving text; the parsing of the iostat output (one disk name
# per column on line 1, a legend on line 2, then "rps wps util" triples,
# with the first sample skipped) and the loop bounds are a reconstruction.
# Confirm against a known-good copy or real "iostat -D" output.
# no args
generate_awk_file() {
    cat > $awk_file << 'EOF'
BEGIN {
    go_ahead_and_debug_it = 0;
}
{
    lines++;
    if (lines == 1) {
        ndisks = NF;
        for (i = 0; i < NF; i++)
            disk_name[i] = $(i + 1);
        next;
    }
    if (lines <= 3)
        next;
    samples++;
    for (i = 0; i < ndisks; i++) {
        rps[i] += $(i * 3 + 1);
        wps[i] += $(i * 3 + 2);
    }
}
END {
    if (samples > 0) {
        for (i = 0; i < ndisks; i++) {
            rps[i] = rps[i] / samples;
            wps[i] = wps[i] / samples;
            rwps[i] = rps[i] + wps[i];
        }
        for (i = 0; i < ndisks; i++)
            for (j = i + 1; j < ndisks; j++) {
                n = rwps[i] - rwps[j];
                if (n < 0) {
                    n = -n;
                    if (n > 30) {
                        if (rwps[i] > 0) {
                            diff = n / rwps[i];
                            if (diff > 0.20) {
                                printf(" Disk %s has %g %% more activity than Disk %s\n", disk_name[j], diff * 100.0, disk_name[i]);
                                unbalanced = 1;
                            }
                        } else {
                            printf(" Disk %s has %d more r-w/second than Disk %s\n", disk_name[j], n, disk_name[i]);
                            unbalanced = 1;
                        }
                    }
                } else {
                    if (n > 30) {
                        if (rwps[j] > 0) {
                            diff = n / rwps[j];
                            if (diff > 0.20) {
                                printf(" Disk %s has %g %% more activity than Disk %s\n", disk_name[i], diff * 100.0, disk_name[j]);
                                unbalanced = 1;
                            }
                        } else {
                            printf(" Disk %s has %d more r-w/second than Disk %s\n", disk_name[i], n, disk_name[j]);
                            unbalanced = 1;
                        }
                    }
                }
            }
        if (unbalanced)
            printf(" Unbalanced disk load. Try moving data or striping.\n");
        unbalanced = 0;
        for (i = 0; i < ndisks; i++)
            if ((wps[i] >= 15) && (wps[i] >= (5 * rps[i]))) {
                if (rps[i] > 0)
                    printf(" Writes/sec are %g %% the reads/sec on disk %s\n", (wps[i] / rps[i]) * 100.0, disk_name[i]);
                else
                    printf(" Disk %s has %g writes/sec and no reads\n", disk_name[i], wps[i]);
                unbalanced = 1;
            }
        if (unbalanced)
            printf(" Unbalanced read/write load. Try adding PrestoServe.\n");
    }
}
EOF
}

# check_disk_saturation: feed "iostat -l 10 -D 1 10" through the generated
# awk program, which prints advice about unbalanced disks.
# no args
check_disk_saturation() {
    ECHO 'Checking disk saturation...'
    generate_awk_file
    iostat -l 10 -D 1 10 | awk -f $awk_file
}

# check_dnlc: report a low name-lookup cache hit rate and an excess of
# too-long pathnames, both taken from "vmstat -s".
# Sets: maxusers (also used by check_cpu), total_lookups, toolong
# no args
check_dnlc() {
    ECHO 'Checking DNLC hit rate...'
    if [ $sol2 -eq 0 ]; then
        # gadzooks. someone tell me how to find maxusers another way.
        set -- `(ECHO 'nproc?D' | adb /vmunix | ( line > /dev/null ; line))`
        maxusers=`expr '(' $2 - 10 ')' / 16`
    else
        maxusers=`grep '^set.*maxusers' /etc/system | sed 's/^.*= *//' | cut -f1`
        if [ -z "$maxusers" ]; then
            set -- `(ECHO 'maxusers?D' | adb /kernel/unix | (line > /dev/null;line))`
            maxusers=$2
        fi
    fi

    set -- `vmstat -s | fgrep 'total name lookups'`
    if [ $sol2 -eq 0 ]; then
        hit_rate=`ECHO $7 | tr -d %`
    else
        hit_rate=`ECHO $7 | sed 's/%)//g'`
    fi
    total_lookups=$1

    # Skip the advice entirely when vmstat gave us no hit rate to judge.
    if [ -n "$hit_rate" ] && [ $hit_rate -lt 80 ]; then
        if [ $hit_rate -lt 0 ]; then
            ECHO '\tOverflow on DNLC. Re-run shortly after next reboot.'
        else
            ECHO "\tDNLC hit rate is only $hit_rate %. Should be at least 80 %."
            if [ $maxusers -lt 64 ]; then
                more=`expr $maxusers + 8`
                if [ $more -gt 64 ]; then
                    more=64
                fi
                ECHO "\tTry increasing MAXUSERS from $maxusers to $more"
            else
                ECHO '\tTry increasing ncsize in param.c'
            fi
        fi
    fi

    set -- `vmstat -s | fgrep toolong`
    if [ $sol2 -eq 0 ]; then
        toolong=$2
    else
        toolong=$1
    fi
    # A zero or missing lookup total would otherwise be a division by zero.
    ECHO $toolong $total_lookups | awk '{
        if ($2 == 0)
            next;
        n = (($1 / $2) * 100);
        if (n > 10.0) {
            printf(" Too-long pathnames are %5.2f %% of total lookups.\n", n);
            printf(" Should be no more than 10 %%.\n");
        }
    }'
}

# check_cpu: interpret the averages left behind by get_cpu_times.
# Reads: avg_sys avg_sysfaults avg_cswitch avg_usr avg_intfaults, and
#        maxusers as set by check_dnlc (so check_dnlc must run first).
# no args
check_cpu() {
    ECHO 'Checking CPU times...'
    if [ $avg_sys -gt 30 ]; then
        if [ $avg_sysfaults -gt 12300 ]; then   # 30% of 41000 (peak IPX)
            ECHO '\tInefficient use of system calls.'
        fi
        if [ $avg_cswitch -gt 1140 ]; then      # 30% of 3800 (peak IPX)
            ECHO '\tHigh context switch rate.'
        fi
    fi

    if [ $avg_usr -gt 70 ]; then
        # Count processes with PID >= 300, skipping the header line.
        # "n + 0" prints 0 rather than an empty string when none match.
        n_procs=`psax | awk '{
            if ( $'$ps_pid_col' == "PID" || $'$ps_pid_col' < 300 )
                next;
            n++;
        }
        END { print n + 0 }'`
        if [ $n_procs -gt $maxusers ]; then
            ECHO '\tHigh user time w/many processes.'
            ECHO '\tMigrate to MP or use cron or nice.'
        else
            ECHO '\tHigh user time w/few processes.'
            ECHO '\tDivide processes into subprocesses, profile and optimize code.'
        fi
    fi

    if [ $avg_intfaults -gt 1000 ]; then        # 30% of 3000 (peak)
        ECHO '\tHigh interrupt rate. Culprits are:'
        vmstat -i | awk '{
            if ( $1 == "interrupt" || substr($1, 1, 4) == "----" || $1 == "Total" )
                next;
            if ( $NF > 30 && $1 != "clock" )
                printf("%s %d\n", $1, $NF);
        }' | while read device rate ; do
            ECHO "\t\t$device ( $rate / second )"
            case $device in
            ie* | le* )
                ECHO '\t\t\tCheck transceiver or try NC400.'
                ;;
            mti* )
                ECHO '\t\t\tTry intelligent terminal servers.'
                ;;
            zs* )
                ECHO '\t\t\tCheck for noisy ports or try HSI.'
                ;;
            esp* )
                ECHO '\t\t\tTry SBE.'
                ;;
            * )
                ECHO '\t\t\tUnknown solution (now).'
                ;;
            esac
        done
    fi
}

# check_network: interface error/collision rates, NFS client retransmits,
# UDP socket overflows versus nfsd count, NFS server operation mix and the
# default-route suggestion.
# Reads/sets: disclaimer (the long nfsd note is printed only once).
# no args
check_network() {
    ECHO 'Checking network condition...'

    # Total packet, error and collision counts over all interfaces but lo0.
    f=$tmp_dir/net
    netstat -i | egrep -v '^Name|^lo0' | (
        t_ipkts=0
        t_ierrs=0
        t_collis=0
        t_opkts=0
        while read name mtu net add ipkts ierrs opkts oerrs collis queue ; do
            t_ipkts=`expr $t_ipkts + $ipkts`
            t_ierrs=`expr $t_ierrs + $ierrs`
            t_collis=`expr $t_collis + $collis`
            t_opkts=`expr $t_opkts + $opkts`
        done
        ECHO $t_ipkts $t_ierrs $t_collis $t_opkts
    ) > $f
    read t_ipkts t_ierrs t_collis t_opkts < $f
    /bin/rm -f $f

    ECHO $t_collis $t_opkts $t_ierrs $t_ipkts | awk '{
        if (($2 == 0) || ($4 == 0))
            next;
        coll_rate = $1 / $2;
        err_rate = $3 / $4;
        if (coll_rate > 0.05)
            printf(" High collision rate ( %g %% ). Subnet or check cabling.\n", (coll_rate * 100.0));
        if (err_rate > 0.00025)
            printf(" Error rate not zero ( %g %% ). Increase buffer space.\n", (err_rate * 100.0));
    }'

    # NFS client side.  Only fields 1, 3 and 4 of the nfsstat line are
    # passed on, so inside awk they are $1, $2 and $3.
    set -- `nfsstat -rc | tail -1`
    ECHO $1 $3 $4 | awk '{
        if ($1 == 0)
            next;
        calls = $1;
        retrans = $2;
        badxid = $3;
        if (( retrans / calls ) > 0.05 ) {
            if (( badxid / calls ) < 0.05 ) {
                printf(" High retransmission rate.\n");
                printf(" Check routers and bridges for dropped packets.\n");
                printf(" Try decreasing rsize and wsize in fstab\n");
                printf(" to improve NFS client I/O.\n");
            } else {
                printf(" Bad server response time for client.\n");
                printf(" Try increasing timeo in fstab to improve\n");
                printf(" NFS client I/O.\n");
            }
        }
    }'

    if [ $sol2 -eq 0 ]; then
        udp_overflows=`netstat -s | fgrep 'socket overflow' | awk '{ print $1 }'`
    else
        udp_overflows=`netstat -s | fgrep udpInOverflows | awk '{ print $6 }'`
    fi

    if [ "${udp_overflows:-0}" -gt $uo_thresh ]; then
        if [ $sol2 -eq 0 ]; then
            n_nfsd=`psaxc | fgrep nfsd | wc -l`
        else
            n_nfsd=`psax | fgrep nfsd | egrep -v 'fgrep|pcnfs' | awk '{ print $NF}'`
        fi
        n=`ECHO $n_nfsd | sed 's/ //g'`         # aarrgghh
        if [ -z "$n" ]; then
            n=0
        fi
        if [ $disclaimer -eq 0 ]; then
            if [ $sol2 -eq 0 ]; then
                whatarethey="processes"
            else
                whatarethey="threads"
            fi
            if [ $n -le 20 ]; then
                disclaimer=1
                nn_nfsd=`expr $n + 4`
                ECHO "\tOverrun of nfsd processes ( $udp_overflows times )"
                ECHO "\tTry increasing from $n to $nn_nfsd"
                cat << EOF

        NOTE: The number of nfsd $whatarethey on the system ( $n ) is not
        enough to satisfy the requests of nfs clients on the network.
        This is based on information currently given in the netstat -s
        command. If you decide to increase the nfsd count from $n to
        $nn_nfsd then you should reboot the machine and allow the
        statistics of the netstat command to accumulate again before
        making the decision to increase the number of nfsd processes
        again.

EOF
            else
                ECHO "\tThere are already $n nfsd $whatarethey running."
                ECHO '\tTry adding NC400 if not already installed.'
            fi
        fi
    fi

    # NFS server operation mix.  After the header lines are filtered out,
    # the first remaining line holds count/percent pairs in which field 4
    # is the getattr percentage.  An earlier revision then did "shift 11"
    # and read $1 and $3, i.e. fields 12 and 14, for readlink and read.
    # Dropping that shift made the script read fields 1 and 3 (raw counts)
    # instead, so the fields are picked by number here.  Field 4 of the
    # next line is the write percentage.
    # NOTE(review): field positions assume the SunOS 4.1 nfsstat layout --
    # confirm on Solaris 2.x.
    set -- `nfsstat -s | tail -5 | egrep -v 'wrcache|mkdir' | tr -d '%' | awk '
        NR == 1 { ga = $4; rl = $12; rd = $14 }
        NR == 2 { wr = $4 }
        END { print ga + 0, rl + 0, rd + 0, wr + 0 }'`
    getattr=${1:-0}
    readlink=${2:-0}
    nread=${3:-0}
    nwrite=${4:-0}

    if [ $getattr -gt 35 ]; then
        ECHO "\tHigh getattr count ($getattr %)."
        ECHO '\tCheck actimeo in fstab for client NFS I/O'
        ECHO '\t\tand increase for read-only clients.'
    fi
    if [ $readlink -gt 5 ]; then
        ECHO "\tHigh readlink count ($readlink %)."
        ECHO '\tCut down on number of symbolic links on NFS mounts for clients.'
    fi

    # Look for "presto" in the kernel image; grep's exit status is 0 when
    # it is found and 1 when it is not.
    if [ $sol2 -eq 0 ]; then
        kernel=/vmunix
    elif [ -f /kernel/unix ]; then
        kernel=/kernel/unix
    else
        kernel=/kernel/genunix
    fi
    strings $kernel | grep -is presto
    has_presto=$?

    if [ $nwrite -gt 5 ]; then
        ECHO "\tHigh percentage of NFS writes ($nwrite %).\c"
        if [ $has_presto -eq 1 ]; then
            ECHO " Add PrestoServe."
        else
            ECHO " PrestoServe already installed."
        fi
    fi
    if [ $nread -gt 30 ]; then
        ECHO "\tHigh percentage of NFS reads ($nread %). Add NC400."
    fi

    # Several routes that all go through a single gateway: suggest a
    # default route instead.
    netstat -rn |
    sed -e '/^$/d' -e '/^Routing [tT]able/d' \
        -e '/^.*Destination/d' -e '/^----/d' |
    awk "
    {
        if ((\$1 != \"multicast\") && (\$2 != \"$nodename\") && (\$2 != \"localhost\")) {
            routes++;
            if (gate[\$2] == 0) {
                gateways++;
            }
            gate[\$2]++;
        }
    }
    END {
        if ((routes > 1) && (gateways == 1)) {
            for (i in gate)
                g = i;
            printf(\" Create /etc/defaultrouter with IP address '%s' in it.\n\", g);
        }
    }"
}

if [ $sol2 -eq 1 ]; then
    case "$TZ" in
    */*)
        ECHO "\n\tChange TZ in /etc/TIMEZONE to something other than ${TZ}."
        ECHO "\tFor instance: PST8PDT for US/Pacific. Then reboot.\n"
        ;;
    *)
        ;;
    esac
fi

# Main loop: one pass by default, endless with -c.
# check_dnlc must precede check_cpu, which reads maxusers.
while true
do
    ECHO -----------------------------------
    date
    ECHO -----------------------------------
    get_cpu_times
    check_paging_swapping
    check_disk_saturation
    check_dnlc
    check_cpu
    check_network
    if [ $once -eq 1 ]; then
        exit 0
    fi
done