May 2010

Nagios Meta Check Part 3

In part one of this series, the basic trusses needed by the Nagios check_systemhealth script were put together. In part two, the actual checks themselves were coded. In this, the third and final part of the series, compulsory checks are added, the main loop is constructed, and the final full source listing is produced.

The Story so far:

# Signals we are interested in dealing with. Each one is routed to the
# interrupt() subroutine; Perl passes the signal name to the handler as
# its first argument.
# SIGSTOP is deliberately not listed: like SIGKILL it can be neither caught
# nor ignored, so assigning a handler to it has no effect.
$SIG{$_} = \&interrupt for qw(INT HUP ABRT QUIT TRAP);

# Globals
my $USER1 = "/usr/local/nagios/libexec"; # Be consistent wrt Nagios
my $CHECK = "HEALTH"; # the name of the check; feel free to change

# Scratch file for plugin output. The process id is part of the name so that
# checks running at the same time cannot overwrite each other's output.
my $OUTFILE = "/var/tmp/healthcheck." . $$ . ".tmp";

# Where we store cherry picked results; init these to a space in case they
# are not all collected
my @LOAD_VALUES = " ";
my @SYSTIME_VALUE = " ";
my @ROOTDISK_VALUE = " ";

# Default values for LOAD, ROOTDISK Usage
my $DEF_LOAD_WARN = "4,2,2";
my $DEF_LOAD_CRIT = "5,4,3";
my $DEF_DISK_WARN = 95;
my $DEF_DISK_CRIT = 98;
my $DEF_SNMP_COMMUNITY = "public_readonly";

my $STATUS = 0; # A status var to be returned to nagios

# Flags (declared with "my" like every other global in this file)
my $DNS  = 1; # do check that this host has a DNS entry
my $PING = 0; # don't preping by default since nagios does, switch to 1 if
              # you want to preping before bothering with the rest

# Brain dead interrupt handler: abort the check as soon as a trapped
# signal arrives.
#   $sig - name of the signal, supplied by Perl when the handler is invoked
# Never returns; dies with the signal name as the message.
sub interrupt { # usage: interrupt \'sig\'
    my ($sig) = @_;
    die $sig;
}


# Generic sub: Load a file into an array and send the array back
#   $file   - path of the file to read
#   returns - list of the file's lines, line endings included
#   dies    - when the file cannot be opened
sub load_file {
    my ($file) = shift;

    # Three-argument open with a lexical handle: the path can never be
    # misread as an open mode, and the handle is private to this sub.
    open(my $fh, '<', $file) or die "Unable to open logfile $file: $!\n";
    my @flist = <$fh>;
    close $fh;

    return(@flist);
}

# Handle results status and print a final message with values of collated data
#   $msg - free-form text printed after the status label
#   $ret - Nagios state: 0 OK, 1 WARN, 2 CRIT, 3 or higher UNKNOWN
# Never returns: prints one status line, removes $OUTFILE and exits.
sub check_exit { # usage: check_exit("message string",RETVAL)
    my ($msg, $ret) = @_;

    # Nagios only understands exit codes 0-3. Anything else is reported as
    # UNKNOWN (3) instead of being handed to exit() unchanged: a raw wait
    # status such as 256 would otherwise wrap around to 0 and read as OK.
    my @label_for = ('OK', 'WARN', 'CRIT', 'UNKNOWN');
    my $label;
    if (defined $ret && $ret =~ /\A\d+\z/) {
        $ret   = 3 if $ret > 3;
        $label = $label_for[$ret];
    } else {
        $label = 'UNKNOWN STATE';
        $ret   = 3;
    }
    print "$CHECK $label: $msg ";

    # print what we collected - note if one fails we do not collect the rest
    # All three value lists are chomped so the line is terminated exactly
    # once, whether or not the rootdisk values were collected.
    chomp(@SYSTIME_VALUE, @LOAD_VALUES, @ROOTDISK_VALUE);
    print "@SYSTIME_VALUE, System Load @LOAD_VALUES, Rootdisk @ROOTDISK_VALUE\n";
    unlink($OUTFILE); # delete the temp file for good
    exit($ret);       # exit appropriately so nagios knows what to do
}

# Check the outfile in some cases for a SNMP warn or critical
# send back the appropriate signal for nagios
#   returns - 2 if any line of $OUTFILE contains "CRITICAL",
#             1 if any line contains "WARN" (and none contains "CRITICAL"),
#             0 otherwise, including when $OUTFILE cannot be read
sub check_outfile { # usage: check_outfile
    my $worst = 0;

    # Scan the file in-process rather than forking grep twice through a shell.
    open(my $fh, '<', $OUTFILE) or return $worst;
    while (my $line = <$fh>) {
        if ($line =~ /CRITICAL/) {
            $worst = 2;
            last;               # nothing can outrank CRITICAL
        }
        $worst = 1 if $line =~ /WARN/;
    }
    close $fh;

    return $worst;
}

# ye olde usage message
# Prints the command-line synopsis and the option list, including the
# current default values, to standard output. Takes no arguments.
# The pre-ping switch is documented as --ping because that is the spelling
# the option parser acts on; pinging is off unless it is given.
sub usage {
    print <<"END_USAGE";
Usage: $0 -H <host> [-lw <warn>] [-lc <crit>] [-dw <warn>] [-dc <crit>]
          [--nodns] [--ping|--noping] [--snmp <community>]
       $0 -u
Options:
 -H <host>      Check system called <host> (required)
 -lw <warn>     Set load warning values
                Default: $DEF_LOAD_WARN
 -lc <crit>     Set load critical values
                Default: $DEF_LOAD_CRIT
 -dw <warn>     Set rootdisk warning percent
                Default: $DEF_DISK_WARN
 -dc <crit>     Set rootdisk critical percent
                Default: $DEF_DISK_CRIT
 --nodns        Do not check for DNS resolution
 --ping         Preping to make sure the host is up before checking
 --noping       Do not preping to make sure the host is up (default)
                Note: this will improve performance
 --snmp <community>
                Set SNMP community name
                Default: $DEF_SNMP_COMMUNITY
 -u             Print usage message and exit
END_USAGE
}

# Check Load
#   $host - host name or IP address to query
#   $warn - warning thresholds handed to check_snmp -w
#   $crit - critical thresholds handed to check_snmp -c
#   $comm - SNMP community handed to check_snmp -C
# Side effects: rewrites $OUTFILE, fills @LOAD_VALUES and raises $STATUS
# when this check is worse than what has been seen so far.
sub load { # usage: load($host_or_ip,warn,critical,community)
    my ($host, $warn, $crit, $comm) = @_;

    # The three load OIDs as one comma-separated -o argument. Building the
    # command as a list keeps newlines and indentation out of it; a
    # backslash-newline inside a double-quoted Perl string is a literal
    # newline, not a line continuation.
    my $oids = join ',', map { ".1.3.6.1.4.1.2021.10.1.3.$_" } 1 .. 3;
    my @cmd  = ("$USER1/check_snmp", '-H', $host, '-C', $comm, '-o', $oids,
                '-w', $warn, '-c', $crit, '-l', 'Load 1min/5min/10min');

    # List-form pipe open runs the plugin without a shell, so nothing in
    # the arguments can be interpreted as shell syntax.
    my @output;
    if (open(my $plugin, '-|', @cmd)) {
        @output = <$plugin>;
        close $plugin;
    }

    # check_outfile() reads $OUTFILE, so store the plugin output there.
    if (open(my $out, '>', $OUTFILE)) {
        print {$out} @output;
        close $out;
    }
    my $r = check_outfile();
    $r = 3 if !@output; # plugin did not run or said nothing: UNKNOWN, not OK

    # Keep whitespace-separated fields 3, 5, 6 and 7 of every output line.
    @LOAD_VALUES = map {
        my @field = split ' ', $_;
        join(' ', map { defined $_ ? $_ : '' } @field[2, 4, 5, 6]) . "\n";
    } @output;

    $STATUS = $r if $r > $STATUS;
}

# Check rootdisk
#   $host - host name or IP address to query
#   $warn - warning threshold handed to check_snmp -w
#   $crit - critical threshold handed to check_snmp -c
#   $comm - SNMP community handed to check_snmp -C
# Side effects: rewrites $OUTFILE, fills @ROOTDISK_VALUE and raises $STATUS
# when this check is worse than what has been seen so far.
sub rootdisk { # usage: rootdisk(host_or_ip,warn,crit,community)
    my ($host, $warn, $crit, $comm) = @_;

    # The disk OIDs as one comma-separated -o argument. Building the command
    # as a list keeps newlines and indentation out of it; a backslash-newline
    # inside a double-quoted Perl string is a literal newline, not a line
    # continuation.
    my $oids = join ',',
        '1.3.6.1.4.1.2021.9.1.9.1',
        '.1.3.6.1.4.1.2021.9.1.7.1',
        '.1.3.6.1.4.1.2021.9.1.8.1',
        '.1.3.6.1.4.1.2021.9.1.3.1',
        '.1.3.6.1.4.1.2021.9.1.2.1';
    my @cmd = ("$USER1/check_snmp", '-H', $host, '-C', $comm, '-o', $oids,
               '-w', $warn, '-c', $crit);

    # List-form pipe open runs the plugin without a shell, so nothing in
    # the arguments can be interpreted as shell syntax.
    my @output;
    if (open(my $plugin, '-|', @cmd)) {
        @output = <$plugin>;
        close $plugin;
    }

    # check_outfile() reads $OUTFILE, so store the plugin output there.
    if (open(my $out, '>', $OUTFILE)) {
        print {$out} @output;
        close $out;
    }
    my $r = check_outfile();
    $r = 3 if !@output; # plugin did not run or said nothing: UNKNOWN, not OK

    # Keep whitespace-separated fields 4, 5 and 6 of every output line.
    @ROOTDISK_VALUE = map {
        my @field = split ' ', $_;
        join(' ', map { defined $_ ? $_ : '' } @field[3, 4, 5]) . "\n";
    } @output;

    $STATUS = $r if $r > $STATUS;
}

Compulsory Checks

For the purposes of this script, the compulsory checks are ping and functioning DNS resolution for the system being checked. Both can be overridden at runtime (see the usage() subroutine for details).

Preping

# Pre-ping the host with the Nagios check_ping plugin and abandon the whole
# check when the host does not answer.
#   $host - host name or IP address to ping
# Returns normally when check_ping exits 0 or 1; otherwise calls
# check_exit(), which does not return.
sub preflight { # usage: preflight(host_or_ip)
    my ($host) = shift;

    my @cmd = ("$USER1/check_ping", '-H', $host, '-p', '3',
               '-w', '10,60%', '-c', '30,80%', '-4', '-t', '3');

    # Run without a shell and throw the plugin's output away; only its exit
    # code matters here. $r stays UNKNOWN (3) unless the plugin really ran.
    my $r = 3;
    if (open(my $ping, '-|', @cmd)) {
        my @discarded = <$ping>;
        close $ping; # reaps the child and sets $?
        # $? is a wait status: the exit code is in the high byte, and a
        # non-zero low 7 bits means the plugin was killed by a signal.
        $r = ($? & 127) ? 3 : $? >> 8;
    }
    if ($r >= 2) {
        check_exit("Cannot ping host - abandoning checks",1);
    }
}

Note that for this check the actual Nagios check_ping is used instead of reinventing the wheel.

DNS

# Make sure the host resolves in DNS, using nslookup.
#   $host - host name or IP address to look up
# Returns normally when nslookup exits 0; otherwise calls check_exit() with
# nslookup's exit code (3 if it could not be run), which does not return.
sub dns { # usage: dns($hostname_or_ip)
    my ($host) = shift;

    # Run without a shell and throw the output away; only the exit code
    # matters. $r stays UNKNOWN (3) unless nslookup really ran.
    my $r = 3;
    if (open(my $lookup, '-|', 'nslookup', $host)) {
        my @discarded = <$lookup>;
        close $lookup; # reaps the child and sets $?
        # $? is a wait status, not an exit code: decode it before it is
        # used as a Nagios state.
        $r = ($? & 127) ? 3 : $? >> 8;
    }
    if ($r > 0) {
        check_exit("$host did not resolve in DNS",$r);
    }
}

Simple enough. Note that some platforms may not provide nslookup; on those, the output of the host command would have to be parsed instead.

The Loop

The main() part of the program is now ready to go. First all of the variables needed are initialized:

# Working settings: every threshold starts out at its default value.
# $host has no default and stays undefined until one is supplied.
my ($load_warn, $load_crit) = ($DEF_LOAD_WARN, $DEF_LOAD_CRIT);
my ($disk_warn, $disk_crit) = ($DEF_DISK_WARN, $DEF_DISK_CRIT);
my $snmp_community          = $DEF_SNMP_COMMUNITY;
my $host;

Simple enough, now the actual options parsing:

# Options that take a value, mapped to the variable that receives it.
my %value_for = (
    '-H'     => \$host,
    '-lw'    => \$load_warn,
    '-lc'    => \$load_crit,
    '-dw'    => \$disk_warn,
    '-dc'    => \$disk_crit,
    '--snmp' => \$snmp_community,
);

# Walk the command line. Testing with defined() keeps the loop going when an
# argument is a false value such as "0". Bad input exits 3 so that Nagios
# reports UNKNOWN.
while (defined(my $opt = shift @ARGV)) {
    if ($opt eq '-u') {
        usage();
        exit (0);
    } elsif (exists $value_for{$opt}) {
        my $value = shift @ARGV;
        if (!defined $value) {
            print "Error: option $opt requires a value\n";
            usage();
            exit (3);
        }
        ${ $value_for{$opt} } = $value;
    } elsif ($opt eq '--nodns') {
        $DNS = 0;
    } elsif ($opt eq '--ping') {
        $PING = 1;
    } elsif ($opt eq '--noping') {
        $PING = 0; # spelling shown by usage(); same as the default
    } else {
        print "Error: unknown option $opt\n";
        usage();
        exit (3);
    }
}

Next comes a safety check, followed by the calls to the prechecks:

# there is no spoon...
# Without a host there is nothing to check. That is a usage error, so exit
# with 3 (UNKNOWN); exit code 1 would be read by Nagios as a WARNING.
if (!defined $host || $host eq '') {
    print "Error: no host specified\n";
    usage();
    exit (3);
}

# if we wanna ping go ahead XXX-mui do we care about stats?
if ($PING == 1) {
    preflight($host);
}
# if we wanna resolve then resolve
if ($DNS == 1) {
    dns($host);
}

And finally off and running:

# Run each check in turn with its own warning/critical thresholds.
for my $check ([\&load, $load_warn, $load_crit],
               [\&rootdisk, $disk_warn, $disk_crit]) {
    my ($run, $warn, $crit) = @{$check};
    $run->($host, $warn, $crit, $snmp_community);
}

# we're all done - report the collected status and exit
check_exit("", $STATUS);

Full Listing

# Signals we are interested in dealing with. Each one is routed to the
# interrupt() subroutine; Perl passes the signal name to the handler as
# its first argument.
# SIGSTOP is deliberately not listed: like SIGKILL it can be neither caught
# nor ignored, so assigning a handler to it has no effect.
$SIG{$_} = \&interrupt for qw(INT HUP ABRT QUIT TRAP);

# Globals
my $USER1 = "/usr/local/nagios/libexec"; # Be consistent wrt Nagios
my $CHECK = "HEALTH"; # the name of the check; feel free to change

# Scratch file for plugin output. The process id is part of the name so that
# checks running at the same time cannot overwrite each other's output.
my $OUTFILE = "/var/tmp/healthcheck." . $$ . ".tmp";

# Where we store cherry picked results; init these to a space in case they
# are not all collected
my @LOAD_VALUES = " ";
my @SYSTIME_VALUE = " ";
my @ROOTDISK_VALUE = " ";

# Default values for LOAD, ROOTDISK Usage
my $DEF_LOAD_WARN = "4,2,2";
my $DEF_LOAD_CRIT = "5,4,3";
my $DEF_DISK_WARN = 95;
my $DEF_DISK_CRIT = 98;
my $DEF_SNMP_COMMUNITY = "public_readonly";

my $STATUS = 0; # A status var to be returned to nagios

# Flags (declared with "my" like every other global in this file)
my $DNS  = 1; # do check that this host has a DNS entry
my $PING = 0; # don't preping by default since nagios does, switch to 1 if
              # you want to preping before bothering with the rest

# Brain dead interrupt handler: abort the check as soon as a trapped
# signal arrives.
#   $sig - name of the signal, supplied by Perl when the handler is invoked
# Never returns; dies with the signal name as the message.
sub interrupt { # usage: interrupt \'sig\'
    my ($sig) = @_;
    die $sig;
}


# Generic sub: Load a file into an array and send the array back
#   $file   - path of the file to read
#   returns - list of the file's lines, line endings included
#   dies    - when the file cannot be opened
sub load_file {
    my ($file) = shift;

    # Three-argument open with a lexical handle: the path can never be
    # misread as an open mode, and the handle is private to this sub.
    open(my $fh, '<', $file) or die "Unable to open logfile $file: $!\n";
    my @flist = <$fh>;
    close $fh;

    return(@flist);
}

# Handle results status and print a final message with values of collated data
#   $msg - free-form text printed after the status label
#   $ret - Nagios state: 0 OK, 1 WARN, 2 CRIT, 3 or higher UNKNOWN
# Never returns: prints one status line, removes $OUTFILE and exits.
sub check_exit { # usage: check_exit("message string",RETVAL)
    my ($msg, $ret) = @_;

    # Nagios only understands exit codes 0-3. Anything else is reported as
    # UNKNOWN (3) instead of being handed to exit() unchanged: a raw wait
    # status such as 256 would otherwise wrap around to 0 and read as OK.
    my @label_for = ('OK', 'WARN', 'CRIT', 'UNKNOWN');
    my $label;
    if (defined $ret && $ret =~ /\A\d+\z/) {
        $ret   = 3 if $ret > 3;
        $label = $label_for[$ret];
    } else {
        $label = 'UNKNOWN STATE';
        $ret   = 3;
    }
    print "$CHECK $label: $msg ";

    # print what we collected - note if one fails we do not collect the rest
    # All three value lists are chomped so the line is terminated exactly
    # once, whether or not the rootdisk values were collected.
    chomp(@SYSTIME_VALUE, @LOAD_VALUES, @ROOTDISK_VALUE);
    print "@SYSTIME_VALUE, System Load @LOAD_VALUES, Rootdisk @ROOTDISK_VALUE\n";
    unlink($OUTFILE); # delete the temp file for good
    exit($ret);       # exit appropriately so nagios knows what to do
}

# Check the outfile in some cases for a SNMP warn or critical
# send back the appropriate signal for nagios
#   returns - 2 if any line of $OUTFILE contains "CRITICAL",
#             1 if any line contains "WARN" (and none contains "CRITICAL"),
#             0 otherwise, including when $OUTFILE cannot be read
sub check_outfile { # usage: check_outfile
    my $worst = 0;

    # Scan the file in-process rather than forking grep twice through a shell.
    open(my $fh, '<', $OUTFILE) or return $worst;
    while (my $line = <$fh>) {
        if ($line =~ /CRITICAL/) {
            $worst = 2;
            last;               # nothing can outrank CRITICAL
        }
        $worst = 1 if $line =~ /WARN/;
    }
    close $fh;

    return $worst;
}

# ye olde usage message
# Prints the command-line synopsis and the option list, including the
# current default values, to standard output. Takes no arguments.
# The pre-ping switch is documented as --ping because that is the spelling
# the option parser acts on; pinging is off unless it is given.
sub usage {
    print <<"END_USAGE";
Usage: $0 -H <host> [-lw <warn>] [-lc <crit>] [-dw <warn>] [-dc <crit>]
          [--nodns] [--ping|--noping] [--snmp <community>]
       $0 -u
Options:
 -H <host>      Check system called <host> (required)
 -lw <warn>     Set load warning values
                Default: $DEF_LOAD_WARN
 -lc <crit>     Set load critical values
                Default: $DEF_LOAD_CRIT
 -dw <warn>     Set rootdisk warning percent
                Default: $DEF_DISK_WARN
 -dc <crit>     Set rootdisk critical percent
                Default: $DEF_DISK_CRIT
 --nodns        Do not check for DNS resolution
 --ping         Preping to make sure the host is up before checking
 --noping       Do not preping to make sure the host is up (default)
                Note: this will improve performance
 --snmp <community>
                Set SNMP community name
                Default: $DEF_SNMP_COMMUNITY
 -u             Print usage message and exit
END_USAGE
}

# Check Load
#   $host - host name or IP address to query
#   $warn - warning thresholds handed to check_snmp -w
#   $crit - critical thresholds handed to check_snmp -c
#   $comm - SNMP community handed to check_snmp -C
# Side effects: rewrites $OUTFILE, fills @LOAD_VALUES and raises $STATUS
# when this check is worse than what has been seen so far.
sub load { # usage: load($host_or_ip,warn,critical,community)
    my ($host, $warn, $crit, $comm) = @_;

    # The three load OIDs as one comma-separated -o argument. Building the
    # command as a list keeps newlines and indentation out of it; a
    # backslash-newline inside a double-quoted Perl string is a literal
    # newline, not a line continuation.
    my $oids = join ',', map { ".1.3.6.1.4.1.2021.10.1.3.$_" } 1 .. 3;
    my @cmd  = ("$USER1/check_snmp", '-H', $host, '-C', $comm, '-o', $oids,
                '-w', $warn, '-c', $crit, '-l', 'Load 1min/5min/10min');

    # List-form pipe open runs the plugin without a shell, so nothing in
    # the arguments can be interpreted as shell syntax.
    my @output;
    if (open(my $plugin, '-|', @cmd)) {
        @output = <$plugin>;
        close $plugin;
    }

    # check_outfile() reads $OUTFILE, so store the plugin output there.
    if (open(my $out, '>', $OUTFILE)) {
        print {$out} @output;
        close $out;
    }
    my $r = check_outfile();
    $r = 3 if !@output; # plugin did not run or said nothing: UNKNOWN, not OK

    # Keep whitespace-separated fields 3, 5, 6 and 7 of every output line.
    @LOAD_VALUES = map {
        my @field = split ' ', $_;
        join(' ', map { defined $_ ? $_ : '' } @field[2, 4, 5, 6]) . "\n";
    } @output;

    $STATUS = $r if $r > $STATUS;
}

# Check rootdisk
#   $host - host name or IP address to query
#   $warn - warning threshold handed to check_snmp -w
#   $crit - critical threshold handed to check_snmp -c
#   $comm - SNMP community handed to check_snmp -C
# Side effects: rewrites $OUTFILE, fills @ROOTDISK_VALUE and raises $STATUS
# when this check is worse than what has been seen so far.
sub rootdisk { # usage: rootdisk(host_or_ip,warn,crit,community)
    my ($host, $warn, $crit, $comm) = @_;

    # The disk OIDs as one comma-separated -o argument. Building the command
    # as a list keeps newlines and indentation out of it; a backslash-newline
    # inside a double-quoted Perl string is a literal newline, not a line
    # continuation.
    my $oids = join ',',
        '1.3.6.1.4.1.2021.9.1.9.1',
        '.1.3.6.1.4.1.2021.9.1.7.1',
        '.1.3.6.1.4.1.2021.9.1.8.1',
        '.1.3.6.1.4.1.2021.9.1.3.1',
        '.1.3.6.1.4.1.2021.9.1.2.1';
    my @cmd = ("$USER1/check_snmp", '-H', $host, '-C', $comm, '-o', $oids,
               '-w', $warn, '-c', $crit);

    # List-form pipe open runs the plugin without a shell, so nothing in
    # the arguments can be interpreted as shell syntax.
    my @output;
    if (open(my $plugin, '-|', @cmd)) {
        @output = <$plugin>;
        close $plugin;
    }

    # check_outfile() reads $OUTFILE, so store the plugin output there.
    if (open(my $out, '>', $OUTFILE)) {
        print {$out} @output;
        close $out;
    }
    my $r = check_outfile();
    $r = 3 if !@output; # plugin did not run or said nothing: UNKNOWN, not OK

    # Keep whitespace-separated fields 4, 5 and 6 of every output line.
    @ROOTDISK_VALUE = map {
        my @field = split ' ', $_;
        join(' ', map { defined $_ ? $_ : '' } @field[3, 4, 5]) . "\n";
    } @output;

    $STATUS = $r if $r > $STATUS;
}

# Pre-ping the host with the Nagios check_ping plugin and abandon the whole
# check when the host does not answer.
#   $host - host name or IP address to ping
# Returns normally when check_ping exits 0 or 1; otherwise calls
# check_exit(), which does not return.
sub preflight { # usage: preflight(host_or_ip)
    my ($host) = shift;

    my @cmd = ("$USER1/check_ping", '-H', $host, '-p', '3',
               '-w', '10,60%', '-c', '30,80%', '-4', '-t', '3');

    # Run without a shell and throw the plugin's output away; only its exit
    # code matters here. $r stays UNKNOWN (3) unless the plugin really ran.
    my $r = 3;
    if (open(my $ping, '-|', @cmd)) {
        my @discarded = <$ping>;
        close $ping; # reaps the child and sets $?
        # $? is a wait status: the exit code is in the high byte, and a
        # non-zero low 7 bits means the plugin was killed by a signal.
        $r = ($? & 127) ? 3 : $? >> 8;
    }
    if ($r >= 2) {
        check_exit("Cannot ping host - abandoning checks",1);
    }
}

# Make sure the host resolves in DNS, using nslookup.
#   $host - host name or IP address to look up
# Returns normally when nslookup exits 0; otherwise calls check_exit() with
# nslookup's exit code (3 if it could not be run), which does not return.
sub dns { # usage: dns($hostname_or_ip)
    my ($host) = shift;

    # Run without a shell and throw the output away; only the exit code
    # matters. $r stays UNKNOWN (3) unless nslookup really ran.
    my $r = 3;
    if (open(my $lookup, '-|', 'nslookup', $host)) {
        my @discarded = <$lookup>;
        close $lookup; # reaps the child and sets $?
        # $? is a wait status, not an exit code: decode it before it is
        # used as a Nagios state.
        $r = ($? & 127) ? 3 : $? >> 8;
    }
    if ($r > 0) {
        check_exit("$host did not resolve in DNS",$r);
    }
}

# Working settings: every threshold starts out at its default value.
# $host has no default and stays undefined until one is supplied.
my ($load_warn, $load_crit) = ($DEF_LOAD_WARN, $DEF_LOAD_CRIT);
my ($disk_warn, $disk_crit) = ($DEF_DISK_WARN, $DEF_DISK_CRIT);
my $snmp_community          = $DEF_SNMP_COMMUNITY;
my $host;

# Options that take a value, mapped to the variable that receives it.
my %value_for = (
    '-H'     => \$host,
    '-lw'    => \$load_warn,
    '-lc'    => \$load_crit,
    '-dw'    => \$disk_warn,
    '-dc'    => \$disk_crit,
    '--snmp' => \$snmp_community,
);

# Walk the command line. Testing with defined() keeps the loop going when an
# argument is a false value such as "0". Bad input exits 3 so that Nagios
# reports UNKNOWN.
while (defined(my $opt = shift @ARGV)) {
    if ($opt eq '-u') {
        usage();
        exit (0);
    } elsif (exists $value_for{$opt}) {
        my $value = shift @ARGV;
        if (!defined $value) {
            print "Error: option $opt requires a value\n";
            usage();
            exit (3);
        }
        ${ $value_for{$opt} } = $value;
    } elsif ($opt eq '--nodns') {
        $DNS = 0;
    } elsif ($opt eq '--ping') {
        $PING = 1;
    } elsif ($opt eq '--noping') {
        $PING = 0; # spelling shown by usage(); same as the default
    } else {
        print "Error: unknown option $opt\n";
        usage();
        exit (3);
    }
}

# there is no spoon...
# Without a host there is nothing to check. That is a usage error, so exit
# with 3 (UNKNOWN); exit code 1 would be read by Nagios as a WARNING.
if (!defined $host || $host eq '') {
    print "Error: no host specified\n";
    usage();
    exit (3);
}

# if we wanna ping go ahead XXX-mui do we care about stats?
if ($PING == 1) {
    preflight($host);
}
# if we wanna resolve then resolve
if ($DNS == 1) {
    dns($host);
}

# Run each check in turn with its own warning/critical thresholds.
for my $check ([\&load, $load_warn, $load_crit],
               [\&rootdisk, $disk_warn, $disk_crit]) {
    my ($run, $warn, $crit) = @{$check};
    $run->($host, $warn, $crit, $snmp_community);
}

# we're all done - report the collected status and exit
check_exit("", $STATUS);

Summary & More Stuff

This generalized meta script is just an example of where a systems administrator could go with the idea. Classes of meta scripts can be created, with libraries of the subroutines made available, and so on. Additionally, there are probably a few more checks that could go into the meta check, such as NTP and system time.