check_netapp/main.go

648 lines
16 KiB
Go

// Copyright 2020 Lars Hoogestraat
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package main
import (
"errors"
"flag"
"fmt"
"github.com/dustin/go-humanize"
"github.com/gosnmp/gosnmp"
"os"
"strconv"
"strings"
)
// status is the result state returned by the individual checks.
type status int
// Exit codes of the tool; main passes one of them to os.Exit.
// The constants carry no explicit type, so they are untyped integer constants
// (ok = 0, warning = 1, critical = 2, unknown = 3) that can be used both as
// int (for os.Exit) and as status (as check result).
const (
ok = iota
warning
critical
unknown
)
// SNMP object identifiers (OIDs) below the NetApp enterprise tree
// (1.3.6.1.4.1.789) used by the checks.
//
// OIDs that are read directly with Get address a scalar object and therefore
// end in the instance suffix ".0" (productVersion, productFirmwareVersion,
// diskFailedCount, diskFailedMessage). The df* and encl* OIDs are table
// columns: they are either walked or completed with a volume ID before being
// read. The remaining disk*Count OIDs are kept without an instance suffix as
// they are not read by the checks visible in this file.
const (
	productVersion         = "1.3.6.1.4.1.789.1.1.2.0"
	productFirmwareVersion = "1.3.6.1.4.1.789.1.1.6.0"

	dfFileSys               = "1.3.6.1.4.1.789.1.5.4.1.2"
	dfPerCentKBytesCapacity = "1.3.6.1.4.1.789.1.5.4.1.6"
	df64TotalKBytes         = "1.3.6.1.4.1.789.1.5.4.1.29"
	df64UsedKBytes          = "1.3.6.1.4.1.789.1.5.4.1.30"
	df64AvailKBytes         = "1.3.6.1.4.1.789.1.5.4.1.31"

	diskActiveCount               = "1.3.6.1.4.1.789.1.6.4.2"
	diskReconstructingCount       = "1.3.6.1.4.1.789.1.6.4.3"
	diskReconstructingParityCount = "1.3.6.1.4.1.789.1.6.4.4"
	diskVerifyingParityCount      = "1.3.6.1.4.1.789.1.6.4.5"
	diskScrubbingCount            = "1.3.6.1.4.1.789.1.6.4.6"
	diskFailedCount               = "1.3.6.1.4.1.789.1.6.4.7.0"
	diskSpareCount                = "1.3.6.1.4.1.789.1.6.4.8"
	diskAddingSpareCount          = "1.3.6.1.4.1.789.1.6.4.9"
	// diskFailedMessage is fetched with Get, so it needs the ".0" instance
	// suffix just like diskFailedCount; without it the agent has no instance
	// to return and the failure description can never be read.
	diskFailedMessage  = "1.3.6.1.4.1.789.1.6.4.10.0"
	diskPrefailedCount = "1.3.6.1.4.1.789.1.6.4.11"

	enclFansFailed        = "1.3.6.1.4.1.789.1.21.1.2.1.18"
	enclElectronicsFailed = "1.3.6.1.4.1.789.1.21.1.2.1.33"
)
// genericFlags holds the connection settings every sub-command accepts.
type genericFlags struct {
// host is the SNMP target, filled from the -H flag (default "localhost").
host string
// community is the SNMP community, filled from the -C flag (default "public").
community string
}
// diskFailedFlags holds the thresholds of the "diskfailed" sub-command.
// Both default to -1; getDiskFailed only evaluates a threshold that is
// greater than zero.
type diskFailedFlags struct {
// critical is the number of failed disks (-c) at which the check turns critical.
critical int
// warning is the number of failed disks (-w) at which the check turns warning.
warning int
}
// dfFlags holds the options of the "df" sub-command.
type dfFlags struct {
// volumeID (-i) is the numeric table index of the volume; -1 (the default)
// means the volume is looked up by its path instead.
volumeID int
// volume (-p) is the path of the volume, resolved by walking all volumes.
volume string
// listVolumes (-l) requests a listing of all volumes reported by SNMP.
listVolumes bool
// critical (-c) is the used-space percentage at which the check turns
// critical; -1 (the default) disables the threshold.
critical int
// warning (-w) is the used-space percentage at which the check turns
// warning; -1 (the default) disables the threshold.
warning int
// criticalBytes (-cb) is a human readable size such as "20 MB"; the check
// turns critical when the available space is at or below it. Empty disables it.
criticalBytes string
// warningBytes (-wb) is a human readable size such as "1 GB"; the check
// turns warning when the available space is at or below it. Empty disables it.
warningBytes string
}
// usage writes a short description of the tool and the list of supported
// sub-commands to standard output, one line per entry.
func usage() {
	help := [...]string{
		"check_netapp is a command line tool to check SNMP values from NetApp for e.g. icinga",
		"Usage:",
		"\t ./check_netapp (version|fw|df|fansfailed|elecfailed|diskfailed)",
	}
	for i := range help {
		fmt.Println(help[i])
	}
}
// main runs the check selected by the first command line argument (version,
// fw, df, fansfailed, elecfailed or diskfailed), prints its result and exits
// with the status the check reported.
//
// Every sub-command accepts -H (host) and -C (community); df and diskfailed
// register additional flags. A missing or unknown sub-command prints the usage
// and exits with unknown. A failing connection exits with unknown as well.
func main() {
	stat := ok

	// os.Exit does not run pending defers, so closing the connection and
	// terminating the process both happen in this single deferred function.
	// Every path below sets stat and returns instead of calling os.Exit.
	defer func() {
		if gosnmp.Default.Conn != nil {
			if err := gosnmp.Default.Conn.Close(); err != nil {
				fmt.Println(err)
				os.Exit(unknown)
			}
		}
		os.Exit(stat)
	}()

	if len(os.Args) < 2 {
		usage()
		stat = unknown
		return
	}

	var (
		generic genericFlags
		volume  dfFlags
		disks   diskFailedFlags
	)

	// The flag set is named after the sub-command; the connection flags are
	// shared by all of them.
	flags := flag.NewFlagSet(os.Args[1], flag.ExitOnError)
	flags.StringVar(&generic.host, "H", "localhost", "The host to connect")
	flags.StringVar(&generic.community, "C", "public", "The community to connect")

	// check is a closure rather than a method value: the flag structs are
	// value receivers and must be read after the flags have been parsed.
	var check func() (string, status, error)

	switch os.Args[1] {
	case "df":
		flags.IntVar(&volume.volumeID, "i", -1, "The ID of the volume. The last digits returned by the volume list of e.g. .1.3.6.1.4.1.789.1.5.4.1.2.1135 the id would be 1135.")
		flags.StringVar(&volume.volume, "p", "", "The path of the volume to monitor. Could be slow, walks over all volumes")
		flags.BoolVar(&volume.listVolumes, "l", false, "List all volumes reported by SNMP")
		flags.IntVar(&volume.critical, "c", -1, "Percentage of space used before going critical")
		flags.IntVar(&volume.warning, "w", -1, "Percentage of space used before sending a warning")
		flags.StringVar(&volume.criticalBytes, "cb", "", "Check available space against a string representation of bytes ('20 MB' / '40 MiB' / '1 GB') before going critical")
		flags.StringVar(&volume.warningBytes, "wb", "", "Check available space against a string representation of bytes ('20 MB' / '40 MiB' / '1 GB') before sending a warning")
		check = func() (string, status, error) { return volume.diskSpaceUsage() }
	case "version":
		check = func() (string, status, error) { return generic.getVersion() }
	case "fw":
		check = func() (string, status, error) { return generic.getFirmwareVersion() }
	case "diskfailed":
		flags.IntVar(&disks.critical, "c", -1, "number of failed disks before going critical")
		flags.IntVar(&disks.warning, "w", -1, "number of failed disks before sending a warning")
		check = func() (string, status, error) { return disks.getDiskFailed() }
	case "elecfailed":
		check = func() (string, status, error) { return generic.getElectronicFailed() }
	case "fansfailed":
		check = func() (string, status, error) { return generic.getFansFailed() }
	default:
		usage()
		stat = unknown
		return
	}

	// With flag.ExitOnError Parse terminates the process itself on invalid
	// input; the error is still checked so it can never be dropped silently.
	if err := flags.Parse(os.Args[2:]); err != nil {
		fmt.Println(err)
		stat = unknown
		return
	}

	if err := generic.connect(); err != nil {
		fmt.Println(err)
		stat = unknown
		return
	}

	msg, result, err := check()
	stat = int(result)
	if err != nil {
		// A failed check transports its message in the error; it has to be
		// printed for every sub-command, including fansfailed.
		fmt.Println(err)
		return
	}
	fmt.Print(msg)
}
// connect configures the package-level gosnmp.Default client with the host
// and community of f and opens the connection.
//
// A connection failure is returned to the caller instead of terminating the
// process here, so that the caller decides on the exit status and its
// deferred cleanup still runs.
func (f genericFlags) connect() error {
	gosnmp.Default.Target = f.host
	gosnmp.Default.Community = f.community

	if err := gosnmp.Default.Connect(); err != nil {
		return fmt.Errorf("error: Connect(): %w", err)
	}
	return nil
}
// getVersion returns the product version of the NetApp (OID .1.3.6.1.4.1.789.1.1.2.0)
// http://oidref.com/1.3.6.1.4.1.789.1.1.2.0
//
// The status is ok on success and unknown whenever the value could not be
// read, including a response that carries no string value.
func (f genericFlags) getVersion() (string, status, error) {
	res, err := gosnmp.Default.Get([]string{productVersion})
	if err != nil {
		return "", unknown, fmt.Errorf("Get(%s) err: %w", productVersion, err)
	}

	values, err := getStringValues(res.Variables)
	if err != nil {
		return "", unknown, err
	}
	// getStringValues drops non-string values, so the result may be empty even
	// without an error; indexing it blindly would panic.
	if len(values) == 0 {
		return "", unknown, fmt.Errorf("Get(%s) err: response did not contain a string value", productVersion)
	}

	return values[0], ok, nil
}
// getElectronicFailed walks the list of failed electronics elements and
// reports critical when at least one entry is non-empty. Valid only if
// enclElectronicsPresent shows that some are present.
// http://oidref.com/1.3.6.1.4.1.789.1.21.1.2.1.33
//
// On failure the non-empty entries are returned, one per line, as the error.
func (f genericFlags) getElectronicFailed() (string, status, error) {
	pdus, err := gosnmp.Default.WalkAll(enclElectronicsFailed)
	if err != nil {
		return "", unknown, fmt.Errorf("WalkAll(%s) err: %v", enclElectronicsFailed, err)
	}

	entries, err := getStringValues(pdus)
	if err != nil {
		return "", unknown, err
	}

	var failed strings.Builder
	for i := range entries {
		if entries[i] == "" {
			continue
		}
		fmt.Fprintf(&failed, "%s\n", entries[i])
	}

	if failed.Len() == 0 {
		return "Electronic is ok", ok, nil
	}
	return "", critical, errors.New(failed.String())
}
// getFansFailed walks the list of failed fans or fan modules of the
// enclosures and reports critical when at least one entry is non-empty.
// Fans are numbered as described in enclFansPresent.
// http://oidref.com/1.3.6.1.4.1.789.1.21.1.2.1.18
//
// On failure the non-empty entries are returned, one per line, as the error.
// The status is unknown when the walk or the value conversion fails.
func (f genericFlags) getFansFailed() (string, status, error) {
	pdus, err := gosnmp.Default.WalkAll(enclFansFailed)
	if err != nil {
		// Report the OID that was actually walked.
		return "", unknown, fmt.Errorf("WalkAll(%s) err: %w", enclFansFailed, err)
	}

	values, err := getStringValues(pdus)
	if err != nil {
		return "", unknown, err
	}

	var failed strings.Builder
	for _, v := range values {
		if v == "" {
			continue
		}
		failed.WriteString(v)
		failed.WriteByte('\n')
	}

	if failed.Len() > 0 {
		return "", critical, errors.New(failed.String())
	}
	return "Fans are ok", ok, nil
}
// getFirmwareVersion returns the firmware version of the NetApp
// http://oidref.com/1.3.6.1.4.1.789.1.1.6.0
//
// The status is ok on success and unknown whenever the value could not be
// read, including a response that carries no string value.
func (f genericFlags) getFirmwareVersion() (string, status, error) {
	res, err := gosnmp.Default.Get([]string{productFirmwareVersion})
	if err != nil {
		return "", unknown, fmt.Errorf("Get(%s) err: %w", productFirmwareVersion, err)
	}

	values, err := getStringValues(res.Variables)
	if err != nil {
		return "", unknown, err
	}
	// getStringValues drops non-string values, so the result may be empty even
	// without an error; indexing it blindly would panic.
	if len(values) == 0 {
		return "", unknown, fmt.Errorf("Get(%s) err: response did not contain a string value", productFirmwareVersion)
	}

	return fmt.Sprintf("Firmware version: %s", values[0]), ok, nil
}
// getDiskFailed returns the number of failed disk with the failure message if any
// http://oidref.com/1.3.6.1.4.1.789.1.6.4.7.0
//
// The status is critical when the count reaches f.critical, warning when it
// reaches f.warning (a threshold is only evaluated when it is greater than
// zero), ok otherwise and unknown when the count could not be determined.
// For critical and warning the message is returned as the error.
func (f diskFailedFlags) getDiskFailed() (string, status, error) {
	res, err := gosnmp.Default.Get([]string{diskFailedCount})
	if err != nil {
		return "", unknown, fmt.Errorf("Get(%s) err: %w", diskFailedCount, err)
	}

	values, err := getStringValues(res.Variables)
	if err != nil {
		return "", unknown, err
	}
	// getStringValues drops non-string values, so the result may be empty even
	// without an error; indexing it blindly would panic.
	if len(values) == 0 {
		return "", unknown, fmt.Errorf("Get(%s) err: response did not contain a string value", diskFailedCount)
	}

	count, err := strconv.Atoi(values[0])
	if err != nil {
		return "", unknown, fmt.Errorf("could not convert failed disk count %q into integer: %w", values[0], err)
	}

	msg, err := f.getDiskFailedMessage(count)
	if err != nil {
		return "", unknown, err
	}

	// msg contains text supplied by the device and must not be used as a
	// format string, hence errors.New instead of fmt.Errorf.
	switch {
	case f.critical > 0 && count >= f.critical:
		return "", critical, errors.New(msg)
	case f.warning > 0 && count >= f.warning:
		return "", warning, errors.New(msg)
	}
	return msg, ok, nil
}
// getDiskFailedMessage builds the message for the given number of failed
// disks. When failedDisks is greater than zero the failure description is
// read from the diskFailedMessage OID and appended on a second line.
//
// An error is returned when the description cannot be read or the response
// does not contain a string value.
func (f diskFailedFlags) getDiskFailedMessage(failedDisks int) (string, error) {
	failedDiskMsg := fmt.Sprintf("%d disk(s) has failures", failedDisks)
	if failedDisks <= 0 {
		return failedDiskMsg, nil
	}

	res, err := gosnmp.Default.Get([]string{diskFailedMessage})
	if err != nil {
		return "", fmt.Errorf("Get(%s) err: %w", diskFailedMessage, err)
	}

	values, err := getStringValues(res.Variables)
	if err != nil {
		return "", err
	}
	// getStringValues drops non-string values, so the result may be empty even
	// without an error; indexing it blindly would panic.
	if len(values) == 0 {
		return "", fmt.Errorf("Get(%s) err: response did not contain a string value", diskFailedMessage)
	}

	return failedDiskMsg + "\nDescription: " + values[0], nil
}
// diskSpaceUsage returns the percentage of used space of a volume
// http://oidref.com/1.3.6.1.4.1.789.1.5.4.1.2
//
// The volume is addressed by f.volumeID; when that is -1 all volumes are
// walked and either listed (f.listVolumes) or searched for f.volume.
//
// The status is critical or warning when the matching threshold is reached
// (used percentage >= threshold, or available bytes <= byte threshold); the
// message is then returned as the error. Both critical thresholds are
// evaluated before the warning thresholds so that a warning can never hide a
// critical state. The status is unknown when a value cannot be read or a byte
// threshold cannot be parsed.
func (f dfFlags) diskSpaceUsage() (string, status, error) {
	volID := f.volumeID
	if volID == -1 {
		results, err := gosnmp.Default.WalkAll(dfFileSys)
		if err != nil {
			return "", unknown, fmt.Errorf("WalkAll(%s): %w", dfFileSys, err)
		}
		if f.listVolumes {
			var volumes strings.Builder
			for _, v := range results {
				fmt.Fprintf(&volumes, "%s %s\n", v.Name, v.Value)
			}
			return volumes.String(), ok, nil
		}
		volID, err = getVolumeID(f.volume, results)
		if err != nil {
			return "", unknown, err
		}
	}

	// Every per-volume value lives at <column OID>.<volume ID>.
	instance := "." + strconv.Itoa(volID)

	nameOID := dfFileSys + instance
	res, err := gosnmp.Default.Get([]string{nameOID})
	if err != nil {
		return "", unknown, fmt.Errorf("Get(%s) err: %w", nameOID, err)
	}
	names, err := getStringValues(res.Variables)
	if err != nil {
		return "", unknown, err
	}
	if len(names) == 0 {
		return "", unknown, fmt.Errorf("Get(%s) err: response did not contain a string value", nameOID)
	}

	totalKB, err := fetchCounter64(df64TotalKBytes + instance)
	if err != nil {
		return "", unknown, err
	}
	usedKB, err := fetchCounter64(df64UsedKBytes + instance)
	if err != nil {
		return "", unknown, err
	}
	availableKB, err := fetchCounter64(df64AvailKBytes + instance)
	if err != nil {
		return "", unknown, err
	}

	// A volume reporting a total size of zero would otherwise produce NaN.
	var percentageUsed float64
	if totalKB > 0 {
		percentageUsed = 100 / float64(totalKB) * float64(usedKB)
	}

	availableBytes := availableKB * 1024
	msg := fmt.Sprintf("Space of volume %s has usage of %.2f%%. Space total: %s - Space available: %s - Space used: %s",
		names[0], percentageUsed, humanize.Bytes(totalKB*1024), humanize.Bytes(availableBytes), humanize.Bytes(usedKB*1024))

	// msg is already formatted and contains a literal '%'; it must be wrapped
	// with errors.New and never be passed through a format function again.
	if f.critical > -1 && int(percentageUsed) >= f.critical {
		return "", critical, errors.New(msg)
	}
	if f.criticalBytes != "" {
		limit, err := humanize.ParseBytes(f.criticalBytes)
		if err != nil {
			return "", unknown, fmt.Errorf("could not parse bytes from string %s, error: %w", f.criticalBytes, err)
		}
		if availableBytes <= limit {
			return "", critical, errors.New(msg)
		}
	}
	if f.warning > -1 && int(percentageUsed) >= f.warning {
		return "", warning, errors.New(msg)
	}
	if f.warningBytes != "" {
		limit, err := humanize.ParseBytes(f.warningBytes)
		if err != nil {
			return "", unknown, fmt.Errorf("could not parse bytes from string %s, error: %w", f.warningBytes, err)
		}
		if availableBytes <= limit {
			return "", warning, errors.New(msg)
		}
	}

	return msg, ok, nil
}

// fetchCounter64 reads the single 64 bit counter stored at oid and returns an
// error when the request fails or the response holds no such value.
func fetchCounter64(oid string) (uint64, error) {
	res, err := gosnmp.Default.Get([]string{oid})
	if err != nil {
		return 0, fmt.Errorf("Get(%s) err: %w", oid, err)
	}
	values, err := getUint64Values(res.Variables)
	if err != nil {
		return 0, err
	}
	if len(values) == 0 {
		return 0, fmt.Errorf("Get(%s) err: response did not contain a 64 bit counter value", oid)
	}
	return values[0], nil
}
// getVolumeID searches the walked volume entries in for the volume called
// volName and returns its ID, which is the last component of the entry's OID.
//
// The value of an entry is accepted both as []byte and as string; entries of
// any other type are skipped instead of causing a panic. An error is returned
// (together with -1) when the volume is not found or the OID of the matching
// entry does not end in an integer.
func getVolumeID(volName string, in []gosnmp.SnmpPDU) (int, error) {
	for _, v := range in {
		var name string
		switch value := v.Value.(type) {
		case []byte:
			name = string(value)
		case string:
			name = value
		default:
			continue
		}
		if name != volName {
			continue
		}

		pos := strings.LastIndex(v.Name, ".")
		if pos == -1 {
			return -1, fmt.Errorf("error: unexpected snmp response %s, please check via snmpget", v.Name)
		}
		volID, err := strconv.Atoi(v.Name[pos+1:])
		if err != nil {
			return -1, fmt.Errorf("error: could not convert unexpected value %s into integer", v.Name[pos+1:])
		}
		return volID, nil
	}
	return -1, fmt.Errorf("error: volume %s not found", volName)
}
// getStringValues converts vars with getValues and keeps only the values that
// are strings. Values of any other type are dropped, so the result can be
// shorter than vars (and nil when nothing matched). An error from getValues is
// passed through unchanged.
func getStringValues(vars []gosnmp.SnmpPDU) ([]string, error) {
	converted, err := getValues(vars)
	if err != nil {
		return nil, err
	}

	var result []string
	for i := range converted {
		s, isString := converted[i].(string)
		if !isString {
			continue
		}
		result = append(result, s)
	}
	return result, nil
}
// getUint64Values converts vars with getValues and keeps only the values that
// are of type uint64. Values of any other type are dropped, so the result can
// be shorter than vars (and nil when nothing matched). An error from getValues
// is passed through unchanged.
func getUint64Values(vars []gosnmp.SnmpPDU) ([]uint64, error) {
	converted, err := getValues(vars)
	if err != nil {
		return nil, err
	}

	var result []uint64
	for i := range converted {
		n, isUint64 := converted[i].(uint64)
		if !isUint64 {
			continue
		}
		result = append(result, n)
	}
	return result, nil
}
// getValues converts every variable of an SNMP response into a Go value:
// octet strings become string, 64 bit counters become uint64. The result has
// one entry per variable, in the order of vars.
//
// An error naming the affected OID is returned when a variable reports
// "no such instance", "no such object" or an unknown type, when its type is
// not supported by this function, or when its value does not have the Go type
// expected for its SNMP type. An empty vars is reported as an error as well,
// so that callers never receive an empty result without an error.
func getValues(vars []gosnmp.SnmpPDU) ([]interface{}, error) {
	if len(vars) == 0 {
		return nil, errors.New("unexpected response, no variables were returned")
	}

	values := make([]interface{}, 0, len(vars))
	for _, variable := range vars {
		switch variable.Type {
		case gosnmp.OctetString:
			b, isBytes := variable.Value.([]byte)
			if !isBytes {
				return nil, fmt.Errorf("unexpected value, octet string with OID %s has Go type %T", variable.Name, variable.Value)
			}
			values = append(values, string(b))
		case gosnmp.Counter64:
			n, isUint64 := variable.Value.(uint64)
			if !isUint64 {
				return nil, fmt.Errorf("unexpected value, counter with OID %s has Go type %T", variable.Name, variable.Value)
			}
			values = append(values, n)
		case gosnmp.NoSuchInstance:
			return nil, fmt.Errorf("no such instance, requested object instance with OID %s could not be returned", variable.Name)
		case gosnmp.NoSuchObject:
			return nil, fmt.Errorf("no such object, requested object instance with OID %s could not be returned", variable.Name)
		case gosnmp.UnknownType:
			return nil, fmt.Errorf("unknown type, object instance with OID %s contained an unknown type", variable.Name)
		default:
			return nil, fmt.Errorf("unexpected type, object instance with OID %s contained an unknown type", variable.Name)
		}
	}
	// Return only after all variables have been converted; returning from
	// inside the loop would truncate walk results to their first entry.
	return values, nil
}