Commit 98f60be9 authored by anton's avatar anton
Browse files

add check_lvm_cache, check_net_traffic, check_xen_cpu

parent 170ee9ed
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [1.1.0] - 2019-10-19
### Added
- *check_xen_cpu* - Reports CPU usage perfdata for all Xen Domains.
- *check_net_traffic* - Reports network traffic perfdata for all interfaces.
- *check_lvm_cache* - Reports statistics perfdata for a dm-cache device.
## [1.0.0] - 2018-05-05
Ported to git
# fem-nagios-plugins
Nagios plugins written and/or collected by FeM
#!/usr/bin/env python3
# check_lvm_cache
# Reports Cache stats for dm-caches using dmsetup
# Author: Anton Schubert <ischluff@mailbox.org>
# Version: 1.0
# Changelog:
# 2017-11-22 - 1.0 - Initial version
#
# Copyright (c) 2019
# License: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the FeM e.V. nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os, sys, argparse, subprocess
from enum import IntEnum
class Status(IntEnum):
    """Plugin result states; each value is used as the process exit code."""

    OK = 0
    WARNING = 1
    CRITICAL = 2
    UNKNOWN = 3
def parse_stats(lines):
    """Parse the first line of ``dmsetup status <device>`` for a cache target.

    Args:
        lines: Output of ``dmsetup status`` split into lines; only the first
            line is examined.

    Returns:
        A dict with the percentages ``used_metadata``, ``used_data``,
        ``read_hitrate``, ``write_hitrate`` and ``dirty`` plus the raw
        ``demotions`` and ``promotions`` counters. ``None`` is returned when
        there is no line, the target type is not ``cache``, or the line is
        truncated or otherwise malformed.
    """
    if not lines:
        return None

    parts = lines[0].split(" ")
    # Index 13 is the highest field read below, so require 14 fields.
    if len(parts) < 14 or parts[2] != "cache":
        return None

    try:
        used_meta, total_meta = (int(x) for x in parts[4].split("/"))
        used_data, total_data = (int(x) for x in parts[6].split("/"))
        # Hit counters are clamped to at least 1 so that a cache which has
        # seen no I/O yet does not cause a division by zero.
        read_hits = max(1, int(parts[7]))
        read_misses = int(parts[8])
        write_hits = max(1, int(parts[9]))
        write_misses = int(parts[10])
        demotions = int(parts[11])
        promotions = int(parts[12])
        dirty = int(parts[13])
    except ValueError:
        # Non-numeric field or a "used/total" pair that is not a pair.
        return None

    if total_meta == 0 or total_data == 0:
        return None

    return {
        "used_metadata": used_meta / total_meta * 100,
        "used_data": used_data / total_data * 100,
        "read_hitrate": read_hits / (read_hits + read_misses) * 100,
        "write_hitrate": write_hits / (write_hits + write_misses) * 100,
        "dirty": dirty / total_data * 100,
        "demotions": demotions,
        "promotions": promotions,
    }
def get_perfdata(sample):
    """Render a parsed cache sample as a status line.

    The human-readable summary comes first, then a ``|`` separator, then every
    entry of ``sample`` as ``name=value`` in key order, each followed by a
    single space.
    """
    summary_keys = ("used_data", "used_metadata", "read_hitrate", "write_hitrate")
    summary = "Data Usage = {:.2f}%, Metadata Usage = {:.2f}%, Read Hitrate = {:.2f}%, Write Hitrate = {:.2f}%|".format(
        *(sample[key] for key in summary_keys))
    metrics = "".join("{}={} ".format(key, sample[key]) for key in sorted(sample))
    return summary + metrics
# nagios compatible exit
def exit_status(status, perfdata="", message=None):
    """Print one ``Cache <STATE> - <text>`` line and terminate the process.

    ``message`` takes precedence over ``perfdata`` when it is given. The
    process exit code is ``status.value``.
    """
    text = perfdata if message is None else message
    print("Cache {:} - {:}".format(status.name, text))
    sys.exit(status.value)
def main():
    """Run ``dmsetup status`` for the device given on the command line.

    Prints a single status line and exits: OK with the cache statistics, or
    UNKNOWN when dmsetup cannot be run, fails, produces undecodable output,
    or the device is not a cache.
    """
    parser = argparse.ArgumentParser(description="Reports Cache stats for dm-cache partitions")
    parser.add_argument("device", help="cached dm")
    args = parser.parse_args()

    # get sample
    cmd = ["/sbin/dmsetup", "status", args.device]
    try:
        output = subprocess.check_output(cmd)
    except subprocess.CalledProcessError as e:
        # e.output holds the captured stdout as bytes; decode it for display.
        detail = e.output.decode("utf-8", "replace").strip() if e.output else ""
        if not detail:
            detail = "exit code {}".format(e.returncode)
        exit_status(Status.UNKNOWN, "Failed to read stats with dmsetup - {:}".format(detail))
    except OSError as e:
        # dmsetup is missing or not executable.
        exit_status(Status.UNKNOWN, "Failed to read stats with dmsetup - {:}".format(e.strerror))

    try:
        lines = output.decode("utf-8").split("\n")
    except UnicodeError:
        exit_status(Status.UNKNOWN, "Failed to parse dmsetup output")

    sample = parse_stats(lines)
    if sample is None:
        exit_status(Status.UNKNOWN, "Device {} not found, or not a cache".format(args.device))

    # generate output
    exit_status(Status.OK, get_perfdata(sample))


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
# check_net_traffic
# Reports Network Traffic for all interfaces
# Author: Anton Schubert <ischluff@mailbox.org>
# Version: 1.0
# Changelog:
# 2018-06-05 - 1.0 - Initial version
#
# Copyright (c) 2019
# License: BSD-3-Clause
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the FeM e.V. nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from __future__ import print_function
import os, sys, time, argparse, math, json
import re
from enum import IntEnum
class Status(IntEnum):
    """Plugin result states; each value is used as the process exit code."""

    OK = 0
    WARNING = 1
    CRITICAL = 2
    UNKNOWN = 3
def parse_net_stats(data):
    """Parse the text of ``/proc/net/dev`` into per-interface byte counters.

    Args:
        data: Full file contents as one string.

    Returns:
        A dict mapping interface name to ``{"rbytes": int, "tbytes": int}``,
        taken from the first and ninth counter column after the name.
        Blank or malformed lines are skipped.
    """
    results = {}
    # The first two lines are column headers.
    for line in data.split("\n")[2:]:
        # Split on the colon rather than on whitespace so that a counter
        # printed directly after the name ("eth0:1234") is still separated.
        name, sep, counters = line.rpartition(":")
        if not sep:
            continue
        fields = counters.split()
        if len(fields) < 9:
            continue
        results[name.strip()] = {
            "rbytes": int(fields[0]),
            "tbytes": int(fields[8]),
        }
    return results
def diff_samples(s1, s2, diff):
    """Compute per-second rates between two counter samples.

    Args:
        s1: Older sample, mapping name -> {counter: value}.
        s2: Newer sample with the same layout.
        diff: Elapsed time between the samples; used as the divisor.

    Returns:
        A dict with one entry for every name present in both samples, holding
        ``(new - old) / diff`` for every counter present in both entries.
    """
    results = {}
    for name, current in s2.items():
        # Direct key lookup instead of scanning every entry of s1.
        if name not in s1:
            continue
        previous = s1[name]
        results[name] = {
            key: (current[key] - previous[key]) / diff
            for key in current
            if key in previous
        }
    return results
# output total traffic + traffic per interface
def get_perfdata(results, diff):
    """Render the computed rates as a status line.

    The summary reports the summed receive and transmit rates converted to
    Mbit/s together with the sampling interval ``diff``. After the ``|``
    separator, every entry of ``results`` is emitted as ``<name>_rx`` and
    ``<name>_tx`` with the ``B`` unit suffix.
    """
    rx_sum = sum(entry["rbytes"] for entry in results.values())
    tx_sum = sum(entry["tbytes"] for entry in results.values())
    total_rxrate = rx_sum * 8 / 1024**2
    total_txrate = tx_sum * 8 / 1024**2

    chunks = ["Rx = {:.2f}Mbit/s, Tx = {:.2f}Mbit/s in {:.0f}s|".format(total_rxrate, total_txrate, diff)]
    for iface, rates in results.items():
        chunks.append(" {:}_rx={:.2f}B".format(iface, rates["rbytes"]))
        chunks.append(" {:}_tx={:.2f}B".format(iface, rates["tbytes"]))
    return "".join(chunks)
# nagios compatible exit
def exit_status(status, perfdata="", message=None):
    """Print one ``Net <STATE> - <text>`` line and terminate the process.

    ``message`` takes precedence over ``perfdata`` when it is given. The
    process exit code is ``status.value``.
    """
    text = perfdata if message is None else message
    print("Net {:} - {:}".format(status.name, text))
    sys.exit(status.value)
def main():
    """Sample ``/proc/net/dev`` and report rates against the previous run.

    The current counters are stored in a history file. Its modification time
    and contents from the previous run form the baseline for the rate
    calculation. The first run, and any failure to read, parse or write the
    history file, exits with UNKNOWN.
    """
    # os.errno is not available on Python 3.7+, so import the module itself.
    import errno

    histfile = "/tmp/.check_net"
    status = Status.OK
    message = None
    previous_sample = None
    past = None

    parser = argparse.ArgumentParser(description="Reports Network Stats for all Interfaces")
    parser.parse_args()

    # get sample
    with open("/proc/net/dev", "r") as f:
        sample = parse_net_stats(f.read())
    now = time.time()

    # load history file
    try:
        past = os.stat(histfile).st_mtime
        with open(histfile) as f:
            previous_sample = json.load(f)
    except (IOError, OSError) as e:
        if e.errno == errno.ENOENT:
            status, message = Status.UNKNOWN, "History file not written yet"
        else:
            status, message = Status.UNKNOWN, "Failed to read the history file - {:}".format(e.strerror)
    except ValueError:
        # Corrupt or truncated JSON; the write below replaces it.
        status, message = Status.UNKNOWN, "Failed to parse the history file"

    # write history file
    try:
        with open(histfile, 'w') as f:
            json.dump(sample, f)
    except (IOError, OSError) as e:
        status, message = Status.UNKNOWN, "Could not write history file - {:}".format(e.strerror)

    # early exit on errors
    if status > Status.OK:
        exit_status(status, message)

    elapsed = now - past
    if elapsed <= 0:
        # Avoid dividing by zero (or a negative interval after a clock change).
        exit_status(Status.UNKNOWN, "History file is not older than the current sample")

    # compute values
    results = diff_samples(previous_sample, sample, elapsed)

    # generate output
    exit_status(Status.OK, get_perfdata(results, elapsed))


if __name__ == "__main__":
    main()
#!/usr/bin/env python2
# check_xen_cpu
# Reports CPU Usage for Xen VMs
# Author: Anton Schubert <ischluff@mailbox.org>
# Version: 2.1
# Changelog:
# 2016-06-13 - 1.0 - Initial version
# 2017-07-08 - 1.1 - Handle counter rollover
# 2017-07-17 - 2.0 - Use history-file instead of sampling approach
# 2017-07-23 - 2.1 - Only initialize xenstat handle for VCPU stats
#
# Copyright (c) 2019
# License: BSD-3-clause
#
# All rights reserved.
#
#Redistribution and use in source and binary forms, with or without
#modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the FeM e.V. nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
#WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
#DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
#DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
#(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
#LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
#ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
#SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import os, sys, time, argparse, math, json
from enum import IntEnum
from xenstat import *
class Status(IntEnum):
    """Plugin result states; each value is used as the process exit code."""

    OK = 0
    WARNING = 1
    CRITICAL = 2
    UNKNOWN = 3
def get_domains(node):
    """Return the domain objects of ``node``, looked up by index 0..count-1."""
    total = xenstat_node_num_domains(node)
    return [xenstat_node_domain_by_index(node, index) for index in range(total)]
def get_domain_names(domains):
    """Return ``xenstat_domain_name`` for each domain, in input order."""
    return [xenstat_domain_name(domain) for domain in domains]
def get_domain_times(domains):
    """Return ``xenstat_domain_cpu_ns`` for each domain, in input order."""
    return [xenstat_domain_cpu_ns(domain) for domain in domains]
def get_domain_cpus(domains):
    """Return ``xenstat_domain_num_vcpus`` for each domain, in input order."""
    return [xenstat_domain_num_vcpus(domain) for domain in domains]
def cputime_to_percent(sample1, sample2, diff):
    """Convert the change between two CPU time counters into a percentage.

    ``sample1`` is the older counter value, ``sample2`` the newer one and
    ``diff`` the elapsed time used as the divisor (scaled by ``10.0e6``).
    When the newer value is smaller, the counter is treated as having wrapped
    at the next power of two above the older value.
    """
    baseline = sample1
    if sample2 < baseline:
        # handle rollover: estimate the counter maximum from the old value
        exponent = math.ceil(math.log(baseline) / math.log(2))
        baseline -= 2 ** exponent
    elapsed = sample2 - baseline
    return elapsed / (diff * 10.0e6)
def diff_results(samples1, samples2, diff):
    """Pair up two samples by name and convert each pair to a percentage.

    Args:
        samples1: Older sample, an iterable of ``(name, value)`` pairs.
        samples2: Newer sample with the same layout.
        diff: Elapsed time between the samples, passed to
            ``cputime_to_percent``.

    Returns:
        A list of ``(name, percent)`` tuples in the order of ``samples2``.
        Names that do not occur in ``samples1`` are left out.
    """
    # Index the older sample once instead of rescanning it for every name;
    # the first occurrence of a name wins.
    previous = {}
    for name, value in samples1:
        previous.setdefault(name, value)

    return [
        (name, cputime_to_percent(previous[name], value, diff))
        for name, value in samples2
        if name in previous
    ]
# output total usage + usage per vm
def get_perfdata(num_cpus, names, cpus, results, diff):
    """Render total and per-domain CPU usage as a status line.

    Args:
        num_cpus: CPU count; the total's maximum is ``num_cpus * 100``.
        names: Domain names, in output order.
        cpus: Per-domain vCPU counts, parallel to ``names``; each domain's
            maximum is its count times 100.
        results: Iterable of ``(name, percent)`` pairs.
        diff: Sampling interval shown in the summary.

    Returns:
        The summary text, a ``|`` separator, the ``total`` metric and one
        ``domain_<name>`` metric for every name that has a result.
    """
    results = list(results)
    usage = dict(results)
    total = sum(x[1] for x in results)

    chunks = ["Total Usage = {:.2f}% in {:.0f}s, CPUs = {:d}|total={:.2f}%;;;0;{:d}".format(total, diff, num_cpus, total, num_cpus * 100)]
    # zip() keeps names and vCPU counts in step without index arithmetic and
    # also accepts iterables that do not support len().
    for name, vcpus in zip(names, cpus):
        if name in usage:
            chunks.append(" domain_{:}={:.2f}%;;;0;{:d}".format(name, usage[name], vcpus * 100))
    return "".join(chunks)
# nagios compatible exit
def exit_status(status, perfdata="", message=None):
    """Print one ``CPU <STATE> - <text>`` line and terminate the process.

    ``message`` takes precedence over ``perfdata`` when it is given. The
    process exit code is ``status.value``.
    """
    text = perfdata if message is None else message
    print("CPU {:} - {:}".format(status.name, text))
    sys.exit(status.value)
def main():
    """Sample per-domain CPU time via xenstat and report usage since last run.

    The current counters are stored in a history file. Its modification time
    and contents from the previous run form the baseline for the percentage
    calculation. The first run, and any failure to read, parse or write the
    history file, exits with UNKNOWN.
    """
    # Import errno directly instead of reaching it through os.errno.
    import errno

    histfile = "/tmp/.check_xen_cpu"
    parser = argparse.ArgumentParser(description="Reports CPU Usage for xen VMs")
    parser.parse_args()

    status = Status.OK
    message = ""
    previous_sample = None
    past = None

    # initialize handle
    handle = xenstat_init()

    # prevent access without handle
    if handle is None:
        exit_status(Status.UNKNOWN, "Failed to get xenstat handle, you are probably not root")

    # Collect everything that needs the handle up front so that it is
    # released on every path, including the early error exits below.
    try:
        node = xenstat_get_node(handle, XENSTAT_VCPU)
        domains = list(get_domains(node))
        names = list(get_domain_names(domains))
        sample = list(zip(names, get_domain_times(domains)))
        now = time.time()
        num_cpus = xenstat_node_num_cpus(node)
        cpus = list(get_domain_cpus(domains))
    finally:
        xenstat_uninit(handle)

    # load history file
    try:
        past = os.stat(histfile).st_mtime
        with open(histfile) as f:
            previous_sample = json.load(f)
    except (IOError, OSError) as e:
        if e.errno == errno.ENOENT:
            status, message = Status.UNKNOWN, "History file not written yet"
        else:
            status, message = Status.UNKNOWN, "Failed to read the history file - {:}".format(e.strerror)
    except ValueError:
        # Corrupt or truncated JSON; the write below replaces it.
        status, message = Status.UNKNOWN, "Failed to parse the history file"

    # write history file
    try:
        with open(histfile, 'w') as f:
            json.dump(sample, f)
    except (IOError, OSError) as e:
        status, message = Status.UNKNOWN, "Could not write history file - {:}".format(e.strerror)

    # early exit on errors
    if status > Status.OK:
        exit_status(status, message)

    elapsed = now - past
    if elapsed <= 0:
        # Avoid dividing by zero (or a negative interval after a clock change).
        exit_status(Status.UNKNOWN, "History file is not older than the current sample")

    # compute values
    results = diff_results(previous_sample, sample, elapsed)

    # generate output
    exit_status(Status.OK, get_perfdata(num_cpus, names, cpus, results, elapsed))


if __name__ == "__main__":
    main()
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment