view load-record @ 38:737061eac1d0 default tip

Skip more large videos and some permission issues
author IBBoard <dev@ibboard.co.uk>
date Wed, 18 Aug 2021 20:40:17 +0100
parents ccc8f0903d2e
children
line wrap: on
line source

#! /usr/bin/env python3

import psutil
import os
import os.path
import rrdtool
import sched
import subprocess
import threading

from pathlib import Path

# The round-robin database lives in the current user's home directory
home = str(Path.home())
DB = os.path.join(home, ".load.rrd")

# CPU count as reported by psutil; one "coreN" data source is defined per CPU
cpus = psutil.cpu_count()

# One entry per RRD data source, in storage order. Each entry is
#   [name, data source type, heartbeat, minimum, maximum]
# and every data source shares the same type, heartbeat and 0-100 range.
config = [
	[name, 'GAUGE', 2, 0, 100]
	for name in (
		'load_1',
		'load_5',
		'load_15',
		*(f'core{n}' for n in range(1, cpus + 1)),
		'core_avg',
		'GPU',
		'user',
		'system',
		'iowait',
		'mem_used',
		'mem_buffers',
	)
]

# Total number of data sources (and therefore values per update)
fields = len(config)


def needs_creating():
	"""Report whether the RRD file has to be (re)created.

	Returns True when the database file does not exist, or when any data
	source in ``config`` is missing from the existing file, sits at a
	different index, or has a different type or minimal heartbeat.
	Returns False when every configured data source matches.
	"""
	if not os.path.exists(DB):
		return True

	existing = rrdtool.info(DB)
	for position, (name, ds_type, beat, _lowest, _highest) in enumerate(config):
		index_key = f"ds[{name}].index"
		# A missing or moved data source means the layout has changed
		if index_key not in existing:
			return True
		if existing[index_key] != position:
			return True
		# We don't appear to be able to check min/max from info, so only
		# the type and heartbeat are compared
		type_differs = existing[f"ds[{name}].type"] != ds_type
		if type_differs or existing[f"ds[{name}].minimal_heartbeat"] != beat:
			return True

	# TODO: Check RRA definitions based on rra[i].cf, rra[i].pdp_per_row and rra[i].rows
	return False


# TODO: Add "pressure" support - relies on "psi=1" in kernel and /proc/pressure/… existing
if needs_creating():
	# Build the database with a one second step and one DS definition
	# per entry in config, followed by the three archives
	rrdtool.create(
		DB,
		'--step', '1',
		*(
			f'DS:{name}:{kind}:{beat}:{lowest}:{highest}'
			for name, kind, beat, lowest, highest in config
		),
		'RRA:AVERAGE:0.5:1:3600',  # 1hr at 1s intervals (averaged)
		'RRA:MAX:0.5:60:360',  # 6hrs at 1 minute resolution
		'RRA:MAX:0.5:300:8640',  # and 1 month at 5 minute resolution
	)

# Number of interim readings kept for each data source
samples = 10
# Position of the GPU entry in config: 3 load averages + CPUs + CPU average
gpu_idx = cpus + 4
# Entries before this index (loads, CPUs, CPU average and GPU) are averaged;
# entries from this index onwards are recorded as their maximum
last_avg_idx = gpu_idx + 1
total_mem = psutil.virtual_memory().total

# Note: The values below are plain module-level globals, which assumes that:
#  1) Only "the latest" gpu_val value is ever needed
#  2) gpu_val is a single number, so it can't be seen half-updated
#  3) The scheduler and its priorities make sure that record_interims
#     and record_record run one after the other, never at the same time
#  4) record_interims finishes in under 1/samples seconds
#
# If any of these stop being true, look at multiprocessing.Value and .Array
gpu_val = 0
interims = [[0 for _ in range(samples)] for _ in range(fields)]
pos = 0


def parse_nvidia_output():
	"""Follow ``nvidia-smi dmon -s u`` and keep ``gpu_val`` up to date.

	Intended to run in its own thread: reading a line blocks until
	nvidia-smi prints one. Lines starting with '#' are skipped as headers.
	For every other line, characters 8-10 are parsed as an integer and
	stored in the global ``gpu_val`` (presumably the "sm" utilisation
	column - confirm against the installed nvidia-smi's column layout).

	The function returns when nvidia-smi closes its output (i.e. exits),
	leaving ``gpu_val`` at the last value read.
	"""
	global gpu_val
	# The context manager closes the pipe and reaps the child process once
	# we stop reading, so no zombie process or open descriptor is left over
	with subprocess.Popen(['nvidia-smi', 'dmon', '-s', 'u'],
							stdout=subprocess.PIPE,
							universal_newlines=True) as nv_smi:
		while True:
			# Readline blocks, so this thread will update as and when new values are available
			line = nv_smi.stdout.readline()
			if not line:
				# Empty string means end-of-file: nvidia-smi has gone away.
				# Without stopping here the loop would spin at full speed
				# on an endless stream of empty reads
				break
			if line[0] == '#':
				continue
			try:
				gpu_val = int(line[8:11])
			except ValueError:
				# Blank line or a non-numeric field - keep the previous
				# value rather than killing the thread
				continue


def record_interims():
	"""Take one interim reading of every data source.

	Queues itself to run again in 1/samples seconds, then stores the
	current load averages, per-CPU and average CPU percentages, CPU state
	percentages and memory percentages into the current slot of
	``interims``. The GPU row is only given a placeholder of 0. The slot
	position ``pos`` advances on each call and wraps around after
	``samples`` readings.
	"""
	global pos
	# The next run is queued before any readings are taken
	scheduler.enter(1.0/samples, 1, record_interims)
	slot = pos
	pos = (pos + 1) % samples

	per_core = psutil.cpu_percent(percpu=True)
	core_avg = sum(per_core) / cpus
	#TODO: If cpu_pc > 25% (?) log top processes
	states = psutil.cpu_times_percent()
	load_1, load_5, load_15 = os.getloadavg()
	memory = psutil.virtual_memory()

	# Listed in exactly the same order as the data sources in config
	readings = [
		load_1,
		load_5,
		load_15,
		*per_core,
		core_avg,
		0,  # Placeholder for GPU
		states.user,
		states.system,
		states.iowait,
		(memory.used / total_mem) * 100,
		((memory.buffers + memory.cached) / total_mem) * 100,
	]
	for row, reading in enumerate(readings):
		interims[row][slot] = reading


def record_record():
	"""Collapse the interim readings into one record and write it to the RRD.

	Queues itself to run again in one second. Data sources before
	``last_avg_idx`` are recorded as the average of their interim
	readings; the remaining ones (CPU states and memory usage) are
	recorded as their maximum. The GPU entry is then replaced with the
	latest ``gpu_val`` before the update is written.
	"""
	scheduler.enter(1, 2, record_record)
	vals = [
		sum(readings) / samples if idx < last_avg_idx else max(readings)
		for idx, readings in enumerate(interims)
	]

	# The interim GPU row only holds placeholders, so use the live value
	vals[gpu_idx] = gpu_val
	joined = ':'.join(map(str, vals))
	rrdtool.update(DB, "N:{}".format(joined))


# Follow nvidia-smi in a background thread. It is a daemon thread so that its
# blocking read loop cannot keep the process alive once the main thread stops
# (e.g. on Ctrl-C or an error from the scheduler)
nv_thread = threading.Thread(target=parse_nvidia_output, daemon=True)
nv_thread.start()
scheduler = sched.scheduler()
# Let record_interims run and schedule itself
record_interims()
# Schedule record recording for 1s later so we've got a set of values, and then it'll schedule future calls
scheduler.enter(1, 2, record_record)
scheduler.run()