changeset 29:c2584db4a650

Make load recording more flexible and efficient
* Work with a variable number of CPUs
* Use Python's `enumerate()` function where possible
* Use one long-running NVIDIA process (`nvidia-smi dmon`) instead of starting `nvidia-smi` for every sample
author IBBoard <dev@ibboard.co.uk>
date Wed, 30 Dec 2020 17:22:29 +0000
parents 278968272584
children dc0cf6bc9fbe
files load-graph load-record
diffstat 2 files changed, 102 insertions(+), 55 deletions(-) [+]
line wrap: on
line diff
--- a/load-graph	Wed Dec 30 17:14:21 2020 +0000
+++ b/load-graph	Wed Dec 30 17:22:29 2020 +0000
@@ -1,7 +1,7 @@
 #! /bin/bash
 
 start="-6h"
-end="0"
+end="-0h"
 
 if [ $# -eq 1 ]
 then
@@ -12,18 +12,20 @@
 	end=$2
 fi
 
+cpu_count=$(lscpu|grep '^CPU(s)'|awk '{print $2}')
+
+cpu_lines=()
+
+for ((i=1; i<=cpu_count; i++)); do
+	cpu_lines+=("DEF:core${i}=$HOME/.load.rrd:core${i}:AVERAGE")
+	cpu_lines+=("LINE1:core${i}#99000040:\"Core-${i}\"")
+done
+
 rrdtool graph /tmp/load-percent.png -w 1280 -h 1024 -a PNG --start "$start" --end "$end" \
 	--vertical-label "Usage (%)" \
-	DEF:core1=$HOME/.load.rrd:core1:AVERAGE \
-	DEF:core2=$HOME/.load.rrd:core2:AVERAGE \
-	DEF:core3=$HOME/.load.rrd:core3:AVERAGE \
-	DEF:core4=$HOME/.load.rrd:core4:AVERAGE \
+	${cpu_lines[@]} \
 	DEF:CPU=$HOME/.load.rrd:core_avg:AVERAGE \
 	DEF:GPU=$HOME/.load.rrd:GPU:AVERAGE \
-	LINE1:core1#99000030:"Core 1" \
-	LINE1:core2#99000030:"Core 2" \
-	LINE1:core3#99000030:"Core 3" \
-	LINE1:core4#99000030:"Core 4" \
 	LINE2:CPU#990000:"CPU" \
 	LINE2:GPU#009900:"GPU" \
 && rrdtool graph /tmp/load-type.png -w 1280 -h 1024 -a PNG --start "$start" --end "$end" \
@@ -34,5 +36,11 @@
 	AREA:iowait#888888:"I/O Wait" \
 	STACK:system#6666FF:"System" \
 	STACK:user#CCCC66:"User" \
+&& rrdtool graph /tmp/load-memory.png -w 1280 -h 1024 -a PNG --start "$start" --end "$end" \
+	--vertical-label "Usage (%)" \
+	DEF:mem_used=$HOME/.load.rrd:mem_used:AVERAGE \
+	DEF:mem_buffers=$HOME/.load.rrd:mem_buffers:AVERAGE \
+	AREA:mem_used#888888:"Used" \
+	STACK:mem_buffers#cccccc:"Buffers" \
 && eog /tmp/load-*.png
 rm -f /tmp/load-*.png
--- a/load-record	Wed Dec 30 17:14:21 2020 +0000
+++ b/load-record	Wed Dec 30 17:22:29 2020 +0000
@@ -12,70 +12,109 @@
 home = str(Path.home())
 DB = os.path.join(home, ".load.rrd")
 
-if not os.path.exists(DB):
+cpus = psutil.cpu_count()
+
+config = [
+	['load_1', 'GAUGE', 2, 0, 100],
+	['load_5', 'GAUGE', 2, 0, 100],
+	['load_15', 'GAUGE', 2, 0, 100],
+	*[[f'core{i+1}', 'GAUGE', 2, 0, 100] for i in range(cpus)],
+	['core_avg', 'GAUGE', 2, 0, 100],
+	['GPU', 'GAUGE', 2, 0, 100],
+	['user', 'GAUGE', 2, 0, 100],
+	['system', 'GAUGE', 2, 0, 100],
+	['iowait', 'GAUGE', 2, 0, 100],
+	['mem_used', 'GAUGE', 2, 0, 100],
+	['mem_buffers', 'GAUGE', 2, 0, 100],
+]
+
+fields = len(config)
+
+def needs_creating():
+	if not os.path.exists(DB):
+		return True
+	else:
+		cur_config = rrdtool.info(DB)
+		for i, entry in enumerate(config):
+			key, datatype, heartbeat, minval, maxval = entry
+			if f"ds[{key}].index" not in cur_config or \
+				cur_config[f"ds[{key}].index"] != i:
+				return True
+			elif cur_config[f"ds[{key}].type"] != datatype or \
+				cur_config[f"ds[{key}].minimal_heartbeat"] != heartbeat:
+				# We don't appear to be able to check min/max from info
+				return True
+		return False
+
+# TODO: Add "pressure" support - relies on "psi=1" in kernel and /proc/pressure/… existing
+if needs_creating():
 	rrdtool.create(DB, '--step', '1',
-		'DS:load_1:GAUGE:2:0.0:100.0',
-		'DS:load_5:GAUGE:2:0.0:100.0',
-		'DS:load_15:GAUGE:2:0.0:100.0',
-		'DS:core1:GAUGE:2:0.0:100.0',
-		'DS:core2:GAUGE:2:0.0:100.0',
-		'DS:core3:GAUGE:2:0.0:100.0',
-		'DS:core4:GAUGE:2:0.0:100.0',
-		'DS:core_avg:GAUGE:2:0.0:100.0',
-		'DS:GPU:GAUGE:2:0.0:100.0',
-		'DS:user:GAUGE:2:0.0:100.0',
-		'DS:system:GAUGE:2:0.0:100.0',
-		'DS:iowait:GAUGE:2:0.0:100.0',
+		*[f'DS:{key}:{datatype}:{heartbeat}:{minval}:{maxval}' for \
+		key, datatype, heartbeat, minval, maxval in config],
 		'RRA:AVERAGE:0.5:1:3600', #1hr of 1s interval (averaged)
 		'RRA:MAX:0.5:60:360', #6hrs of 1 minute
 		'RRA:MAX:0.5:300:8640') #and 1mo of 5m resolution
 
-cpus = psutil.cpu_count()
-fields = 12
+samples = 10
+gpu_idx = 3 + cpus + 1
+last_avg_idx = 3 + cpus + 2 # load + CPUs + CPU average + GPU
+total_mem = psutil.virtual_memory().total
+
+# Use dmon and assume we'll keep to roughly every second
+# to keep up with its output to reduce the popen overhead
+nv_smi = subprocess.Popen(['nvidia-smi', 'dmon', '-s', 'u'],
+							stdout=subprocess.PIPE,
+							universal_newlines=True)
 
 while True:
 	interims = [[] for _ in range(fields)]
+	# Average 10 values internally to reduce RRD updates
+	# and to allow us to max some stats rather than average
 	for _ in range(0, 10):
 		cpu_pcs = psutil.cpu_percent(percpu=True)
 		cpu_pc = sum(cpu_pcs) / cpus
 		#TODO: If cpu_pc > 25% (?) log top processes
 		cpu_states_pc = psutil.cpu_times_percent()
 		loads = os.getloadavg()
-		# TODO: Include GPU load
-		nv_smi = subprocess.run(["nvidia-smi", "stats", "-d", "gpuUtil", "-c", "1"], stdout=subprocess.PIPE, universal_newlines=True)
-		gpu = 0
-		lines = nv_smi.stdout.split('\n')
-		line_count = len(lines)
-		for line in lines:
-			line_parts = line.split(',')
-			if len(line_parts) == 4:
-				gpu += int(line_parts[3])
-			else:
-				line_count -= 1
-		gpu = gpu / line_count
+		mem = psutil.virtual_memory()
+		i = 0
+		interims[i].append(loads[0])
+		i = i + 1
+		interims[i].append(loads[1])
+		i = i + 1
+		interims[i].append(loads[2])
+		i = i + 1
+		for a_cpu_pc in cpu_pcs:
+			interims[i].append(a_cpu_pc)
+			i = i + 1
+		interims[i].append(cpu_pc)
+		i = i + 1
+		interims[i].append(0) # Placeholder for GPU
+		i = i + 1
+		interims[i].append(cpu_states_pc.user)
+		i = i + 1
+		interims[i].append(cpu_states_pc.system)
+		i = i + 1
+		interims[i].append(cpu_states_pc.iowait)
+		i = i + 1
+		interims[i].append((mem.used / total_mem) * 100)
+		i = i + 1
+		interims[i].append(((mem.buffers + mem.cached) / total_mem) * 100)
 		time.sleep(0.1)
-		interims[0].append(loads[0])
-		interims[1].append(loads[1])
-		interims[2].append(loads[2])
-		for i in range(cpus):
-			interims[i+3].append(cpu_pcs[i])
-		i = cpus + 3
-		interims[i].append(cpu_pc)
-		i += 1
-		interims[i].append(gpu)
-		i += 1
-		interims[i].append(cpu_states_pc.user)
-		i += 1
-		interims[i].append(cpu_states_pc.system)
-		i += 1
-		interims[i].append(cpu_states_pc.iowait)
+
 	vals = []
-	for i in range(fields):
-		interim_vals = interims[i]
-		# Average most values
-		if i  < fields - 3:
+	for i, interim_vals in enumerate(interims):
+		if i  < last_avg_idx:
+			# Average most values
 			vals.append(sum(interim_vals) / 10)
 		else:
 			# But take the max CPU state value
 			vals.append(max(interim_vals))
+
+	while True:
+		line = nv_smi.stdout.readline()
+		if line[0] != '#':
+			vals[gpu_idx] = int(line[8:11])
+			break
+
 	rrdtool.update(DB, "N:{}".format(':'.join(str(val) for val in vals)))