diff load-record @ 29:c2584db4a650

Make load recording more flexible and efficient * Works with variable number of CPUs * Use Python `enumerate()` function where possible * Use long-running NVIDIA function
author IBBoard <dev@ibboard.co.uk>
date Wed, 30 Dec 2020 17:22:29 +0000
parents e245a271fc44
children ccc8f0903d2e
line wrap: on
line diff
--- a/load-record	Wed Dec 30 17:14:21 2020 +0000
+++ b/load-record	Wed Dec 30 17:22:29 2020 +0000
@@ -12,70 +12,109 @@
 home = str(Path.home())
 DB = os.path.join(home, ".load.rrd")
 
-if not os.path.exists(DB):
+cpus = psutil.cpu_count()
+
+config = [
+	['load_1', 'GAUGE', 2, 0, 100],
+	['load_5', 'GAUGE', 2, 0, 100],
+	['load_15', 'GAUGE', 2, 0, 100],
+	*[[f'core{i+1}', 'GAUGE', 2, 0, 100] for i in range(cpus)],
+	['core_avg', 'GAUGE', 2, 0, 100],
+	['GPU', 'GAUGE', 2, 0, 100],
+	['user', 'GAUGE', 2, 0, 100],
+	['system', 'GAUGE', 2, 0, 100],
+	['iowait', 'GAUGE', 2, 0, 100],
+	['mem_used', 'GAUGE', 2, 0, 100],
+	['mem_buffers', 'GAUGE', 2, 0, 100],
+]
+
+fields = len(config)
+
+def needs_creating():
+	if not os.path.exists(DB):
+		return True
+	else:
+		cur_config = rrdtool.info(DB)
+		for i, entry in enumerate(config):
+			key, datatype, heartbeat, minval, maxval = entry
+			if f"ds[{key}].index" not in cur_config or \
+				cur_config[f"ds[{key}].index"] != i:
+				return True
+			elif cur_config[f"ds[{key}].type"] != datatype or \
+				cur_config[f"ds[{key}].minimal_heartbeat"] != heartbeat:
+				# We don't appear to be able to check min/max from info
+				return True
+		return False
+
+# TODO: Add "pressure" support - relies on "psi=1" in kernel and /proc/pressure/… existing
+if needs_creating():
 	rrdtool.create(DB, '--step', '1',
-		'DS:load_1:GAUGE:2:0.0:100.0',
-		'DS:load_5:GAUGE:2:0.0:100.0',
-		'DS:load_15:GAUGE:2:0.0:100.0',
-		'DS:core1:GAUGE:2:0.0:100.0',
-		'DS:core2:GAUGE:2:0.0:100.0',
-		'DS:core3:GAUGE:2:0.0:100.0',
-		'DS:core4:GAUGE:2:0.0:100.0',
-		'DS:core_avg:GAUGE:2:0.0:100.0',
-		'DS:GPU:GAUGE:2:0.0:100.0',
-		'DS:user:GAUGE:2:0.0:100.0',
-		'DS:system:GAUGE:2:0.0:100.0',
-		'DS:iowait:GAUGE:2:0.0:100.0',
+		*[f'DS:{key}:{datatype}:{heartbeat}:{minval}:{maxval}' for \
+		key, datatype, heartbeat, minval, maxval in config],
 		'RRA:AVERAGE:0.5:1:3600', #1hr of 1s interval (averaged)
 		'RRA:MAX:0.5:60:360', #6hrs of 1 minute
 		'RRA:MAX:0.5:300:8640') #and 1mo of 5m resolution
 
-cpus = psutil.cpu_count()
-fields = 12
+samples = 10
+gpu_idx = 3 + cpus + 1
+last_avg_idx = 3 + cpus + 2 # load + CPUs + CPU average + GPU
+total_mem = psutil.virtual_memory().total
+
+# Use dmon and assume we'll keep to roughly every second
+# to keep up with its output to reduce the popen overhead
+nv_smi = subprocess.Popen(['nvidia-smi', 'dmon', '-s', 'u'],
+							stdout=subprocess.PIPE,
+							universal_newlines=True)
 
 while True:
 	interims = [[] for _ in range(fields)]
+	# Average 10 values internally to reduce RRD updates
+	# and to allow us to max some stats rather than average
 	for _ in range(0, 10):
 		cpu_pcs = psutil.cpu_percent(percpu=True)
 		cpu_pc = sum(cpu_pcs) / cpus
 		#TODO: If cpu_pc > 25% (?) log top processes
 		cpu_states_pc = psutil.cpu_times_percent()
 		loads = os.getloadavg()
-		# TODO: Include GPU load
-		nv_smi = subprocess.run(["nvidia-smi", "stats", "-d", "gpuUtil", "-c", "1"], stdout=subprocess.PIPE, universal_newlines=True)
-		gpu = 0
-		lines = nv_smi.stdout.split('\n')
-		line_count = len(lines)
-		for line in lines:
-			line_parts = line.split(',')
-			if len(line_parts) == 4:
-				gpu += int(line_parts[3])
-			else:
-				line_count -= 1
-		gpu = gpu / line_count
+		mem = psutil.virtual_memory()
+		i = 0
+		interims[i].append(loads[0])
+		i = i + 1
+		interims[i].append(loads[1])
+		i = i + 1
+		interims[i].append(loads[2])
+		i = i + 1
+		for a_cpu_pc in cpu_pcs:
+			interims[i].append(a_cpu_pc)
+			i = i + 1
+		interims[i].append(cpu_pc)
+		i = i + 1
+		interims[i].append(0) # Placeholder for GPU
+		i = i + 1
+		interims[i].append(cpu_states_pc.user)
+		i = i + 1
+		interims[i].append(cpu_states_pc.system)
+		i = i + 1
+		interims[i].append(cpu_states_pc.iowait)
+		i = i + 1
+		interims[i].append((mem.used / total_mem) * 100)
+		i = i + 1
+		interims[i].append(((mem.buffers + mem.cached) / total_mem) * 100)
 		time.sleep(0.1)
-		interims[0].append(loads[0])
-		interims[1].append(loads[1])
-		interims[2].append(loads[2])
-		for i in range(cpus):
-			interims[i+3].append(cpu_pcs[i])
-		i = cpus + 3
-		interims[i].append(cpu_pc)
-		i += 1
-		interims[i].append(gpu)
-		i += 1
-		interims[i].append(cpu_states_pc.user)
-		i += 1
-		interims[i].append(cpu_states_pc.system)
-		i += 1
-		interims[i].append(cpu_states_pc.iowait)
+
 	vals = []
-	for i in range(fields):
-		interim_vals = interims[i]
-		# Average most values
-		if i  < fields - 3:
+	for i, interim_vals in enumerate(interims):
+		if i  < last_avg_idx:
+			# Average most values
 			vals.append(sum(interim_vals) / 10)
 		else:
 			# But take the max CPU state value
 			vals.append(max(interim_vals))
+
+	while True:
+		line = nv_smi.stdout.readline()
+		if line[0] != '#':
+			vals[gpu_idx] = int(line[8:11])
+			break
+
 	rrdtool.update(DB, "N:{}".format(':'.join(str(val) for val in vals)))