Mercurial > repos > other > usr-local-bin
changeset 29:c2584db4a650
Make load recording more flexible and efficient
* Works with variable number of CPUs
* Use Python `enumerate()` function where possible
* Use a single long-running NVIDIA process (`nvidia-smi dmon`) instead of launching `nvidia-smi` for every sample
author | IBBoard <dev@ibboard.co.uk> |
---|---|
date | Wed, 30 Dec 2020 17:22:29 +0000 |
parents | 278968272584 |
children | dc0cf6bc9fbe |
files | load-graph load-record |
diffstat | 2 files changed, 102 insertions(+), 55 deletions(-) [+] |
line wrap: on
line diff
--- a/load-graph Wed Dec 30 17:14:21 2020 +0000 +++ b/load-graph Wed Dec 30 17:22:29 2020 +0000 @@ -1,7 +1,7 @@ #! /bin/bash start="-6h" -end="0" +end="-0h" if [ $# -eq 1 ] then @@ -12,18 +12,20 @@ end=$2 fi +cpu_count=$(lscpu|grep '^CPU(s)'|awk '{print $2}') + +cpu_lines=() + +for ((i=1; i<=cpu_count; i++)); do + cpu_lines+=("DEF:core${i}=$HOME/.load.rrd:core${i}:AVERAGE") + cpu_lines+=("LINE1:core${i}#99000040:\"Core-${i}\"") +done + rrdtool graph /tmp/load-percent.png -w 1280 -h 1024 -a PNG --start "$start" --end "$end" \ --vertical-label "Usage (%)" \ - DEF:core1=$HOME/.load.rrd:core1:AVERAGE \ - DEF:core2=$HOME/.load.rrd:core2:AVERAGE \ - DEF:core3=$HOME/.load.rrd:core3:AVERAGE \ - DEF:core4=$HOME/.load.rrd:core4:AVERAGE \ + ${cpu_lines[@]} \ DEF:CPU=$HOME/.load.rrd:core_avg:AVERAGE \ DEF:GPU=$HOME/.load.rrd:GPU:AVERAGE \ - LINE1:core1#99000030:"Core 1" \ - LINE1:core2#99000030:"Core 2" \ - LINE1:core3#99000030:"Core 3" \ - LINE1:core4#99000030:"Core 4" \ LINE2:CPU#990000:"CPU" \ LINE2:GPU#009900:"GPU" \ && rrdtool graph /tmp/load-type.png -w 1280 -h 1024 -a PNG --start "$start" --end "$end" \ @@ -34,5 +36,11 @@ AREA:iowait#888888:"I/O Wait" \ STACK:system#6666FF:"System" \ STACK:user#CCCC66:"User" \ +&& rrdtool graph /tmp/load-memory.png -w 1280 -h 1024 -a PNG --start "$start" --end "$end" \ + --vertical-label "Usage (%)" \ + DEF:mem_used=$HOME/.load.rrd:mem_used:AVERAGE \ + DEF:mem_buffers=$HOME/.load.rrd:mem_buffers:AVERAGE \ + AREA:mem_used#888888:"Used" \ + STACK:mem_buffers#cccccc:"Buffers" \ && eog /tmp/load-*.png rm -f /tmp/load-*.png
--- a/load-record Wed Dec 30 17:14:21 2020 +0000 +++ b/load-record Wed Dec 30 17:22:29 2020 +0000 @@ -12,70 +12,109 @@ home = str(Path.home()) DB = os.path.join(home, ".load.rrd") -if not os.path.exists(DB): +cpus = psutil.cpu_count() + +config = [ + ['load_1', 'GAUGE', 2, 0, 100], + ['load_5', 'GAUGE', 2, 0, 100], + ['load_15', 'GAUGE', 2, 0, 100], + *[[f'core{i+1}', 'GAUGE', 2, 0, 100] for i in range(cpus)], + ['core_avg', 'GAUGE', 2, 0, 100], + ['GPU', 'GAUGE', 2, 0, 100], + ['user', 'GAUGE', 2, 0, 100], + ['system', 'GAUGE', 2, 0, 100], + ['iowait', 'GAUGE', 2, 0, 100], + ['mem_used', 'GAUGE', 2, 0, 100], + ['mem_buffers', 'GAUGE', 2, 0, 100], +] + +fields = len(config) + +def needs_creating(): + if not os.path.exists(DB): + return True + else: + cur_config = rrdtool.info(DB) + for i, entry in enumerate(config): + key, datatype, heartbeat, minval, maxval = entry + if f"ds[{key}].index" not in cur_config or \ + cur_config[f"ds[{key}].index"] != i: + return True + elif cur_config[f"ds[{key}].type"] != datatype or \ + cur_config[f"ds[{key}].minimal_heartbeat"] != heartbeat: + # We don't appear to be able to check min/max from info + return True + return False + +# TODO: Add "pressure" support - relies on "psi=1" in kernel and /proc/pressure/… existing +if needs_creating(): rrdtool.create(DB, '--step', '1', - 'DS:load_1:GAUGE:2:0.0:100.0', - 'DS:load_5:GAUGE:2:0.0:100.0', - 'DS:load_15:GAUGE:2:0.0:100.0', - 'DS:core1:GAUGE:2:0.0:100.0', - 'DS:core2:GAUGE:2:0.0:100.0', - 'DS:core3:GAUGE:2:0.0:100.0', - 'DS:core4:GAUGE:2:0.0:100.0', - 'DS:core_avg:GAUGE:2:0.0:100.0', - 'DS:GPU:GAUGE:2:0.0:100.0', - 'DS:user:GAUGE:2:0.0:100.0', - 'DS:system:GAUGE:2:0.0:100.0', - 'DS:iowait:GAUGE:2:0.0:100.0', + *[f'DS:{key}:{datatype}:{heartbeat}:{minval}:{maxval}' for \ + key, datatype, heartbeat, minval, maxval in config], 'RRA:AVERAGE:0.5:1:3600', #1hr of 1s interval (averaged) 'RRA:MAX:0.5:60:360', #6hrs of 1 minute 'RRA:MAX:0.5:300:8640') #and 1mo of 5m resolution -cpus = 
psutil.cpu_count() -fields = 12 +samples = 10 +gpu_idx = 3 + cpus + 1 +last_avg_idx = 3 + cpus + 2 # load + CPUs + CPU average + GPU +total_mem = psutil.virtual_memory().total + +# Use dmon and assume we'll keep to roughly every second +# to keep up with its output to reduce the popen overhead +nv_smi = subprocess.Popen(['nvidia-smi', 'dmon', '-s', 'u'], + stdout=subprocess.PIPE, + universal_newlines=True) while True: interims = [[] for _ in range(fields)] + # Average 10 values internally to reduce RRD updates + # and to allow us to max some stats rather than average for _ in range(0, 10): cpu_pcs = psutil.cpu_percent(percpu=True) cpu_pc = sum(cpu_pcs) / cpus #TODO: If cpu_pc > 25% (?) log top processes cpu_states_pc = psutil.cpu_times_percent() loads = os.getloadavg() - # TODO: Include GPU load - nv_smi = subprocess.run(["nvidia-smi", "stats", "-d", "gpuUtil", "-c", "1"], stdout=subprocess.PIPE, universal_newlines=True) - gpu = 0 - lines = nv_smi.stdout.split('\n') - line_count = len(lines) - for line in lines: - line_parts = line.split(',') - if len(line_parts) == 4: - gpu += int(line_parts[3]) - else: - line_count -= 1 - gpu = gpu / line_count + mem = psutil.virtual_memory() + i = 0 + interims[i].append(loads[0]) + i = i + 1 + interims[i].append(loads[1]) + i = i + 1 + interims[i].append(loads[2]) + i = i + 1 + for a_cpu_pc in cpu_pcs: + interims[i].append(a_cpu_pc) + i = i + 1 + interims[i].append(cpu_pc) + i = i + 1 + interims[i].append(0) # Placeholder for GPU + i = i + 1 + interims[i].append(cpu_states_pc.user) + i = i + 1 + interims[i].append(cpu_states_pc.system) + i = i + 1 + interims[i].append(cpu_states_pc.iowait) + i = i + 1 + interims[i].append((mem.used / total_mem) * 100) + i = i + 1 + interims[i].append(((mem.buffers + mem.cached) / total_mem) * 100) time.sleep(0.1) - interims[0].append(loads[0]) - interims[1].append(loads[1]) - interims[2].append(loads[2]) - for i in range(cpus): - interims[i+3].append(cpu_pcs[i]) - i = cpus + 3 - 
interims[i].append(cpu_pc) - i += 1 - interims[i].append(gpu) - i += 1 - interims[i].append(cpu_states_pc.user) - i += 1 - interims[i].append(cpu_states_pc.system) - i += 1 - interims[i].append(cpu_states_pc.iowait) + vals = [] - for i in range(fields): - interim_vals = interims[i] - # Average most values - if i < fields - 3: + for i, interim_vals in enumerate(interims): + if i < last_avg_idx: + # Average most values vals.append(sum(interim_vals) / 10) else: # But take the max CPU state value vals.append(max(interim_vals)) + + while True: + line = nv_smi.stdout.readline() + if line[0] != '#': + vals[gpu_idx] = int(line[8:11]) + break + rrdtool.update(DB, "N:{}".format(':'.join(str(val) for val in vals)))