diff load-record @ 29:c2584db4a650
Make load recording more flexible and efficient
* Works with a variable number of CPUs
* Use the Python `enumerate()` function where possible
* Use a long-running NVidia monitoring process (`nvidia-smi dmon`) instead of spawning one per sample (see the sketch below)
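
The substance of the last point is replacing a per-sample `subprocess.run()` call with one long-lived `nvidia-smi dmon` pipe that the main loop reads from (see the diff below). A minimal standalone sketch of that pattern, separate from the script itself; it assumes `nvidia-smi` is on the PATH and that the `sm` utilisation figure is the second whitespace-separated column of `dmon -s u` output, which may vary between driver versions:

```python
import subprocess

# Keep a single long-running "nvidia-smi dmon" process and read one line per
# sample, instead of paying process start-up cost on every update.
proc = subprocess.Popen(['nvidia-smi', 'dmon', '-s', 'u'],
                        stdout=subprocess.PIPE,
                        universal_newlines=True)

def read_gpu_util(pipe):
    """Return the next GPU utilisation sample (0-100) from a dmon pipe."""
    while True:
        line = pipe.stdout.readline()
        if not line:                 # dmon exited; treat as an error
            raise RuntimeError('nvidia-smi dmon terminated')
        if line.startswith('#'):     # skip the repeating header rows
            continue
        # Assumed column order for "dmon -s u": gpu sm mem enc dec,
        # where "sm" is the GPU utilisation percentage.
        return int(line.split()[1])

print(read_gpu_util(proc))
```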
author | IBBoard <dev@ibboard.co.uk> |
---|---|
date | Wed, 30 Dec 2020 17:22:29 +0000 |
parents | e245a271fc44 |
children | ccc8f0903d2e |
--- a/load-record	Wed Dec 30 17:14:21 2020 +0000
+++ b/load-record	Wed Dec 30 17:22:29 2020 +0000
@@ -12,70 +12,109 @@
 home = str(Path.home())
 DB = os.path.join(home, ".load.rrd")
 
-if not os.path.exists(DB):
+cpus = psutil.cpu_count()
+
+config = [
+    ['load_1', 'GAUGE', 2, 0, 100],
+    ['load_5', 'GAUGE', 2, 0, 100],
+    ['load_15', 'GAUGE', 2, 0, 100],
+    *[[f'core{i+1}', 'GAUGE', 2, 0, 100] for i in range(cpus)],
+    ['core_avg', 'GAUGE', 2, 0, 100],
+    ['GPU', 'GAUGE', 2, 0, 100],
+    ['user', 'GAUGE', 2, 0, 100],
+    ['system', 'GAUGE', 2, 0, 100],
+    ['iowait', 'GAUGE', 2, 0, 100],
+    ['mem_used', 'GAUGE', 2, 0, 100],
+    ['mem_buffers', 'GAUGE', 2, 0, 100],
+]
+
+fields = len(config)
+
+def needs_creating():
+    if not os.path.exists(DB):
+        return True
+    else:
+        cur_config = rrdtool.info(DB)
+        for i, entry in enumerate(config):
+            key, datatype, heartbeat, minval, maxval = entry
+            if f"ds[{key}].index" not in cur_config or \
+                    cur_config[f"ds[{key}].index"] != i:
+                return True
+            elif cur_config[f"ds[{key}].type"] != datatype or \
+                    cur_config[f"ds[{key}].minimal_heartbeat"] != heartbeat:
+                # We don't appear to be able to check min/max from info
+                return True
+    return False
+
+# TODO: Add "pressure" support - relies on "psi=1" in kernel and /proc/pressure/… existing
+if needs_creating():
     rrdtool.create(DB, '--step', '1',
-                   'DS:load_1:GAUGE:2:0.0:100.0',
-                   'DS:load_5:GAUGE:2:0.0:100.0',
-                   'DS:load_15:GAUGE:2:0.0:100.0',
-                   'DS:core1:GAUGE:2:0.0:100.0',
-                   'DS:core2:GAUGE:2:0.0:100.0',
-                   'DS:core3:GAUGE:2:0.0:100.0',
-                   'DS:core4:GAUGE:2:0.0:100.0',
-                   'DS:core_avg:GAUGE:2:0.0:100.0',
-                   'DS:GPU:GAUGE:2:0.0:100.0',
-                   'DS:user:GAUGE:2:0.0:100.0',
-                   'DS:system:GAUGE:2:0.0:100.0',
-                   'DS:iowait:GAUGE:2:0.0:100.0',
+                   *[f'DS:{key}:{datatype}:{heartbeat}:{minval}:{maxval}' for \
+                     key, datatype, heartbeat, minval, maxval in config],
                    'RRA:AVERAGE:0.5:1:3600', #1hr of 1s interval (averaged)
                    'RRA:MAX:0.5:60:360', #6hrs of 1 minute
                    'RRA:MAX:0.5:300:8640') #and 1mo of 5m resolution
 
-cpus = psutil.cpu_count()
-fields = 12
+samples = 10
+gpu_idx = 3 + cpus + 1
+last_avg_idx = 3 + cpus + 2 # load + CPUs + CPU average + GPU
+total_mem = psutil.virtual_memory().total
+
+# Use dmon and assume we'll keep to roughly every second
+# to keep up with its output to reduce the popen overhead
+nv_smi = subprocess.Popen(['nvidia-smi', 'dmon', '-s', 'u'],
+                          stdout=subprocess.PIPE,
+                          universal_newlines=True)
 
 while True:
     interims = [[] for _ in range(fields)]
+    # Average 10 values internally to reduce RRD updates
+    # and to allow us to max some stats rather than average
    for _ in range(0, 10):
         cpu_pcs = psutil.cpu_percent(percpu=True)
         cpu_pc = sum(cpu_pcs) / cpus
         #TODO: If cpu_pc > 25% (?) log top processes
         cpu_states_pc = psutil.cpu_times_percent()
         loads = os.getloadavg()
-        # TODO: Include GPU load
-        nv_smi = subprocess.run(["nvidia-smi", "stats", "-d", "gpuUtil", "-c", "1"], stdout=subprocess.PIPE, universal_newlines=True)
-        gpu = 0
-        lines = nv_smi.stdout.split('\n')
-        line_count = len(lines)
-        for line in lines:
-            line_parts = line.split(',')
-            if len(line_parts) == 4:
-                gpu += int(line_parts[3])
-            else:
-                line_count -= 1
-        gpu = gpu / line_count
+        mem = psutil.virtual_memory()
+        i = 0
+        interims[i].append(loads[0])
+        i = i + 1
+        interims[i].append(loads[1])
+        i = i + 1
+        interims[i].append(loads[2])
+        i = i + 1
+        for a_cpu_pc in cpu_pcs:
+            interims[i].append(a_cpu_pc)
+            i = i + 1
+        interims[i].append(cpu_pc)
+        i = i + 1
+        interims[i].append(0) # Placeholder for GPU
+        i = i + 1
+        interims[i].append(cpu_states_pc.user)
+        i = i + 1
+        interims[i].append(cpu_states_pc.system)
+        i = i + 1
+        interims[i].append(cpu_states_pc.iowait)
+        i = i + 1
+        interims[i].append((mem.used / total_mem) * 100)
+        i = i + 1
+        interims[i].append(((mem.buffers + mem.cached) / total_mem) * 100)
         time.sleep(0.1)
-        interims[0].append(loads[0])
-        interims[1].append(loads[1])
-        interims[2].append(loads[2])
-        for i in range(cpus):
-            interims[i+3].append(cpu_pcs[i])
-        i = cpus + 3
-        interims[i].append(cpu_pc)
-        i += 1
-        interims[i].append(gpu)
-        i += 1
-        interims[i].append(cpu_states_pc.user)
-        i += 1
-        interims[i].append(cpu_states_pc.system)
-        i += 1
-        interims[i].append(cpu_states_pc.iowait)
+
     vals = []
-    for i in range(fields):
-        interim_vals = interims[i]
-        # Average most values
-        if i < fields - 3:
+    for i, interim_vals in enumerate(interims):
+        if i < last_avg_idx:
+            # Average most values
             vals.append(sum(interim_vals) / 10)
         else:
             # But take the max CPU state value
             vals.append(max(interim_vals))
+
+    while True:
+        line = nv_smi.stdout.readline()
+        if line[0] != '#':
+            vals[gpu_idx] = int(line[8:11])
+            break
+
     rrdtool.update(DB, "N:{}".format(':'.join(str(val) for val in vals)))
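
For reference, the `DS:` definitions are now derived from the `config` table and unpacked into `rrdtool.create()` instead of being hard-coded for four cores. A small sketch of just the string generation, using a hypothetical two-core count so it runs without `rrdtool` or `psutil`:

```python
# Generate RRD data-source specs from the config table; splatting the list
# into rrdtool.create() gives the same arguments as the old hard-coded form.
cpus = 2  # stand-in for psutil.cpu_count() on a hypothetical two-core box

config = [
    ['load_1', 'GAUGE', 2, 0, 100],
    *[[f'core{i+1}', 'GAUGE', 2, 0, 100] for i in range(cpus)],
    ['GPU', 'GAUGE', 2, 0, 100],
]

specs = [f'DS:{key}:{datatype}:{heartbeat}:{minval}:{maxval}'
         for key, datatype, heartbeat, minval, maxval in config]

print('\n'.join(specs))
# DS:load_1:GAUGE:2:0:100
# DS:core1:GAUGE:2:0:100
# DS:core2:GAUGE:2:0:100
# DS:GPU:GAUGE:2:0:100
```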
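
The `gpu_idx` and `last_avg_idx` offsets follow the `config` order: three load averages, one entry per core, then `core_avg` and `GPU`, with the user/system/iowait/memory fields after that. Worked through for a hypothetical four-core machine:

```python
cpus = 4  # hypothetical four-core machine

# Field order mirrors config: load_1/5/15 (0-2), core1..core4 (3-6),
# core_avg (7), GPU (8), then user/system/iowait/mem_used/mem_buffers (9-13).
gpu_idx = 3 + cpus + 1       # index of the GPU placeholder -> 8
last_avg_idx = 3 + cpus + 2  # first field to max() instead of average -> 9

assert gpu_idx == 8
assert last_avg_idx == 9
```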