#!/usr/bin/env python3
"""Real-time tok/s + RSS sampler for run_thermal.sh.
Polls llama-server.log incrementally and the RSS log every `interval` seconds,
appending one row per interval to thermal_curve.csv:
t_sec, tok_s_mean, tok_s_median, tok_s_p10, tok_s_n, rss_gb
Exits cleanly after `duration` seconds. Writes a final summary to
thermal_curve.json with cold/sustained/throttle stats.
"""
from __future__ import annotations
import argparse
import json
import re
import statistics
import time
from pathlib import Path
EVAL_RE = re.compile(
r"eval time\s*=\s*[\d.]+\s*ms\s*/\s*(\d+)\s*(?:tokens|runs)\s*"
r"\(\s*[\d.]+\s*ms per token,\s*([\d.]+)\s*tokens per second\)",
re.IGNORECASE,
)
def latest_rss_gb(rss_log: Path) -> float:
if not rss_log.exists():
return 0.0
try:
with rss_log.open() as f:
tail = f.readlines()[-3:]
for line in reversed(tail):
parts = line.split()
if len(parts) >= 2 and parts[1].isdigit():
return int(parts[1]) / 1024 / 1024
except Exception:
pass
return 0.0
def percentile(values, p):
if not values:
return 0.0
s = sorted(values)
idx = max(0, min(len(s) - 1, int(round((p / 100.0) * (len(s) - 1)))))
return s[idx]
def main():
p = argparse.ArgumentParser()
p.add_argument("--llama-log", type=Path, required=True)
p.add_argument("--rss-log", type=Path, required=True)
p.add_argument("--out-csv", type=Path, required=True)
p.add_argument("--out-json", type=Path, required=True)
p.add_argument("--interval", type=int, default=30, help="seconds per sample window")
p.add_argument("--duration", type=int, default=2700, help="seconds to sample (default 45 min)")
p.add_argument("--min-tokens", type=int, default=8,
help="filter eval lines with fewer tokens than this (skip trivial bursts)")
args = p.parse_args()
# Wait until llama-server log exists
deadline = time.time() + 60
while not args.llama_log.exists() and time.time() < deadline:
time.sleep(1)
if not args.llama_log.exists():
print(f"llama log never appeared at {args.llama_log}", flush=True)
return
last_pos = 0
with args.llama_log.open() as f:
f.seek(0, 2) # skip startup lines (model load, etc.)
last_pos = f.tell()
args.out_csv.parent.mkdir(parents=True, exist_ok=True)
csv = args.out_csv.open("w", buffering=1)
csv.write("t_sec,tok_s_mean,tok_s_median,tok_s_p10,tok_s_n,rss_gb\n")
rows: list[dict] = []
start = time.time()
next_sample = start + args.interval
while time.time() - start < args.duration:
sleep_for = max(0.5, next_sample - time.time())
time.sleep(sleep_for)
# Read all new content since last poll
try:
with args.llama_log.open() as f:
f.seek(last_pos)
chunk = f.read()
last_pos = f.tell()
except FileNotFoundError:
chunk = ""
rates = []
# Process line-by-line to filter out prompt-eval lines (which would
# otherwise inflate decode tok/s by ~10x).
for line in chunk.splitlines():
if "prompt eval time" in line:
continue
m = EVAL_RE.search(line)
if m:
n_tok = int(m.group(1))
tok_s = float(m.group(2))
if n_tok >= args.min_tokens:
rates.append(tok_s)
rss = latest_rss_gb(args.rss_log)
t = round(time.time() - start, 1)
if rates:
mean = statistics.mean(rates)
med = statistics.median(rates)
p10 = percentile(rates, 10)
else:
mean = med = p10 = 0.0
csv.write(f"{t:.1f},{mean:.2f},{med:.2f},{p10:.2f},{len(rates)},{rss:.3f}\n")
rows.append({"t_sec": t, "tok_s_mean": mean, "tok_s_median": med,
"tok_s_p10": p10, "tok_s_n": len(rates), "rss_gb": rss})
next_sample += args.interval
csv.close()
# Summary stats
early = [r for r in rows if r["t_sec"] <= 60 and r["tok_s_n"] > 0]
late = [r for r in rows[-min(len(rows), 5):] if r["tok_s_n"] > 0]
all_rates = [r["tok_s_mean"] for r in rows if r["tok_s_n"] > 0]
cold = max((r["tok_s_mean"] for r in rows[:3] if r["tok_s_n"] > 0), default=0.0)
sustained = statistics.median([r["tok_s_mean"] for r in late]) if late else 0.0
overall = statistics.median(all_rates) if all_rates else 0.0
throttle_pct = (1 - sustained / cold) * 100 if cold > 0 else 0.0
peak_rss = max((r["rss_gb"] for r in rows), default=0.0)
summary = {
"duration_sec": args.duration,
"interval_sec": args.interval,
"n_samples": len(rows),
"tok_s_cold": round(cold, 2),
"tok_s_sustained_last5": round(sustained, 2),
"tok_s_median_overall": round(overall, 2),
"throttle_pct_cold_to_sustained": round(throttle_pct, 1),
"peak_rss_gb": round(peak_rss, 3),
"samples": rows,
}
args.out_json.write_text(json.dumps(summary, indent=2))
print(f"Wrote {args.out_csv} and {args.out_json}")
print(f" cold: {cold:.1f} tok/s")
print(f" sustained: {sustained:.1f} tok/s (last 5 samples)")
print(f" throttle: {throttle_pct:+.1f}% (cold → sustained)")
print(f" peak rss: {peak_rss:.2f} GB")
if __name__ == "__main__":
main()