"""Real-time tok/s + RSS sampler for run_thermal.sh.

Polls llama-server.log incrementally and the RSS log every `interval` seconds,
appending one row per interval to thermal_curve.csv:

    t_sec, tok_s_mean, tok_s_median, tok_s_p10, tok_s_n, rss_gb

Exits cleanly after `duration` seconds. Writes a final summary to
thermal_curve.json with cold/sustained/throttle stats.
"""
from __future__ import annotations

import argparse
import json
import re
import statistics
import time
from pathlib import Path


EVAL_RE = re.compile(
    r"eval time\s*=\s*[\d.]+\s*ms\s*/\s*(\d+)\s*(?:tokens|runs)\s*"
    r"\(\s*[\d.]+\s*ms per token,\s*([\d.]+)\s*tokens per second\)",
    re.IGNORECASE,
)
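# EVAL_RE targets llama.cpp timing lines; a representative hit (numbers
# invented for illustration, whitespace varies between builds):
#   eval time =  948.87 ms /  120 runs   (   7.91 ms per token,  126.47 tokens per second)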


def latest_rss_gb(rss_log: Path) -> float:
    """Return the most recent RSS reading in GB (0.0 if the log is unreadable)."""
    if not rss_log.exists():
        return 0.0
    try:
        with rss_log.open() as f:
            tail = f.readlines()[-3:]
        for line in reversed(tail):
            parts = line.split()
            if len(parts) >= 2 and parts[1].isdigit():
                return int(parts[1]) / 1024 / 1024  # KiB -> GB
    except Exception:
        pass  # racing the writer; report 0.0 and retry next interval
    return 0.0
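# The reading is treated as KiB (hence the /1024/1024 to reach GB). This
# assumes run_thermal.sh logs rows like "<timestamp> <rss_kib>", e.g.
# "1699999999 1234567" -> 1234567 KiB ≈ 1.177 GB; the producer's exact
# format is an assumption here, not something this script verifies.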


def percentile(values, p):
    """Nearest-rank percentile of `values` (0.0 for an empty list)."""
    if not values:
        return 0.0
    s = sorted(values)
    idx = max(0, min(len(s) - 1, int(round((p / 100.0) * (len(s) - 1)))))
    return s[idx]
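# Sanity check of the nearest-rank behavior (values invented for illustration):
#   percentile([5, 1, 9, 3], 50) -> 5   (sorted: [1, 3, 5, 9], index round(1.5) = 2)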


def main():
    p = argparse.ArgumentParser()
    p.add_argument("--llama-log", type=Path, required=True)
    p.add_argument("--rss-log", type=Path, required=True)
    p.add_argument("--out-csv", type=Path, required=True)
    p.add_argument("--out-json", type=Path, required=True)
    p.add_argument("--interval", type=int, default=30, help="seconds per sample window")
    p.add_argument("--duration", type=int, default=2700, help="seconds to sample (default 45 min)")
    p.add_argument("--min-tokens", type=int, default=8,
                   help="skip eval lines reporting fewer tokens than this (trivial bursts)")
    args = p.parse_args()

    # Wait up to 60 s for llama-server to create its log before tailing it.
    deadline = time.time() + 60
    while not args.llama_log.exists() and time.time() < deadline:
        time.sleep(1)
    if not args.llama_log.exists():
        print(f"llama log never appeared at {args.llama_log}", flush=True)
        return

    # Start tailing from the current end of the file so anything logged before
    # sampling began (startup, warm-up) is not counted.
    with args.llama_log.open() as f:
        f.seek(0, 2)  # seek to EOF
        last_pos = f.tell()

    args.out_csv.parent.mkdir(parents=True, exist_ok=True)
    csv_f = args.out_csv.open("w", buffering=1)  # line-buffered: rows land on disk immediately
    csv_f.write("t_sec,tok_s_mean,tok_s_median,tok_s_p10,tok_s_n,rss_gb\n")

    rows: list[dict] = []
    start = time.time()
    next_sample = start + args.interval

    while time.time() - start < args.duration:
        sleep_for = max(0.5, next_sample - time.time())
        time.sleep(sleep_for)

        # Read only the bytes appended to the log since the previous sample.
        try:
            with args.llama_log.open() as f:
                f.seek(last_pos)
                chunk = f.read()
                last_pos = f.tell()
        except FileNotFoundError:
            chunk = ""

        # Pull per-request decode rates out of the new log lines.
        rates = []
        for line in chunk.splitlines():
            if "prompt eval time" in line:
                continue  # prompt processing is not decode throughput
            m = EVAL_RE.search(line)
            if m:
                n_tok = int(m.group(1))
                tok_s = float(m.group(2))
                if n_tok >= args.min_tokens:
                    rates.append(tok_s)

        rss = latest_rss_gb(args.rss_log)
        t = round(time.time() - start, 1)
        if rates:
            mean = statistics.mean(rates)
            med = statistics.median(rates)
            p10 = percentile(rates, 10)
        else:
            mean = med = p10 = 0.0
        csv_f.write(f"{t:.1f},{mean:.2f},{med:.2f},{p10:.2f},{len(rates)},{rss:.3f}\n")
        rows.append({"t_sec": t, "tok_s_mean": mean, "tok_s_median": med,
                     "tok_s_p10": p10, "tok_s_n": len(rates), "rss_gb": rss})
        next_sample += args.interval

    csv_f.close()

    # Cold = best of the first three samples (first 90 s at the default interval);
    # sustained = median of the last five samples that saw any traffic.
    late = [r for r in rows[-5:] if r["tok_s_n"] > 0]
    all_rates = [r["tok_s_mean"] for r in rows if r["tok_s_n"] > 0]
    cold = max((r["tok_s_mean"] for r in rows[:3] if r["tok_s_n"] > 0), default=0.0)
    sustained = statistics.median([r["tok_s_mean"] for r in late]) if late else 0.0
    overall = statistics.median(all_rates) if all_rates else 0.0
    throttle_pct = (1 - sustained / cold) * 100 if cold > 0 else 0.0
    peak_rss = max((r["rss_gb"] for r in rows), default=0.0)
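    # Worked example with invented numbers: cold 14.0 tok/s and sustained
    # 11.9 tok/s give throttle_pct = (1 - 11.9 / 14.0) * 100 = 15.0, i.e. the
    # box lost 15% of its cold-start decode speed once thermals settled.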

    summary = {
        "duration_sec": args.duration,
        "interval_sec": args.interval,
        "n_samples": len(rows),
        "tok_s_cold": round(cold, 2),
        "tok_s_sustained_last5": round(sustained, 2),
        "tok_s_median_overall": round(overall, 2),
        "throttle_pct_cold_to_sustained": round(throttle_pct, 1),
        "peak_rss_gb": round(peak_rss, 3),
        "samples": rows,
    }
    args.out_json.write_text(json.dumps(summary, indent=2))
    print(f"Wrote {args.out_csv} and {args.out_json}")
    print(f"  cold:      {cold:.1f} tok/s")
    print(f"  sustained: {sustained:.1f} tok/s (last 5 samples)")
    print(f"  throttle:  {throttle_pct:+.1f}% (cold → sustained)")
    print(f"  peak rss:  {peak_rss:.2f} GB")


if __name__ == "__main__":
    main()