#!/usr/bin/env python3
"""Real-time tok/s + RSS sampler for run_thermal.sh.
Polls llama-server.log incrementally and the RSS log every `interval` seconds,
appending one row per interval to thermal_curve.csv:
t_sec, tok_s_mean, tok_s_p10, tok_s_n, rss_gb
Exits cleanly after `duration` seconds. Writes a final summary to
thermal_curve.json with cold/sustained/throttle stats.
"""
from __future__ import annotations

import argparse
import json
import re
import statistics
import time
from pathlib import Path

EVAL_RE = re.compile(
    r"eval time\s*=\s*[\d.]+\s*ms\s*/\s*(\d+)\s*(?:tokens|runs)\s*"
    r"\(\s*[\d.]+\s*ms per token,\s*([\d.]+)\s*tokens per second\)",
    re.IGNORECASE,
)
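# EVAL_RE is written against llama.cpp timing output; column widths vary by
# version, but the assumed shape of a matching line is roughly:
#   eval time =  9503.21 ms /   256 tokens (  37.12 ms per token,  26.94 tokens per second)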


def latest_rss_gb(rss_log: Path) -> float:
    """Return the most recent RSS sample in GB, or 0.0 if unavailable.

    Assumes whitespace-separated log rows whose second field is the resident
    set size in KiB (e.g. as sampled from `ps`), hence the two /1024 steps.
    """
    if not rss_log.exists():
        return 0.0
    try:
        with rss_log.open() as f:
            tail = f.readlines()[-3:]
        for line in reversed(tail):
            parts = line.split()
            if len(parts) >= 2 and parts[1].isdigit():
                return int(parts[1]) / 1024 / 1024  # KiB -> GB
    except Exception:
        pass
    return 0.0


def percentile(values, p):
    """Nearest-rank percentile: p in [0, 100] over an unsorted list."""
    if not values:
        return 0.0
    s = sorted(values)
    idx = max(0, min(len(s) - 1, int(round((p / 100.0) * (len(s) - 1)))))
    return s[idx]
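# Illustrative check of the nearest-rank behaviour: with ten samples, the 10th
# percentile lands at index round(0.10 * 9) == 1, i.e. the second-lowest rate,
# so a single stray outlier cannot drag p10 all the way to the minimum.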


def main():
    p = argparse.ArgumentParser()
    p.add_argument("--llama-log", type=Path, required=True)
    p.add_argument("--rss-log", type=Path, required=True)
    p.add_argument("--out-csv", type=Path, required=True)
    p.add_argument("--out-json", type=Path, required=True)
    p.add_argument("--interval", type=int, default=30, help="seconds per sample window")
    p.add_argument("--duration", type=int, default=2700, help="seconds to sample (default 45 min)")
    p.add_argument("--min-tokens", type=int, default=8,
                   help="filter eval lines with fewer tokens than this (skip trivial bursts)")
    args = p.parse_args()

    # Wait up to 60 s for the llama-server log to appear.
    deadline = time.time() + 60
    while not args.llama_log.exists() and time.time() < deadline:
        time.sleep(1)
    if not args.llama_log.exists():
        print(f"llama log never appeared at {args.llama_log}", flush=True)
        return

    # Tail from the current end of the log so startup noise is not sampled.
    last_pos = 0
    with args.llama_log.open() as f:
        f.seek(0, 2)  # seek to EOF: skip startup lines (model load, etc.)
        last_pos = f.tell()

    args.out_csv.parent.mkdir(parents=True, exist_ok=True)
    csv = args.out_csv.open("w", buffering=1)  # line-buffered: each row lands immediately
    csv.write("t_sec,tok_s_mean,tok_s_median,tok_s_p10,tok_s_n,rss_gb\n")

    rows: list[dict] = []
    start = time.time()
    next_sample = start + args.interval
    while time.time() - start < args.duration:
        sleep_for = max(0.5, next_sample - time.time())
        time.sleep(sleep_for)
        # Read all new content since the last poll.
        try:
            with args.llama_log.open() as f:
                f.seek(last_pos)
                chunk = f.read()
                last_pos = f.tell()
        except FileNotFoundError:
            chunk = ""
        rates = []
        # Process line-by-line to filter out prompt-eval lines (which would
        # otherwise inflate decode tok/s by ~10x).
        for line in chunk.splitlines():
            if "prompt eval time" in line:
                continue
            m = EVAL_RE.search(line)
            if m:
                n_tok = int(m.group(1))
                tok_s = float(m.group(2))
                if n_tok >= args.min_tokens:
                    rates.append(tok_s)
        rss = latest_rss_gb(args.rss_log)
        t = round(time.time() - start, 1)
        if rates:
            mean = statistics.mean(rates)
            med = statistics.median(rates)
            p10 = percentile(rates, 10)
        else:
            mean = med = p10 = 0.0
        csv.write(f"{t:.1f},{mean:.2f},{med:.2f},{p10:.2f},{len(rates)},{rss:.3f}\n")
        rows.append({"t_sec": t, "tok_s_mean": mean, "tok_s_median": med,
                     "tok_s_p10": p10, "tok_s_n": len(rates), "rss_gb": rss})
        next_sample += args.interval
    csv.close()

    # Summary stats: "cold" = best mean among the first 3 samples,
    # "sustained" = median of the last 5 samples that saw any decode activity.
    late = [r for r in rows[-min(len(rows), 5):] if r["tok_s_n"] > 0]
    all_rates = [r["tok_s_mean"] for r in rows if r["tok_s_n"] > 0]
    cold = max((r["tok_s_mean"] for r in rows[:3] if r["tok_s_n"] > 0), default=0.0)
    sustained = statistics.median([r["tok_s_mean"] for r in late]) if late else 0.0
    overall = statistics.median(all_rates) if all_rates else 0.0
    throttle_pct = (1 - sustained / cold) * 100 if cold > 0 else 0.0
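    # Worked example with illustrative numbers: cold = 30.0 tok/s and
    # sustained = 25.5 tok/s give (1 - 25.5 / 30.0) * 100 = +15.0, so a
    # positive throttle_pct means the sustained rate is slower than cold.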
    peak_rss = max((r["rss_gb"] for r in rows), default=0.0)

    summary = {
        "duration_sec": args.duration,
        "interval_sec": args.interval,
        "n_samples": len(rows),
        "tok_s_cold": round(cold, 2),
        "tok_s_sustained_last5": round(sustained, 2),
        "tok_s_median_overall": round(overall, 2),
        "throttle_pct_cold_to_sustained": round(throttle_pct, 1),
        "peak_rss_gb": round(peak_rss, 3),
        "samples": rows,
    }
    args.out_json.write_text(json.dumps(summary, indent=2))
    print(f"Wrote {args.out_csv} and {args.out_json}")
    print(f" cold: {cold:.1f} tok/s")
    print(f" sustained: {sustained:.1f} tok/s (last 5 samples)")
    print(f" throttle: {throttle_pct:+.1f}% (cold → sustained)")
    print(f" peak rss: {peak_rss:.2f} GB")


if __name__ == "__main__":
    main()