#!/usr/bin/env python3
r"""fill_submission.py — turn measured results into ready-to-paste submission numbers.

Reads the before/after benchmark JSONs (and, if given, the HumanEval/parity JSON),
computes the headline figures (speedup, tau, TTFT delta, pass@1, parity verdict),
and PRINTS:
  * a warning if the data is STILL STUB (shape-only) — so you never submit fake numbers,
  * the values to drop into MODEL_CARD.md / RESULTS.html,
  * a filled one-line claim for the demo.

It does NOT edit files — paste the numbers yourself, so nothing is silently overwritten.

Usage:
  python scripts/fill_submission.py \
      --baseline results/baseline.json --dflash results/dflash.json \
      [--humaneval results/humaneval_dflash.json]
"""
from __future__ import annotations

import argparse
import json
import math
from collections.abc import Sequence
from pathlib import Path
from typing import Any


def _load(path: str) -> dict[str, Any]:
    """Parse the JSON document stored at *path*.

    The file is read as UTF-8 rather than the locale encoding so the result
    does not depend on the platform the script runs on.
    """
    return json.loads(Path(path).read_text(encoding="utf-8"))


def _looks_stub(obj: dict[str, Any]) -> bool:
    """Heuristic: the dress-rehearsal stub stamps a tell-tale completion string.

    Returns True when any run's ``text`` contains "stub completion", or when
    ``base_url`` ends in ``:8000`` / ``:8001`` and any run's ``text`` contains
    "stub". A null ``base_url`` and non-dict entries in ``runs`` are tolerated
    instead of raising.
    """
    texts = [
        str(run.get("text", "")).lower()
        for run in obj.get("runs") or []
        if isinstance(run, dict)
    ]
    if any("stub completion" in text for text in texts):
        return True
    # `or ""` covers an explicit JSON null, which dict.get's default does not.
    base_url = str(obj.get("base_url") or "")
    return base_url.endswith((":8000", ":8001")) and any("stub" in text for text in texts)


def _g(obj: dict[str, Any], *keys: str, default: Any = None) -> Any:
    """Return the value of the first of *keys* present in *obj*, else *default*.

    A key that is present with a null value wins over later keys and over
    *default*; callers that need a number should go through ``_num``.
    """
    for key in keys:
        if key in obj:
            return obj[key]
    return default


def _num(obj: dict[str, Any], *keys: str) -> float | None:
    """Look up the first present key and coerce it to a finite float.

    Returns None when the key is absent, null, a bool, not convertible, or
    NaN/inf, so a missing measurement is rendered as "—" rather than as a
    made-up 0.0.
    """
    value = _g(obj, *keys)
    if value is None or isinstance(value, bool):
        return None
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    return number if math.isfinite(number) else None


def _fmt(x: Any, nd: int = 1, suffix: str = "") -> str:
    """Format a number with *nd* decimals plus *suffix*; anything else becomes "—"."""
    return f"{x:.{nd}f}{suffix}" if isinstance(x, (int, float)) else "—"


def _parity_verdict(lossless: Any, mismatches: Any) -> tuple[bool, str]:
    """Combine the ``lossless`` flag and the mismatch count into a verdict.

    Returns ``(verified, label)``. ``verified`` is True only when the data
    supports a lossless claim; a non-zero mismatch count or an explicit
    ``lossless=False`` always wins over the other field.
    """
    # A bool is not a meaningful count even though it is an int subclass.
    has_count = isinstance(mismatches, (int, float)) and not isinstance(mismatches, bool)
    if has_count and mismatches != 0:
        return False, f"{mismatches} mismatches ⚠️"
    if lossless is False:
        return False, "NOT lossless ⚠️ (parity run reported lossless=false)"
    if lossless is True or has_count:
        return True, "LOSSLESS ✓ (0 mismatches)"
    if mismatches is not None:
        return False, f"{mismatches} mismatches ⚠️"
    return False, "— (run --parity)"


def main(argv: Sequence[str] | None = None) -> int:
    """Print the submission figures computed from the benchmark JSON files.

    Args:
        argv: Command-line arguments without the program name; None makes
            argparse read ``sys.argv``.

    Returns:
        0 on success, 2 when stub data was detected, 3 when the baseline or
        dflash results file does not exist.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # Keep the usage example's line breaks instead of reflowing them.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--baseline", default="results/baseline.json")
    ap.add_argument("--dflash", default="results/dflash.json")
    ap.add_argument("--humaneval", default=None,
                    help="optional pass@1 / parity JSON from humaneval_subset.py")
    args = ap.parse_args(argv)

    for p in (args.baseline, args.dflash):
        if not Path(p).exists():
            print(f"no results yet at {p} — run the A/B (scripts/hf_job_ab.py) or 'make rehearse' first.")
            return 3

    base = _load(args.baseline)
    dfl = _load(args.dflash)

    stub = _looks_stub(base) or _looks_stub(dfl)
    if stub:
        print("=" * 64)
        print(" ⚠️ STUB DATA DETECTED — do NOT submit these numbers.")
        print(" These are shape-only dress-rehearsal results. Re-run measure.py")
        print(" against the real Laguna+DFlash vLLM endpoint, then re-run this.")
        print("=" * 64)

    b_tps = _num(base, "tokens_per_s_mean")
    d_tps = _num(dfl, "tokens_per_s_mean")
    b_ttft_s = _num(base, "ttft_s_mean")
    d_ttft_s = _num(dfl, "ttft_s_mean")
    b_ttft = b_ttft_s * 1000 if b_ttft_s is not None else None  # ms
    d_ttft = d_ttft_s * 1000 if d_ttft_s is not None else None  # ms
    tau = _num(dfl, "acceptance_length_tau")
    # Only computable with a non-zero baseline and a measured dflash rate.
    speedup = d_tps / b_tps if (b_tps and d_tps is not None) else None

    # optional quality / parity
    pass1 = parity = lossless = None
    if args.humaneval:
        if Path(args.humaneval).exists():
            he = _load(args.humaneval)
            pass1 = _g(he, "pass_at_1", "pass@1", "pass1")
            lossless = _g(he, "lossless")
            parity = _g(he, "mismatches", "token_mismatches")
        else:
            print(f"note: --humaneval file not found at {args.humaneval} — skipping pass@1 / parity.")
    parity_ok, parity_label = _parity_verdict(lossless, parity)

    print("\n--- HEADLINE (paste into MODEL_CARD.md + RESULTS.html) ---")
    print(f" baseline tokens/sec : {_fmt(b_tps)}")
    print(f" dflash tokens/sec : {_fmt(d_tps)}")
    print(f" speedup : {_fmt(speedup, 2, 'x')}")
    print(f" acceptance length tau: {_fmt(tau, 2) if tau is not None else '— (read from /metrics)'}")
    print(f" TTFT baseline / dflash (ms): {_fmt(b_ttft)} / {_fmt(d_ttft)} (expect ~equal)")
    print(f" HumanEval pass@1 : {pass1 if pass1 is not None else '— (run humaneval_subset.py)'}")
    print(f" greedy parity : {parity_label}")

    print("\n--- ONE-LINE CLAIM (demo opener) ---")
    if b_tps and d_tps:
        tau_clause = f', tau={_fmt(tau, 2)}' if tau is not None else ''
        # Only assert identical output when the parity data backs it up.
        identical_clause = " with byte-identical output" if parity_ok else ""
        print(f' "Lean Laguna: DFlash makes Laguna XS.2 generate {_fmt(speedup, 2, "x")} faster '
              f'on one GPU ({_fmt(b_tps)} -> {_fmt(d_tps)} tok/s{tau_clause})'
              f'{identical_clause}."')
        if not parity_ok:
            print(" (byte-identical wording omitted: greedy parity is not verified — run --parity first)")
    else:
        print(" (fill once real tokens/sec are present)")

    if stub:
        print("\n[fill_submission] refusing to call this submittable: STUB data.")
        return 2
    return 0


if __name__ == "__main__":
    raise SystemExit(main())