"""fill_submission.py — turn measured results into ready-to-paste submission numbers.

Reads the before/after benchmark JSONs (and, if given, the HumanEval/parity JSON),
computes the headline figures (speedup, tau, TTFT delta, pass@1, parity verdict),
and PRINTS:
  * a warning if the data is STILL STUB (shape-only) — so you never submit fake numbers,
  * the values to drop into MODEL_CARD.md / RESULTS.html,
  * a filled one-line claim for the demo.

It does NOT edit files — paste the numbers yourself, so nothing is silently overwritten.

Usage:
  python scripts/fill_submission.py \
      --baseline results/baseline.json --dflash results/dflash.json \
      [--humaneval results/humaneval_dflash.json]
"""
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| from pathlib import Path |
| from typing import Any |
|
|
|
|
def _load(path: str) -> dict[str, Any]:
    """Parse the JSON document stored at *path* and return it.

    The file is read as raw bytes and handed to ``json.loads``, which detects
    the UTF-8/16/32 encoding itself (including a leading UTF-8 BOM). Decoding
    therefore does not depend on the platform's locale encoding.

    Raises:
        OSError: if the file cannot be read.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    return json.loads(Path(path).read_bytes())
|
|
|
|
def _looks_stub(obj: dict[str, Any]) -> bool:
    """Heuristic: decide whether a results object came from the dress-rehearsal stub.

    A result is treated as stub data when either:
      * any run's ``text`` contains "stub completion" (case-insensitive), or
      * ``base_url`` ends with ``:8000`` or ``:8001`` AND any run's ``text``
        contains "stub" (case-insensitive).

    A missing or null ``runs`` / ``base_url`` is treated as empty, and run
    entries that are not dicts are skipped instead of raising.
    """
    runs = obj.get("runs") or []
    # Lower-case every run text once; both checks below reuse this list.
    texts = [str(run.get("text", "")).lower() for run in runs if isinstance(run, dict)]

    if any("stub completion" in text for text in texts):
        return True

    # ``or ""`` covers an explicit JSON null, which has no ``.endswith``.
    base_url = str(obj.get("base_url") or "")
    return base_url.endswith((":8000", ":8001")) and any("stub" in text for text in texts)
|
|
|
|
def _g(obj: dict[str, Any], *keys: str, default: Any = None) -> Any:
    """Return the value of the first of *keys* that is present in *obj*.

    Keys are tried in the order given. Presence is what counts: a key whose
    value is ``None`` (or otherwise falsy) is still returned. *default* is
    returned only when none of the keys exist.
    """
    return next((obj[key] for key in keys if key in obj), default)
|
|
|
|
def main() -> int:
    """Print the headline submission numbers derived from the benchmark JSONs.

    Reads the baseline and DFlash result files (and the optional HumanEval /
    parity file), then prints a stub-data warning if applicable, the headline
    figures, and a one-line claim. Nothing is written to disk.

    Returns:
        0 on success, 2 when either result file looks like stub data,
        3 when the baseline or DFlash result file does not exist.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--baseline", default="results/baseline.json")
    ap.add_argument("--dflash", default="results/dflash.json")
    ap.add_argument("--humaneval", default=None,
                    help="optional pass@1 / parity JSON from humaneval_subset.py")
    args = ap.parse_args()

    for p in (args.baseline, args.dflash):
        if not Path(p).exists():
            print(f"no results yet at {p} — run the A/B (scripts/hf_job_ab.py) or 'make rehearse' first.")
            return 3

    base = _load(args.baseline)
    dfl = _load(args.dflash)

    stub = _looks_stub(base) or _looks_stub(dfl)
    if stub:
        print("=" * 64)
        print(" ⚠️ STUB DATA DETECTED — do NOT submit these numbers.")
        print(" These are shape-only dress-rehearsal results. Re-run measure.py")
        print(" against the real Laguna+DFlash vLLM endpoint, then re-run this.")
        print("=" * 64)

    def num(value: Any) -> float:
        # A metric that is present but null or non-numeric counts as 0.0,
        # the same value used when the key is missing, instead of crashing.
        try:
            return float(value)
        except (TypeError, ValueError):
            return 0.0

    b_tps = num(_g(base, "tokens_per_s_mean", default=0.0))
    d_tps = num(_g(dfl, "tokens_per_s_mean", default=0.0))
    b_ttft = num(_g(base, "ttft_s_mean", default=0.0)) * 1000  # seconds -> ms
    d_ttft = num(_g(dfl, "ttft_s_mean", default=0.0)) * 1000  # seconds -> ms
    tau = _g(dfl, "acceptance_length_tau")
    speedup = (d_tps / b_tps) if b_tps else 0.0  # guard against a zero baseline

    pass1 = parity = lossless = None
    if args.humaneval:
        if Path(args.humaneval).exists():
            he = _load(args.humaneval)
            pass1 = _g(he, "pass_at_1", "pass@1", "pass1")
            lossless = _g(he, "lossless")
            parity = _g(he, "mismatches", "token_mismatches")
        else:
            # The path was given explicitly, so say why the fields stay blank.
            print(f"note: --humaneval file not found at {args.humaneval} — pass@1 / parity left blank.")

    def fmt(x: Any, nd: int = 1, suffix: str = "") -> str:
        # Non-numeric values (missing metrics) render as a dash placeholder.
        return f"{x:.{nd}f}{suffix}" if isinstance(x, (int, float)) else "—"

    if lossless is True or parity == 0:
        parity_verdict = "LOSSLESS ✓ (0 mismatches)"
    elif parity is not None:
        parity_verdict = f"{parity} mismatches ⚠️"
    elif lossless is False:
        # The parity check ran and failed but recorded no mismatch count;
        # do not report this as "parity not run".
        parity_verdict = "NOT lossless ⚠️ (mismatch count not reported)"
    else:
        parity_verdict = "— (run --parity)"

    print("\n--- HEADLINE (paste into MODEL_CARD.md + RESULTS.html) ---")
    print(f" baseline tokens/sec : {fmt(b_tps)}")
    print(f" dflash tokens/sec : {fmt(d_tps)}")
    print(f" speedup : {fmt(speedup, 2, 'x')}")
    print(f" acceptance length tau: {fmt(tau, 2) if tau is not None else '— (read from /metrics)'}")
    print(f" TTFT baseline / dflash (ms): {fmt(b_ttft)} / {fmt(d_ttft)} (expect ~equal)")
    print(f" HumanEval pass@1 : {pass1 if pass1 is not None else '— (run humaneval_subset.py)'}")
    print(f" greedy parity : {parity_verdict}")

    print("\n--- ONE-LINE CLAIM (demo opener) ---")
    if b_tps and d_tps:
        tau_clause = f', tau={fmt(tau,2)}' if tau is not None else ''
        print(f' "Lean Laguna: DFlash makes Laguna XS.2 generate {fmt(speedup,2,"x")} faster '
              f'on one GPU ({fmt(b_tps)} -> {fmt(d_tps)} tok/s{tau_clause}) '
              f'with byte-identical output."')
    else:
        print(" (fill once real tokens/sec are present)")

    if stub:
        print("\n[fill_submission] refusing to call this submittable: STUB data.")
        return 2
    return 0
|
|
|
|
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
|