File size: 4,673 Bytes
#!/usr/bin/env python3
"""fill_submission.py — turn measured results into ready-to-paste submission numbers.
Reads the before/after benchmark JSONs (and, if given, the HumanEval/parity JSON),
computes the headline figures (speedup, tau, TTFT delta, pass@1, parity verdict),
and PRINTS:
* a warning if the data is STILL STUB (shape-only) — so you never submit fake numbers,
* the values to drop into MODEL_CARD.md / RESULTS.html,
* a filled one-line claim for the demo.
It does NOT edit files — paste the numbers yourself, so nothing is silently overwritten.
Usage:
python scripts/fill_submission.py \
--baseline results/baseline.json --dflash results/dflash.json \
[--humaneval results/humaneval_dflash.json]
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Any
def _load(path: str) -> dict[str, Any]:
    """Read the JSON document stored at *path* and return the parsed value.

    The file is read as raw bytes and handed to ``json.loads``, which detects
    UTF-8, UTF-16 or UTF-32 on its own. That keeps parsing independent of the
    platform's default text encoding and tolerates a leading UTF-8 byte-order
    mark, which ``json.loads`` rejects when it is given an already-decoded
    ``str``.

    Raises:
        OSError: if the file cannot be read.
        ValueError: if the content is not valid JSON (``json.JSONDecodeError``)
            or cannot be decoded (``UnicodeDecodeError``).
    """
    return json.loads(Path(path).read_bytes())
def _looks_stub(obj: dict[str, Any]) -> bool:
    """Heuristic: the dress-rehearsal stub stamps a tell-tale completion string.

    Returns ``True`` when either
      * any run's ``text`` contains "stub completion" (case-insensitive), or
      * ``base_url`` ends in ``:8000`` / ``:8001`` and any run's ``text``
        contains "stub" (case-insensitive).

    A missing or null ``runs`` / ``base_url`` and run entries that are not
    JSON objects are tolerated rather than raising, so a partially written
    results file is classified instead of producing a traceback.
    """
    # Lower-case every run's text once; both checks below reuse it.
    texts = [
        str(run.get("text", "")).lower()
        for run in (obj.get("runs") or [])
        if isinstance(run, dict)
    ]
    if any("stub completion" in text for text in texts):
        return True
    # ``or ""`` covers an explicit JSON null; str() covers non-string values.
    base_url = str(obj.get("base_url") or "")
    return base_url.endswith((":8000", ":8001")) and any(
        "stub" in text for text in texts
    )
def _g(obj: dict[str, Any], *keys: str, default: Any = None) -> Any:
    """Return the value stored under the first of *keys* found in *obj*.

    Keys are tried in the order given and *default* is returned when none of
    them is present. A key that is present counts even if its value is
    ``None``.
    """
    found = (obj[name] for name in keys if name in obj)
    return next(found, default)
def _as_float(value: Any, default: Any = None) -> Any:
    """Coerce a JSON value to ``float``; return *default* if it is null or non-numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _fmt(x: Any, nd: int = 1, suffix: str = "") -> str:
    """Format a number with *nd* decimals plus *suffix*; non-numbers render as an em dash."""
    return f"{x:.{nd}f}{suffix}" if isinstance(x, (int, float)) else "—"


def _parity_verdict(lossless: Any, mismatches: Any) -> str:
    """Describe the greedy-parity outcome from the ``lossless`` flag and mismatch count."""
    if lossless is True or mismatches == 0:
        return "LOSSLESS ✓ (0 mismatches)"
    if mismatches is not None:
        return f"{mismatches} mismatches ⚠️"
    if lossless is False:
        # The check ran and failed but gave no count; don't report it as "not run".
        return "NOT lossless ⚠️ (mismatch count not reported)"
    return "— (run --parity)"


def main() -> int:
    """Print the headline numbers and the one-line claim for the submission.

    Reads the baseline and DFlash benchmark JSONs (and, if ``--humaneval`` is
    given, the pass@1 / parity JSON), prints the derived figures, and edits
    no files.

    Returns:
        3 if a baseline/DFlash results file is missing,
        2 if either results file looks like dress-rehearsal stub data,
        0 otherwise.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--baseline", default="results/baseline.json")
    ap.add_argument("--dflash", default="results/dflash.json")
    ap.add_argument("--humaneval", default=None,
                    help="optional pass@1 / parity JSON from humaneval_subset.py")
    args = ap.parse_args()

    for p in (args.baseline, args.dflash):
        if not Path(p).exists():
            print(f"no results yet at {p} — run the A/B (scripts/hf_job_ab.py) or 'make rehearse' first.")
            return 3

    base = _load(args.baseline)
    dfl = _load(args.dflash)

    stub = _looks_stub(base) or _looks_stub(dfl)
    if stub:
        print("=" * 64)
        print(" ⚠️ STUB DATA DETECTED — do NOT submit these numbers.")
        print(" These are shape-only dress-rehearsal results. Re-run measure.py")
        print(" against the real Laguna+DFlash vLLM endpoint, then re-run this.")
        print("=" * 64)

    # A metric stored as JSON null (or a non-numeric value) is treated like a
    # missing one instead of aborting with a TypeError/ValueError traceback.
    b_tps = _as_float(_g(base, "tokens_per_s_mean", default=0.0), 0.0)
    d_tps = _as_float(_g(dfl, "tokens_per_s_mean", default=0.0), 0.0)
    b_ttft = _as_float(_g(base, "ttft_s_mean", default=0.0), 0.0) * 1000  # ms
    d_ttft = _as_float(_g(dfl, "ttft_s_mean", default=0.0), 0.0) * 1000  # ms
    tau = _as_float(_g(dfl, "acceptance_length_tau"))
    speedup = (d_tps / b_tps) if b_tps else 0.0

    # optional quality / parity
    pass1 = parity = lossless = None
    if args.humaneval:
        if Path(args.humaneval).exists():
            he = _load(args.humaneval)
            pass1 = _g(he, "pass_at_1", "pass@1", "pass1")
            lossless = _g(he, "lossless")
            parity = _g(he, "mismatches", "token_mismatches")
        else:
            # The path was given explicitly, so say it was not found rather
            # than silently printing blank quality fields.
            print(f"warning: --humaneval file not found at {args.humaneval} — "
                  "pass@1 / parity left blank.")

    parity_passed = lossless is True or parity == 0
    parity_failed = not parity_passed and (parity is not None or lossless is False)

    tau_txt = _fmt(tau, 2) if tau is not None else "— (read from /metrics)"
    pass1_txt = pass1 if pass1 is not None else "— (run humaneval_subset.py)"

    print("\n--- HEADLINE (paste into MODEL_CARD.md + RESULTS.html) ---")
    print(f" baseline tokens/sec : {_fmt(b_tps)}")
    print(f" dflash tokens/sec : {_fmt(d_tps)}")
    print(f" speedup : {_fmt(speedup, 2, 'x')}")
    print(f" acceptance length tau: {tau_txt}")
    print(f" TTFT baseline / dflash (ms): {_fmt(b_ttft)} / {_fmt(d_ttft)} (expect ~equal)")
    print(f" HumanEval pass@1 : {pass1_txt}")
    print(" greedy parity : " + _parity_verdict(lossless, parity))

    print("\n--- ONE-LINE CLAIM (demo opener) ---")
    if b_tps and d_tps:
        tau_clause = f", tau={_fmt(tau, 2)}" if tau is not None else ""
        # Never claim identical output when this run's parity data says otherwise.
        lossless_clause = "" if parity_failed else " with byte-identical output"
        speedup_txt = _fmt(speedup, 2, "x")
        print(f' "Lean Laguna: DFlash makes Laguna XS.2 generate {speedup_txt} faster '
              f'on one GPU ({_fmt(b_tps)} -> {_fmt(d_tps)} tok/s{tau_clause})'
              f'{lossless_clause}."')
        if parity_failed:
            print(" (claim omits 'byte-identical output': the parity check reported mismatches)")
    else:
        print(" (fill once real tokens/sec are present)")

    if stub:
        print("\n[fill_submission] refusing to call this submittable: STUB data.")
        return 2
    return 0
# Script entry point: run main() and hand its return value to the interpreter
# as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|