lean-laguna / scripts /fill_submission.py

Lean Laguna: lossless DFlash speculative decoding on Laguna XS.2 (harness, environment, results)

0a55ff6 about 5 hours ago

4.67 kB

	#!/usr/bin/env python3
	"""fill_submission.py — turn measured results into ready-to-paste submission numbers.

	Reads the before/after benchmark JSONs (and, if given, the HumanEval/parity JSON),
	computes the headline figures (speedup, tau, TTFT delta, pass@1, parity verdict),
	and PRINTS:
	* a warning if the data is STILL STUB (shape-only) — so you never submit fake numbers,
	* the values to drop into MODEL_CARD.md / RESULTS.html,
	* a filled one-line claim for the demo.

	It does NOT edit files — paste the numbers yourself, so nothing is silently overwritten.

	Usage:
	python scripts/fill_submission.py \
	--baseline results/baseline.json --dflash results/dflash.json \
	[--humaneval results/humaneval_dflash.json]
	"""
	from __future__ import annotations

	import argparse
	import json
	from pathlib import Path
	from typing import Any


	def _load(path: str) -> dict[str, Any]:
	    """Read the JSON document at *path* and return the decoded object.

	    The file is decoded as UTF-8 explicitly rather than with the platform's
	    locale encoding, so result files containing non-ASCII text load the same
	    way on every machine.

	    Raises:
	        OSError: if the file cannot be read.
	        json.JSONDecodeError: if the content is not valid JSON.
	    """
	    return json.loads(Path(path).read_text(encoding="utf-8"))


	def _looks_stub(obj: dict[str, Any]) -> bool:
	    """Heuristic: the dress-rehearsal stub stamps a tell-tale completion string.

	    Returns True when either
	    * any run's ``text`` contains "stub completion" (case-insensitive), or
	    * ``base_url`` ends in ``:8000`` / ``:8001`` AND any run's ``text``
	      contains "stub" (case-insensitive).

	    Missing or ``null`` ``runs`` / ``base_url`` values and run entries that are
	    not JSON objects are tolerated and simply do not count as stub evidence.
	    """
	    runs = obj.get("runs")
	    if not isinstance(runs, (list, tuple)):
	        runs = []
	    # Lower-case every run text once; both checks below reuse it.
	    texts = [
	        str(run.get("text", "")).lower()
	        for run in runs
	        if isinstance(run, dict)
	    ]
	    if any("stub completion" in text for text in texts):
	        return True
	    # ``base_url`` may be absent or an explicit JSON null; treat both as "".
	    base_url = str(obj.get("base_url") or "")
	    return base_url.endswith((":8000", ":8001")) and any(
	        "stub" in text for text in texts
	    )


	def _g(obj: dict[str, Any], *keys: str, default: Any = None) -> Any:
	    """Return the value under the first of *keys* that is present in *obj*.

	    Keys are tried in the order given. A key counts as present even when its
	    value is ``None``. If none of the keys is present, *default* is returned.
	    """
	    found = (obj[name] for name in keys if name in obj)
	    return next(found, default)


	def _as_float(value: Any, default: float = 0.0) -> float:
	    """Coerce *value* to float; return *default* if it is None or not numeric."""
	    try:
	        return float(value)
	    except (TypeError, ValueError):
	        return default


	def _parity_verdict(lossless: Any, mismatches: Any) -> tuple[bool | None, str]:
	    """Combine the ``lossless`` flag and the mismatch count into one verdict.

	    Returns ``(ok, label)`` where ``ok`` is True when parity is verified,
	    False when any signal says parity failed, and None when nothing was
	    measured. A negative signal always wins over a positive one, so a file
	    that says ``lossless: true`` but also reports mismatches is not reported
	    as lossless.
	    """
	    has_mismatches = mismatches is not None and mismatches != 0
	    if has_mismatches:
	        return False, f"{mismatches} mismatches ⚠️"
	    if lossless is False:
	        return False, "NOT lossless ⚠️ (lossless flag is false)"
	    if lossless is True or mismatches == 0:
	        return True, "LOSSLESS ✓ (0 mismatches)"
	    return None, "— (run --parity)"


	def main() -> int:
	    """Print the headline figures and a one-line claim from the result JSONs.

	    Reads ``--baseline`` and ``--dflash`` (and ``--humaneval`` when given),
	    prints the values to paste, and never edits any file.

	    Returns:
	        0 on success, 2 when stub data was detected, 3 when a required
	        results file does not exist.
	    """
	    ap = argparse.ArgumentParser(description=__doc__)
	    ap.add_argument("--baseline", default="results/baseline.json")
	    ap.add_argument("--dflash", default="results/dflash.json")
	    ap.add_argument("--humaneval", default=None,
	                    help="optional pass@1 / parity JSON from humaneval_subset.py")
	    args = ap.parse_args()

	    for p in (args.baseline, args.dflash):
	        if not Path(p).exists():
	            print(f"no results yet at {p} — run the A/B (scripts/hf_job_ab.py) or 'make rehearse' first.")
	            return 3

	    base = _load(args.baseline)
	    dfl = _load(args.dflash)

	    stub = _looks_stub(base) or _looks_stub(dfl)
	    if stub:
	        print("=" * 64)
	        print(" ⚠️ STUB DATA DETECTED — do NOT submit these numbers.")
	        print(" These are shape-only dress-rehearsal results. Re-run measure.py")
	        print(" against the real Laguna+DFlash vLLM endpoint, then re-run this.")
	        print("=" * 64)

	    # A metric that is absent, null or non-numeric is treated as 0.0 instead
	    # of crashing in float().
	    b_tps = _as_float(_g(base, "tokens_per_s_mean"))
	    d_tps = _as_float(_g(dfl, "tokens_per_s_mean"))
	    b_ttft = _as_float(_g(base, "ttft_s_mean")) * 1000  # ms
	    d_ttft = _as_float(_g(dfl, "ttft_s_mean")) * 1000  # ms
	    tau = _g(dfl, "acceptance_length_tau")
	    speedup = (d_tps / b_tps) if b_tps else 0.0

	    # optional quality / parity
	    pass1 = parity = lossless = None
	    if args.humaneval:
	        if Path(args.humaneval).exists():
	            he = _load(args.humaneval)
	            pass1 = _g(he, "pass_at_1", "pass@1", "pass1")
	            lossless = _g(he, "lossless")
	            parity = _g(he, "mismatches", "token_mismatches")
	        else:
	            # Say so explicitly: a mistyped path must not look like "not run".
	            print(f"note: --humaneval file not found at {args.humaneval} — "
	                  "pass@1 / parity left blank.")

	    parity_ok, parity_label = _parity_verdict(lossless, parity)

	    def fmt(x: Any, nd: int = 1, suffix: str = "") -> str:
	        return f"{x:.{nd}f}{suffix}" if isinstance(x, (int, float)) else "—"

	    print("\n--- HEADLINE (paste into MODEL_CARD.md + RESULTS.html) ---")
	    print(f" baseline tokens/sec : {fmt(b_tps)}")
	    print(f" dflash tokens/sec : {fmt(d_tps)}")
	    print(f" speedup : {fmt(speedup, 2, 'x')}")
	    print(f" acceptance length tau: {fmt(tau, 2) if tau is not None else '— (read from /metrics)'}")
	    print(f" TTFT baseline / dflash (ms): {fmt(b_ttft)} / {fmt(d_ttft)} (expect ~equal)")
	    print(f" HumanEval pass@1 : {pass1 if pass1 is not None else '— (run humaneval_subset.py)'}")
	    print(f" greedy parity : " + parity_label)

	    print("\n--- ONE-LINE CLAIM (demo opener) ---")
	    if not (b_tps and d_tps):
	        print(" (fill once real tokens/sec are present)")
	    elif parity_ok is False:
	        # The claim ends in "byte-identical output"; do not print it when the
	        # parity data says otherwise.
	        print(" (withheld: greedy parity check failed, so the byte-identical claim would be false)")
	    else:
	        tau_clause = f', tau={fmt(tau,2)}' if tau is not None else ''
	        print(f' "Lean Laguna: DFlash makes Laguna XS.2 generate {fmt(speedup,2,"x")} faster '
	              f'on one GPU ({fmt(b_tps)} -> {fmt(d_tps)} tok/s{tau_clause}) '
	              f'with byte-identical output."')

	    if stub:
	        print("\n[fill_submission] refusing to call this submittable: STUB data.")
	        return 2
	    return 0


	if __name__ == "__main__":
	    # Run the CLI and hand main()'s return value to the interpreter as the
	    # process exit status.
	    exit_status = main()
	    raise SystemExit(exit_status)