File size: 4,673 Bytes

0a55ff6

#!/usr/bin/env python3
"""fill_submission.py — turn measured results into ready-to-paste submission numbers.

Reads the before/after benchmark JSONs (and, if given, the HumanEval/parity JSON),
computes the headline figures (speedup, tau, TTFT delta, pass@1, parity verdict),
and PRINTS:
  * a warning if the data is STILL STUB (shape-only) — so you never submit fake numbers,
  * the values to drop into MODEL_CARD.md / RESULTS.html,
  * a filled one-line claim for the demo.

It does NOT edit files — paste the numbers yourself, so nothing is silently overwritten.

Usage:
  python scripts/fill_submission.py \
    --baseline results/baseline.json --dflash results/dflash.json \
    [--humaneval results/humaneval_dflash.json]
"""
from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Any


def _load(path: str) -> dict[str, Any]:
    """Read the file at ``path`` and return its parsed JSON content.

    The file is decoded explicitly as UTF-8 instead of relying on the
    platform's locale encoding, so result files containing non-ASCII text
    parse the same way on every machine.

    Args:
        path: Filesystem path of the JSON file to read.

    Returns:
        The decoded JSON value (callers in this file treat it as a dict).

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    return json.loads(Path(path).read_text(encoding="utf-8"))


def _looks_stub(obj: dict[str, Any]) -> bool:
    """Heuristic: the dress-rehearsal stub stamps a tell-tale completion string.

    A result object is considered stub data when either

    * any run's ``text`` contains ``"stub completion"`` (case-insensitive), or
    * ``base_url`` ends with ``:8000`` / ``:8001`` and any run's ``text``
      contains ``"stub"`` (case-insensitive).
      NOTE(review): presumably those are the ports the rehearsal stub
      listens on — confirm against the stub server.

    Malformed input (``runs`` missing/null/not a list, run entries that are
    not objects, ``base_url`` missing/null/not a string) is tolerated and
    simply does not count as evidence of stub data.

    Args:
        obj: Parsed benchmark result JSON.

    Returns:
        True if the data looks like it came from the stub, else False.
    """
    runs = obj.get("runs")
    if not isinstance(runs, list):
        runs = []

    # Lower-case every run text once; both checks below reuse the result.
    texts = [str(run.get("text", "")).lower() for run in runs if isinstance(run, dict)]

    if any("stub completion" in text for text in texts):
        return True

    base_url = obj.get("base_url")
    on_stub_port = isinstance(base_url, str) and base_url.endswith((":8000", ":8001"))
    return on_stub_port and any("stub" in text for text in texts)


def _g(obj: dict[str, Any], *keys: str, default: Any = None) -> Any:
    """Return the value stored under the first of ``keys`` that exists in ``obj``.

    Keys are tried in the order given. A key counts as present even when its
    value is ``None`` or otherwise falsy. ``default`` is returned only when
    none of the keys is present (or no keys were supplied).
    """
    found = (obj[name] for name in keys if name in obj)
    return next(found, default)


def main() -> int:
    """Print paste-ready headline numbers computed from the benchmark JSONs.

    Parses ``--baseline``, ``--dflash`` and the optional ``--humaneval`` path,
    loads the files, warns loudly when the data looks like dress-rehearsal
    stub output, then prints the headline figures and a one-line demo claim.
    Nothing is written to disk.

    Values that are missing, ``null`` or non-numeric in the JSON are shown as
    ``—`` rather than as a fabricated ``0.0``, and the "byte-identical output"
    wording of the claim is dropped when the parity data says otherwise (and
    flagged as unverified when no parity data was supplied).

    Returns:
        ``0`` on success, ``2`` when stub data was detected, ``3`` when the
        baseline or dflash results file does not exist.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--baseline", default="results/baseline.json")
    ap.add_argument("--dflash", default="results/dflash.json")
    ap.add_argument("--humaneval", default=None,
                    help="optional pass@1 / parity JSON from humaneval_subset.py")
    args = ap.parse_args()

    for p in (args.baseline, args.dflash):
        if not Path(p).exists():
            print(f"no results yet at {p} — run the A/B (scripts/hf_job_ab.py) or 'make rehearse' first.")
            return 3

    base = _load(args.baseline)
    dfl = _load(args.dflash)

    stub = _looks_stub(base) or _looks_stub(dfl)
    if stub:
        print("=" * 64)
        print("  ⚠️  STUB DATA DETECTED — do NOT submit these numbers.")
        print("  These are shape-only dress-rehearsal results. Re-run measure.py")
        print("  against the real Laguna+DFlash vLLM endpoint, then re-run this.")
        print("=" * 64)

    def num(value):
        """Coerce ``value`` to float; None when missing, boolean or non-numeric."""
        if value is None or isinstance(value, bool):
            return None
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    def to_ms(seconds):
        """Convert seconds to milliseconds, passing None through."""
        return None if seconds is None else seconds * 1000

    def fmt(x, nd=1, suffix=""):
        """Format a number with ``nd`` decimals; anything else becomes a dash."""
        is_number = isinstance(x, (int, float)) and not isinstance(x, bool)
        return f"{x:.{nd}f}{suffix}" if is_number else "—"

    b_tps = num(_g(base, "tokens_per_s_mean"))
    d_tps = num(_g(dfl, "tokens_per_s_mean"))
    b_ttft = to_ms(num(_g(base, "ttft_s_mean")))
    d_ttft = to_ms(num(_g(dfl, "ttft_s_mean")))
    tau = num(_g(dfl, "acceptance_length_tau"))
    # A speedup is only meaningful with a non-zero baseline and a dflash value.
    speedup = (d_tps / b_tps) if (b_tps and d_tps is not None) else None

    # optional quality / parity
    pass1 = parity = lossless = None
    if args.humaneval:
        if Path(args.humaneval).exists():
            he = _load(args.humaneval)
            pass1 = _g(he, "pass_at_1", "pass@1", "pass1")
            lossless = _g(he, "lossless")
            parity = _g(he, "mismatches", "token_mismatches")
        else:
            # Don't silently ignore a path the user explicitly asked for.
            print(f"warning: --humaneval file not found at {args.humaneval} — "
                  "pass@1 / parity left blank.")

    # Parity verdict. Any evidence of a mismatch wins over a "lossless" flag so
    # contradictory data is never reported as lossless.
    # parity_ok: True = verified lossless, False = verified NOT lossless,
    # None = not measured.
    n_mismatch = num(parity)
    if n_mismatch is not None and n_mismatch > 0:
        parity_ok, verdict = False, f"{parity} mismatches ⚠️"
    elif lossless is False:
        parity_ok, verdict = False, "NOT lossless ⚠️ (parity check failed)"
    elif lossless is True or n_mismatch == 0:
        parity_ok, verdict = True, "LOSSLESS ✓ (0 mismatches)"
    elif parity is not None:
        # Unrecognised mismatch payload: show it as-is and treat as not lossless.
        parity_ok, verdict = False, f"{parity} mismatches ⚠️"
    else:
        parity_ok, verdict = None, "— (run --parity)"

    print("\n--- HEADLINE (paste into MODEL_CARD.md + RESULTS.html) ---")
    print(f"  baseline tokens/sec : {fmt(b_tps)}")
    print(f"  dflash   tokens/sec : {fmt(d_tps)}")
    print(f"  speedup             : {fmt(speedup, 2, 'x')}")
    print(f"  acceptance length tau: {fmt(tau, 2) if tau is not None else '— (read from /metrics)'}")
    print(f"  TTFT baseline / dflash (ms): {fmt(b_ttft)} / {fmt(d_ttft)}  (expect ~equal)")
    print(f"  HumanEval pass@1    : {pass1 if pass1 is not None else '— (run humaneval_subset.py)'}")
    print(f"  greedy parity       : {verdict}")

    print("\n--- ONE-LINE CLAIM (demo opener) ---")
    if b_tps and d_tps:
        tau_clause = f', tau={fmt(tau,2)}' if tau is not None else ''
        claim = (f'"Lean Laguna: DFlash makes Laguna XS.2 generate {fmt(speedup,2,"x")} faster '
                 f'on one GPU ({fmt(b_tps)} -> {fmt(d_tps)} tok/s{tau_clause})')
        if parity_ok is False:
            # The measured data contradicts the lossless claim: do not print it.
            print(f'  {claim}."')
            print("  (parity check FAILED — 'byte-identical output' omitted from the claim)")
        else:
            print(f'  {claim} with byte-identical output."')
            if parity_ok is None:
                print("  ('byte-identical output' is UNVERIFIED — run the parity check before using it)")
    else:
        print("  (fill once real tokens/sec are present)")

    if stub:
        print("\n[fill_submission] refusing to call this submittable: STUB data.")
        return 2
    return 0


if __name__ == "__main__":
    raise SystemExit(main())