#!/usr/bin/env python3
"""check_results.py — smoke-validate the schema of measure.py output JSON.

The benchmark's value is the before/after diff of results/baseline.json and
results/dflash.json; this asserts those files have the shape the demo expects
so a broken run is caught locally, not on stage.

Usage:
    python scripts/check_results.py results/dflash.json results/baseline.json

Exit 0 = all valid, 1 = problems listed, 2 = no paths given (usage printed).
"""
from __future__ import annotations

import json
import sys

# Top-level keys every result file must carry, mapped to the accepted type(s).
REQUIRED = {
    "label": str,
    "model": str,
    "n": int,
    "tokens_per_s_mean": (int, float),
    "ttft_s_mean": (int, float),
    "runs": list,
}

# Keys every entry of the "runs" list must carry.
RUN_KEYS = {"ttft_s", "total_s", "new_tokens", "tokens_per_s", "text"}


def check(path: str) -> list[str]:
    """Validate one result file and return a list of human-readable problems.

    Args:
        path: Path of the JSON file to validate.

    Returns:
        One message per problem found, each prefixed with ``path``. An empty
        list means the file has the expected schema. Unreadable, badly
        encoded, or malformed files are reported as a single problem rather
        than raised.
    """
    try:
        # JSON is UTF-8 by specification; the context manager closes the
        # handle even when parsing fails.
        with open(path, encoding="utf-8") as fh:
            obj = json.load(fh)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return [f"{path}: cannot read/parse ({e})"]

    if not isinstance(obj, dict):
        # Valid JSON of the wrong shape (list, string, number, null): none of
        # the key checks below apply.
        return [
            f"{path}: top-level JSON value is {type(obj).__name__}, "
            "expected an object"
        ]

    problems: list[str] = []
    for key, typ in REQUIRED.items():
        if key not in obj:
            problems.append(f"{path}: missing key '{key}'")
            continue
        value = obj[key]
        # bool is a subclass of int, so JSON true/false would otherwise be
        # accepted for the numeric fields; no required field is a boolean.
        if isinstance(value, bool) or not isinstance(value, typ):
            problems.append(
                f"{path}: key '{key}' has wrong type {type(value).__name__}"
            )

    runs = obj.get("runs")
    # A missing or wrongly typed 'runs' was already reported by the loop above.
    if isinstance(runs, list):
        if not runs:
            problems.append(f"{path}: 'runs' is empty")
        for i, run in enumerate(runs):
            if not isinstance(run, dict):
                problems.append(
                    f"{path}: run[{i}] is {type(run).__name__}, "
                    "expected an object"
                )
                continue
            missing = RUN_KEYS.difference(run)
            if missing:
                problems.append(
                    f"{path}: run[{i}] missing keys {sorted(missing)}"
                )
    return problems


def main(paths: list[str]) -> int:
    """Check every file in ``paths`` and print a summary.

    Args:
        paths: Result files to validate (typically ``sys.argv[1:]``).

    Returns:
        Process exit code: 0 if every file is valid, 1 if any problem was
        found, 2 if no paths were given (the module usage text is printed).
    """
    if not paths:
        print(__doc__)
        return 2

    problems: list[str] = []
    for p in paths:
        problems += check(p)
        print(f"checked {p}")

    if problems:
        print("\nFAIL:")
        for pr in problems:
            print("  -", pr)
        return 1

    print("\nOK: all result files have the expected schema.")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))