#!/usr/bin/env python3
"""Portable benchmark matrix runner for the repo.

Runs a small reproducible subset of benchmark entrypoints and records
per-command logs plus a summary manifest. This is intended to answer:
which benchmark surfaces are healthy on this machine right now?
"""

from __future__ import annotations

import argparse
import json
import platform
import shlex
import subprocess
import sys
import time
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Dict, List, Tuple

from training.core.runtime_contract import ExecutionMode
from training.core.unified_backend import backend_capability_report

ROOT = Path(__file__).resolve().parent.parent
DEFAULT_OUT = ROOT / "verification_runs"


@dataclass
class MatrixStep:
    """Outcome record for a single benchmark command in the matrix."""

    name: str                  # step identifier (also the log file stem)
    command: List[str]         # argv that was executed
    returncode: int            # subprocess exit status
    duration_s: float          # wall-clock runtime in seconds
    log_path: str              # log location, relative to ROOT when possible
    requested_backend: str     # backend implied by the step name
    used_backend: str          # backend actually selected after fallback
    execution_mode: str        # execution mode reported for that backend
    artifact_paths: List[str]  # result files the step is expected to produce


def _timestamp() -> str:
    """Return a filesystem-safe local timestamp (YYYYmmdd-HHMMSS)."""
    return time.strftime("%Y%m%d-%H%M%S")


def _step_backend(name: str) -> str:
    """Infer the requested backend from a step name.

    Any step whose name contains "max" targets the MAX backend; every
    other step targets CPU.
    """
    return "max" if "max" in name else "cpu"


def _rel_to_root(path: Path) -> str:
    """Render *path* relative to ROOT, falling back to the absolute form.

    Path.relative_to raises ValueError when *path* is not under ROOT
    (e.g. a user-supplied --out-dir elsewhere on disk); in that case the
    absolute path is recorded instead of crashing the run.
    """
    try:
        return str(path.relative_to(ROOT))
    except ValueError:
        return str(path)


def _run_step(name: str, command: List[str], run_dir: Path, artifacts: List[str]) -> MatrixStep:
    """Execute one benchmark command, capture its log, and record metadata.

    stdout and stderr are merged into a single per-step log file under
    *run_dir* so the log reads chronologically. Backend selection metadata
    is taken from the capability report so the manifest reflects what
    actually ran rather than what was requested.
    """
    log_path = run_dir / f"{name}.log"
    started = time.time()
    proc = subprocess.run(
        command,
        cwd=ROOT,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr into the same stream
        text=True,
    )
    duration = time.time() - started
    # Explicit encoding keeps log files portable across locales.
    log_path.write_text(proc.stdout, encoding="utf-8")
    capabilities = backend_capability_report()
    requested_backend = _step_backend(name)
    capability = capabilities.get(requested_backend, {})
    # When the requested backend is unavailable, fall back to the backend
    # the capability report suggests, defaulting to CPU.
    used_backend = requested_backend if capability.get("available") else str(
        capability.get("fallback_backend") or "cpu"
    )
    execution_mode = str(
        capability.get("execution_mode", ExecutionMode.CPU_FALLBACK.value)
    )
    return MatrixStep(
        name=name,
        command=command,
        returncode=proc.returncode,
        duration_s=duration,
        log_path=_rel_to_root(log_path),
        requested_backend=requested_backend,
        used_backend=used_backend,
        execution_mode=execution_mode,
        artifact_paths=artifacts,
    )
def _commands(py: str, max_samples: int, epochs: int) -> List[Tuple[str, List[str], List[str]]]: patience = "2" batch_size = "16" return [ ( "text2cypher_v4", [ py, "-m", "training.text2cypher_kan_v4", "--max-samples", str(max_samples), "--epochs", str(epochs), "--batch-size", batch_size, "--patience", patience, "--no-evolve", ], ["training/kan_bench_results/text2cypher_v4_results.json"], ), ( "spider2_v2", [ py, "-m", "training.spider2_kan_benchmark_v2", "--epochs", str(epochs), "--batch-size", batch_size, "--patience", patience, ], ["training/kan_bench_results/spider2_v2_results.json"], ), ( "unified_dialect", [ py, "-m", "training.unified_dialect_benchmark", "--epochs", str(epochs), "--batch-size", batch_size, "--patience", patience, "--max-hf", "0", ], ["training/kan_bench_results/unified_dialect_results.json"], ), ( "swebench_lite", [ py, "-m", "training.swebench_kan_benchmark", "--variant", "lite", "--split", "test", "--epochs", str(epochs), "--batch-size", batch_size, "--patience", patience, "--max-instances", str(max_samples), ], ["training/kan_bench_results/swebench_results.json"], ), ] def main() -> int: parser = argparse.ArgumentParser(description="Portable benchmark matrix runner") parser.add_argument("--max-samples", type=int, default=16) parser.add_argument("--epochs", type=int, default=1) parser.add_argument("--out-dir", default=str(DEFAULT_OUT)) parser.add_argument("--allow-failures", action="store_true") args = parser.parse_args() run_dir = Path(args.out_dir) / f"benchmark-matrix-{_timestamp()}" run_dir.mkdir(parents=True, exist_ok=True) py = sys.executable results: List[MatrixStep] = [] for name, command, artifacts in _commands(py, args.max_samples, args.epochs): print(f"[run] {name}: {shlex.join(command)}", flush=True) result = _run_step(name, command, run_dir, artifacts) print( f"[done] {name}: rc={result.returncode} backend={result.used_backend} " f"mode={result.execution_mode} time={result.duration_s:.2f}s log={result.log_path}", flush=True, ) 
results.append(result) if result.returncode != 0 and not args.allow_failures: break summary: Dict[str, object] = { "root": str(ROOT), "python": sys.executable, "python_version": sys.version, "platform": { "system": platform.system(), "release": platform.release(), "machine": platform.machine(), }, "config": { "max_samples": args.max_samples, "epochs": args.epochs, }, "capabilities": backend_capability_report(), "steps": [asdict(r) for r in results], "success": all(r.returncode == 0 for r in results) and len(results) == len(_commands(py, args.max_samples, args.epochs)), } summary_path = run_dir / "summary.json" summary_path.write_text(json.dumps(summary, indent=2)) print(f"[summary] {summary_path.relative_to(ROOT)}", flush=True) return 0 if summary["success"] or args.allow_failures else 1 if __name__ == "__main__": raise SystemExit(main())