# ane-kan-runtime / scripts / benchmark_matrix.py
# Deployed to the ANE KAN runtime Space by JohnGenetica (commit 201cf4d, verified).
#!/usr/bin/env python3
"""Portable benchmark matrix runner for the repo.
Runs a small reproducible subset of benchmark entrypoints and records
per-command logs plus a summary manifest. This is intended to answer:
which benchmark surfaces are healthy on this machine right now?
"""
from __future__ import annotations
import argparse
import json
import platform
import shlex
import subprocess
import sys
import time
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Dict, List, Tuple
from training.core.runtime_contract import ExecutionMode
from training.core.unified_backend import backend_capability_report
ROOT = Path(__file__).resolve().parent.parent
DEFAULT_OUT = ROOT / "verification_runs"
@dataclass
class MatrixStep:
    """Record of one executed benchmark command and how it was served."""

    name: str  # step identifier (also used as the log file stem)
    command: List[str]  # argv list that was executed
    returncode: int  # subprocess exit status (0 == success)
    duration_s: float  # wall-clock runtime of the command, in seconds
    log_path: str  # repo-relative path to the captured stdout/stderr log
    requested_backend: str  # backend the step asked for ("max" or "cpu")
    used_backend: str  # backend actually used after availability fallback
    execution_mode: str  # execution mode reported by the capability report
    artifact_paths: List[str]  # repo-relative result files the step is expected to write
def _timestamp() -> str:
return time.strftime("%Y%m%d-%H%M%S")
def _step_backend(name: str) -> str:
if "max" in name:
return "max"
return "cpu"
def _run_step(name: str, command: List[str], run_dir: Path, artifacts: List[str]) -> MatrixStep:
    """Execute one benchmark command and summarize the outcome.

    Runs *command* from the repo root with stderr merged into stdout, writes
    the combined output to ``<run_dir>/<name>.log``, then consults the backend
    capability report to record which backend actually served the request.

    Args:
        name: Step identifier; also selects the requested backend (see
            ``_step_backend``) and names the log file.
        command: argv list executed via ``subprocess.run`` (no shell).
        run_dir: Directory that receives the per-step log file.
        artifacts: Repo-relative paths the step is expected to produce;
            recorded verbatim on the returned step.

    Returns:
        A ``MatrixStep`` describing the command, exit status, timing, log
        location, and backend resolution.
    """
    log_path = run_dir / f"{name}.log"
    started = time.time()
    proc = subprocess.run(
        command,
        cwd=ROOT,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr so the log preserves ordering
        text=True,
    )
    duration = time.time() - started
    # Explicit encoding: benchmark output may contain non-ASCII characters,
    # and the platform default encoding (e.g. cp1252 on Windows) could raise
    # UnicodeEncodeError and lose the log.
    log_path.write_text(proc.stdout, encoding="utf-8")
    capabilities = backend_capability_report()
    requested_backend = _step_backend(name)
    # NOTE(review): the capability report's schema ("available",
    # "fallback_backend", "execution_mode") is assumed from usage here —
    # confirm against training.core.unified_backend.
    capability = capabilities.get(requested_backend, {})
    # Fall back to the report's advertised fallback (or "cpu") when the
    # requested backend is unavailable on this machine.
    used_backend = requested_backend if capability.get("available") else str(
        capability.get("fallback_backend") or "cpu"
    )
    execution_mode = str(
        capability.get("execution_mode", ExecutionMode.CPU_FALLBACK.value)
    )
    return MatrixStep(
        name=name,
        command=command,
        returncode=proc.returncode,
        duration_s=duration,
        log_path=str(log_path.relative_to(ROOT)),
        requested_backend=requested_backend,
        used_backend=used_backend,
        execution_mode=execution_mode,
        artifact_paths=artifacts,
    )
def _commands(py: str, max_samples: int, epochs: int) -> List[Tuple[str, List[str], List[str]]]:
patience = "2"
batch_size = "16"
return [
(
"text2cypher_v4",
[
py,
"-m",
"training.text2cypher_kan_v4",
"--max-samples",
str(max_samples),
"--epochs",
str(epochs),
"--batch-size",
batch_size,
"--patience",
patience,
"--no-evolve",
],
["training/kan_bench_results/text2cypher_v4_results.json"],
),
(
"spider2_v2",
[
py,
"-m",
"training.spider2_kan_benchmark_v2",
"--epochs",
str(epochs),
"--batch-size",
batch_size,
"--patience",
patience,
],
["training/kan_bench_results/spider2_v2_results.json"],
),
(
"unified_dialect",
[
py,
"-m",
"training.unified_dialect_benchmark",
"--epochs",
str(epochs),
"--batch-size",
batch_size,
"--patience",
patience,
"--max-hf",
"0",
],
["training/kan_bench_results/unified_dialect_results.json"],
),
(
"swebench_lite",
[
py,
"-m",
"training.swebench_kan_benchmark",
"--variant",
"lite",
"--split",
"test",
"--epochs",
str(epochs),
"--batch-size",
batch_size,
"--patience",
patience,
"--max-instances",
str(max_samples),
],
["training/kan_bench_results/swebench_results.json"],
),
]
def main() -> int:
    """CLI entry point: run each benchmark step and write a summary manifest.

    Creates a timestamped run directory under ``--out-dir``, executes the
    command matrix (stopping at the first failure unless ``--allow-failures``
    is set), and writes ``summary.json`` describing the environment, the
    per-step results, and an overall ``success`` flag.

    Returns:
        0 when every step ran and exited cleanly (or ``--allow-failures``
        was given), 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Portable benchmark matrix runner")
    parser.add_argument("--max-samples", type=int, default=16)
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--out-dir", default=str(DEFAULT_OUT))
    parser.add_argument("--allow-failures", action="store_true")
    args = parser.parse_args()

    run_dir = Path(args.out_dir) / f"benchmark-matrix-{_timestamp()}"
    run_dir.mkdir(parents=True, exist_ok=True)

    py = sys.executable
    # Build the matrix once; the original recomputed it a second time just to
    # count steps for the success check, which doubled the work and risked drift.
    steps = _commands(py, args.max_samples, args.epochs)
    results: List[MatrixStep] = []
    for name, command, artifacts in steps:
        print(f"[run] {name}: {shlex.join(command)}", flush=True)
        result = _run_step(name, command, run_dir, artifacts)
        print(
            f"[done] {name}: rc={result.returncode} backend={result.used_backend} "
            f"mode={result.execution_mode} time={result.duration_s:.2f}s log={result.log_path}",
            flush=True,
        )
        results.append(result)
        if result.returncode != 0 and not args.allow_failures:
            # Fail fast: later steps are unlikely to be meaningful after a failure.
            break

    summary: Dict[str, object] = {
        "root": str(ROOT),
        "python": sys.executable,
        "python_version": sys.version,
        "platform": {
            "system": platform.system(),
            "release": platform.release(),
            "machine": platform.machine(),
        },
        "config": {
            "max_samples": args.max_samples,
            "epochs": args.epochs,
        },
        "capabilities": backend_capability_report(),
        "steps": [asdict(r) for r in results],
        # Success requires every recorded step to pass AND no step to have
        # been skipped by the fail-fast break above.
        "success": all(r.returncode == 0 for r in results) and len(results) == len(steps),
    }
    summary_path = run_dir / "summary.json"
    # Explicit UTF-8 so the manifest is portable regardless of locale defaults.
    summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
    # NOTE(review): relative_to(ROOT) raises ValueError if --out-dir points
    # outside the repo; the default DEFAULT_OUT is always inside ROOT.
    print(f"[summary] {summary_path.relative_to(ROOT)}", flush=True)
    return 0 if summary["success"] or args.allow_failures else 1
if __name__ == "__main__":
raise SystemExit(main())