Spaces:
Build error
Build error
| #!/usr/bin/env python3 | |
| """Portable benchmark matrix runner for the repo. | |
| Runs a small reproducible subset of benchmark entrypoints and records | |
| per-command logs plus a summary manifest. This is intended to answer: | |
| which benchmark surfaces are healthy on this machine right now? | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import platform | |
| import shlex | |
| import subprocess | |
| import sys | |
| import time | |
| from dataclasses import asdict, dataclass | |
| from pathlib import Path | |
| from typing import Dict, List, Tuple | |
| from training.core.runtime_contract import ExecutionMode | |
| from training.core.unified_backend import backend_capability_report | |
# Repository root: two directory levels above this script's resolved location.
ROOT = Path(__file__).resolve().parent.parent

# Default parent directory under which each run creates its own output folder.
DEFAULT_OUT = ROOT.joinpath("verification_runs")
@dataclass
class MatrixStep:
    """Outcome record for one benchmark command in the matrix.

    Declared as a dataclass so that instances can be constructed with
    keyword arguments and serialized with ``dataclasses.asdict`` when the
    run summary is written.
    """

    name: str  # step identifier; also used as the stem of the log file name
    command: List[str]  # argv that was passed to the subprocess
    returncode: int  # exit status of the subprocess
    duration_s: float  # wall-clock duration in seconds
    log_path: str  # where the combined stdout/stderr log was written
    requested_backend: str  # backend implied by the step name
    used_backend: str  # backend reported as usable (or its fallback)
    execution_mode: str  # execution mode taken from the capability report
    artifact_paths: List[str]  # result files the step is expected to produce
| def _timestamp() -> str: | |
| return time.strftime("%Y%m%d-%H%M%S") | |
| def _step_backend(name: str) -> str: | |
| if "max" in name: | |
| return "max" | |
| return "cpu" | |
def _run_step(name: str, command: List[str], run_dir: Path, artifacts: List[str]) -> MatrixStep:
    """Run one benchmark command and record its outcome.

    The command is executed with ``ROOT`` as its working directory. Its
    stdout and stderr are captured together and written to
    ``<run_dir>/<name>.log``.

    Args:
        name: Step identifier; used for the log file name and to pick the
            requested backend via ``_step_backend``.
        command: Full argv of the subprocess to run.
        run_dir: Existing directory that receives the log file.
        artifacts: Result file paths recorded on the returned step as-is.

    Returns:
        A ``MatrixStep`` with the return code, wall-clock duration, log
        location, and backend/execution-mode details taken from
        ``backend_capability_report()``.
    """
    log_path = run_dir / f"{name}.log"
    started = time.time()
    proc = subprocess.run(
        command,
        cwd=ROOT,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    duration = time.time() - started
    log_path.write_text(proc.stdout)

    # Prefer a repo-relative log path, but do not crash after the benchmark
    # has already run when the output directory is outside ROOT (or is a
    # relative path): relative_to() raises ValueError in those cases.
    try:
        log_ref = str(log_path.relative_to(ROOT))
    except ValueError:
        log_ref = str(log_path)

    capabilities = backend_capability_report()
    requested_backend = _step_backend(name)
    capability = capabilities.get(requested_backend, {})
    # When the requested backend is not reported as available, record the
    # advertised fallback backend, defaulting to "cpu".
    if capability.get("available"):
        used_backend = requested_backend
    else:
        used_backend = str(capability.get("fallback_backend") or "cpu")
    execution_mode = str(
        capability.get("execution_mode", ExecutionMode.CPU_FALLBACK.value)
    )

    return MatrixStep(
        name=name,
        command=command,
        returncode=proc.returncode,
        duration_s=duration,
        log_path=log_ref,
        requested_backend=requested_backend,
        used_backend=used_backend,
        execution_mode=execution_mode,
        artifact_paths=artifacts,
    )
| def _commands(py: str, max_samples: int, epochs: int) -> List[Tuple[str, List[str], List[str]]]: | |
| patience = "2" | |
| batch_size = "16" | |
| return [ | |
| ( | |
| "text2cypher_v4", | |
| [ | |
| py, | |
| "-m", | |
| "training.text2cypher_kan_v4", | |
| "--max-samples", | |
| str(max_samples), | |
| "--epochs", | |
| str(epochs), | |
| "--batch-size", | |
| batch_size, | |
| "--patience", | |
| patience, | |
| "--no-evolve", | |
| ], | |
| ["training/kan_bench_results/text2cypher_v4_results.json"], | |
| ), | |
| ( | |
| "spider2_v2", | |
| [ | |
| py, | |
| "-m", | |
| "training.spider2_kan_benchmark_v2", | |
| "--epochs", | |
| str(epochs), | |
| "--batch-size", | |
| batch_size, | |
| "--patience", | |
| patience, | |
| ], | |
| ["training/kan_bench_results/spider2_v2_results.json"], | |
| ), | |
| ( | |
| "unified_dialect", | |
| [ | |
| py, | |
| "-m", | |
| "training.unified_dialect_benchmark", | |
| "--epochs", | |
| str(epochs), | |
| "--batch-size", | |
| batch_size, | |
| "--patience", | |
| patience, | |
| "--max-hf", | |
| "0", | |
| ], | |
| ["training/kan_bench_results/unified_dialect_results.json"], | |
| ), | |
| ( | |
| "swebench_lite", | |
| [ | |
| py, | |
| "-m", | |
| "training.swebench_kan_benchmark", | |
| "--variant", | |
| "lite", | |
| "--split", | |
| "test", | |
| "--epochs", | |
| str(epochs), | |
| "--batch-size", | |
| batch_size, | |
| "--patience", | |
| patience, | |
| "--max-instances", | |
| str(max_samples), | |
| ], | |
| ["training/kan_bench_results/swebench_results.json"], | |
| ), | |
| ] | |
def main() -> int:
    """Run the benchmark matrix and write a JSON summary.

    Each step from ``_commands`` is executed in order. Unless
    ``--allow-failures`` is given, the run stops at the first step with a
    non-zero return code. A ``summary.json`` describing the environment,
    configuration, backend capabilities and per-step results is written to
    a timestamped directory under ``--out-dir``.

    Returns:
        ``0`` when every planned step ran and succeeded, or when
        ``--allow-failures`` was given; ``1`` otherwise.
    """
    parser = argparse.ArgumentParser(description="Portable benchmark matrix runner")
    parser.add_argument("--max-samples", type=int, default=16)
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--out-dir", default=str(DEFAULT_OUT))
    parser.add_argument("--allow-failures", action="store_true")
    args = parser.parse_args()

    # Resolve so that a relative --out-dir inside the repository can still
    # be reported relative to ROOT (which is itself a resolved path).
    run_dir = Path(args.out_dir).resolve() / f"benchmark-matrix-{_timestamp()}"
    run_dir.mkdir(parents=True, exist_ok=True)

    py = sys.executable
    # Build the plan once; it drives the loop and the completeness check.
    planned = _commands(py, args.max_samples, args.epochs)
    results: List[MatrixStep] = []
    for name, command, artifacts in planned:
        print(f"[run] {name}: {shlex.join(command)}", flush=True)
        result = _run_step(name, command, run_dir, artifacts)
        print(
            f"[done] {name}: rc={result.returncode} backend={result.used_backend} "
            f"mode={result.execution_mode} time={result.duration_s:.2f}s log={result.log_path}",
            flush=True,
        )
        results.append(result)
        if result.returncode != 0 and not args.allow_failures:
            break

    summary: Dict[str, object] = {
        "root": str(ROOT),
        "python": sys.executable,
        "python_version": sys.version,
        "platform": {
            "system": platform.system(),
            "release": platform.release(),
            "machine": platform.machine(),
        },
        "config": {
            "max_samples": args.max_samples,
            "epochs": args.epochs,
        },
        "capabilities": backend_capability_report(),
        "steps": [asdict(r) for r in results],
        # Success requires that no step was skipped by an early break.
        "success": len(results) == len(planned) and all(r.returncode == 0 for r in results),
    }
    summary_path = run_dir / "summary.json"
    summary_path.write_text(json.dumps(summary, indent=2))

    # The output directory may be outside the repository; relative_to()
    # raises ValueError then, so fall back to the full path instead of
    # crashing after the summary has been written.
    try:
        shown_path = summary_path.relative_to(ROOT)
    except ValueError:
        shown_path = summary_path
    print(f"[summary] {shown_path}", flush=True)
    return 0 if summary["success"] or args.allow_failures else 1
# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    sys.exit(main())