# ane-kan-runtime / scripts / benchmark_matrix.py
# Deployed to the ANE KAN runtime Space by JohnGenetica (commit 201cf4d, verified).
#!/usr/bin/env python3
"""Portable benchmark matrix runner for the repo.
Runs a small reproducible subset of benchmark entrypoints and records
per-command logs plus a summary manifest. This is intended to answer:
which benchmark surfaces are healthy on this machine right now?
"""
from __future__ import annotations
import argparse
import json
import platform
import shlex
import subprocess
import sys
import time
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Dict, List, Tuple
from training.core.runtime_contract import ExecutionMode
from training.core.unified_backend import backend_capability_report
ROOT = Path(__file__).resolve().parent.parent
DEFAULT_OUT = ROOT / "verification_runs"
@dataclass
class MatrixStep:
    """Record of one executed benchmark command and how it was served."""

    name: str  # step identifier (also used as the log file stem)
    command: List[str]  # argv list that was executed
    returncode: int  # subprocess exit status (0 == success)
    duration_s: float  # wall-clock runtime of the command, in seconds
    log_path: str  # repo-relative path to the captured stdout/stderr log
    requested_backend: str  # backend the step asked for ("max" or "cpu")
    used_backend: str  # backend actually used after availability fallback
    execution_mode: str  # execution mode reported by the capability report
    artifact_paths: List[str]  # repo-relative result files the step is expected to write
def _timestamp() -> str:
return time.strftime("%Y%m%d-%H%M%S")
def _step_backend(name: str) -> str:
if "max" in name:
return "max"
return "cpu"
def _run_step(name: str, command: List[str], run_dir: Path, artifacts: List[str]) -> MatrixStep:
    """Execute one benchmark command and summarize the outcome.

    Runs *command* from the repo root with stderr merged into stdout, writes
    the combined output to ``<run_dir>/<name>.log``, then consults the backend
    capability report to record which backend actually served the request.

    Args:
        name: Step identifier; also selects the requested backend (see
            ``_step_backend``) and names the log file.
        command: argv list executed via ``subprocess.run`` (no shell).
        run_dir: Directory that receives the per-step log file.
        artifacts: Repo-relative paths the step is expected to produce;
            recorded verbatim on the returned step.

    Returns:
        A ``MatrixStep`` describing the command, exit status, timing, log
        location, and backend resolution.
    """
    log_path = run_dir / f"{name}.log"
    started = time.time()
    proc = subprocess.run(
        command,
        cwd=ROOT,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr so the log preserves ordering
        text=True,
    )
    duration = time.time() - started
    # Explicit encoding: benchmark output may contain non-ASCII characters,
    # and the platform default encoding (e.g. cp1252 on Windows) could raise
    # UnicodeEncodeError and lose the log.
    log_path.write_text(proc.stdout, encoding="utf-8")
    capabilities = backend_capability_report()
    requested_backend = _step_backend(name)
    # NOTE(review): the capability report's schema ("available",
    # "fallback_backend", "execution_mode") is assumed from usage here —
    # confirm against training.core.unified_backend.
    capability = capabilities.get(requested_backend, {})
    # Fall back to the report's advertised fallback (or "cpu") when the
    # requested backend is unavailable on this machine.
    used_backend = requested_backend if capability.get("available") else str(
        capability.get("fallback_backend") or "cpu"
    )
    execution_mode = str(
        capability.get("execution_mode", ExecutionMode.CPU_FALLBACK.value)
    )
    return MatrixStep(
        name=name,
        command=command,
        returncode=proc.returncode,
        duration_s=duration,
        log_path=str(log_path.relative_to(ROOT)),
        requested_backend=requested_backend,
        used_backend=used_backend,
        execution_mode=execution_mode,
        artifact_paths=artifacts,
    )
def _commands(py: str, max_samples: int, epochs: int) -> List[Tuple[str, List[str], List[str]]]:
patience = "2"
batch_size = "16"
return [
(
"text2cypher_v4",
[
py,
"-m",
"training.text2cypher_kan_v4",
"--max-samples",
str(max_samples),
"--epochs",
str(epochs),
"--batch-size",
batch_size,
"--patience",
patience,
"--no-evolve",
],
["training/kan_bench_results/text2cypher_v4_results.json"],
),
(
"spider2_v2",
[
py,
"-m",
"training.spider2_kan_benchmark_v2",
"--epochs",
str(epochs),
"--batch-size",
batch_size,
"--patience",
patience,
],
["training/kan_bench_results/spider2_v2_results.json"],
),
(
"unified_dialect",
[
py,
"-m",
"training.unified_dialect_benchmark",
"--epochs",
str(epochs),
"--batch-size",
batch_size,
"--patience",
patience,
"--max-hf",
"0",
],
["training/kan_bench_results/unified_dialect_results.json"],
),
(
"swebench_lite",
[
py,
"-m",
"training.swebench_kan_benchmark",
"--variant",
"lite",
"--split",
"test",
"--epochs",
str(epochs),
"--batch-size",
batch_size,
"--patience",
patience,
"--max-instances",
str(max_samples),
],
["training/kan_bench_results/swebench_results.json"],
),
]
def main() -> int:
    """CLI entry point: run each benchmark step and write a summary manifest.

    Creates a timestamped run directory under ``--out-dir``, executes the
    command matrix (stopping at the first failure unless ``--allow-failures``
    is set), and writes ``summary.json`` describing the environment, the
    per-step results, and an overall ``success`` flag.

    Returns:
        0 when every step ran and exited cleanly (or ``--allow-failures``
        was given), 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Portable benchmark matrix runner")
    parser.add_argument("--max-samples", type=int, default=16)
    parser.add_argument("--epochs", type=int, default=1)
    parser.add_argument("--out-dir", default=str(DEFAULT_OUT))
    parser.add_argument("--allow-failures", action="store_true")
    args = parser.parse_args()

    run_dir = Path(args.out_dir) / f"benchmark-matrix-{_timestamp()}"
    run_dir.mkdir(parents=True, exist_ok=True)

    py = sys.executable
    # Build the matrix once; the original recomputed it a second time just to
    # count steps for the success check, which doubled the work and risked drift.
    steps = _commands(py, args.max_samples, args.epochs)
    results: List[MatrixStep] = []
    for name, command, artifacts in steps:
        print(f"[run] {name}: {shlex.join(command)}", flush=True)
        result = _run_step(name, command, run_dir, artifacts)
        print(
            f"[done] {name}: rc={result.returncode} backend={result.used_backend} "
            f"mode={result.execution_mode} time={result.duration_s:.2f}s log={result.log_path}",
            flush=True,
        )
        results.append(result)
        if result.returncode != 0 and not args.allow_failures:
            # Fail fast: later steps are unlikely to be meaningful after a failure.
            break

    summary: Dict[str, object] = {
        "root": str(ROOT),
        "python": sys.executable,
        "python_version": sys.version,
        "platform": {
            "system": platform.system(),
            "release": platform.release(),
            "machine": platform.machine(),
        },
        "config": {
            "max_samples": args.max_samples,
            "epochs": args.epochs,
        },
        "capabilities": backend_capability_report(),
        "steps": [asdict(r) for r in results],
        # Success requires every recorded step to pass AND no step to have
        # been skipped by the fail-fast break above.
        "success": all(r.returncode == 0 for r in results) and len(results) == len(steps),
    }
    summary_path = run_dir / "summary.json"
    # Explicit UTF-8 so the manifest is portable regardless of locale defaults.
    summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
    # NOTE(review): relative_to(ROOT) raises ValueError if --out-dir points
    # outside the repo; the default DEFAULT_OUT is always inside ROOT.
    print(f"[summary] {summary_path.relative_to(ROOT)}", flush=True)
    return 0 if summary["success"] or args.allow_failures else 1
if __name__ == "__main__":
raise SystemExit(main())