Spaces:
Runtime error
Runtime error
| #!/usr/bin/env python3 | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import re | |
| import sys | |
| from pathlib import Path | |
| from typing import Any, Callable | |
| REPO_ROOT = Path(__file__).resolve().parents[1] | |
| if str(REPO_ROOT) not in sys.path: | |
| sys.path.insert(0, str(REPO_ROOT)) | |
| LEDGER_TEMPLATE_PATH = REPO_ROOT / "artifacts" / "benchmark_ledger.template.json" | |
| from scripts.hydra_generation import build_hydra_generator | |
| from scripts.benchmark_datasets import resolve_benchmark_dataset as resolve_canonical_dataset | |
| from scripts.benchmark_suite import build_prompt, validate_sample | |
| def load_jsonl_samples(path: Path) -> list[dict[str, Any]]: | |
| rows: list[dict[str, Any]] = [] | |
| for line in path.read_text(encoding="utf-8").splitlines(): | |
| if line.strip(): | |
| rows.append(json.loads(line)) | |
| return rows | |
| def _score_mbpp(samples: list[dict[str, Any]], generate_fn: Callable[[str], str]) -> float: | |
| passed = 0 | |
| for sample in samples: | |
| validate_sample("MBPP", sample) | |
| code = generate_fn(build_prompt("MBPP", sample)) | |
| namespace: dict[str, Any] = {} | |
| exec(code, namespace, namespace) | |
| for test in sample["tests"]: | |
| exec(test, namespace, namespace) | |
| passed += 1 | |
| return passed / len(samples) if samples else 0.0 | |
| def _extract_last_number(text: str) -> str | None: | |
| matches = re.findall(r"-?\d+(?:\.\d+)?", text) | |
| return matches[-1] if matches else None | |
| def _score_gsm8k(samples: list[dict[str, Any]], generate_fn: Callable[[str], str]) -> float: | |
| passed = 0 | |
| for sample in samples: | |
| validate_sample("GSM8K", sample) | |
| output = generate_fn(build_prompt("GSM8K", sample)) | |
| pred = _extract_last_number(output) | |
| if pred is not None and pred == str(sample["answer"]): | |
| passed += 1 | |
| return passed / len(samples) if samples else 0.0 | |
| def _score_humaneval(samples: list[dict[str, Any]], generate_fn: Callable[[str], str]) -> float: | |
| passed = 0 | |
| for sample in samples: | |
| validate_sample("HumanEval", sample) | |
| code = generate_fn(build_prompt("HumanEval", sample)) | |
| namespace: dict[str, Any] = {} | |
| exec(code, namespace, namespace) | |
| exec(sample["test"], namespace, namespace) | |
| passed += 1 | |
| return passed / len(samples) if samples else 0.0 | |
| def _score_arc(samples: list[dict[str, Any]], generate_fn: Callable[[str], str]) -> float: | |
| passed = 0 | |
| for sample in samples: | |
| validate_sample("ARC-Challenge", sample) | |
| output = generate_fn(build_prompt("ARC-Challenge", sample)).strip() | |
| if output == str(sample["answer"]): | |
| passed += 1 | |
| return passed / len(samples) if samples else 0.0 | |
| def run_benchmark(benchmark_name: str, path: Path, generate_fn: Callable[[str], str]) -> dict[str, Any]: | |
| samples = load_jsonl_samples(path) | |
| if benchmark_name == "MBPP": | |
| return { | |
| "benchmark": "MBPP", | |
| "primary_metric": "pass_at_1", | |
| "score": _score_mbpp(samples, generate_fn), | |
| "n_samples": len(samples), | |
| } | |
| if benchmark_name == "GSM8K": | |
| return { | |
| "benchmark": "GSM8K", | |
| "primary_metric": "exact_match", | |
| "score": _score_gsm8k(samples, generate_fn), | |
| "n_samples": len(samples), | |
| } | |
| if benchmark_name == "HumanEval": | |
| return { | |
| "benchmark": "HumanEval", | |
| "primary_metric": "pass_at_1", | |
| "score": _score_humaneval(samples, generate_fn), | |
| "n_samples": len(samples), | |
| } | |
| if benchmark_name == "ARC-Challenge": | |
| return { | |
| "benchmark": "ARC-Challenge", | |
| "primary_metric": "accuracy", | |
| "score": _score_arc(samples, generate_fn), | |
| "n_samples": len(samples), | |
| } | |
| raise ValueError(f"Unsupported runnable benchmark: {benchmark_name}") | |
| def write_benchmark_result(path: Path, payload: dict[str, Any]) -> None: | |
| path.parent.mkdir(parents=True, exist_ok=True) | |
| path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") | |
| def append_benchmark_run_record( | |
| ledger_path: Path, | |
| result: dict[str, Any], | |
| *, | |
| benchmark_name: str, | |
| variant: str, | |
| seed: int, | |
| samples_path: Path, | |
| ) -> None: | |
| if not ledger_path.exists(): | |
| ledger_path.parent.mkdir(parents=True, exist_ok=True) | |
| ledger_path.write_text(LEDGER_TEMPLATE_PATH.read_text(encoding="utf-8"), encoding="utf-8") | |
| payload = json.loads(ledger_path.read_text(encoding="utf-8")) | |
| run_records = payload.setdefault("run_records", []) | |
| if len(run_records) == 1 and run_records[0].get("run_id") == "example-run-0001": | |
| run_records.clear() | |
| run_records.append( | |
| { | |
| "run_id": result.get("run_id", f"{benchmark_name.lower()}-{seed}"), | |
| "commit": "HEAD", | |
| "model_family": "hydra", | |
| "variant": variant, | |
| "seed": seed, | |
| "hardware": { | |
| "hardware_class": payload.get("benchmark_cycle", {}).get("hardware_class", "unknown"), | |
| }, | |
| "budget": { | |
| "budget_mode": payload.get("benchmark_cycle", {}).get("budget_modes", [None])[0], | |
| }, | |
| "capability": { | |
| "coding_score": result["score"] if benchmark_name in {"MBPP", "HumanEval"} else None, | |
| "reasoning_score": result["score"] if benchmark_name in {"GSM8K", "ARC-Challenge"} else None, | |
| }, | |
| "artifacts": { | |
| "samples_path": str(samples_path), | |
| }, | |
| } | |
| ) | |
| ledger_path.write_text(json.dumps(payload, indent=2, sort_keys=True), encoding="utf-8") | |
| def resolve_samples_path(benchmark_name: str, samples: Path | None, suite_path: Path) -> Path: | |
| if samples is not None: | |
| return samples | |
| payload = json.loads(suite_path.read_text(encoding="utf-8")) | |
| for section in ("coding_benchmarks", "reasoning_benchmarks"): | |
| if section not in payload: | |
| continue | |
| for slot in ("fast_iteration", "milestone"): | |
| entry = payload[section].get(slot) | |
| if isinstance(entry, dict) and entry.get("name") == benchmark_name and "sample_path" in entry: | |
| return Path(entry["sample_path"]) | |
| try: | |
| return resolve_canonical_dataset(benchmark_name, None) | |
| except ValueError: | |
| raise ValueError(f"No sample path found for benchmark: {benchmark_name}") | |
| def parse_args(argv: list[str] | None = None) -> argparse.Namespace: | |
| parser = argparse.ArgumentParser(description="Run a local benchmark against JSONL samples") | |
| parser.add_argument("--benchmark", required=True, choices=["MBPP", "GSM8K", "HumanEval", "ARC-Challenge"]) | |
| parser.add_argument("--samples", type=Path) | |
| parser.add_argument("--suite", type=Path, default=REPO_ROOT / "artifacts" / "benchmark_suite.cycle1.json") | |
| parser.add_argument("--out", type=Path) | |
| parser.add_argument("--ledger", type=Path) | |
| parser.add_argument("--variant", default="hydra_full") | |
| parser.add_argument("--seed", type=int, default=42) | |
| parser.add_argument("--generator-mode", choices=["stub", "hydra"], default="stub") | |
| parser.add_argument("--checkpoint", type=Path) | |
| parser.add_argument("--device") | |
| parser.add_argument("--max-new-tokens", type=int, default=256) | |
| parser.add_argument("--temperature", type=float, default=0.2) | |
| parser.add_argument("--top-p", type=float, default=0.95) | |
| return parser.parse_args(argv) | |
| def main(argv: list[str] | None = None) -> int: | |
| args = parse_args(argv) | |
| sample_path = resolve_samples_path(args.benchmark, args.samples, args.suite) | |
| try: | |
| if args.generator_mode == "hydra": | |
| generator = build_hydra_generator( | |
| checkpoint_path=args.checkpoint, | |
| device=args.device, | |
| max_new_tokens=args.max_new_tokens, | |
| temperature=args.temperature, | |
| top_p=args.top_p, | |
| ) | |
| else: | |
| def generator(prompt: str) -> str: | |
| return prompt | |
| result = run_benchmark(args.benchmark, sample_path, generator) | |
| exit_code = 0 | |
| except FileNotFoundError as exc: | |
| result = { | |
| "benchmark": args.benchmark, | |
| "status": "failed", | |
| "failure_type": "missing_checkpoint", | |
| "error": str(exc), | |
| "n_samples": 0, | |
| } | |
| exit_code = 1 | |
| except Exception as exc: # noqa: BLE001 | |
| result = { | |
| "benchmark": args.benchmark, | |
| "status": "failed", | |
| "failure_type": type(exc).__name__, | |
| "error": str(exc), | |
| "n_samples": 0, | |
| } | |
| exit_code = 1 | |
| if args.out is not None: | |
| write_benchmark_result(args.out, result) | |
| if args.ledger is not None and exit_code == 0: | |
| append_benchmark_run_record( | |
| args.ledger, | |
| result, | |
| benchmark_name=args.benchmark, | |
| variant=args.variant, | |
| seed=args.seed, | |
| samples_path=sample_path, | |
| ) | |
| print(json.dumps(result, indent=2, sort_keys=True)) | |
| return exit_code | |
| if __name__ == "__main__": | |
| raise SystemExit(main()) | |