#!/usr/bin/env python3
"""Run benchmark tasks through ``inference.py`` and summarize them in markdown.

Each task is executed in its own subprocess. The JSON file at the ``--output``
path handed to that subprocess is then loaded, and the collected results are
rendered as a markdown table followed by the average score and reward.
"""
from __future__ import annotations

import argparse
import json
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict, List

# Two levels up from this file: the parent of the directory containing it.
REPO_ROOT = Path(__file__).resolve().parents[1]
INFERENCE_PATH = REPO_ROOT / "inference.py"

# Tasks evaluated when ``--tasks`` is not given on the command line.
DEFAULT_TASKS = [
    "bug_detection_easy_1",
    "memory_leak_medium_1",
    "security_hard_1",
]


def run_task(task_id: str, max_steps: int, output_dir: Path) -> Dict[str, Any]:
    """Run one task via the inference script and return its parsed JSON result.

    Args:
        task_id: Identifier passed to the script as ``--task-id``.
        max_steps: Step limit passed to the script as ``--max-steps``.
        output_dir: Directory in which ``benchmark_<task_id>.json`` is expected.

    Returns:
        The object decoded from the task's JSON output file.

    Raises:
        RuntimeError: If the subprocess exits with a non-zero status; the
            message carries the captured stdout and stderr.
        FileNotFoundError: If the output file does not exist after the run.
        json.JSONDecodeError: If the output file is not valid JSON.
    """
    output_path = output_dir / f"benchmark_{task_id}.json"
    cmd = [
        sys.executable,
        str(INFERENCE_PATH),
        "--task-id",
        task_id,
        "--max-steps",
        str(max_steps),
        "--output",
        str(output_path),
    ]
    # Output is captured rather than streamed so it can be attached to the
    # error raised below.
    completed = subprocess.run(cmd, capture_output=True, text=True, cwd=str(REPO_ROOT))
    if completed.returncode != 0:
        raise RuntimeError(
            f"Task {task_id} failed with exit code {completed.returncode}\n"
            f"stdout:\n{completed.stdout}\n\n"
            f"stderr:\n{completed.stderr}"
        )

    with output_path.open("r", encoding="utf-8") as fh:
        return json.load(fh)


def _as_float(value: Any) -> float:
    """Coerce a metric value to ``float``, treating ``None`` like a missing key."""
    if value is None:
        return 0.0
    # Non-numeric input still raises so corrupt results are not silently hidden.
    return float(value)


def to_markdown(results: List[Dict[str, Any]]) -> str:
    """Render benchmark results as a markdown table followed by averages.

    Table rows and averages share one numeric coercion, so a value that can be
    averaged can always be printed too (for example a numeric string, which
    previously crashed the row formatting but not the averaging). A ``None``
    value is treated the same as an absent key, i.e. as ``0``.

    Args:
        results: One dict per task. The keys read are ``task_id``,
            ``task_score``, ``total_reward``, ``steps`` and ``model``; absent
            keys fall back to ``None``, ``0``, ``0``, ``0`` and ``"unknown"``.

    Returns:
        The markdown document, terminated by a newline.

    Raises:
        ValueError: If a score or reward cannot be converted to ``float``.
        TypeError: If a score or reward has a type ``float()`` does not accept.
    """
    lines = [
        "# Benchmark Results",
        "",
        "| Task | Task Score | Total Reward | Steps | Model |",
        "|---|---:|---:|---:|---|",
    ]

    score_sum = 0.0
    reward_sum = 0.0
    for row in results:
        score = _as_float(row.get("task_score", 0))
        reward = _as_float(row.get("total_reward", 0))
        score_sum += score
        reward_sum += reward
        lines.append(
            f"| {row.get('task_id')} | {score:.3f} | "
            f"{reward:.3f} | {row.get('steps', 0)} | {row.get('model', 'unknown')} |"
        )

    # Clamp the divisor so an empty result list reports 0.000 instead of
    # raising ZeroDivisionError.
    count = max(1, len(results))
    avg_score = score_sum / count
    avg_reward = reward_sum / count

    lines.extend(
        [
            "",
            f"Average task score: **{avg_score:.3f}**",
            f"Average total reward: **{avg_reward:.3f}**",
            "",
            "Note: task_score is normalized to [0, 1]; total_reward is cumulative and may exceed 1.0.",
        ]
    )
    return "\n".join(lines) + "\n"


def main() -> int:
    """Parse CLI arguments, run every requested task and write the table.

    Tasks run sequentially; an exception from ``run_task`` propagates and stops
    the run before the table is written.

    Returns:
        ``0`` once the table has been written.
    """
    parser = argparse.ArgumentParser(description="Run benchmark tasks and generate a markdown table")
    parser.add_argument(
        "--tasks",
        nargs="+",
        default=DEFAULT_TASKS,
        help="Task IDs to evaluate (default: 3 core tasks)",
    )
    parser.add_argument("--max-steps", type=int, default=10)
    parser.add_argument("--output-dir", type=Path, default=REPO_ROOT / "outputs")
    parser.add_argument("--table", type=Path, default=REPO_ROOT / "outputs/benchmark_table.md")
    args = parser.parse_args()

    # Both destinations may live in directories that do not exist yet.
    args.output_dir.mkdir(parents=True, exist_ok=True)
    args.table.parent.mkdir(parents=True, exist_ok=True)

    results: List[Dict[str, Any]] = []
    for task_id in args.tasks:
        print(f"Running task: {task_id}")
        result = run_task(task_id, args.max_steps, args.output_dir)
        results.append(result)

    table = to_markdown(results)
    args.table.write_text(table, encoding="utf-8")
    print(f"Wrote benchmark table to {args.table}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())