Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import subprocess | |
| import sys | |
| from pathlib import Path | |
| from typing import Any, Dict, List | |
# Parent of the directory that contains this script; used below as the
# working directory for child processes and as the base for default paths.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Script that is executed once per benchmark task.
INFERENCE_PATH = REPO_ROOT.joinpath("inference.py")

# Task IDs evaluated when the caller does not pass --tasks.
DEFAULT_TASKS = ["bug_detection_easy_1", "memory_leak_medium_1", "security_hard_1"]
def run_task(task_id: str, max_steps: int, output_dir: Path) -> Dict[str, Any]:
    """Run the inference script for one task and return the JSON it wrote.

    The script at ``INFERENCE_PATH`` is launched with the current Python
    interpreter, using ``REPO_ROOT`` as its working directory, and is told to
    write its result to ``output_dir / "benchmark_<task_id>.json"``.

    Args:
        task_id: Identifier passed to the script via ``--task-id``.
        max_steps: Value passed to the script via ``--max-steps``.
        output_dir: Directory in which the per-task JSON file is expected.

    Returns:
        The parsed contents of the per-task JSON file.

    Raises:
        RuntimeError: If the child process exits with a non-zero code. The
            message carries the captured stdout and stderr.
    """
    result_file = output_dir / f"benchmark_{task_id}.json"

    argv = [sys.executable, str(INFERENCE_PATH)]
    argv += ["--task-id", task_id]
    argv += ["--max-steps", str(max_steps)]
    argv += ["--output", str(result_file)]

    # Output is captured (not streamed) so it can be attached to the error.
    proc = subprocess.run(argv, cwd=str(REPO_ROOT), text=True, capture_output=True)

    if proc.returncode == 0:
        return json.loads(result_file.read_text(encoding="utf-8"))

    raise RuntimeError(
        f"Task {task_id} failed with exit code {proc.returncode}\n"
        f"stdout:\n{proc.stdout}\n\n"
        f"stderr:\n{proc.stderr}"
    )
def _as_float(value: Any) -> float:
    """Coerce a metric taken from a result dict to ``float``.

    ``None`` (a missing key or a JSON ``null``) counts as ``0.0``. Every other
    value goes through ``float()``, so numeric strings are accepted and
    non-numeric values still raise ``TypeError`` / ``ValueError``.
    """
    return 0.0 if value is None else float(value)


def to_markdown(results: List[Dict[str, Any]]) -> str:
    """Render benchmark results as a markdown report.

    The report contains a title, one table row per result, the average task
    score and average total reward, and a closing note.

    Args:
        results: One dict per task. The keys read are ``task_id``,
            ``task_score``, ``total_reward``, ``steps`` and ``model``; all of
            them are optional.

    Returns:
        The markdown text, terminated by a single newline. With no results
        the table has no rows and both averages are reported as ``0.000``.

    Raises:
        TypeError, ValueError: If a score or reward is neither ``None`` nor
            convertible with ``float()``.
    """
    lines = [
        "# Benchmark Results",
        "",
        "| Task | Task Score | Total Reward | Steps | Model |",
        "|---|---:|---:|---:|---|",
    ]

    scores: List[float] = []
    rewards: List[float] = []
    for row in results:
        # Coerce once and reuse for both the row and the averages, so the
        # two can never disagree about which values are acceptable.
        score = _as_float(row.get("task_score"))
        reward = _as_float(row.get("total_reward"))
        scores.append(score)
        rewards.append(reward)
        lines.append(
            f"| {row.get('task_id')} | {score:.3f} | "
            f"{reward:.3f} | {row.get('steps', 0)} | {row.get('model', 'unknown')} |"
        )

    # max(1, ...) keeps the empty-results case from dividing by zero.
    count = max(1, len(results))
    avg_score = sum(scores) / count
    avg_reward = sum(rewards) / count

    lines.extend(
        [
            "",
            f"Average task score: **{avg_score:.3f}**",
            f"Average total reward: **{avg_reward:.3f}**",
            "",
            "Note: task_score is normalized to [0, 1]; total_reward is cumulative and may exceed 1.0.",
        ]
    )
    return "\n".join(lines) + "\n"
def main() -> int:
    """Run the selected benchmark tasks and write a markdown summary table.

    Command-line options:
        --tasks: One or more task IDs (defaults to ``DEFAULT_TASKS``).
        --max-steps: Integer forwarded to each task run (default 10).
        --output-dir: Directory for per-task results.
        --table: Path of the markdown table to write.

    Tasks run one after another in the order given. Exceptions raised while
    running a task are not caught here, so the table is only written when
    every task has completed.

    Returns:
        ``0`` once the table has been written.
    """
    cli = argparse.ArgumentParser(description="Run benchmark tasks and generate a markdown table")
    cli.add_argument("--tasks", nargs="+", default=DEFAULT_TASKS, help="Task IDs to evaluate (default: 3 core tasks)")
    cli.add_argument("--max-steps", type=int, default=10)
    cli.add_argument("--output-dir", type=Path, default=REPO_ROOT / "outputs")
    cli.add_argument("--table", type=Path, default=REPO_ROOT / "outputs/benchmark_table.md")
    opts = cli.parse_args()

    # Make sure both destinations exist before any task is started.
    for directory in (opts.output_dir, opts.table.parent):
        directory.mkdir(parents=True, exist_ok=True)

    collected: List[Dict[str, Any]] = []
    for current in opts.tasks:
        print(f"Running task: {current}")
        collected.append(run_task(current, opts.max_steps, opts.output_dir))

    opts.table.write_text(to_markdown(collected), encoding="utf-8")
    print(f"Wrote benchmark table to {opts.table}")
    return 0
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())