code-review-environment / scripts /run_benchmark.py
ashishbaberwal's picture
hf new add
7a23e48
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict, List
# Repository root: this script sits one directory below it.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Inference entry point that is launched once per benchmark task.
INFERENCE_PATH = REPO_ROOT.joinpath("inference.py")

# Task IDs evaluated when the caller does not pass --tasks.
DEFAULT_TASKS: List[str] = ["bug_detection_easy_1", "memory_leak_medium_1", "security_hard_1"]
def run_task(task_id: str, max_steps: int, output_dir: Path) -> Dict[str, Any]:
    """Run one benchmark task through the inference script and load its result.

    The script at ``INFERENCE_PATH`` is started with the current interpreter,
    using ``REPO_ROOT`` as working directory, and is asked to write its result
    to ``benchmark_<task_id>.json`` inside *output_dir*.

    Args:
        task_id: Identifier forwarded to the script as ``--task-id``.
        max_steps: Step budget forwarded to the script as ``--max-steps``.
        output_dir: Directory that receives the result file.

    Returns:
        The decoded JSON content of the result file.

    Raises:
        RuntimeError: If the child process exits with a non-zero status.
        FileNotFoundError: If the child process exits successfully but the
            result file does not exist afterwards.
    """
    output_path = output_dir / f"benchmark_{task_id}.json"

    # Remove any result left behind by a previous run so that a child that
    # exits 0 without writing cannot cause stale numbers to be reported.
    try:
        output_path.unlink()
    except FileNotFoundError:
        pass

    cmd = [
        sys.executable,
        str(INFERENCE_PATH),
        "--task-id",
        task_id,
        "--max-steps",
        str(max_steps),
        "--output",
        str(output_path),
    ]
    # check=False: the return code is inspected below so the error message can
    # carry the captured stdout and stderr.
    completed = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        cwd=str(REPO_ROOT),
        check=False,
    )
    if completed.returncode != 0:
        raise RuntimeError(
            f"Task {task_id} failed with exit code {completed.returncode}\n"
            f"stdout:\n{completed.stdout}\n\n"
            f"stderr:\n{completed.stderr}"
        )

    try:
        fh = output_path.open("r", encoding="utf-8")
    except FileNotFoundError as err:
        # Same exception type as before, but with enough context to debug.
        raise FileNotFoundError(
            f"Task {task_id} exited successfully but did not write {output_path}\n"
            f"stdout:\n{completed.stdout}\n\n"
            f"stderr:\n{completed.stderr}"
        ) from err
    with fh:
        return json.load(fh)
def _as_float(value: Any, default: float = 0.0) -> float:
    """Coerce a decoded metric to float, mapping ``None`` to *default*."""
    return default if value is None else float(value)


def to_markdown(results: List[Dict[str, Any]]) -> str:
    """Render benchmark results as a markdown report.

    The report contains a title, one table row per result, the average task
    score and total reward, and a closing note about the two metrics.

    Args:
        results: One mapping per task. The keys read are ``task_id``,
            ``task_score``, ``total_reward``, ``steps`` and ``model``; missing
            or ``None`` metrics count as 0, a missing ``steps`` is shown as
            0 and a missing ``model`` as ``unknown``.

    Returns:
        The markdown text, terminated by a newline. With no results the
        table has no rows and both averages are 0.

    Raises:
        ValueError: If a metric value cannot be converted to float.
    """
    lines = [
        "# Benchmark Results",
        "",
        "| Task | Task Score | Total Reward | Steps | Model |",
        "|---|---:|---:|---:|---|",
    ]

    scores: List[float] = []
    rewards: List[float] = []
    for row in results:
        # Coerce once and use the same values for the row and the averages,
        # so numeric strings are accepted in both places.
        score = _as_float(row.get("task_score"))
        reward = _as_float(row.get("total_reward"))
        scores.append(score)
        rewards.append(reward)
        lines.append(
            f"| {row.get('task_id')} | {score:.3f} | "
            f"{reward:.3f} | {row.get('steps', 0)} | {row.get('model', 'unknown')} |"
        )

    # max(1, ...) keeps the division defined for an empty result list.
    count = max(1, len(results))
    avg_score = sum(scores) / count
    avg_reward = sum(rewards) / count

    lines.extend(
        [
            "",
            f"Average task score: **{avg_score:.3f}**",
            f"Average total reward: **{avg_reward:.3f}**",
            "",
            "Note: task_score is normalized to [0, 1]; total_reward is cumulative and may exceed 1.0.",
        ]
    )
    return "\n".join(lines) + "\n"
def main() -> int:
    """Parse the command line, run every requested task and write the table.

    Each task is executed in turn via ``run_task``; the collected results are
    rendered with ``to_markdown`` and written to the ``--table`` path. The
    output directory and the table's parent directory are created if needed.

    Returns:
        0 once the table has been written.
    """
    cli = argparse.ArgumentParser(description="Run benchmark tasks and generate a markdown table")
    cli.add_argument(
        "--tasks",
        nargs="+",
        default=DEFAULT_TASKS,
        help="Task IDs to evaluate (default: 3 core tasks)",
    )
    cli.add_argument("--max-steps", type=int, default=10)
    cli.add_argument("--output-dir", type=Path, default=REPO_ROOT / "outputs")
    cli.add_argument("--table", type=Path, default=REPO_ROOT / "outputs/benchmark_table.md")
    options = cli.parse_args()

    # Both destinations must exist before any task writes its result.
    for directory in (options.output_dir, options.table.parent):
        directory.mkdir(parents=True, exist_ok=True)

    collected: List[Dict[str, Any]] = []
    for name in options.tasks:
        print(f"Running task: {name}")
        collected.append(run_task(name, options.max_steps, options.output_dir))

    table_path = options.table
    table_path.write_text(to_markdown(collected), encoding="utf-8")
    print(f"Wrote benchmark table to {table_path}")
    return 0
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    sys.exit(main())