#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict, List
# Repository root: the directory one level above the folder holding this file.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Script that run_task launches once per task.
INFERENCE_PATH = REPO_ROOT.joinpath("inference.py")

# Task IDs evaluated when --tasks is not supplied on the command line.
DEFAULT_TASKS = ["bug_detection_easy_1", "memory_leak_medium_1", "security_hard_1"]
def run_task(task_id: str, max_steps: int, output_dir: Path) -> Dict[str, Any]:
    """Run the inference script for one task and return the JSON it produced.

    The script at INFERENCE_PATH is started with the current Python
    interpreter, with REPO_ROOT as its working directory, and is told to
    write to ``output_dir / "benchmark_<task_id>.json"`` via ``--output``.

    Args:
        task_id: Value passed to the script as ``--task-id``.
        max_steps: Value passed to the script as ``--max-steps``.
        output_dir: Directory in which the per-task JSON file is expected.

    Returns:
        The decoded contents of the per-task JSON file.

    Raises:
        RuntimeError: If the subprocess exits with a non-zero status; the
            message includes the captured stdout and stderr.
    """
    result_file = output_dir / f"benchmark_{task_id}.json"
    script_args = {
        "--task-id": task_id,
        "--max-steps": str(max_steps),
        "--output": str(result_file),
    }

    # Flatten the flag/value pairs after the interpreter and script path.
    command = [sys.executable, str(INFERENCE_PATH)]
    for flag, value in script_args.items():
        command += [flag, value]

    proc = subprocess.run(command, capture_output=True, text=True, cwd=str(REPO_ROOT))
    if proc.returncode != 0:
        message = (
            f"Task {task_id} failed with exit code {proc.returncode}\n"
            f"stdout:\n{proc.stdout}\n\n"
            f"stderr:\n{proc.stderr}"
        )
        raise RuntimeError(message)

    with result_file.open("r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload
def to_markdown(results: List[Dict[str, Any]]) -> str:
    """Render benchmark results as a Markdown table followed by averages.

    Each result contributes one table row built from its ``task_id``,
    ``task_score``, ``total_reward``, ``steps`` and ``model`` entries.
    ``task_score`` and ``total_reward`` are coerced with ``float()`` once and
    the same coerced values feed both the row and the averages, so a value the
    averages accept (for example the string ``"0.5"``) can no longer fail when
    the row is formatted.

    Args:
        results: One mapping per task. Missing or ``None`` numeric fields
            count as 0; a missing ``steps`` renders as 0 and a missing
            ``model`` as ``unknown``.

    Returns:
        The Markdown document, terminated by a single newline. With an empty
        ``results`` list the table has no rows and both averages are 0.000.

    Raises:
        ValueError: If a numeric field holds a string ``float()`` cannot parse.
        TypeError: If a numeric field holds a non-numeric, non-string value.
    """

    def _number(row: Dict[str, Any], key: str) -> float:
        # Treat an explicit null the same as a missing key (both count as 0).
        value = row.get(key)
        return 0.0 if value is None else float(value)

    scores = [_number(row, "task_score") for row in results]
    rewards = [_number(row, "total_reward") for row in results]

    lines = [
        "# Benchmark Results",
        "",
        "| Task | Task Score | Total Reward | Steps | Model |",
        "|---|---:|---:|---:|---|",
    ]
    for row, score, reward in zip(results, scores, rewards):
        lines.append(
            f"| {row.get('task_id')} | {score:.3f} | "
            f"{reward:.3f} | {row.get('steps', 0)} | {row.get('model', 'unknown')} |"
        )

    # max(1, ...) keeps the division defined when there are no results.
    count = max(1, len(results))
    avg_score = sum(scores) / count
    avg_reward = sum(rewards) / count
    lines.extend(
        [
            "",
            f"Average task score: **{avg_score:.3f}**",
            f"Average total reward: **{avg_reward:.3f}**",
            "",
            "Note: task_score is normalized to [0, 1]; total_reward is cumulative and may exceed 1.0.",
        ]
    )
    return "\n".join(lines) + "\n"
def main() -> int:
    """Parse the command line, run every requested task, and write the table.

    Options: ``--tasks`` (one or more task IDs, defaulting to DEFAULT_TASKS),
    ``--max-steps`` (int, default 10), ``--output-dir`` (default
    ``REPO_ROOT / "outputs"``) and ``--table`` (default
    ``REPO_ROOT / "outputs/benchmark_table.md"``).

    Returns:
        0 once the Markdown table has been written. An exception raised by
        run_task is not caught here.
    """
    cli = argparse.ArgumentParser(description="Run benchmark tasks and generate a markdown table")
    cli.add_argument(
        "--tasks",
        nargs="+",
        default=DEFAULT_TASKS,
        help="Task IDs to evaluate (default: 3 core tasks)",
    )
    cli.add_argument("--max-steps", type=int, default=10)
    cli.add_argument("--output-dir", type=Path, default=REPO_ROOT / "outputs")
    cli.add_argument("--table", type=Path, default=REPO_ROOT / "outputs/benchmark_table.md")
    opts = cli.parse_args()

    # Make sure both destinations exist before any task is launched.
    for directory in (opts.output_dir, opts.table.parent):
        directory.mkdir(parents=True, exist_ok=True)

    collected: List[Dict[str, Any]] = []
    for task_id in opts.tasks:
        print(f"Running task: {task_id}")
        collected.append(run_task(task_id, opts.max_steps, opts.output_dir))

    opts.table.write_text(to_markdown(collected), encoding="utf-8")
    print(f"Wrote benchmark table to {opts.table}")
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())