File size: 3,193 Bytes
7a23e48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python3
from __future__ import annotations

import argparse
import json
import subprocess
import sys
from pathlib import Path
from typing import Any, Dict, List

# Directory one level above the folder that contains this script.
REPO_ROOT = Path(__file__).resolve().parents[1]
# The inference entry point that each benchmark task is delegated to.
INFERENCE_PATH = REPO_ROOT.joinpath("inference.py")

# Task IDs evaluated when --tasks is not given on the command line.
DEFAULT_TASKS = ["bug_detection_easy_1", "memory_leak_medium_1", "security_hard_1"]


def run_task(task_id: str, max_steps: int, output_dir: Path) -> Dict[str, Any]:
    """Run the inference script for one task and return its parsed JSON output.

    The script at ``INFERENCE_PATH`` is launched with the current Python
    interpreter, using ``REPO_ROOT`` as the working directory. It is asked to
    write its result to ``<output_dir>/benchmark_<task_id>.json``, which is
    then read back and decoded.

    Args:
        task_id: Identifier passed to the script via ``--task-id``.
        max_steps: Value passed to the script via ``--max-steps``.
        output_dir: Directory in which the per-task JSON file is expected.

    Returns:
        The decoded contents of the per-task JSON file.

    Raises:
        RuntimeError: If the child process exits with a non-zero status; the
            message includes the captured stdout and stderr.
    """
    result_file = output_dir / f"benchmark_{task_id}.json"

    argv = [sys.executable, str(INFERENCE_PATH)]
    argv += ["--task-id", task_id]
    argv += ["--max-steps", str(max_steps)]
    argv += ["--output", str(result_file)]

    # Capture both streams as text so they can be surfaced on failure.
    proc = subprocess.run(argv, capture_output=True, text=True, cwd=str(REPO_ROOT))
    if proc.returncode != 0:
        message = (
            f"Task {task_id} failed with exit code {proc.returncode}\n"
            f"stdout:\n{proc.stdout}\n\n"
            f"stderr:\n{proc.stderr}"
        )
        raise RuntimeError(message)

    return json.loads(result_file.read_text(encoding="utf-8"))


def _as_float(value: Any) -> float:
    """Coerce a metric taken from a result dict to float, treating None as 0.0."""
    return 0.0 if value is None else float(value)


def _cell(value: Any) -> str:
    """Render a value as table-cell text, escaping pipes so they cannot split the cell."""
    return str(value).replace("|", "\\|")


def to_markdown(results: List[Dict[str, Any]]) -> str:
    """Render benchmark results as a markdown report.

    The report consists of a heading, a table with one row per result, the
    average task score and average total reward, and an explanatory note.

    Each metric is coerced to ``float`` exactly once and that value is used
    for both the table row and the averages, so a numeric string such as
    ``"0.5"`` is handled the same way in both places. A metric that is
    missing or ``None`` counts as 0.

    Args:
        results: Result dicts. The keys read are ``task_id``, ``task_score``,
            ``total_reward``, ``steps`` and ``model``; all are optional.

    Returns:
        The markdown text, terminated by a newline. With no results the table
        has no rows and both averages are reported as 0.000.

    Raises:
        ValueError: If a metric value cannot be converted to ``float``.
        TypeError: If a metric value is of a type ``float`` does not accept.
    """
    lines = [
        "# Benchmark Results",
        "",
        "| Task | Task Score | Total Reward | Steps | Model |",
        "|---|---:|---:|---:|---|",
    ]

    scores: List[float] = []
    rewards: List[float] = []
    for row in results:
        score = _as_float(row.get("task_score", 0))
        reward = _as_float(row.get("total_reward", 0))
        scores.append(score)
        rewards.append(reward)
        lines.append(
            f"| {_cell(row.get('task_id'))} | {score:.3f} | "
            f"{reward:.3f} | {_cell(row.get('steps', 0))} | {_cell(row.get('model', 'unknown'))} |"
        )

    # Guard against division by zero when there are no results.
    count = max(1, len(results))
    avg_score = sum(scores) / count
    avg_reward = sum(rewards) / count

    lines.extend(
        [
            "",
            f"Average task score: **{avg_score:.3f}**",
            f"Average total reward: **{avg_reward:.3f}**",
            "",
            "Note: task_score is normalized to [0, 1]; total_reward is cumulative and may exceed 1.0.",
        ]
    )

    return "\n".join(lines) + "\n"


def main() -> int:
    """Parse command-line options, run every requested task, and write the table.

    Options:
        --tasks: One or more task IDs (defaults to ``DEFAULT_TASKS``).
        --max-steps: Integer forwarded to each task run (default 10).
        --output-dir: Directory for per-task JSON files (default
            ``REPO_ROOT / "outputs"``).
        --table: Path of the markdown table to write (default
            ``REPO_ROOT / "outputs/benchmark_table.md"``).

    Returns:
        0 once the table has been written. Any exception raised while running
        a task propagates to the caller.
    """
    cli = argparse.ArgumentParser(description="Run benchmark tasks and generate a markdown table")
    cli.add_argument(
        "--tasks",
        nargs="+",
        default=DEFAULT_TASKS,
        help="Task IDs to evaluate (default: 3 core tasks)",
    )
    cli.add_argument("--max-steps", type=int, default=10)
    cli.add_argument("--output-dir", type=Path, default=REPO_ROOT / "outputs")
    cli.add_argument("--table", type=Path, default=REPO_ROOT / "outputs/benchmark_table.md")
    opts = cli.parse_args()

    # Make sure both destinations exist before any task is started.
    for directory in (opts.output_dir, opts.table.parent):
        directory.mkdir(parents=True, exist_ok=True)

    collected: List[Dict[str, Any]] = []
    for name in opts.tasks:
        print(f"Running task: {name}")
        collected.append(run_task(name, opts.max_steps, opts.output_dir))

    opts.table.write_text(to_markdown(collected), encoding="utf-8")

    print(f"Wrote benchmark table to {opts.table}")
    return 0


# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)