Spaces:

Spirit-26
/

code-review-environment

Sleeping

App Files Files Community

code-review-environment / scripts /run_benchmark.py

ashishbaberwal

hf new add

7a23e48 about 2 months ago

raw

history blame contribute delete

3.19 kB

	#!/usr/bin/env python3
	from __future__ import annotations

	import argparse
	import json
	import subprocess
	import sys
	from pathlib import Path
	from typing import Any, Dict, List

	# Repository root: the grandparent of this file, i.e. the directory that
	# contains the folder this script lives in.
	REPO_ROOT = Path(__file__).resolve().parents[1]

	# Script that is launched once per benchmark task.
	INFERENCE_PATH = REPO_ROOT.joinpath("inference.py")

	# Task IDs evaluated when --tasks is not supplied on the command line.
	DEFAULT_TASKS = [
	    "bug_detection_easy_1",
	    "memory_leak_medium_1",
	    "security_hard_1",
	]


	def run_task(task_id: str, max_steps: int, output_dir: Path) -> Dict[str, Any]:
	    """Run one benchmark task through the inference script and return its result.

	    The task is executed in a child process using the current interpreter,
	    with the repository root as working directory. The child is told, via
	    ``--output``, to write its result to
	    ``<output_dir>/benchmark_<task_id>.json``; that file is then parsed as
	    JSON and returned.

	    Args:
	        task_id: Identifier forwarded as ``--task-id``; also part of the
	            result file name.
	        max_steps: Value forwarded as ``--max-steps``.
	        output_dir: Directory that receives the result file. It is not
	            created here.

	    Returns:
	        The decoded JSON content of the result file.

	    Raises:
	        RuntimeError: If the child process exits with a non-zero status. The
	            message includes the captured stdout and stderr.
	        FileNotFoundError: If the child exits with status 0 but the result
	            file does not exist afterwards.
	    """
	    # The child runs with cwd=REPO_ROOT, so a relative path would be
	    # interpreted differently by parent and child; pin it to an absolute one.
	    output_path = output_dir.resolve() / f"benchmark_{task_id}.json"

	    # Remove any result left over from an earlier run. Otherwise a child that
	    # exits 0 without writing its output would silently yield stale data.
	    try:
	        output_path.unlink()
	    except FileNotFoundError:
	        pass  # Nothing to clean up.

	    cmd = [
	        sys.executable,
	        str(INFERENCE_PATH),
	        "--task-id",
	        task_id,
	        "--max-steps",
	        str(max_steps),
	        "--output",
	        str(output_path),
	    ]

	    completed = subprocess.run(cmd, capture_output=True, text=True, cwd=str(REPO_ROOT))
	    if completed.returncode != 0:
	        raise RuntimeError(
	            f"Task {task_id} failed with exit code {completed.returncode}\n"
	            f"stdout:\n{completed.stdout}\n\n"
	            f"stderr:\n{completed.stderr}"
	        )

	    if not output_path.is_file():
	        # Same exception type a bare open() would raise, but with the child's
	        # output attached so the cause can be diagnosed.
	        raise FileNotFoundError(
	            f"Task {task_id} exited successfully but did not write {output_path}\n"
	            f"stdout:\n{completed.stdout}\n\n"
	            f"stderr:\n{completed.stderr}"
	        )

	    with output_path.open("r", encoding="utf-8") as fh:
	        return json.load(fh)


	def _metric(row: Dict[str, Any], key: str) -> float:
	    """Return ``row[key]`` as a float; a missing or ``None`` value counts as 0.0."""
	    value = row.get(key)
	    return 0.0 if value is None else float(value)


	def _cell(value: object) -> str:
	    """Render *value* as table-cell text, escaping ``|`` so it cannot split the cell."""
	    return str(value).replace("|", "\\|")


	def to_markdown(results: List[Dict[str, Any]]) -> str:
	    """Render benchmark results as a Markdown report.

	    The report consists of a heading, one table row per result, the average
	    task score and total reward, and a closing note on how to read the two
	    metrics.

	    Args:
	        results: One mapping per task. The keys read are ``task_id``,
	            ``task_score``, ``total_reward``, ``steps`` and ``model``.
	            Missing keys fall back to ``None``, 0, 0, 0 and ``"unknown"``
	            respectively.

	    Returns:
	        The report text, terminated by a newline. With no results the table
	        has no rows and both averages are reported as 0.000.

	    Raises:
	        ValueError, TypeError: If a score or reward cannot be converted to
	            a float.
	    """
	    lines = [
	        "# Benchmark Results",
	        "",
	        # Plain pipes: a backslash-escaped pipe is cell content in Markdown,
	        # not a column delimiter.
	        "| Task | Task Score | Total Reward | Steps | Model |",
	        "|---|---:|---:|---:|---|",
	    ]

	    # Coerce once so the rows and the averages are computed from the same values.
	    scores = [_metric(row, "task_score") for row in results]
	    rewards = [_metric(row, "total_reward") for row in results]

	    for row, score, reward in zip(results, scores, rewards):
	        lines.append(
	            f"| {_cell(row.get('task_id'))} | {score:.3f} | "
	            f"{reward:.3f} | {_cell(row.get('steps', 0))} | {_cell(row.get('model', 'unknown'))} |"
	        )

	    # max(1, ...) keeps the division defined when there are no results.
	    count = max(1, len(results))
	    avg_score = sum(scores) / count
	    avg_reward = sum(rewards) / count

	    lines.extend(
	        [
	            "",
	            f"Average task score: {avg_score:.3f}",
	            f"Average total reward: {avg_reward:.3f}",
	            "",
	            "Note: task_score is normalized to [0, 1]; total_reward is cumulative and may exceed 1.0.",
	        ]
	    )

	    return "\n".join(lines) + "\n"


	def main() -> int:
	    """Run the selected benchmark tasks and write a Markdown summary table.

	    Command-line options:
	        --tasks        One or more task IDs (default: ``DEFAULT_TASKS``).
	        --max-steps    Integer passed through to each task run (default: 10).
	        --output-dir   Directory for per-task result files
	                       (default: ``<repo>/outputs``).
	        --table        Path of the Markdown table to write
	                       (default: ``<repo>/outputs/benchmark_table.md``).

	    Tasks run one after another in the order given. Exceptions raised while
	    running a task are not caught here.

	    Returns:
	        0 once the table has been written.
	    """
	    cli = argparse.ArgumentParser(description="Run benchmark tasks and generate a markdown table")
	    cli.add_argument(
	        "--tasks",
	        nargs="+",
	        default=DEFAULT_TASKS,
	        help="Task IDs to evaluate (default: 3 core tasks)",
	    )
	    cli.add_argument("--max-steps", type=int, default=10)
	    cli.add_argument("--output-dir", type=Path, default=REPO_ROOT / "outputs")
	    cli.add_argument("--table", type=Path, default=REPO_ROOT / "outputs/benchmark_table.md")
	    options = cli.parse_args()

	    output_dir: Path = options.output_dir
	    table_path: Path = options.table

	    # Both destinations must exist before anything is written to them.
	    for directory in (output_dir, table_path.parent):
	        directory.mkdir(parents=True, exist_ok=True)

	    collected: List[Dict[str, Any]] = []
	    for name in options.tasks:
	        print(f"Running task: {name}")
	        collected.append(run_task(name, options.max_steps, output_dir))

	    table_path.write_text(to_markdown(collected), encoding="utf-8")
	    print(f"Wrote benchmark table to {table_path}")
	    return 0


	# Script entry point: use main()'s return value as the process exit status.
	if __name__ == "__main__":
	    sys.exit(main())