"""
eval_harness.py — Enhanced Evaluation Framework
================================================
Supports custom benchmarks, WebArena-style tasks, GAIA-style tasks,
A/B testing, and LLM-as-a-judge grading.
"""
import os
import json
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional, Callable
from dataclasses import dataclass, field, asdict
# ---------------------------------------------------------------------------
# Benchmark Tasks
# ---------------------------------------------------------------------------
@dataclass
class BenchmarkTask:
id: str
category: str
description: str
expected_answer: Optional[str] = None
expected_contains: Optional[List[str]] = None
max_steps: int = 50
    setup_script: Optional[str] = None  # Shell commands to prep the sandbox (not executed by the harness itself)
    teardown_script: Optional[str] = None
weight: float = 1.0
DEFAULT_BENCHMARKS: List[BenchmarkTask] = [
    # Web, document, and code-execution tasks
BenchmarkTask(
id="puppies",
category="web_search",
description="Find me pictures of cute puppies",
expected_contains=["puppy", "dog", "image"],
max_steps=30,
),
BenchmarkTask(
id="gmaps_hf_hq",
category="web_navigation",
description="Use Google Maps to find the Hugging Face HQ in Paris",
expected_contains=["Paris", "Hugging Face", "5/7"],
max_steps=40,
),
BenchmarkTask(
id="wikipedia_april4",
category="web_research",
description="Go to Wikipedia and find what happened on April 4th",
expected_contains=["April", "4"],
max_steps=30,
),
BenchmarkTask(
id="commute_bern_basel",
category="web_navigation",
description="Find out the travel time by train from Bern to Basel on Google Maps",
expected_contains=["Bern", "Basel", "hour", "min"],
max_steps=40,
),
BenchmarkTask(
id="hf_flux_gpu",
category="hf_ecosystem",
description="Go to Hugging Face Spaces and find the Space flux.1 schnell. Use it to generate an image of a GPU",
expected_contains=["GPU", "image"],
max_steps=60,
),
BenchmarkTask(
id="github_trending",
category="web_research",
description="Go to GitHub trending and find the top Python repository today",
expected_contains=["Python", "github.com"],
max_steps=35,
),
BenchmarkTask(
id="pdf_extract",
category="document",
description="Download a sample PDF from the internet and extract the first paragraph",
expected_contains=["PDF", "paragraph"],
max_steps=40,
),
BenchmarkTask(
id="calc_sum",
category="code_execution",
description="Calculate the sum of the first 100 prime numbers using Python",
expected_answer="24133",
max_steps=20,
),
BenchmarkTask(
id="dark_mode_maps",
category="web_navigation",
description="Open Google Maps and switch to dark mode if available",
expected_contains=["dark", "theme"],
max_steps=30,
),
BenchmarkTask(
id="hf_model_search",
category="hf_ecosystem",
description="Search Hugging Face Hub for 'text-to-video' models and list the top 3 by downloads",
expected_contains=["text-to-video", "model"],
max_steps=35,
),
]
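# A minimal sketch of how a caller might extend the suite: tasks are plain
# BenchmarkTask instances, so a custom list can be built and passed to
# EvaluationHarness.run_suite(tasks=...). The task below is illustrative
# only and is not part of the default suite.
EXAMPLE_CUSTOM_BENCHMARKS: List[BenchmarkTask] = DEFAULT_BENCHMARKS + [
    BenchmarkTask(
        id="weather_paris",
        category="web_search",
        description="Find today's weather forecast for Paris",
        expected_contains=["Paris", "weather"],
        max_steps=25,
    ),
]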
# ---------------------------------------------------------------------------
# LLM-as-a-Judge
# ---------------------------------------------------------------------------
class LLMJudge:
"""Grades agent outputs using a language model."""
def __init__(self, model_call: Callable[[List[Dict[str, Any]]], str]):
self.model_call = model_call
    @staticmethod
    def grade_exact(predicted: str, expected: str) -> float:
        """Case-insensitive containment check against a single expected answer."""
        return 1.0 if expected.lower().strip() in predicted.lower().strip() else 0.0

    @staticmethod
    def grade_contains(predicted: str, expected_list: List[str]) -> float:
        """Fraction of the expected keywords found in the prediction."""
        if not expected_list:
            return 1.0
        matched = sum(1 for e in expected_list if e.lower() in predicted.lower())
        return matched / len(expected_list)
    def grade_semantic(
        self,
        task_description: str,
        predicted: str,
        expected: Optional[str] = None,
        expected_contains: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
"""Use an LLM to judge success on a 0-1 scale."""
prompt = f"""You are an expert evaluator. A computer agent was given this task:
Task: {task_description}
The agent's final response / trace summary:
{predicted[:2000]}
Expected answer (if any): {expected or 'N/A'}
Expected keywords (if any): {expected_contains or 'N/A'}
Rate the agent's success on a scale from 0.0 to 1.0, where:
- 1.0 = fully completed and correct
- 0.5 = partially correct or incomplete
- 0.0 = completely wrong or failed
Respond ONLY with a JSON object:
{{"score": float, "reason": "short explanation", "missing": "what was missing"}}
"""
response = self.model_call([{"role": "user", "content": prompt}])
content = response.strip()
        if content.startswith("```"):
            # Strip a surrounding markdown fence (``` or ```json): take the
            # text between the first pair of fences, not the (usually empty)
            # text after the closing fence.
            parts = content.split("```")
            content = parts[1] if len(parts) > 1 else content
            if content.startswith("json"):
                content = content[4:]
            content = content.strip()
try:
result = json.loads(content)
return {
"score": float(result.get("score", 0.0)),
"reason": result.get("reason", ""),
"missing": result.get("missing", ""),
}
except (json.JSONDecodeError, ValueError):
# Fallback heuristic
score = 0.5 if "success" in predicted.lower() or "done" in predicted.lower() else 0.0
return {"score": score, "reason": "LLM judge parsing failed, heuristic fallback", "missing": ""}
# ---------------------------------------------------------------------------
# Evaluation Harness
# ---------------------------------------------------------------------------
@dataclass
class TaskResult:
task_id: str
success: bool
score: float
duration_sec: float
steps_taken: int
final_output: str
error: Optional[str] = None
judge_reason: Optional[str] = None
@dataclass
class EvalSummary:
total_tasks: int
passed: int
failed: int
avg_score: float
avg_duration: float
by_category: Dict[str, Dict[str, Any]]
results: List[TaskResult]
timestamp: float = field(default_factory=time.time)
class EvaluationHarness:
"""Run benchmarks against the agent and produce reports."""
def __init__(
self,
agent_factory: Callable[[], Any],
judge_model_call: Optional[Callable] = None,
output_dir: str = "./eval_results",
):
self.agent_factory = agent_factory
self.judge = LLMJudge(judge_model_call) if judge_model_call else None
self.output_dir = output_dir
os.makedirs(output_dir, exist_ok=True)
def run_task(
self,
task: BenchmarkTask,
num_runs: int = 1,
) -> List[TaskResult]:
results = []
for run_idx in range(num_runs):
start = time.time()
agent = self.agent_factory()
try:
# Run the agent
output = agent.run(task.description, max_steps=task.max_steps)
duration = time.time() - start
# Grade
if self.judge:
judge_result = self.judge.grade_semantic(
task.description,
                        str(output),
task.expected_answer,
task.expected_contains,
)
score = judge_result["score"]
reason = judge_result["reason"]
                else:
                    # No LLM judge configured: fall back to the pure string
                    # heuristics, which need no model call.
                    if task.expected_answer:
                        score = LLMJudge.grade_exact(str(output), task.expected_answer)
                    elif task.expected_contains:
                        score = LLMJudge.grade_contains(str(output), task.expected_contains)
                    else:
                        score = 0.5
                    reason = "Heuristic grading (no LLM judge)"
success = score >= 0.7
results.append(TaskResult(
task_id=f"{task.id}_run{run_idx}",
success=success,
score=score,
duration_sec=round(duration, 2),
steps_taken=getattr(agent, "step_number", 0),
final_output=str(output)[:2000],
error=None,
judge_reason=reason,
))
except Exception as e:
duration = time.time() - start
results.append(TaskResult(
task_id=f"{task.id}_run{run_idx}",
success=False,
score=0.0,
duration_sec=round(duration, 2),
steps_taken=0,
final_output="",
error=str(e),
judge_reason="Exception during execution",
))
return results
def run_suite(
self,
tasks: Optional[List[BenchmarkTask]] = None,
num_runs: int = 1,
max_parallel: int = 2,
) -> EvalSummary:
tasks = tasks or DEFAULT_BENCHMARKS
all_results: List[TaskResult] = []
def run_single(task):
return self.run_task(task, num_runs=num_runs)
with ThreadPoolExecutor(max_workers=max_parallel) as executor:
futures = [executor.submit(run_single, t) for t in tasks]
for future in futures:
all_results.extend(future.result())
# Aggregate
passed = sum(1 for r in all_results if r.success)
total = len(all_results)
avg_score = sum(r.score for r in all_results) / max(total, 1)
avg_duration = sum(r.duration_sec for r in all_results) / max(total, 1)
        by_category: Dict[str, Dict[str, Any]] = {}
        for r in all_results:
            # Recover the category by stripping the "_run{n}" suffix and
            # matching the task id exactly; a bare prefix match could
            # collide between tasks whose ids share a prefix.
            base_id = r.task_id.rsplit("_run", 1)[0]
            cat = next((t.category for t in tasks if t.id == base_id), "unknown")
by_category.setdefault(cat, {"count": 0, "passed": 0, "avg_score": 0.0, "scores": []})
by_category[cat]["count"] += 1
if r.success:
by_category[cat]["passed"] += 1
by_category[cat]["scores"].append(r.score)
for cat, data in by_category.items():
data["avg_score"] = round(sum(data["scores"]) / max(len(data["scores"]), 1), 3)
del data["scores"]
summary = EvalSummary(
total_tasks=total,
passed=passed,
failed=total - passed,
avg_score=round(avg_score, 3),
avg_duration=round(avg_duration, 2),
by_category=by_category,
results=all_results,
)
# Save
ts = int(time.time())
path = os.path.join(self.output_dir, f"eval_summary_{ts}.json")
with open(path, "w") as f:
json.dump(asdict(summary), f, indent=2, default=str)
print(f"Evaluation saved to {path}")
return summary
def compare_strategies(
self,
strategy_a_factory: Callable[[], Any],
strategy_b_factory: Callable[[], Any],
tasks: Optional[List[BenchmarkTask]] = None,
num_runs: int = 3,
) -> Dict[str, Any]:
"""A/B test two agent configurations."""
print("Running Strategy A...")
old_factory = self.agent_factory
self.agent_factory = strategy_a_factory
results_a = self.run_suite(tasks, num_runs=num_runs, max_parallel=1)
print("Running Strategy B...")
self.agent_factory = strategy_b_factory
results_b = self.run_suite(tasks, num_runs=num_runs, max_parallel=1)
self.agent_factory = old_factory
return {
"strategy_a": {
"avg_score": results_a.avg_score,
"pass_rate": results_a.passed / max(results_a.total_tasks, 1),
"avg_duration": results_a.avg_duration,
},
"strategy_b": {
"avg_score": results_b.avg_score,
"pass_rate": results_b.passed / max(results_b.total_tasks, 1),
"avg_duration": results_b.avg_duration,
},
"winner": "A" if results_a.avg_score > results_b.avg_score else "B",
}
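# ---------------------------------------------------------------------------
# Smoke test (illustrative)
# ---------------------------------------------------------------------------
# A hedged end-to-end sketch: DummyAgent below is a stand-in for the real
# computer agent, which is assumed to expose .run(description, max_steps=...)
# and, optionally, a .step_number attribute. Running it through the harness
# with the stub judge above exercises grading and reporting without a sandbox.
if __name__ == "__main__":
    class DummyAgent:
        step_number = 1

        def run(self, description: str, max_steps: int = 50) -> str:
            # Echo the task so keyword-based grading has text to match.
            return f"Done: {description}"

    harness = EvaluationHarness(
        agent_factory=DummyAgent,
        judge_model_call=stub_judge_call,
    )
    summary = harness.run_suite(tasks=DEFAULT_BENCHMARKS[:2], num_runs=1)
    print(f"passed {summary.passed}/{summary.total_tasks}, avg_score={summary.avg_score}")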