| """ |
| eval_harness.py — Enhanced Evaluation Framework |
| ================================================ |
| Supports custom benchmarks, WebArena-style tasks, GAIA-style tasks, |
| A/B testing, and LLM-as-a-judge grading. |
| """ |
|
|
| import os |
| import json |
| import time |
| import random |
| from concurrent.futures import ThreadPoolExecutor |
| from typing import Any, Dict, List, Optional, Callable |
| from dataclasses import dataclass, field, asdict |
|
|
|
|
| |
| |
| |
|
|
@dataclass
class BenchmarkTask:
    id: str
    category: str
    description: str
    expected_answer: Optional[str] = None
    expected_contains: Optional[List[str]] = None
    max_steps: int = 50
    setup_script: Optional[str] = None
    teardown_script: Optional[str] = None
    weight: float = 1.0

DEFAULT_BENCHMARKS: List[BenchmarkTask] = [
    BenchmarkTask(
        id="puppies",
        category="web_search",
        description="Find me pictures of cute puppies",
        expected_contains=["puppy", "dog", "image"],
        max_steps=30,
    ),
    BenchmarkTask(
        id="gmaps_hf_hq",
        category="web_navigation",
        description="Use Google Maps to find the Hugging Face HQ in Paris",
        expected_contains=["Paris", "Hugging Face", "5/7"],
        max_steps=40,
    ),
    BenchmarkTask(
        id="wikipedia_april4",
        category="web_research",
        description="Go to Wikipedia and find what happened on April 4th",
        expected_contains=["April", "4"],
        max_steps=30,
    ),
    BenchmarkTask(
        id="commute_bern_basel",
        category="web_navigation",
        description="Find out the travel time by train from Bern to Basel on Google Maps",
        expected_contains=["Bern", "Basel", "hour", "min"],
        max_steps=40,
    ),
    BenchmarkTask(
        id="hf_flux_gpu",
        category="hf_ecosystem",
        description="Go to Hugging Face Spaces and find the Space flux.1 schnell. Use it to generate an image of a GPU",
        expected_contains=["GPU", "image"],
        max_steps=60,
    ),
    BenchmarkTask(
        id="github_trending",
        category="web_research",
        description="Go to GitHub trending and find the top Python repository today",
        expected_contains=["Python", "github.com"],
        max_steps=35,
    ),
    BenchmarkTask(
        id="pdf_extract",
        category="document",
        description="Download a sample PDF from the internet and extract the first paragraph",
        expected_contains=["PDF", "paragraph"],
        max_steps=40,
    ),
    BenchmarkTask(
        id="calc_sum",
        category="code_execution",
        description="Calculate the sum of the first 100 prime numbers using Python",
        expected_answer="24133",
        max_steps=20,
    ),
    BenchmarkTask(
        id="dark_mode_maps",
        category="web_navigation",
        description="Open Google Maps and switch to dark mode if available",
        expected_contains=["dark", "theme"],
        max_steps=30,
    ),
    BenchmarkTask(
        id="hf_model_search",
        category="hf_ecosystem",
        description="Search Hugging Face Hub for 'text-to-video' models and list the top 3 by downloads",
        expected_contains=["text-to-video", "model"],
        max_steps=35,
    ),
]
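

# The default suite can be extended with project-specific BenchmarkTask entries.
# A minimal sketch (the task below is hypothetical and purely illustrative, not
# part of the shipped benchmark set):
EXAMPLE_CUSTOM_BENCHMARKS: List[BenchmarkTask] = DEFAULT_BENCHMARKS + [
    BenchmarkTask(
        id="arxiv_cs_cl_latest",
        category="web_research",
        description="Find the most recent paper listed under cs.CL on arXiv and report its title",
        expected_contains=["arXiv", "cs.CL"],
        max_steps=40,
    ),
]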


class LLMJudge:
    """Grades agent outputs using a language model."""

    def __init__(self, model_call: Callable[[List[Dict[str, Any]]], str]):
        self.model_call = model_call

    def grade_exact(self, predicted: str, expected: str) -> float:
        """Binary check: does the prediction contain the expected answer?"""
        return 1.0 if expected.lower().strip() in predicted.lower().strip() else 0.0

    def grade_contains(self, predicted: str, expected_list: List[str]) -> float:
        """Fraction of expected keywords found in the prediction."""
        if not expected_list:
            return 1.0
        matched = sum(1 for e in expected_list if e.lower() in predicted.lower())
        return matched / len(expected_list)

    def grade_semantic(
        self,
        task_description: str,
        agent_trace: str,
        predicted: str,
        expected: Optional[str] = None,
        expected_contains: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Use an LLM to judge success on a 0-1 scale."""
        prompt = f"""You are an expert evaluator. A computer agent was given this task:

Task: {task_description}

The agent's final response / trace summary:
{predicted[:2000]}

Expected answer (if any): {expected or 'N/A'}
Expected keywords (if any): {expected_contains or 'N/A'}

Rate the agent's success on a scale from 0.0 to 1.0, where:
- 1.0 = fully completed and correct
- 0.5 = partially correct or incomplete
- 0.0 = completely wrong or failed

Respond ONLY with a JSON object:
{{"score": float, "reason": "short explanation", "missing": "what was missing"}}
"""
        response = self.model_call([{"role": "user", "content": prompt}])
        content = response.strip()
        # Strip a Markdown code fence (``` or ```json) if the model wrapped its JSON in one.
        if content.startswith("```"):
            content = content.strip("`").strip()
            if content.lower().startswith("json"):
                content = content[4:].strip()
        try:
            result = json.loads(content)
            return {
                "score": float(result.get("score", 0.0)),
                "reason": result.get("reason", ""),
                "missing": result.get("missing", ""),
            }
        except (json.JSONDecodeError, ValueError):
            # Fallback heuristic when the judge reply is not valid JSON.
            score = 0.5 if "success" in predicted.lower() or "done" in predicted.lower() else 0.0
            return {"score": score, "reason": "LLM judge parsing failed, heuristic fallback", "missing": ""}


@dataclass
class TaskResult:
    task_id: str
    success: bool
    score: float
    duration_sec: float
    steps_taken: int
    final_output: str
    error: Optional[str] = None
    judge_reason: Optional[str] = None


@dataclass
class EvalSummary:
    total_tasks: int
    passed: int
    failed: int
    avg_score: float
    avg_duration: float
    by_category: Dict[str, Dict[str, Any]]
    results: List[TaskResult]
    timestamp: float = field(default_factory=time.time)

class EvaluationHarness:
    """Run benchmarks against the agent and produce reports."""

    def __init__(
        self,
        agent_factory: Callable[[], Any],
        judge_model_call: Optional[Callable] = None,
        output_dir: str = "./eval_results",
    ):
        self.agent_factory = agent_factory
        self.judge = LLMJudge(judge_model_call) if judge_model_call else None
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def run_task(
        self,
        task: BenchmarkTask,
        num_runs: int = 1,
    ) -> List[TaskResult]:
        results = []
        for run_idx in range(num_runs):
            start = time.time()
            agent = self.agent_factory()
            try:
                # Run the agent on the task description.
                output = agent.run(task.description, max_steps=task.max_steps)
                duration = time.time() - start

                # Grade the output: LLM judge if configured, otherwise string heuristics.
                if self.judge:
                    judge_result = self.judge.grade_semantic(
                        task.description,
                        str(output),
                        str(output),
                        task.expected_answer,
                        task.expected_contains,
                    )
                    score = judge_result["score"]
                    reason = judge_result["reason"]
                else:
                    predicted = str(output).lower()
                    if task.expected_answer:
                        score = 1.0 if task.expected_answer.lower().strip() in predicted else 0.0
                    elif task.expected_contains:
                        matched = sum(1 for kw in task.expected_contains if kw.lower() in predicted)
                        score = matched / len(task.expected_contains)
                    else:
                        score = 0.5
                    reason = "Heuristic grading (no LLM judge)"

                success = score >= 0.7
                results.append(TaskResult(
                    task_id=f"{task.id}_run{run_idx}",
                    success=success,
                    score=score,
                    duration_sec=round(duration, 2),
                    steps_taken=getattr(agent, "step_number", 0),
                    final_output=str(output)[:2000],
                    error=None,
                    judge_reason=reason,
                ))
            except Exception as e:
                duration = time.time() - start
                results.append(TaskResult(
                    task_id=f"{task.id}_run{run_idx}",
                    success=False,
                    score=0.0,
                    duration_sec=round(duration, 2),
                    steps_taken=0,
                    final_output="",
                    error=str(e),
                    judge_reason="Exception during execution",
                ))
        return results

    def run_suite(
        self,
        tasks: Optional[List[BenchmarkTask]] = None,
        num_runs: int = 1,
        max_parallel: int = 2,
    ) -> EvalSummary:
        tasks = tasks or DEFAULT_BENCHMARKS
        all_results: List[TaskResult] = []

        def run_single(task):
            return self.run_task(task, num_runs=num_runs)

        with ThreadPoolExecutor(max_workers=max_parallel) as executor:
            futures = [executor.submit(run_single, t) for t in tasks]
            for future in futures:
                all_results.extend(future.result())

        # Aggregate overall metrics.
        passed = sum(1 for r in all_results if r.success)
        total = len(all_results)
        avg_score = sum(r.score for r in all_results) / max(total, 1)
        avg_duration = sum(r.duration_sec for r in all_results) / max(total, 1)

        by_category: Dict[str, Dict[str, Any]] = {}
        for r in all_results:
            # Map the result back to its task category (result ids are "<task.id>_run<idx>").
            base_id = r.task_id.rsplit("_run", 1)[0]
            cat = "unknown"
            for t in tasks:
                if t.id == base_id:
                    cat = t.category
                    break
            by_category.setdefault(cat, {"count": 0, "passed": 0, "avg_score": 0.0, "scores": []})
            by_category[cat]["count"] += 1
            if r.success:
                by_category[cat]["passed"] += 1
            by_category[cat]["scores"].append(r.score)

        for data in by_category.values():
            data["avg_score"] = round(sum(data["scores"]) / max(len(data["scores"]), 1), 3)
            del data["scores"]

        summary = EvalSummary(
            total_tasks=total,
            passed=passed,
            failed=total - passed,
            avg_score=round(avg_score, 3),
            avg_duration=round(avg_duration, 2),
            by_category=by_category,
            results=all_results,
        )

        # Persist the summary to disk as JSON.
        ts = int(time.time())
        path = os.path.join(self.output_dir, f"eval_summary_{ts}.json")
        with open(path, "w") as f:
            json.dump(asdict(summary), f, indent=2, default=str)
        print(f"Evaluation saved to {path}")
        return summary

    def compare_strategies(
        self,
        strategy_a_factory: Callable[[], Any],
        strategy_b_factory: Callable[[], Any],
        tasks: Optional[List[BenchmarkTask]] = None,
        num_runs: int = 3,
    ) -> Dict[str, Any]:
        """A/B test two agent configurations."""
        print("Running Strategy A...")
        old_factory = self.agent_factory
        self.agent_factory = strategy_a_factory
        results_a = self.run_suite(tasks, num_runs=num_runs, max_parallel=1)

        print("Running Strategy B...")
        self.agent_factory = strategy_b_factory
        results_b = self.run_suite(tasks, num_runs=num_runs, max_parallel=1)

        self.agent_factory = old_factory

        if results_a.avg_score > results_b.avg_score:
            winner = "A"
        elif results_b.avg_score > results_a.avg_score:
            winner = "B"
        else:
            winner = "tie"

        return {
            "strategy_a": {
                "avg_score": results_a.avg_score,
                "pass_rate": results_a.passed / max(results_a.total_tasks, 1),
                "avg_duration": results_a.avg_duration,
            },
            "strategy_b": {
                "avg_score": results_b.avg_score,
                "pass_rate": results_b.passed / max(results_b.total_tasks, 1),
                "avg_duration": results_b.avg_duration,
            },
            "winner": winner,
        }
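

# Minimal end-to-end usage sketch. `DummyAgent` is a stand-in: any factory that
# returns an object with a `run(task, max_steps=...)` method (and optionally a
# `step_number` attribute) can be evaluated by the harness.
if __name__ == "__main__":
    class DummyAgent:
        step_number = 0

        def run(self, task: str, max_steps: int = 50) -> str:
            return f"Pretended to complete the task: {task}"

    harness = EvaluationHarness(agent_factory=DummyAgent)
    summary = harness.run_suite(num_runs=1, max_parallel=2)
    print(f"Passed {summary.passed}/{summary.total_tasks} tasks, avg score {summary.avg_score}")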