""" eval_harness.py — Enhanced Evaluation Framework ================================================ Supports custom benchmarks, WebArena-style tasks, GAIA-style tasks, A/B testing, and LLM-as-a-judge grading. """ import os import json import time import random from concurrent.futures import ThreadPoolExecutor from typing import Any, Dict, List, Optional, Callable from dataclasses import dataclass, field, asdict # --------------------------------------------------------------------------- # Benchmark Tasks # --------------------------------------------------------------------------- @dataclass class BenchmarkTask: id: str category: str description: str expected_answer: Optional[str] = None expected_contains: Optional[List[str]] = None max_steps: int = 50 setup_script: Optional[str] = None # Shell commands to prep the sandbox teardown_script: Optional[str] = None weight: float = 1.0 DEFAULT_BENCHMARKS: List[BenchmarkTask] = [ # Web navigation BenchmarkTask( id="puppies", category="web_search", description="Find me pictures of cute puppies", expected_contains=["puppy", "dog", "image"], max_steps=30, ), BenchmarkTask( id="gmaps_hf_hq", category="web_navigation", description="Use Google Maps to find the Hugging Face HQ in Paris", expected_contains=["Paris", "Hugging Face", "5/7"], max_steps=40, ), BenchmarkTask( id="wikipedia_april4", category="web_research", description="Go to Wikipedia and find what happened on April 4th", expected_contains=["April", "4"], max_steps=30, ), BenchmarkTask( id="commute_bern_basel", category="web_navigation", description="Find out the travel time by train from Bern to Basel on Google Maps", expected_contains=["Bern", "Basel", "hour", "min"], max_steps=40, ), BenchmarkTask( id="hf_flux_gpu", category="hf_ecosystem", description="Go to Hugging Face Spaces and find the Space flux.1 schnell. Use it to generate an image of a GPU", expected_contains=["GPU", "image"], max_steps=60, ), BenchmarkTask( id="github_trending", category="web_research", description="Go to GitHub trending and find the top Python repository today", expected_contains=["Python", "github.com"], max_steps=35, ), BenchmarkTask( id="pdf_extract", category="document", description="Download a sample PDF from the internet and extract the first paragraph", expected_contains=["PDF", "paragraph"], max_steps=40, ), BenchmarkTask( id="calc_sum", category="code_execution", description="Calculate the sum of the first 100 prime numbers using Python", expected_answer="24133", max_steps=20, ), BenchmarkTask( id="dark_mode_maps", category="web_navigation", description="Open Google Maps and switch to dark mode if available", expected_contains=["dark", "theme"], max_steps=30, ), BenchmarkTask( id="hf_model_search", category="hf_ecosystem", description="Search Hugging Face Hub for 'text-to-video' models and list the top 3 by downloads", expected_contains=["text-to-video", "model"], max_steps=35, ), ] # --------------------------------------------------------------------------- # LLM-as-a-Judge # --------------------------------------------------------------------------- class LLMJudge: """Grades agent outputs using a language model.""" def __init__(self, model_call: Callable[[List[Dict[str, Any]]], str]): self.model_call = model_call def grade_exact(self, predicted: str, expected: str) -> float: return 1.0 if expected.lower().strip() in predicted.lower().strip() else 0.0 def grade_contains(self, predicted: str, expected_list: List[str]) -> float: if not expected_list: return 1.0 matched = sum(1 for e in expected_list if e.lower() in predicted.lower()) return matched / len(expected_list) def grade_semantic( self, task_description: str, agent_trace: str, predicted: str, expected: Optional[str] = None, expected_contains: Optional[List[str]] = None, ) -> Dict[str, Any]: """Use an LLM to judge success on a 0-1 scale.""" prompt = f"""You are an expert evaluator. A computer agent was given this task: Task: {task_description} The agent's final response / trace summary: {predicted[:2000]} Expected answer (if any): {expected or 'N/A'} Expected keywords (if any): {expected_contains or 'N/A'} Rate the agent's success on a scale from 0.0 to 1.0, where: - 1.0 = fully completed and correct - 0.5 = partially correct or incomplete - 0.0 = completely wrong or failed Respond ONLY with a JSON object: {{"score": float, "reason": "short explanation", "missing": "what was missing"}} """ response = self.model_call([{"role": "user", "content": prompt}]) content = response.strip() if content.startswith("```"): content = content.split("```", 2)[-1] if content.startswith("json"): content = content[4:] content = content.strip() try: result = json.loads(content) return { "score": float(result.get("score", 0.0)), "reason": result.get("reason", ""), "missing": result.get("missing", ""), } except (json.JSONDecodeError, ValueError): # Fallback heuristic score = 0.5 if "success" in predicted.lower() or "done" in predicted.lower() else 0.0 return {"score": score, "reason": "LLM judge parsing failed, heuristic fallback", "missing": ""} # --------------------------------------------------------------------------- # Evaluation Harness # --------------------------------------------------------------------------- @dataclass class TaskResult: task_id: str success: bool score: float duration_sec: float steps_taken: int final_output: str error: Optional[str] = None judge_reason: Optional[str] = None @dataclass class EvalSummary: total_tasks: int passed: int failed: int avg_score: float avg_duration: float by_category: Dict[str, Dict[str, Any]] results: List[TaskResult] timestamp: float = field(default_factory=time.time) class EvaluationHarness: """Run benchmarks against the agent and produce reports.""" def __init__( self, agent_factory: Callable[[], Any], judge_model_call: Optional[Callable] = None, output_dir: str = "./eval_results", ): self.agent_factory = agent_factory self.judge = LLMJudge(judge_model_call) if judge_model_call else None self.output_dir = output_dir os.makedirs(output_dir, exist_ok=True) def run_task( self, task: BenchmarkTask, num_runs: int = 1, ) -> List[TaskResult]: results = [] for run_idx in range(num_runs): start = time.time() agent = self.agent_factory() try: # Run the agent output = agent.run(task.description, max_steps=task.max_steps) duration = time.time() - start # Grade if self.judge: judge_result = self.judge.grade_semantic( task.description, str(output), str(output), task.expected_answer, task.expected_contains, ) score = judge_result["score"] reason = judge_result["reason"] else: if task.expected_answer: score = self.judge.grade_exact(str(output), task.expected_answer) if self.judge else 0.0 elif task.expected_contains: score = self.judge.grade_contains(str(output), task.expected_contains) if self.judge else 0.0 else: score = 0.5 reason = "Heuristic grading (no LLM judge)" success = score >= 0.7 results.append(TaskResult( task_id=f"{task.id}_run{run_idx}", success=success, score=score, duration_sec=round(duration, 2), steps_taken=getattr(agent, "step_number", 0), final_output=str(output)[:2000], error=None, judge_reason=reason, )) except Exception as e: duration = time.time() - start results.append(TaskResult( task_id=f"{task.id}_run{run_idx}", success=False, score=0.0, duration_sec=round(duration, 2), steps_taken=0, final_output="", error=str(e), judge_reason="Exception during execution", )) return results def run_suite( self, tasks: Optional[List[BenchmarkTask]] = None, num_runs: int = 1, max_parallel: int = 2, ) -> EvalSummary: tasks = tasks or DEFAULT_BENCHMARKS all_results: List[TaskResult] = [] def run_single(task): return self.run_task(task, num_runs=num_runs) with ThreadPoolExecutor(max_workers=max_parallel) as executor: futures = [executor.submit(run_single, t) for t in tasks] for future in futures: all_results.extend(future.result()) # Aggregate passed = sum(1 for r in all_results if r.success) total = len(all_results) avg_score = sum(r.score for r in all_results) / max(total, 1) avg_duration = sum(r.duration_sec for r in all_results) / max(total, 1) by_category: Dict[str, Any] = {} for r in all_results: # Map back to category from task_id prefix cat = "unknown" for t in tasks: if r.task_id.startswith(t.id): cat = t.category break by_category.setdefault(cat, {"count": 0, "passed": 0, "avg_score": 0.0, "scores": []}) by_category[cat]["count"] += 1 if r.success: by_category[cat]["passed"] += 1 by_category[cat]["scores"].append(r.score) for cat, data in by_category.items(): data["avg_score"] = round(sum(data["scores"]) / max(len(data["scores"]), 1), 3) del data["scores"] summary = EvalSummary( total_tasks=total, passed=passed, failed=total - passed, avg_score=round(avg_score, 3), avg_duration=round(avg_duration, 2), by_category=by_category, results=all_results, ) # Save ts = int(time.time()) path = os.path.join(self.output_dir, f"eval_summary_{ts}.json") with open(path, "w") as f: json.dump(asdict(summary), f, indent=2, default=str) print(f"Evaluation saved to {path}") return summary def compare_strategies( self, strategy_a_factory: Callable[[], Any], strategy_b_factory: Callable[[], Any], tasks: Optional[List[BenchmarkTask]] = None, num_runs: int = 3, ) -> Dict[str, Any]: """A/B test two agent configurations.""" print("Running Strategy A...") old_factory = self.agent_factory self.agent_factory = strategy_a_factory results_a = self.run_suite(tasks, num_runs=num_runs, max_parallel=1) print("Running Strategy B...") self.agent_factory = strategy_b_factory results_b = self.run_suite(tasks, num_runs=num_runs, max_parallel=1) self.agent_factory = old_factory return { "strategy_a": { "avg_score": results_a.avg_score, "pass_rate": results_a.passed / max(results_a.total_tasks, 1), "avg_duration": results_a.avg_duration, }, "strategy_b": { "avg_score": results_b.avg_score, "pass_rate": results_b.passed / max(results_b.total_tasks, 1), "avg_duration": results_b.avg_duration, }, "winner": "A" if results_a.avg_score > results_b.avg_score else "B", }