Spaces:
Sleeping
Sleeping
| """ | |
| eval_harness.py — Enhanced Evaluation Framework | |
| ================================================ | |
| Supports custom benchmarks, WebArena-style tasks, GAIA-style tasks, | |
| A/B testing, and LLM-as-a-judge grading. | |
| """ | |
| import os | |
| import json | |
| import time | |
| import random | |
| from concurrent.futures import ThreadPoolExecutor | |
| from typing import Any, Dict, List, Optional, Callable | |
| from dataclasses import dataclass, field, asdict | |
| # --------------------------------------------------------------------------- | |
| # Benchmark Tasks | |
| # --------------------------------------------------------------------------- | |
@dataclass
class BenchmarkTask:
    """Description of one benchmark task handed to an agent.

    Declared as a dataclass so that instances can be created with keyword
    arguments (as ``DEFAULT_BENCHMARKS`` does) and compared/printed sensibly.
    """

    # Unique identifier; run results are labelled "<id>_run<N>".
    id: str
    # Free-form grouping label used for the per-category report.
    category: str
    # Natural-language instruction given to the agent.
    description: str
    # Text expected to appear in the agent output (checked first when set).
    expected_answer: Optional[str] = None
    # Keywords expected in the agent output (used when no expected_answer).
    expected_contains: Optional[List[str]] = None
    # Step budget forwarded to the agent's run() call.
    max_steps: int = 50
    # Shell commands to prep the sandbox.
    # NOTE(review): not referenced by any other code visible in this file.
    setup_script: Optional[str] = None
    # NOTE(review): presumably the cleanup counterpart of setup_script; it is
    # not referenced by any other code visible in this file.
    teardown_script: Optional[str] = None
    # NOTE(review): presumably a relative weight for aggregation; the visible
    # aggregation code does not read it.
    weight: float = 1.0
# Built-in task suite; used whenever a caller does not supply its own tasks.
DEFAULT_BENCHMARKS: List[BenchmarkTask] = [
    # --- Web search / navigation / research --------------------------------
    BenchmarkTask(
        id="puppies",
        category="web_search",
        description="Find me pictures of cute puppies",
        expected_contains=["puppy", "dog", "image"],
        max_steps=30,
    ),
    BenchmarkTask(
        id="gmaps_hf_hq",
        category="web_navigation",
        description="Use Google Maps to find the Hugging Face HQ in Paris",
        expected_contains=["Paris", "Hugging Face", "5/7"],
        max_steps=40,
    ),
    BenchmarkTask(
        id="wikipedia_april4",
        category="web_research",
        description="Go to Wikipedia and find what happened on April 4th",
        expected_contains=["April", "4"],
        max_steps=30,
    ),
    BenchmarkTask(
        id="commute_bern_basel",
        category="web_navigation",
        description="Find out the travel time by train from Bern to Basel on Google Maps",
        expected_contains=["Bern", "Basel", "hour", "min"],
        max_steps=40,
    ),
    # --- Hugging Face ecosystem ---------------------------------------------
    BenchmarkTask(
        id="hf_flux_gpu",
        category="hf_ecosystem",
        description="Go to Hugging Face Spaces and find the Space flux.1 schnell. Use it to generate an image of a GPU",
        expected_contains=["GPU", "image"],
        max_steps=60,
    ),
    BenchmarkTask(
        id="github_trending",
        category="web_research",
        description="Go to GitHub trending and find the top Python repository today",
        expected_contains=["Python", "github.com"],
        max_steps=35,
    ),
    # --- Documents and code execution ---------------------------------------
    BenchmarkTask(
        id="pdf_extract",
        category="document",
        description="Download a sample PDF from the internet and extract the first paragraph",
        expected_contains=["PDF", "paragraph"],
        max_steps=40,
    ),
    BenchmarkTask(
        id="calc_sum",
        category="code_execution",
        description="Calculate the sum of the first 100 prime numbers using Python",
        expected_answer="24133",
        max_steps=20,
    ),
    # --- Further navigation / ecosystem tasks -------------------------------
    BenchmarkTask(
        id="dark_mode_maps",
        category="web_navigation",
        description="Open Google Maps and switch to dark mode if available",
        expected_contains=["dark", "theme"],
        max_steps=30,
    ),
    BenchmarkTask(
        id="hf_model_search",
        category="hf_ecosystem",
        description="Search Hugging Face Hub for 'text-to-video' models and list the top 3 by downloads",
        expected_contains=["text-to-video", "model"],
        max_steps=35,
    ),
]
| # --------------------------------------------------------------------------- | |
| # LLM-as-a-Judge | |
| # --------------------------------------------------------------------------- | |
class LLMJudge:
    """Grades agent outputs using a language model.

    ``model_call`` receives a list of chat messages (dicts with ``role`` and
    ``content``) and must return the model's reply as a string. Only
    ``grade_semantic`` invokes it; ``grade_exact`` and ``grade_contains`` are
    plain string checks.
    """

    def __init__(self, model_call: Callable[[List[Dict[str, Any]]], str]):
        self.model_call = model_call

    def grade_exact(self, predicted: str, expected: str) -> float:
        """Return 1.0 if *expected* occurs in *predicted*, otherwise 0.0.

        Despite the name this is a case-insensitive substring test performed
        on whitespace-stripped copies of both strings, not strict equality.
        """
        return 1.0 if expected.lower().strip() in predicted.lower().strip() else 0.0

    def grade_contains(self, predicted: str, expected_list: List[str]) -> float:
        """Return the fraction of *expected_list* found in *predicted*.

        Matching is a case-insensitive substring test. An empty list scores
        1.0 because there is nothing that could be missing.
        """
        if not expected_list:
            return 1.0
        haystack = predicted.lower()  # lower-case once instead of per keyword
        matched = sum(1 for e in expected_list if e.lower() in haystack)
        return matched / len(expected_list)

    @staticmethod
    def _extract_json_text(response: str) -> str:
        """Strip an optional Markdown code fence and ``json`` tag from a reply."""
        content = response.strip()
        if content.startswith("```"):
            # split() yields ['', <fenced body>, <text after closing fence>];
            # the payload is element 1. Taking the last element would return
            # the (usually empty) text after the closing fence.
            content = content.split("```", 2)[1]
        content = content.lstrip()
        if content[:4].lower() == "json":
            content = content[4:]
        return content.strip()

    def grade_semantic(
        self,
        task_description: str,
        agent_trace: str,
        predicted: str,
        expected: Optional[str] = None,
        expected_contains: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Use an LLM to judge success on a 0-1 scale.

        Args:
            task_description: The task text that was given to the agent.
            agent_trace: Accepted for interface compatibility; it is not
                currently included in the prompt.
            predicted: The agent's final output; only the first 2000
                characters are sent to the model.
            expected: Optional reference answer shown to the judge.
            expected_contains: Optional reference keywords shown to the judge.

        Returns:
            A dict with ``score`` (float clamped to [0.0, 1.0]), ``reason``
            and ``missing``. If the reply cannot be interpreted as a JSON
            object with a numeric score, a keyword heuristic on *predicted*
            is used instead and ``reason`` says so.
        """
        prompt = (
            "You are an expert evaluator. A computer agent was given this task:\n"
            f"Task: {task_description}\n"
            "The agent's final response / trace summary:\n"
            f"{predicted[:2000]}\n"
            f"Expected answer (if any): {expected or 'N/A'}\n"
            f"Expected keywords (if any): {expected_contains or 'N/A'}\n"
            "Rate the agent's success on a scale from 0.0 to 1.0, where:\n"
            "- 1.0 = fully completed and correct\n"
            "- 0.5 = partially correct or incomplete\n"
            "- 0.0 = completely wrong or failed\n"
            "Respond ONLY with a JSON object:\n"
            '{"score": float, "reason": "short explanation", "missing": "what was missing"}\n'
        )
        response = self.model_call([{"role": "user", "content": prompt}])
        content = self._extract_json_text(response)
        try:
            result = json.loads(content)
            if not isinstance(result, dict):
                raise ValueError("judge reply is not a JSON object")
            score = float(result.get("score", 0.0))
            return {
                # Keep the score inside the range the prompt asks for.
                "score": min(1.0, max(0.0, score)),
                "reason": result.get("reason", ""),
                "missing": result.get("missing", ""),
            }
        except (json.JSONDecodeError, ValueError, TypeError):
            # Fallback heuristic; TypeError covers a null / non-numeric score.
            lowered = predicted.lower()
            score = 0.5 if "success" in lowered or "done" in lowered else 0.0
            return {"score": score, "reason": "LLM judge parsing failed, heuristic fallback", "missing": ""}
| # --------------------------------------------------------------------------- | |
| # Evaluation Harness | |
| # --------------------------------------------------------------------------- | |
@dataclass
class TaskResult:
    """Outcome of a single run of one benchmark task.

    Declared as a dataclass because the harness builds it with keyword
    arguments and serialises it through ``dataclasses.asdict``.
    """

    # "<task id>_run<index>" as produced by the harness.
    task_id: str
    # True when the score reached the harness pass threshold.
    success: bool
    # Grade assigned to the run (0.0 when the run raised).
    score: float
    # Wall-clock seconds for the run, rounded by the harness.
    duration_sec: float
    # Step count reported by the agent, 0 when unavailable.
    steps_taken: int
    # String form of the agent output (the harness truncates it).
    final_output: str
    # Exception text when the run raised, otherwise None.
    error: Optional[str] = None
    # Explanation supplied by the grader, if any.
    judge_reason: Optional[str] = None
@dataclass
class EvalSummary:
    """Aggregated outcome of a benchmark suite run.

    Declared as a dataclass: ``timestamp`` relies on ``field(default_factory=...)``
    and the harness serialises the summary with ``dataclasses.asdict``.
    """

    # Number of individual runs that were aggregated.
    total_tasks: int
    # Runs marked successful / unsuccessful.
    passed: int
    failed: int
    # Mean score and mean duration (seconds) over all runs.
    avg_score: float
    avg_duration: float
    # Per-category statistics keyed by category name.
    by_category: Dict[str, Dict[str, Any]]
    # Every individual run result, in task order. Quoted so this definition
    # does not depend on where TaskResult is declared.
    results: List["TaskResult"]
    # Creation time as a Unix timestamp; evaluated per instance.
    timestamp: float = field(default_factory=time.time)
class EvaluationHarness:
    """Run benchmarks against the agent and produce reports.

    The harness creates a fresh agent per run through ``agent_factory``,
    grades the output (with an LLM judge when one is configured, otherwise
    with string heuristics), aggregates the results and writes a JSON
    summary into ``output_dir``.
    """

    # Minimum score for a run to count as passed.
    PASS_THRESHOLD = 0.7

    def __init__(
        self,
        agent_factory: Callable[[], Any],
        judge_model_call: Optional[Callable] = None,
        output_dir: str = "./eval_results",
    ):
        """Store the configuration and make sure ``output_dir`` exists.

        Args:
            agent_factory: Zero-argument callable returning an object with a
                ``run(description, max_steps=...)`` method.
            judge_model_call: Optional chat-model callable; when given, runs
                are graded by ``LLMJudge.grade_semantic``.
            output_dir: Directory that receives the JSON summaries.
        """
        self.agent_factory = agent_factory
        self.judge = LLMJudge(judge_model_call) if judge_model_call else None
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    def _heuristic_score(task: BenchmarkTask, output: str) -> float:
        """Grade *output* with string checks only (no model involved)."""
        # grade_exact / grade_contains never touch the model callable, so a
        # judge constructed without one is sufficient here.
        grader = LLMJudge(model_call=None)
        if task.expected_answer:
            return grader.grade_exact(output, task.expected_answer)
        if task.expected_contains:
            return grader.grade_contains(output, task.expected_contains)
        # Nothing to compare against: neither pass nor clear failure.
        return 0.5

    def run_task(
        self,
        task: BenchmarkTask,
        num_runs: int = 1,
    ) -> List[TaskResult]:
        """Run *task* ``num_runs`` times and return one result per run.

        A run that raises (in the factory, the agent or the grader) is
        recorded as a failed result with the exception text in ``error``
        rather than aborting the remaining runs.
        """
        results = []
        for run_idx in range(num_runs):
            run_id = f"{task.id}_run{run_idx}"
            agent = None
            start = time.time()
            try:
                # Created inside the try so a failing factory is recorded as
                # a failed run instead of aborting the whole suite.
                agent = self.agent_factory()
                output = agent.run(task.description, max_steps=task.max_steps)
                # Duration covers agent creation and execution, not grading.
                duration = time.time() - start
                text = str(output)
                if self.judge:
                    verdict = self.judge.grade_semantic(
                        task.description,
                        text,
                        text,
                        task.expected_answer,
                        task.expected_contains,
                    )
                    score = verdict["score"]
                    reason = verdict["reason"]
                else:
                    score = self._heuristic_score(task, text)
                    reason = "Heuristic grading (no LLM judge)"
                results.append(TaskResult(
                    task_id=run_id,
                    success=score >= self.PASS_THRESHOLD,
                    score=score,
                    duration_sec=round(duration, 2),
                    steps_taken=getattr(agent, "step_number", 0),
                    final_output=text[:2000],
                    error=None,
                    judge_reason=reason,
                ))
            except Exception as e:
                # Deliberately broad: this is the isolation boundary between
                # benchmark runs; the failure is preserved in the result.
                duration = time.time() - start
                results.append(TaskResult(
                    task_id=run_id,
                    success=False,
                    score=0.0,
                    duration_sec=round(duration, 2),
                    steps_taken=getattr(agent, "step_number", 0) if agent is not None else 0,
                    final_output="",
                    error=str(e),
                    judge_reason="Exception during execution",
                ))
        return results

    def run_suite(
        self,
        tasks: Optional[List[BenchmarkTask]] = None,
        num_runs: int = 1,
        max_parallel: int = 2,
    ) -> EvalSummary:
        """Run every task, aggregate the results and save a JSON summary.

        Args:
            tasks: Tasks to run; ``None`` or an empty list selects
                ``DEFAULT_BENCHMARKS``.
            num_runs: Number of runs per task.
            max_parallel: Number of worker threads (values below 1 are
                treated as 1).

        Returns:
            The ``EvalSummary`` that was also written to ``output_dir``.
        """
        tasks = tasks or DEFAULT_BENCHMARKS
        all_results: List[TaskResult] = []
        with ThreadPoolExecutor(max_workers=max(1, max_parallel)) as executor:
            futures = [executor.submit(self.run_task, t, num_runs) for t in tasks]
            # Collect in submission order so results follow the task order.
            for future in futures:
                all_results.extend(future.result())

        # Aggregate
        passed = sum(1 for r in all_results if r.success)
        total = len(all_results)
        avg_score = sum(r.score for r in all_results) / max(total, 1)
        avg_duration = sum(r.duration_sec for r in all_results) / max(total, 1)

        # Exact id -> category lookup. Prefix matching would mis-assign a
        # result when one task id is a prefix of another.
        category_by_id: Dict[str, str] = {}
        for t in tasks:
            category_by_id.setdefault(t.id, t.category)

        scores_by_category: Dict[str, List[float]] = {}
        by_category: Dict[str, Any] = {}
        for r in all_results:
            # Result ids are "<task id>_run<index>"; strip the run suffix.
            base_id = r.task_id.rpartition("_run")[0]
            cat = category_by_id.get(base_id, "unknown")
            stats = by_category.setdefault(cat, {"count": 0, "passed": 0, "avg_score": 0.0})
            stats["count"] += 1
            if r.success:
                stats["passed"] += 1
            scores_by_category.setdefault(cat, []).append(r.score)
        for cat, scores in scores_by_category.items():
            by_category[cat]["avg_score"] = round(sum(scores) / max(len(scores), 1), 3)

        summary = EvalSummary(
            total_tasks=total,
            passed=passed,
            failed=total - passed,
            avg_score=round(avg_score, 3),
            avg_duration=round(avg_duration, 2),
            by_category=by_category,
            results=all_results,
        )

        # Save
        ts = int(time.time())
        path = os.path.join(self.output_dir, f"eval_summary_{ts}.json")
        # Two suites finishing within the same second (e.g. an A/B comparison
        # of fast agents) must not overwrite each other's summary.
        suffix = 1
        while os.path.exists(path):
            path = os.path.join(self.output_dir, f"eval_summary_{ts}_{suffix}.json")
            suffix += 1
        with open(path, "w", encoding="utf-8") as f:
            json.dump(asdict(summary), f, indent=2, default=str)
        print(f"Evaluation saved to {path}")
        return summary

    def compare_strategies(
        self,
        strategy_a_factory: Callable[[], Any],
        strategy_b_factory: Callable[[], Any],
        tasks: Optional[List[BenchmarkTask]] = None,
        num_runs: int = 3,
    ) -> Dict[str, Any]:
        """A/B test two agent configurations.

        Both strategies run the same tasks sequentially (one worker). The
        harness's own ``agent_factory`` is restored afterwards, even when a
        suite raises.

        Returns:
            A dict with ``strategy_a`` / ``strategy_b`` statistics
            (``avg_score``, ``pass_rate``, ``avg_duration``) and ``winner``,
            which is ``"A"``, ``"B"`` or ``"tie"`` when the average scores
            are equal.
        """
        old_factory = self.agent_factory
        try:
            print("Running Strategy A...")
            self.agent_factory = strategy_a_factory
            results_a = self.run_suite(tasks, num_runs=num_runs, max_parallel=1)
            print("Running Strategy B...")
            self.agent_factory = strategy_b_factory
            results_b = self.run_suite(tasks, num_runs=num_runs, max_parallel=1)
        finally:
            self.agent_factory = old_factory

        if results_a.avg_score > results_b.avg_score:
            winner = "A"
        elif results_b.avg_score > results_a.avg_score:
            winner = "B"
        else:
            winner = "tie"
        return {
            "strategy_a": {
                "avg_score": results_a.avg_score,
                "pass_rate": results_a.passed / max(results_a.total_tasks, 1),
                "avg_duration": results_a.avg_duration,
            },
            "strategy_b": {
                "avg_score": results_b.avg_score,
                "pass_rate": results_b.passed / max(results_b.total_tasks, 1),
                "avg_duration": results_b.avg_duration,
            },
            "winner": winner,
        }
| # --------------------------------------------------------------------------- | |
| # CLI entrypoint | |
| # --------------------------------------------------------------------------- | |
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Run evaluation harness")
    parser.add_argument("--runs", type=int, default=1, help="Number of runs per task")
    parser.add_argument("--parallel", type=int, default=1, help="Max parallel workers")
    parser.add_argument("--output", type=str, default="./eval_results", help="Output directory")
    parser.add_argument("--tasks", type=str, default="all", help="Task IDs comma-separated or 'all'")
    args = parser.parse_args()

    # Minimal stub factory — override with your real agent factory
    def dummy_agent_factory():
        class DummyAgent:
            def run(self, task, max_steps=50):
                return f"Dummy result for: {task}"

        return DummyAgent()

    tasks = DEFAULT_BENCHMARKS
    if args.tasks != "all":
        # Tolerate spaces around commas and ignore empty entries.
        requested = {part.strip() for part in args.tasks.split(",") if part.strip()}
        if not requested:
            parser.error("--tasks needs at least one task id or 'all'")
        unknown = sorted(requested - {t.id for t in DEFAULT_BENCHMARKS})
        if unknown:
            # Fail fast: an empty selection would otherwise make run_suite
            # fall back to the complete default suite.
            parser.error(f"Unknown task id(s): {', '.join(unknown)}")
        tasks = [t for t in DEFAULT_BENCHMARKS if t.id in requested]

    harness = EvaluationHarness(agent_factory=dummy_agent_factory, output_dir=args.output)
    summary = harness.run_suite(tasks, num_runs=args.runs, max_parallel=args.parallel)

    print("=" * 50)
    print(f"Total tasks: {summary.total_tasks}")
    print(f"Passed: {summary.passed}")
    print(f"Failed: {summary.failed}")
    print(f"Avg score: {summary.avg_score}")
    print(f"Avg duration: {summary.avg_duration}s")
    print("=" * 50)
    for cat, data in summary.by_category.items():
        print(f" {cat}: {data['passed']}/{data['count']} passed (avg score {data['avg_score']})")