File size: 12,839 Bytes

31e5b3a

"""
eval_harness.py — Enhanced Evaluation Framework
================================================
Supports custom benchmarks, WebArena-style tasks, GAIA-style tasks,
A/B testing, and LLM-as-a-judge grading.
"""

import os
import json
import time
import random
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional, Callable
from dataclasses import dataclass, field, asdict


# ---------------------------------------------------------------------------
# Benchmark Tasks
# ---------------------------------------------------------------------------

@dataclass
class BenchmarkTask:
    id: str
    category: str
    description: str
    expected_answer: Optional[str] = None
    expected_contains: Optional[List[str]] = None
    max_steps: int = 50
    setup_script: Optional[str] = None  # Shell commands to prep the sandbox
    teardown_script: Optional[str] = None
    weight: float = 1.0


DEFAULT_BENCHMARKS: List[BenchmarkTask] = [
    # Web navigation
    BenchmarkTask(
        id="puppies",
        category="web_search",
        description="Find me pictures of cute puppies",
        expected_contains=["puppy", "dog", "image"],
        max_steps=30,
    ),
    BenchmarkTask(
        id="gmaps_hf_hq",
        category="web_navigation",
        description="Use Google Maps to find the Hugging Face HQ in Paris",
        expected_contains=["Paris", "Hugging Face", "5/7"],
        max_steps=40,
    ),
    BenchmarkTask(
        id="wikipedia_april4",
        category="web_research",
        description="Go to Wikipedia and find what happened on April 4th",
        expected_contains=["April", "4"],
        max_steps=30,
    ),
    BenchmarkTask(
        id="commute_bern_basel",
        category="web_navigation",
        description="Find out the travel time by train from Bern to Basel on Google Maps",
        expected_contains=["Bern", "Basel", "hour", "min"],
        max_steps=40,
    ),
    BenchmarkTask(
        id="hf_flux_gpu",
        category="hf_ecosystem",
        description="Go to Hugging Face Spaces and find the Space flux.1 schnell. Use it to generate an image of a GPU",
        expected_contains=["GPU", "image"],
        max_steps=60,
    ),
    BenchmarkTask(
        id="github_trending",
        category="web_research",
        description="Go to GitHub trending and find the top Python repository today",
        expected_contains=["Python", "github.com"],
        max_steps=35,
    ),
    BenchmarkTask(
        id="pdf_extract",
        category="document",
        description="Download a sample PDF from the internet and extract the first paragraph",
        expected_contains=["PDF", "paragraph"],
        max_steps=40,
    ),
    BenchmarkTask(
        id="calc_sum",
        category="code_execution",
        description="Calculate the sum of the first 100 prime numbers using Python",
        expected_answer="24133",
        max_steps=20,
    ),
    BenchmarkTask(
        id="dark_mode_maps",
        category="web_navigation",
        description="Open Google Maps and switch to dark mode if available",
        expected_contains=["dark", "theme"],
        max_steps=30,
    ),
    BenchmarkTask(
        id="hf_model_search",
        category="hf_ecosystem",
        description="Search Hugging Face Hub for 'text-to-video' models and list the top 3 by downloads",
        expected_contains=["text-to-video", "model"],
        max_steps=35,
    ),
]


# ---------------------------------------------------------------------------
# LLM-as-a-Judge
# ---------------------------------------------------------------------------

class LLMJudge:
    """Grades agent outputs using a language model."""

    def __init__(self, model_call: Callable[[List[Dict[str, Any]]], str]):
        self.model_call = model_call

    def grade_exact(self, predicted: str, expected: str) -> float:
        return 1.0 if expected.lower().strip() in predicted.lower().strip() else 0.0

    def grade_contains(self, predicted: str, expected_list: List[str]) -> float:
        if not expected_list:
            return 1.0
        matched = sum(1 for e in expected_list if e.lower() in predicted.lower())
        return matched / len(expected_list)

    def grade_semantic(
        self,
        task_description: str,
        agent_trace: str,
        predicted: str,
        expected: Optional[str] = None,
        expected_contains: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Use an LLM to judge success on a 0-1 scale."""
        prompt = f"""You are an expert evaluator. A computer agent was given this task:

Task: {task_description}

The agent's final response / trace summary:
{predicted[:2000]}

Expected answer (if any): {expected or 'N/A'}
Expected keywords (if any): {expected_contains or 'N/A'}

Rate the agent's success on a scale from 0.0 to 1.0, where:
- 1.0 = fully completed and correct
- 0.5 = partially correct or incomplete
- 0.0 = completely wrong or failed

Respond ONLY with a JSON object:
{{"score": float, "reason": "short explanation", "missing": "what was missing"}}
"""
        response = self.model_call([{"role": "user", "content": prompt}])
        content = response.strip()
        if content.startswith("```"):
            content = content.split("```", 2)[-1]
            if content.startswith("json"):
                content = content[4:]
        content = content.strip()
        try:
            result = json.loads(content)
            return {
                "score": float(result.get("score", 0.0)),
                "reason": result.get("reason", ""),
                "missing": result.get("missing", ""),
            }
        except (json.JSONDecodeError, ValueError):
            # Fallback heuristic
            score = 0.5 if "success" in predicted.lower() or "done" in predicted.lower() else 0.0
            return {"score": score, "reason": "LLM judge parsing failed, heuristic fallback", "missing": ""}


# ---------------------------------------------------------------------------
# Evaluation Harness
# ---------------------------------------------------------------------------

@dataclass
class TaskResult:
    task_id: str
    success: bool
    score: float
    duration_sec: float
    steps_taken: int
    final_output: str
    error: Optional[str] = None
    judge_reason: Optional[str] = None


@dataclass
class EvalSummary:
    total_tasks: int
    passed: int
    failed: int
    avg_score: float
    avg_duration: float
    by_category: Dict[str, Dict[str, Any]]
    results: List[TaskResult]
    timestamp: float = field(default_factory=time.time)


class EvaluationHarness:
    """Run benchmarks against the agent and produce reports."""

    def __init__(
        self,
        agent_factory: Callable[[], Any],
        judge_model_call: Optional[Callable] = None,
        output_dir: str = "./eval_results",
    ):
        self.agent_factory = agent_factory
        self.judge = LLMJudge(judge_model_call) if judge_model_call else None
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def run_task(
        self,
        task: BenchmarkTask,
        num_runs: int = 1,
    ) -> List[TaskResult]:
        results = []
        for run_idx in range(num_runs):
            start = time.time()
            agent = self.agent_factory()
            try:
                # Run the agent
                output = agent.run(task.description, max_steps=task.max_steps)
                duration = time.time() - start

                # Grade
                if self.judge:
                    judge_result = self.judge.grade_semantic(
                        task.description,
                        str(output),
                        str(output),
                        task.expected_answer,
                        task.expected_contains,
                    )
                    score = judge_result["score"]
                    reason = judge_result["reason"]
                else:
                    if task.expected_answer:
                        score = self.judge.grade_exact(str(output), task.expected_answer) if self.judge else 0.0
                    elif task.expected_contains:
                        score = self.judge.grade_contains(str(output), task.expected_contains) if self.judge else 0.0
                    else:
                        score = 0.5
                    reason = "Heuristic grading (no LLM judge)"

                success = score >= 0.7
                results.append(TaskResult(
                    task_id=f"{task.id}_run{run_idx}",
                    success=success,
                    score=score,
                    duration_sec=round(duration, 2),
                    steps_taken=getattr(agent, "step_number", 0),
                    final_output=str(output)[:2000],
                    error=None,
                    judge_reason=reason,
                ))
            except Exception as e:
                duration = time.time() - start
                results.append(TaskResult(
                    task_id=f"{task.id}_run{run_idx}",
                    success=False,
                    score=0.0,
                    duration_sec=round(duration, 2),
                    steps_taken=0,
                    final_output="",
                    error=str(e),
                    judge_reason="Exception during execution",
                ))
        return results

    def run_suite(
        self,
        tasks: Optional[List[BenchmarkTask]] = None,
        num_runs: int = 1,
        max_parallel: int = 2,
    ) -> EvalSummary:
        tasks = tasks or DEFAULT_BENCHMARKS
        all_results: List[TaskResult] = []

        def run_single(task):
            return self.run_task(task, num_runs=num_runs)

        with ThreadPoolExecutor(max_workers=max_parallel) as executor:
            futures = [executor.submit(run_single, t) for t in tasks]
            for future in futures:
                all_results.extend(future.result())

        # Aggregate
        passed = sum(1 for r in all_results if r.success)
        total = len(all_results)
        avg_score = sum(r.score for r in all_results) / max(total, 1)
        avg_duration = sum(r.duration_sec for r in all_results) / max(total, 1)

        by_category: Dict[str, Any] = {}
        for r in all_results:
            # Map back to category from task_id prefix
            cat = "unknown"
            for t in tasks:
                if r.task_id.startswith(t.id):
                    cat = t.category
                    break
            by_category.setdefault(cat, {"count": 0, "passed": 0, "avg_score": 0.0, "scores": []})
            by_category[cat]["count"] += 1
            if r.success:
                by_category[cat]["passed"] += 1
            by_category[cat]["scores"].append(r.score)

        for cat, data in by_category.items():
            data["avg_score"] = round(sum(data["scores"]) / max(len(data["scores"]), 1), 3)
            del data["scores"]

        summary = EvalSummary(
            total_tasks=total,
            passed=passed,
            failed=total - passed,
            avg_score=round(avg_score, 3),
            avg_duration=round(avg_duration, 2),
            by_category=by_category,
            results=all_results,
        )

        # Save
        ts = int(time.time())
        path = os.path.join(self.output_dir, f"eval_summary_{ts}.json")
        with open(path, "w") as f:
            json.dump(asdict(summary), f, indent=2, default=str)
        print(f"Evaluation saved to {path}")
        return summary

    def compare_strategies(
        self,
        strategy_a_factory: Callable[[], Any],
        strategy_b_factory: Callable[[], Any],
        tasks: Optional[List[BenchmarkTask]] = None,
        num_runs: int = 3,
    ) -> Dict[str, Any]:
        """A/B test two agent configurations."""
        print("Running Strategy A...")
        old_factory = self.agent_factory
        self.agent_factory = strategy_a_factory
        results_a = self.run_suite(tasks, num_runs=num_runs, max_parallel=1)

        print("Running Strategy B...")
        self.agent_factory = strategy_b_factory
        results_b = self.run_suite(tasks, num_runs=num_runs, max_parallel=1)

        self.agent_factory = old_factory

        return {
            "strategy_a": {
                "avg_score": results_a.avg_score,
                "pass_rate": results_a.passed / max(results_a.total_tasks, 1),
                "avg_duration": results_a.avg_duration,
            },
            "strategy_b": {
                "avg_score": results_b.avg_score,
                "pass_rate": results_b.passed / max(results_b.total_tasks, 1),
                "avg_duration": results_b.avg_duration,
            },
            "winner": "A" if results_a.avg_score > results_b.avg_score else "B",
        }