"""
eval_harness.py — Enhanced Evaluation Framework
================================================
Supports custom benchmarks, WebArena-style tasks, GAIA-style tasks,
A/B testing, and LLM-as-a-judge grading.
"""

import os
import json
import time
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, List, Optional, Callable
from dataclasses import dataclass, field, asdict


# ---------------------------------------------------------------------------
# Benchmark Tasks
# ---------------------------------------------------------------------------

@dataclass
class BenchmarkTask:
    id: str
    category: str
    description: str
    expected_answer: Optional[str] = None
    expected_contains: Optional[List[str]] = None
    max_steps: int = 50
    setup_script: Optional[str] = None  # Shell commands to prep the sandbox
    teardown_script: Optional[str] = None
    weight: float = 1.0


DEFAULT_BENCHMARKS: List[BenchmarkTask] = [
    # Web navigation
    BenchmarkTask(
        id="puppies",
        category="web_search",
        description="Find me pictures of cute puppies",
        expected_contains=["puppy", "dog", "image"],
        max_steps=30,
    ),
    BenchmarkTask(
        id="gmaps_hf_hq",
        category="web_navigation",
        description="Use Google Maps to find the Hugging Face HQ in Paris",
        expected_contains=["Paris", "Hugging Face", "5/7"],
        max_steps=40,
    ),
    BenchmarkTask(
        id="wikipedia_april4",
        category="web_research",
        description="Go to Wikipedia and find what happened on April 4th",
        expected_contains=["April", "4"],
        max_steps=30,
    ),
    BenchmarkTask(
        id="commute_bern_basel",
        category="web_navigation",
        description="Find out the travel time by train from Bern to Basel on Google Maps",
        expected_contains=["Bern", "Basel", "hour", "min"],
        max_steps=40,
    ),
    BenchmarkTask(
        id="hf_flux_gpu",
        category="hf_ecosystem",
        description="Go to Hugging Face Spaces and find the Space flux.1 schnell. Use it to generate an image of a GPU",
        expected_contains=["GPU", "image"],
        max_steps=60,
    ),
    BenchmarkTask(
        id="github_trending",
        category="web_research",
        description="Go to GitHub trending and find the top Python repository today",
        expected_contains=["Python", "github.com"],
        max_steps=35,
    ),
    BenchmarkTask(
        id="pdf_extract",
        category="document",
        description="Download a sample PDF from the internet and extract the first paragraph",
        expected_contains=["PDF", "paragraph"],
        max_steps=40,
    ),
    BenchmarkTask(
        id="calc_sum",
        category="code_execution",
        description="Calculate the sum of the first 100 prime numbers using Python",
        expected_answer="24133",
        max_steps=20,
    ),
    BenchmarkTask(
        id="dark_mode_maps",
        category="web_navigation",
        description="Open Google Maps and switch to dark mode if available",
        expected_contains=["dark", "theme"],
        max_steps=30,
    ),
    BenchmarkTask(
        id="hf_model_search",
        category="hf_ecosystem",
        description="Search Hugging Face Hub for 'text-to-video' models and list the top 3 by downloads",
        expected_contains=["text-to-video", "model"],
        max_steps=35,
    ),
]
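
# Example of a task that uses setup/teardown scripts, in the style of
# GAIA/WebArena environment-prepared tasks (illustrative sketch, not part of
# the default suite): the file path and CSV contents below are made up for
# demonstration, and a runner is assumed to execute the shell snippets
# before/after the episode.
CUSTOM_TASK_EXAMPLE = BenchmarkTask(
    id="csv_row_count",
    category="document",
    description="Open /tmp/eval_data/sales.csv and report how many data rows it contains",
    expected_answer="3",
    max_steps=25,
    setup_script=(
        "mkdir -p /tmp/eval_data && "
        "printf 'date,amount\\n2024-01-01,10\\n2024-01-02,20\\n2024-01-03,30\\n' "
        "> /tmp/eval_data/sales.csv"
    ),
    teardown_script="rm -rf /tmp/eval_data",
)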


# ---------------------------------------------------------------------------
# LLM-as-a-Judge
# ---------------------------------------------------------------------------

class LLMJudge:
    """Grades agent outputs using a language model."""

    def __init__(self, model_call: Callable[[List[Dict[str, Any]]], str]):
        self.model_call = model_call

    @staticmethod
    def grade_exact(predicted: str, expected: str) -> float:
        # Substring match on normalized strings; usable with or without a judge instance.
        return 1.0 if expected.lower().strip() in predicted.lower().strip() else 0.0

    @staticmethod
    def grade_contains(predicted: str, expected_list: List[str]) -> float:
        # Fraction of expected keywords found in the prediction.
        if not expected_list:
            return 1.0
        matched = sum(1 for e in expected_list if e.lower() in predicted.lower())
        return matched / len(expected_list)

    def grade_semantic(
        self,
        task_description: str,
        agent_trace: str,
        predicted: str,
        expected: Optional[str] = None,
        expected_contains: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Use an LLM to judge success on a 0-1 scale."""
        prompt = f"""You are an expert evaluator. A computer agent was given this task:

Task: {task_description}

The agent's final response / trace summary:
{predicted[:2000]}

Expected answer (if any): {expected or 'N/A'}
Expected keywords (if any): {expected_contains or 'N/A'}

Rate the agent's success on a scale from 0.0 to 1.0, where:
- 1.0 = fully completed and correct
- 0.5 = partially correct or incomplete
- 0.0 = completely wrong or failed

Respond ONLY with a JSON object:
{{"score": float, "reason": "short explanation", "missing": "what was missing"}}
"""
        response = self.model_call([{"role": "user", "content": prompt}])
        content = response.strip()
        if content.startswith("```"):
            # Strip a surrounding markdown fence such as ```json ... ```
            content = content.strip("`").strip()
            if content.startswith("json"):
                content = content[len("json"):]
        content = content.strip()
        try:
            result = json.loads(content)
            return {
                "score": float(result.get("score", 0.0)),
                "reason": result.get("reason", ""),
                "missing": result.get("missing", ""),
            }
        except (json.JSONDecodeError, ValueError):
            # Fallback heuristic
            score = 0.5 if "success" in predicted.lower() or "done" in predicted.lower() else 0.0
            return {"score": score, "reason": "LLM judge parsing failed, heuristic fallback", "missing": ""}


# ---------------------------------------------------------------------------
# Evaluation Harness
# ---------------------------------------------------------------------------

@dataclass
class TaskResult:
    task_id: str
    success: bool
    score: float
    duration_sec: float
    steps_taken: int
    final_output: str
    error: Optional[str] = None
    judge_reason: Optional[str] = None


@dataclass
class EvalSummary:
    total_tasks: int
    passed: int
    failed: int
    avg_score: float
    avg_duration: float
    by_category: Dict[str, Dict[str, Any]]
    results: List[TaskResult]
    timestamp: float = field(default_factory=time.time)


class EvaluationHarness:
    """Run benchmarks against the agent and produce reports."""

    def __init__(
        self,
        agent_factory: Callable[[], Any],
        judge_model_call: Optional[Callable[[List[Dict[str, Any]]], str]] = None,
        output_dir: str = "./eval_results",
    ):
        self.agent_factory = agent_factory
        self.judge = LLMJudge(judge_model_call) if judge_model_call else None
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def run_task(
        self,
        task: BenchmarkTask,
        num_runs: int = 1,
    ) -> List[TaskResult]:
        results = []
        for run_idx in range(num_runs):
            start = time.time()
            agent = self.agent_factory()
            try:
                # Run the agent
                output = agent.run(task.description, max_steps=task.max_steps)
                duration = time.time() - start

                # Grade
                if self.judge:
                    judge_result = self.judge.grade_semantic(
                        task.description,
                        str(output),
                        str(output),
                        task.expected_answer,
                        task.expected_contains,
                    )
                    score = judge_result["score"]
                    reason = judge_result["reason"]
                else:
                    # No judge configured: fall back to simple string matching.
                    if task.expected_answer:
                        score = LLMJudge.grade_exact(str(output), task.expected_answer)
                    elif task.expected_contains:
                        score = LLMJudge.grade_contains(str(output), task.expected_contains)
                    else:
                        score = 0.5
                    reason = "Heuristic grading (no LLM judge)"

                success = score >= 0.7
                results.append(TaskResult(
                    task_id=f"{task.id}_run{run_idx}",
                    success=success,
                    score=score,
                    duration_sec=round(duration, 2),
                    steps_taken=getattr(agent, "step_number", 0),
                    final_output=str(output)[:2000],
                    error=None,
                    judge_reason=reason,
                ))
            except Exception as e:
                duration = time.time() - start
                results.append(TaskResult(
                    task_id=f"{task.id}_run{run_idx}",
                    success=False,
                    score=0.0,
                    duration_sec=round(duration, 2),
                    steps_taken=0,
                    final_output="",
                    error=str(e),
                    judge_reason="Exception during execution",
                ))
        return results

    def run_suite(
        self,
        tasks: Optional[List[BenchmarkTask]] = None,
        num_runs: int = 1,
        max_parallel: int = 2,
    ) -> EvalSummary:
        tasks = tasks or DEFAULT_BENCHMARKS
        all_results: List[TaskResult] = []

        def run_single(task):
            return self.run_task(task, num_runs=num_runs)

        with ThreadPoolExecutor(max_workers=max_parallel) as executor:
            futures = [executor.submit(run_single, t) for t in tasks]
            for future in futures:
                all_results.extend(future.result())

        # Aggregate
        passed = sum(1 for r in all_results if r.success)
        total = len(all_results)
        avg_score = sum(r.score for r in all_results) / max(total, 1)
        avg_duration = sum(r.duration_sec for r in all_results) / max(total, 1)

        by_category: Dict[str, Any] = {}
        for r in all_results:
            # Map back to category from task_id prefix
            cat = "unknown"
            for t in tasks:
                if r.task_id.startswith(f"{t.id}_run"):
                    cat = t.category
                    break
            by_category.setdefault(cat, {"count": 0, "passed": 0, "avg_score": 0.0, "scores": []})
            by_category[cat]["count"] += 1
            if r.success:
                by_category[cat]["passed"] += 1
            by_category[cat]["scores"].append(r.score)

        for cat, data in by_category.items():
            data["avg_score"] = round(sum(data["scores"]) / max(len(data["scores"]), 1), 3)
            del data["scores"]

        summary = EvalSummary(
            total_tasks=total,
            passed=passed,
            failed=total - passed,
            avg_score=round(avg_score, 3),
            avg_duration=round(avg_duration, 2),
            by_category=by_category,
            results=all_results,
        )

        # Save
        ts = int(time.time())
        path = os.path.join(self.output_dir, f"eval_summary_{ts}.json")
        with open(path, "w") as f:
            json.dump(asdict(summary), f, indent=2, default=str)
        print(f"Evaluation saved to {path}")
        return summary

    def compare_strategies(
        self,
        strategy_a_factory: Callable[[], Any],
        strategy_b_factory: Callable[[], Any],
        tasks: Optional[List[BenchmarkTask]] = None,
        num_runs: int = 3,
    ) -> Dict[str, Any]:
        """A/B test two agent configurations."""
        print("Running Strategy A...")
        old_factory = self.agent_factory
        self.agent_factory = strategy_a_factory
        results_a = self.run_suite(tasks, num_runs=num_runs, max_parallel=1)

        print("Running Strategy B...")
        self.agent_factory = strategy_b_factory
        results_b = self.run_suite(tasks, num_runs=num_runs, max_parallel=1)

        self.agent_factory = old_factory

        return {
            "strategy_a": {
                "avg_score": results_a.avg_score,
                "pass_rate": results_a.passed / max(results_a.total_tasks, 1),
                "avg_duration": results_a.avg_duration,
            },
            "strategy_b": {
                "avg_score": results_b.avg_score,
                "pass_rate": results_b.passed / max(results_b.total_tasks, 1),
                "avg_duration": results_b.avg_duration,
            },
            "winner": "A" if results_a.avg_score > results_b.avg_score else "B",
        }
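

# ---------------------------------------------------------------------------
# Example usage (illustrative sketch)
# ---------------------------------------------------------------------------
# The block below shows one way the harness could be wired up end-to-end. It
# uses a stub agent and a stub judge defined inline so it runs without any
# real model or browser; both are placeholders, not part of the framework,
# and should be replaced with a real agent factory and LLM call.

if __name__ == "__main__":
    class _StubAgent:
        """Toy agent that only echoes the task; stands in for a real agent."""
        step_number = 1

        def run(self, task: str, max_steps: int = 50) -> str:
            return f"Stub run of: {task}"

    def _stub_judge(messages: List[Dict[str, Any]]) -> str:
        # Pretend judge: always returns a mid score in the expected JSON shape.
        return json.dumps({"score": 0.5, "reason": "stub judge", "missing": ""})

    harness = EvaluationHarness(
        agent_factory=_StubAgent,
        judge_model_call=_stub_judge,
    )
    summary = harness.run_suite(tasks=DEFAULT_BENCHMARKS[:2], num_runs=1, max_parallel=1)
    print(f"Passed {summary.passed}/{summary.total_tasks}, avg score {summary.avg_score}")
    # An A/B comparison would be run the same way, e.g.:
    # harness.compare_strategies(_StubAgent, _StubAgent, tasks=DEFAULT_BENCHMARKS[:1])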