Spaces:

jkorstad
/

computer-agent-v2

Sleeping

App Files Files Community

computer-agent-v2 / eval_harness.py

jkorstad

v2.0-polish: tuple streaming, plan+cost display wiring, tracker sync, interrupt safety, README, eval CLI

877f588 30 days ago

raw

history blame contribute delete

14.5 kB

	"""
	eval_harness.py — Enhanced Evaluation Framework
	================================================
	Supports custom benchmarks, WebArena-style tasks, GAIA-style tasks,
	A/B testing, and LLM-as-a-judge grading.
	"""

	import os
	import json
	import time
	import random
	from concurrent.futures import ThreadPoolExecutor
	from typing import Any, Dict, List, Optional, Callable
	from dataclasses import dataclass, field, asdict


	# ---------------------------------------------------------------------------
	# Benchmark Tasks
	# ---------------------------------------------------------------------------

	@dataclass
	class BenchmarkTask:
	    """Declarative description of a single benchmark scenario.

	    A task carries the prompt given to the agent plus the optional
	    expectations used to grade whatever the agent returns.
	    """

	    # Identifier of the task; the harness derives per-run result IDs from it
	    # and the CLI uses it to select tasks.
	    id: str
	    # Grouping label used when results are aggregated per category.
	    category: str
	    # Natural-language instruction passed to the agent.
	    description: str
	    # Answer text that should appear in the agent output, if one is known.
	    expected_answer: Optional[str] = None
	    # Keywords that should appear in the agent output, if any are known.
	    expected_contains: Optional[List[str]] = None
	    # Step budget forwarded to the agent for this task.
	    max_steps: int = 50
	    # Shell commands to prep the sandbox.
	    # NOTE(review): not consumed by the harness code in this file.
	    setup_script: Optional[str] = None
	    # Counterpart of setup_script; presumably shell commands for cleanup.
	    # NOTE(review): not consumed by the harness code in this file.
	    teardown_script: Optional[str] = None
	    # Relative importance of the task.
	    # NOTE(review): aggregation in this file does not apply it.
	    weight: float = 1.0


	# Built-in suite used when the caller does not supply its own task list.
	# Entries are kept in a fixed order; results are reported in the same order.
	DEFAULT_BENCHMARKS: List[BenchmarkTask] = [
	    # --- Web search / navigation / research -------------------------------
	    BenchmarkTask(
	        id="puppies",
	        category="web_search",
	        description="Find me pictures of cute puppies",
	        expected_contains=["puppy", "dog", "image"],
	        max_steps=30,
	    ),
	    BenchmarkTask(
	        id="gmaps_hf_hq",
	        category="web_navigation",
	        description="Use Google Maps to find the Hugging Face HQ in Paris",
	        expected_contains=["Paris", "Hugging Face", "5/7"],
	        max_steps=40,
	    ),
	    BenchmarkTask(
	        id="wikipedia_april4",
	        category="web_research",
	        description="Go to Wikipedia and find what happened on April 4th",
	        expected_contains=["April", "4"],
	        max_steps=30,
	    ),
	    BenchmarkTask(
	        id="commute_bern_basel",
	        category="web_navigation",
	        description="Find out the travel time by train from Bern to Basel on Google Maps",
	        expected_contains=["Bern", "Basel", "hour", "min"],
	        max_steps=40,
	    ),
	    # --- Hugging Face ecosystem -------------------------------------------
	    BenchmarkTask(
	        id="hf_flux_gpu",
	        category="hf_ecosystem",
	        description="Go to Hugging Face Spaces and find the Space flux.1 schnell. Use it to generate an image of a GPU",
	        expected_contains=["GPU", "image"],
	        max_steps=60,
	    ),
	    BenchmarkTask(
	        id="github_trending",
	        category="web_research",
	        description="Go to GitHub trending and find the top Python repository today",
	        expected_contains=["Python", "github.com"],
	        max_steps=35,
	    ),
	    # --- Documents and code execution -------------------------------------
	    BenchmarkTask(
	        id="pdf_extract",
	        category="document",
	        description="Download a sample PDF from the internet and extract the first paragraph",
	        expected_contains=["PDF", "paragraph"],
	        max_steps=40,
	    ),
	    # The only entry in this list with an exact expected answer.
	    BenchmarkTask(
	        id="calc_sum",
	        category="code_execution",
	        description="Calculate the sum of the first 100 prime numbers using Python",
	        expected_answer="24133",
	        max_steps=20,
	    ),
	    BenchmarkTask(
	        id="dark_mode_maps",
	        category="web_navigation",
	        description="Open Google Maps and switch to dark mode if available",
	        expected_contains=["dark", "theme"],
	        max_steps=30,
	    ),
	    BenchmarkTask(
	        id="hf_model_search",
	        category="hf_ecosystem",
	        description="Search Hugging Face Hub for 'text-to-video' models and list the top 3 by downloads",
	        expected_contains=["text-to-video", "model"],
	        max_steps=35,
	    ),
	]


	# ---------------------------------------------------------------------------
	# LLM-as-a-Judge
	# ---------------------------------------------------------------------------

	class LLMJudge:
	    """Grades agent outputs using string heuristics or a language model.

	    ``grade_exact`` and ``grade_contains`` are pure string checks and never
	    call the model; only ``grade_semantic`` invokes ``model_call``.
	    """

	    def __init__(self, model_call: Callable[[List[Dict[str, Any]]], str]):
	        """Store the callable used to query the judge model.

	        Args:
	            model_call: Called with a list of chat messages (dicts with
	                ``role`` and ``content`` keys); must return the reply text.
	        """
	        self.model_call = model_call

	    def grade_exact(self, predicted: str, expected: str) -> float:
	        """Return 1.0 when ``expected`` occurs in ``predicted``, else 0.0.

	        Despite the name, this is a case-insensitive substring test after
	        trimming surrounding whitespace on both sides, not an equality test.
	        """
	        needle = expected.lower().strip()
	        haystack = predicted.lower().strip()
	        return 1.0 if needle in haystack else 0.0

	    def grade_contains(self, predicted: str, expected_list: List[str]) -> float:
	        """Return the fraction of ``expected_list`` found in ``predicted``.

	        Matching is a case-insensitive substring test. An empty list scores
	        1.0 because there is nothing that could be missing.
	        """
	        if not expected_list:
	            return 1.0
	        haystack = predicted.lower()
	        matched = sum(1 for keyword in expected_list if keyword.lower() in haystack)
	        return matched / len(expected_list)

	    @staticmethod
	    def _strip_code_fence(text: str) -> str:
	        """Return the payload of a leading Markdown code fence, else ``text`` stripped."""
	        content = text.strip()
	        if not content.startswith("```"):
	            return content
	        # After splitting on the fence marker, index 0 is the (empty) text
	        # before the opening fence and index 1 is the fenced body. Taking the
	        # last element instead would return whatever follows the *closing*
	        # fence, which is normally empty.
	        body = content.split("```", 2)[1]
	        if body[:4].lower() == "json":
	            body = body[4:]
	        return body.strip()

	    def grade_semantic(
	        self,
	        task_description: str,
	        agent_trace: str,
	        predicted: str,
	        expected: Optional[str] = None,
	        expected_contains: Optional[List[str]] = None,
	    ) -> Dict[str, Any]:
	        """Use an LLM to judge success on a 0-1 scale.

	        Args:
	            task_description: The instruction the agent was given.
	            agent_trace: Accepted for interface compatibility; it is not
	                included in the prompt.
	            predicted: The agent's final output; only the first 2000
	                characters are shown to the judge.
	            expected: Reference answer, if any.
	            expected_contains: Reference keywords, if any.

	        Returns:
	            A dict with ``score`` (float clamped to 0.0-1.0), ``reason`` and
	            ``missing``. If the reply cannot be interpreted as a JSON object
	            with a numeric score, a keyword heuristic on ``predicted`` is
	            used instead (0.5 if it mentions "success" or "done", else 0.0).
	        """
	        prompt = f"""You are an expert evaluator. A computer agent was given this task:

	Task: {task_description}

	The agent's final response / trace summary:
	{predicted[:2000]}

	Expected answer (if any): {expected or 'N/A'}
	Expected keywords (if any): {expected_contains or 'N/A'}

	Rate the agent's success on a scale from 0.0 to 1.0, where:
	- 1.0 = fully completed and correct
	- 0.5 = partially correct or incomplete
	- 0.0 = completely wrong or failed

	Respond ONLY with a JSON object:
	{{"score": float, "reason": "short explanation", "missing": "what was missing"}}
	"""
	        response = self.model_call([{"role": "user", "content": prompt}])
	        content = self._strip_code_fence(response)
	        try:
	            result = json.loads(content)
	            score = float(result.get("score", 0.0))
	            if score != score:
	                # json.loads accepts the literal NaN, which is not a usable grade.
	                raise ValueError("judge returned NaN score")
	            return {
	                # Keep the grade on the documented 0-1 scale.
	                "score": min(max(score, 0.0), 1.0),
	                "reason": result.get("reason", ""),
	                "missing": result.get("missing", ""),
	            }
	        except (json.JSONDecodeError, ValueError, TypeError, AttributeError):
	            # TypeError covers a null/non-numeric score; AttributeError covers
	            # valid JSON that is not an object (e.g. a list or a bare number).
	            lowered = predicted.lower()
	            score = 0.5 if "success" in lowered or "done" in lowered else 0.0
	            return {"score": score, "reason": "LLM judge parsing failed, heuristic fallback", "missing": ""}


	# ---------------------------------------------------------------------------
	# Evaluation Harness
	# ---------------------------------------------------------------------------

	@dataclass
	class TaskResult:
	    """Outcome of one execution (one run) of a benchmark task."""

	    # Run identifier; the harness builds it as "<task id>_run<run index>".
	    task_id: str
	    # Whether the run counted as passed.
	    success: bool
	    # Grade assigned to the run.
	    score: float
	    # Wall-clock duration of the run in seconds.
	    duration_sec: float
	    # Number of steps the agent reported for the run.
	    steps_taken: int
	    # Text form of the agent's output (the harness truncates it before storing).
	    final_output: str
	    # Exception message when the run failed with an error, otherwise None.
	    error: Optional[str] = None
	    # Short explanation of how the score was obtained.
	    judge_reason: Optional[str] = None


	@dataclass
	class EvalSummary:
	    """Aggregated report for one evaluation suite execution."""

	    # Number of recorded runs (the harness counts results, one per run).
	    total_tasks: int
	    # Number of runs flagged as successful.
	    passed: int
	    # Number of runs not flagged as successful.
	    failed: int
	    # Mean score over all runs.
	    avg_score: float
	    # Mean run duration in seconds.
	    avg_duration: float
	    # Per-category statistics keyed by category name.
	    by_category: Dict[str, Dict[str, Any]]
	    # The individual run results the aggregates were computed from.
	    results: List[TaskResult]
	    # Creation time as a Unix timestamp; defaults to the moment of construction.
	    timestamp: float = field(default_factory=time.time)


	class EvaluationHarness:
	    """Run benchmarks against the agent and produce reports.

	    Agents are created through ``agent_factory`` (one fresh agent per run) and
	    must expose ``run(description, max_steps=...)``. Runs are graded by an
	    LLM judge when ``judge_model_call`` is supplied, otherwise by string
	    heuristics against the task's expected answer / keywords.
	    """

	    # Minimum score a run must reach to be counted as passed.
	    PASS_THRESHOLD = 0.7

	    def __init__(
	        self,
	        agent_factory: Callable[[], Any],
	        judge_model_call: Optional[Callable] = None,
	        output_dir: str = "./eval_results",
	    ):
	        """Configure the harness and make sure the output directory exists.

	        Args:
	            agent_factory: Zero-argument callable returning a new agent.
	            judge_model_call: Model callable handed to ``LLMJudge``; when
	                omitted, heuristic grading is used.
	            output_dir: Directory that receives the JSON summaries.
	        """
	        self.agent_factory = agent_factory
	        self.judge = LLMJudge(judge_model_call) if judge_model_call else None
	        self.output_dir = output_dir
	        os.makedirs(output_dir, exist_ok=True)

	    @staticmethod
	    def _heuristic_score(
	        output: str,
	        expected_answer: Optional[str],
	        expected_contains: Optional[List[str]],
	    ) -> float:
	        """Score ``output`` without a model, using case-insensitive substring tests.

	        An expected answer takes precedence (1.0 if present, else 0.0); with
	        keywords the score is the matched fraction; with neither it is 0.5.
	        """
	        text = output.lower()
	        if expected_answer:
	            return 1.0 if expected_answer.lower().strip() in text.strip() else 0.0
	        if expected_contains:
	            hits = sum(1 for keyword in expected_contains if keyword.lower() in text)
	            return hits / len(expected_contains)
	        return 0.5

	    def _grade(self, task: BenchmarkTask, output: str) -> tuple:
	        """Return ``(score, reason)`` for ``output`` produced for ``task``."""
	        if self.judge:
	            verdict = self.judge.grade_semantic(
	                task.description,
	                output,
	                output,
	                task.expected_answer,
	                task.expected_contains,
	            )
	            return verdict["score"], verdict["reason"]
	        # No judge configured: grade locally. Delegating to the (absent) judge
	        # here would score every task with expectations as 0.0.
	        score = self._heuristic_score(output, task.expected_answer, task.expected_contains)
	        return score, "Heuristic grading (no LLM judge)"

	    def run_task(
	        self,
	        task: BenchmarkTask,
	        num_runs: int = 1,
	    ) -> List[TaskResult]:
	        """Execute ``task`` ``num_runs`` times and grade every run.

	        Exceptions raised by the agent or the grader are captured as failed
	        results (score 0.0, ``error`` set) instead of propagating.

	        Returns:
	            One ``TaskResult`` per run, with IDs ``"<task.id>_run<index>"``.
	        """
	        results = []
	        for run_idx in range(num_runs):
	            run_id = f"{task.id}_run{run_idx}"
	            start = time.time()
	            agent = self.agent_factory()
	            try:
	                output = agent.run(task.description, max_steps=task.max_steps)
	                # Duration covers the agent run only, not the grading call.
	                duration = time.time() - start
	                output_text = str(output)
	                score, reason = self._grade(task, output_text)
	                results.append(TaskResult(
	                    task_id=run_id,
	                    success=score >= self.PASS_THRESHOLD,
	                    score=score,
	                    duration_sec=round(duration, 2),
	                    steps_taken=getattr(agent, "step_number", 0),
	                    final_output=output_text[:2000],
	                    error=None,
	                    judge_reason=reason,
	                ))
	            except Exception as e:
	                duration = time.time() - start
	                results.append(TaskResult(
	                    task_id=run_id,
	                    success=False,
	                    score=0.0,
	                    duration_sec=round(duration, 2),
	                    # Keep whatever progress the agent reported before failing.
	                    steps_taken=getattr(agent, "step_number", 0),
	                    final_output="",
	                    error=str(e),
	                    judge_reason="Exception during execution",
	                ))
	        return results

	    def _save_summary(self, summary: EvalSummary) -> str:
	        """Write ``summary`` as JSON into ``output_dir`` and return the file path."""
	        ts = int(time.time())
	        path = os.path.join(self.output_dir, f"eval_summary_{ts}.json")
	        # The timestamp has one-second resolution; two suites finishing within
	        # the same second (e.g. an A/B comparison of fast agents) must not
	        # overwrite each other.
	        suffix = 1
	        while os.path.exists(path):
	            path = os.path.join(self.output_dir, f"eval_summary_{ts}_{suffix}.json")
	            suffix += 1
	        with open(path, "w", encoding="utf-8") as f:
	            json.dump(asdict(summary), f, indent=2, default=str)
	        return path

	    def run_suite(
	        self,
	        tasks: Optional[List[BenchmarkTask]] = None,
	        num_runs: int = 1,
	        max_parallel: int = 2,
	    ) -> EvalSummary:
	        """Run a list of tasks, aggregate the results and save a JSON summary.

	        Args:
	            tasks: Tasks to run. ``None`` selects ``DEFAULT_BENCHMARKS``; an
	                explicit empty list runs nothing.
	            num_runs: Runs per task.
	            max_parallel: Number of worker threads; values below 1 are
	                treated as 1.

	        Returns:
	            The ``EvalSummary``; ``total_tasks`` counts runs, not tasks.
	        """
	        if tasks is None:
	            tasks = DEFAULT_BENCHMARKS
	        all_results: List[TaskResult] = []

	        with ThreadPoolExecutor(max_workers=max(1, max_parallel)) as executor:
	            futures = [executor.submit(self.run_task, t, num_runs) for t in tasks]
	            # Collect in submission order so results follow the task order.
	            for future in futures:
	                all_results.extend(future.result())

	        # Aggregate
	        total = len(all_results)
	        passed = sum(1 for r in all_results if r.success)
	        avg_score = sum(r.score for r in all_results) / max(total, 1)
	        avg_duration = sum(r.duration_sec for r in all_results) / max(total, 1)

	        # Resolve categories by exact task id. A prefix test would mis-assign
	        # results when one id is a prefix of another. First definition wins
	        # if ids are duplicated.
	        category_of: Dict[str, str] = {}
	        for t in tasks:
	            category_of.setdefault(t.id, t.category)

	        grouped: Dict[str, List[TaskResult]] = {}
	        for r in all_results:
	            # Result ids are "<task id>_run<index>"; drop the last "_run..." part.
	            base_id = r.task_id.rpartition("_run")[0]
	            grouped.setdefault(category_of.get(base_id, "unknown"), []).append(r)

	        by_category: Dict[str, Dict[str, Any]] = {
	            cat: {
	                "count": len(items),
	                "passed": sum(1 for r in items if r.success),
	                "avg_score": round(sum(r.score for r in items) / len(items), 3),
	            }
	            for cat, items in grouped.items()
	        }

	        summary = EvalSummary(
	            total_tasks=total,
	            passed=passed,
	            failed=total - passed,
	            avg_score=round(avg_score, 3),
	            avg_duration=round(avg_duration, 2),
	            by_category=by_category,
	            results=all_results,
	        )

	        # Save
	        path = self._save_summary(summary)
	        print(f"Evaluation saved to {path}")
	        return summary

	    def compare_strategies(
	        self,
	        strategy_a_factory: Callable[[], Any],
	        strategy_b_factory: Callable[[], Any],
	        tasks: Optional[List[BenchmarkTask]] = None,
	        num_runs: int = 3,
	    ) -> Dict[str, Any]:
	        """A/B test two agent configurations.

	        Both strategies run the same tasks sequentially (one worker). The
	        harness's own ``agent_factory`` is swapped temporarily and restored
	        afterwards, even if a suite raises.

	        Returns:
	            A dict with ``strategy_a`` / ``strategy_b`` metrics (``avg_score``,
	            ``pass_rate``, ``avg_duration``) and ``winner``: ``"A"``, ``"B"``,
	            or ``"tie"`` when the average scores are equal.
	        """
	        old_factory = self.agent_factory
	        try:
	            print("Running Strategy A...")
	            self.agent_factory = strategy_a_factory
	            results_a = self.run_suite(tasks, num_runs=num_runs, max_parallel=1)

	            print("Running Strategy B...")
	            self.agent_factory = strategy_b_factory
	            results_b = self.run_suite(tasks, num_runs=num_runs, max_parallel=1)
	        finally:
	            self.agent_factory = old_factory

	        if results_a.avg_score > results_b.avg_score:
	            winner = "A"
	        elif results_b.avg_score > results_a.avg_score:
	            winner = "B"
	        else:
	            # Equal averages are not a win for either side.
	            winner = "tie"

	        return {
	            "strategy_a": {
	                "avg_score": results_a.avg_score,
	                "pass_rate": results_a.passed / max(results_a.total_tasks, 1),
	                "avg_duration": results_a.avg_duration,
	            },
	            "strategy_b": {
	                "avg_score": results_b.avg_score,
	                "pass_rate": results_b.passed / max(results_b.total_tasks, 1),
	                "avg_duration": results_b.avg_duration,
	            },
	            "winner": winner,
	        }


	# ---------------------------------------------------------------------------
	# CLI entrypoint
	# ---------------------------------------------------------------------------

	if __name__ == "__main__":
	    import argparse

	    parser = argparse.ArgumentParser(description="Run evaluation harness")
	    parser.add_argument("--runs", type=int, default=1, help="Number of runs per task")
	    parser.add_argument("--parallel", type=int, default=1, help="Max parallel workers")
	    parser.add_argument("--output", type=str, default="./eval_results", help="Output directory")
	    parser.add_argument("--tasks", type=str, default="all", help="Task IDs comma-separated or 'all'")
	    args = parser.parse_args()

	    # Reject values that would make the run a silent no-op or crash the pool.
	    if args.runs < 1:
	        parser.error("--runs must be at least 1")
	    if args.parallel < 1:
	        parser.error("--parallel must be at least 1")

	    # Minimal stub factory — override with your real agent factory
	    def dummy_agent_factory():
	        class DummyAgent:
	            def run(self, task, max_steps=50):
	                return f"Dummy result for: {task}"
	        return DummyAgent()

	    tasks = DEFAULT_BENCHMARKS
	    if args.tasks.strip() != "all":
	        # Tolerate spaces around commas ("a, b") and stray empty entries.
	        requested = [part.strip() for part in args.tasks.split(",") if part.strip()]
	        if not requested:
	            parser.error("--tasks needs at least one task ID or 'all'")
	        known_ids = {t.id for t in DEFAULT_BENCHMARKS}
	        unknown = [task_id for task_id in requested if task_id not in known_ids]
	        if unknown:
	            # Fail loudly: a typo must not shrink the selection silently or
	            # fall back to running the entire default suite.
	            parser.error(f"unknown task ID(s): {', '.join(unknown)}")
	        wanted = set(requested)
	        tasks = [t for t in DEFAULT_BENCHMARKS if t.id in wanted]

	    harness = EvaluationHarness(agent_factory=dummy_agent_factory, output_dir=args.output)
	    summary = harness.run_suite(tasks, num_runs=args.runs, max_parallel=args.parallel)
	    print("=" * 50)
	    print(f"Total tasks: {summary.total_tasks}")
	    print(f"Passed: {summary.passed}")
	    print(f"Failed: {summary.failed}")
	    print(f"Avg score: {summary.avg_score}")
	    print(f"Avg duration: {summary.avg_duration}s")
	    print("=" * 50)
	    for cat, data in summary.by_category.items():
	        print(f" {cat}: {data['passed']}/{data['count']} passed (avg score {data['avg_score']})")