# server/benchmark_runner.py
"""
Benchmark Runner + Leaderboard — v4.0

Automatically runs ALL tasks Ɨ selected agent configurations and generates a
research-grade leaderboard with per-task, per-strategy breakdowns. Unlike
existing benchmarks (SWE-bench, HumanEval), which require manual setup, this
runs end-to-end in-process with deterministic strategies.

Output format:
- Leaderboard table (ranked by composite score)
- Per-task breakdown
- Per-failure-type breakdown
- Generalization score (variance across tasks)
- Robustness score (from counterfactual engine)
- A "benchmark JSON" suitable for publishing or comparing systems
"""

from __future__ import annotations

import time
import json
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, field


@dataclass
class BenchmarkResult:
    """Result of running one agent on one task variant."""

    agent_name: str
    task: str
    variant_id: str
    final_score: float
    total_steps: int
    cumulative_reward: float
    duration_seconds: float
    strategy: str
    failure_type: str
    reliability_index: float
    causal_score: float
    robustness_score: float
    calibration_score: float
    action_sequence: List[str]


@dataclass
class AgentBenchmarkSummary:
    """Aggregated results for one agent across all tasks."""

    agent_name: str
    tasks_run: int
    mean_score: float
    std_score: float
    generalization_score: float  # 1 - std (lower variance = more generalizable)
    mean_steps: float
    best_task: str
    worst_task: str
    mean_reliability: float
    mean_causal_score: float
    mean_robustness_score: float
    mean_calibration_score: float
    dominant_strategy: str
    dominant_failure: str
    composite_rank_score: float  # Weighted final score for leaderboard
    per_task_scores: Dict[str, float]

    def to_dict(self) -> dict:
        return {
            "agent_name": self.agent_name,
            "tasks_run": self.tasks_run,
            "scores": {
                "mean": round(self.mean_score, 3),
                "std": round(self.std_score, 3),
                "generalization": round(self.generalization_score, 3),
                "reliability": round(self.mean_reliability, 3),
                "causal_reasoning": round(self.mean_causal_score, 3),
                "robustness": round(self.mean_robustness_score, 3),
                "calibration": round(self.mean_calibration_score, 3),
                "composite": round(self.composite_rank_score, 3),
            },
            "efficiency": {
                "mean_steps": round(self.mean_steps, 1),
            },
            "behavior": {
                "dominant_strategy": self.dominant_strategy,
                "dominant_failure": self.dominant_failure,
            },
            "per_task_scores": {k: round(v, 3) for k, v in self.per_task_scores.items()},
            "best_task": self.best_task,
            "worst_task": self.worst_task,
        }


@dataclass
class LeaderboardReport:
    """Full benchmark leaderboard."""

    benchmark_id: str
    tasks_evaluated: List[str]
    agents_evaluated: List[str]
    total_episodes: int
    run_duration_seconds: float
    rankings: List[AgentBenchmarkSummary]
    raw_results: List[BenchmarkResult]

    def to_dict(self) -> dict:
        return {
            "benchmark_id": self.benchmark_id,
            "tasks_evaluated": self.tasks_evaluated,
            "agents_evaluated": self.agents_evaluated,
            "total_episodes": self.total_episodes,
            "run_duration_seconds": round(self.run_duration_seconds, 2),
            "leaderboard": [r.to_dict() for r in self.rankings],
            "winner": self.rankings[0].agent_name if self.rankings else "none",
            "insights": self._generate_insights(),
        }

    def _generate_insights(self) -> List[str]:
        if not self.rankings:
            return []
        insights = []
        top = self.rankings[0]
        bottom = self.rankings[-1]
        if top.composite_rank_score - bottom.composite_rank_score > 0.2:
            insights.append(
                f"Large performance gap: '{top.agent_name}' ({top.composite_rank_score:.2f}) "
                f"vs '{bottom.agent_name}' ({bottom.composite_rank_score:.2f})"
            )
        if top.generalization_score > 0.7:
            insights.append(
                f"'{top.agent_name}' shows strong generalization "
                f"(std={top.std_score:.3f} across {top.tasks_run} tasks)"
            )
        for r in self.rankings:
            if r.mean_causal_score > 0.6:
                insights.append(
                    f"'{r.agent_name}' demonstrated genuine causal reasoning "
                    f"(causal_score={r.mean_causal_score:.2f})"
                )
        strategies = [r.dominant_strategy for r in self.rankings]
        if len(set(strategies)) > 1:
            best_strategy = self.rankings[0].dominant_strategy
            insights.append(
                f"Strategy '{best_strategy}' produced the highest composite score."
            )
        return insights

    def render_table(self) -> str:
        """Render ASCII leaderboard table."""
        if not self.rankings:
            return "No results."
        lines = [
            f"{'═'*90}",
            f" šŸ† BENCHMARK LEADERBOARD — {self.benchmark_id}",
            f" Tasks: {', '.join(self.tasks_evaluated)} | Agents: {len(self.agents_evaluated)} | Episodes: {self.total_episodes}",
            f"{'═'*90}",
            f"{'Rank':<5} {'Agent':<16} {'Score':<8} {'Causal':<8} {'Robust':<8} {'Calibr':<8} {'Genrz':<8} {'Steps':<7} {'Strategy'}",
            f"{'─'*90}",
        ]
        for i, r in enumerate(self.rankings):
            medal = "šŸ„‡" if i == 0 else "🄈" if i == 1 else "šŸ„‰" if i == 2 else f" #{i+1}"
            lines.append(
                f"{medal:<5} {r.agent_name:<16} {r.mean_score:<8.3f} "
                f"{r.mean_causal_score:<8.3f} {r.mean_robustness_score:<8.3f} "
                f"{r.mean_calibration_score:<8.3f} {r.generalization_score:<8.3f} "
                f"{r.mean_steps:<7.1f} {r.dominant_strategy}"
            )
        lines.append(f"{'═'*90}")
        lines.append("\nšŸ“Š Per-Task Breakdown:")
        for r in self.rankings:
            task_str = " | ".join(f"{t}: {s:.2f}" for t, s in sorted(r.per_task_scores.items()))
            lines.append(f" {r.agent_name:<16} {task_str}")
        insights = self._generate_insights()
        if insights:
            lines.append("\nšŸ’” Insights:")
            lines.extend(f" → {insight}" for insight in insights)
        return "\n".join(lines)


class BenchmarkRunner:
    """
    Automated benchmark runner.

    Runs each agent in AGENT_CONFIGS across each task, collecting:
    - Final score
    - All intelligence metrics (causal, counterfactual, confidence)
    - Strategy and failure classification
    - Reliability index

    Then generates a ranked leaderboard.
""" def run( self, env, tasks: Optional[List[str]] = None, agents: Optional[List[str]] = None, benchmark_id: Optional[str] = None, ) -> LeaderboardReport: """Run the full benchmark.""" import uuid from server.models import RepoAction from server.strategy_detector import StrategyDetector from server.failure_classifier import FailureClassifier from server.advanced_metrics import AdvancedMetricsEngine from server.causal_probe import CausalProbe from server.counterfactual_engine import CounterfactualEngine from server.confidence_calibrator import ConfidenceCalibrator benchmark_id = benchmark_id or f"bench_{uuid.uuid4().hex[:8]}" tasks = tasks or ["task1", "task2", "task3"] agent_configs = self._get_agent_configs() if agents: agent_configs = {k: v for k, v in agent_configs.items() if k in agents} clf = FailureClassifier() det = StrategyDetector() adv = AdvancedMetricsEngine() causal = CausalProbe() counter = CounterfactualEngine() calibrator = ConfidenceCalibrator() start_time = time.time() all_results: List[BenchmarkResult] = [] for task in tasks: for agent_name, agent_fn in agent_configs.items(): try: result = self._run_episode( env, task, agent_name, agent_fn, clf, det, adv, causal, counter, calibrator ) all_results.append(result) except Exception as e: # Don't crash the whole benchmark on one failure all_results.append(BenchmarkResult( agent_name=agent_name, task=task, variant_id="error", final_score=0.0, total_steps=0, cumulative_reward=0.0, duration_seconds=0.0, strategy="ERROR", failure_type="BENCHMARK_ERROR", reliability_index=0.0, causal_score=0.0, robustness_score=0.0, calibration_score=0.0, action_sequence=[], )) total_duration = time.time() - start_time rankings = self._compute_rankings(all_results, tasks) return LeaderboardReport( benchmark_id=benchmark_id, tasks_evaluated=tasks, agents_evaluated=list(agent_configs.keys()), total_episodes=len(all_results), run_duration_seconds=total_duration, rankings=rankings, raw_results=all_results, ) def _run_episode( self, env, task, agent_name, agent_fn, clf, det, adv, causal, counter, calibrator ) -> BenchmarkResult: from server.models import RepoAction reset_result = env.reset(task=task) obs = reset_result.observation variant_id = reset_result.info.get("variant_id", "unknown") context = {} obs_dict = obs.model_dump() start = time.time() cumulative_reward = 0.0 files_read, files_written, action_sequence = [], [], [] max_steps = 15 for step_num in range(1, max_steps + 1): if env.done: break action_dict = agent_fn(obs_dict, step_num, context) action = RepoAction( action_type=action_dict.get("action_type", "submit"), path=action_dict.get("path"), query=action_dict.get("query"), ) result = env.step(action) obs = result.observation obs_dict = obs.model_dump() cumulative_reward += result.reward action_sequence.append(action.action_type) if action.path and action.action_type == "read_file": files_read.append(action.path) if action.path and action.action_type == "write_file": files_written.append(action.path) if result.done: break if not env.done: r = env.step(RepoAction(action_type="submit")) cumulative_reward += r.reward action_sequence.append("submit") duration = time.time() - start final_score = env.final_score traj = env.get_trajectory() steps = traj.get("steps", []) if traj else [] meta = env.variant.meta if env.variant else {} # Intelligence metrics fail_r = clf.classify( traj.get("episode_id", "") if traj else "", task, steps, meta, files_read, files_written, final_score ) strat_r = det.detect(steps, task, meta, files_read, final_score) 
        adv_r = adv.compute(steps, meta, final_score, files_read, files_written)
        causal_r = causal.probe(
            traj.get("episode_id", "") if traj else "",
            task, steps, meta, files_read, files_written, final_score
        )
        counter_r = counter.analyze(
            traj.get("episode_id", "") if traj else "",
            task, steps, meta, files_read, files_written, final_score
        )
        calib_r = calibrator.calibrate(
            traj.get("episode_id", "") if traj else "",
            task, steps, final_score,
        )

        return BenchmarkResult(
            agent_name=agent_name,
            task=task,
            variant_id=variant_id,
            final_score=final_score,
            total_steps=len(action_sequence),
            cumulative_reward=cumulative_reward,
            duration_seconds=duration,
            strategy=strat_r.strategy,
            failure_type=fail_r.primary_failure,
            reliability_index=adv_r.reliability_index,
            causal_score=causal_r.causal_score,
            robustness_score=counter_r.robustness_score,
            calibration_score=calib_r.calibration_score,
            action_sequence=action_sequence,
        )

    def _compute_rankings(
        self, results: List[BenchmarkResult], tasks: List[str]
    ) -> List[AgentBenchmarkSummary]:
        import math
        from collections import Counter

        # Group by agent
        agent_results: Dict[str, List[BenchmarkResult]] = {}
        for r in results:
            agent_results.setdefault(r.agent_name, []).append(r)

        summaries = []
        for agent_name, agent_res in agent_results.items():
            scores = [r.final_score for r in agent_res]
            mean_score = sum(scores) / len(scores)
            if len(scores) > 1:
                variance = sum((s - mean_score) ** 2 for s in scores) / len(scores)
                std_score = math.sqrt(variance)
            else:
                std_score = 0.0
            generalization_score = max(0.0, 1.0 - std_score)

            per_task = {r.task: r.final_score for r in agent_res}
            strategies = Counter(r.strategy for r in agent_res)
            failures = Counter(r.failure_type for r in agent_res)

            mean_steps = sum(r.total_steps for r in agent_res) / len(agent_res)
            mean_reliability = sum(r.reliability_index for r in agent_res) / len(agent_res)
            mean_causal = sum(r.causal_score for r in agent_res) / len(agent_res)
            mean_robustness = sum(r.robustness_score for r in agent_res) / len(agent_res)
            mean_calibration = sum(r.calibration_score for r in agent_res) / len(agent_res)

            # Composite leaderboard score — weighted across all dimensions
            composite = (
                mean_score * 0.35
                + mean_causal * 0.20
                + mean_robustness * 0.15
                + mean_calibration * 0.15
                + generalization_score * 0.15
            )

            best_task = max(per_task, key=per_task.get)
            worst_task = min(per_task, key=per_task.get)

            summaries.append(AgentBenchmarkSummary(
                agent_name=agent_name,
                tasks_run=len(agent_res),
                mean_score=mean_score,
                std_score=std_score,
                generalization_score=generalization_score,
                mean_steps=mean_steps,
                best_task=best_task,
                worst_task=worst_task,
                mean_reliability=mean_reliability,
                mean_causal_score=mean_causal,
                mean_robustness_score=mean_robustness,
                mean_calibration_score=mean_calibration,
                dominant_strategy=strategies.most_common(1)[0][0],
                dominant_failure=failures.most_common(1)[0][0],
                composite_rank_score=composite,
                per_task_scores=per_task,
            ))

        summaries.sort(key=lambda s: -s.composite_rank_score)
        return summaries

    def _get_agent_configs(self) -> Dict:
        """Reuse built-in strategies from multi_agent.py."""
        from server.multi_agent import MultiAgentComparison
        return MultiAgentComparison.AGENT_CONFIGS
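

# Usage sketch (illustrative only, not part of the benchmark logic).
# The environment import below is an assumption: substitute whatever object
# your project actually provides, as long as it supports the interface
# BenchmarkRunner relies on above (reset(task=...), step(action), done,
# final_score, get_trajectory(), variant).
if __name__ == "__main__":
    from server.env import RepoEnv  # hypothetical module/class name

    env = RepoEnv()
    runner = BenchmarkRunner()

    # Run the default three tasks against every registered agent config.
    report = runner.run(env, tasks=["task1", "task2", "task3"])

    # Human-readable leaderboard on stdout ...
    print(report.render_table())

    # ... plus the publishable "benchmark JSON" described in the module docstring.
    with open(f"{report.benchmark_id}.json", "w") as fh:
        json.dump(report.to_dict(), fh, indent=2)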