| """ | |
| TeamForge Evaluation Protocol v1.0 | |
| ==================================== | |
| Formal, reproducible evaluation specification for the TeamForge benchmark. | |
| This module defines the canonical evaluation protocol used to produce | |
| all results in the leaderboard. Any third party can reproduce our numbers | |
| by following this spec exactly. | |
| Usage: | |
| python evaluation.py --help | |
| python evaluation.py --model llama3-8b-8192 --runs 3 | |
| python evaluation.py --model llama3-70b-8192 --runs 5 --seed 42 | |
| """ | |
from __future__ import annotations

import argparse
import json
import os
import random
import sys
import time
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from pathlib import Path
from statistics import mean, stdev
from typing import Dict, List, Optional

from rich.console import Console
from rich.table import Table
from rich import box

console = Console()
# ─── Protocol constants (FROZEN - do not change between runs) ─────────────────

PROTOCOL_VERSION = "1.0.0"
TEMPERATURE = 0.15          # all model calls use this
MAX_TOKENS = 1800           # per LLM call
CONTEXT_WINDOW_MSGS = 12    # last N messages kept in context
DEFAULT_RUNS = 3            # runs per (model, task) for CI
FULL_RUNS = 5               # runs for publication-quality results

TASK_WEIGHTS = {
    "easy_bugfix_chunk_list": 0.20,
    "medium_refactor_stats": 0.35,
    "hard_lru_cache_performance": 0.45,
}

PASS_THRESHOLD = 0.70        # final_score >= this → task "passed"
BENCHMARK_PASS_SCORE = 0.60  # teamforge_score >= this → model "passes benchmark"
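# Worked example of the aggregation computed in _build_report (hypothetical
# scores, not measured results): with best-run scores of 0.90 / 0.75 / 0.60 on
# the easy / medium / hard tasks, the weighted aggregate is
#     0.20 * 0.90 + 0.35 * 0.75 + 0.45 * 0.60 = 0.7125
# which clears BENCHMARK_PASS_SCORE (0.60), so the model passes the benchmark.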
# ─── Data classes ──────────────────────────────────────────────────────────────

@dataclass
class RunResult:
    model: str
    task_id: str
    run_index: int
    seed: int
    final_score: float
    test_pass_rate: float
    lint_score: float
    efficiency_score: float
    review_quality: float
    reflection_quality: float
    total_steps: int
    elapsed_s: float
    passed: bool
    timestamp: str


@dataclass
class TaskSummary:
    task_id: str
    n_runs: int
    mean_score: float
    std_score: float
    best_score: float
    worst_score: float
    pass_rate: float         # fraction of runs that passed
    mean_steps: float
    mean_elapsed_s: float


@dataclass
class ModelReport:
    model: str
    protocol_version: str
    timestamp: str
    teamforge_score: float   # weighted aggregate of best-run scores
    benchmark_passed: bool
    task_summaries: Dict[str, TaskSummary]
    raw_runs: List[RunResult]

    def to_dict(self) -> dict:
        # asdict() recurses into the nested TaskSummary / RunResult dataclasses,
        # so the result is a plain, JSON-serializable dict.
        return asdict(self)
# ─── Evaluator ─────────────────────────────────────────────────────────────────

class TeamForgeEvaluator:
    """
    Runs the canonical evaluation protocol for a given model.
    Produces a ModelReport with full statistical summary.
    """

    def __init__(
        self,
        model: str,
        n_runs: int = DEFAULT_RUNS,
        base_seed: int = 42,
        task_ids: Optional[List[str]] = None,
        results_dir: str = "results",
    ):
        self.model = model
        self.n_runs = n_runs
        self.base_seed = base_seed
        self.task_ids = task_ids or list(TASK_WEIGHTS.keys())
        self.results_dir = Path(results_dir)
        self.results_dir.mkdir(exist_ok=True)
    def run(self) -> ModelReport:
        """Execute full evaluation. Returns ModelReport."""
        from environment import TeamForgeEnv
        from benchmark import BenchmarkAgent
        from openai import OpenAI

        GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
        API_BASE_URL = os.getenv("API_BASE_URL", "https://api.groq.com/openai/v1")
        client = OpenAI(api_key=GROQ_API_KEY, base_url=API_BASE_URL)
        agent = BenchmarkAgent(client, self.model)
        env = TeamForgeEnv()

        all_runs: List[RunResult] = []
        total_episodes = len(self.task_ids) * self.n_runs
        done_count = 0

        for task_id in self.task_ids:
            console.print(f"\n  [bold cyan]Task:[/bold cyan] {task_id}")
            for run_i in range(self.n_runs):
                # Deterministic per-run seed: base_seed + run index.
                seed = self.base_seed + run_i
                random.seed(seed)
                console.print(f"    run {run_i+1}/{self.n_runs} (seed={seed}) … ", end="")

                t0 = time.perf_counter()
                agent.reset()
                obs = env.reset(task_id)
                while not obs.done:
                    action = agent.act(obs)
                    if action is None:
                        break
                    obs = env.step(action)
                elapsed = time.perf_counter() - t0

                result = env.grade()
                run = RunResult(
                    model=self.model,
                    task_id=task_id,
                    run_index=run_i,
                    seed=seed,
                    final_score=result.final_score,
                    test_pass_rate=result.test_pass_rate,
                    lint_score=result.lint_score,
                    efficiency_score=result.efficiency_score,
                    review_quality=result.review_quality,
                    reflection_quality=result.reflection_quality,
                    total_steps=result.total_steps,
                    elapsed_s=round(elapsed, 2),
                    passed=result.final_score >= PASS_THRESHOLD,
                    timestamp=datetime.now(timezone.utc).isoformat(),
                )
                all_runs.append(run)
                done_count += 1

                status = "[green]✓[/green]" if run.passed else "[red]✗[/red]"
                console.print(f"{status} score={run.final_score:.4f} steps={run.total_steps} {elapsed:.1f}s")

        env._sandbox.teardown()

        # Build report
        report = self._build_report(all_runs)
        self._save(report)
        return report
    def _build_report(self, runs: List[RunResult]) -> ModelReport:
        task_summaries = {}
        for task_id in self.task_ids:
            task_runs = [r for r in runs if r.task_id == task_id]
            if not task_runs:
                continue
            scores = [r.final_score for r in task_runs]
            task_summaries[task_id] = TaskSummary(
                task_id=task_id,
                n_runs=len(task_runs),
                mean_score=round(mean(scores), 4),
                std_score=round(stdev(scores) if len(scores) > 1 else 0.0, 4),
                best_score=round(max(scores), 4),
                worst_score=round(min(scores), 4),
                pass_rate=round(sum(1 for r in task_runs if r.passed) / len(task_runs), 4),
                mean_steps=round(mean(r.total_steps for r in task_runs), 1),
                mean_elapsed_s=round(mean(r.elapsed_s for r in task_runs), 1),
            )

        # TeamForge Score = weighted sum of BEST runs
        teamforge_score = sum(
            TASK_WEIGHTS.get(tid, 0) * task_summaries[tid].best_score
            for tid in self.task_ids
            if tid in task_summaries
        )

        return ModelReport(
            model=self.model,
            protocol_version=PROTOCOL_VERSION,
            timestamp=datetime.now(timezone.utc).isoformat(),
            teamforge_score=round(teamforge_score, 4),
            benchmark_passed=teamforge_score >= BENCHMARK_PASS_SCORE,
            task_summaries=task_summaries,
            raw_runs=runs,
        )

    def _save(self, report: ModelReport) -> None:
        model_dir = self.results_dir / self.model.replace("/", "_")
        model_dir.mkdir(exist_ok=True)
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        path = model_dir / f"eval_{ts}.json"
        path.write_text(json.dumps(report.to_dict(), indent=2))
        console.print(f"\n  [dim]Saved → {path}[/dim]")
# ─── Report printer ────────────────────────────────────────────────────────────

def print_report(report: ModelReport) -> None:
    console.print()
    console.rule(f"[bold]Evaluation Report · {report.model}[/bold]")

    # Per-task detail
    t = Table(box=box.DOUBLE_EDGE, border_style="cyan", header_style="bold cyan")
    t.add_column("Task", width=34)
    t.add_column("Mean±Std", justify="center", width=12)
    t.add_column("Best", justify="center", width=8)
    t.add_column("Worst", justify="center", width=8)
    t.add_column("Pass Rate", justify="center", width=10)
    t.add_column("Avg Steps", justify="center", width=10)
    for task_id, summary in report.task_summaries.items():
        score_color = "green" if summary.mean_score >= 0.7 else ("yellow" if summary.mean_score >= 0.5 else "red")
        t.add_row(
            task_id,
            f"[{score_color}]{summary.mean_score:.3f}[/{score_color}]±{summary.std_score:.3f}",
            f"{summary.best_score:.3f}",
            f"{summary.worst_score:.3f}",
            f"{summary.pass_rate*100:.0f}%",
            str(summary.mean_steps),
        )
    console.print(t)

    # Summary
    passed_str = "[bold green]✓ BENCHMARK PASSED[/bold green]" \
        if report.benchmark_passed else "[bold red]✗ BENCHMARK FAILED[/bold red]"
    ts_color = "green" if report.teamforge_score >= 0.7 else ("yellow" if report.teamforge_score >= 0.5 else "red")
    # ModelReport has no n_runs field, so derive runs-per-task from the summaries.
    runs_per_task = max((s.n_runs for s in report.task_summaries.values()), default=0)
    console.print(
        f"\n  [{ts_color}]TeamForge Score: {report.teamforge_score:.4f}[/{ts_color}]  ·  {passed_str}"
        f"\n  [dim]Protocol v{report.protocol_version} | {runs_per_task} runs/task | T={TEMPERATURE}[/dim]"
    )
# ─── CLI ────────────────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(
        description="TeamForge Formal Evaluation Protocol",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python evaluation.py --model llama3-8b-8192
  python evaluation.py --model llama3-70b-8192 --runs 5 --seed 0
  python evaluation.py --model llama3-8b-8192 --task easy_bugfix_chunk_list
""",
    )
    parser.add_argument("--model", required=True, help="Model name (Groq model string)")
    parser.add_argument("--runs", type=int, default=DEFAULT_RUNS, help=f"Runs per task (default {DEFAULT_RUNS})")
    parser.add_argument("--seed", type=int, default=42, help="Base random seed")
    parser.add_argument("--task", default=None, help="Evaluate single task only")
    args = parser.parse_args()

    task_ids = [args.task] if args.task else list(TASK_WEIGHTS.keys())

    console.rule("[bold blue]TeamForge Evaluation Protocol v1.0[/bold blue]")
    console.print(f"  Model: [bold]{args.model}[/bold]")
    console.print(f"  Tasks: {task_ids}")
    console.print(f"  Runs:  {args.runs} per task")
    console.print(f"  Seed:  {args.seed}")
    console.print(f"  Temp:  {TEMPERATURE}")
    console.print()

    evaluator = TeamForgeEvaluator(
        model=args.model,
        n_runs=args.runs,
        base_seed=args.seed,
        task_ids=task_ids,
    )
    report = evaluator.run()
    print_report(report)


if __name__ == "__main__":
    main()
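# Programmatic usage sketch (assumes GROQ_API_KEY is exported and that the
# environment / benchmark modules are importable from the working directory):
#
#     from evaluation import TeamForgeEvaluator, print_report
#
#     evaluator = TeamForgeEvaluator(model="llama3-8b-8192", n_runs=1)
#     report = evaluator.run()
#     print_report(report)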