| """ |
| scripts/evaluate.py — Reproducible benchmark evaluation across all tasks. |
| |
| Usage |
| ----- |
| python scripts/evaluate.py [--seed SEED] [--output results.json] |
| |
| Runs every task in the registry with the baseline agent and reports: |
| - Per-task score, cost, steps |
| - Aggregate metrics per difficulty |
| - Overall average score |
| """ |
|
|
| import argparse |
| import json |
| import logging |
| import sys |
| import os |
| import time |
|
|
| sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) |
|
|
| from env.environment import SelfImprovingAgentEnv |
| from env.tasks import ALL_TASKS |
| from agent.baseline_agent import BaselineAgent |
|
|
|
|
def _mean(records, key):
    """Return the mean of ``rec[key]`` over *records*, or 0.0 when *records* is empty."""
    if not records:
        return 0.0
    return sum(rec[key] for rec in records) / len(records)


def _status_icon(score):
    """Return the marker printed in front of a task's result line for *score*."""
    # Written as escapes so the source stays pure ASCII and the icons cannot be
    # mangled again by an encoding round-trip.
    if score >= 0.9:
        return "\u2705"  # white heavy check mark
    if score >= 0.5:
        return "\u26a0\ufe0f "  # warning sign; trailing space kept from the original literal
    return "\u274c"  # cross mark


def evaluate(seed: int = 42, output_path: str = "results.json", verbose: bool = False) -> dict:
    """Run the baseline agent on every task in ``ALL_TASKS`` and report the results.

    For each task a fresh ``SelfImprovingAgentEnv`` and ``BaselineAgent`` are
    created with the same *seed*, one episode is run, and a result line is
    printed. Afterwards per-difficulty and overall averages are printed and the
    full report is written to *output_path* as JSON.

    Args:
        seed: Seed passed to both the environment and the agent.
        output_path: Destination of the JSON report. Its parent directory is
            created if it does not exist.
        verbose: When true, logging is configured at DEBUG level; otherwise
            at WARNING level.

    Returns:
        The report dict that was written to *output_path*, with keys ``seed``,
        ``total_tasks``, ``overall_avg_score``, ``overall_avg_reward``,
        ``by_difficulty``, ``task_results`` and ``elapsed_seconds``.

    Notes:
        Each episode summary is read through the keys ``best_score``,
        ``steps``, ``budget_used``, ``budget_total`` and ``total_reward``.
        Summaries are written to the report as-is, so they are assumed to be
        JSON-serializable (TODO confirm against ``BaselineAgent.run_episode``).
        With an empty task registry the overall averages are reported as 0.0.
    """
    level = logging.DEBUG if verbose else logging.WARNING
    logging.basicConfig(level=level, format="%(asctime)s [%(levelname)s] %(message)s")

    heavy_rule = "=" * 65
    light_rule = "\u2500" * 65  # box-drawing horizontal line

    print(f"\n{heavy_rule}")
    print(" Benchmark Evaluation \u2014 Self-Improving Agent Environment")
    print(f" Seed: {seed} | Tasks: {len(ALL_TASKS)}")
    print(f"{heavy_rule}\n")

    results = []
    # Pre-seeded so the known difficulties are always reported in this order.
    per_difficulty = {"easy": [], "medium": [], "hard": []}
    # perf_counter is monotonic, so the wall time cannot go negative or jump
    # if the system clock is adjusted mid-run.
    t0 = time.perf_counter()

    for task in ALL_TASKS:
        env = SelfImprovingAgentEnv(seed=seed, task_id=task.task_id)
        agent = BaselineAgent(seed=seed)

        summary = agent.run_episode(env, task_id=task.task_id)
        results.append(summary)
        # setdefault: a difficulty outside easy/medium/hard gets its own group
        # instead of aborting the whole run with a KeyError.
        per_difficulty.setdefault(task.difficulty.value, []).append(summary)

        status = _status_icon(summary["best_score"])
        print(
            f" {status} [{task.difficulty.value:6s}] {task.task_id:30s} "
            f"score={summary['best_score']:.2f} "
            f"steps={summary['steps']:2d} "
            f"budget_used={summary['budget_used']:.1f}/{summary['budget_total']:.0f}"
        )

    elapsed = time.perf_counter() - t0

    print(f"\n{light_rule}")
    print(" Results by Difficulty")
    print(f"{light_rule}")

    agg = {}
    for diff, recs in per_difficulty.items():
        if not recs:
            # Difficulties with no tasks are left out of the report entirely.
            continue
        avg_score = _mean(recs, "best_score")
        avg_steps = _mean(recs, "steps")
        avg_cost = _mean(recs, "budget_used")
        avg_reward = _mean(recs, "total_reward")
        agg[diff] = {
            "count": len(recs),
            "avg_score": round(avg_score, 4),
            "avg_steps": round(avg_steps, 2),
            "avg_cost": round(avg_cost, 4),
            "avg_reward": round(avg_reward, 4),
        }
        print(
            f" {diff.upper():6s} | tasks={len(recs)} | "
            f"avg_score={avg_score:.3f} | avg_steps={avg_steps:.1f} | avg_cost={avg_cost:.2f}"
        )

    # _mean returns 0.0 for an empty registry instead of dividing by zero.
    overall_score = _mean(results, "best_score")
    overall_reward = _mean(results, "total_reward")

    print(f"\n{light_rule}")
    print(f" OVERALL Average Score : {overall_score:.4f}")
    print(f" OVERALL Average Reward : {overall_reward:.4f}")
    print(f" Total tasks evaluated : {len(results)}")
    print(f" Wall time : {elapsed:.2f}s")
    print(f"{light_rule}\n")

    report = {
        "seed": seed,
        "total_tasks": len(results),
        "overall_avg_score": round(overall_score, 4),
        "overall_avg_reward": round(overall_reward, 4),
        "by_difficulty": agg,
        "task_results": results,
        "elapsed_seconds": round(elapsed, 2),
    }

    # A bare filename has an empty dirname; fall back to the current directory.
    out_dir = os.path.dirname(output_path) or "."
    os.makedirs(out_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print(f" Full results saved to: {output_path}")

    return report
|
|
|
|
def main():
    """Parse the command-line options and run :func:`evaluate` with them.

    Recognised options are ``--seed`` (int, default 42), ``--output``
    (str, default ``results.json``) and the ``--verbose`` flag.
    """
    cli = argparse.ArgumentParser(description="Evaluate baseline agent on all tasks.")

    # Each entry is (flag, keyword arguments for add_argument).
    option_table = (
        ("--seed", {"type": int, "default": 42}),
        ("--output", {"type": str, "default": "results.json"}),
        ("--verbose", {"action": "store_true"}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    opts = cli.parse_args()
    evaluate(seed=opts.seed, output_path=opts.output, verbose=opts.verbose)


# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|