""" scripts/evaluate.py — Reproducible benchmark evaluation across all tasks. Usage ----- python scripts/evaluate.py [--seed SEED] [--output results.json] Runs every task in the registry with the baseline agent and reports: - Per-task score, cost, steps - Aggregate metrics per difficulty - Overall average score """ import argparse import json import logging import sys import os import time sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from env.environment import SelfImprovingAgentEnv from env.tasks import ALL_TASKS from agent.baseline_agent import BaselineAgent def evaluate(seed: int = 42, output_path: str = "results.json", verbose: bool = False): level = logging.DEBUG if verbose else logging.WARNING logging.basicConfig(level=level, format="%(asctime)s [%(levelname)s] %(message)s") print(f"\n{'='*65}") print(" Benchmark Evaluation — Self-Improving Agent Environment") print(f" Seed: {seed} | Tasks: {len(ALL_TASKS)}") print(f"{'='*65}\n") results = [] per_difficulty = {"easy": [], "medium": [], "hard": []} t0 = time.time() for task in ALL_TASKS: env = SelfImprovingAgentEnv(seed=seed, task_id=task.task_id) agent = BaselineAgent(seed=seed) summary = agent.run_episode(env, task_id=task.task_id) results.append(summary) per_difficulty[task.difficulty.value].append(summary) status = "✅" if summary["best_score"] >= 0.9 else ("⚠️ " if summary["best_score"] >= 0.5 else "❌") print( f" {status} [{task.difficulty.value:6s}] {task.task_id:30s} " f"score={summary['best_score']:.2f} " f"steps={summary['steps']:2d} " f"budget_used={summary['budget_used']:.1f}/{summary['budget_total']:.0f}" ) elapsed = time.time() - t0 # ── Aggregate metrics ───────────────────────────────────────────────────── print(f"\n{'─'*65}") print(" Results by Difficulty") print(f"{'─'*65}") agg = {} for diff, recs in per_difficulty.items(): if not recs: continue avg_score = sum(r["best_score"] for r in recs) / len(recs) avg_steps = sum(r["steps"] for r in recs) / len(recs) avg_cost = sum(r["budget_used"] for r in recs) / len(recs) avg_reward = sum(r["total_reward"] for r in recs) / len(recs) agg[diff] = { "count": len(recs), "avg_score": round(avg_score, 4), "avg_steps": round(avg_steps, 2), "avg_cost": round(avg_cost, 4), "avg_reward": round(avg_reward, 4), } print( f" {diff.upper():6s} | tasks={len(recs)} | " f"avg_score={avg_score:.3f} | avg_steps={avg_steps:.1f} | avg_cost={avg_cost:.2f}" ) overall_score = sum(r["best_score"] for r in results) / len(results) overall_reward = sum(r["total_reward"] for r in results) / len(results) print(f"\n{'─'*65}") print(f" OVERALL Average Score : {overall_score:.4f}") print(f" OVERALL Average Reward : {overall_reward:.4f}") print(f" Total tasks evaluated : {len(results)}") print(f" Wall time : {elapsed:.2f}s") print(f"{'─'*65}\n") # ── Save output ─────────────────────────────────────────────────────────── report = { "seed": seed, "total_tasks": len(results), "overall_avg_score": round(overall_score, 4), "overall_avg_reward": round(overall_reward, 4), "by_difficulty": agg, "task_results": results, "elapsed_seconds": round(elapsed, 2), } os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else ".", exist_ok=True) with open(output_path, "w") as f: json.dump(report, f, indent=2) print(f" Full results saved to: {output_path}") return report def main(): parser = argparse.ArgumentParser(description="Evaluate baseline agent on all tasks.") parser.add_argument("--seed", type=int, default=42) parser.add_argument("--output", type=str, default="results.json") parser.add_argument("--verbose", action="store_true") args = parser.parse_args() evaluate(seed=args.seed, output_path=args.output, verbose=args.verbose) if __name__ == "__main__": main()