File size: 4,548 Bytes
62f4978
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
"""
scripts/evaluate.py — Reproducible benchmark evaluation across all tasks.

Usage
-----
python scripts/evaluate.py [--seed SEED] [--output results.json] [--verbose]

Runs every task in the registry with the baseline agent and reports:
  - Per-task score, cost, steps
  - Aggregate metrics per difficulty
  - Overall average score
"""

import argparse
import json
import logging
import sys
import os
import time

# Put the parent of this script's directory at the front of sys.path so the
# `env` and `agent` imports below can be found when the file is run directly
# (presumably that parent is the repository root; confirm against the layout).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from env.environment import SelfImprovingAgentEnv
from env.tasks import ALL_TASKS
from agent.baseline_agent import BaselineAgent


def _mean_of(records, key):
    """Return the mean of ``record[key]`` over ``records``, or 0.0 if empty."""
    if not records:
        return 0.0
    return sum(record[key] for record in records) / len(records)


def evaluate(seed: int = 42, output_path: str = "results.json", verbose: bool = False) -> dict:
    """Run the baseline agent on every task in ``ALL_TASKS`` and save a report.

    For each task a new ``SelfImprovingAgentEnv`` and ``BaselineAgent`` are
    created with ``seed``, one episode is run, and the returned summary dict
    is printed and collected. Summaries are read through the keys
    ``best_score``, ``steps``, ``budget_used``, ``budget_total`` and
    ``total_reward``. Averages are then printed per difficulty and overall,
    and the full report is written as JSON.

    Args:
        seed: Seed handed to both the environment and the agent for each task.
        output_path: Where the JSON report is written. Missing parent
            directories are created.
        verbose: Configure logging at DEBUG when true, WARNING otherwise.

    Returns:
        The report dict that was written to ``output_path``, with keys
        ``seed``, ``total_tasks``, ``overall_avg_score``,
        ``overall_avg_reward``, ``by_difficulty``, ``task_results`` and
        ``elapsed_seconds``.
    """
    level = logging.DEBUG if verbose else logging.WARNING
    logging.basicConfig(level=level, format="%(asctime)s [%(levelname)s] %(message)s")

    print(f"\n{'='*65}")
    print("  Benchmark Evaluation — Self-Improving Agent Environment")
    print(f"  Seed: {seed}  |  Tasks: {len(ALL_TASKS)}")
    print(f"{'='*65}\n")

    results = []
    # Pre-seeded so the known difficulties are always reported in this order.
    per_difficulty = {"easy": [], "medium": [], "hard": []}
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments made while the benchmark is running.
    t0 = time.perf_counter()

    for task in ALL_TASKS:
        env = SelfImprovingAgentEnv(seed=seed, task_id=task.task_id)
        agent = BaselineAgent(seed=seed)

        summary = agent.run_episode(env, task_id=task.task_id)
        results.append(summary)
        # setdefault tolerates a difficulty label outside the three pre-seeded
        # ones instead of aborting the whole run with a KeyError.
        per_difficulty.setdefault(task.difficulty.value, []).append(summary)

        best_score = summary["best_score"]
        if best_score >= 0.9:
            status = "✅"
        elif best_score >= 0.5:
            status = "⚠️ "
        else:
            status = "❌"
        print(
            f"  {status} [{task.difficulty.value:6s}] {task.task_id:30s} "
            f"score={summary['best_score']:.2f}  "
            f"steps={summary['steps']:2d}  "
            f"budget_used={summary['budget_used']:.1f}/{summary['budget_total']:.0f}"
        )

    elapsed = time.perf_counter() - t0

    # ── Aggregate metrics ─────────────────────────────────────────────────────
    print(f"\n{'─'*65}")
    print("  Results by Difficulty")
    print(f"{'─'*65}")

    agg = {}
    for diff, recs in per_difficulty.items():
        if not recs:
            continue
        avg_score = _mean_of(recs, "best_score")
        avg_steps = _mean_of(recs, "steps")
        avg_cost = _mean_of(recs, "budget_used")
        avg_reward = _mean_of(recs, "total_reward")
        agg[diff] = {
            "count": len(recs),
            "avg_score": round(avg_score, 4),
            "avg_steps": round(avg_steps, 2),
            "avg_cost": round(avg_cost, 4),
            "avg_reward": round(avg_reward, 4),
        }
        print(
            f"  {diff.upper():6s} | tasks={len(recs)} | "
            f"avg_score={avg_score:.3f} | avg_steps={avg_steps:.1f} | avg_cost={avg_cost:.2f}"
        )

    # An empty task registry yields 0.0 averages rather than ZeroDivisionError.
    overall_score = _mean_of(results, "best_score")
    overall_reward = _mean_of(results, "total_reward")

    print(f"\n{'─'*65}")
    print(f"  OVERALL Average Score  : {overall_score:.4f}")
    print(f"  OVERALL Average Reward : {overall_reward:.4f}")
    print(f"  Total tasks evaluated  : {len(results)}")
    print(f"  Wall time              : {elapsed:.2f}s")
    print(f"{'─'*65}\n")

    # ── Save output ───────────────────────────────────────────────────────────
    report = {
        "seed": seed,
        "total_tasks": len(results),
        "overall_avg_score": round(overall_score, 4),
        "overall_avg_reward": round(overall_reward, 4),
        "by_difficulty": agg,
        "task_results": results,
        "elapsed_seconds": round(elapsed, 2),
    }

    # A bare filename has an empty dirname; fall back to the current directory.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    # Explicit encoding keeps the report independent of the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print(f"  Full results saved to: {output_path}")

    return report


def main():
    """Parse the command-line options and run the benchmark evaluation.

    Recognised options are ``--seed`` (int, default 42), ``--output``
    (str, default ``results.json``) and the ``--verbose`` switch; the parsed
    values are forwarded to ``evaluate``.
    """
    cli = argparse.ArgumentParser(description="Evaluate baseline agent on all tasks.")
    option_specs = (
        ("--seed", {"type": int, "default": 42}),
        ("--output", {"type": str, "default": "results.json"}),
        ("--verbose", {"action": "store_true"}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    options = cli.parse_args()
    evaluate(
        seed=options.seed,
        output_path=options.output,
        verbose=options.verbose,
    )


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()