File size: 4,548 Bytes
"""
scripts/evaluate.py — Reproducible benchmark evaluation across all tasks.

Usage
-----
    python scripts/evaluate.py [--seed SEED] [--output results.json] [--verbose]

Runs every task in the registry with the baseline agent and reports:
  - Per-task score, cost, steps
  - Aggregate metrics per difficulty
  - Overall average score
"""
import argparse
import json
import logging
import sys
import os
import time
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from env.environment import SelfImprovingAgentEnv
from env.tasks import ALL_TASKS
from agent.baseline_agent import BaselineAgent
def _mean(records, key):
    """Return the arithmetic mean of ``record[key]`` over *records* (0.0 if empty)."""
    if not records:
        return 0.0
    return sum(r[key] for r in records) / len(records)


def _status_icon(score):
    """Return the marker printed in front of a task line for a given best score."""
    if score >= 0.9:
        return "✅"
    if score >= 0.5:
        return "⚠️ "
    return "❌"


def evaluate(seed: int = 42, output_path: str = "results.json", verbose: bool = False):
    """Run the baseline agent on every task in ``ALL_TASKS`` and report the results.

    For each task a fresh ``SelfImprovingAgentEnv`` and ``BaselineAgent`` are
    created with the same ``seed`` and one episode is run. The episode summary
    returned by ``run_episode`` is read through the keys ``best_score``,
    ``steps``, ``budget_used``, ``budget_total`` and ``total_reward``.

    A per-task line, per-difficulty averages and overall averages are printed
    to stdout, and the full report is written to ``output_path`` as JSON.

    Args:
        seed: Seed passed to both the environment and the agent.
        output_path: Destination of the JSON report; missing parent
            directories are created.
        verbose: When true, logging is configured at DEBUG level instead of
            WARNING.

    Returns:
        The report dictionary that was written to ``output_path``.
    """
    level = logging.DEBUG if verbose else logging.WARNING
    logging.basicConfig(level=level, format="%(asctime)s [%(levelname)s] %(message)s")

    heavy_rule = "=" * 65
    light_rule = "─" * 65

    print(f"\n{heavy_rule}")
    print(" Benchmark Evaluation — Self-Improving Agent Environment")
    print(f" Seed: {seed} | Tasks: {len(ALL_TASKS)}")
    print(f"{heavy_rule}\n")

    results = []
    # Pre-seeded so the report keeps the easy -> medium -> hard ordering;
    # setdefault() below still accepts any other difficulty value a task reports.
    per_difficulty = {"easy": [], "medium": [], "hard": []}

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments.
    t0 = time.perf_counter()
    for task in ALL_TASKS:
        env = SelfImprovingAgentEnv(seed=seed, task_id=task.task_id)
        agent = BaselineAgent(seed=seed)
        summary = agent.run_episode(env, task_id=task.task_id)
        results.append(summary)

        difficulty = task.difficulty.value
        per_difficulty.setdefault(difficulty, []).append(summary)

        status = _status_icon(summary["best_score"])
        print(
            f" {status} [{difficulty:6s}] {task.task_id:30s} "
            f"score={summary['best_score']:.2f} "
            f"steps={summary['steps']:2d} "
            f"budget_used={summary['budget_used']:.1f}/{summary['budget_total']:.0f}"
        )
    elapsed = time.perf_counter() - t0

    # ── Aggregate metrics ─────────────────────────────────────────────────────
    print(f"\n{light_rule}")
    print(" Results by Difficulty")
    print(f"{light_rule}")
    agg = {}
    for diff, recs in per_difficulty.items():
        if not recs:
            # Difficulties without any task are left out of the report.
            continue
        avg_score = _mean(recs, "best_score")
        avg_steps = _mean(recs, "steps")
        avg_cost = _mean(recs, "budget_used")
        avg_reward = _mean(recs, "total_reward")
        agg[diff] = {
            "count": len(recs),
            "avg_score": round(avg_score, 4),
            "avg_steps": round(avg_steps, 2),
            "avg_cost": round(avg_cost, 4),
            "avg_reward": round(avg_reward, 4),
        }
        print(
            f" {diff.upper():6s} | tasks={len(recs)} | "
            f"avg_score={avg_score:.3f} | avg_steps={avg_steps:.1f} | avg_cost={avg_cost:.2f}"
        )

    # _mean() yields 0.0 for an empty registry instead of dividing by zero.
    overall_score = _mean(results, "best_score")
    overall_reward = _mean(results, "total_reward")
    print(f"\n{light_rule}")
    print(f" OVERALL Average Score : {overall_score:.4f}")
    print(f" OVERALL Average Reward : {overall_reward:.4f}")
    print(f" Total tasks evaluated : {len(results)}")
    print(f" Wall time : {elapsed:.2f}s")
    print(f"{light_rule}\n")

    # ── Save output ───────────────────────────────────────────────────────────
    report = {
        "seed": seed,
        "total_tasks": len(results),
        "overall_avg_score": round(overall_score, 4),
        "overall_avg_reward": round(overall_reward, 4),
        "by_difficulty": agg,
        "task_results": results,
        "elapsed_seconds": round(elapsed, 2),
    }
    # A bare filename has an empty dirname; fall back to the current directory.
    out_dir = os.path.dirname(output_path) or "."
    os.makedirs(out_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print(f" Full results saved to: {output_path}")
    return report
def main():
    """Command-line entry point: parse the options and run ``evaluate``."""
    cli = argparse.ArgumentParser(description="Evaluate baseline agent on all tasks.")
    cli.add_argument("--seed", type=int, default=42)
    cli.add_argument("--output", type=str, default="results.json")
    cli.add_argument("--verbose", action="store_true")

    # Work with the parsed namespace as a plain dict of option values.
    options = vars(cli.parse_args())
    evaluate(
        seed=options["seed"],
        output_path=options["output"],
        verbose=options["verbose"],
    )
# Run the evaluation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
|