SelfEvo / scripts /evaluate.py
Akhil-8605's picture
Publishing to Hugging Face
62f4978
"""
scripts/evaluate.py — Reproducible benchmark evaluation across all tasks.
Usage
-----
python scripts/evaluate.py [--seed SEED] [--output results.json] [--verbose]
Runs every task in the registry with the baseline agent and reports:
- Per-task score, cost, steps
- Aggregate metrics per difficulty
- Overall average score
"""
import argparse
import json
import logging
import sys
import os
import time
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from env.environment import SelfImprovingAgentEnv
from env.tasks import ALL_TASKS
from agent.baseline_agent import BaselineAgent
def _average(records, key):
    """Return the mean of ``record[key]`` over *records*, or 0.0 when empty."""
    if not records:
        return 0.0
    return sum(record[key] for record in records) / len(records)


def _status_icon(score):
    """Return the console marker printed next to a task for its best score."""
    if score >= 0.9:
        return "✅"
    if score >= 0.5:
        return "⚠️ "
    return "❌"


def evaluate(seed: int = 42, output_path: str = "results.json", verbose: bool = False, *, tasks=None):
    """Run the baseline agent on each task and write a JSON benchmark report.

    Args:
        seed: Seed passed to both the environment and the agent for every task.
        output_path: Destination of the JSON report. A missing parent
            directory is created.
        verbose: When true, logging is configured at DEBUG instead of WARNING.
        tasks: Optional iterable of task objects to evaluate instead of
            ``ALL_TASKS``. Each task must expose ``task_id`` and
            ``difficulty.value``.

    Returns:
        The report dict that was written to *output_path*. With no tasks the
        overall averages are reported as 0.0 rather than raising.
    """
    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.WARNING,
        format="%(asctime)s [%(levelname)s] %(message)s",
    )
    task_list = list(ALL_TASKS if tasks is None else tasks)

    print(f"\n{'='*65}")
    print(" Benchmark Evaluation — Self-Improving Agent Environment")
    print(f" Seed: {seed} | Tasks: {len(task_list)}")
    print(f"{'='*65}\n")

    results = []
    # Pre-seeded so the summary is always printed in easy/medium/hard order.
    per_difficulty = {"easy": [], "medium": [], "hard": []}
    t0 = time.time()
    for task in task_list:
        # A fresh environment and agent per task keeps episodes independent.
        env = SelfImprovingAgentEnv(seed=seed, task_id=task.task_id)
        agent = BaselineAgent(seed=seed)
        summary = agent.run_episode(env, task_id=task.task_id)
        results.append(summary)
        difficulty = task.difficulty.value
        # setdefault: a difficulty outside the three seeded names must not
        # abort the whole run with a KeyError.
        per_difficulty.setdefault(difficulty, []).append(summary)
        print(
            f" {_status_icon(summary['best_score'])} [{difficulty:6s}] {task.task_id:30s} "
            f"score={summary['best_score']:.2f} "
            f"steps={summary['steps']:2d} "
            f"budget_used={summary['budget_used']:.1f}/{summary['budget_total']:.0f}"
        )
    elapsed = time.time() - t0

    # ── Aggregate metrics ─────────────────────────────────────────────────────
    print(f"\n{'─'*65}")
    print(" Results by Difficulty")
    print(f"{'─'*65}")
    agg = {}
    for diff, recs in per_difficulty.items():
        if not recs:
            continue
        avg_score = _average(recs, "best_score")
        avg_steps = _average(recs, "steps")
        avg_cost = _average(recs, "budget_used")
        avg_reward = _average(recs, "total_reward")
        agg[diff] = {
            "count": len(recs),
            "avg_score": round(avg_score, 4),
            "avg_steps": round(avg_steps, 2),
            "avg_cost": round(avg_cost, 4),
            "avg_reward": round(avg_reward, 4),
        }
        print(
            f" {diff.upper():6s} | tasks={len(recs)} | "
            f"avg_score={avg_score:.3f} | avg_steps={avg_steps:.1f} | avg_cost={avg_cost:.2f}"
        )

    # _average returns 0.0 for an empty list, so an empty run does not
    # raise ZeroDivisionError here.
    overall_score = _average(results, "best_score")
    overall_reward = _average(results, "total_reward")
    print(f"\n{'─'*65}")
    print(f" OVERALL Average Score : {overall_score:.4f}")
    print(f" OVERALL Average Reward : {overall_reward:.4f}")
    print(f" Total tasks evaluated : {len(results)}")
    print(f" Wall time : {elapsed:.2f}s")
    print(f"{'─'*65}\n")

    # ── Save output ───────────────────────────────────────────────────────────
    report = {
        "seed": seed,
        "total_tasks": len(results),
        "overall_avg_score": round(overall_score, 4),
        "overall_avg_reward": round(overall_reward, 4),
        "by_difficulty": agg,
        "task_results": results,
        "elapsed_seconds": round(elapsed, 2),
    }
    out_dir = os.path.dirname(output_path)
    if out_dir:
        # A bare filename has no directory component; nothing to create.
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)
    print(f" Full results saved to: {output_path}")
    return report
def main():
    """Parse the command-line options and run the benchmark evaluation."""
    cli = argparse.ArgumentParser(description="Evaluate baseline agent on all tasks.")
    # Flag name -> keyword arguments for add_argument, registered in order.
    option_table = (
        ("--seed", {"type": int, "default": 42}),
        ("--output", {"type": str, "default": "results.json"}),
        ("--verbose", {"action": "store_true"}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()
    evaluate(seed=opts.seed, output_path=opts.output, verbose=opts.verbose)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()