# NOTE(review): the lines below are web-page residue (repo path / author /
# commit message / short hash) pasted above the shebang; kept as comments
# so the module parses.
# Agentic-zork / evaluation / evaluate.py
# nathanael-fijalkow — "Major refactoring" — 450ea3f
#!/usr/bin/env python3
"""
Evaluation Script for Text Adventure Agents
Evaluates student submissions by running their agent + MCP server
on a text adventure game for multiple trials and averaging scores.
Usage:
# Evaluate a student submission
python evaluation/evaluate.py \\
--submission path/to/student/submission \\
--game zork1 \\
--trials 5 \\
--max-steps 100
# Evaluate with reference agent comparison
python evaluation/evaluate.py \\
--submission path/to/student/submission \\
--game zork1 \\
--reference
# Evaluate from a Hugging Face Space
python evaluation/evaluate.py \\
--hf-space username/space-name \\
--game zork1
# Batch evaluate multiple submissions
python evaluation/evaluate.py \\
--submissions-dir path/to/all/submissions \\
--game zork1 \\
--output results.json
Examples:
# Quick test with 3 trials
python evaluation/evaluate.py -s ./submission_template -g zork1 -t 3
# Full evaluation for grading
python evaluation/evaluate.py -s ./submission_template -g advent -t 5 --max-steps 150
"""
import argparse
import asyncio
import json
import os
import random
import sys
import tempfile
import warnings
from datetime import datetime
from pathlib import Path
# Suppress asyncio subprocess cleanup warnings
warnings.filterwarnings("ignore", message=".*Event loop is closed.*")
warnings.filterwarnings("ignore", category=UserWarning, module="multiprocessing.resource_tracker")
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from evaluation.metrics import EvaluationResult, TrialResult
from evaluation.runner import RunConfig, run_agent_with_server, run_reference_agent
from games.zork_env import list_available_games
def generate_seeds(base_seed: int, num_trials: int) -> list[int]:
    """Derive a reproducible list of one seed per trial from *base_seed*.

    Re-seeding the module-level RNG with *base_seed* makes the returned
    sequence deterministic, so repeated evaluations use identical trials.
    """
    random.seed(base_seed)
    # 0xFFFFFFFF == 2**32 - 1: full unsigned 32-bit seed range, inclusive.
    return [random.randint(0, 0xFFFFFFFF) for _ in range(num_trials)]
async def evaluate_submission(
    submission_path: Path,
    game: str,
    num_trials: int = 5,
    max_steps: int = 100,
    base_seed: int = 42,
    verbose: bool = False,
) -> EvaluationResult:
    """
    Evaluate a student submission across multiple trials.

    Args:
        submission_path: Path to student's submission directory
        game: Name of the game to evaluate on
        num_trials: Number of trials to run (default: 5)
        max_steps: Maximum steps per trial (default: 100)
        base_seed: Base seed for reproducibility (default: 42)
        verbose: Print detailed output

    Returns:
        EvaluationResult with aggregated metrics; a crashed trial is
        recorded as a zero-score TrialResult carrying the error message.
    """
    # Locate agent and server files inside the submission directory
    agent_path = submission_path / "agent.py"
    server_path = submission_path / "mcp_server.py"

    # Extract student ID from path or README
    student_id = submission_path.name
    readme_path = submission_path / "README.md"
    if readme_path.exists():
        content = readme_path.read_text()
        # Try to extract the student name: first markdown H1 ("# ...") or
        # the first line containing "name:" (matched case-insensitively).
        for line in content.split("\n"):
            lowered = line.lower()
            if line.startswith("# ") or "name:" in lowered:
                # BUGFIX: the membership test above is case-insensitive, but
                # the original str.replace("name:", "") was case-sensitive,
                # so a "Name: Jane" line kept the "Name:" label in the ID.
                # Strip the label at its case-insensitive position instead.
                idx = lowered.find("name:")
                if idx != -1:
                    line = line[:idx] + line[idx + len("name:"):]
                student_id = line.replace("#", "").strip()[:50]
                break

    # Initialize the aggregate result container
    result = EvaluationResult(
        student_id=student_id,
        game=game,
        num_trials=num_trials,
        max_steps=max_steps,
    )

    # Deterministic per-trial seeds so reruns are reproducible
    seeds = generate_seeds(base_seed, num_trials)

    print(f"\nEvaluating: {student_id}")
    print(f"Game: {game}")
    print(f"Trials: {num_trials}")
    print(f"Max steps: {max_steps}")
    print(f"Seeds: {seeds}")
    print("-" * 50)

    for i, seed in enumerate(seeds):
        trial_num = i + 1
        print(f"\nTrial {trial_num}/{num_trials} (seed={seed})...")

        config = RunConfig(
            agent_path=agent_path,
            server_path=server_path,
            game=game,
            max_steps=max_steps,
            seed=seed,
            verbose=verbose,
        )

        try:
            run_result = await run_agent_with_server(config)
            trial = TrialResult(
                trial_number=trial_num,
                final_score=run_result.final_score,
                max_score=run_result.max_score,
                moves=run_result.moves,
                locations_visited=len(run_result.locations_visited),
                game_completed=run_result.game_completed,
                error=run_result.error,
            )
            if run_result.error:
                print(f" Error: {run_result.error[:100]}...")
            else:
                print(f" Score: {run_result.final_score}")
                print(f" Moves: {run_result.moves}")
                print(f" Locations: {len(run_result.locations_visited)}")
        except Exception as e:
            # A crashed trial must not abort the evaluation: record it as
            # a zero-score trial and keep going with the remaining seeds.
            trial = TrialResult(
                trial_number=trial_num,
                final_score=0,
                max_score=0,
                moves=0,
                locations_visited=0,
                game_completed=False,
                error=str(e),
            )
            print(f" Exception: {e}")

        result.add_trial(trial)

    return result
async def evaluate_with_reference(
    submission_path: Path,
    game: str,
    num_trials: int = 5,
    max_steps: int = 100,
    base_seed: int = 42,
    verbose: bool = False,
) -> tuple[EvaluationResult, EvaluationResult]:
    """
    Evaluate the student's submission, then the course reference agent
    (examples/mcp_react) on the same deterministic seeds.

    Returns:
        Tuple of (student_result, reference_result)
    """
    # Pass 1: the student's own agent/server pair.
    student_result = await evaluate_submission(
        submission_path=submission_path,
        game=game,
        num_trials=num_trials,
        max_steps=max_steps,
        base_seed=base_seed,
        verbose=verbose,
    )

    # Pass 2: the reference agent, for a like-for-like comparison.
    banner = "=" * 50
    print("\n" + banner)
    print("Running reference agent for comparison...")
    print(banner)

    reference_result = EvaluationResult(
        student_id="reference_agent",
        game=game,
        num_trials=num_trials,
        max_steps=max_steps,
    )

    # Same base seed as the student run, so both see identical trial seeds.
    trial_seeds = generate_seeds(base_seed, num_trials)

    for trial_num, seed in enumerate(trial_seeds, start=1):
        print(f"\nReference Trial {trial_num}/{num_trials} (seed={seed})...")
        try:
            run_result = await run_reference_agent(
                game=game,
                max_steps=max_steps,
                seed=seed,
                verbose=verbose,
            )
        except Exception as e:
            # A crashed reference run counts as a zero-score trial.
            print(f" Exception: {e}")
            reference_result.add_trial(
                TrialResult(
                    trial_number=trial_num,
                    final_score=0,
                    max_score=0,
                    moves=0,
                    locations_visited=0,
                    game_completed=False,
                    error=str(e),
                )
            )
            continue

        if run_result.error:
            print(f" Error: {run_result.error[:100]}...")
        else:
            print(f" Score: {run_result.final_score}")
        reference_result.add_trial(
            TrialResult(
                trial_number=trial_num,
                final_score=run_result.final_score,
                max_score=run_result.max_score,
                moves=run_result.moves,
                locations_visited=len(run_result.locations_visited),
                game_completed=run_result.game_completed,
                error=run_result.error,
            )
        )

    return student_result, reference_result
def clone_hf_space(space_id: str, target_dir: Path) -> Path:
    """Shallow-clone the git repo behind a Hugging Face Space into *target_dir*."""
    import subprocess

    # Every HF Space is a git repository hosted under huggingface.co/spaces/.
    repo_url = f"https://huggingface.co/spaces/{space_id}"
    print(f"Cloning {repo_url}...")
    clone_cmd = ["git", "clone", "--depth", "1", repo_url, str(target_dir)]
    # check=True raises CalledProcessError on failure (captured stderr is
    # attached to the exception); capture_output keeps git chatter quiet.
    subprocess.run(clone_cmd, check=True, capture_output=True)
    return target_dir
async def batch_evaluate(
    submissions_dir: Path,
    game: str,
    num_trials: int = 5,
    max_steps: int = 100,
    base_seed: int = 42,
    output_path: Path | None = None,
    verbose: bool = False,
) -> list[EvaluationResult]:
    """Evaluate all submissions in a directory.

    Args:
        submissions_dir: Directory whose immediate subdirectories are
            submissions (a subdirectory counts if it contains agent.py).
        game: Name of the game to evaluate on.
        num_trials: Number of trials per submission.
        max_steps: Maximum steps per trial.
        base_seed: Base seed for reproducible trial seeds.
        output_path: If given, write a JSON report (settings, per-student
            results, and a ranked leaderboard) to this file.
        verbose: Print detailed output.

    Returns:
        EvaluationResults sorted by mean score, best first.
    """
    results = []
    # Find all submission directories (those containing agent.py)
    submission_dirs = [
        d for d in submissions_dir.iterdir()
        if d.is_dir() and (d / "agent.py").exists()
    ]
    print(f"Found {len(submission_dirs)} submissions")
    for submission_path in sorted(submission_dirs):
        try:
            result = await evaluate_submission(
                submission_path=submission_path,
                game=game,
                num_trials=num_trials,
                max_steps=max_steps,
                base_seed=base_seed,
                verbose=verbose,
            )
            results.append(result)
        except Exception as e:
            # One broken submission must not abort the whole batch.
            print(f"Failed to evaluate {submission_path}: {e}")
    # Sort by mean score (descending)
    results.sort(key=lambda r: r.mean_score, reverse=True)
    # Save results
    if output_path:
        output_data = {
            "evaluation_date": datetime.now().isoformat(),
            "game": game,
            "num_trials": num_trials,
            "max_steps": max_steps,
            "base_seed": base_seed,
            "results": [r.to_dict() for r in results],
            # Leaderboard ranks follow the mean-score sort above.
            "leaderboard": [
                {
                    "rank": i + 1,
                    "student_id": r.student_id,
                    "mean_score": round(r.mean_score, 2),
                    "std_score": round(r.std_score, 2),
                }
                for i, r in enumerate(results)
            ],
        }
        with open(output_path, "w") as f:
            json.dump(output_data, f, indent=2)
        print(f"\nResults saved to {output_path}")
    return results
def print_comparison(student: EvaluationResult, reference: EvaluationResult):
    """Print a side-by-side metrics table for student vs. reference runs."""
    banner = "=" * 60
    print("\n" + banner)
    print("EVALUATION COMPARISON")
    print(banner)
    print(f"\n{'Metric':<25} {'Student':<15} {'Reference':<15}")
    print("-" * 55)

    # (label, attribute, format spec); "" means default str formatting.
    metric_rows = [
        ("Mean Score", "mean_score", ".2f"),
        ("Std Score", "std_score", ".2f"),
        ("Min Score", "min_score", ""),
        ("Max Score", "max_score_achieved", ""),
        ("Mean Moves", "mean_moves", ".1f"),
        ("Mean Locations", "mean_locations", ".1f"),
        ("Successful Trials", "successful_trials", ""),
    ]
    for label, attr, spec in metric_rows:
        student_val = getattr(student, attr)
        reference_val = getattr(reference, attr)
        print(f"{label:<25} {student_val:<15{spec}} {reference_val:<15{spec}}")

    # Performance ratio (skipped when the reference scored nothing).
    if reference.mean_score > 0:
        ratio = student.mean_score / reference.mean_score * 100
        print(f"\nStudent performance: {ratio:.1f}% of reference")
def _save_json(output_path: Path, payload: dict) -> None:
    """Write *payload* to *output_path* as indented JSON and confirm on stdout."""
    with open(output_path, "w") as f:
        json.dump(payload, f, indent=2)
    print(f"\nResults saved to {output_path}")


def _evaluate_single(submission_path: Path, args: argparse.Namespace) -> None:
    """Evaluate one submission (optionally vs. the reference agent) and report.

    Honors --output in both modes, so results are saved whether the
    submission is a local directory or a cloned HF Space.
    """
    if args.reference:
        student_result, reference_result = asyncio.run(
            evaluate_with_reference(
                submission_path=submission_path,
                game=args.game,
                num_trials=args.trials,
                max_steps=args.max_steps,
                base_seed=args.seed,
                verbose=args.verbose,
            )
        )
        print_comparison(student_result, reference_result)
        if args.output:
            _save_json(
                args.output,
                {
                    "evaluation_date": datetime.now().isoformat(),
                    "student": student_result.to_dict(),
                    "reference": reference_result.to_dict(),
                },
            )
    else:
        result = asyncio.run(
            evaluate_submission(
                submission_path=submission_path,
                game=args.game,
                num_trials=args.trials,
                max_steps=args.max_steps,
                base_seed=args.seed,
                verbose=args.verbose,
            )
        )
        print("\n" + result.summary_str())
        if args.output:
            _save_json(args.output, result.to_dict())


def main():
    """CLI entry point: parse arguments and dispatch the evaluation mode."""
    parser = argparse.ArgumentParser(
        description="Evaluate text adventure agent submissions",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Input options (mutually exclusive). BUGFIX: the group was previously
    # required=True, which made a bare `--list-games` invocation impossible;
    # requiredness is now enforced manually after --list-games is handled.
    input_group = parser.add_mutually_exclusive_group()
    input_group.add_argument(
        "-s", "--submission",
        type=Path,
        help="Path to student submission directory",
    )
    input_group.add_argument(
        "--hf-space",
        type=str,
        help="Hugging Face Space ID (e.g., username/space-name)",
    )
    input_group.add_argument(
        "--submissions-dir",
        type=Path,
        help="Directory containing multiple submissions (for batch evaluation)",
    )

    # Evaluation parameters
    parser.add_argument(
        "-g", "--game",
        type=str,
        default="lostpig",
        help="Game to evaluate on (default: lostpig)",
    )
    parser.add_argument(
        "-t", "--trials",
        type=int,
        default=5,
        help="Number of trials to run (default: 5)",
    )
    parser.add_argument(
        "--max-steps",
        type=int,
        default=100,
        help="Maximum steps per trial (default: 100)",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Base random seed for reproducibility (default: 42)",
    )

    # Reference comparison
    parser.add_argument(
        "-r", "--reference",
        action="store_true",
        help="Also run reference agent (from examples/mcp_react) for comparison",
    )

    # Output options
    parser.add_argument(
        "-o", "--output",
        type=Path,
        help="Output file for results (JSON)",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Print detailed output",
    )
    parser.add_argument(
        "--list-games",
        action="store_true",
        help="List available games and exit",
    )

    args = parser.parse_args()

    # List games if requested (works without any input option)
    if args.list_games:
        games = list_available_games()
        print(f"Available games ({len(games)}):")
        for game in games:
            print(f" - {game}")
        return

    # An input source is mandatory for any actual evaluation.
    if not (args.submission or args.hf_space or args.submissions_dir):
        parser.error(
            "one of -s/--submission, --hf-space or --submissions-dir is required"
        )

    # Validate game before doing any expensive work
    available_games = list_available_games()
    if args.game not in available_games:
        print(f"Error: Unknown game '{args.game}'")
        print(f"Available: {', '.join(available_games[:10])}...")
        sys.exit(1)

    # Handle HF Space input
    if args.hf_space:
        # The temp clone lives exactly as long as the evaluation.
        with tempfile.TemporaryDirectory() as tmpdir:
            submission_path = clone_hf_space(args.hf_space, Path(tmpdir) / "submission")
            # BUGFIX: this branch previously ignored --output; the shared
            # helper now saves results for HF Spaces as well.
            _evaluate_single(submission_path, args)

    # Handle batch evaluation
    elif args.submissions_dir:
        results = asyncio.run(
            batch_evaluate(
                submissions_dir=args.submissions_dir,
                game=args.game,
                num_trials=args.trials,
                max_steps=args.max_steps,
                base_seed=args.seed,
                output_path=args.output,
                verbose=args.verbose,
            )
        )

        # Print leaderboard
        print("\n" + "=" * 60)
        print("LEADERBOARD")
        print("=" * 60)
        print(f"\n{'Rank':<6} {'Student':<30} {'Mean Score':<12} {'Std':<10}")
        print("-" * 58)
        for i, r in enumerate(results):
            print(f"{i+1:<6} {r.student_id:<30} {r.mean_score:<12.2f} {r.std_score:<10.2f}")

    # Handle single submission
    else:
        submission_path = args.submission
        if not submission_path.exists():
            print(f"Error: Submission path not found: {submission_path}")
            sys.exit(1)
        _evaluate_single(submission_path, args)
# Script entry point: run the CLI when executed directly (not on import).
if __name__ == "__main__":
    main()