#!/usr/bin/env python3
"""
Evaluation Script for Text Adventure Agents

Evaluates student submissions by running their agent + MCP server on a
text adventure game for multiple trials and averaging scores.

Usage:
    # Evaluate a student submission
    python evaluation/evaluate.py \\
        --submission path/to/student/submission \\
        --game zork1 \\
        --trials 5 \\
        --max-steps 100

    # Evaluate with reference agent comparison
    python evaluation/evaluate.py \\
        --submission path/to/student/submission \\
        --game zork1 \\
        --reference

    # Evaluate from a Hugging Face Space
    python evaluation/evaluate.py \\
        --hf-space username/space-name \\
        --game zork1

    # Batch evaluate multiple submissions
    python evaluation/evaluate.py \\
        --submissions-dir path/to/all/submissions \\
        --game zork1 \\
        --output results.json

Examples:
    # Quick test with 3 trials
    python evaluation/evaluate.py -s ./submission_template -g zork1 -t 3

    # Full evaluation for grading
    python evaluation/evaluate.py -s ./submission_template -g advent -t 5 --max-steps 150
"""

import argparse
import asyncio
import json
import os
import random
import sys
import tempfile
import warnings
from datetime import datetime
from pathlib import Path

# Suppress asyncio subprocess cleanup warnings emitted during teardown
warnings.filterwarnings("ignore", message=".*Event loop is closed.*")
warnings.filterwarnings("ignore", category=UserWarning, module="multiprocessing.resource_tracker")

# Add parent directory to path so sibling packages resolve when run as a script
sys.path.insert(0, str(Path(__file__).parent.parent))

from evaluation.metrics import EvaluationResult, TrialResult
from evaluation.runner import RunConfig, run_agent_with_server, run_reference_agent
from games.zork_env import list_available_games


def generate_seeds(base_seed: int, num_trials: int) -> list[int]:
    """Generate deterministic seeds for each trial.

    Uses a private ``random.Random`` instance so the module-level RNG state
    is not clobbered as a side effect; the sequence produced is identical to
    seeding the global generator with ``base_seed``.
    """
    rng = random.Random(base_seed)
    return [rng.randint(0, 2**32 - 1) for _ in range(num_trials)]


def _extract_student_id(submission_path: Path) -> str:
    """Derive a student identifier for a submission directory.

    Defaults to the directory name; if a README.md exists, the first heading
    or "name:" line (matched case-insensitively) overrides it, truncated to
    50 characters.
    """
    student_id = submission_path.name
    readme_path = submission_path / "README.md"
    if readme_path.exists():
        for line in readme_path.read_text().split("\n"):
            if line.startswith("# ") or "name:" in line.lower():
                cleaned = line.replace("#", "")
                # Strip the "name:" label case-insensitively; previously only
                # a lowercase "name:" was removed, so "Name: John" leaked the
                # label into the student id.
                idx = cleaned.lower().find("name:")
                if idx != -1:
                    cleaned = cleaned[idx + len("name:"):]
                student_id = cleaned.strip()[:50]
                break
    return student_id


def _trial_from_run(trial_num: int, run_result) -> TrialResult:
    """Convert a runner result object into a TrialResult record."""
    return TrialResult(
        trial_number=trial_num,
        final_score=run_result.final_score,
        max_score=run_result.max_score,
        moves=run_result.moves,
        locations_visited=len(run_result.locations_visited),
        game_completed=run_result.game_completed,
        error=run_result.error,
    )


def _failed_trial(trial_num: int, exc: Exception) -> TrialResult:
    """Build a zero-score TrialResult for a trial that raised an exception."""
    return TrialResult(
        trial_number=trial_num,
        final_score=0,
        max_score=0,
        moves=0,
        locations_visited=0,
        game_completed=False,
        error=str(exc),
    )


async def evaluate_submission(
    submission_path: Path,
    game: str,
    num_trials: int = 5,
    max_steps: int = 100,
    base_seed: int = 42,
    verbose: bool = False,
) -> EvaluationResult:
    """
    Evaluate a student submission across multiple trials.

    Args:
        submission_path: Path to student's submission directory
        game: Name of the game to evaluate on
        num_trials: Number of trials to run (default: 5)
        max_steps: Maximum steps per trial (default: 100)
        base_seed: Base seed for reproducibility (default: 42)
        verbose: Print detailed output

    Returns:
        EvaluationResult with aggregated metrics
    """
    # Locate agent and server files expected in every submission
    agent_path = submission_path / "agent.py"
    server_path = submission_path / "mcp_server.py"

    student_id = _extract_student_id(submission_path)

    result = EvaluationResult(
        student_id=student_id,
        game=game,
        num_trials=num_trials,
        max_steps=max_steps,
    )

    # Deterministic seeds make evaluations reproducible across runs
    seeds = generate_seeds(base_seed, num_trials)

    print(f"\nEvaluating: {student_id}")
    print(f"Game: {game}")
    print(f"Trials: {num_trials}")
    print(f"Max steps: {max_steps}")
    print(f"Seeds: {seeds}")
    print("-" * 50)

    for i, seed in enumerate(seeds):
        trial_num = i + 1
        print(f"\nTrial {trial_num}/{num_trials} (seed={seed})...")

        config = RunConfig(
            agent_path=agent_path,
            server_path=server_path,
            game=game,
            max_steps=max_steps,
            seed=seed,
            verbose=verbose,
        )

        try:
            run_result = await run_agent_with_server(config)
            trial = _trial_from_run(trial_num, run_result)

            if run_result.error:
                print(f" Error: {run_result.error[:100]}...")
            else:
                print(f" Score: {run_result.final_score}")
                print(f" Moves: {run_result.moves}")
                print(f" Locations: {len(run_result.locations_visited)}")
        except Exception as e:
            # A crash in the agent/server counts as a zero-score trial
            trial = _failed_trial(trial_num, e)
            print(f" Exception: {e}")

        result.add_trial(trial)

    return result


async def evaluate_with_reference(
    submission_path: Path,
    game: str,
    num_trials: int = 5,
    max_steps: int = 100,
    base_seed: int = 42,
    verbose: bool = False,
) -> tuple[EvaluationResult, EvaluationResult]:
    """
    Evaluate student submission and compare with reference agent.

    Both runs use the same per-trial seeds, so results are comparable
    trial by trial.

    Returns:
        Tuple of (student_result, reference_result)
    """
    # Evaluate student
    student_result = await evaluate_submission(
        submission_path=submission_path,
        game=game,
        num_trials=num_trials,
        max_steps=max_steps,
        base_seed=base_seed,
        verbose=verbose,
    )

    # Evaluate reference agent (from examples/mcp_react)
    print("\n" + "=" * 50)
    print("Running reference agent for comparison...")
    print("=" * 50)

    seeds = generate_seeds(base_seed, num_trials)

    reference_result = EvaluationResult(
        student_id="reference_agent",
        game=game,
        num_trials=num_trials,
        max_steps=max_steps,
    )

    for i, seed in enumerate(seeds):
        trial_num = i + 1
        print(f"\nReference Trial {trial_num}/{num_trials} (seed={seed})...")

        try:
            run_result = await run_reference_agent(
                game=game,
                max_steps=max_steps,
                seed=seed,
                verbose=verbose,
            )
            trial = _trial_from_run(trial_num, run_result)

            if run_result.error:
                print(f" Error: {run_result.error[:100]}...")
            else:
                print(f" Score: {run_result.final_score}")
        except Exception as e:
            trial = _failed_trial(trial_num, e)
            print(f" Exception: {e}")

        reference_result.add_trial(trial)

    return student_result, reference_result
def clone_hf_space(space_id: str, target_dir: Path) -> Path:
    """Clone a Hugging Face Space into ``target_dir`` and return that path.

    Raises:
        subprocess.CalledProcessError: if the git clone fails; git's stderr
        is printed first so the cause is visible instead of being swallowed
        by ``capture_output``.
    """
    import subprocess

    # HF Spaces are git repos at huggingface.co/spaces/
    repo_url = f"https://huggingface.co/spaces/{space_id}"
    print(f"Cloning {repo_url}...")

    proc = subprocess.run(
        ["git", "clone", "--depth", "1", repo_url, str(target_dir)],
        capture_output=True,
        text=True,
    )
    if proc.returncode != 0:
        # Surface git's error output, then raise the same exception type
        # callers of the old check=True version would have seen.
        print(proc.stderr, file=sys.stderr)
        raise subprocess.CalledProcessError(
            proc.returncode, proc.args, proc.stdout, proc.stderr
        )

    return target_dir


async def batch_evaluate(
    submissions_dir: Path,
    game: str,
    num_trials: int = 5,
    max_steps: int = 100,
    base_seed: int = 42,
    output_path: "Path | None" = None,
    verbose: bool = False,
) -> "list[EvaluationResult]":
    """Evaluate all submissions in a directory.

    A submission is any subdirectory containing an ``agent.py``. Failures
    are reported and skipped so one broken submission cannot abort the
    whole batch.

    Args:
        submissions_dir: Directory whose subdirectories are submissions
        game: Name of the game to evaluate on
        num_trials: Trials per submission
        max_steps: Maximum steps per trial
        base_seed: Base seed for reproducible per-trial seeds
        output_path: Optional JSON file for results plus leaderboard
        verbose: Print detailed output

    Returns:
        EvaluationResults sorted by mean score, best first.
    """
    results = []

    # Find all submission directories (those containing agent.py)
    submission_dirs = [
        d for d in submissions_dir.iterdir()
        if d.is_dir() and (d / "agent.py").exists()
    ]

    print(f"Found {len(submission_dirs)} submissions")

    for submission_path in sorted(submission_dirs):
        try:
            result = await evaluate_submission(
                submission_path=submission_path,
                game=game,
                num_trials=num_trials,
                max_steps=max_steps,
                base_seed=base_seed,
                verbose=verbose,
            )
            results.append(result)
        except Exception as e:
            # Best-effort batch: log and continue with the next submission
            print(f"Failed to evaluate {submission_path}: {e}")

    # Sort by mean score (descending)
    results.sort(key=lambda r: r.mean_score, reverse=True)

    # Save results
    if output_path:
        output_data = {
            "evaluation_date": datetime.now().isoformat(),
            "game": game,
            "num_trials": num_trials,
            "max_steps": max_steps,
            "base_seed": base_seed,
            "results": [r.to_dict() for r in results],
            "leaderboard": [
                {
                    "rank": i + 1,
                    "student_id": r.student_id,
                    "mean_score": round(r.mean_score, 2),
                    "std_score": round(r.std_score, 2),
                }
                for i, r in enumerate(results)
            ],
        }

        with open(output_path, "w") as f:
            json.dump(output_data, f, indent=2)
        print(f"\nResults saved to {output_path}")

    return results
def print_comparison(student: "EvaluationResult", reference: "EvaluationResult") -> None:
    """Print a side-by-side comparison of student vs reference results.

    The relative-performance line is only printed when the reference mean
    score is positive (avoids division by zero).
    """
    print("\n" + "=" * 60)
    print("EVALUATION COMPARISON")
    print("=" * 60)
    print(f"\n{'Metric':<25} {'Student':<15} {'Reference':<15}")
    print("-" * 55)
    print(f"{'Mean Score':<25} {student.mean_score:<15.2f} {reference.mean_score:<15.2f}")
    print(f"{'Std Score':<25} {student.std_score:<15.2f} {reference.std_score:<15.2f}")
    print(f"{'Min Score':<25} {student.min_score:<15} {reference.min_score:<15}")
    print(f"{'Max Score':<25} {student.max_score_achieved:<15} {reference.max_score_achieved:<15}")
    print(f"{'Mean Moves':<25} {student.mean_moves:<15.1f} {reference.mean_moves:<15.1f}")
    print(f"{'Mean Locations':<25} {student.mean_locations:<15.1f} {reference.mean_locations:<15.1f}")
    print(f"{'Successful Trials':<25} {student.successful_trials:<15} {reference.successful_trials:<15}")

    # Performance ratio
    if reference.mean_score > 0:
        ratio = student.mean_score / reference.mean_score * 100
        print(f"\nStudent performance: {ratio:.1f}% of reference")


def _evaluate_one(submission_path: Path, args: argparse.Namespace) -> None:
    """Evaluate a single submission per CLI args and save output if requested.

    Shared by the --submission and --hf-space paths so both honor
    -r/--reference and -o/--output identically (previously the HF Space
    path silently ignored --output).
    """
    if args.reference:
        student_result, reference_result = asyncio.run(
            evaluate_with_reference(
                submission_path=submission_path,
                game=args.game,
                num_trials=args.trials,
                max_steps=args.max_steps,
                base_seed=args.seed,
                verbose=args.verbose,
            )
        )
        print_comparison(student_result, reference_result)

        # Save results if output specified
        if args.output:
            output_data = {
                "evaluation_date": datetime.now().isoformat(),
                "student": student_result.to_dict(),
                "reference": reference_result.to_dict(),
            }
            with open(args.output, "w") as f:
                json.dump(output_data, f, indent=2)
            print(f"\nResults saved to {args.output}")
    else:
        result = asyncio.run(
            evaluate_submission(
                submission_path=submission_path,
                game=args.game,
                num_trials=args.trials,
                max_steps=args.max_steps,
                base_seed=args.seed,
                verbose=args.verbose,
            )
        )
        print("\n" + result.summary_str())

        # Save results if output specified
        if args.output:
            with open(args.output, "w") as f:
                json.dump(result.to_dict(), f, indent=2)
            print(f"\nResults saved to {args.output}")


def main():
    """CLI entry point: parse arguments and dispatch to the chosen mode."""
    parser = argparse.ArgumentParser(
        description="Evaluate text adventure agent submissions",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Input options (mutually exclusive)
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        "-s", "--submission",
        type=Path,
        help="Path to student submission directory",
    )
    input_group.add_argument(
        "--hf-space",
        type=str,
        help="Hugging Face Space ID (e.g., username/space-name)",
    )
    input_group.add_argument(
        "--submissions-dir",
        type=Path,
        help="Directory containing multiple submissions (for batch evaluation)",
    )

    # Evaluation parameters
    parser.add_argument(
        "-g", "--game",
        type=str,
        default="lostpig",
        help="Game to evaluate on (default: lostpig)",
    )
    parser.add_argument(
        "-t", "--trials",
        type=int,
        default=5,
        help="Number of trials to run (default: 5)",
    )
    parser.add_argument(
        "--max-steps",
        type=int,
        default=100,
        help="Maximum steps per trial (default: 100)",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Base random seed for reproducibility (default: 42)",
    )

    # Reference comparison
    parser.add_argument(
        "-r", "--reference",
        action="store_true",
        help="Also run reference agent (from examples/mcp_react) for comparison",
    )

    # Output options
    parser.add_argument(
        "-o", "--output",
        type=Path,
        help="Output file for results (JSON)",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Print detailed output",
    )
    parser.add_argument(
        "--list-games",
        action="store_true",
        help="List available games and exit",
    )

    args = parser.parse_args()

    # List games if requested
    if args.list_games:
        games = list_available_games()
        print(f"Available games ({len(games)}):")
        for game in games:
            print(f" - {game}")
        return

    # Validate game
    available_games = list_available_games()
    if args.game not in available_games:
        print(f"Error: Unknown game '{args.game}'")
        # Only show a truncation marker when the list was actually truncated
        shown = ", ".join(available_games[:10])
        suffix = "..." if len(available_games) > 10 else ""
        print(f"Available: {shown}{suffix}")
        sys.exit(1)

    # Handle HF Space input
    if args.hf_space:
        # Clone into a temp dir that is removed after evaluation finishes
        with tempfile.TemporaryDirectory() as tmpdir:
            submission_path = clone_hf_space(args.hf_space, Path(tmpdir) / "submission")
            _evaluate_one(submission_path, args)

    # Handle batch evaluation
    elif args.submissions_dir:
        results = asyncio.run(
            batch_evaluate(
                submissions_dir=args.submissions_dir,
                game=args.game,
                num_trials=args.trials,
                max_steps=args.max_steps,
                base_seed=args.seed,
                output_path=args.output,
                verbose=args.verbose,
            )
        )

        # Print leaderboard
        print("\n" + "=" * 60)
        print("LEADERBOARD")
        print("=" * 60)
        print(f"\n{'Rank':<6} {'Student':<30} {'Mean Score':<12} {'Std':<10}")
        print("-" * 58)
        for i, r in enumerate(results):
            print(f"{i+1:<6} {r.student_id:<30} {r.mean_score:<12.2f} {r.std_score:<10.2f}")

    # Handle single submission
    else:
        submission_path = args.submission
        if not submission_path.exists():
            print(f"Error: Submission path not found: {submission_path}")
            sys.exit(1)
        _evaluate_one(submission_path, args)


if __name__ == "__main__":
    main()