#!/usr/bin/env python3
"""
Evaluation Script for Text Adventure Agents

Evaluates student submissions by running their agent + MCP server
on a text adventure game for multiple trials and averaging scores.

Usage:
    # Evaluate a student submission
    python evaluation/evaluate.py \\
        --submission path/to/student/submission \\
        --game zork1 \\
        --trials 5 \\
        --max-steps 100

    # Evaluate with reference agent comparison
    python evaluation/evaluate.py \\
        --submission path/to/student/submission \\
        --game zork1 \\
        --reference

    # Evaluate from a Hugging Face Space
    python evaluation/evaluate.py \\
        --hf-space username/space-name \\
        --game zork1

    # Batch evaluate multiple submissions
    python evaluation/evaluate.py \\
        --submissions-dir path/to/all/submissions \\
        --game zork1 \\
        --output results.json

Examples:
    # Quick test with 3 trials
    python evaluation/evaluate.py -s ./submission_template -g zork1 -t 3

    # Full evaluation for grading
    python evaluation/evaluate.py -s ./submission_template -g advent -t 5 --max-steps 150
"""

import argparse
import asyncio
import json
import os
import random
import sys
import tempfile
import warnings
from datetime import datetime
from pathlib import Path

# Suppress asyncio subprocess cleanup warnings
warnings.filterwarnings("ignore", message=".*Event loop is closed.*")
warnings.filterwarnings("ignore", category=UserWarning, module="multiprocessing.resource_tracker")

# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from evaluation.metrics import EvaluationResult, TrialResult
from evaluation.runner import RunConfig, run_agent_with_server, run_reference_agent
from games.zork_env import list_available_games
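

# A single base seed drives every per-trial seed, so repeated runs with the same
# --seed value evaluate each submission on an identical sequence of trials.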
def generate_seeds(base_seed: int, num_trials: int) -> list[int]:
    """Generate deterministic seeds for each trial."""
    random.seed(base_seed)
    return [random.randint(0, 2**32 - 1) for _ in range(num_trials)]


async def evaluate_submission(
    submission_path: Path,
    game: str,
    num_trials: int = 5,
    max_steps: int = 100,
    base_seed: int = 42,
    verbose: bool = False,
) -> EvaluationResult:
    """
    Evaluate a student submission across multiple trials.

    Args:
        submission_path: Path to student's submission directory
        game: Name of the game to evaluate on
        num_trials: Number of trials to run (default: 5)
        max_steps: Maximum steps per trial (default: 100)
        base_seed: Base seed for reproducibility (default: 42)
        verbose: Print detailed output

    Returns:
        EvaluationResult with aggregated metrics
    """
    # Locate agent and server files
    agent_path = submission_path / "agent.py"
    server_path = submission_path / "mcp_server.py"

    # Extract student ID from path or README
    student_id = submission_path.name
    readme_path = submission_path / "README.md"
    if readme_path.exists():
        content = readme_path.read_text()
        # Try to extract student name from README
        for line in content.split("\n"):
            if line.startswith("# ") or "name:" in line.lower():
                # Strip heading markers and a case-insensitive "name:" label
                cleaned = line.replace("#", "")
                if "name:" in cleaned.lower():
                    cleaned = cleaned.split(":", 1)[1]
                student_id = cleaned.strip()[:50]
                break
    # Initialize results
    result = EvaluationResult(
        student_id=student_id,
        game=game,
        num_trials=num_trials,
        max_steps=max_steps,
    )

    # Generate deterministic seeds
    seeds = generate_seeds(base_seed, num_trials)

    print(f"\nEvaluating: {student_id}")
    print(f"Game: {game}")
    print(f"Trials: {num_trials}")
    print(f"Max steps: {max_steps}")
    print(f"Seeds: {seeds}")
    print("-" * 50)

    for i, seed in enumerate(seeds):
        trial_num = i + 1
        print(f"\nTrial {trial_num}/{num_trials} (seed={seed})...")

        config = RunConfig(
            agent_path=agent_path,
            server_path=server_path,
            game=game,
            max_steps=max_steps,
            seed=seed,
            verbose=verbose,
        )

        try:
            run_result = await run_agent_with_server(config)
            trial = TrialResult(
                trial_number=trial_num,
                final_score=run_result.final_score,
                max_score=run_result.max_score,
                moves=run_result.moves,
                locations_visited=len(run_result.locations_visited),
                game_completed=run_result.game_completed,
                error=run_result.error,
            )
            if run_result.error:
                print(f" Error: {run_result.error[:100]}...")
            else:
                print(f" Score: {run_result.final_score}")
                print(f" Moves: {run_result.moves}")
                print(f" Locations: {len(run_result.locations_visited)}")
        except Exception as e:
            trial = TrialResult(
                trial_number=trial_num,
                final_score=0,
                max_score=0,
                moves=0,
                locations_visited=0,
                game_completed=False,
                error=str(e),
            )
            print(f" Exception: {e}")

        result.add_trial(trial)

    return result


async def evaluate_with_reference(
    submission_path: Path,
    game: str,
    num_trials: int = 5,
    max_steps: int = 100,
    base_seed: int = 42,
    verbose: bool = False,
) -> tuple[EvaluationResult, EvaluationResult]:
    """
    Evaluate student submission and compare with reference agent.

    Returns:
        Tuple of (student_result, reference_result)
    """
    # Evaluate student
    student_result = await evaluate_submission(
        submission_path=submission_path,
        game=game,
        num_trials=num_trials,
        max_steps=max_steps,
        base_seed=base_seed,
        verbose=verbose,
    )

    # Evaluate reference agent (from examples/mcp_react)
    print("\n" + "=" * 50)
    print("Running reference agent for comparison...")
    print("=" * 50)
    seeds = generate_seeds(base_seed, num_trials)
    reference_result = EvaluationResult(
        student_id="reference_agent",
        game=game,
        num_trials=num_trials,
        max_steps=max_steps,
    )

    for i, seed in enumerate(seeds):
        trial_num = i + 1
        print(f"\nReference Trial {trial_num}/{num_trials} (seed={seed})...")

        try:
            run_result = await run_reference_agent(
                game=game,
                max_steps=max_steps,
                seed=seed,
                verbose=verbose,
            )
            trial = TrialResult(
                trial_number=trial_num,
                final_score=run_result.final_score,
                max_score=run_result.max_score,
                moves=run_result.moves,
                locations_visited=len(run_result.locations_visited),
                game_completed=run_result.game_completed,
                error=run_result.error,
            )
            if run_result.error:
                print(f" Error: {run_result.error[:100]}...")
            else:
                print(f" Score: {run_result.final_score}")
        except Exception as e:
            trial = TrialResult(
                trial_number=trial_num,
                final_score=0,
                max_score=0,
                moves=0,
                locations_visited=0,
                game_completed=False,
                error=str(e),
            )
            print(f" Exception: {e}")

        reference_result.add_trial(trial)

    return student_result, reference_result
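

# A Hugging Face Space is backed by a git repository, so a shallow clone of the
# current revision is enough for evaluation; no history is needed.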
def clone_hf_space(space_id: str, target_dir: Path) -> Path:
    """Clone a Hugging Face Space to local directory."""
    import subprocess

    # HF Spaces are git repos at huggingface.co/spaces/
    repo_url = f"https://huggingface.co/spaces/{space_id}"
    print(f"Cloning {repo_url}...")
    subprocess.run(
        ["git", "clone", "--depth", "1", repo_url, str(target_dir)],
        check=True,
        capture_output=True,
    )
    return target_dir


async def batch_evaluate(
    submissions_dir: Path,
    game: str,
    num_trials: int = 5,
    max_steps: int = 100,
    base_seed: int = 42,
    output_path: Path | None = None,
    verbose: bool = False,
) -> list[EvaluationResult]:
    """Evaluate all submissions in a directory."""
    results = []

    # Find all submission directories (those containing agent.py)
    submission_dirs = [
        d for d in submissions_dir.iterdir()
        if d.is_dir() and (d / "agent.py").exists()
    ]
    print(f"Found {len(submission_dirs)} submissions")

    for submission_path in sorted(submission_dirs):
        try:
            result = await evaluate_submission(
                submission_path=submission_path,
                game=game,
                num_trials=num_trials,
                max_steps=max_steps,
                base_seed=base_seed,
                verbose=verbose,
            )
            results.append(result)
        except Exception as e:
            print(f"Failed to evaluate {submission_path}: {e}")

    # Sort by mean score (descending)
    results.sort(key=lambda r: r.mean_score, reverse=True)

    # Save results
    if output_path:
        output_data = {
            "evaluation_date": datetime.now().isoformat(),
            "game": game,
            "num_trials": num_trials,
            "max_steps": max_steps,
            "base_seed": base_seed,
            "results": [r.to_dict() for r in results],
            "leaderboard": [
                {
                    "rank": i + 1,
                    "student_id": r.student_id,
                    "mean_score": round(r.mean_score, 2),
                    "std_score": round(r.std_score, 2),
                }
                for i, r in enumerate(results)
            ],
        }
        with open(output_path, "w") as f:
            json.dump(output_data, f, indent=2)
        print(f"\nResults saved to {output_path}")

    return results


def print_comparison(student: EvaluationResult, reference: EvaluationResult):
    """Print a comparison between student and reference results."""
    print("\n" + "=" * 60)
    print("EVALUATION COMPARISON")
    print("=" * 60)

    print(f"\n{'Metric':<25} {'Student':<15} {'Reference':<15}")
    print("-" * 55)
    print(f"{'Mean Score':<25} {student.mean_score:<15.2f} {reference.mean_score:<15.2f}")
    print(f"{'Std Score':<25} {student.std_score:<15.2f} {reference.std_score:<15.2f}")
    print(f"{'Min Score':<25} {student.min_score:<15} {reference.min_score:<15}")
    print(f"{'Max Score':<25} {student.max_score_achieved:<15} {reference.max_score_achieved:<15}")
    print(f"{'Mean Moves':<25} {student.mean_moves:<15.1f} {reference.mean_moves:<15.1f}")
    print(f"{'Mean Locations':<25} {student.mean_locations:<15.1f} {reference.mean_locations:<15.1f}")
    print(f"{'Successful Trials':<25} {student.successful_trials:<15} {reference.successful_trials:<15}")

    # Performance ratio
    if reference.mean_score > 0:
        ratio = student.mean_score / reference.mean_score * 100
        print(f"\nStudent performance: {ratio:.1f}% of reference")


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate text adventure agent submissions",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Input options (mutually exclusive; required-ness is checked manually after
    # parsing so that --list-games can be used on its own)
    input_group = parser.add_mutually_exclusive_group()
    input_group.add_argument(
        "-s", "--submission",
        type=Path,
        help="Path to student submission directory",
    )
    input_group.add_argument(
        "--hf-space",
        type=str,
        help="Hugging Face Space ID (e.g., username/space-name)",
    )
    input_group.add_argument(
        "--submissions-dir",
        type=Path,
        help="Directory containing multiple submissions (for batch evaluation)",
    )

    # Evaluation parameters
    parser.add_argument(
        "-g", "--game",
        type=str,
        default="lostpig",
        help="Game to evaluate on (default: lostpig)",
    )
    parser.add_argument(
        "-t", "--trials",
        type=int,
        default=5,
        help="Number of trials to run (default: 5)",
    )
    parser.add_argument(
        "--max-steps",
        type=int,
        default=100,
        help="Maximum steps per trial (default: 100)",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Base random seed for reproducibility (default: 42)",
    )

    # Reference comparison
    parser.add_argument(
        "-r", "--reference",
        action="store_true",
        help="Also run reference agent (from examples/mcp_react) for comparison",
    )

    # Output options
    parser.add_argument(
        "-o", "--output",
        type=Path,
        help="Output file for results (JSON)",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Print detailed output",
    )
    parser.add_argument(
        "--list-games",
        action="store_true",
        help="List available games and exit",
    )

    args = parser.parse_args()

    # List games if requested
    if args.list_games:
        games = list_available_games()
        print(f"Available games ({len(games)}):")
        for game in games:
            print(f" - {game}")
        return

    # An input source is required for everything except --list-games
    if not (args.submission or args.hf_space or args.submissions_dir):
        parser.error("one of --submission, --hf-space, or --submissions-dir is required")

    # Validate game
    available_games = list_available_games()
    if args.game not in available_games:
        print(f"Error: Unknown game '{args.game}'")
        print(f"Available: {', '.join(available_games[:10])}...")
        sys.exit(1)

    # Handle HF Space input
    if args.hf_space:
        with tempfile.TemporaryDirectory() as tmpdir:
            submission_path = clone_hf_space(args.hf_space, Path(tmpdir) / "submission")
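            # The clone lives inside the TemporaryDirectory, so it is removed
            # automatically once the evaluation finishes.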

            if args.reference:
                student_result, reference_result = asyncio.run(
                    evaluate_with_reference(
                        submission_path=submission_path,
                        game=args.game,
                        num_trials=args.trials,
                        max_steps=args.max_steps,
                        base_seed=args.seed,
                        verbose=args.verbose,
                    )
                )
                print_comparison(student_result, reference_result)
            else:
                result = asyncio.run(
                    evaluate_submission(
                        submission_path=submission_path,
                        game=args.game,
                        num_trials=args.trials,
                        max_steps=args.max_steps,
                        base_seed=args.seed,
                        verbose=args.verbose,
                    )
                )
                print("\n" + result.summary_str())

    # Handle batch evaluation
    elif args.submissions_dir:
        results = asyncio.run(
            batch_evaluate(
                submissions_dir=args.submissions_dir,
                game=args.game,
                num_trials=args.trials,
                max_steps=args.max_steps,
                base_seed=args.seed,
                output_path=args.output,
                verbose=args.verbose,
            )
        )

        # Print leaderboard
        print("\n" + "=" * 60)
        print("LEADERBOARD")
        print("=" * 60)
        print(f"\n{'Rank':<6} {'Student':<30} {'Mean Score':<12} {'Std':<10}")
        print("-" * 58)
        for i, r in enumerate(results):
            print(f"{i+1:<6} {r.student_id:<30} {r.mean_score:<12.2f} {r.std_score:<10.2f}")

    # Handle single submission
    else:
        submission_path = args.submission
        if not submission_path.exists():
            print(f"Error: Submission path not found: {submission_path}")
            sys.exit(1)

        if args.reference:
            student_result, reference_result = asyncio.run(
                evaluate_with_reference(
                    submission_path=submission_path,
                    game=args.game,
                    num_trials=args.trials,
                    max_steps=args.max_steps,
                    base_seed=args.seed,
                    verbose=args.verbose,
                )
            )
            print_comparison(student_result, reference_result)

            # Save results if output specified
            if args.output:
                output_data = {
                    "evaluation_date": datetime.now().isoformat(),
                    "student": student_result.to_dict(),
                    "reference": reference_result.to_dict(),
                }
                with open(args.output, "w") as f:
                    json.dump(output_data, f, indent=2)
                print(f"\nResults saved to {args.output}")
        else:
            result = asyncio.run(
                evaluate_submission(
                    submission_path=submission_path,
                    game=args.game,
                    num_trials=args.trials,
                    max_steps=args.max_steps,
                    base_seed=args.seed,
                    verbose=args.verbose,
                )
            )
            print("\n" + result.summary_str())

            # Save results if output specified
            if args.output:
                with open(args.output, "w") as f:
                    json.dump(result.to_dict(), f, indent=2)
                print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()