"""
Example: Multi-seed evaluation without LLM
==========================================

Demonstrates how multi-seed evaluation works using a simple random policy.
No API key required.
"""

import random
import statistics
from typing import Any, Dict, List, Optional

from grid_env.env import WarehouseFulfillmentEnv
from grid_env.graders import grade_episode


def random_policy(
    env: WarehouseFulfillmentEnv,
    rng: Optional[random.Random] = None,
) -> str:
    """Pick one action uniformly at random from ``env.action_space``.

    Args:
        env: Environment whose ``action_space`` is sampled.
        rng: Optional dedicated generator. When omitted, the module-level
            ``random`` generator is used.

    Returns:
        The chosen action.
    """
    chooser = random if rng is None else rng
    return chooser.choice(env.action_space)


def run_task_with_seed(task_id: str, seed: int) -> Dict[str, Any]:
    """Play one episode of ``task_id`` with the random policy and grade it.

    Args:
        task_id: Task identifier passed to the environment.
        seed: Seed passed to the environment constructor and ``reset``; it
            also determines the random policy's action stream.

    Returns:
        A dict with the keys ``task_id``, ``seed``, ``score`` (rounded to
        four decimals), ``steps`` and ``success``.
    """
    env = WarehouseFulfillmentEnv(task_id=task_id, seed=seed)
    env.reset(task_id=task_id, seed=seed)

    # Give the policy its own generator so the outcome for a (task, seed)
    # pair does not depend on how many random draws earlier episodes made.
    # A string seed is converted deterministically by ``random.Random``, and
    # mixing in the task id keeps the stream distinct from a plain int seed.
    policy_rng = random.Random(f"{task_id}:{seed}")

    done = False
    while not done:
        # Only the termination flag is needed; grading uses the final state.
        _, _, done, _ = env.step(random_policy(env, policy_rng))

    final_state = env.state()
    score = grade_episode(final_state)

    return {
        "task_id": task_id,
        "seed": seed,
        "score": round(score, 4),
        "steps": final_state.step_count,
        "success": final_state.success,
    }


def evaluate_multiseed(task_id: str, seeds: List[int]) -> Dict[str, Any]:
    """Evaluate a task across multiple seeds and summarize the scores.

    Prints one line per seed plus an aggregate line.

    Args:
        task_id: Task identifier to evaluate.
        seeds: Seeds to run, one episode each. Must not be empty.

    Returns:
        A dict with ``task_id``, ``mean_score``, ``std_score`` (population
        standard deviation), ``min_score``, ``max_score`` (all rounded to
        four decimals) and ``num_seeds``.

    Raises:
        ValueError: If ``seeds`` is empty, since no statistics can be
            computed.
    """
    if not seeds:
        raise ValueError("seeds must contain at least one seed")

    print(f"\nEvaluating {task_id} with seeds: {seeds}")

    results = []
    for seed in seeds:
        result = run_task_with_seed(task_id, seed)
        results.append(result)
        print(f" Seed {seed:>3}: score={result['score']:.4f} steps={result['steps']:>3} success={result['success']}")

    scores = [r["score"] for r in results]
    mean_score = statistics.fmean(scores)
    # Population (not sample) standard deviation: divide by N, not N - 1.
    std_score = statistics.pstdev(scores)
    lowest = min(scores)
    highest = max(scores)

    print(f" → Mean: {mean_score:.4f} ± {std_score:.4f} Min: {lowest:.4f} Max: {highest:.4f}")

    return {
        "task_id": task_id,
        "mean_score": round(mean_score, 4),
        "std_score": round(std_score, 4),
        "min_score": round(lowest, 4),
        "max_score": round(highest, 4),
        "num_seeds": len(seeds),
    }


def main() -> None:
    """Run the random-policy demo over a few tasks and print a summary table."""
    # Seed the global generator too, so anything that still draws from the
    # module-level ``random`` stays repeatable between runs.
    random.seed(42)

    # Evaluate a few tasks with multiple seeds
    tasks = ["easy_single_pick", "medium_multi_item", "obstacle_course"]
    eval_seeds = [7, 42, 123, 456, 789]

    print("=" * 60)
    print("Multi-Seed Evaluation Demo (Random Policy)")
    print("=" * 60)

    all_results = [evaluate_multiseed(task_id, eval_seeds) for task_id in tasks]

    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    print(f"{'Task':<30} {'Mean Score':>12} {'Std':>8} {'Min':>8} {'Max':>8}")
    print("-" * 60)

    for r in all_results:
        print(
            f"{r['task_id']:<30} {r['mean_score']:>12.4f} {r['std_score']:>8.4f} "
            f"{r['min_score']:>8.4f} {r['max_score']:>8.4f}"
        )

    # Unweighted mean of the per-task means (every task uses the same seeds).
    overall_mean = sum(r["mean_score"] for r in all_results) / len(all_results)
    print("-" * 60)
    print(f"{'Overall Mean':<30} {overall_mean:>12.4f}")

    print("\nNote: This uses a random policy for demonstration.")
    print("Real evaluation should use an LLM-based policy (see baseline.py or inference.py)")


if __name__ == "__main__":
    main()