Spaces:
Sleeping
Sleeping
| """ | |
| Example: Multi-seed evaluation without LLM | |
| ========================================== | |
| Demonstrates how multi-seed evaluation works using a simple random policy. | |
| No API key required. | |
| """ | |
| import random | |
| from typing import Dict, List | |
| from grid_env.env import WarehouseFulfillmentEnv | |
| from grid_env.graders import grade_episode | |
def random_policy(env: WarehouseFulfillmentEnv) -> str:
    """Pick one entry of ``env.action_space`` at random.

    The choice is drawn from the module-level ``random`` generator, so it is
    affected by any prior ``random.seed`` call.
    """
    candidates = env.action_space
    return random.choice(candidates)
def run_task_with_seed(task_id: str, seed: int) -> Dict[str, float]:
    """Play one episode of ``task_id`` under ``seed`` with the random policy.

    The environment is constructed and then reset with the same task id and
    seed. Random actions are applied until ``env.step`` reports that the
    episode is done, after which the final state is graded.

    Returns:
        A dict with ``task_id``, ``seed``, ``score`` (the grader output
        rounded to 4 decimal places), ``steps`` and ``success`` (the last two
        read from the final state).
    """
    env = WarehouseFulfillmentEnv(task_id=task_id, seed=seed)
    env.reset(task_id=task_id, seed=seed)

    while True:
        # Only the termination flag drives the loop; the observation, reward
        # and info values returned by the step are not used by this demo.
        _obs, _reward, finished, _info = env.step(random_policy(env))
        if finished:
            break

    end_state = env.state()
    summary = dict(task_id=task_id, seed=seed)
    summary["score"] = round(grade_episode(end_state), 4)
    summary["steps"] = end_state.step_count
    summary["success"] = end_state.success
    return summary
def evaluate_multiseed(task_id: str, seeds: List[int]) -> Dict:
    """Evaluate a task across multiple seeds and summarize the scores.

    Runs ``run_task_with_seed`` once per seed (in the given order), printing a
    line per run and a final summary line.

    Args:
        task_id: Identifier of the task to evaluate.
        seeds: Seeds to run; must contain at least one entry.

    Returns:
        A dict with ``task_id``, ``mean_score``, ``std_score`` (population
        standard deviation, i.e. divided by the number of scores),
        ``min_score``, ``max_score`` (all rounded to 4 decimal places) and
        ``num_seeds``.

    Raises:
        ValueError: If ``seeds`` is empty. Mean, min and max are undefined
            without at least one score, so fail early with a clear message
            instead of a ZeroDivisionError after work has started.
    """
    if not seeds:
        raise ValueError("seeds must contain at least one seed")

    print(f"\nEvaluating {task_id} with seeds: {seeds}")
    results = []
    for seed in seeds:
        result = run_task_with_seed(task_id, seed)
        results.append(result)
        print(f" Seed {seed:>3}: score={result['score']:.4f} steps={result['steps']:>3} success={result['success']}")

    scores = [r["score"] for r in results]
    count = len(scores)
    mean_score = sum(scores) / count
    # Population standard deviation: squared deviations averaged over count.
    std_score = (sum((s - mean_score)**2 for s in scores) / count)**0.5
    lowest, highest = min(scores), max(scores)
    print(f" → Mean: {mean_score:.4f} ± {std_score:.4f} Min: {lowest:.4f} Max: {highest:.4f}")

    return {
        "task_id": task_id,
        "mean_score": round(mean_score, 4),
        "std_score": round(std_score, 4),
        "min_score": round(lowest, 4),
        "max_score": round(highest, 4),
        "num_seeds": len(seeds),
    }
if __name__ == "__main__":
    # Seed the module-level RNG so the random policy's choices repeat between
    # runs of this script.
    random.seed(42)

    # Tasks to evaluate, and the seeds every task is run with.
    tasks = ["easy_single_pick", "medium_multi_item", "obstacle_course"]
    eval_seeds = [7, 42, 123, 456, 789]

    heavy_rule = "="*60
    light_rule = "-"*60

    print(heavy_rule)
    print("Multi-Seed Evaluation Demo (Random Policy)")
    print(heavy_rule)

    # One summary dict per task, collected in task order.
    all_results = [evaluate_multiseed(name, eval_seeds) for name in tasks]

    print("\n" + heavy_rule)
    print("Summary")
    print(heavy_rule)
    print(f"{'Task':<30} {'Mean Score':>12} {'Std':>8} {'Min':>8} {'Max':>8}")
    print(light_rule)

    for row in all_results:
        left = f"{row['task_id']:<30} {row['mean_score']:>12.4f} {row['std_score']:>8.4f} "
        right = f"{row['min_score']:>8.4f} {row['max_score']:>8.4f}"
        print(left + right)

    # Unweighted average of the per-task mean scores.
    overall_mean = sum(row["mean_score"] for row in all_results) / len(all_results)
    print(light_rule)
    print(f"{'Overall Mean':<30} {overall_mean:>12.4f}")

    print("\nNote: This uses a random policy for demonstration.")
    print("Real evaluation should use an LLM-based policy (see baseline.py or inference.py)")