File size: 3,268 Bytes
9faf143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
Example: Multi-seed evaluation without LLM
==========================================
Demonstrates how multi-seed evaluation works using a simple random policy.
No API key required.
"""

import random
from typing import Any, Dict, List

from grid_env.env import WarehouseFulfillmentEnv
from grid_env.graders import grade_episode


def random_policy(env: WarehouseFulfillmentEnv) -> str:
    """Pick one action at random from the environment's action space.

    Args:
        env: Environment whose ``action_space`` sequence is sampled.

    Returns:
        The element of ``env.action_space`` selected by ``random.choice``.
    """
    available_actions = env.action_space
    return random.choice(available_actions)


def run_task_with_seed(task_id: str, seed: int) -> Dict[str, Any]:
    """Run one episode of a task under a given seed, acting with the random policy.

    The environment is constructed and reset with ``task_id`` and ``seed``.
    Actions come from ``random_policy``, which draws from the module-level
    ``random`` generator, so the outcome also depends on that generator's
    state at call time, not only on ``seed``.

    Args:
        task_id: Identifier of the task, passed to the environment.
        seed: Seed passed to the environment constructor and to ``reset``.

    Returns:
        A dict with the keys ``task_id``, ``seed``, ``score`` (the
        ``grade_episode`` result rounded to 4 decimals), ``steps`` (the final
        state's ``step_count``) and ``success`` (the final state's
        ``success``).
    """
    env = WarehouseFulfillmentEnv(task_id=task_id, seed=seed)
    env.reset(task_id=task_id, seed=seed)

    # Only the termination flag drives the loop; the observation, reward and
    # info returned by step() are not used by the random policy.
    done = False
    while not done:
        _, _, done, _ = env.step(random_policy(env))

    final_state = env.state()
    score = grade_episode(final_state)

    return {
        "task_id": task_id,
        "seed": seed,
        "score": round(score, 4),
        "steps": final_state.step_count,
        "success": final_state.success,
    }


def evaluate_multiseed(task_id: str, seeds: List[int]) -> Dict[str, Any]:
    """Evaluate a task across multiple seeds and summarize the scores.

    Calls ``run_task_with_seed`` once per seed, in order, printing one line
    per run followed by a summary line.

    Args:
        task_id: Identifier of the task to evaluate.
        seeds: Seeds to evaluate; must contain at least one entry.

    Returns:
        A dict with ``task_id``, ``mean_score``, ``std_score`` (population
        standard deviation, i.e. divided by the number of scores),
        ``min_score``, ``max_score`` (each rounded to 4 decimals) and
        ``num_seeds``.

    Raises:
        ValueError: If ``seeds`` is empty.
    """
    if not seeds:
        # Fail early with a clear message instead of a ZeroDivisionError
        # when computing the mean of zero scores.
        raise ValueError("seeds must contain at least one seed")

    print(f"\nEvaluating {task_id} with seeds: {seeds}")

    scores = []
    for seed in seeds:
        result = run_task_with_seed(task_id, seed)
        scores.append(result["score"])
        print(
            f"  Seed {seed:>3}: score={result['score']:.4f}  "
            f"steps={result['steps']:>3}  success={result['success']}"
        )

    count = len(scores)
    mean_score = sum(scores) / count
    variance = sum((s - mean_score) ** 2 for s in scores) / count
    std_score = variance ** 0.5
    lowest = min(scores)
    highest = max(scores)

    print(
        f"  → Mean: {mean_score:.4f} ± {std_score:.4f}  "
        f"Min: {lowest:.4f}  Max: {highest:.4f}"
    )

    return {
        "task_id": task_id,
        "mean_score": round(mean_score, 4),
        "std_score": round(std_score, 4),
        "min_score": round(lowest, 4),
        "max_score": round(highest, 4),
        "num_seeds": len(seeds),
    }


if __name__ == "__main__":
    # Seed the module-level RNG once so the random policy's choices repeat
    # from one run of this script to the next.
    random.seed(42)

    # Environment seeds applied to every task, and the tasks to evaluate.
    eval_seeds = [7, 42, 123, 456, 789]
    tasks = ["easy_single_pick", "medium_multi_item", "obstacle_course"]

    heavy_rule = "=" * 60
    light_rule = "-" * 60

    print(heavy_rule)
    print("Multi-Seed Evaluation Demo (Random Policy)")
    print(heavy_rule)

    # One summary dict per task, in the order the tasks are listed.
    all_results = [evaluate_multiseed(name, eval_seeds) for name in tasks]

    print("\n" + heavy_rule)
    print("Summary")
    print(heavy_rule)
    print(f"{'Task':<30} {'Mean Score':>12} {'Std':>8} {'Min':>8} {'Max':>8}")
    print(light_rule)
    for row in all_results:
        print(
            f"{row['task_id']:<30} {row['mean_score']:>12.4f} {row['std_score']:>8.4f} "
            f"{row['min_score']:>8.4f} {row['max_score']:>8.4f}"
        )

    # Unweighted average of the per-task mean scores.
    overall_mean = sum(row["mean_score"] for row in all_results) / len(all_results)
    print(light_rule)
    print(f"{'Overall Mean':<30} {overall_mean:>12.4f}")

    print("\nNote: This uses a random policy for demonstration.")
    print("Real evaluation should use an LLM-based policy (see baseline.py or inference.py)")