mini-rl-env / tests /example_multiseed.py
sohambose98's picture
updated multi-seed evaluation; add-on with deterministic graders
9faf143
"""
Example: Multi-seed evaluation without LLM
==========================================
Demonstrates how multi-seed evaluation works using a simple random policy.
No API key required.
"""
import random
from typing import Dict, List
from grid_env.env import WarehouseFulfillmentEnv
from grid_env.graders import grade_episode
def random_policy(env: WarehouseFulfillmentEnv) -> str:
    """Return one action picked at random from ``env.action_space``.

    The pick is made with the module-level ``random`` generator, so it follows
    whatever global seed state is active at call time.
    """
    available_actions = env.action_space
    return random.choice(available_actions)
def run_task_with_seed(task_id: str, seed: int) -> Dict[str, float]:
    """Play one episode of ``task_id`` under ``seed`` using the random policy.

    A fresh ``WarehouseFulfillmentEnv`` is built and reset with the given task
    and seed, then stepped with ``random_policy`` actions until ``env.step``
    reports the episode as done. The final state is graded by
    ``grade_episode``.

    Note: this function passes ``seed`` to the environment only; it does not
    reseed the ``random`` module that the policy draws from.

    Returns:
        A dict with keys ``task_id``, ``seed``, ``score`` (rounded to 4
        decimals), ``steps`` (the final state's ``step_count``) and
        ``success`` (the final state's ``success``).
    """
    env = WarehouseFulfillmentEnv(task_id=task_id, seed=seed)
    env.reset(task_id=task_id, seed=seed)

    # Only the termination flag drives the loop; the observation, reward and
    # info returned by each step are not used by this demo.
    while True:
        _obs, _reward, finished, _info = env.step(random_policy(env))
        if finished:
            break

    episode = env.state()
    graded = grade_episode(episode)
    return {
        "task_id": task_id,
        "seed": seed,
        "score": round(graded, 4),
        "steps": episode.step_count,
        "success": episode.success,
    }
def evaluate_multiseed(task_id: str, seeds: List[int]) -> Dict:
    """Evaluate ``task_id`` once per seed and summarise the scores.

    Each seed is run through ``run_task_with_seed``; a per-seed line and a
    summary line are printed as a side effect.

    Args:
        task_id: Identifier of the task to evaluate.
        seeds: Seeds to run, one episode per seed. Must be non-empty.

    Returns:
        A dict with keys ``task_id``, ``mean_score``, ``std_score``
        (population standard deviation), ``min_score``, ``max_score`` (all
        rounded to 4 decimals) and ``num_seeds``.

    Raises:
        ValueError: If ``seeds`` is empty, since no mean/min/max exists.
    """
    # Fail fast with a clear message instead of a ZeroDivisionError further
    # down. len() rather than truthiness keeps any sized sequence working.
    if len(seeds) == 0:
        raise ValueError("seeds must contain at least one seed")

    print(f"\nEvaluating {task_id} with seeds: {seeds}")
    results = []
    for seed in seeds:
        result = run_task_with_seed(task_id, seed)
        results.append(result)
        print(f" Seed {seed:>3}: score={result['score']:.4f} steps={result['steps']:>3} success={result['success']}")

    scores = [r["score"] for r in results]
    count = len(scores)
    mean_score = sum(scores) / count
    # Population (not sample) standard deviation: divide by N.
    variance = sum((s - mean_score) ** 2 for s in scores) / count
    std_score = variance ** 0.5
    lowest, highest = min(scores), max(scores)
    print(f" → Mean: {mean_score:.4f} ± {std_score:.4f} Min: {lowest:.4f} Max: {highest:.4f}")

    return {
        "task_id": task_id,
        "mean_score": round(mean_score, 4),
        "std_score": round(std_score, 4),
        "min_score": round(lowest, 4),
        "max_score": round(highest, 4),
        "num_seeds": len(seeds),
    }
if __name__ == "__main__":
    # Seed the global RNG so the random policy's picks are repeatable.
    random.seed(42)

    # Tasks to demo and the environment seeds each one is run with.
    eval_seeds = [7, 42, 123, 456, 789]
    tasks = ["easy_single_pick", "medium_multi_item", "obstacle_course"]

    heavy_rule = "=" * 60
    light_rule = "-" * 60

    print(heavy_rule)
    print("Multi-Seed Evaluation Demo (Random Policy)")
    print(heavy_rule)

    # evaluate_multiseed prints its own per-seed progress as it runs.
    all_results = [evaluate_multiseed(name, eval_seeds) for name in tasks]

    print("\n" + heavy_rule)
    print("Summary")
    print(heavy_rule)
    print(f"{'Task':<30} {'Mean Score':>12} {'Std':>8} {'Min':>8} {'Max':>8}")
    print(light_rule)
    for row in all_results:
        print(
            f"{row['task_id']:<30} {row['mean_score']:>12.4f} {row['std_score']:>8.4f} "
            f"{row['min_score']:>8.4f} {row['max_score']:>8.4f}"
        )

    # Unweighted average of the per-task mean scores.
    overall_mean = sum(row["mean_score"] for row in all_results) / len(all_results)
    print(light_rule)
    print(f"{'Overall Mean':<30} {overall_mean:>12.4f}")
    print("\nNote: This uses a random policy for demonstration.")
    print("Real evaluation should use an LLM-based policy (see baseline.py or inference.py)")