Spaces:

sohambose98
/

mini-rl-env

Sleeping

App Files Files Community

mini-rl-env / tests /example_multiseed.py

sohambose98

updated multi seed evaluation; addon with deterministic graders

9faf143 about 2 months ago

raw

history blame contribute delete

3.27 kB

	"""
	Example: Multi-seed evaluation without LLM
	==========================================
	Demonstrates how multi-seed evaluation works using a simple random policy.
	No API key required.
	"""

	import random
	from typing import Dict, List

	from grid_env.env import WarehouseFulfillmentEnv
	from grid_env.graders import grade_episode


	def random_policy(env: WarehouseFulfillmentEnv) -> str:
	    """Pick one entry of ``env.action_space`` at random (demo policy).

	    The draw comes from the module-level ``random`` generator, so the result
	    depends on that generator's current state.
	    """
	    available_actions = env.action_space
	    return random.choice(available_actions)


	def run_task_with_seed(task_id: str, seed: int) -> Dict[str, float]:
	    """Play one episode of ``task_id`` under ``seed`` with the random policy.

	    The environment is built and then reset with the same task id and seed,
	    stepped with ``random_policy`` until a step reports ``done``, and the
	    final state is handed to ``grade_episode``.

	    Returns a dict with the keys ``task_id``, ``seed``, ``score`` (rounded to
	    four decimals), ``steps`` and ``success``.
	    """
	    warehouse = WarehouseFulfillmentEnv(task_id=task_id, seed=seed)
	    warehouse.reset(task_id=task_id, seed=seed)

	    # Only the termination flag of each transition is used here; at least
	    # one step is always taken before it is checked.
	    while True:
	        chosen_action = random_policy(warehouse)
	        _, _, finished, _ = warehouse.step(chosen_action)
	        if finished:
	            break

	    end_state = warehouse.state()
	    episode_score = grade_episode(end_state)

	    return {
	        "task_id": task_id,
	        "seed": seed,
	        "score": round(episode_score, 4),
	        "steps": end_state.step_count,
	        "success": end_state.success,
	    }


	def evaluate_multiseed(task_id: str, seeds: List[int]) -> Dict:
	    """Run ``task_id`` once per seed and summarise the resulting scores.

	    Each seed is evaluated with ``run_task_with_seed``. One line is printed
	    per seed, followed by a summary line with mean, standard deviation,
	    minimum and maximum.

	    Args:
	        task_id: Identifier of the task to evaluate.
	        seeds: Seeds to evaluate; must contain at least one entry.

	    Returns:
	        A dict with ``task_id``, ``mean_score``, ``std_score`` (population
	        standard deviation), ``min_score``, ``max_score`` (each rounded to
	        four decimals) and ``num_seeds``.

	    Raises:
	        ValueError: If ``seeds`` is empty.
	    """
	    if not seeds:
	        # Without this guard the mean below fails with a bare
	        # ZeroDivisionError that does not say what went wrong.
	        raise ValueError("seeds must contain at least one seed")

	    print(f"\nEvaluating {task_id} with seeds: {seeds}")

	    results = []
	    for seed in seeds:
	        result = run_task_with_seed(task_id, seed)
	        results.append(result)
	        print(f" Seed {seed:>3}: score={result['score']:.4f} steps={result['steps']:>3} success={result['success']}")

	    scores = [r["score"] for r in results]
	    mean_score = sum(scores) / len(scores)
	    # Population standard deviation: divide by N, not N - 1.
	    variance = sum((s - mean_score) ** 2 for s in scores) / len(scores)
	    std_score = variance ** 0.5

	    print(f" → Mean: {mean_score:.4f} ± {std_score:.4f} Min: {min(scores):.4f} Max: {max(scores):.4f}")

	    return {
	        "task_id": task_id,
	        "mean_score": round(mean_score, 4),
	        "std_score": round(std_score, 4),
	        "min_score": round(min(scores), 4),
	        "max_score": round(max(scores), 4),
	        "num_seeds": len(seeds),
	    }


	def main() -> None:
	    """Run the random-policy multi-seed demo and print a summary table.

	    Evaluates a fixed list of tasks over a fixed list of seeds with
	    ``evaluate_multiseed``, then prints one row per task and the mean of the
	    per-task mean scores.
	    """
	    # Seed the global generator that random_policy draws from, for
	    # reproducibility.
	    random.seed(42)

	    # Evaluate a few tasks with multiple seeds
	    tasks = ["easy_single_pick", "medium_multi_item", "obstacle_course"]
	    eval_seeds = [7, 42, 123, 456, 789]

	    print("=" * 60)
	    print("Multi-Seed Evaluation Demo (Random Policy)")
	    print("=" * 60)

	    all_results = [evaluate_multiseed(task_id, eval_seeds) for task_id in tasks]

	    print("\n" + "=" * 60)
	    print("Summary")
	    print("=" * 60)
	    print(f"{'Task':<30} {'Mean Score':>12} {'Std':>8} {'Min':>8} {'Max':>8}")
	    print("-" * 60)
	    for r in all_results:
	        print(
	            f"{r['task_id']:<30} {r['mean_score']:>12.4f} {r['std_score']:>8.4f} "
	            f"{r['min_score']:>8.4f} {r['max_score']:>8.4f}"
	        )

	    # Mean of the (already rounded) per-task means.
	    overall_mean = sum(r["mean_score"] for r in all_results) / len(all_results)
	    print("-" * 60)
	    print(f"{'Overall Mean':<30} {overall_mean:>12.4f}")

	    print("\nNote: This uses a random policy for demonstration.")
	    print("Real evaluation should use an LLM-based policy (see baseline.py or inference.py)")


	if __name__ == "__main__":
	    main()