import math
from typing import Any, Dict, List


def grade(trajectory: List[Dict[str, Any]], **kwargs) -> float:
    """Generic OpenEnv grader: score a trajectory by its final reward.

    Only the last step of the trajectory is inspected. The reward is read
    from the step's top-level ``"reward"`` key; if that is absent or
    unusable, the ``"reward"`` key of the step's ``"observation"`` mapping
    is tried instead.

    Args:
        trajectory: Sequence of step dictionaries produced by the agent.
        **kwargs: Accepted for call-site compatibility and ignored.

    Returns:
        The final reward clamped to the closed interval [0.01, 0.99].
        The lower bound 0.01 is returned when the trajectory is empty or
        when no usable reward can be found (missing, ``None``, NaN, or not
        convertible to ``float``).
    """
    lower, upper = 0.01, 0.99

    if not trajectory:
        return lower

    last_step = trajectory[-1]
    if not isinstance(last_step, dict):
        # Malformed step: nothing to look up, so fall back to the minimum.
        return lower

    # Candidates in priority order: the step's own reward first, then the
    # reward nested in the observation.
    candidates = [last_step.get("reward")]
    observation = last_step.get("observation")
    # Guard on dict: ``"reward" in <str>`` would be a substring test and
    # ``"reward" in None`` would raise.
    if isinstance(observation, dict):
        candidates.append(observation.get("reward"))

    for raw in candidates:
        if raw is None:
            # A key that is present but holds None carries no reward.
            continue
        try:
            reward = float(raw)
        except (TypeError, ValueError):
            continue
        if math.isnan(reward):
            # NaN would slip through min/max unchanged, so reject it here.
            continue
        return min(max(reward, lower), upper)

    return lower