File size: 699 Bytes
9bd4a93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
from typing import Any, Dict, List

def grade(trajectory: List[Dict[str, Any]], **kwargs) -> float:
    """
    Generic OpenEnv grader.

    Extract the final reward from the last step of the agent's trajectory
    and clamp it to the closed interval [0.01, 0.99].

    The reward is looked up in this order:
      1. ``last_step["reward"]``
      2. ``last_step["observation"]["reward"]`` (only if the observation
         is a dict)

    A reward that is missing, ``None``, non-numeric, or NaN is treated as
    absent, so the grader always returns a valid bounded score instead of
    raising.

    Args:
        trajectory: Sequence of step dicts; only the last one is inspected.
        **kwargs: Accepted for interface compatibility and ignored.

    Returns:
        The final reward clamped to [0.01, 0.99], or 0.01 when no usable
        reward can be found.
    """
    import math

    floor, ceiling = 0.01, 0.99

    if not trajectory:
        return floor

    last_step = trajectory[-1]
    if not isinstance(last_step, dict):
        return floor

    # Prefer the top-level reward; a None value counts as "not provided"
    # so the observation-level reward can still be used.
    raw_reward = last_step.get("reward")
    if raw_reward is None:
        observation = last_step.get("observation")
        # Guard against string observations, where `in` would be a
        # substring test and `.get` would not exist.
        if isinstance(observation, dict):
            raw_reward = observation.get("reward")

    try:
        reward = float(raw_reward)
    except (TypeError, ValueError):
        # None, non-numeric strings, or arbitrary objects.
        return floor

    # NaN compares false with everything, so min/max would let it escape
    # the clamp below.
    if math.isnan(reward):
        return floor

    # Ensure it's bounded between 0.01 and 0.99
    return min(max(reward, floor), ceiling)