Spaces:
Sleeping
Sleeping
| from typing import Dict | |
def compute_reward(
    grader_score: float,
    previous_score: float,
    step_number: int,
    max_steps: int,
    is_done: bool,
    action_valid: bool,
) -> float:
    """Compute the shaped reward for a single step.

    The reward is assembled from the following components:

    - Invalid action penalty: if ``action_valid`` is false, ``-0.1`` is
      returned immediately. This is the only path that bypasses the final
      clamp and can therefore produce a negative reward.
    - Partial credit: half of the score improvement over ``previous_score``.
      A score that did not improve (or regressed) contributes nothing.
    - Completion bonus: ``+0.5`` when ``is_done`` is true and
      ``grader_score >= 0.8``.
    - Step penalty: ``-0.02`` for every step by which ``step_number``
      exceeds ``max_steps``.

    Args:
        grader_score: Grader score after the current step.
        previous_score: Grader score before the current step.
        step_number: Index of the current step.
        max_steps: Step count above which the step penalty applies.
        is_done: Whether the episode has finished.
        action_valid: Whether the submitted action was valid.

    Returns:
        ``-0.1`` for an invalid action; otherwise the sum of the components
        above, clamped to ``[0.0, 1.0]``.
    """
    if not action_valid:
        # Returned unclamped on purpose: clamping to 0.0 would make an
        # invalid action indistinguishable from a valid no-progress step.
        return -0.1

    # Only positive progress earns credit; regressions are floored at zero.
    improvement = max(0.0, grader_score - previous_score)
    reward = improvement * 0.5

    if is_done and grader_score >= 0.8:
        reward += 0.5

    if step_number > max_steps:
        reward -= 0.02 * (step_number - max_steps)

    return max(0.0, min(1.0, reward))