from typing import Dict

# Reward-shaping constants used by compute_reward.
_IMPROVEMENT_WEIGHT = 0.5       # scales the per-step score improvement
_COMPLETION_BONUS = 0.5         # added when the episode ends with a passing score
_COMPLETION_THRESHOLD = 0.8     # grader score required to earn the bonus
_EXTRA_STEP_PENALTY = 0.02      # deducted for every step beyond max_steps
_INVALID_ACTION_REWARD = -0.1   # returned as-is for an invalid action


def compute_reward(
    grader_score: float,
    previous_score: float,
    step_number: int,
    max_steps: int,
    is_done: bool,
    action_valid: bool,
) -> float:
    """Compute the shaped reward for a single step.

    The reward is assembled from these components:

    - Invalid action: if ``action_valid`` is false, ``-0.1`` is returned
      immediately. This value is NOT clamped and no other component applies.
    - Partial credit: ``0.5 * max(0, grader_score - previous_score)``; a score
      that drops earns nothing rather than a negative amount. (Stays within
      0.0-0.5 provided both scores lie in [0, 1] -- assumed, not enforced.)
    - Completion bonus: ``+0.5`` when ``is_done`` is true and
      ``grader_score >= 0.8``.
    - Step penalty: ``-0.02`` for each step by which ``step_number`` exceeds
      ``max_steps``.

    For valid actions the sum is clamped to ``[0.0, 1.0]``.

    Args:
        grader_score: Grader score after the current step.
        previous_score: Grader score before the current step.
        step_number: Index of the current step.
        max_steps: Step count above which the per-step penalty applies.
        is_done: Whether the episode has finished with this step.
        action_valid: Whether the action taken this step was valid.

    Returns:
        ``-0.1`` for an invalid action, otherwise a value in ``[0.0, 1.0]``.
    """
    # Invalid actions short-circuit: the penalty must stay negative, so it
    # deliberately bypasses the clamp applied at the end.
    if not action_valid:
        return _INVALID_ACTION_REWARD

    # Only upward movement of the score is rewarded.
    improvement = max(0.0, grader_score - previous_score)
    reward = improvement * _IMPROVEMENT_WEIGHT

    if is_done and grader_score >= _COMPLETION_THRESHOLD:
        reward += _COMPLETION_BONUS

    if step_number > max_steps:
        reward -= _EXTRA_STEP_PENALTY * (step_number - max_steps)

    # Clamp so penalties cannot push a valid step below zero and bonuses
    # cannot exceed one.
    return max(0.0, min(1.0, reward))