from ..reward.trace_scorer import score_trace

# Small epsilon keeping rewards strictly inside (0.0, 1.0): the downstream
# validator rejects scores that sit exactly on either boundary.
_EPS = 1e-6


def compute_reward(test_score, trace_obj, code, steps_taken, max_steps,
                   prev_test_score=0.0, last_action_empty=False):
    """
    Compute the reward for one step of a code-fixing episode.

    The reward is a weighted sum of functional progress (test score, 90%)
    and reasoning-trace quality (10%), minus a small penalty when the test
    score regressed relative to the previous step.

    Args:
        test_score: Test execution score in [0.0, 1.0].
        trace_obj: TraceCollector object with action history (may be falsy,
            in which case the trace bonus is 0.0).
        code: Fixed code string for quality evaluation (currently unused;
            kept for interface compatibility).
        steps_taken: Number of steps taken (currently unused; kept for
            interface compatibility).
        max_steps: Maximum steps allowed (currently unused; kept for
            interface compatibility).
        prev_test_score: Previous test score, used for the regression penalty.
        last_action_empty: Whether the last action was empty/no-op.

    Returns:
        Reward score strictly inside the open interval (0.0, 1.0); values
        are clamped to [_EPS, 1.0 - _EPS] because the validator rejects
        boundary values.
    """
    # Empty/no-op actions receive the minimal legal reward: this discourages
    # stalling while still satisfying the validator's "strictly > 0.0" rule.
    if last_action_empty:
        return _EPS

    # 1. Functional progress (90% weight) — the primary learning signal.
    functional_reward = float(test_score)

    # 1b. Regression penalty: discourage making the tests worse. Penalize
    # proportionally to the size of the drop (10% of the negative delta).
    test_score_delta = test_score - prev_test_score
    regression_penalty = 0.1 * abs(test_score_delta) if test_score_delta < 0 else 0.0

    # 2. Reasoning quality (10% weight) — bonus for a good reasoning trace.
    # Defensively clamped to non-negative so the trace term can only help.
    trace_reward = max(0.0, score_trace(trace_obj) if trace_obj else 0.0)

    # Weighted sum — the coefficients sum to 1.0 before the penalty.
    reward = 0.9 * functional_reward + 0.1 * trace_reward - regression_penalty

    # Clamp to the open interval (0.0, 1.0) — validator rejects boundaries.
    return max(_EPS, min(1.0 - _EPS, reward))