# rl_code_fix_env/src/reward/reward.py
# Uploaded by Viraj0112 via huggingface_hub (commit 03a907a, verified).
# NOTE(review): the original lines here were HuggingFace file-viewer page
# residue, not Python code; converted to a comment so the module parses.
from ..reward.trace_scorer import score_trace
def compute_reward(test_score, trace_obj, code, steps_taken, max_steps, prev_test_score=0.0, last_action_empty=False):
    """
    Compute the reward for one code-fixing episode.

    The reward is a weighted sum of functional progress (90%) and
    reasoning-trace quality (10%), minus a small penalty when the test
    score regressed relative to the previous step.

    Args:
        test_score: Test execution score in [0.0, 1.0].
        trace_obj: TraceCollector object with action history, or None/falsy
            to skip the trace bonus entirely.
        code: Fixed code string (currently unused in this function; kept
            for interface compatibility with callers).
        steps_taken: Number of steps taken (currently unused).
        max_steps: Maximum steps allowed (currently unused).
        prev_test_score: Previous test score, used for the regression penalty.
        last_action_empty: Whether the last action was empty/no-op.

    Returns:
        Reward strictly inside the open interval (0.0, 1.0): the downstream
        validator rejects boundary values, so results are clamped to
        [_EPS, 1.0 - _EPS].
    """
    # Validator requires a score strictly > 0.0 and < 1.0, so clamp with a
    # small epsilon. (Defined once here; a duplicate redefinition before the
    # final clamp was removed.)
    _EPS = 1e-6

    # Empty/no-op actions get the minimal allowed reward, pushing the agent
    # toward meaningful edits.
    if last_action_empty:
        return _EPS

    # 1. Functional progress (90% weight) — the primary learning signal.
    functional_reward = float(test_score)

    # 1b. Regression penalty: discourage making things worse. The penalty is
    # proportional to how far the test score dropped (10% of the drop).
    regression_penalty = 0.0
    test_score_delta = test_score - prev_test_score
    if test_score_delta < 0:
        regression_penalty = abs(test_score_delta) * 0.1

    # 2. Reasoning quality (10% weight): bonus for a good reasoning trace.
    # max() guards against score_trace returning a negative value; the
    # conditional short-circuits so score_trace is never called on a falsy
    # trace_obj.
    trace_reward = max(0.0, score_trace(trace_obj) if trace_obj else 0.0)

    # Weighted sum — coefficients sum to 1.0 before penalties.
    reward = (
        0.9 * functional_reward
        + 0.1 * trace_reward
        - regression_penalty
    )

    # Clamp to the open interval (0.0, 1.0) — validator rejects boundaries.
    return max(_EPS, min(1.0 - _EPS, reward))