from ..reward.trace_scorer import score_trace

# Validators reject scores of exactly 0.0 or 1.0, so clamp with a small epsilon.
_EPS = 1e-6


def compute_reward(test_score, trace_obj, code, steps_taken, max_steps,
                   prev_test_score=0.0, last_action_empty=False):
    """
    Compute the reward for a code-fixing episode.

    Args:
        test_score: Test execution score in [0.0, 1.0].
        trace_obj: TraceCollector object with the action history.
        code: Fixed code string, used for quality evaluation.
        steps_taken: Number of steps taken so far.
        max_steps: Maximum number of steps allowed.
        prev_test_score: Previous test score, used for the regression penalty.
        last_action_empty: Whether the last action was empty or a no-op.

    Returns:
        Reward score in the open interval (0.0, 1.0).
    """
    # If the last action was empty or a no-op, return the minimal reward to
    # encourage meaningful actions. The validator requires a score strictly
    # greater than 0.0, so return a small epsilon rather than 0.0.
    if last_action_empty:
        return _EPS

    # 1. Functional progress (90% weight) - the primary signal.
    functional_reward = float(test_score)

    # 1b. Regression penalty: penalize drops in the test score so the agent
    # is discouraged from making things worse.
    test_score_delta = test_score - prev_test_score
    regression_penalty = 0.0
    if test_score_delta < 0:
        # Penalize proportionally to how far the score dropped (10% of the drop).
        regression_penalty = abs(test_score_delta) * 0.1

    # 2. Reasoning quality (10% weight): bonus for a good reasoning trace,
    # clamped to be non-negative.
    trace_reward = max(0.0, score_trace(trace_obj) if trace_obj else 0.0)

    # Weighted sum - the coefficients sum to 1.0 before penalties.
    reward = (
        0.9 * functional_reward
        + 0.1 * trace_reward
        - regression_penalty
    )

    # Clamp to the open interval (0.0, 1.0): the validator rejects boundary values.
    return max(_EPS, min(1.0 - _EPS, reward))
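To illustrate the weighting, regression penalty, and epsilon clamping in isolation, here is a hypothetical stand-alone sketch. The `score_trace` stub below is an assumption made only so the demo runs without the real `..reward.trace_scorer` module; the actual scorer's behavior may differ.

```python
# Stand-alone demo of compute_reward's weighting and clamping.
# score_trace is STUBBED here; the real one lives in ..reward.trace_scorer.

_EPS = 1e-6


def score_trace(trace_obj):
    # Stub assumption: any non-empty trace earns a flat 0.5 quality score.
    return 0.5 if trace_obj else 0.0


def compute_reward(test_score, trace_obj, code, steps_taken, max_steps,
                   prev_test_score=0.0, last_action_empty=False):
    if last_action_empty:
        return _EPS  # floor reward for a no-op action
    functional_reward = float(test_score)
    test_score_delta = test_score - prev_test_score
    regression_penalty = (abs(test_score_delta) * 0.1
                          if test_score_delta < 0 else 0.0)
    trace_reward = max(0.0, score_trace(trace_obj) if trace_obj else 0.0)
    reward = 0.9 * functional_reward + 0.1 * trace_reward - regression_penalty
    return max(_EPS, min(1.0 - _EPS, reward))


# All tests pass, decent trace: 0.9 * 1.0 + 0.1 * 0.5 = 0.95
print(compute_reward(1.0, ["step"], code="", steps_taken=3, max_steps=10))

# Score regressed from 0.8 to 0.5: 0.9 * 0.5 + 0.1 * 0.5 - 0.3 * 0.1 = 0.47
print(compute_reward(0.5, ["step"], code="", steps_taken=3, max_steps=10,
                     prev_test_score=0.8))

# Everything failed, no trace: raw reward is 0.0, clamped up to _EPS.
print(compute_reward(0.0, None, code="", steps_taken=1, max_steps=10))
```

Note that `steps_taken`, `max_steps`, and `code` are accepted but unused in the arithmetic shown; the 90/10 split between test results and trace quality is the entire signal, with the epsilon clamp keeping every return value strictly inside (0.0, 1.0).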