ragavrida committed on
Commit
fdc7ded
·
1 Parent(s): 320d51b

fix: clamp all scores strictly to (0, 1) — no 0.0 or 1.0

Browse files

- graders: episode_score returns 0.01 for empty, clamps to [0.01, 0.99]
- env/base: terminal_step and invalid action use 0.01 not 0.0/-0.1
- validate: updated local check to match strict (0,1) requirement

env/base.py CHANGED
@@ -314,8 +314,8 @@ class CodeReviewEnv:
314
  else:
315
  # Invalid action type for hard task
316
  reward = Reward(
317
- value=-0.1,
318
- breakdown={"step_reward": 0.0, "invalid_action_penalty": -0.1},
319
  reason=f"Invalid action_type '{action.action_type}' for hard task.",
320
  )
321
  return reward, grader_info
@@ -387,8 +387,8 @@ class CodeReviewEnv:
387
  def _terminal_step(self) -> Tuple[Observation, Reward, bool, Dict]:
388
  """Handle step() calls after episode is done."""
389
  reward = Reward(
390
- value=0.0,
391
- breakdown={"step_reward": 0.0},
392
  reason="Episode already done.",
393
  )
394
  return self.current_obs, reward, True, {
 
314
  else:
315
  # Invalid action type for hard task
316
  reward = Reward(
317
+ value=0.01,
318
+ breakdown={"step_reward": 0.01, "invalid_action_penalty": -0.01},
319
  reason=f"Invalid action_type '{action.action_type}' for hard task.",
320
  )
321
  return reward, grader_info
 
387
  def _terminal_step(self) -> Tuple[Observation, Reward, bool, Dict]:
388
  """Handle step() calls after episode is done."""
389
  reward = Reward(
390
+ value=0.01,
391
+ breakdown={"step_reward": 0.01},
392
  reason="Episode already done.",
393
  )
394
  return self.current_obs, reward, True, {
graders/grader_easy.py CHANGED
@@ -226,7 +226,8 @@ class EasyGrader:
226
 
227
  This is the aggregate metric for the easy task.
228
  Each step reward is already in [-1, 1], so mean is also in [-1, 1].
 
229
  """
230
  if not step_rewards:
231
- return 0.0
232
- return sum(step_rewards) / len(step_rewards)
 
226
 
227
  This is the aggregate metric for the easy task.
228
  Each step reward is already in [-1, 1], so mean is also in [-1, 1].
229
+ Clamped to (0, 1) exclusive as required by the validator.
230
  """
231
  if not step_rewards:
232
+ return 0.01
233
+ return max(0.01, min(0.99, sum(step_rewards) / len(step_rewards)))
graders/grader_hard.py CHANGED
@@ -311,8 +311,9 @@ class HardGrader:
311
  return result
312
 
313
  def episode_score(self, step_rewards: List[float]) -> float:
314
- """Compute episode-level score with accumulated penalties."""
 
315
  if not step_rewards:
316
- return 0.0
317
  mean_score = sum(step_rewards) / len(step_rewards)
318
  return max(0.01, min(0.99, mean_score + self.episode_penalties))
 
311
  return result
312
 
313
  def episode_score(self, step_rewards: List[float]) -> float:
314
+ """Compute episode-level score with accumulated penalties.
315
+ Clamped to (0, 1) exclusive as required by the validator."""
316
  if not step_rewards:
317
+ return 0.01
318
  mean_score = sum(step_rewards) / len(step_rewards)
319
  return max(0.01, min(0.99, mean_score + self.episode_penalties))
graders/grader_medium.py CHANGED
@@ -255,7 +255,8 @@ class MediumGrader:
255
  return info
256
 
257
  def episode_score(self, step_rewards: List[float]) -> float:
258
- """Compute episode-level score as mean of step rewards."""
 
259
  if not step_rewards:
260
- return 0.0
261
- return sum(step_rewards) / len(step_rewards)
 
255
  return info
256
 
257
  def episode_score(self, step_rewards: List[float]) -> float:
258
+ """Compute episode-level score as mean of step rewards.
259
+ Clamped to (0, 1) exclusive as required by the validator."""
260
  if not step_rewards:
261
+ return 0.01
262
+ return max(0.01, min(0.99, sum(step_rewards) / len(step_rewards)))
validate.py CHANGED
@@ -75,7 +75,7 @@ def validate():
75
  obs, reward, done, info = env.step(action)
76
  results.append(check(
77
  "step() with wrong action type doesn't crash",
78
- isinstance(reward, Reward) and reward.value <= 0,
79
  ))
80
  except Exception as e:
81
  results.append(check("step() with invalid action doesn't crash", False, str(e)))
 
75
  obs, reward, done, info = env.step(action)
76
  results.append(check(
77
  "step() with wrong action type doesn't crash",
78
+ isinstance(reward, Reward) and 0 < reward.value < 1,
79
  ))
80
  except Exception as e:
81
  results.append(check("step() with invalid action doesn't crash", False, str(e)))