Spaces:
Sleeping
Sleeping
fix: clamp all scores strictly to (0, 1) — no 0.0 or 1.0
Browse files- graders: episode_score returns 0.01 for empty, clamps to [0.01, 0.99]
- env/base: terminal_step and invalid action use 0.01 not 0.0/-0.1
- validate: updated local check to match strict (0,1) requirement
- env/base.py +4 -4
- graders/grader_easy.py +3 -2
- graders/grader_hard.py +3 -2
- graders/grader_medium.py +4 -3
- validate.py +1 -1
env/base.py
CHANGED
|
@@ -314,8 +314,8 @@ class CodeReviewEnv:
|
|
| 314 |
else:
|
| 315 |
# Invalid action type for hard task
|
| 316 |
reward = Reward(
|
| 317 |
-
value=-0.1,
|
| 318 |
-
breakdown={"step_reward": 0.0, "invalid_action_penalty": -0.1},
|
| 319 |
reason=f"Invalid action_type '{action.action_type}' for hard task.",
|
| 320 |
)
|
| 321 |
return reward, grader_info
|
|
@@ -387,8 +387,8 @@ class CodeReviewEnv:
|
|
| 387 |
def _terminal_step(self) -> Tuple[Observation, Reward, bool, Dict]:
|
| 388 |
"""Handle step() calls after episode is done."""
|
| 389 |
reward = Reward(
|
| 390 |
-
value=0.0,
|
| 391 |
-
breakdown={"step_reward": 0.0},
|
| 392 |
reason="Episode already done.",
|
| 393 |
)
|
| 394 |
return self.current_obs, reward, True, {
|
|
|
|
| 314 |
else:
|
| 315 |
# Invalid action type for hard task
|
| 316 |
reward = Reward(
|
| 317 |
+
value=0.01,
|
| 318 |
+
breakdown={"step_reward": 0.01, "invalid_action_penalty": -0.01},
|
| 319 |
reason=f"Invalid action_type '{action.action_type}' for hard task.",
|
| 320 |
)
|
| 321 |
return reward, grader_info
|
|
|
|
| 387 |
def _terminal_step(self) -> Tuple[Observation, Reward, bool, Dict]:
|
| 388 |
"""Handle step() calls after episode is done."""
|
| 389 |
reward = Reward(
|
| 390 |
+
value=0.01,
|
| 391 |
+
breakdown={"step_reward": 0.01},
|
| 392 |
reason="Episode already done.",
|
| 393 |
)
|
| 394 |
return self.current_obs, reward, True, {
|
graders/grader_easy.py
CHANGED
|
@@ -226,7 +226,8 @@ class EasyGrader:
|
|
| 226 |
|
| 227 |
This is the aggregate metric for the easy task.
|
| 228 |
Each step reward is already in [-1, 1], so mean is also in [-1, 1].
|
|
|
|
| 229 |
"""
|
| 230 |
if not step_rewards:
|
| 231 |
-
return 0.0
|
| 232 |
-
return sum(step_rewards) / len(step_rewards)
|
|
|
|
| 226 |
|
| 227 |
This is the aggregate metric for the easy task.
|
| 228 |
Each step reward is already in [-1, 1], so mean is also in [-1, 1].
|
| 229 |
+
Clamped to (0, 1) exclusive as required by the validator.
|
| 230 |
"""
|
| 231 |
if not step_rewards:
|
| 232 |
+
return 0.01
|
| 233 |
+
return max(0.01, min(0.99, sum(step_rewards) / len(step_rewards)))
|
graders/grader_hard.py
CHANGED
|
@@ -311,8 +311,9 @@ class HardGrader:
|
|
| 311 |
return result
|
| 312 |
|
| 313 |
def episode_score(self, step_rewards: List[float]) -> float:
|
| 314 |
-
"""Compute episode-level score with accumulated penalties."""
|
|
|
|
| 315 |
if not step_rewards:
|
| 316 |
-
return 0.0
|
| 317 |
mean_score = sum(step_rewards) / len(step_rewards)
|
| 318 |
return max(0.01, min(0.99, mean_score + self.episode_penalties))
|
|
|
|
| 311 |
return result
|
| 312 |
|
| 313 |
def episode_score(self, step_rewards: List[float]) -> float:
|
| 314 |
+
"""Compute episode-level score with accumulated penalties.
|
| 315 |
+
Clamped to (0, 1) exclusive as required by the validator."""
|
| 316 |
if not step_rewards:
|
| 317 |
+
return 0.01
|
| 318 |
mean_score = sum(step_rewards) / len(step_rewards)
|
| 319 |
return max(0.01, min(0.99, mean_score + self.episode_penalties))
|
graders/grader_medium.py
CHANGED
|
@@ -255,7 +255,8 @@ class MediumGrader:
|
|
| 255 |
return info
|
| 256 |
|
| 257 |
def episode_score(self, step_rewards: List[float]) -> float:
|
| 258 |
-
"""Compute episode-level score as mean of step rewards."""
|
|
|
|
| 259 |
if not step_rewards:
|
| 260 |
-
return 0.0
|
| 261 |
-
return sum(step_rewards) / len(step_rewards)
|
|
|
|
| 255 |
return info
|
| 256 |
|
| 257 |
def episode_score(self, step_rewards: List[float]) -> float:
|
| 258 |
+
"""Compute episode-level score as mean of step rewards.
|
| 259 |
+
Clamped to (0, 1) exclusive as required by the validator."""
|
| 260 |
if not step_rewards:
|
| 261 |
+
return 0.01
|
| 262 |
+
return max(0.01, min(0.99, sum(step_rewards) / len(step_rewards)))
|
validate.py
CHANGED
|
@@ -75,7 +75,7 @@ def validate():
|
|
| 75 |
obs, reward, done, info = env.step(action)
|
| 76 |
results.append(check(
|
| 77 |
"step() with wrong action type doesn't crash",
|
| 78 |
-
isinstance(reward, Reward) and reward.value < 0,
|
| 79 |
))
|
| 80 |
except Exception as e:
|
| 81 |
results.append(check("step() with invalid action doesn't crash", False, str(e)))
|
|
|
|
| 75 |
obs, reward, done, info = env.step(action)
|
| 76 |
results.append(check(
|
| 77 |
"step() with wrong action type doesn't crash",
|
| 78 |
+
isinstance(reward, Reward) and 0 < reward.value < 1,
|
| 79 |
))
|
| 80 |
except Exception as e:
|
| 81 |
results.append(check("step() with invalid action doesn't crash", False, str(e)))
|