Spaces:
Sleeping
Sleeping
fix: clamp all scores strictly to (0, 1) — no 0.0 or 1.0
Browse files- graders: episode_score returns 0.01 for empty, clamps to [0.01, 0.99]
- env/base: terminal_step and invalid action use 0.01 not 0.0/-0.1
- validate: updated local check to match strict (0,1) requirement
- env/base.py +4 -4
- graders/grader_easy.py +3 -2
- graders/grader_hard.py +3 -2
- graders/grader_medium.py +4 -3
- validate.py +1 -1
env/base.py
CHANGED
|
@@ -314,8 +314,8 @@ class CodeReviewEnv:
|
|
| 314 |
else:
|
| 315 |
# Invalid action type for hard task
|
| 316 |
reward = Reward(
|
| 317 |
-
value=-0.1,
|
| 318 |
-
breakdown={"step_reward": 0.0, "invalid_action_penalty": -0.1},
|
| 319 |
reason=f"Invalid action_type '{action.action_type}' for hard task.",
|
| 320 |
)
|
| 321 |
return reward, grader_info
|
|
@@ -387,8 +387,8 @@ class CodeReviewEnv:
|
|
| 387 |
def _terminal_step(self) -> Tuple[Observation, Reward, bool, Dict]:
|
| 388 |
"""Handle step() calls after episode is done."""
|
| 389 |
reward = Reward(
|
| 390 |
-
value=0.0,
|
| 391 |
-
breakdown={"step_reward": 0.0},
|
| 392 |
reason="Episode already done.",
|
| 393 |
)
|
| 394 |
return self.current_obs, reward, True, {
|
|
|
|
| 314 |
else:
|
| 315 |
# Invalid action type for hard task
|
| 316 |
reward = Reward(
|
| 317 |
+
value=0.01,
|
| 318 |
+
breakdown={"step_reward": 0.01, "invalid_action_penalty": -0.01},
|
| 319 |
reason=f"Invalid action_type '{action.action_type}' for hard task.",
|
| 320 |
)
|
| 321 |
return reward, grader_info
|
|
|
|
| 387 |
def _terminal_step(self) -> Tuple[Observation, Reward, bool, Dict]:
|
| 388 |
"""Handle step() calls after episode is done."""
|
| 389 |
reward = Reward(
|
| 390 |
+
value=0.01,
|
| 391 |
+
breakdown={"step_reward": 0.01},
|
| 392 |
reason="Episode already done.",
|
| 393 |
)
|
| 394 |
return self.current_obs, reward, True, {
|
graders/grader_easy.py
CHANGED
|
@@ -226,7 +226,8 @@ class EasyGrader:
|
|
| 226 |
|
| 227 |
This is the aggregate metric for the easy task.
|
| 228 |
Each step reward is already in [-1, 1], so mean is also in [-1, 1].
|
|
|
|
| 229 |
"""
|
| 230 |
if not step_rewards:
|
| 231 |
-
return 0.0
|
| 232 |
-
return sum(step_rewards) / len(step_rewards)
|
|
|
|
| 226 |
|
| 227 |
This is the aggregate metric for the easy task.
|
| 228 |
Each step reward is already in [-1, 1], so mean is also in [-1, 1].
|
| 229 |
+
Clamped to (0, 1) exclusive as required by the validator.
|
| 230 |
"""
|
| 231 |
if not step_rewards:
|
| 232 |
+
return 0.01
|
| 233 |
+
return max(0.01, min(0.99, sum(step_rewards) / len(step_rewards)))
|
graders/grader_hard.py
CHANGED
|
@@ -311,8 +311,9 @@ class HardGrader:
|
|
| 311 |
return result
|
| 312 |
|
| 313 |
def episode_score(self, step_rewards: List[float]) -> float:
|
| 314 |
-
"""Compute episode-level score with accumulated penalties."""
|
|
|
|
| 315 |
if not step_rewards:
|
| 316 |
-
return 0.0
|
| 317 |
mean_score = sum(step_rewards) / len(step_rewards)
|
| 318 |
return max(0.01, min(0.99, mean_score + self.episode_penalties))
|
|
|
|
| 311 |
return result
|
| 312 |
|
| 313 |
def episode_score(self, step_rewards: List[float]) -> float:
|
| 314 |
+
"""Compute episode-level score with accumulated penalties.
|
| 315 |
+
Clamped to (0, 1) exclusive as required by the validator."""
|
| 316 |
if not step_rewards:
|
| 317 |
+
return 0.01
|
| 318 |
mean_score = sum(step_rewards) / len(step_rewards)
|
| 319 |
return max(0.01, min(0.99, mean_score + self.episode_penalties))
|
graders/grader_medium.py
CHANGED
|
@@ -255,7 +255,8 @@ class MediumGrader:
|
|
| 255 |
return info
|
| 256 |
|
| 257 |
def episode_score(self, step_rewards: List[float]) -> float:
|
| 258 |
-
"""Compute episode-level score as mean of step rewards."""
|
|
|
|
| 259 |
if not step_rewards:
|
| 260 |
-
return 0.0
|
| 261 |
-
return sum(step_rewards) / len(step_rewards)
|
|
|
|
| 255 |
return info
|
| 256 |
|
| 257 |
def episode_score(self, step_rewards: List[float]) -> float:
|
| 258 |
+
"""Compute episode-level score as mean of step rewards.
|
| 259 |
+
Clamped to (0, 1) exclusive as required by the validator."""
|
| 260 |
if not step_rewards:
|
| 261 |
+
return 0.01
|
| 262 |
+
return max(0.01, min(0.99, sum(step_rewards) / len(step_rewards)))
|
validate.py
CHANGED
|
@@ -75,7 +75,7 @@ def validate():
|
|
| 75 |
obs, reward, done, info = env.step(action)
|
| 76 |
results.append(check(
|
| 77 |
"step() with wrong action type doesn't crash",
|
| 78 |
-
isinstance(reward, Reward) and reward.value < 0,
|
| 79 |
))
|
| 80 |
except Exception as e:
|
| 81 |
results.append(check("step() with invalid action doesn't crash", False, str(e)))
|
|
|
|
| 75 |
obs, reward, done, info = env.step(action)
|
| 76 |
results.append(check(
|
| 77 |
"step() with wrong action type doesn't crash",
|
| 78 |
+
isinstance(reward, Reward) and 0 < reward.value < 1,
|
| 79 |
))
|
| 80 |
except Exception as e:
|
| 81 |
results.append(check("step() with invalid action doesn't crash", False, str(e)))
|