Fix: clamp scores to [0.001, 0.999] so they stay strictly inside (0, 1) — validator rejects exact 0 and 1
Browse files
- server/environment.py +1 -1
- server/grader.py +9 -8
server/environment.py
CHANGED
|
@@ -131,7 +131,7 @@ class PolicyEvolverEnvironment(Environment[Action, Observation, State]):
|
|
| 131 |
|
| 132 |
previous_score = self._state.current_score
|
| 133 |
raw_reward = grade(action_dict, self._state.task_id, previous_score=previous_score)
|
| 134 |
-
reward = max(0.0, raw_reward - repetition_penalty)
|
| 135 |
|
| 136 |
self._state.current_score = reward
|
| 137 |
self._state.best_score = max(self._state.best_score, reward)
|
|
|
|
| 131 |
|
| 132 |
previous_score = self._state.current_score
|
| 133 |
raw_reward = grade(action_dict, self._state.task_id, previous_score=previous_score)
|
| 134 |
+
reward = max(0.001, min(0.999, raw_reward - repetition_penalty))
|
| 135 |
|
| 136 |
self._state.current_score = reward
|
| 137 |
self._state.best_score = max(self._state.best_score, reward)
|
server/grader.py
CHANGED
|
@@ -222,7 +222,7 @@ def grade_clarification(action: ProposeClarificationAction, task: Dict) -> float
|
|
| 222 |
# Noise penalty is applied at the very end to ensure it's not diluted
|
| 223 |
final_score -= (exploit_penalty + density_penalty + noise_hit)
|
| 224 |
|
| 225 |
-
return round(max(0.0, min(1.0, final_score)), 4)
|
| 226 |
|
| 227 |
|
| 228 |
# ─────────────────────────────────────────────
|
|
@@ -308,7 +308,7 @@ def grade_new_rule(action: ProposeNewRuleAction, task: Dict) -> float:
|
|
| 308 |
|
| 309 |
score -= (exploit_penalty + density_penalty)
|
| 310 |
|
| 311 |
-
return round(max(0.0, min(1.0, score)), 4)
|
| 312 |
|
| 313 |
|
| 314 |
# ─────────────────────────────────────────────
|
|
@@ -449,7 +449,7 @@ def grade_evolution(action: EvolveProcessAction, task: Dict) -> float:
|
|
| 449 |
|
| 450 |
final_score -= (exploit_penalty + density_penalty + alignment_penalty)
|
| 451 |
|
| 452 |
-
return round(max(0.0, min(1.0, final_score)), 4)
|
| 453 |
|
| 454 |
|
| 455 |
# ─────────────────────────────────────────────
|
|
@@ -462,11 +462,11 @@ def grade(action_dict: Dict, task_id: str, temperature: float = 0.0, seed: int =
|
|
| 462 |
action_dict: the raw JSON body from the agent
|
| 463 |
task_id: "task_easy" | "task_medium" | "task_hard"
|
| 464 |
previous_score: the best score achieved so far in the current episode
|
| 465 |
-
Returns float in [0.0, 1.0]
|
| 466 |
"""
|
| 467 |
task = TASK_REGISTRY.get(task_id)
|
| 468 |
if task is None:
|
| 469 |
-
return 0.0
|
| 470 |
|
| 471 |
think = action_dict.get("think", "")
|
| 472 |
|
|
@@ -522,10 +522,10 @@ def grade(action_dict: Dict, task_id: str, temperature: float = 0.0, seed: int =
|
|
| 522 |
raw = grade_evolution(action, task)
|
| 523 |
else:
|
| 524 |
logger.warning(f"Unknown action_type: {action_type}")
|
| 525 |
-
return 0.0
|
| 526 |
except Exception as e:
|
| 527 |
logger.error(f"Grading validation failed: {str(e)}\nAction context: {action_dict}")
|
| 528 |
-
return 0.0
|
| 529 |
|
| 530 |
# Step-delta improvement bonus
|
| 531 |
delta = raw - previous_score
|
|
@@ -537,7 +537,8 @@ def grade(action_dict: Dict, task_id: str, temperature: float = 0.0, seed: int =
|
|
| 537 |
improvement_bonus = 0.0
|
| 538 |
|
| 539 |
final_score = raw + improvement_bonus
|
| 540 |
-
|
|
|
|
| 541 |
|
| 542 |
|
| 543 |
if __name__ == "__main__":
|
|
|
|
| 222 |
# Noise penalty is applied at the very end to ensure it's not diluted
|
| 223 |
final_score -= (exploit_penalty + density_penalty + noise_hit)
|
| 224 |
|
| 225 |
+
return round(max(0.001, min(0.999, final_score)), 4)
|
| 226 |
|
| 227 |
|
| 228 |
# ─────────────────────────────────────────────
|
|
|
|
| 308 |
|
| 309 |
score -= (exploit_penalty + density_penalty)
|
| 310 |
|
| 311 |
+
return round(max(0.001, min(0.999, score)), 4)
|
| 312 |
|
| 313 |
|
| 314 |
# ─────────────────────────────────────────────
|
|
|
|
| 449 |
|
| 450 |
final_score -= (exploit_penalty + density_penalty + alignment_penalty)
|
| 451 |
|
| 452 |
+
return round(max(0.001, min(0.999, final_score)), 4)
|
| 453 |
|
| 454 |
|
| 455 |
# ─────────────────────────────────────────────
|
|
|
|
| 462 |
action_dict: the raw JSON body from the agent
|
| 463 |
task_id: "task_easy" | "task_medium" | "task_hard"
|
| 464 |
previous_score: the best score achieved so far in the current episode
|
| 465 |
+
Returns float in (0.0, 1.0) — strictly clamped, never exactly 0 or 1.
|
| 466 |
"""
|
| 467 |
task = TASK_REGISTRY.get(task_id)
|
| 468 |
if task is None:
|
| 469 |
+
return 0.001
|
| 470 |
|
| 471 |
think = action_dict.get("think", "")
|
| 472 |
|
|
|
|
| 522 |
raw = grade_evolution(action, task)
|
| 523 |
else:
|
| 524 |
logger.warning(f"Unknown action_type: {action_type}")
|
| 525 |
+
return 0.001
|
| 526 |
except Exception as e:
|
| 527 |
logger.error(f"Grading validation failed: {str(e)}\nAction context: {action_dict}")
|
| 528 |
+
return 0.001
|
| 529 |
|
| 530 |
# Step-delta improvement bonus
|
| 531 |
delta = raw - previous_score
|
|
|
|
| 537 |
improvement_bonus = 0.0
|
| 538 |
|
| 539 |
final_score = raw + improvement_bonus
|
| 540 |
+
# Strict (0, 1) clamping — validator rejects exact 0.0 and 1.0
|
| 541 |
+
return round(max(0.001, min(0.999, final_score)), 4)
|
| 542 |
|
| 543 |
|
| 544 |
if __name__ == "__main__":
|