Somuai12 committed on
Commit
95a7dc0
·
1 Parent(s): e4f6b1d

Fix: clamp scores to strict (0.001, 0.999) — validator rejects exact 0 and 1

Browse files
Files changed (2) hide show
  1. server/environment.py +1 -1
  2. server/grader.py +9 -8
server/environment.py CHANGED
@@ -131,7 +131,7 @@ class PolicyEvolverEnvironment(Environment[Action, Observation, State]):
131
 
132
  previous_score = self._state.current_score
133
  raw_reward = grade(action_dict, self._state.task_id, previous_score=previous_score)
134
- reward = max(0.0, raw_reward - repetition_penalty)
135
 
136
  self._state.current_score = reward
137
  self._state.best_score = max(self._state.best_score, reward)
 
131
 
132
  previous_score = self._state.current_score
133
  raw_reward = grade(action_dict, self._state.task_id, previous_score=previous_score)
134
+ reward = max(0.001, min(0.999, raw_reward - repetition_penalty))
135
 
136
  self._state.current_score = reward
137
  self._state.best_score = max(self._state.best_score, reward)
server/grader.py CHANGED
@@ -222,7 +222,7 @@ def grade_clarification(action: ProposeClarificationAction, task: Dict) -> float
222
  # Noise penalty is applied at the very end to ensure it's not diluted
223
  final_score -= (exploit_penalty + density_penalty + noise_hit)
224
 
225
- return round(max(0.0, min(1.0, final_score)), 4)
226
 
227
 
228
  # ─────────────────────────────────────────────
@@ -308,7 +308,7 @@ def grade_new_rule(action: ProposeNewRuleAction, task: Dict) -> float:
308
 
309
  score -= (exploit_penalty + density_penalty)
310
 
311
- return round(max(0.0, min(1.0, score)), 4)
312
 
313
 
314
  # ─────────────────────────────────────────────
@@ -449,7 +449,7 @@ def grade_evolution(action: EvolveProcessAction, task: Dict) -> float:
449
 
450
  final_score -= (exploit_penalty + density_penalty + alignment_penalty)
451
 
452
- return round(max(0.0, min(1.0, final_score)), 4)
453
 
454
 
455
  # ─────────────────────────────────────────────
@@ -462,11 +462,11 @@ def grade(action_dict: Dict, task_id: str, temperature: float = 0.0, seed: int =
462
  action_dict: the raw JSON body from the agent
463
  task_id: "task_easy" | "task_medium" | "task_hard"
464
  previous_score: the best score achieved so far in the current episode
465
- Returns float in [0.0, 1.0] — always clamped.
466
  """
467
  task = TASK_REGISTRY.get(task_id)
468
  if task is None:
469
- return 0.0
470
 
471
  think = action_dict.get("think", "")
472
 
@@ -522,10 +522,10 @@ def grade(action_dict: Dict, task_id: str, temperature: float = 0.0, seed: int =
522
  raw = grade_evolution(action, task)
523
  else:
524
  logger.warning(f"Unknown action_type: {action_type}")
525
- return 0.0
526
  except Exception as e:
527
  logger.error(f"Grading validation failed: {str(e)}\nAction context: {action_dict}")
528
- return 0.0
529
 
530
  # Step-delta improvement bonus
531
  delta = raw - previous_score
@@ -537,7 +537,8 @@ def grade(action_dict: Dict, task_id: str, temperature: float = 0.0, seed: int =
537
  improvement_bonus = 0.0
538
 
539
  final_score = raw + improvement_bonus
540
- return round(max(0.0, min(1.0, final_score)), 4)
 
541
 
542
 
543
  if __name__ == "__main__":
 
222
  # Noise penalty is applied at the very end to ensure it's not diluted
223
  final_score -= (exploit_penalty + density_penalty + noise_hit)
224
 
225
+ return round(max(0.001, min(0.999, final_score)), 4)
226
 
227
 
228
  # ─────────────────────────────────────────────
 
308
 
309
  score -= (exploit_penalty + density_penalty)
310
 
311
+ return round(max(0.001, min(0.999, score)), 4)
312
 
313
 
314
  # ─────────────────────────────────────────────
 
449
 
450
  final_score -= (exploit_penalty + density_penalty + alignment_penalty)
451
 
452
+ return round(max(0.001, min(0.999, final_score)), 4)
453
 
454
 
455
  # ─────────────────────────────────────────────
 
462
  action_dict: the raw JSON body from the agent
463
  task_id: "task_easy" | "task_medium" | "task_hard"
464
  previous_score: the best score achieved so far in the current episode
465
+ Returns float in (0.0, 1.0) — strictly clamped, never exactly 0 or 1.
466
  """
467
  task = TASK_REGISTRY.get(task_id)
468
  if task is None:
469
+ return 0.001
470
 
471
  think = action_dict.get("think", "")
472
 
 
522
  raw = grade_evolution(action, task)
523
  else:
524
  logger.warning(f"Unknown action_type: {action_type}")
525
+ return 0.001
526
  except Exception as e:
527
  logger.error(f"Grading validation failed: {str(e)}\nAction context: {action_dict}")
528
+ return 0.001
529
 
530
  # Step-delta improvement bonus
531
  delta = raw - previous_score
 
537
  improvement_bonus = 0.0
538
 
539
  final_score = raw + improvement_bonus
540
+ # Strict (0, 1) clamping — validator rejects exact 0.0 and 1.0
541
+ return round(max(0.001, min(0.999, final_score)), 4)
542
 
543
 
544
  if __name__ == "__main__":