immortalindeed committed on
Commit
829f543
·
1 Parent(s): fc84271

Clamp scores strictly to (0.01, 0.99) to pass OpenEnv Phase 2 continuous environment score verification checks

Browse files
Files changed (3) hide show
  1. inference.py +2 -2
  2. server/app.py +2 -2
  3. server/graders/base_grader.py +4 -4
inference.py CHANGED
@@ -286,8 +286,8 @@ def run_task(client: OpenAI, task_id: str) -> float:
286
  break
287
 
288
  # Sum the rewards for multi-turn accumulation
289
- total_reward = sum(rewards) if rewards else 0.0
290
- score = round(min(max(total_reward, 0.0), 1.0), 2)
291
  success = score > 0.0
292
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
293
 
 
286
  break
287
 
288
  # Sum the rewards for multi-turn accumulation
289
+ total_reward = sum(rewards) if rewards else 0.01
290
+ score = round(min(max(total_reward, 0.01), 0.99), 4)
291
  success = score > 0.0
292
  rewards_str = ",".join(f"{r:.2f}" for r in rewards)
293
 
server/app.py CHANGED
@@ -520,8 +520,8 @@ def _run_single_task_inline(task_id, api_base, api_key, model_id, system_prompt)
520
  yield {'type': 'log', 'level': 'info', 'msg': msg}
521
 
522
  # Sum the rewards for multi-turn accumulation — same logic as inference.py
523
- total_reward = sum(rewards) if rewards else 0.0
524
- score = round(min(max(total_reward, 0.0), 1.0), 2)
525
  success = score > 0.0
526
  rewards_str = ','.join(f'{r:.2f}' for r in rewards)
527
 
 
520
  yield {'type': 'log', 'level': 'info', 'msg': msg}
521
 
522
  # Sum the rewards for multi-turn accumulation — same logic as inference.py
523
+ total_reward = sum(rewards) if rewards else 0.01
524
+ score = round(min(max(total_reward, 0.01), 0.99), 4)
525
  success = score > 0.0
526
  rewards_str = ','.join(f'{r:.2f}' for r in rewards)
527
 
server/graders/base_grader.py CHANGED
@@ -6,13 +6,13 @@ from typing import Dict, Any, List, Callable
6
 
7
 
8
  def safe_score(raw) -> float:
9
- """Always clamp to [0.0, 1.0]. Never crash. Handles None, str, out-of-range."""
10
  if raw is None:
11
- return 0.0 # BUG 1 FIX — must be first line
12
  try:
13
- return round(max(0.0, min(1.0, float(raw))), 4)
14
  except (TypeError, ValueError):
15
- return 0.0
16
 
17
 
18
  def repetition_penalty(action_type: str, last_actions: List[str], window: int = 3) -> float:
 
6
 
7
 
8
  def safe_score(raw) -> float:
9
+ """Always clamp strictly to (0.0, 1.0) range e.g. [0.01, 0.99]. Never crash."""
10
  if raw is None:
11
+ return 0.01
12
  try:
13
+ return round(max(0.01, min(0.99, float(raw))), 4)
14
  except (TypeError, ValueError):
15
+ return 0.01
16
 
17
 
18
  def repetition_penalty(action_type: str, last_actions: List[str], window: int = 3) -> float: