Spaces:
Running
Running
Commit ·
829f543
1
Parent(s): fc84271
Clamp scores strictly to (0.01, 0.99) to pass OpenEnv Phase 2 continuous environment score verification checks
Browse files
- inference.py +2 -2
- server/app.py +2 -2
- server/graders/base_grader.py +4 -4
inference.py
CHANGED
|
@@ -286,8 +286,8 @@ def run_task(client: OpenAI, task_id: str) -> float:
|
|
| 286 |
break
|
| 287 |
|
| 288 |
# Sum the rewards for multi-turn accumulation
|
| 289 |
-
total_reward = sum(rewards) if rewards else 0.
|
| 290 |
-
score = round(min(max(total_reward, 0.
|
| 291 |
success = score > 0.0
|
| 292 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 293 |
|
|
|
|
| 286 |
break
|
| 287 |
|
| 288 |
# Sum the rewards for multi-turn accumulation
|
| 289 |
+
total_reward = sum(rewards) if rewards else 0.01
|
| 290 |
+
score = round(min(max(total_reward, 0.01), 0.99), 4)
|
| 291 |
success = score > 0.0
|
| 292 |
rewards_str = ",".join(f"{r:.2f}" for r in rewards)
|
| 293 |
|
server/app.py
CHANGED
|
@@ -520,8 +520,8 @@ def _run_single_task_inline(task_id, api_base, api_key, model_id, system_prompt)
|
|
| 520 |
yield {'type': 'log', 'level': 'info', 'msg': msg}
|
| 521 |
|
| 522 |
# Sum the rewards for multi-turn accumulation — same logic as inference.py
|
| 523 |
-
total_reward = sum(rewards) if rewards else 0.
|
| 524 |
-
score = round(min(max(total_reward, 0.
|
| 525 |
success = score > 0.0
|
| 526 |
rewards_str = ','.join(f'{r:.2f}' for r in rewards)
|
| 527 |
|
|
|
|
| 520 |
yield {'type': 'log', 'level': 'info', 'msg': msg}
|
| 521 |
|
| 522 |
# Sum the rewards for multi-turn accumulation — same logic as inference.py
|
| 523 |
+
total_reward = sum(rewards) if rewards else 0.01
|
| 524 |
+
score = round(min(max(total_reward, 0.01), 0.99), 4)
|
| 525 |
success = score > 0.0
|
| 526 |
rewards_str = ','.join(f'{r:.2f}' for r in rewards)
|
| 527 |
|
server/graders/base_grader.py
CHANGED
|
@@ -6,13 +6,13 @@ from typing import Dict, Any, List, Callable
|
|
| 6 |
|
| 7 |
|
| 8 |
def safe_score(raw) -> float:
|
| 9 |
-
"""Always clamp to
|
| 10 |
if raw is None:
|
| 11 |
-
return 0.
|
| 12 |
try:
|
| 13 |
-
return round(max(0.
|
| 14 |
except (TypeError, ValueError):
|
| 15 |
-
return 0.
|
| 16 |
|
| 17 |
|
| 18 |
def repetition_penalty(action_type: str, last_actions: List[str], window: int = 3) -> float:
|
|
|
|
| 6 |
|
| 7 |
|
| 8 |
def safe_score(raw) -> float:
|
| 9 |
+
"""Always clamp strictly to (0.0, 1.0) range e.g. [0.01, 0.99]. Never crash."""
|
| 10 |
if raw is None:
|
| 11 |
+
return 0.01
|
| 12 |
try:
|
| 13 |
+
return round(max(0.01, min(0.99, float(raw))), 4)
|
| 14 |
except (TypeError, ValueError):
|
| 15 |
+
return 0.01
|
| 16 |
|
| 17 |
|
| 18 |
def repetition_penalty(action_type: str, last_actions: List[str], window: int = 3) -> float:
|