Spaces:

Cooked4riyal
/

EntropyEnv

Running

immortalindeed committed on Apr 8

Commit

829f543

1 Parent(s): fc84271

Clamp scores to the closed range [0.01, 0.99], so they always lie strictly inside (0, 1), to pass the OpenEnv Phase 2 continuous-environment score verification checks

Files changed (3) hide show

inference.py CHANGED Viewed

@@ -286,8 +286,8 @@ def run_task(client: OpenAI, task_id: str) -> float:
             break
     # Sum the rewards for multi-turn accumulation
-    total_reward = sum(rewards) if rewards else 0.0
-    score = round(min(max(total_reward, 0.0), 1.0), 2)
     success = score > 0.0
     rewards_str = ",".join(f"{r:.2f}" for r in rewards)

             break
     # Sum the rewards for multi-turn accumulation
+    total_reward = sum(rewards) if rewards else 0.01
+    score = round(min(max(total_reward, 0.01), 0.99), 4)
     success = score > 0.0
     rewards_str = ",".join(f"{r:.2f}" for r in rewards)

server/app.py CHANGED Viewed

@@ -520,8 +520,8 @@ def _run_single_task_inline(task_id, api_base, api_key, model_id, system_prompt)
         yield {'type': 'log', 'level': 'info', 'msg': msg}
     # Sum the rewards for multi-turn accumulation — same logic as inference.py
-    total_reward = sum(rewards) if rewards else 0.0
-    score = round(min(max(total_reward, 0.0), 1.0), 2)
     success = score > 0.0
     rewards_str = ','.join(f'{r:.2f}' for r in rewards)

         yield {'type': 'log', 'level': 'info', 'msg': msg}
     # Sum the rewards for multi-turn accumulation — same logic as inference.py
+    total_reward = sum(rewards) if rewards else 0.01
+    score = round(min(max(total_reward, 0.01), 0.99), 4)
     success = score > 0.0
     rewards_str = ','.join(f'{r:.2f}' for r in rewards)

server/graders/base_grader.py CHANGED Viewed

@@ -6,13 +6,13 @@ from typing import Dict, Any, List, Callable
 def safe_score(raw) -> float:
-    """Always clamp to [0.0, 1.0]. Never crash. Handles None, str, out-of-range."""
     if raw is None:
-        return 0.0                    # BUG 1 FIX — must be first line
     try:
-        return round(max(0.0, min(1.0, float(raw))), 4)
     except (TypeError, ValueError):
-        return 0.0
 def repetition_penalty(action_type: str, last_actions: List[str], window: int = 3) -> float:

 def safe_score(raw) -> float:
+    """Always clamp strictly to (0.0, 1.0) range e.g. [0.01, 0.99]. Never crash."""
     if raw is None:
+        return 0.01
     try:
+        return round(max(0.01, min(0.99, float(raw))), 4)
     except (TypeError, ValueError):
+        return 0.01
 def repetition_penalty(action_type: str, last_actions: List[str], window: int = 3) -> float: