Commit ·
db0bed5
1
Parent(s): d18fe9d
fix: prevent score=0.000 in stdout for Phase 2 validation
Browse files
- Change score default from 0.0 to 0.01 in inference.py
- Add raise_for_status() on grader HTTP response
- Add safety clamp max(0.01, min(0.99, score)) before log_end
- Remove duplicate [END] line on task timeout
- Clamp environment observation rewards to (0.01, 0.99) exclusive
- Update test assertions to match new reward floor
- inference.py +6 -3
- server/environment.py +2 -2
- tests/test_environment.py +3 -3
inference.py
CHANGED
|
@@ -298,7 +298,7 @@ async def run_task(task_id: str, env: OrchestratorClient) -> float:
|
|
| 298 |
|
| 299 |
rewards: List[float] = []
|
| 300 |
steps_taken: int = 0
|
| 301 |
-
score: float = 0.0
|
| 302 |
success: bool = False
|
| 303 |
|
| 304 |
# Conversation history: sliding window of recent turns
|
|
@@ -373,8 +373,9 @@ async def run_task(task_id: str, env: OrchestratorClient) -> float:
|
|
| 373 |
try:
|
| 374 |
async with httpx.AsyncClient(base_url=ENV_URL, timeout=60.0) as http:
|
| 375 |
grade_resp = await http.post("/grader", json={"task_id": task_id})
|
|
|
|
| 376 |
grade_data: Dict[str, Any] = grade_resp.json()
|
| 377 |
-
score = grade_data.get("score", 0.0)
|
| 378 |
except Exception as exc:
|
| 379 |
print(f"[DEBUG] Grader call failed: {exc}", flush=True)
|
| 380 |
score = 0.01
|
|
@@ -382,6 +383,8 @@ async def run_task(task_id: str, env: OrchestratorClient) -> float:
|
|
| 382 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 383 |
|
| 384 |
finally:
|
|
|
|
|
|
|
| 385 |
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 386 |
|
| 387 |
return score
|
|
@@ -409,7 +412,7 @@ async def main() -> None:
|
|
| 409 |
except asyncio.TimeoutError:
|
| 410 |
print(f"[DEBUG] Task {task_id} timed out after {TASK_TIMEOUT_S}s", flush=True)
|
| 411 |
scores[task_id] = 0.01
|
| 412 |
-
log_end
|
| 413 |
finally:
|
| 414 |
try:
|
| 415 |
await env.close()
|
|
|
|
| 298 |
|
| 299 |
rewards: List[float] = []
|
| 300 |
steps_taken: int = 0
|
| 301 |
+
score: float = 0.01
|
| 302 |
success: bool = False
|
| 303 |
|
| 304 |
# Conversation history: sliding window of recent turns
|
|
|
|
| 373 |
try:
|
| 374 |
async with httpx.AsyncClient(base_url=ENV_URL, timeout=60.0) as http:
|
| 375 |
grade_resp = await http.post("/grader", json={"task_id": task_id})
|
| 376 |
+
grade_resp.raise_for_status()
|
| 377 |
grade_data: Dict[str, Any] = grade_resp.json()
|
| 378 |
+
score = grade_data.get("score", 0.01)
|
| 379 |
except Exception as exc:
|
| 380 |
print(f"[DEBUG] Grader call failed: {exc}", flush=True)
|
| 381 |
score = 0.01
|
|
|
|
| 383 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 384 |
|
| 385 |
finally:
|
| 386 |
+
# Safety clamp: ensure score is strictly in (0, 1) for Phase 2 validation
|
| 387 |
+
score = max(0.01, min(0.99, score))
|
| 388 |
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 389 |
|
| 390 |
return score
|
|
|
|
| 412 |
except asyncio.TimeoutError:
|
| 413 |
print(f"[DEBUG] Task {task_id} timed out after {TASK_TIMEOUT_S}s", flush=True)
|
| 414 |
scores[task_id] = 0.01
|
| 415 |
+
# Don't call log_end here — run_task's finally block already emits [END]
|
| 416 |
finally:
|
| 417 |
try:
|
| 418 |
await env.close()
|
server/environment.py
CHANGED
|
@@ -471,10 +471,10 @@ class OrchestratorEnvironment(
|
|
| 471 |
cost_budget = self._config.constraints.get("cost_budget")
|
| 472 |
budget_used = self._pool.get_budget_used()
|
| 473 |
|
| 474 |
-
# Clamp reward to [0, 1] for OpenEnv evaluation compliance.
|
| 475 |
# Internal _total_reward still uses unclamped values for accurate tracking;
|
| 476 |
# graders use episode logs, not observation rewards, so they're unaffected.
|
| 477 |
-
clamped_reward = max(0.0, min(1.0, reward))
|
| 478 |
|
| 479 |
return OrchestratorObservation(
|
| 480 |
task_description=self._config.description,
|
|
|
|
| 471 |
cost_budget = self._config.constraints.get("cost_budget")
|
| 472 |
budget_used = self._pool.get_budget_used()
|
| 473 |
|
| 474 |
+
# Clamp reward to (0, 1) exclusive for strict OpenEnv evaluation compliance.
|
| 475 |
# Internal _total_reward still uses unclamped values for accurate tracking;
|
| 476 |
# graders use episode logs, not observation rewards, so they're unaffected.
|
| 477 |
+
clamped_reward = max(0.01, min(0.99, reward))
|
| 478 |
|
| 479 |
return OrchestratorObservation(
|
| 480 |
task_description=self._config.description,
|
tests/test_environment.py
CHANGED
|
@@ -49,7 +49,7 @@ class TestReset:
|
|
| 49 |
env, obs = _make_env("easy")
|
| 50 |
assert isinstance(obs, OrchestratorObservation)
|
| 51 |
assert obs.done is False
|
| 52 |
-
assert obs.reward == 0.0
|
| 53 |
assert obs.time_remaining == 15
|
| 54 |
assert obs.time_elapsed == 0
|
| 55 |
|
|
@@ -88,7 +88,7 @@ class TestValidation:
|
|
| 88 |
obs = _delegate(env, "technical_design", "frontend_dev")
|
| 89 |
assert len(obs.errors) > 0
|
| 90 |
assert "lacks capability" in obs.errors[0]
|
| 91 |
-
assert obs.reward == 0.0
|
| 92 |
|
| 93 |
def test_delegate_pending_subtask(self) -> None:
|
| 94 |
env, _ = _make_env("easy")
|
|
@@ -443,7 +443,7 @@ class TestHardTaskEdgeCases:
|
|
| 443 |
obs = _retry(env, "enrich_logs", "investigator_alpha")
|
| 444 |
assert len(obs.errors) == 1
|
| 445 |
assert "permanent" in obs.errors[0].lower()
|
| 446 |
-
assert obs.reward == 0.0
|
| 447 |
|
| 448 |
def test_monitoring_patience_failure(self) -> None:
|
| 449 |
"""Synthesizing immediately after all complete loses patience score."""
|
|
|
|
| 49 |
env, obs = _make_env("easy")
|
| 50 |
assert isinstance(obs, OrchestratorObservation)
|
| 51 |
assert obs.done is False
|
| 52 |
+
assert obs.reward == 0.01 # clamped to (0, 1) exclusive for eval compliance
|
| 53 |
assert obs.time_remaining == 15
|
| 54 |
assert obs.time_elapsed == 0
|
| 55 |
|
|
|
|
| 88 |
obs = _delegate(env, "technical_design", "frontend_dev")
|
| 89 |
assert len(obs.errors) > 0
|
| 90 |
assert "lacks capability" in obs.errors[0]
|
| 91 |
+
assert obs.reward == 0.01 # negative reward clamped to (0, 1) exclusive for eval compliance
|
| 92 |
|
| 93 |
def test_delegate_pending_subtask(self) -> None:
|
| 94 |
env, _ = _make_env("easy")
|
|
|
|
| 443 |
obs = _retry(env, "enrich_logs", "investigator_alpha")
|
| 444 |
assert len(obs.errors) == 1
|
| 445 |
assert "permanent" in obs.errors[0].lower()
|
| 446 |
+
assert obs.reward == 0.01 # negative penalty clamped to (0, 1) exclusive for eval compliance
|
| 447 |
|
| 448 |
def test_monitoring_patience_failure(self) -> None:
|
| 449 |
"""Synthesizing immediately after all complete loses patience score."""
|