kartikmandar committed
Commit d18fe9d · 1 Parent(s): 1af75ba

fix: clamp observation rewards to [0,1] for Phase 2 eval compliance

The evaluator validates that all reward/score values are in (0, 1).
Our per-step rewards could be negative (e.g., -0.15 for capacity
violations), causing Task Validation to fail. Now observation.reward
is clamped to [0, 1] in the environment, and inference.py further
clamps to [0.01, 0.99] before stdout logging. Internal _total_reward
and grader logic remain unaffected, since they use episode logs rather
than the observation reward.
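
For illustration, here is a minimal sketch of how the two clamps compose. The helper names below are hypothetical, not functions in this repo; only the bounds come from the diffs that follow.

from typing import Optional

# Hypothetical helpers mirroring the clamps added in this commit.
def clamp_observation_reward(reward: float) -> float:
    """Environment side: keep observation.reward within [0, 1]."""
    return max(0.0, min(1.0, reward))

def clamp_logged_reward(reward: Optional[float]) -> float:
    """Inference side: keep the stdout-logged value strictly inside (0, 1)."""
    return max(0.01, min(0.99, reward or 0.0))

# A -0.15 capacity-violation penalty becomes 0.0 in the observation and
# 0.01 in the stdout log, satisfying the evaluator's range check.
assert clamp_observation_reward(-0.15) == 0.0
assert clamp_logged_reward(clamp_observation_reward(-0.15)) == 0.01

The narrower [0.01, 0.99] bound on the logging path keeps values strictly inside the evaluator's open (0, 1) interval even when the observation reward is exactly 0 or 1.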

inference.py CHANGED
@@ -355,7 +355,7 @@ async def run_task(task_id: str, env: OrchestratorClient) -> float:
 
         result = await env.step(action)
 
-        reward: float = result.reward or 0.0
+        reward: float = max(0.01, min(0.99, result.reward or 0.0))
         done: bool = result.done
         error: Optional[str] = None
         if result.observation.errors:
server/environment.py CHANGED
@@ -471,6 +471,11 @@ class OrchestratorEnvironment(
         cost_budget = self._config.constraints.get("cost_budget")
         budget_used = self._pool.get_budget_used()
 
+        # Clamp reward to [0, 1] for OpenEnv evaluation compliance.
+        # Internal _total_reward still uses unclamped values for accurate tracking;
+        # graders use episode logs, not observation rewards, so they're unaffected.
+        clamped_reward = max(0.0, min(1.0, reward))
+
         return OrchestratorObservation(
             task_description=self._config.description,
             subtasks=self._dag.get_subtask_infos(),
@@ -486,7 +491,7 @@ class OrchestratorEnvironment(
             available_actions=self._compute_available_actions(),
             hint=self._compute_hint(),
             done=self._done,
-            reward=reward,
+            reward=clamped_reward,
         )
 
     def _compute_available_actions(self) -> list[str]:
tests/test_environment.py CHANGED
@@ -88,7 +88,7 @@ class TestValidation:
         obs = _delegate(env, "technical_design", "frontend_dev")
         assert len(obs.errors) > 0
         assert "lacks capability" in obs.errors[0]
-        assert obs.reward < 0
+        assert obs.reward == 0.0  # negative reward clamped to 0 for eval compliance
 
     def test_delegate_pending_subtask(self) -> None:
         env, _ = _make_env("easy")
@@ -443,7 +443,7 @@ class TestHardTaskEdgeCases:
         obs = _retry(env, "enrich_logs", "investigator_alpha")
         assert len(obs.errors) == 1
         assert "permanent" in obs.errors[0].lower()
-        assert obs.reward < 0  # penalty applied
+        assert obs.reward == 0.0  # negative penalty clamped to 0 for eval compliance
 
     def test_monitoring_patience_failure(self) -> None:
         """Synthesizing immediately after all complete loses patience score."""