kartikmandar committed
Commit d18fe9d · 1 Parent(s): 1af75ba

fix: clamp observation rewards to [0,1] for Phase 2 eval compliance

The evaluator validates that all reward/score values are in (0, 1).
Our per-step rewards could be negative (e.g., -0.15 for capacity
violations), causing Task Validation to fail. Now observation.reward
is clamped to [0, 1] in the environment, and inference.py further
clamps to [0.01, 0.99] before stdout logging. Internal _total_reward
and grader logic remain unaffected, since they use episode logs rather
than the observation reward.
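
For illustration, here is a minimal sketch of how the two clamps compose. The helper names below are hypothetical, not functions in this repo; only the bounds come from the diffs that follow.

from typing import Optional

# Hypothetical helpers mirroring the clamps added in this commit.
def clamp_observation_reward(reward: float) -> float:
    """Environment side: keep observation.reward within [0, 1]."""
    return max(0.0, min(1.0, reward))

def clamp_logged_reward(reward: Optional[float]) -> float:
    """Inference side: keep the stdout-logged value strictly inside (0, 1)."""
    return max(0.01, min(0.99, reward or 0.0))

# A -0.15 capacity-violation penalty becomes 0.0 in the observation and
# 0.01 in the stdout log, satisfying the evaluator's range check.
assert clamp_observation_reward(-0.15) == 0.0
assert clamp_logged_reward(clamp_observation_reward(-0.15)) == 0.01

The narrower [0.01, 0.99] bound on the logging path keeps values strictly inside the evaluator's open (0, 1) interval even when the observation reward is exactly 0 or 1.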

inference.py CHANGED
@@ -355,7 +355,7 @@ async def run_task(task_id: str, env: OrchestratorClient) -> float:
 
         result = await env.step(action)
 
-        reward: float = result.reward or 0.0
+        reward: float = max(0.01, min(0.99, result.reward or 0.0))
         done: bool = result.done
         error: Optional[str] = None
         if result.observation.errors:
server/environment.py CHANGED
@@ -471,6 +471,11 @@ class OrchestratorEnvironment(
         cost_budget = self._config.constraints.get("cost_budget")
         budget_used = self._pool.get_budget_used()
 
+        # Clamp reward to [0, 1] for OpenEnv evaluation compliance.
+        # Internal _total_reward still uses unclamped values for accurate tracking;
+        # graders use episode logs, not observation rewards, so they're unaffected.
+        clamped_reward = max(0.0, min(1.0, reward))
+
         return OrchestratorObservation(
             task_description=self._config.description,
             subtasks=self._dag.get_subtask_infos(),
@@ -486,7 +491,7 @@ class OrchestratorEnvironment(
             available_actions=self._compute_available_actions(),
             hint=self._compute_hint(),
             done=self._done,
-            reward=reward,
+            reward=clamped_reward,
         )
 
     def _compute_available_actions(self) -> list[str]:
tests/test_environment.py CHANGED
@@ -88,7 +88,7 @@ class TestValidation:
         obs = _delegate(env, "technical_design", "frontend_dev")
         assert len(obs.errors) > 0
         assert "lacks capability" in obs.errors[0]
-        assert obs.reward < 0
+        assert obs.reward == 0.0  # negative reward clamped to 0 for eval compliance
 
     def test_delegate_pending_subtask(self) -> None:
         env, _ = _make_env("easy")
@@ -443,7 +443,7 @@ class TestHardTaskEdgeCases:
         obs = _retry(env, "enrich_logs", "investigator_alpha")
         assert len(obs.errors) == 1
         assert "permanent" in obs.errors[0].lower()
-        assert obs.reward < 0  # penalty applied
+        assert obs.reward == 0.0  # negative penalty clamped to 0 for eval compliance
 
     def test_monitoring_patience_failure(self) -> None:
         """Synthesizing immediately after all complete loses patience score."""