Commit db0bed5 · kartikmandar committed · 1 Parent(s): d18fe9d

fix: prevent score=0.000 in stdout for Phase 2 validation


- Change score default from 0.0 to 0.01 in inference.py
- Add raise_for_status() on grader HTTP response
- Add safety clamp max(0.01, min(0.99, score)) before log_end (sketched below)
- Remove duplicate [END] line on task timeout
- Clamp environment observation rewards to [0.01, 0.99], strictly inside (0, 1)
- Update test assertions to match new reward floor
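
The clamp in the third bullet is the load-bearing change: whatever the grader returns (or fails to return), the value printed by log_end can never be 0.000. A minimal standalone sketch of its behavior; the clamp_score helper is hypothetical, since the commit inlines max/min directly:

    def clamp_score(score: float) -> float:
        """Keep a score strictly inside (0, 1) so Phase 2 validation
        never sees the boundary values 0.000 or 1.000 in stdout."""
        return max(0.01, min(0.99, score))

    assert clamp_score(0.0) == 0.01   # the failing score=0.000 case
    assert clamp_score(1.0) == 0.99
    assert clamp_score(0.5) == 0.5    # in-range scores pass through unchanged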

inference.py CHANGED
@@ -298,7 +298,7 @@ async def run_task(task_id: str, env: OrchestratorClient) -> float:
 
     rewards: List[float] = []
     steps_taken: int = 0
-    score: float = 0.0
+    score: float = 0.01
     success: bool = False
 
     # Conversation history: sliding window of recent turns
@@ -373,8 +373,9 @@ async def run_task(task_id: str, env: OrchestratorClient) -> float:
         try:
             async with httpx.AsyncClient(base_url=ENV_URL, timeout=60.0) as http:
                 grade_resp = await http.post("/grader", json={"task_id": task_id})
+                grade_resp.raise_for_status()
                 grade_data: Dict[str, Any] = grade_resp.json()
-                score = grade_data.get("score", 0.0)
+                score = grade_data.get("score", 0.01)
         except Exception as exc:
             print(f"[DEBUG] Grader call failed: {exc}", flush=True)
             score = 0.01
@@ -382,6 +383,8 @@ async def run_task(task_id: str, env: OrchestratorClient) -> float:
         success = score >= SUCCESS_SCORE_THRESHOLD
 
     finally:
+        # Safety clamp: ensure score is strictly in (0, 1) for Phase 2 validation
+        score = max(0.01, min(0.99, score))
         log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
 
     return score
@@ -409,7 +412,7 @@ async def main() -> None:
         except asyncio.TimeoutError:
             print(f"[DEBUG] Task {task_id} timed out after {TASK_TIMEOUT_S}s", flush=True)
             scores[task_id] = 0.01
-            log_end(success=False, steps=0, score=0.01, rewards=[])
+            # Don't call log_end here; run_task's finally block already emits [END]
         finally:
             try:
                 await env.close()
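
Taken together, the inference.py changes give the grader call a single failure path: a transport error, a non-2xx status (via raise_for_status()), and a missing "score" key all resolve to the 0.01 floor, and the finally block clamps whatever survives. A condensed sketch of that flow, using a hypothetical fetch_score helper in place of the real run_task body:

    import httpx

    async def fetch_score(task_id: str, env_url: str) -> float:
        """Hypothetical distillation of run_task's grading path."""
        score = 0.01  # floor default instead of 0.0
        try:
            async with httpx.AsyncClient(base_url=env_url, timeout=60.0) as http:
                resp = await http.post("/grader", json={"task_id": task_id})
                resp.raise_for_status()  # HTTP 4xx/5xx jumps to the except branch
                score = resp.json().get("score", 0.01)
        except Exception as exc:
            print(f"[DEBUG] Grader call failed: {exc}", flush=True)
            score = 0.01
        return max(0.01, min(0.99, score))  # safety clamp before logging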
server/environment.py CHANGED
@@ -471,10 +471,10 @@ class OrchestratorEnvironment(
         cost_budget = self._config.constraints.get("cost_budget")
         budget_used = self._pool.get_budget_used()
 
-        # Clamp reward to [0, 1] for OpenEnv evaluation compliance.
+        # Clamp reward to (0, 1) exclusive for strict OpenEnv evaluation compliance.
         # Internal _total_reward still uses unclamped values for accurate tracking;
         # graders use episode logs, not observation rewards, so they're unaffected.
-        clamped_reward = max(0.0, min(1.0, reward))
+        clamped_reward = max(0.01, min(0.99, reward))
 
         return OrchestratorObservation(
             task_description=self._config.description,
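
The comment in this hunk is worth unpacking: only the copy of the reward placed on the observation is clamped, while the internal accumulator keeps the raw (possibly negative) value. A toy illustration of that two-track bookkeeping, with hypothetical names standing in for the real OrchestratorEnvironment:

    class RewardTracker:
        """Toy model of the split; not the real environment class."""

        def __init__(self) -> None:
            self.total_reward = 0.0  # unclamped, mirrors _total_reward

        def observe(self, reward: float) -> float:
            self.total_reward += reward          # a -0.5 penalty still counts here
            return max(0.01, min(0.99, reward))  # but the observation stays in (0, 1)

    tracker = RewardTracker()
    assert tracker.observe(-0.5) == 0.01  # matches the updated test assertions
    assert tracker.total_reward == -0.5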
tests/test_environment.py CHANGED
@@ -49,7 +49,7 @@ class TestReset:
         env, obs = _make_env("easy")
         assert isinstance(obs, OrchestratorObservation)
         assert obs.done is False
-        assert obs.reward == 0.0
+        assert obs.reward == 0.01  # clamped to (0, 1) exclusive for eval compliance
         assert obs.time_remaining == 15
         assert obs.time_elapsed == 0
 
@@ -88,7 +88,7 @@ class TestValidation:
         obs = _delegate(env, "technical_design", "frontend_dev")
         assert len(obs.errors) > 0
         assert "lacks capability" in obs.errors[0]
-        assert obs.reward == 0.0  # negative reward clamped to 0 for eval compliance
+        assert obs.reward == 0.01  # negative reward clamped to (0, 1) exclusive for eval compliance
 
     def test_delegate_pending_subtask(self) -> None:
         env, _ = _make_env("easy")
@@ -443,7 +443,7 @@ class TestHardTaskEdgeCases:
         obs = _retry(env, "enrich_logs", "investigator_alpha")
         assert len(obs.errors) == 1
         assert "permanent" in obs.errors[0].lower()
-        assert obs.reward == 0.0  # negative penalty clamped to 0 for eval compliance
+        assert obs.reward == 0.01  # negative penalty clamped to (0, 1) exclusive for eval compliance
 
     def test_monitoring_patience_failure(self) -> None:
         """Synthesizing immediately after all complete loses patience score."""