Commit ·
db0bed5
1
Parent(s): d18fe9d
fix: prevent score=0.000 in stdout for Phase 2 validation
Browse files
- Change score default from 0.0 to 0.01 in inference.py
- Add raise_for_status() on grader HTTP response
- Add safety clamp max(0.01, min(0.99, score)) before log_end
- Remove duplicate [END] line on task timeout
- Clamp environment observation rewards to (0.01, 0.99) exclusive
- Update test assertions to match new reward floor
- inference.py +6 -3
- server/environment.py +2 -2
- tests/test_environment.py +3 -3
inference.py
CHANGED
|
@@ -298,7 +298,7 @@ async def run_task(task_id: str, env: OrchestratorClient) -> float:
|
|
| 298 |
|
| 299 |
rewards: List[float] = []
|
| 300 |
steps_taken: int = 0
|
| 301 |
-
score: float = 0.0
|
| 302 |
success: bool = False
|
| 303 |
|
| 304 |
# Conversation history: sliding window of recent turns
|
|
@@ -373,8 +373,9 @@ async def run_task(task_id: str, env: OrchestratorClient) -> float:
|
|
| 373 |
try:
|
| 374 |
async with httpx.AsyncClient(base_url=ENV_URL, timeout=60.0) as http:
|
| 375 |
grade_resp = await http.post("/grader", json={"task_id": task_id})
|
|
|
|
| 376 |
grade_data: Dict[str, Any] = grade_resp.json()
|
| 377 |
-
score = grade_data.get("score", 0.0)
|
| 378 |
except Exception as exc:
|
| 379 |
print(f"[DEBUG] Grader call failed: {exc}", flush=True)
|
| 380 |
score = 0.01
|
|
@@ -382,6 +383,8 @@ async def run_task(task_id: str, env: OrchestratorClient) -> float:
|
|
| 382 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 383 |
|
| 384 |
finally:
|
|
|
|
|
|
|
| 385 |
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 386 |
|
| 387 |
return score
|
|
@@ -409,7 +412,7 @@ async def main() -> None:
|
|
| 409 |
except asyncio.TimeoutError:
|
| 410 |
print(f"[DEBUG] Task {task_id} timed out after {TASK_TIMEOUT_S}s", flush=True)
|
| 411 |
scores[task_id] = 0.01
|
| 412 |
-
log_end
|
| 413 |
finally:
|
| 414 |
try:
|
| 415 |
await env.close()
|
|
|
|
| 298 |
|
| 299 |
rewards: List[float] = []
|
| 300 |
steps_taken: int = 0
|
| 301 |
+
score: float = 0.01
|
| 302 |
success: bool = False
|
| 303 |
|
| 304 |
# Conversation history: sliding window of recent turns
|
|
|
|
| 373 |
try:
|
| 374 |
async with httpx.AsyncClient(base_url=ENV_URL, timeout=60.0) as http:
|
| 375 |
grade_resp = await http.post("/grader", json={"task_id": task_id})
|
| 376 |
+
grade_resp.raise_for_status()
|
| 377 |
grade_data: Dict[str, Any] = grade_resp.json()
|
| 378 |
+
score = grade_data.get("score", 0.01)
|
| 379 |
except Exception as exc:
|
| 380 |
print(f"[DEBUG] Grader call failed: {exc}", flush=True)
|
| 381 |
score = 0.01
|
|
|
|
| 383 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 384 |
|
| 385 |
finally:
|
| 386 |
+
# Safety clamp: ensure score is strictly in (0, 1) for Phase 2 validation
|
| 387 |
+
score = max(0.01, min(0.99, score))
|
| 388 |
log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
|
| 389 |
|
| 390 |
return score
|
|
|
|
| 412 |
except asyncio.TimeoutError:
|
| 413 |
print(f"[DEBUG] Task {task_id} timed out after {TASK_TIMEOUT_S}s", flush=True)
|
| 414 |
scores[task_id] = 0.01
|
| 415 |
+
# Don't call log_end here — run_task's finally block already emits [END]
|
| 416 |
finally:
|
| 417 |
try:
|
| 418 |
await env.close()
|
server/environment.py
CHANGED
|
@@ -471,10 +471,10 @@ class OrchestratorEnvironment(
|
|
| 471 |
cost_budget = self._config.constraints.get("cost_budget")
|
| 472 |
budget_used = self._pool.get_budget_used()
|
| 473 |
|
| 474 |
-
# Clamp reward to [0, 1] for OpenEnv evaluation compliance.
|
| 475 |
# Internal _total_reward still uses unclamped values for accurate tracking;
|
| 476 |
# graders use episode logs, not observation rewards, so they're unaffected.
|
| 477 |
-
clamped_reward = max(0.0, min(1.0, reward))
|
| 478 |
|
| 479 |
return OrchestratorObservation(
|
| 480 |
task_description=self._config.description,
|
|
|
|
| 471 |
cost_budget = self._config.constraints.get("cost_budget")
|
| 472 |
budget_used = self._pool.get_budget_used()
|
| 473 |
|
| 474 |
+
# Clamp reward to (0, 1) exclusive for strict OpenEnv evaluation compliance.
|
| 475 |
# Internal _total_reward still uses unclamped values for accurate tracking;
|
| 476 |
# graders use episode logs, not observation rewards, so they're unaffected.
|
| 477 |
+
clamped_reward = max(0.01, min(0.99, reward))
|
| 478 |
|
| 479 |
return OrchestratorObservation(
|
| 480 |
task_description=self._config.description,
|
tests/test_environment.py
CHANGED
|
@@ -49,7 +49,7 @@ class TestReset:
|
|
| 49 |
env, obs = _make_env("easy")
|
| 50 |
assert isinstance(obs, OrchestratorObservation)
|
| 51 |
assert obs.done is False
|
| 52 |
-
assert obs.reward == 0.0
|
| 53 |
assert obs.time_remaining == 15
|
| 54 |
assert obs.time_elapsed == 0
|
| 55 |
|
|
@@ -88,7 +88,7 @@ class TestValidation:
|
|
| 88 |
obs = _delegate(env, "technical_design", "frontend_dev")
|
| 89 |
assert len(obs.errors) > 0
|
| 90 |
assert "lacks capability" in obs.errors[0]
|
| 91 |
-
assert obs.reward == 0.0
|
| 92 |
|
| 93 |
def test_delegate_pending_subtask(self) -> None:
|
| 94 |
env, _ = _make_env("easy")
|
|
@@ -443,7 +443,7 @@ class TestHardTaskEdgeCases:
|
|
| 443 |
obs = _retry(env, "enrich_logs", "investigator_alpha")
|
| 444 |
assert len(obs.errors) == 1
|
| 445 |
assert "permanent" in obs.errors[0].lower()
|
| 446 |
-
assert obs.reward == 0.0
|
| 447 |
|
| 448 |
def test_monitoring_patience_failure(self) -> None:
|
| 449 |
"""Synthesizing immediately after all complete loses patience score."""
|
|
|
|
| 49 |
env, obs = _make_env("easy")
|
| 50 |
assert isinstance(obs, OrchestratorObservation)
|
| 51 |
assert obs.done is False
|
| 52 |
+
assert obs.reward == 0.01 # clamped to (0, 1) exclusive for eval compliance
|
| 53 |
assert obs.time_remaining == 15
|
| 54 |
assert obs.time_elapsed == 0
|
| 55 |
|
|
|
|
| 88 |
obs = _delegate(env, "technical_design", "frontend_dev")
|
| 89 |
assert len(obs.errors) > 0
|
| 90 |
assert "lacks capability" in obs.errors[0]
|
| 91 |
+
assert obs.reward == 0.01 # negative reward clamped to (0, 1) exclusive for eval compliance
|
| 92 |
|
| 93 |
def test_delegate_pending_subtask(self) -> None:
|
| 94 |
env, _ = _make_env("easy")
|
|
|
|
| 443 |
obs = _retry(env, "enrich_logs", "investigator_alpha")
|
| 444 |
assert len(obs.errors) == 1
|
| 445 |
assert "permanent" in obs.errors[0].lower()
|
| 446 |
+
assert obs.reward == 0.01 # negative penalty clamped to (0, 1) exclusive for eval compliance
|
| 447 |
|
| 448 |
def test_monitoring_patience_failure(self) -> None:
|
| 449 |
"""Synthesizing immediately after all complete loses patience score."""
|