Spaces:
Sleeping
Sleeping
Commit ·
1af75ba
1
Parent(s): 233154d
fix: increase grader score epsilon to survive .3f stdout formatting
Browse files
Phase 2 failed because _SCORE_EPS=0.0001 rounded to 0.000 or 1.000
when formatted with :.3f in the [END] log line, causing the evaluator
to reject scores as "not strictly between 0 and 1". Bumped epsilon to
0.01 so minimum/maximum scores format as 0.010/0.990. Also fixed
fallback score=0.0 in inference.py error paths.
- inference.py +3 -3
- server/graders.py +2 -2
- tests/test_graders.py +1 -1
inference.py
CHANGED
|
@@ -377,7 +377,7 @@ async def run_task(task_id: str, env: OrchestratorClient) -> float:
|
|
| 377 |
score = grade_data.get("score", 0.0)
|
| 378 |
except Exception as exc:
|
| 379 |
print(f"[DEBUG] Grader call failed: {exc}", flush=True)
|
| 380 |
-
score = 0.0
|
| 381 |
|
| 382 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 383 |
|
|
@@ -408,8 +408,8 @@ async def main() -> None:
|
|
| 408 |
)
|
| 409 |
except asyncio.TimeoutError:
|
| 410 |
print(f"[DEBUG] Task {task_id} timed out after {TASK_TIMEOUT_S}s", flush=True)
|
| 411 |
-
scores[task_id] = 0.0
|
| 412 |
-
log_end(success=False, steps=0, score=0.0, rewards=[])
|
| 413 |
finally:
|
| 414 |
try:
|
| 415 |
await env.close()
|
|
|
|
| 377 |
score = grade_data.get("score", 0.0)
|
| 378 |
except Exception as exc:
|
| 379 |
print(f"[DEBUG] Grader call failed: {exc}", flush=True)
|
| 380 |
+
score = 0.01
|
| 381 |
|
| 382 |
success = score >= SUCCESS_SCORE_THRESHOLD
|
| 383 |
|
|
|
|
| 408 |
)
|
| 409 |
except asyncio.TimeoutError:
|
| 410 |
print(f"[DEBUG] Task {task_id} timed out after {TASK_TIMEOUT_S}s", flush=True)
|
| 411 |
+
scores[task_id] = 0.01
|
| 412 |
+
log_end(success=False, steps=0, score=0.01, rewards=[])
|
| 413 |
finally:
|
| 414 |
try:
|
| 415 |
await env.close()
|
server/graders.py
CHANGED
|
@@ -514,8 +514,8 @@ _GRADERS = {
|
|
| 514 |
|
| 515 |
|
| 516 |
# Epsilon for strict open-interval (0, 1) compliance.
|
| 517 |
-
# Applied after per-grader rounding so that 0.0000 -> 0.0001 and 1.0000 -> 0.9999.
|
| 518 |
-
_SCORE_EPS = 0.0001
|
| 519 |
|
| 520 |
|
| 521 |
def grade(task_id: str, log: EpisodeLog) -> GradeResult:
|
|
|
|
| 514 |
|
| 515 |
|
| 516 |
# Epsilon for strict open-interval (0, 1) compliance.
|
| 517 |
+
# Applied after per-grader rounding so that 0.0000 -> 0.01 and 1.0000 -> 0.99.
|
| 518 |
+
_SCORE_EPS = 0.01
|
| 519 |
|
| 520 |
|
| 521 |
def grade(task_id: str, log: EpisodeLog) -> GradeResult:
|
tests/test_graders.py
CHANGED
|
@@ -264,7 +264,7 @@ class TestDegeneratePolicies:
|
|
| 264 |
|
| 265 |
def test_do_nothing_easy_scores_zero(self) -> None:
|
| 266 |
result = self._run_do_nothing("easy")
|
| 267 |
-
assert result.score == 0.0001
|
| 268 |
|
| 269 |
def test_do_nothing_medium_scores_near_zero(self) -> None:
|
| 270 |
result = self._run_do_nothing("medium")
|
|
|
|
| 264 |
|
| 265 |
def test_do_nothing_easy_scores_zero(self) -> None:
|
| 266 |
result = self._run_do_nothing("easy")
|
| 267 |
+
assert result.score == 0.01
|
| 268 |
|
| 269 |
def test_do_nothing_medium_scores_near_zero(self) -> None:
|
| 270 |
result = self._run_do_nothing("medium")
|