kartikmandar committed on
Commit
1af75ba
·
1 Parent(s): 233154d

fix: increase grader score epsilon to survive .3f stdout formatting

Browse files

Phase 2 failed because scores clamped with _SCORE_EPS=0.0001 (i.e. 0.0001
and 0.9999) rounded to 0.000 and 1.000 when formatted with :.3f in the
[END] log line, causing the evaluator to reject them as "not strictly
between 0 and 1". Bumped epsilon to 0.01 so minimum/maximum scores format
as 0.010/0.990. Also changed the fallback score from 0.0 to 0.01 in the
inference.py error paths.

Files changed (3) hide show
  1. inference.py +3 -3
  2. server/graders.py +2 -2
  3. tests/test_graders.py +1 -1
inference.py CHANGED
@@ -377,7 +377,7 @@ async def run_task(task_id: str, env: OrchestratorClient) -> float:
377
  score = grade_data.get("score", 0.0)
378
  except Exception as exc:
379
  print(f"[DEBUG] Grader call failed: {exc}", flush=True)
380
- score = 0.0
381
 
382
  success = score >= SUCCESS_SCORE_THRESHOLD
383
 
@@ -408,8 +408,8 @@ async def main() -> None:
408
  )
409
  except asyncio.TimeoutError:
410
  print(f"[DEBUG] Task {task_id} timed out after {TASK_TIMEOUT_S}s", flush=True)
411
- scores[task_id] = 0.0
412
- log_end(success=False, steps=0, score=0.0, rewards=[])
413
  finally:
414
  try:
415
  await env.close()
 
377
  score = grade_data.get("score", 0.0)
378
  except Exception as exc:
379
  print(f"[DEBUG] Grader call failed: {exc}", flush=True)
380
+ score = 0.01
381
 
382
  success = score >= SUCCESS_SCORE_THRESHOLD
383
 
 
408
  )
409
  except asyncio.TimeoutError:
410
  print(f"[DEBUG] Task {task_id} timed out after {TASK_TIMEOUT_S}s", flush=True)
411
+ scores[task_id] = 0.01
412
+ log_end(success=False, steps=0, score=0.01, rewards=[])
413
  finally:
414
  try:
415
  await env.close()
server/graders.py CHANGED
@@ -514,8 +514,8 @@ _GRADERS = {
514
 
515
 
516
  # Epsilon for strict open-interval (0, 1) compliance.
517
- # Applied after per-grader rounding so that 0.0000 -> 0.0001 and 1.0000 -> 0.9999.
518
- _SCORE_EPS = 0.0001
519
 
520
 
521
  def grade(task_id: str, log: EpisodeLog) -> GradeResult:
 
514
 
515
 
516
  # Epsilon for strict open-interval (0, 1) compliance.
517
+ # Applied after per-grader rounding so that 0.0000 -> 0.01 and 1.0000 -> 0.99.
518
+ _SCORE_EPS = 0.01
519
 
520
 
521
  def grade(task_id: str, log: EpisodeLog) -> GradeResult:
tests/test_graders.py CHANGED
@@ -264,7 +264,7 @@ class TestDegeneratePolicies:
264
 
265
  def test_do_nothing_easy_scores_zero(self) -> None:
266
  result = self._run_do_nothing("easy")
267
- assert result.score == 0.0001
268
 
269
  def test_do_nothing_medium_scores_near_zero(self) -> None:
270
  result = self._run_do_nothing("medium")
 
264
 
265
  def test_do_nothing_easy_scores_zero(self) -> None:
266
  result = self._run_do_nothing("easy")
267
+ assert result.score == 0.01
268
 
269
  def test_do_nothing_medium_scores_near_zero(self) -> None:
270
  result = self._run_do_nothing("medium")