Spaces:

kartikmandar
/

workflow-orchestrator

Sleeping

kartikmandar committed on 17 days ago

Commit

1af75ba

1 Parent(s): 233154d

fix: increase grader score epsilon to survive .3f stdout formatting

Phase 2 failed because scores clamped with _SCORE_EPS=0.0001 (i.e. 0.0001
and 0.9999) rounded to 0.000 or 1.000 when formatted with :.3f in the
[END] log line, causing the evaluator to reject scores as "not strictly
between 0 and 1". Bumped epsilon to 0.01 so minimum/maximum scores format
as 0.010/0.990. Also changed the fallback score in the inference.py error
paths from 0.0 to 0.01 so it stays inside the open interval as well.

Files changed (3) hide show

inference.py +3 -3
server/graders.py +2 -2
tests/test_graders.py +1 -1

inference.py CHANGED Viewed

@@ -377,7 +377,7 @@ async def run_task(task_id: str, env: OrchestratorClient) -> float:
             score = grade_data.get("score", 0.0)
         except Exception as exc:
             print(f"[DEBUG] Grader call failed: {exc}", flush=True)
-            score = 0.0
         success = score >= SUCCESS_SCORE_THRESHOLD
@@ -408,8 +408,8 @@ async def main() -> None:
                 )
             except asyncio.TimeoutError:
                 print(f"[DEBUG] Task {task_id} timed out after {TASK_TIMEOUT_S}s", flush=True)
-                scores[task_id] = 0.0
-                log_end(success=False, steps=0, score=0.0, rewards=[])
     finally:
         try:
             await env.close()

             score = grade_data.get("score", 0.0)
         except Exception as exc:
             print(f"[DEBUG] Grader call failed: {exc}", flush=True)
+            score = 0.01
         success = score >= SUCCESS_SCORE_THRESHOLD
                 )
             except asyncio.TimeoutError:
                 print(f"[DEBUG] Task {task_id} timed out after {TASK_TIMEOUT_S}s", flush=True)
+                scores[task_id] = 0.01
+                log_end(success=False, steps=0, score=0.01, rewards=[])
     finally:
         try:
             await env.close()

server/graders.py CHANGED Viewed

@@ -514,8 +514,8 @@ _GRADERS = {
 # Epsilon for strict open-interval (0, 1) compliance.
-# Applied after per-grader rounding so that 0.0000 -> 0.0001 and 1.0000 -> 0.9999.
-_SCORE_EPS = 0.0001
 def grade(task_id: str, log: EpisodeLog) -> GradeResult:

 # Epsilon for strict open-interval (0, 1) compliance.
+# Applied after per-grader rounding so that 0.0000 -> 0.01 and 1.0000 -> 0.99.
+_SCORE_EPS = 0.01
 def grade(task_id: str, log: EpisodeLog) -> GradeResult:

tests/test_graders.py CHANGED Viewed

@@ -264,7 +264,7 @@ class TestDegeneratePolicies:
     def test_do_nothing_easy_scores_zero(self) -> None:
         result = self._run_do_nothing("easy")
-        assert result.score == 0.0001
     def test_do_nothing_medium_scores_near_zero(self) -> None:
         result = self._run_do_nothing("medium")

     def test_do_nothing_easy_scores_zero(self) -> None:
         result = self._run_do_nothing("easy")
+        assert result.score == 0.01
     def test_do_nothing_medium_scores_near_zero(self) -> None:
         result = self._run_do_nothing("medium")