Spaces:

ar9av
/

sql-agent-openenv

Sleeping

ar9avg committed on Apr 11

Commit

d2d92b8

1 Parent(s): 63d67f0

Fix task scores to be strictly in (0, 1) exclusive per OpenEnv spec

Clamp all grader outputs in grade_response() and the score calculation
in inference.py using epsilon bounds, to satisfy the validator's
requirement that scores must never be exactly 0.0 or 1.0.

Files changed (2) hide show

backend/env/tasks.py +6 -1
inference.py +4 -2

backend/env/tasks.py CHANGED Viewed

@@ -330,6 +330,9 @@ def get_all_tasks() -> list[Task]:
     return list(TASKS.values())
 def grade_response(
     task_id: str,
     question_id: str,
@@ -342,4 +345,6 @@ def grade_response(
     question = next((q for q in task.questions if q.id == question_id), None)
     if question is None:
         raise ValueError(f"Unknown question_id {question_id!r} in task {task_id!r}")
-    return task.grader(question, sql, rows, error, attempts)

     return list(TASKS.values())
+_EPS = 1e-9
 def grade_response(
     task_id: str,
     question_id: str,
     question = next((q for q in task.questions if q.id == question_id), None)
     if question is None:
         raise ValueError(f"Unknown question_id {question_id!r} in task {task_id!r}")
+    raw = task.grader(question, sql, rows, error, attempts)
+    # Score must be strictly in (0, 1) exclusive per OpenEnv spec
+    return max(_EPS, min(1.0 - _EPS, raw))

inference.py CHANGED Viewed

@@ -200,10 +200,12 @@ async def run_episode(
             if done:
                 break
-        # Score: clamp sum of rewards to [0, 1]
         total = sum(rewards)
         max_possible = MAX_STEPS * 1.0  # max reward per step is 1.0
-        score = min(max(total / max_possible, 0.0), 1.0)
     finally:
         log_end(

             if done:
                 break
+        # Score: clamp sum of rewards to (0, 1) exclusive per OpenEnv spec
+        _EPS = 1e-9
         total = sum(rewards)
         max_possible = MAX_STEPS * 1.0  # max reward per step is 1.0
+        raw_score = total / max_possible if max_possible > 0 else _EPS
+        score = max(_EPS, min(1.0 - _EPS, raw_score))
     finally:
         log_end(