ar9avg committed on
Commit
d2d92b8
·
1 Parent(s): 63d67f0

Fix task scores to lie strictly within the open interval (0, 1), per the OpenEnv spec

Browse files

Clamp all grader outputs in grade_response(), and the episode score
computed in inference.py, using epsilon bounds to satisfy the validator's
requirement that scores must never be exactly 0.0 or 1.0.

Files changed (2) hide show
  1. backend/env/tasks.py +6 -1
  2. inference.py +4 -2
backend/env/tasks.py CHANGED
@@ -330,6 +330,9 @@ def get_all_tasks() -> list[Task]:
330
  return list(TASKS.values())
331
 
332
 
 
 
 
333
  def grade_response(
334
  task_id: str,
335
  question_id: str,
@@ -342,4 +345,6 @@ def grade_response(
342
  question = next((q for q in task.questions if q.id == question_id), None)
343
  if question is None:
344
  raise ValueError(f"Unknown question_id {question_id!r} in task {task_id!r}")
345
- return task.grader(question, sql, rows, error, attempts)
 
 
 
330
  return list(TASKS.values())
331
 
332
 
333
+ _EPS = 1e-9
334
+
335
+
336
  def grade_response(
337
  task_id: str,
338
  question_id: str,
 
345
  question = next((q for q in task.questions if q.id == question_id), None)
346
  if question is None:
347
  raise ValueError(f"Unknown question_id {question_id!r} in task {task_id!r}")
348
+ raw = task.grader(question, sql, rows, error, attempts)
349
+ # Score must be strictly in (0, 1) exclusive per OpenEnv spec
350
+ return max(_EPS, min(1.0 - _EPS, raw))
inference.py CHANGED
@@ -200,10 +200,12 @@ async def run_episode(
200
  if done:
201
  break
202
 
203
- # Score: clamp sum of rewards to [0, 1]
 
204
  total = sum(rewards)
205
  max_possible = MAX_STEPS * 1.0 # max reward per step is 1.0
206
- score = min(max(total / max_possible, 0.0), 1.0)
 
207
 
208
  finally:
209
  log_end(
 
200
  if done:
201
  break
202
 
203
+ # Score: clamp sum of rewards to (0, 1) exclusive per OpenEnv spec
204
+ _EPS = 1e-9
205
  total = sum(rewards)
206
  max_possible = MAX_STEPS * 1.0 # max reward per step is 1.0
207
+ raw_score = total / max_possible if max_possible > 0 else _EPS
208
+ score = max(_EPS, min(1.0 - _EPS, raw_score))
209
 
210
  finally:
211
  log_end(