Spaces:
Sleeping
Sleeping
Fix task scores to be strictly in (0, 1) exclusive per OpenEnv spec
Browse files
Clamp all grader outputs in grade_response() and the inference.py score
calculation using epsilon bounds to satisfy the validator requirement that
scores must not be exactly 0.0 or 1.0.
- backend/env/tasks.py +6 -1
- inference.py +4 -2
backend/env/tasks.py
CHANGED
|
@@ -330,6 +330,9 @@ def get_all_tasks() -> list[Task]:
|
|
| 330 |
return list(TASKS.values())
|
| 331 |
|
| 332 |
|
|
|
|
|
|
|
|
|
|
| 333 |
def grade_response(
|
| 334 |
task_id: str,
|
| 335 |
question_id: str,
|
|
@@ -342,4 +345,6 @@ def grade_response(
|
|
| 342 |
question = next((q for q in task.questions if q.id == question_id), None)
|
| 343 |
if question is None:
|
| 344 |
raise ValueError(f"Unknown question_id {question_id!r} in task {task_id!r}")
|
| 345 |
-
|
|
|
|
|
|
|
|
|
| 330 |
return list(TASKS.values())
|
| 331 |
|
| 332 |
|
| 333 |
+
_EPS = 1e-9
|
| 334 |
+
|
| 335 |
+
|
| 336 |
def grade_response(
|
| 337 |
task_id: str,
|
| 338 |
question_id: str,
|
|
|
|
| 345 |
question = next((q for q in task.questions if q.id == question_id), None)
|
| 346 |
if question is None:
|
| 347 |
raise ValueError(f"Unknown question_id {question_id!r} in task {task_id!r}")
|
| 348 |
+
raw = task.grader(question, sql, rows, error, attempts)
|
| 349 |
+
# Score must be strictly in (0, 1) exclusive per OpenEnv spec
|
| 350 |
+
return max(_EPS, min(1.0 - _EPS, raw))
|
inference.py
CHANGED
|
@@ -200,10 +200,12 @@ async def run_episode(
|
|
| 200 |
if done:
|
| 201 |
break
|
| 202 |
|
| 203 |
-
# Score: clamp sum of rewards to
|
|
|
|
| 204 |
total = sum(rewards)
|
| 205 |
max_possible = MAX_STEPS * 1.0 # max reward per step is 1.0
|
| 206 |
-
|
|
|
|
| 207 |
|
| 208 |
finally:
|
| 209 |
log_end(
|
|
|
|
| 200 |
if done:
|
| 201 |
break
|
| 202 |
|
| 203 |
+
# Score: clamp sum of rewards to (0, 1) exclusive per OpenEnv spec
|
| 204 |
+
_EPS = 1e-9
|
| 205 |
total = sum(rewards)
|
| 206 |
max_possible = MAX_STEPS * 1.0 # max reward per step is 1.0
|
| 207 |
+
raw_score = total / max_possible if max_possible > 0 else _EPS
|
| 208 |
+
score = max(_EPS, min(1.0 - _EPS, raw_score))
|
| 209 |
|
| 210 |
finally:
|
| 211 |
log_end(
|