Spaces:
Sleeping
Sleeping
| """Strict (0, 1) grader for SQL repair tasks. | |
| Phase 2 hard requirement: scores MUST be in the OPEN interval (0, 1). | |
| Validator rejects exactly 0.0 and exactly 1.0. NaN/inf are also rejected, | |
| so we coerce them to 0.5 (a neutral, in-range fallback). | |
| """ | |
| from __future__ import annotations | |
| import math | |
| from typing import Any | |
# Shared score bounds. inference.py reads these too, so both modules clamp
# to the same limits.
SCORE_MIN: float = 0.001  # lowest score ever emitted; strictly above 0
SCORE_MAX: float = 0.999  # highest clamp target; strictly below 1
def strict_clamp(value: Any) -> float:
    """Coerce any input into a float strictly inside the OPEN interval (0, 1).

    NaN, inf, -inf, non-numeric inputs, and integers too large to be
    represented as a float all collapse to 0.5 (a neutral, in-range fallback).

    Two hard invariants from Canary's Phase 2 failures:
      1. Never emit exactly 0.0 or 1.0 (validator rejects endpoints).
      2. After rounding for display (.4f), the value must STILL be strictly
         inside (0, 1). A tiny raw value like 0.00004 would round to 0.0000
         and trip the validator, so we floor to SCORE_MIN in that case.

    Args:
        value: Anything; it is passed through ``float()``.

    Returns:
        0.5 when the input cannot be turned into a finite float;
        SCORE_MIN when the value (before or after rounding) is <= 0;
        SCORE_MAX when the value (before or after rounding) is >= 1;
        otherwise the value rounded to 4 decimal places.
    """
    try:
        score = float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: float() raises it for ints outside the float range
        # (e.g. 10**400). Treat it like inf and use the neutral fallback
        # instead of letting the grader crash.
        return 0.5

    if not math.isfinite(score):  # NaN, inf, -inf
        return 0.5

    if score <= 0.0:
        return SCORE_MIN
    if score >= 1.0:
        return SCORE_MAX

    # Round to the display precision, then re-check: rounding can push a
    # value that was strictly inside (0, 1) onto an endpoint.
    rounded = round(score, 4)
    if rounded <= 0.0:
        return SCORE_MIN
    if rounded >= 1.0:
        return SCORE_MAX
    return rounded
def grade_task(state, task_id: str) -> float:
    """Compute the clamped score for ``state`` on the task ``task_id``.

    If ``state.task_id`` differs from ``task_id`` the result is SCORE_MIN.
    Otherwise the following components are added up and the sum is passed
    through ``strict_clamp``:

      - 0.05 : ``state.last_query`` is truthy (something was submitted)
      - 0.25 : ``state.last_error`` is None and ``state.last_result`` is
               not None (the last query ran without error)
      - 0.60 : ``state.last_result`` equals a non-empty
               ``state.expected_rows``
      - up to 0.09 : efficiency bonus, granted when ``state.solved`` and
               ``state.step_count > 0``; it scales with the fraction of
               MAX_STEPS left unused, so fewer steps score higher

    Reference points:
      - no submission: raw 0.0, which strict_clamp maps to SCORE_MIN
      - query ran but rows are wrong: raw 0.30
      - solved: raw 0.90 plus a bonus strictly below 0.09 (because
        step_count is at least 1), so the total stays under 0.99

    Args:
        state: Environment state object exposing ``task_id``,
            ``last_query``, ``last_error``, ``last_result``,
            ``expected_rows``, ``solved`` and ``step_count``.
        task_id: Identifier of the task being graded.

    Returns:
        A float produced by ``strict_clamp`` (or SCORE_MIN on a task
        mismatch).
    """
    # Imported here rather than at module level to avoid a circular import.
    from .env_core import MAX_STEPS

    if state.task_id != task_id:
        return SCORE_MIN

    total = 0.0

    if state.last_query:
        total += 0.05

    ran_cleanly = state.last_error is None and state.last_result is not None
    if ran_cleanly:
        total += 0.25

    # The trailing truthiness check keeps an empty/None expectation from
    # counting as a match.
    rows_match = state.last_result == state.expected_rows
    if rows_match and state.expected_rows:
        total += 0.60

    if state.solved and state.step_count > 0:
        # max(0, ...) keeps the bonus non-negative if step_count exceeds
        # MAX_STEPS.
        steps_left = max(0, MAX_STEPS - state.step_count)
        total += 0.09 * steps_left / MAX_STEPS

    return strict_clamp(total)