Spaces:
Sleeping
Sleeping
Nuclear clamp: every reward source in the codebase now returns (0.05, 0.95)
Browse files
- rl/grader.py: compute_reward and compute_episode_reward both clamp
- rl/environment.py: record_step and end_episode clamp returned values
- openenv.yaml: reward.range set to [0.05, 0.95]
- /env/info: reward_range matches yaml
- /health: returns "healthy" (openenv-core expects this exact string)
- backend/api/openenv.py +1 -1
- backend/main.py +1 -1
- backend/rl/environment.py +3 -3
- backend/rl/grader.py +24 -22
- openenv.yaml +1 -1
backend/api/openenv.py
CHANGED
|
@@ -162,7 +162,7 @@ async def env_info():
|
|
| 162 |
"task_difficulty",
|
| 163 |
],
|
| 164 |
},
|
| 165 |
-
"reward_range": [0.
|
| 166 |
"max_steps": 5,
|
| 167 |
"tasks": ["simple_queries", "join_queries", "complex_queries"],
|
| 168 |
"rl_algorithm": "LinUCB (contextual bandit)",
|
|
|
|
| 162 |
"task_difficulty",
|
| 163 |
],
|
| 164 |
},
|
| 165 |
+
"reward_range": [0.05, 0.95],
|
| 166 |
"max_steps": 5,
|
| 167 |
"tasks": ["simple_queries", "join_queries", "complex_queries"],
|
| 168 |
"rl_algorithm": "LinUCB (contextual bandit)",
|
backend/main.py
CHANGED
|
@@ -74,7 +74,7 @@ async def root_state():
|
|
| 74 |
|
| 75 |
@app.get("/health", tags=["system"])
|
| 76 |
async def health():
|
| 77 |
-
return {"status": "
|
| 78 |
|
| 79 |
|
| 80 |
# ─── Startup ─────────────────────────────────────────────────────
|
|
|
|
| 74 |
|
| 75 |
@app.get("/health", tags=["system"])
|
| 76 |
async def health():
|
| 77 |
+
return {"status": "healthy", "service": "sql-agent-openenv"}
|
| 78 |
|
| 79 |
|
| 80 |
# ─── Startup ─────────────────────────────────────────────────────
|
backend/rl/environment.py
CHANGED
|
@@ -29,7 +29,7 @@ from rl.types import (
|
|
| 29 |
ERROR_CLASS_NAMES,
|
| 30 |
)
|
| 31 |
from rl.error_classifier import classify_error, extract_offending_token
|
| 32 |
-
from rl.grader import GraderInput, compute_reward
|
| 33 |
from rl.linucb import LinUCB
|
| 34 |
from rl.experience import record_episode, get_metrics, reset_experience
|
| 35 |
from rl.repair_strategies import (
|
|
@@ -196,7 +196,7 @@ def record_step(
|
|
| 196 |
_current_episode.previous_error_class = state.error_class
|
| 197 |
|
| 198 |
return {
|
| 199 |
-
"reward": result.reward,
|
| 200 |
"breakdown": {
|
| 201 |
"base": result.breakdown.base,
|
| 202 |
"attempt_penalty": result.breakdown.attempt_penalty,
|
|
@@ -229,7 +229,7 @@ def end_episode(success: bool) -> Optional[dict]:
|
|
| 229 |
b.decay_alpha()
|
| 230 |
|
| 231 |
result = {
|
| 232 |
-
"total_reward": episode.total_reward,
|
| 233 |
"episode_length": len(episode.steps),
|
| 234 |
}
|
| 235 |
|
|
|
|
| 29 |
ERROR_CLASS_NAMES,
|
| 30 |
)
|
| 31 |
from rl.error_classifier import classify_error, extract_offending_token
|
| 32 |
+
from rl.grader import GraderInput, compute_reward, _clamp
|
| 33 |
from rl.linucb import LinUCB
|
| 34 |
from rl.experience import record_episode, get_metrics, reset_experience
|
| 35 |
from rl.repair_strategies import (
|
|
|
|
| 196 |
_current_episode.previous_error_class = state.error_class
|
| 197 |
|
| 198 |
return {
|
| 199 |
+
"reward": _clamp(result.reward),
|
| 200 |
"breakdown": {
|
| 201 |
"base": result.breakdown.base,
|
| 202 |
"attempt_penalty": result.breakdown.attempt_penalty,
|
|
|
|
| 229 |
b.decay_alpha()
|
| 230 |
|
| 231 |
result = {
|
| 232 |
+
"total_reward": _clamp(episode.total_reward),
|
| 233 |
"episode_length": len(episode.steps),
|
| 234 |
}
|
| 235 |
|
backend/rl/grader.py
CHANGED
|
@@ -1,15 +1,7 @@
|
|
| 1 |
"""
|
| 2 |
Shaped reward function for the SQL debug RL environment.
|
| 3 |
|
| 4 |
-
|
| 5 |
-
+1.0 base success reward
|
| 6 |
-
-0.1 per attempt (attempt penalty — incentivizes early resolution)
|
| 7 |
-
+0.2 if error severity decreased (progress signal)
|
| 8 |
-
+0.1 if error class changed at all (exploration signal)
|
| 9 |
-
-0.1 base failure penalty per step
|
| 10 |
-
|
| 11 |
-
The shaping is potential-based (Ng et al., 1999), preserving
|
| 12 |
-
the optimal policy while accelerating learning.
|
| 13 |
"""
|
| 14 |
|
| 15 |
from __future__ import annotations
|
|
@@ -21,6 +13,17 @@ from rl.types import ErrorClass
|
|
| 21 |
from rl.error_classifier import error_severity
|
| 22 |
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
@dataclass
|
| 25 |
class GraderInput:
|
| 26 |
success: bool
|
|
@@ -47,8 +50,9 @@ def compute_reward(inp: GraderInput) -> GraderOutput:
|
|
| 47 |
if inp.success:
|
| 48 |
base = 1.0
|
| 49 |
attempt_penalty = -0.1 * (inp.attempt_number - 1)
|
|
|
|
| 50 |
return GraderOutput(
|
| 51 |
-
reward=
|
| 52 |
breakdown=RewardBreakdown(
|
| 53 |
base=base,
|
| 54 |
attempt_penalty=attempt_penalty,
|
|
@@ -69,17 +73,17 @@ def compute_reward(inp: GraderInput) -> GraderOutput:
|
|
| 69 |
curr_sev = error_severity(inp.current_error_class)
|
| 70 |
|
| 71 |
if curr_sev < prev_sev:
|
| 72 |
-
severity_bonus = 0.2
|
| 73 |
elif curr_sev > prev_sev:
|
| 74 |
-
severity_bonus = -0.1
|
| 75 |
|
| 76 |
if inp.current_error_class != inp.previous_error_class:
|
| 77 |
-
change_bonus = 0.1
|
| 78 |
|
| 79 |
-
|
| 80 |
|
| 81 |
return GraderOutput(
|
| 82 |
-
reward=
|
| 83 |
breakdown=RewardBreakdown(
|
| 84 |
base=base,
|
| 85 |
attempt_penalty=attempt_penalty,
|
|
@@ -90,10 +94,8 @@ def compute_reward(inp: GraderInput) -> GraderOutput:
|
|
| 90 |
|
| 91 |
|
| 92 |
def compute_episode_reward(step_rewards: list[float], success: bool) -> float:
|
| 93 |
-
"""
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
terminal = 0.5 if success else -0.5
|
| 99 |
-
return total + terminal
|
|
|
|
| 1 |
"""
|
| 2 |
Shaped reward function for the SQL debug RL environment.
|
| 3 |
|
| 4 |
+
All rewards are clamped to strictly (0, 1) before being returned.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
from __future__ import annotations
|
|
|
|
| 13 |
from rl.error_classifier import error_severity
|
| 14 |
|
| 15 |
|
| 16 |
+
_SCORE_MIN = 0.05
|
| 17 |
+
_SCORE_MAX = 0.95
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _clamp(x: float) -> float:
|
| 21 |
+
"""Clamp to strictly (0, 1)."""
|
| 22 |
+
if x is None or x != x:
|
| 23 |
+
return _SCORE_MIN
|
| 24 |
+
return max(_SCORE_MIN, min(_SCORE_MAX, float(x)))
|
| 25 |
+
|
| 26 |
+
|
| 27 |
@dataclass
|
| 28 |
class GraderInput:
|
| 29 |
success: bool
|
|
|
|
| 50 |
if inp.success:
|
| 51 |
base = 1.0
|
| 52 |
attempt_penalty = -0.1 * (inp.attempt_number - 1)
|
| 53 |
+
raw = base + attempt_penalty
|
| 54 |
return GraderOutput(
|
| 55 |
+
reward=_clamp(raw),
|
| 56 |
breakdown=RewardBreakdown(
|
| 57 |
base=base,
|
| 58 |
attempt_penalty=attempt_penalty,
|
|
|
|
| 73 |
curr_sev = error_severity(inp.current_error_class)
|
| 74 |
|
| 75 |
if curr_sev < prev_sev:
|
| 76 |
+
severity_bonus = 0.2
|
| 77 |
elif curr_sev > prev_sev:
|
| 78 |
+
severity_bonus = -0.1
|
| 79 |
|
| 80 |
if inp.current_error_class != inp.previous_error_class:
|
| 81 |
+
change_bonus = 0.1
|
| 82 |
|
| 83 |
+
raw = base + attempt_penalty + severity_bonus + change_bonus
|
| 84 |
|
| 85 |
return GraderOutput(
|
| 86 |
+
reward=_clamp(raw),
|
| 87 |
breakdown=RewardBreakdown(
|
| 88 |
base=base,
|
| 89 |
attempt_penalty=attempt_penalty,
|
|
|
|
| 94 |
|
| 95 |
|
| 96 |
def compute_episode_reward(step_rewards: list[float], success: bool) -> float:
|
| 97 |
+
"""Compute total episode reward, clamped to (0, 1)."""
|
| 98 |
+
if not step_rewards:
|
| 99 |
+
return _SCORE_MIN
|
| 100 |
+
avg = sum(step_rewards) / len(step_rewards)
|
| 101 |
+
return _clamp(avg)
|
|
|
|
|
|
openenv.yaml
CHANGED
|
@@ -82,7 +82,7 @@ observation_space:
|
|
| 82 |
|
| 83 |
# ── Reward ──────────────────────────────────────────────────────────────────
|
| 84 |
reward:
|
| 85 |
-
range: [0.
|
| 86 |
description: >
|
| 87 |
Task score is the grader output clamped strictly inside (0, 1). Graders
|
| 88 |
score partial progress (column correctness, row-count match) and apply
|
|
|
|
| 82 |
|
| 83 |
# ── Reward ──────────────────────────────────────────────────────────────────
|
| 84 |
reward:
|
| 85 |
+
range: [0.05, 0.95]
|
| 86 |
description: >
|
| 87 |
Task score is the grader output clamped strictly inside (0, 1). Graders
|
| 88 |
score partial progress (column correctness, row-count match) and apply
|