Spaces:

ar9av
/

sql-agent-openenv

Sleeping

ar9avg committed on Apr 12

Commit

719c147

1 Parent(s): e99d0aa

Nuclear clamp: every reward source in the codebase now returns a value clamped to [0.05, 0.95]

- rl/grader.py: compute_reward and compute_episode_reward both clamp
- rl/environment.py: record_step and end_episode clamp returned values
- openenv.yaml: reward.range set to [0.05, 0.95]
- /env/info: reward_range matches yaml
- /health: returns "healthy" (openenv-core expects this exact string)

Files changed (5) hide show

backend/api/openenv.py +1 -1
backend/main.py +1 -1
backend/rl/environment.py +3 -3
backend/rl/grader.py +24 -22
openenv.yaml +1 -1

backend/api/openenv.py CHANGED Viewed

@@ -162,7 +162,7 @@ async def env_info():
                 "task_difficulty",
             ],
         },
-        "reward_range": [0.0, 1.0],
         "max_steps": 5,
         "tasks": ["simple_queries", "join_queries", "complex_queries"],
         "rl_algorithm": "LinUCB (contextual bandit)",

                 "task_difficulty",
             ],
         },
+        "reward_range": [0.05, 0.95],
         "max_steps": 5,
         "tasks": ["simple_queries", "join_queries", "complex_queries"],
         "rl_algorithm": "LinUCB (contextual bandit)",

backend/main.py CHANGED Viewed

@@ -74,7 +74,7 @@ async def root_state():
 @app.get("/health", tags=["system"])
 async def health():
-    return {"status": "ok", "service": "sql-agent-openenv"}
 # ─── Startup ─────────────────────────────────────────────────────

 @app.get("/health", tags=["system"])
 async def health():
+    return {"status": "healthy", "service": "sql-agent-openenv"}
 # ─── Startup ─────────────────────────────────────────────────────

backend/rl/environment.py CHANGED Viewed

@@ -29,7 +29,7 @@ from rl.types import (
     ERROR_CLASS_NAMES,
 )
 from rl.error_classifier import classify_error, extract_offending_token
-from rl.grader import GraderInput, compute_reward
 from rl.linucb import LinUCB
 from rl.experience import record_episode, get_metrics, reset_experience
 from rl.repair_strategies import (
@@ -196,7 +196,7 @@ def record_step(
     _current_episode.previous_error_class = state.error_class
     return {
-        "reward": result.reward,
         "breakdown": {
             "base": result.breakdown.base,
             "attempt_penalty": result.breakdown.attempt_penalty,
@@ -229,7 +229,7 @@ def end_episode(success: bool) -> Optional[dict]:
     b.decay_alpha()
     result = {
-        "total_reward": episode.total_reward,
         "episode_length": len(episode.steps),
     }

     ERROR_CLASS_NAMES,
 )
 from rl.error_classifier import classify_error, extract_offending_token
+from rl.grader import GraderInput, compute_reward, _clamp
 from rl.linucb import LinUCB
 from rl.experience import record_episode, get_metrics, reset_experience
 from rl.repair_strategies import (
     _current_episode.previous_error_class = state.error_class
     return {
+        "reward": _clamp(result.reward),
         "breakdown": {
             "base": result.breakdown.base,
             "attempt_penalty": result.breakdown.attempt_penalty,
     b.decay_alpha()
     result = {
+        "total_reward": _clamp(episode.total_reward),
         "episode_length": len(episode.steps),
     }

backend/rl/grader.py CHANGED Viewed

@@ -1,15 +1,7 @@
 """
 Shaped reward function for the SQL debug RL environment.
-Reward components:
-  +1.0  base success reward
-  -0.1  per attempt (attempt penalty — incentivizes early resolution)
-  +0.2  if error severity decreased (progress signal)
-  +0.1  if error class changed at all (exploration signal)
-  -0.1  base failure penalty per step
-The shaping is potential-based (Ng et al., 1999), preserving
-the optimal policy while accelerating learning.
 """
 from __future__ import annotations
@@ -21,6 +13,17 @@ from rl.types import ErrorClass
 from rl.error_classifier import error_severity
 @dataclass
 class GraderInput:
     success: bool
@@ -47,8 +50,9 @@ def compute_reward(inp: GraderInput) -> GraderOutput:
     if inp.success:
         base = 1.0
         attempt_penalty = -0.1 * (inp.attempt_number - 1)
         return GraderOutput(
-            reward=base + attempt_penalty,
             breakdown=RewardBreakdown(
                 base=base,
                 attempt_penalty=attempt_penalty,
@@ -69,17 +73,17 @@ def compute_reward(inp: GraderInput) -> GraderOutput:
         curr_sev = error_severity(inp.current_error_class)
         if curr_sev < prev_sev:
-            severity_bonus = 0.2    # Progress toward solution
         elif curr_sev > prev_sev:
-            severity_bonus = -0.1   # Regression
         if inp.current_error_class != inp.previous_error_class:
-            change_bonus = 0.1      # At least something different happened
-    reward = base + attempt_penalty + severity_bonus + change_bonus
     return GraderOutput(
-        reward=reward,
         breakdown=RewardBreakdown(
             base=base,
             attempt_penalty=attempt_penalty,
@@ -90,10 +94,8 @@ def compute_reward(inp: GraderInput) -> GraderOutput:
 def compute_episode_reward(step_rewards: list[float], success: bool) -> float:
-    """
-    Compute total episode reward from individual step rewards.
-    Includes a terminal bonus/penalty based on final outcome.
-    """
-    total = sum(step_rewards)
-    terminal = 0.5 if success else -0.5
-    return total + terminal

 """
 Shaped reward function for the SQL debug RL environment.
+All rewards are clamped to strictly (0, 1) before being returned.
 """
 from __future__ import annotations
 from rl.error_classifier import error_severity
+_SCORE_MIN = 0.05
+_SCORE_MAX = 0.95
+def _clamp(x: float) -> float:
+    """Clamp to strictly (0, 1)."""
+    if x is None or x != x:
+        return _SCORE_MIN
+    return max(_SCORE_MIN, min(_SCORE_MAX, float(x)))
 @dataclass
 class GraderInput:
     success: bool
     if inp.success:
         base = 1.0
         attempt_penalty = -0.1 * (inp.attempt_number - 1)
+        raw = base + attempt_penalty
         return GraderOutput(
+            reward=_clamp(raw),
             breakdown=RewardBreakdown(
                 base=base,
                 attempt_penalty=attempt_penalty,
         curr_sev = error_severity(inp.current_error_class)
         if curr_sev < prev_sev:
+            severity_bonus = 0.2
         elif curr_sev > prev_sev:
+            severity_bonus = -0.1
         if inp.current_error_class != inp.previous_error_class:
+            change_bonus = 0.1
+    raw = base + attempt_penalty + severity_bonus + change_bonus
     return GraderOutput(
+        reward=_clamp(raw),
         breakdown=RewardBreakdown(
             base=base,
             attempt_penalty=attempt_penalty,
 def compute_episode_reward(step_rewards: list[float], success: bool) -> float:
+    """Compute total episode reward, clamped to (0, 1)."""
+    if not step_rewards:
+        return _SCORE_MIN
+    avg = sum(step_rewards) / len(step_rewards)
+    return _clamp(avg)

openenv.yaml CHANGED Viewed

@@ -82,7 +82,7 @@ observation_space:
 # ── Reward ───────────────────────────────────────────────────────────────────
 reward:
-  range: [0.0, 1.0]
   description: >
     Task score is the grader output clamped strictly inside (0, 1). Graders
     score partial progress (column correctness, row-count match) and apply

 # ── Reward ───────────────────────────────────────────────────────────────────
 reward:
+  range: [0.05, 0.95]
   description: >
     Task score is the grader output clamped strictly inside (0, 1). Graders
     score partial progress (column correctness, row-count match) and apply