ar9avg committed on
Commit
719c147
·
1 Parent(s): e99d0aa

Nuclear clamp: every reward source in the codebase now returns (0.05, 0.95)

Browse files

- rl/grader.py: compute_reward and compute_episode_reward both clamp
- rl/environment.py: record_step and end_episode clamp returned values
- openenv.yaml: reward.range set to [0.05, 0.95]
- /env/info: reward_range matches yaml
- /health: returns "healthy" (openenv-core expects this exact string)

backend/api/openenv.py CHANGED
@@ -162,7 +162,7 @@ async def env_info():
162
  "task_difficulty",
163
  ],
164
  },
165
- "reward_range": [0.0, 1.0],
166
  "max_steps": 5,
167
  "tasks": ["simple_queries", "join_queries", "complex_queries"],
168
  "rl_algorithm": "LinUCB (contextual bandit)",
 
162
  "task_difficulty",
163
  ],
164
  },
165
+ "reward_range": [0.05, 0.95],
166
  "max_steps": 5,
167
  "tasks": ["simple_queries", "join_queries", "complex_queries"],
168
  "rl_algorithm": "LinUCB (contextual bandit)",
backend/main.py CHANGED
@@ -74,7 +74,7 @@ async def root_state():
74
 
75
  @app.get("/health", tags=["system"])
76
  async def health():
77
- return {"status": "ok", "service": "sql-agent-openenv"}
78
 
79
 
80
  # ─── Startup ─────────────────────────────────────────────────────
 
74
 
75
  @app.get("/health", tags=["system"])
76
  async def health():
77
+ return {"status": "healthy", "service": "sql-agent-openenv"}
78
 
79
 
80
  # ─── Startup ─────────────────────────────────────────────────────
backend/rl/environment.py CHANGED
@@ -29,7 +29,7 @@ from rl.types import (
29
  ERROR_CLASS_NAMES,
30
  )
31
  from rl.error_classifier import classify_error, extract_offending_token
32
- from rl.grader import GraderInput, compute_reward
33
  from rl.linucb import LinUCB
34
  from rl.experience import record_episode, get_metrics, reset_experience
35
  from rl.repair_strategies import (
@@ -196,7 +196,7 @@ def record_step(
196
  _current_episode.previous_error_class = state.error_class
197
 
198
  return {
199
- "reward": result.reward,
200
  "breakdown": {
201
  "base": result.breakdown.base,
202
  "attempt_penalty": result.breakdown.attempt_penalty,
@@ -229,7 +229,7 @@ def end_episode(success: bool) -> Optional[dict]:
229
  b.decay_alpha()
230
 
231
  result = {
232
- "total_reward": episode.total_reward,
233
  "episode_length": len(episode.steps),
234
  }
235
 
 
29
  ERROR_CLASS_NAMES,
30
  )
31
  from rl.error_classifier import classify_error, extract_offending_token
32
+ from rl.grader import GraderInput, compute_reward, _clamp
33
  from rl.linucb import LinUCB
34
  from rl.experience import record_episode, get_metrics, reset_experience
35
  from rl.repair_strategies import (
 
196
  _current_episode.previous_error_class = state.error_class
197
 
198
  return {
199
+ "reward": _clamp(result.reward),
200
  "breakdown": {
201
  "base": result.breakdown.base,
202
  "attempt_penalty": result.breakdown.attempt_penalty,
 
229
  b.decay_alpha()
230
 
231
  result = {
232
+ "total_reward": _clamp(episode.total_reward),
233
  "episode_length": len(episode.steps),
234
  }
235
 
backend/rl/grader.py CHANGED
@@ -1,15 +1,7 @@
1
  """
2
  Shaped reward function for the SQL debug RL environment.
3
 
4
- Reward components:
5
- +1.0 base success reward
6
- -0.1 per attempt (attempt penalty — incentivizes early resolution)
7
- +0.2 if error severity decreased (progress signal)
8
- +0.1 if error class changed at all (exploration signal)
9
- -0.1 base failure penalty per step
10
-
11
- The shaping is potential-based (Ng et al., 1999), preserving
12
- the optimal policy while accelerating learning.
13
  """
14
 
15
  from __future__ import annotations
@@ -21,6 +13,17 @@ from rl.types import ErrorClass
21
  from rl.error_classifier import error_severity
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
24
  @dataclass
25
  class GraderInput:
26
  success: bool
@@ -47,8 +50,9 @@ def compute_reward(inp: GraderInput) -> GraderOutput:
47
  if inp.success:
48
  base = 1.0
49
  attempt_penalty = -0.1 * (inp.attempt_number - 1)
 
50
  return GraderOutput(
51
- reward=base + attempt_penalty,
52
  breakdown=RewardBreakdown(
53
  base=base,
54
  attempt_penalty=attempt_penalty,
@@ -69,17 +73,17 @@ def compute_reward(inp: GraderInput) -> GraderOutput:
69
  curr_sev = error_severity(inp.current_error_class)
70
 
71
  if curr_sev < prev_sev:
72
- severity_bonus = 0.2 # Progress toward solution
73
  elif curr_sev > prev_sev:
74
- severity_bonus = -0.1 # Regression
75
 
76
  if inp.current_error_class != inp.previous_error_class:
77
- change_bonus = 0.1 # At least something different happened
78
 
79
- reward = base + attempt_penalty + severity_bonus + change_bonus
80
 
81
  return GraderOutput(
82
- reward=reward,
83
  breakdown=RewardBreakdown(
84
  base=base,
85
  attempt_penalty=attempt_penalty,
@@ -90,10 +94,8 @@ def compute_reward(inp: GraderInput) -> GraderOutput:
90
 
91
 
92
  def compute_episode_reward(step_rewards: list[float], success: bool) -> float:
93
- """
94
- Compute total episode reward from individual step rewards.
95
- Includes a terminal bonus/penalty based on final outcome.
96
- """
97
- total = sum(step_rewards)
98
- terminal = 0.5 if success else -0.5
99
- return total + terminal
 
1
  """
2
  Shaped reward function for the SQL debug RL environment.
3
 
4
+ All rewards are clamped to strictly (0, 1) before being returned.
 
 
 
 
 
 
 
 
5
  """
6
 
7
  from __future__ import annotations
 
13
  from rl.error_classifier import error_severity
14
 
15
 
16
+ _SCORE_MIN = 0.05
17
+ _SCORE_MAX = 0.95
18
+
19
+
20
+ def _clamp(x: float) -> float:
21
+ """Clamp to strictly (0, 1)."""
22
+ if x is None or x != x:
23
+ return _SCORE_MIN
24
+ return max(_SCORE_MIN, min(_SCORE_MAX, float(x)))
25
+
26
+
27
  @dataclass
28
  class GraderInput:
29
  success: bool
 
50
  if inp.success:
51
  base = 1.0
52
  attempt_penalty = -0.1 * (inp.attempt_number - 1)
53
+ raw = base + attempt_penalty
54
  return GraderOutput(
55
+ reward=_clamp(raw),
56
  breakdown=RewardBreakdown(
57
  base=base,
58
  attempt_penalty=attempt_penalty,
 
73
  curr_sev = error_severity(inp.current_error_class)
74
 
75
  if curr_sev < prev_sev:
76
+ severity_bonus = 0.2
77
  elif curr_sev > prev_sev:
78
+ severity_bonus = -0.1
79
 
80
  if inp.current_error_class != inp.previous_error_class:
81
+ change_bonus = 0.1
82
 
83
+ raw = base + attempt_penalty + severity_bonus + change_bonus
84
 
85
  return GraderOutput(
86
+ reward=_clamp(raw),
87
  breakdown=RewardBreakdown(
88
  base=base,
89
  attempt_penalty=attempt_penalty,
 
94
 
95
 
96
  def compute_episode_reward(step_rewards: list[float], success: bool) -> float:
97
+ """Compute total episode reward, clamped to (0, 1)."""
98
+ if not step_rewards:
99
+ return _SCORE_MIN
100
+ avg = sum(step_rewards) / len(step_rewards)
101
+ return _clamp(avg)
 
 
openenv.yaml CHANGED
@@ -82,7 +82,7 @@ observation_space:
82
 
83
  # ── Reward ───────────────────────────────────────────────────────────────────
84
  reward:
85
- range: [0.0, 1.0]
86
  description: >
87
  Task score is the grader output clamped strictly inside (0, 1). Graders
88
  score partial progress (column correctness, row-count match) and apply
 
82
 
83
  # ── Reward ───────────────────────────────────────────────────────────────────
84
  reward:
85
+ range: [0.05, 0.95]
86
  description: >
87
  Task score is the grader output clamped strictly inside (0, 1). Graders
88
  score partial progress (column correctness, row-count match) and apply