ar9avg committed on
Commit
b86d426
·
1 Parent(s): d2d92b8

Fix reward.value to always be task_score in (0,1) exclusive

Browse files

The validator checks reward.value from /step responses. Previously,
this was the RL reward (compute_reward), which could be exactly 1.0
on first-attempt success or negative on failure.

Now reward.value = task_score (clamped to (eps, 1-eps) in grade_response).
The RL reward is preserved in info.rl_reward for bandit training.
Also corrects reward_range in /env/info from [-1.5, 1.5] to [0.0, 1.0].

backend/api/openenv.py CHANGED
@@ -162,7 +162,7 @@ async def env_info():
162
  "task_difficulty",
163
  ],
164
  },
165
- "reward_range": [-1.5, 1.5],
166
  "max_steps": 5,
167
  "tasks": ["simple_queries", "join_queries", "complex_queries"],
168
  "rl_algorithm": "LinUCB (contextual bandit)",
 
162
  "task_difficulty",
163
  ],
164
  },
165
+ "reward_range": [0.0, 1.0],
166
  "max_steps": 5,
167
  "tasks": ["simple_queries", "join_queries", "complex_queries"],
168
  "rl_algorithm": "LinUCB (contextual bandit)",
backend/env/sql_env.py CHANGED
@@ -332,11 +332,12 @@ class SQLAgentEnv:
332
 
333
  obs = self._build_observation()
334
  reward_info = RewardInfo(
335
- value=grader_out.reward,
336
  success=success,
337
  done=done,
338
  info={
339
  "task_score": task_score,
 
340
  "attempt": ep.attempt_number,
341
  "breakdown": {
342
  "base": grader_out.breakdown.base,
 
332
 
333
  obs = self._build_observation()
334
  reward_info = RewardInfo(
335
+ value=task_score, # always in (eps, 1-eps) per OpenEnv spec
336
  success=success,
337
  done=done,
338
  info={
339
  "task_score": task_score,
340
+ "rl_reward": grader_out.reward,
341
  "attempt": ep.attempt_number,
342
  "breakdown": {
343
  "base": grader_out.breakdown.base,
inference.py CHANGED
@@ -200,12 +200,11 @@ async def run_episode(
200
  if done:
201
  break
202
 
203
- # Score: clamp sum of rewards to (0, 1) exclusive per OpenEnv spec
204
- _EPS = 1e-9
205
  total = sum(rewards)
206
- max_possible = MAX_STEPS * 1.0 # max reward per step is 1.0
207
- raw_score = total / max_possible if max_possible > 0 else _EPS
208
- score = max(_EPS, min(1.0 - _EPS, raw_score))
209
 
210
  finally:
211
  log_end(
 
200
  if done:
201
  break
202
 
203
+ # Score: average of per-step rewards (each already in (0,1) from env)
204
+ # Clamp to [0, 1] as a safety net
205
  total = sum(rewards)
206
+ max_possible = MAX_STEPS * 1.0
207
+ score = min(max(total / max_possible if max_possible > 0 else 0.0, 0.0), 1.0)
 
208
 
209
  finally:
210
  log_end(