Spaces:
Sleeping
Sleeping
Fix reward.value to always be task_score in (0,1) exclusive
Browse files
The validator checks reward.value from /step responses. Previously,
this was the RL reward (compute_reward), which could be exactly 1.0
on first-attempt success or negative on failure.
Now reward.value = task_score (clamped to (eps, 1-eps) in grade_response).
The RL reward is preserved in info.rl_reward for bandit training.
Also corrects reward_range in /env/info to [0.0, 1.0].
- backend/api/openenv.py +1 -1
- backend/env/sql_env.py +2 -1
- inference.py +4 -5
backend/api/openenv.py
CHANGED
|
@@ -162,7 +162,7 @@ async def env_info():
|
|
| 162 |
"task_difficulty",
|
| 163 |
],
|
| 164 |
},
|
| 165 |
-
"reward_range": [
|
| 166 |
"max_steps": 5,
|
| 167 |
"tasks": ["simple_queries", "join_queries", "complex_queries"],
|
| 168 |
"rl_algorithm": "LinUCB (contextual bandit)",
|
|
|
|
| 162 |
"task_difficulty",
|
| 163 |
],
|
| 164 |
},
|
| 165 |
+
"reward_range": [0.0, 1.0],
|
| 166 |
"max_steps": 5,
|
| 167 |
"tasks": ["simple_queries", "join_queries", "complex_queries"],
|
| 168 |
"rl_algorithm": "LinUCB (contextual bandit)",
|
backend/env/sql_env.py
CHANGED
|
@@ -332,11 +332,12 @@ class SQLAgentEnv:
|
|
| 332 |
|
| 333 |
obs = self._build_observation()
|
| 334 |
reward_info = RewardInfo(
|
| 335 |
-
value=
|
| 336 |
success=success,
|
| 337 |
done=done,
|
| 338 |
info={
|
| 339 |
"task_score": task_score,
|
|
|
|
| 340 |
"attempt": ep.attempt_number,
|
| 341 |
"breakdown": {
|
| 342 |
"base": grader_out.breakdown.base,
|
|
|
|
| 332 |
|
| 333 |
obs = self._build_observation()
|
| 334 |
reward_info = RewardInfo(
|
| 335 |
+
value=task_score, # always in (eps, 1-eps) per OpenEnv spec
|
| 336 |
success=success,
|
| 337 |
done=done,
|
| 338 |
info={
|
| 339 |
"task_score": task_score,
|
| 340 |
+
"rl_reward": grader_out.reward,
|
| 341 |
"attempt": ep.attempt_number,
|
| 342 |
"breakdown": {
|
| 343 |
"base": grader_out.breakdown.base,
|
inference.py
CHANGED
|
@@ -200,12 +200,11 @@ async def run_episode(
|
|
| 200 |
if done:
|
| 201 |
break
|
| 202 |
|
| 203 |
-
# Score:
|
| 204 |
-
|
| 205 |
total = sum(rewards)
|
| 206 |
-
max_possible = MAX_STEPS * 1.0
|
| 207 |
-
|
| 208 |
-
score = max(_EPS, min(1.0 - _EPS, raw_score))
|
| 209 |
|
| 210 |
finally:
|
| 211 |
log_end(
|
|
|
|
| 200 |
if done:
|
| 201 |
break
|
| 202 |
|
| 203 |
+
# Score: sum of per-step rewards divided by MAX_STEPS (each reward already in (0,1) from env)
|
| 204 |
+
# Clamp to [0, 1] as a safety net
|
| 205 |
total = sum(rewards)
|
| 206 |
+
max_possible = MAX_STEPS * 1.0
|
| 207 |
+
score = min(max(total / max_possible if max_possible > 0 else 0.0, 0.0), 1.0)
|
|
|
|
| 208 |
|
| 209 |
finally:
|
| 210 |
log_end(
|