Spaces:
Sleeping
Sleeping
fix: enforce reward bounds (0.01–0.99) and 2 decimal precision across grader, env, and inference
Browse files- client.py +1 -1
- inference.py +7 -7
- models.py +2 -2
- server/WhyDidItFail_environment.py +4 -4
- server/graders.py +2 -2
- server/llm_judge.py +1 -1
client.py
CHANGED
|
@@ -50,7 +50,7 @@ class WhyDidItFailEnv(EnvClient[WhyDidItFailAction, WhyDidItFailObservation, Why
|
|
| 50 |
visible_data=obs_data.get("visible_data", {}),
|
| 51 |
available_actions=obs_data.get("available_actions", []),
|
| 52 |
steps_taken=obs_data.get("steps_taken", 0),
|
| 53 |
-
reward=obs_data.get("reward", 0.0),
|
| 54 |
done=obs_data.get("done", False),
|
| 55 |
feedback=obs_data.get("feedback", ""),
|
| 56 |
)
|
|
|
|
| 50 |
visible_data=obs_data.get("visible_data", {}),
|
| 51 |
available_actions=obs_data.get("available_actions", []),
|
| 52 |
steps_taken=obs_data.get("steps_taken", 0),
|
| 53 |
+
reward=obs_data.get("reward", 0.01),
|
| 54 |
done=obs_data.get("done", False),
|
| 55 |
feedback=obs_data.get("feedback", ""),
|
| 56 |
)
|
inference.py
CHANGED
|
@@ -195,7 +195,7 @@ async def run_episode(
|
|
| 195 |
rewards: List[float] = []
|
| 196 |
inspection_order: List[str] = []
|
| 197 |
submit_action: WhyDidItFailAction | None = None
|
| 198 |
-
score = 0.0
|
| 199 |
success = False
|
| 200 |
|
| 201 |
try:
|
|
@@ -229,7 +229,7 @@ async def run_episode(
|
|
| 229 |
break
|
| 230 |
|
| 231 |
# WebSocket is closed — safe to call the judge now
|
| 232 |
-
keyword_score = rewards[-1] if rewards else 0.0
|
| 233 |
judge_score: float | None = None
|
| 234 |
if submit_action is not None:
|
| 235 |
judge_score = llm_judge(
|
|
@@ -242,10 +242,10 @@ async def run_episode(
|
|
| 242 |
inspection_order=inspection_order,
|
| 243 |
)
|
| 244 |
if judge_score is None:
|
| 245 |
-
score = round(keyword_score, 3)
|
| 246 |
-
# print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning=n/a total={score:.3f}", file=sys.stderr, flush=True)
|
| 247 |
else:
|
| 248 |
-
score = round(0.85 * keyword_score + 0.15 * judge_score, 3)
|
| 249 |
# print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning={judge_score:.3f} total={score:.3f}", file=sys.stderr, flush=True)
|
| 250 |
|
| 251 |
success = score >= SUCCESS_THRESHOLD
|
|
@@ -253,7 +253,7 @@ async def run_episode(
|
|
| 253 |
finally:
|
| 254 |
steps_taken = len(rewards)
|
| 255 |
final_score = round(max(0.01, min(0.99, sum(rewards))), 2) if rewards else 0.01
|
| 256 |
-
print(f"[END] success={str(success).lower()} steps={steps_taken} reward={final_score}", flush=True)
|
| 257 |
|
| 258 |
return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
|
| 259 |
|
|
@@ -293,7 +293,7 @@ async def main() -> None:
|
|
| 293 |
scores += await run_task("task_easy", EASY_SCENARIOS, env, client)
|
| 294 |
scores += await run_task("task_medium", MEDIUM_SCENARIOS, env, client)
|
| 295 |
scores += await run_task("task_hard", HARD_SCENARIOS, env, client)
|
| 296 |
-
overall = sum(scores) / len(scores) if scores else 0.0
|
| 297 |
# print(f" [OVERALL] avg_score={overall:.3f}", file=sys.stderr, flush=True)
|
| 298 |
# print(f"[END] score={overall:.3f}", flush=True)
|
| 299 |
finally:
|
|
|
|
| 195 |
rewards: List[float] = []
|
| 196 |
inspection_order: List[str] = []
|
| 197 |
submit_action: WhyDidItFailAction | None = None
|
| 198 |
+
score = 0.01
|
| 199 |
success = False
|
| 200 |
|
| 201 |
try:
|
|
|
|
| 229 |
break
|
| 230 |
|
| 231 |
# WebSocket is closed — safe to call the judge now
|
| 232 |
+
keyword_score = rewards[-1] if rewards else 0.01
|
| 233 |
judge_score: float | None = None
|
| 234 |
if submit_action is not None:
|
| 235 |
judge_score = llm_judge(
|
|
|
|
| 242 |
inspection_order=inspection_order,
|
| 243 |
)
|
| 244 |
if judge_score is None:
|
| 245 |
+
score = round(keyword_score, 2)
|
| 246 |
+
# print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.2f} reasoning=n/a total={score:.2f}", file=sys.stderr, flush=True)
|
| 247 |
else:
|
| 248 |
+
score = round(0.85 * keyword_score + 0.15 * judge_score, 2)
|
| 249 |
# print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning={judge_score:.3f} total={score:.3f}", file=sys.stderr, flush=True)
|
| 250 |
|
| 251 |
success = score >= SUCCESS_THRESHOLD
|
|
|
|
| 253 |
finally:
|
| 254 |
steps_taken = len(rewards)
|
| 255 |
final_score = round(max(0.01, min(0.99, sum(rewards))), 2) if rewards else 0.01
|
| 256 |
+
print(f"[END] success={str(success).lower()} steps={steps_taken} reward={final_score:.2f}", flush=True)
|
| 257 |
|
| 258 |
return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
|
| 259 |
|
|
|
|
| 293 |
scores += await run_task("task_easy", EASY_SCENARIOS, env, client)
|
| 294 |
scores += await run_task("task_medium", MEDIUM_SCENARIOS, env, client)
|
| 295 |
scores += await run_task("task_hard", HARD_SCENARIOS, env, client)
|
| 296 |
+
overall = round(sum(scores) / len(scores), 2) if scores else 0.01
|
| 297 |
# print(f" [OVERALL] avg_score={overall:.3f}", file=sys.stderr, flush=True)
|
| 298 |
# print(f"[END] score={overall:.3f}", flush=True)
|
| 299 |
finally:
|
models.py
CHANGED
|
@@ -42,8 +42,8 @@ class WhyDidItFailObservation(Observation):
|
|
| 42 |
"Which action_types are valid on this step.")
|
| 43 |
steps_taken: int = Field(..., description=
|
| 44 |
"Number of actions taken so far in this episode.")
|
| 45 |
-
reward: float = Field(default=0.0, description=
|
| 46 |
-
"Score for the current step. 1.0 = max.")
|
| 47 |
done: bool = Field(default=False, description=
|
| 48 |
"True when the episode has ended.")
|
| 49 |
feedback: str = Field(..., description=
|
|
|
|
| 42 |
"Which action_types are valid on this step.")
|
| 43 |
steps_taken: int = Field(..., description=
|
| 44 |
"Number of actions taken so far in this episode.")
|
| 45 |
+
reward: float = Field(default=0.01, description= # type: ignore[override]
|
| 46 |
+
"Score for the current step. 0.99 = max.")
|
| 47 |
done: bool = Field(default=False, description=
|
| 48 |
"True when the episode has ended.")
|
| 49 |
feedback: str = Field(..., description=
|
server/WhyDidItFail_environment.py
CHANGED
|
@@ -56,7 +56,7 @@ class WhyDidItFailEnvironment(Environment):
|
|
| 56 |
visible_data={"hint": "Start by inspecting the training logs."},
|
| 57 |
available_actions=["inspect_logs", "inspect_config", "inspect_gradients", "submit_diagnosis"],
|
| 58 |
steps_taken=0,
|
| 59 |
-
reward=0.0,
|
| 60 |
done=False,
|
| 61 |
feedback="Investigation started.",
|
| 62 |
)
|
|
@@ -67,18 +67,18 @@ class WhyDidItFailEnvironment(Environment):
|
|
| 67 |
|
| 68 |
self._state.step_count += 1
|
| 69 |
|
| 70 |
-
# Hard step limit — terminate immediately, grade() will return 0.0.
|
| 71 |
if self._state.step_count > self.max_steps and action.action_type != "submit_diagnosis":
|
| 72 |
return WhyDidItFailObservation(
|
| 73 |
task_description="Step limit reached. Episode terminated.",
|
| 74 |
visible_data={},
|
| 75 |
available_actions=[],
|
| 76 |
steps_taken=self._state.step_count,
|
| 77 |
-
reward=0.0,
|
| 78 |
done=True,
|
| 79 |
feedback=(
|
| 80 |
f"Step limit ({self.max_steps}) reached without a diagnosis. "
|
| 81 |
-
f"Score: 0.0. Actual failure: '{self.scenario['correct_diagnosis']}'."
|
| 82 |
),
|
| 83 |
)
|
| 84 |
required: list[str] = self.scenario.get("required_sources", ["logs"])
|
|
|
|
| 56 |
visible_data={"hint": "Start by inspecting the training logs."},
|
| 57 |
available_actions=["inspect_logs", "inspect_config", "inspect_gradients", "submit_diagnosis"],
|
| 58 |
steps_taken=0,
|
| 59 |
+
reward=0.01,
|
| 60 |
done=False,
|
| 61 |
feedback="Investigation started.",
|
| 62 |
)
|
|
|
|
| 67 |
|
| 68 |
self._state.step_count += 1
|
| 69 |
|
| 70 |
+
# Hard step limit — terminate immediately, grade() will return 0.01.
|
| 71 |
if self._state.step_count > self.max_steps and action.action_type != "submit_diagnosis":
|
| 72 |
return WhyDidItFailObservation(
|
| 73 |
task_description="Step limit reached. Episode terminated.",
|
| 74 |
visible_data={},
|
| 75 |
available_actions=[],
|
| 76 |
steps_taken=self._state.step_count,
|
| 77 |
+
reward=0.01,
|
| 78 |
done=True,
|
| 79 |
feedback=(
|
| 80 |
f"Step limit ({self.max_steps}) reached without a diagnosis. "
|
| 81 |
+
f"Score: 0.01. Actual failure: '{self.scenario['correct_diagnosis']}'."
|
| 82 |
),
|
| 83 |
)
|
| 84 |
required: list[str] = self.scenario.get("required_sources", ["logs"])
|
server/graders.py
CHANGED
|
@@ -6,7 +6,7 @@ grade() is the single entry point. It scores the full episode trajectory:
|
|
| 6 |
diagnosis_score (0.00 – 0.70) was the diagnosis correct?
|
| 7 |
evidence_score (0.00 – 0.15) did the agent inspect the right sources?
|
| 8 |
efficiency_score (0.00 – 0.15) did the agent act without waste?
|
| 9 |
-
fix_bonus (0.00 – 0.15) did the agent suggest a valid fix? (bonus, capped at 1.00)
|
| 10 |
|
| 11 |
Step-level partial rewards are returned by the environment's step() on every action,
|
| 12 |
giving the agent a signal over the full trajectory before the episode ends.
|
|
@@ -197,7 +197,7 @@ def grade(
|
|
| 197 |
Single unified grade function. Scores every scenario identically.
|
| 198 |
|
| 199 |
Total score = diagnosis_score + evidence_score + efficiency_score + fix_bonus
|
| 200 |
-
clamped to [0.00, 1.00].
|
| 201 |
|
| 202 |
Max achievable without fix: 0.70 + 0.15 + 0.15 = 1.00
|
| 203 |
Max achievable with fix: 0.70 + 0.15 + 0.15 + 0.15 = 1.00 (capped)
|
|
|
|
| 6 |
diagnosis_score (0.00 – 0.70) was the diagnosis correct?
|
| 7 |
evidence_score (0.00 – 0.15) did the agent inspect the right sources?
|
| 8 |
efficiency_score (0.00 – 0.15) did the agent act without waste?
|
| 9 |
+
fix_bonus (0.00 – 0.15) did the agent suggest a valid fix? (bonus, capped at 0.99)
|
| 10 |
|
| 11 |
Step-level partial rewards are returned by the environment's step() on every action,
|
| 12 |
giving the agent a signal over the full trajectory before the episode ends.
|
|
|
|
| 197 |
Single unified grade function. Scores every scenario identically.
|
| 198 |
|
| 199 |
Total score = diagnosis_score + evidence_score + efficiency_score + fix_bonus
|
| 200 |
+
clamped to [0.01, 0.99].
|
| 201 |
|
| 202 |
Max achievable without fix: 0.70 + 0.15 + 0.15 = 1.00
|
| 203 |
Max achievable with fix: 0.70 + 0.15 + 0.15 + 0.15 = 1.00 (capped)
|
server/llm_judge.py
CHANGED
|
@@ -91,7 +91,7 @@ def judge(
|
|
| 91 |
+ data.get("fix_rationale", 0)
|
| 92 |
)
|
| 93 |
# normalize: raw 0–15 → 0.0–1.0
|
| 94 |
-
return round(max(0, min(15, raw)) / 15, 3)
|
| 95 |
|
| 96 |
except Exception as exc:
|
| 97 |
print(f" [JUDGE] failed: {exc}", flush=True)
|
|
|
|
| 91 |
+ data.get("fix_rationale", 0)
|
| 92 |
)
|
| 93 |
# normalize: raw 0–15 → 0.0–1.0
|
| 94 |
+
return round(max(0, min(15, raw)) / 15, 2)
|
| 95 |
|
| 96 |
except Exception as exc:
|
| 97 |
print(f" [JUDGE] failed: {exc}", flush=True)
|