samrat-rm commited on
Commit
3781ce7
·
1 Parent(s): c130122

fix: enforce reward bounds (0.01–0.99) and 2 decimal precision across grader, env, and inference

Browse files
client.py CHANGED
@@ -50,7 +50,7 @@ class WhyDidItFailEnv(EnvClient[WhyDidItFailAction, WhyDidItFailObservation, Why
50
  visible_data=obs_data.get("visible_data", {}),
51
  available_actions=obs_data.get("available_actions", []),
52
  steps_taken=obs_data.get("steps_taken", 0),
53
- reward=obs_data.get("reward", 0.0),
54
  done=obs_data.get("done", False),
55
  feedback=obs_data.get("feedback", ""),
56
  )
 
50
  visible_data=obs_data.get("visible_data", {}),
51
  available_actions=obs_data.get("available_actions", []),
52
  steps_taken=obs_data.get("steps_taken", 0),
53
+ reward=obs_data.get("reward", 0.01),
54
  done=obs_data.get("done", False),
55
  feedback=obs_data.get("feedback", ""),
56
  )
inference.py CHANGED
@@ -195,7 +195,7 @@ async def run_episode(
195
  rewards: List[float] = []
196
  inspection_order: List[str] = []
197
  submit_action: WhyDidItFailAction | None = None
198
- score = 0.0
199
  success = False
200
 
201
  try:
@@ -229,7 +229,7 @@ async def run_episode(
229
  break
230
 
231
  # WebSocket is closed — safe to call the judge now
232
- keyword_score = rewards[-1] if rewards else 0.0
233
  judge_score: float | None = None
234
  if submit_action is not None:
235
  judge_score = llm_judge(
@@ -242,10 +242,10 @@ async def run_episode(
242
  inspection_order=inspection_order,
243
  )
244
  if judge_score is None:
245
- score = round(keyword_score, 4)
246
- # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning=n/a total={score:.3f}", file=sys.stderr, flush=True)
247
  else:
248
- score = round(0.85 * keyword_score + 0.15 * judge_score, 4)
249
  # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning={judge_score:.3f} total={score:.3f}", file=sys.stderr, flush=True)
250
 
251
  success = score >= SUCCESS_THRESHOLD
@@ -253,7 +253,7 @@ async def run_episode(
253
  finally:
254
  steps_taken = len(rewards)
255
  final_score = round(max(0.01, min(0.99, sum(rewards))), 2) if rewards else 0.01
256
- print(f"[END] success={str(success).lower()} steps={steps_taken} reward={final_score}", flush=True)
257
 
258
  return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
259
 
@@ -293,7 +293,7 @@ async def main() -> None:
293
  scores += await run_task("task_easy", EASY_SCENARIOS, env, client)
294
  scores += await run_task("task_medium", MEDIUM_SCENARIOS, env, client)
295
  scores += await run_task("task_hard", HARD_SCENARIOS, env, client)
296
- overall = sum(scores) / len(scores) if scores else 0.0
297
  # print(f" [OVERALL] avg_score={overall:.3f}", file=sys.stderr, flush=True)
298
  # print(f"[END] score={overall:.3f}", flush=True)
299
  finally:
 
195
  rewards: List[float] = []
196
  inspection_order: List[str] = []
197
  submit_action: WhyDidItFailAction | None = None
198
+ score = 0.01
199
  success = False
200
 
201
  try:
 
229
  break
230
 
231
  # WebSocket is closed — safe to call the judge now
232
+ keyword_score = rewards[-1] if rewards else 0.01
233
  judge_score: float | None = None
234
  if submit_action is not None:
235
  judge_score = llm_judge(
 
242
  inspection_order=inspection_order,
243
  )
244
  if judge_score is None:
245
+ score = round(keyword_score, 2)
246
+ # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.2f} reasoning=n/a total={score:.2f}", file=sys.stderr, flush=True)
247
  else:
248
+ score = round(0.85 * keyword_score + 0.15 * judge_score, 2)
249
  # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.2f} reasoning={judge_score:.2f} total={score:.2f}", file=sys.stderr, flush=True)
250
 
251
  success = score >= SUCCESS_THRESHOLD
 
253
  finally:
254
  steps_taken = len(rewards)
255
  final_score = round(max(0.01, min(0.99, sum(rewards))), 2) if rewards else 0.01
256
+ print(f"[END] success={str(success).lower()} steps={steps_taken} reward={final_score:.2f}", flush=True)
257
 
258
  return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
259
 
 
293
  scores += await run_task("task_easy", EASY_SCENARIOS, env, client)
294
  scores += await run_task("task_medium", MEDIUM_SCENARIOS, env, client)
295
  scores += await run_task("task_hard", HARD_SCENARIOS, env, client)
296
+ overall = round(sum(scores) / len(scores), 2) if scores else 0.01
297
  # print(f" [OVERALL] avg_score={overall:.2f}", file=sys.stderr, flush=True)
298
  # print(f"[END] score={overall:.2f}", flush=True)
299
  finally:
models.py CHANGED
@@ -42,8 +42,8 @@ class WhyDidItFailObservation(Observation):
42
  "Which action_types are valid on this step.")
43
  steps_taken: int = Field(..., description=
44
  "Number of actions taken so far in this episode.")
45
- reward: float = Field(default=0.0, description= # type: ignore[override]
46
- "Score for the current step. 1.0 = solved.")
47
  done: bool = Field(default=False, description=
48
  "True when the episode has ended.")
49
  feedback: str = Field(..., description=
 
42
  "Which action_types are valid on this step.")
43
  steps_taken: int = Field(..., description=
44
  "Number of actions taken so far in this episode.")
45
+ reward: float = Field(default=0.01, description= # type: ignore[override]
46
+ "Score for the current step. 0.99 = max.")
47
  done: bool = Field(default=False, description=
48
  "True when the episode has ended.")
49
  feedback: str = Field(..., description=
server/WhyDidItFail_environment.py CHANGED
@@ -56,7 +56,7 @@ class WhyDidItFailEnvironment(Environment):
56
  visible_data={"hint": "Start by inspecting the training logs."},
57
  available_actions=["inspect_logs", "inspect_config", "inspect_gradients", "submit_diagnosis"],
58
  steps_taken=0,
59
- reward=0.0,
60
  done=False,
61
  feedback="Investigation started.",
62
  )
@@ -67,18 +67,18 @@ class WhyDidItFailEnvironment(Environment):
67
 
68
  self._state.step_count += 1
69
 
70
- # Hard step limit — terminate immediately, grade() will return 0.0.
71
  if self._state.step_count > self.max_steps and action.action_type != "submit_diagnosis":
72
  return WhyDidItFailObservation(
73
  task_description="Step limit reached. Episode terminated.",
74
  visible_data={},
75
  available_actions=[],
76
  steps_taken=self._state.step_count,
77
- reward=0.0,
78
  done=True,
79
  feedback=(
80
  f"Step limit ({self.max_steps}) reached without a diagnosis. "
81
- f"Score: 0.00. Actual failure: '{self.scenario['correct_diagnosis']}'."
82
  ),
83
  )
84
  required: list[str] = self.scenario.get("required_sources", ["logs"])
 
56
  visible_data={"hint": "Start by inspecting the training logs."},
57
  available_actions=["inspect_logs", "inspect_config", "inspect_gradients", "submit_diagnosis"],
58
  steps_taken=0,
59
+ reward=0.01,
60
  done=False,
61
  feedback="Investigation started.",
62
  )
 
67
 
68
  self._state.step_count += 1
69
 
70
+ # Hard step limit — terminate immediately, grade() will return 0.01.
71
  if self._state.step_count > self.max_steps and action.action_type != "submit_diagnosis":
72
  return WhyDidItFailObservation(
73
  task_description="Step limit reached. Episode terminated.",
74
  visible_data={},
75
  available_actions=[],
76
  steps_taken=self._state.step_count,
77
+ reward=0.01,
78
  done=True,
79
  feedback=(
80
  f"Step limit ({self.max_steps}) reached without a diagnosis. "
81
+ f"Score: 0.01. Actual failure: '{self.scenario['correct_diagnosis']}'."
82
  ),
83
  )
84
  required: list[str] = self.scenario.get("required_sources", ["logs"])
server/graders.py CHANGED
@@ -6,7 +6,7 @@ grade() is the single entry point. It scores the full episode trajectory:
6
  diagnosis_score (0.00 – 0.70) was the diagnosis correct?
7
  evidence_score (0.00 – 0.15) did the agent inspect the right sources?
8
  efficiency_score (0.00 – 0.15) did the agent act without waste?
9
- fix_bonus (0.00 – 0.15) did the agent suggest a valid fix? (bonus, capped at 1.0)
10
 
11
  Step-level partial rewards are returned by the environment's step() on every action,
12
  giving the agent a signal over the full trajectory before the episode ends.
@@ -197,7 +197,7 @@ def grade(
197
  Single unified grade function. Scores every scenario identically.
198
 
199
  Total score = diagnosis_score + evidence_score + efficiency_score + fix_bonus
200
- clamped to [0.0, 1.0].
201
 
202
  Max achievable without fix: 0.70 + 0.15 + 0.15 = 1.00
203
  Max achievable with fix: 0.70 + 0.15 + 0.15 + 0.15 = 1.00 (capped)
 
6
  diagnosis_score (0.00 – 0.70) was the diagnosis correct?
7
  evidence_score (0.00 – 0.15) did the agent inspect the right sources?
8
  efficiency_score (0.00 – 0.15) did the agent act without waste?
9
+ fix_bonus (0.00 – 0.15) did the agent suggest a valid fix? (bonus, capped at 0.99)
10
 
11
  Step-level partial rewards are returned by the environment's step() on every action,
12
  giving the agent a signal over the full trajectory before the episode ends.
 
197
  Single unified grade function. Scores every scenario identically.
198
 
199
  Total score = diagnosis_score + evidence_score + efficiency_score + fix_bonus
200
+ clamped to [0.01, 0.99].
201
 
202
  Max achievable without fix: 0.70 + 0.15 + 0.15 = 1.00 (clamped to 0.99)
203
  Max achievable with fix: 0.70 + 0.15 + 0.15 + 0.15 = 1.15 (clamped to 0.99)
server/llm_judge.py CHANGED
@@ -91,7 +91,7 @@ def judge(
91
  + data.get("fix_rationale", 0)
92
  )
93
  # normalize: raw 0–15 → 0.0–1.0
94
- return round(max(0, min(15, raw)) / 15, 4)
95
 
96
  except Exception as exc:
97
  print(f" [JUDGE] failed: {exc}", flush=True)
 
91
  + data.get("fix_rationale", 0)
92
  )
93
  # normalize: raw 0–15 → 0.0–1.0
94
+ return round(max(0, min(15, raw)) / 15, 2)
95
 
96
  except Exception as exc:
97
  print(f" [JUDGE] failed: {exc}", flush=True)