samrat-rm commited on
Commit
3781ce7
·
1 Parent(s): c130122

fix: enforce reward bounds (0.01–0.99) and 2 decimal precision across grader, env, and inference

Browse files
client.py CHANGED
@@ -50,7 +50,7 @@ class WhyDidItFailEnv(EnvClient[WhyDidItFailAction, WhyDidItFailObservation, Why
50
  visible_data=obs_data.get("visible_data", {}),
51
  available_actions=obs_data.get("available_actions", []),
52
  steps_taken=obs_data.get("steps_taken", 0),
53
- reward=obs_data.get("reward", 0.0),
54
  done=obs_data.get("done", False),
55
  feedback=obs_data.get("feedback", ""),
56
  )
 
50
  visible_data=obs_data.get("visible_data", {}),
51
  available_actions=obs_data.get("available_actions", []),
52
  steps_taken=obs_data.get("steps_taken", 0),
53
+ reward=obs_data.get("reward", 0.01),
54
  done=obs_data.get("done", False),
55
  feedback=obs_data.get("feedback", ""),
56
  )
inference.py CHANGED
@@ -195,7 +195,7 @@ async def run_episode(
195
  rewards: List[float] = []
196
  inspection_order: List[str] = []
197
  submit_action: WhyDidItFailAction | None = None
198
- score = 0.0
199
  success = False
200
 
201
  try:
@@ -229,7 +229,7 @@ async def run_episode(
229
  break
230
 
231
  # WebSocket is closed — safe to call the judge now
232
- keyword_score = rewards[-1] if rewards else 0.0
233
  judge_score: float | None = None
234
  if submit_action is not None:
235
  judge_score = llm_judge(
@@ -242,10 +242,10 @@ async def run_episode(
242
  inspection_order=inspection_order,
243
  )
244
  if judge_score is None:
245
- score = round(keyword_score, 4)
246
- # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning=n/a total={score:.3f}", file=sys.stderr, flush=True)
247
  else:
248
- score = round(0.85 * keyword_score + 0.15 * judge_score, 4)
249
  # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning={judge_score:.3f} total={score:.3f}", file=sys.stderr, flush=True)
250
 
251
  success = score >= SUCCESS_THRESHOLD
@@ -253,7 +253,7 @@ async def run_episode(
253
  finally:
254
  steps_taken = len(rewards)
255
  final_score = round(max(0.01, min(0.99, sum(rewards))), 2) if rewards else 0.01
256
- print(f"[END] success={str(success).lower()} steps={steps_taken} reward={final_score}", flush=True)
257
 
258
  return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
259
 
@@ -293,7 +293,7 @@ async def main() -> None:
293
  scores += await run_task("task_easy", EASY_SCENARIOS, env, client)
294
  scores += await run_task("task_medium", MEDIUM_SCENARIOS, env, client)
295
  scores += await run_task("task_hard", HARD_SCENARIOS, env, client)
296
- overall = sum(scores) / len(scores) if scores else 0.0
297
  # print(f" [OVERALL] avg_score={overall:.3f}", file=sys.stderr, flush=True)
298
  # print(f"[END] score={overall:.3f}", flush=True)
299
  finally:
 
195
  rewards: List[float] = []
196
  inspection_order: List[str] = []
197
  submit_action: WhyDidItFailAction | None = None
198
+ score = 0.01
199
  success = False
200
 
201
  try:
 
229
  break
230
 
231
  # WebSocket is closed — safe to call the judge now
232
+ keyword_score = rewards[-1] if rewards else 0.01
233
  judge_score: float | None = None
234
  if submit_action is not None:
235
  judge_score = llm_judge(
 
242
  inspection_order=inspection_order,
243
  )
244
  if judge_score is None:
245
+ score = round(keyword_score, 2)
246
+ # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.2f} reasoning=n/a total={score:.2f}", file=sys.stderr, flush=True)
247
  else:
248
+ score = round(0.85 * keyword_score + 0.15 * judge_score, 2)
249
  # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.2f} reasoning={judge_score:.2f} total={score:.2f}", file=sys.stderr, flush=True)
250
 
251
  success = score >= SUCCESS_THRESHOLD
 
253
  finally:
254
  steps_taken = len(rewards)
255
  final_score = round(max(0.01, min(0.99, sum(rewards))), 2) if rewards else 0.01
256
+ print(f"[END] success={str(success).lower()} steps={steps_taken} reward={final_score:.2f}", flush=True)
257
 
258
  return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
259
 
 
293
  scores += await run_task("task_easy", EASY_SCENARIOS, env, client)
294
  scores += await run_task("task_medium", MEDIUM_SCENARIOS, env, client)
295
  scores += await run_task("task_hard", HARD_SCENARIOS, env, client)
296
+ overall = round(sum(scores) / len(scores), 2) if scores else 0.01
297
  # print(f" [OVERALL] avg_score={overall:.2f}", file=sys.stderr, flush=True)
298
  # print(f"[END] score={overall:.2f}", flush=True)
299
  finally:
models.py CHANGED
@@ -42,8 +42,8 @@ class WhyDidItFailObservation(Observation):
42
  "Which action_types are valid on this step.")
43
  steps_taken: int = Field(..., description=
44
  "Number of actions taken so far in this episode.")
45
- reward: float = Field(default=0.0, description= # type: ignore[override]
46
- "Score for the current step. 1.0 = solved.")
47
  done: bool = Field(default=False, description=
48
  "True when the episode has ended.")
49
  feedback: str = Field(..., description=
 
42
  "Which action_types are valid on this step.")
43
  steps_taken: int = Field(..., description=
44
  "Number of actions taken so far in this episode.")
45
+ reward: float = Field(default=0.01, description= # type: ignore[override]
46
+ "Score for the current step. 0.99 = max.")
47
  done: bool = Field(default=False, description=
48
  "True when the episode has ended.")
49
  feedback: str = Field(..., description=
server/WhyDidItFail_environment.py CHANGED
@@ -56,7 +56,7 @@ class WhyDidItFailEnvironment(Environment):
56
  visible_data={"hint": "Start by inspecting the training logs."},
57
  available_actions=["inspect_logs", "inspect_config", "inspect_gradients", "submit_diagnosis"],
58
  steps_taken=0,
59
- reward=0.0,
60
  done=False,
61
  feedback="Investigation started.",
62
  )
@@ -67,18 +67,18 @@ class WhyDidItFailEnvironment(Environment):
67
 
68
  self._state.step_count += 1
69
 
70
- # Hard step limit — terminate immediately, grade() will return 0.0.
71
  if self._state.step_count > self.max_steps and action.action_type != "submit_diagnosis":
72
  return WhyDidItFailObservation(
73
  task_description="Step limit reached. Episode terminated.",
74
  visible_data={},
75
  available_actions=[],
76
  steps_taken=self._state.step_count,
77
- reward=0.0,
78
  done=True,
79
  feedback=(
80
  f"Step limit ({self.max_steps}) reached without a diagnosis. "
81
- f"Score: 0.00. Actual failure: '{self.scenario['correct_diagnosis']}'."
82
  ),
83
  )
84
  required: list[str] = self.scenario.get("required_sources", ["logs"])
 
56
  visible_data={"hint": "Start by inspecting the training logs."},
57
  available_actions=["inspect_logs", "inspect_config", "inspect_gradients", "submit_diagnosis"],
58
  steps_taken=0,
59
+ reward=0.01,
60
  done=False,
61
  feedback="Investigation started.",
62
  )
 
67
 
68
  self._state.step_count += 1
69
 
70
+ # Hard step limit — terminate immediately, grade() will return 0.01.
71
  if self._state.step_count > self.max_steps and action.action_type != "submit_diagnosis":
72
  return WhyDidItFailObservation(
73
  task_description="Step limit reached. Episode terminated.",
74
  visible_data={},
75
  available_actions=[],
76
  steps_taken=self._state.step_count,
77
+ reward=0.01,
78
  done=True,
79
  feedback=(
80
  f"Step limit ({self.max_steps}) reached without a diagnosis. "
81
+ f"Score: 0.01. Actual failure: '{self.scenario['correct_diagnosis']}'."
82
  ),
83
  )
84
  required: list[str] = self.scenario.get("required_sources", ["logs"])
server/graders.py CHANGED
@@ -6,7 +6,7 @@ grade() is the single entry point. It scores the full episode trajectory:
6
  diagnosis_score (0.00 – 0.70) was the diagnosis correct?
7
  evidence_score (0.00 – 0.15) did the agent inspect the right sources?
8
  efficiency_score (0.00 – 0.15) did the agent act without waste?
9
- fix_bonus (0.00 – 0.15) did the agent suggest a valid fix? (bonus, capped at 1.0)
10
 
11
  Step-level partial rewards are returned by the environment's step() on every action,
12
  giving the agent a signal over the full trajectory before the episode ends.
@@ -197,7 +197,7 @@ def grade(
197
  Single unified grade function. Scores every scenario identically.
198
 
199
  Total score = diagnosis_score + evidence_score + efficiency_score + fix_bonus
200
- clamped to [0.0, 1.0].
201
 
202
  Max achievable without fix: 0.70 + 0.15 + 0.15 = 1.00
203
  Max achievable with fix: 0.70 + 0.15 + 0.15 + 0.15 = 1.00 (capped)
 
6
  diagnosis_score (0.00 – 0.70) was the diagnosis correct?
7
  evidence_score (0.00 – 0.15) did the agent inspect the right sources?
8
  efficiency_score (0.00 – 0.15) did the agent act without waste?
9
+ fix_bonus (0.00 – 0.15) did the agent suggest a valid fix? (bonus, capped at 0.99)
10
 
11
  Step-level partial rewards are returned by the environment's step() on every action,
12
  giving the agent a signal over the full trajectory before the episode ends.
 
197
  Single unified grade function. Scores every scenario identically.
198
 
199
  Total score = diagnosis_score + evidence_score + efficiency_score + fix_bonus
200
+ clamped to [0.01, 0.99].
201
 
202
  Max achievable without fix: 0.70 + 0.15 + 0.15 = 1.00 (clamped to 0.99)
203
  Max achievable with fix: 0.70 + 0.15 + 0.15 + 0.15 = 1.15 (clamped to 0.99)
server/llm_judge.py CHANGED
@@ -91,7 +91,7 @@ def judge(
91
  + data.get("fix_rationale", 0)
92
  )
93
  # normalize: raw 0–15 → 0.0–1.0
94
- return round(max(0, min(15, raw)) / 15, 4)
95
 
96
  except Exception as exc:
97
  print(f" [JUDGE] failed: {exc}", flush=True)
 
91
  + data.get("fix_rationale", 0)
92
  )
93
  # normalize: raw 0–15 → 0.0–1.0
94
+ return round(max(0, min(15, raw)) / 15, 2)
95
 
96
  except Exception as exc:
97
  print(f" [JUDGE] failed: {exc}", flush=True)