Spaces:

samrat-rm
/

WhyDidItFail

Sleeping

App Files Files Community

samrat-rm committed 10 days ago

Commit

ae1e803

1 Parent(s): 1288c52

feat: implementing judge LLM which contributes to 15% of scoring

Browse files

Files changed (2) hide show

inference.py +28 -5
llm_judge.py +94 -0

inference.py CHANGED Viewed

@@ -30,6 +30,7 @@ load_dotenv()
 from openai import OpenAI
 from client import WhyDidItFailEnv
 from models import WhyDidItFailAction
 from server.scenarios import SCENARIOS
@@ -112,8 +113,8 @@ def _get_action(client: OpenAI, step: int, obs_summary: str, history: List[str])
     except Exception as exc:
         print(f"  [DEBUG] parse error: {exc}", flush=True)
         if step <= 2:
-            return WhyDidItFailAction(action_type="inspect_logs", diagnosis=None, suggested_fix=None)
-        return WhyDidItFailAction(action_type="submit_diagnosis", diagnosis="unknown", suggested_fix=None)
 # ── episode runner ────────────────────────────────────────────────────────────
@@ -133,7 +134,15 @@ async def run_episode(env: WhyDidItFailEnv, client: OpenAI, scenario_key: str) -
         obs      = result.observation
         reward   = result.reward or 0.0
         done     = result.done
-        act_str  = action.model_dump_json(exclude_none=True)
         rewards.append(reward)
         history.append(f"Step {step}: {act_str} → reward={reward:.2f} | {obs.feedback}")
@@ -142,8 +151,22 @@ async def run_episode(env: WhyDidItFailEnv, client: OpenAI, scenario_key: str) -
         if done:
             break
-    # Final score = reward on submit_diagnosis (last reward)
-    score   = rewards[-1] if rewards else 0.0
     success = score >= SUCCESS_THRESHOLD
     return {"scenario_key": scenario_key, "score": score, "steps": len(rewards), "success": success}

 from openai import OpenAI
 from client import WhyDidItFailEnv
+from llm_judge import judge as llm_judge
 from models import WhyDidItFailAction
 from server.scenarios import SCENARIOS
     except Exception as exc:
         print(f"  [DEBUG] parse error: {exc}", flush=True)
         if step <= 2:
+            return WhyDidItFailAction(action_type="inspect_logs", diagnosis=None, suggested_fix=None,reasoning=None)
+        return WhyDidItFailAction(action_type="submit_diagnosis", diagnosis="unknown", suggested_fix=None,reasoning=None)
 # ── episode runner ────────────────────────────────────────────────────────────
         obs      = result.observation
         reward   = result.reward or 0.0
         done     = result.done
+        act_str  = action.model_dump_json(exclude_none=True, exclude_defaults=True)
+        if action.action_type in ("inspect_logs", "inspect_config", "inspect_gradients"):
+            source = action.action_type.replace("inspect_", "")
+            if source not in inspection_order:
+                inspection_order.append(source)
+        if action.action_type == "submit_diagnosis":
+            submit_action = action  # judge runs after loop — WebSocket is closed by then
         rewards.append(reward)
         history.append(f"Step {step}: {act_str} → reward={reward:.2f} | {obs.feedback}")
         if done:
             break
+    # WebSocket is closed — safe to call the judge now
+    keyword_score = rewards[-1] if rewards else 0.0
+    judge_score = 0.0
+    if submit_action is not None:
+        judge_score = llm_judge(
+            client=client,
+            model=MODEL_NAME,
+            diagnosis=submit_action.diagnosis or "",
+            reasoning=submit_action.reasoning,
+            suggested_fix=submit_action.suggested_fix,
+            scenario=SCENARIOS[scenario_key],
+            inspection_order=inspection_order,
+        )
+    score = round(0.85 * keyword_score + 0.15 * judge_score, 4)
+    print(f"  [JUDGE]   scenario={scenario_key} keyword={keyword_score:.3f} reasoning={judge_score:.3f} total={score:.3f}", flush=True)
     success = score >= SUCCESS_THRESHOLD
     return {"scenario_key": scenario_key, "score": score, "steps": len(rewards), "success": success}

llm_judge.py ADDED Viewed

	@@ -0,0 +1,94 @@

+"""
+LLM Judge — reasoning quality scorer for WhyDidItFail.
+Called from inference.py after submit_diagnosis.
+Uses the same OpenAI-compatible client and model as the agent.
+Returns a normalized score in [0.0, 1.0] representing reasoning quality.
+Returns 0.0 if reasoning is absent, or if the call fails (the failure is printed to stdout).
+Scoring criteria (0–5 each, total 0–15 → normalized to 0.0–1.0):
+  evidence_grounding  — does the reasoning cite specific observed values?
+  causal_chain        — does it connect evidence to the failure mode logically?
+  fix_rationale       — is the fix justified by the evidence?
+Final score in inference.py:
+  total = 0.85 * keyword_score + 0.15 * judge_score  → always in [0.0, 1.0]
+"""
+import json
+from openai import OpenAI
+def _build_prompt(
+    diagnosis: str,
+    suggested_fix: str | None,
+    reasoning: str,
+    scenario: dict,
+    inspection_order: list[str],
+) -> str:
+    seen: dict = {}
+    if "logs" in inspection_order:
+        seen["training_logs"] = scenario.get("logs", [])
+    if "config" in inspection_order:
+        seen["config"] = scenario.get("config", {})
+    if "gradients" in inspection_order:
+        seen["gradient_norms"] = scenario.get("gradient_norms", None)
+    return f"""You are evaluating the reasoning of an ML debugging agent.
+Agent submission:
+  Diagnosis:     {diagnosis}
+  Suggested fix: {suggested_fix or "none provided"}
+  Reasoning:     {reasoning}
+Data the agent had access to:
+{json.dumps(seen, indent=2)}
+Score the reasoning (integers only):
+  evidence_grounding (0-5): Does the reasoning cite specific values from the data above?
+  causal_chain       (0-5): Does it logically connect that evidence to the diagnosed failure mode?
+  fix_rationale      (0-5): Is the fix directly justified by the evidence and diagnosis?
+Respond with JSON only, no explanation:
+{{"evidence_grounding": <int>, "causal_chain": <int>, "fix_rationale": <int>}}"""
+def judge(
+    client: OpenAI,
+    model: str,
+    diagnosis: str,
+    reasoning: str | None,
+    suggested_fix: str | None,
+    scenario: dict,
+    inspection_order: list[str],
+) -> float:
+    """Score reasoning quality. Returns 0.0–1.0. Returns 0.0 if reasoning absent or call fails."""
+    if not reasoning or not reasoning.strip():
+        return 0.0
+    try:
+        completion = client.chat.completions.create(
+            model=model,
+            messages=[
+                {"role": "user", "content": _build_prompt(
+                    diagnosis, suggested_fix, reasoning, scenario, inspection_order
+                )},
+            ],
+            temperature=0.0,
+            max_tokens=64,
+        )
+        text = (completion.choices[0].message.content or "").strip()
+        data = json.loads(text)
+        raw = (
+            data.get("evidence_grounding", 0)
+            + data.get("causal_chain", 0)
+            + data.get("fix_rationale", 0)
+        )
+        # normalize: raw 0–15 → 0.0–1.0
+        return round(max(0, min(15, raw)) / 15, 4)
+    except Exception as exc:
+        print(f"  [JUDGE] failed: {exc}", flush=True)
+        return 0.0