Spaces:

samrat-rm
/

WhyDidItFail

Sleeping

App Files Files Community

samrat-rm committed 11 days ago

Commit

9f554a9

1 Parent(s): 572e42a

fix(grade): keyword matching and requires_fix flag for diagnosis scoring

Browse files

Files changed (2) hide show

server/WhyDidItFail_environment.py +33 -3
server/scenarios.py +1 -0

server/WhyDidItFail_environment.py CHANGED Viewed

@@ -86,6 +86,28 @@ class WhyDidItFailEnvironment(Environment):
             reward=0.0, done=False, feedback=feedback
         )
     def grade(self, action: WhyDidItFailAction) -> tuple[float, str, bool]:
         """Score a submit_diagnosis action against the current scenario."""
         if self.scenario is None:
@@ -95,12 +117,20 @@ class WhyDidItFailEnvironment(Environment):
         correct_fix = (self.scenario.get("correct_fix") or "").strip().lower()
         suggested_fix = (action.suggested_fix or "").strip().lower()
-        diagnosis_correct = diagnosis == correct_diagnosis
-        fix_correct = suggested_fix == correct_fix if correct_fix else True
         if diagnosis_correct and fix_correct:
             reward = 1.0
-            feedback = "Correct diagnosis and fix!"
         elif diagnosis_correct:
             reward = 0.5
             feedback = f"Correct diagnosis, but the suggested fix was wrong. Expected: '{self.scenario.get('correct_fix')}'."

             reward=0.0, done=False, feedback=feedback
         )
+    @staticmethod
+    def _keywords_match(submitted: str, expected: str) -> bool:
+        """Return True if all significant keywords from expected appear in submitted.
+        Both strings should already be lowercased. Underscores and hyphens are
+        treated as spaces so "exploding_gradients" matches "exploding gradients".
+        Common stop words ("to", "a", "the", …) are ignored during keyword
+        extraction so that filler differences don't cause false negatives.
+        """
+        _STOP_WORDS = {"to", "a", "the", "and", "or", "is", "are", "was", "an", "in", "of"}
+        def _normalize(s: str) -> str:
+            return s.replace("_", " ").replace("-", " ")
+        submitted_norm = _normalize(submitted)
+        keywords = [
+            w for w in _normalize(expected).split()
+            if w not in _STOP_WORDS and len(w) > 1
+        ]
+        return all(kw in submitted_norm for kw in keywords)
+    # TODO: partial-credit scoring, configurable keyword aliases per scenario, false-positive guard.
     def grade(self, action: WhyDidItFailAction) -> tuple[float, str, bool]:
         """Score a submit_diagnosis action against the current scenario."""
         if self.scenario is None:
         correct_fix = (self.scenario.get("correct_fix") or "").strip().lower()
         suggested_fix = (action.suggested_fix or "").strip().lower()
+        requires_fix: bool = self.scenario.get("requires_fix", False)
+        diagnosis_correct = self._keywords_match(diagnosis, correct_diagnosis)
+        if not requires_fix:
+            fix_correct = True  # fix not evaluated for this scenario
+        elif not correct_fix:
+            # Scenario marked requires_fix=True but forgot to set correct_fix — safe default.
+            fix_correct = False
+        else:
+            fix_correct = self._keywords_match(suggested_fix, correct_fix)
         if diagnosis_correct and fix_correct:
             reward = 1.0
+            feedback = "Correct diagnosis and fix!" if requires_fix else "Correct diagnosis!"
         elif diagnosis_correct:
             reward = 0.5
             feedback = f"Correct diagnosis, but the suggested fix was wrong. Expected: '{self.scenario.get('correct_fix')}'."

server/scenarios.py CHANGED Viewed

@@ -18,6 +18,7 @@ SCENARIOS = {
         "gradient_norms": None,   # not visible until agent requests it
         "correct_diagnosis": "exploding_gradients",
         "correct_fix": "reduce learning_rate to 0.001",
     }
 }
 # TODO : Add more scenarios

         "gradient_norms": None,   # not visible until agent requests it
         "correct_diagnosis": "exploding_gradients",
         "correct_fix": "reduce learning_rate to 0.001",
+        "requires_fix": False,   # set True on hard scenarios where fix must be graded
     }
 }
 # TODO : Add more scenarios