# Spaces: Running on Zero
"""
NAH-8 — grade_answer strict-JSON grading + parser/repair retry.

Runs the real (non-stub) code path, but with llm.chat monkeypatched, so no
model/GPU is needed. Verifies that clean JSON parses, that a bad first reply
triggers exactly one repair retry, and that a model which never returns valid
JSON yields a safe default instead of crashing.

Run with:
    RECALL_STUB=0 python3 test_grade_answer.py
"""
import os
# Order matters: the env var has to be in place before `import llm` below,
# because `llm.STUB` is read once at import time (see the note further down).
os.environ["RECALL_STUB"] = "0"  # exercise the real model path, not the heuristic
import llm
import learning_engine as le
from schema import new_card
# `llm.STUB` is read once at import. Under the full pytest run another test file
# imports `llm` first (with stub on), so the env flip above wouldn't take — and
# grade_answer would silently use the heuristic instead of the patched chat.
# Pin it off here so these tests are robust to import order. (Downstream test
# files either reload llm or don't depend on STUB.)
# NOTE(review): this assignment is never undone, so it stays in effect for any
# test module that runs later in the same process.
llm.STUB = False
def _card():
    """Build the flashcard fixture shared by every test in this module.

    Each call goes through ``new_card`` again, so tests never share a card
    object.
    """
    fields = {
        "question": "What does mitochondria do?",
        "answer": "It produces ATP, the cell's energy.",
        "topic": "Cell Biology",
    }
    return new_card(**fields)
| def _fake_chat(replies): | |
| """Return a chat() that yields the given replies in order.""" | |
| calls = {"n": 0} | |
| def chat(messages, max_tokens=512): | |
| i = min(calls["n"], len(replies) - 1) | |
| calls["n"] += 1 | |
| return replies[i] | |
| return chat, calls | |
def test_clean_json_first_try():
    """A well-formed JSON reply is used directly, after a single model call."""
    reply = '{"score": 5, "explanation": "Spot on.", "missed_concept": ""}'
    fake, counter = _fake_chat([reply])
    llm.chat = fake

    graded = le.grade_answer(_card(), "It makes ATP energy")

    assert graded["score"] == 5
    assert graded["correct"] is True
    assert graded["explanation"] == "Spot on."
    assert counter["n"] == 1, "should not retry when first reply is valid"
    print("ok clean JSON on first try")
def test_repair_retry_recovers():
    """An unparseable first reply leads to exactly one repair call that succeeds."""
    # What the model says first: chatty prose with no JSON in it.
    junk = "Sure! The student did okay I think, maybe a 2 or 3."
    # What the repair pass gets back: valid JSON inside a markdown code fence.
    repaired = (
        '```json\n{"score": 2, "explanation": "Missed the ATP detail.", '
        '"missed_concept": "ATP production"}\n```'
    )
    fake, counter = _fake_chat([junk, repaired])
    llm.chat = fake

    graded = le.grade_answer(_card(), "it is in the cell")

    assert graded["score"] == 2
    assert graded["correct"] is False
    assert graded["missed_concept"] == "ATP production"
    assert counter["n"] == 2, "should retry exactly once to repair bad JSON"
    print("ok repair retry recovers bad first reply")
def test_safe_default_when_never_valid():
    """If no reply ever contains JSON, grading falls back to a neutral default."""
    fake, counter = _fake_chat(["no json here", "still no json at all"])
    llm.chat = fake

    # The answer is a genuine attempt, so it is sent to the model.
    graded = le.grade_answer(_card(), "it makes energy for the cell")

    assert graded["score"] == 2  # neutral safe default
    explanation = graded["explanation"].lower()
    assert "reference" in explanation
    assert counter["n"] == 2, "tries once + one repair, then gives up"
    print("ok safe default when model never returns JSON")
def test_out_of_range_score_rejected():
    """A score outside 0-5 is treated as unusable rather than silently clamped."""
    scripted = [
        '{"score": 99, "explanation": "x"}',
        'also not valid json',
    ]
    # The call counter is not inspected in this test.
    llm.chat, _ = _fake_chat(scripted)

    graded = le.grade_answer(_card(), "whatever")

    assert graded["score"] == 2, "out-of-range score should fall through to default"
    print("ok out-of-range score rejected -> safe default")
def test_third_person_possessive_rewritten_to_second():
    """Possessive "the student's ..." in the model output comes back as "your ..."."""
    # The model slips into "The student's answer/response" ~half the time; the
    # safe possessive swaps are applied to the fields grade_answer returns.
    reply = (
        '{"score": 1, "explanation": "The student\'s answer, \'magic\', is wrong.", '
        '"missed_concept": "the student\'s grasp of the mechanism"}'
    )
    llm.chat, _ = _fake_chat([reply])

    graded = le.grade_answer(_card(), "magic")

    explanation = graded["explanation"]
    assert explanation == "Your answer, 'magic', is wrong.", explanation
    missed = graded["missed_concept"]
    assert missed == "your grasp of the mechanism", missed
    print("ok third-person possessive rewritten to second person")
def test_second_person_leaves_safe_subject_form_alone():
    """Sentences without a third-person possessive pass through unchanged."""
    # Only possessives are swapped. A subject-form "The student identifies..."
    # must stay as it is instead of becoming the ungrammatical "You identifies...".
    unchanged = (
        "The student identifies it.",
        "Your answer is close.",
    )
    for sentence in unchanged:
        assert le._to_second_person(sentence) == sentence
    print("ok subject-form third person left alone (no grammar mangling)")
def test_empty_answer_short_circuits_to_zero():
    """Blank or whitespace-only answers score 0 without any model call."""
    # An empty answer is a miss. Left to the model, the blank input is ignored
    # and a 4/5 "correct" gets hallucinated, so the fake below must stay unused.
    fake, counter = _fake_chat(['{"score": 5, "explanation": "x"}'])
    llm.chat = fake

    blanks = ("", " ", "\n\t")
    for blank in blanks:
        graded = le.grade_answer(_card(), blank)
        assert graded["score"] == 0, (blank, graded)
        assert graded["correct"] is False, (blank, graded)
        assert "reference answer" in graded["explanation"].lower()

    assert counter["n"] == 0, "empty answer must not call the model"
    print("ok empty answer short-circuits to score 0 (no model call)")
def test_non_answer_short_circuits_to_zero():
    """Give-up replies score 0 with no model call; real attempts are still graded."""
    # "idk" / "don't know" / "?" are misses too. Left to the model, they are
    # ignored and the reference answer is graded instead, hallucinating a 4/5
    # "correct".
    fake, counter = _fake_chat(['{"score": 5, "explanation": "x"}'])
    llm.chat = fake
    non_answers = ("idk", "I don't know", "don know", "no idea", "?", "...")
    for non in non_answers:
        graded = le.grade_answer(_card(), non)
        assert graded["score"] == 0, (non, graded)
        assert graded["correct"] is False, (non, graded)
    assert counter["n"] == 0, "a non-answer must not call the model"

    # A real attempt that merely contains "no"/"don't know" still reaches the model.
    fake, counter = _fake_chat(['{"score": 4, "explanation": "Close."}'])
    llm.chat = fake
    le.grade_answer(_card(), "no, it is the stroma")
    assert counter["n"] == 1, "a real attempt must still be graded by the model"
    print("ok non-answers ('idk', \"don't know\", '?') short-circuit to score 0")
if __name__ == "__main__":
    # Script mode: run every test in file order; the first failing assert aborts.
    for run_test in (
        test_clean_json_first_try,
        test_repair_retry_recovers,
        test_safe_default_when_never_valid,
        test_out_of_range_score_rejected,
        test_third_person_possessive_rewritten_to_second,
        test_second_person_leaves_safe_subject_form_alone,
        test_empty_answer_short_circuits_to_zero,
        test_non_answer_short_circuits_to_zero,
    ):
        run_test()
    print("\nAll NAH-8 grade_answer tests passed.")