""" NAH-8 — grade_answer strict-JSON grading + parser/repair retry. Runs with the real (non-stub) code path but with llm.chat monkeypatched, so no model/GPU is needed. Verifies: clean JSON parses, a bad first reply triggers one repair retry, and a never-valid model gives a safe default instead of crashing. RECALL_STUB=0 python3 test_grade_answer.py """ import os os.environ["RECALL_STUB"] = "0" # exercise the real model path, not the heuristic import llm import learning_engine as le from schema import new_card def _card(): return new_card( question="What does mitochondria do?", answer="It produces ATP, the cell's energy.", topic="Cell Biology", ) def _fake_chat(replies): """Return a chat() that yields the given replies in order.""" calls = {"n": 0} def chat(messages, max_tokens=512): i = min(calls["n"], len(replies) - 1) calls["n"] += 1 return replies[i] return chat, calls def test_clean_json_first_try(): llm.chat, calls = _fake_chat([ '{"score": 5, "explanation": "Spot on.", "missed_concept": ""}' ]) g = le.grade_answer(_card(), "It makes ATP energy") assert g["score"] == 5 and g["correct"] is True assert g["explanation"] == "Spot on." assert calls["n"] == 1, "should not retry when first reply is valid" print("ok clean JSON on first try") def test_repair_retry_recovers(): # First reply is junk; repair pass returns valid JSON. llm.chat, calls = _fake_chat([ "Sure! The student did okay I think, maybe a 2 or 3.", '```json\n{"score": 2, "explanation": "Missed the ATP detail.", ' '"missed_concept": "ATP production"}\n```', ]) g = le.grade_answer(_card(), "it is in the cell") assert g["score"] == 2 and g["correct"] is False assert g["missed_concept"] == "ATP production" assert calls["n"] == 2, "should retry exactly once to repair bad JSON" print("ok repair retry recovers bad first reply") def test_safe_default_when_never_valid(): llm.chat, calls = _fake_chat(["no json here", "still no json at all"]) g = le.grade_answer(_card(), "dunno") assert g["score"] == 2 # neutral safe default assert "reference" in g["explanation"].lower() assert calls["n"] == 2, "tries once + one repair, then gives up" print("ok safe default when model never returns JSON") def test_out_of_range_score_rejected(): # Score outside 0-5 must be treated as unusable, not clamped silently. llm.chat, calls = _fake_chat([ '{"score": 99, "explanation": "x"}', 'also not valid json', ]) g = le.grade_answer(_card(), "whatever") assert g["score"] == 2, "out-of-range score should fall through to default" print("ok out-of-range score rejected -> safe default") if __name__ == "__main__": test_clean_json_first_try() test_repair_retry_recovers() test_safe_default_when_never_valid() test_out_of_range_score_rejected() print("\nAll NAH-8 grade_answer tests passed.")