recall / test_grade_answer.py
nz-nz's picture
[NAH-8] grade_answer: strict-JSON grading + parser/repair retry
560c197
Raw
History Blame Contribute Delete
3.04 kB
"""
NAH-8 — grade_answer strict-JSON grading + parser/repair retry.
Runs with the real (non-stub) code path but with llm.chat monkeypatched, so no
model/GPU is needed. Verifies: clean JSON parses, a bad first reply triggers one
repair retry, and a never-valid model gives a safe default instead of crashing.
Usage:
    RECALL_STUB=0 python3 test_grade_answer.py
"""
import os
os.environ["RECALL_STUB"] = "0" # exercise the real model path, not the heuristic
import llm
import learning_engine as le
from schema import new_card
def _card():
    """Build the single flashcard fixture shared by every test in this file."""
    fields = {
        "question": "What does mitochondria do?",
        "answer": "It produces ATP, the cell's energy.",
        "topic": "Cell Biology",
    }
    return new_card(**fields)
def _fake_chat(replies):
"""Return a chat() that yields the given replies in order."""
calls = {"n": 0}
def chat(messages, max_tokens=512):
i = min(calls["n"], len(replies) - 1)
calls["n"] += 1
return replies[i]
return chat, calls
def test_clean_json_first_try():
    """A well-formed JSON reply is accepted as-is, with no repair retry."""
    fake_chat, calls = _fake_chat([
        '{"score": 5, "explanation": "Spot on.", "missed_concept": ""}'
    ])
    real_chat = llm.chat
    llm.chat = fake_chat
    try:
        g = le.grade_answer(_card(), "It makes ATP energy")
    finally:
        # Undo the monkeypatch so the fake never leaks into other tests.
        llm.chat = real_chat
    assert g["score"] == 5
    assert g["correct"] is True
    assert g["explanation"] == "Spot on."
    assert calls["n"] == 1, "should not retry when first reply is valid"
    print("ok clean JSON on first try")
def test_repair_retry_recovers():
    """A junk first reply triggers exactly one repair call, whose JSON is used."""
    # First reply is junk; repair pass returns valid JSON (inside a code fence).
    fake_chat, calls = _fake_chat([
        "Sure! The student did okay I think, maybe a 2 or 3.",
        '```json\n{"score": 2, "explanation": "Missed the ATP detail.", '
        '"missed_concept": "ATP production"}\n```',
    ])
    real_chat = llm.chat
    llm.chat = fake_chat
    try:
        g = le.grade_answer(_card(), "it is in the cell")
    finally:
        # Undo the monkeypatch so the fake never leaks into other tests.
        llm.chat = real_chat
    assert g["score"] == 2
    assert g["correct"] is False
    assert g["missed_concept"] == "ATP production"
    assert calls["n"] == 2, "should retry exactly once to repair bad JSON"
    print("ok repair retry recovers bad first reply")
def test_safe_default_when_never_valid():
    """If neither the first reply nor the repair is JSON, a safe default is returned."""
    fake_chat, calls = _fake_chat(["no json here", "still no json at all"])
    real_chat = llm.chat
    llm.chat = fake_chat
    try:
        g = le.grade_answer(_card(), "dunno")
    finally:
        # Undo the monkeypatch so the fake never leaks into other tests.
        llm.chat = real_chat
    assert g["score"] == 2  # neutral safe default
    assert "reference" in g["explanation"].lower()
    assert calls["n"] == 2, "tries once + one repair, then gives up"
    print("ok safe default when model never returns JSON")
def test_out_of_range_score_rejected():
    """A parseable reply whose score is outside 0-5 must not be accepted."""
    # Score outside 0-5 must be treated as unusable, not clamped silently.
    # The call counter is not checked here, hence the underscore name.
    fake_chat, _calls = _fake_chat([
        '{"score": 99, "explanation": "x"}',
        'also not valid json',
    ])
    real_chat = llm.chat
    llm.chat = fake_chat
    try:
        g = le.grade_answer(_card(), "whatever")
    finally:
        # Undo the monkeypatch so the fake never leaks into other tests.
        llm.chat = real_chat
    assert g["score"] == 2, "out-of-range score should fall through to default"
    print("ok out-of-range score rejected -> safe default")
if __name__ == "__main__":
    # Script mode: run each case in declaration order. A failing assert aborts
    # the run before the summary line is printed.
    for case in (
        test_clean_json_first_try,
        test_repair_retry_recovers,
        test_safe_default_when_never_valid,
        test_out_of_range_score_rejected,
    ):
        case()
    print("\nAll NAH-8 grade_answer tests passed.")