"""
NAH-8 — grade_answer strict-JSON grading + parser/repair retry.
Runs with the real (non-stub) code path but with llm.chat monkeypatched, so no
model/GPU is needed. Verifies: clean JSON parses, a bad first reply triggers one
repair retry, and a never-valid model gives a safe default instead of crashing.
RECALL_STUB=0 python3 test_grade_answer.py
"""
import os
# Exercise the real model path, not the heuristic. This is set before
# `import llm` on purpose: per the note below, `llm.STUB` is read once at
# import (presumably from this variable — confirm in llm.py).
os.environ["RECALL_STUB"] = "0"
import llm
import learning_engine as le
from schema import new_card
# `llm.STUB` is read once at import. Under the full pytest run another test file
# imports `llm` first (with stub on), so the env flip above would not take
# effect, and grade_answer would silently use the heuristic instead of the
# patched chat. Pin it off here so these tests are robust to import order.
# (Downstream test files either reload llm or don't depend on STUB.)
llm.STUB = False
def _card():
    """Build the cell-biology flashcard used as the grading target in these tests."""
    fields = {
        "question": "What does mitochondria do?",
        "answer": "It produces ATP, the cell's energy.",
        "topic": "Cell Biology",
    }
    return new_card(**fields)
def _fake_chat(replies):
"""Return a chat() that yields the given replies in order."""
calls = {"n": 0}
def chat(messages, max_tokens=512):
i = min(calls["n"], len(replies) - 1)
calls["n"] += 1
return replies[i]
return chat, calls
def test_clean_json_first_try():
    """A well-formed JSON reply is graded from a single chat call."""
    reply = '{"score": 5, "explanation": "Spot on.", "missed_concept": ""}'
    llm.chat, calls = _fake_chat([reply])

    grade = le.grade_answer(_card(), "It makes ATP energy")

    assert grade["score"] == 5
    assert grade["correct"] is True
    assert grade["explanation"] == "Spot on."
    assert calls["n"] == 1, "should not retry when first reply is valid"
    print("ok clean JSON on first try")
def test_repair_retry_recovers():
    """A junk first reply is followed by exactly one repair call that succeeds."""
    # Reply 1 has no JSON at all; reply 2 (the repair pass) is valid JSON
    # wrapped in a markdown code fence.
    junk = "Sure! The student did okay I think, maybe a 2 or 3."
    fenced = (
        '```json\n{"score": 2, "explanation": "Missed the ATP detail.", '
        '"missed_concept": "ATP production"}\n```'
    )
    llm.chat, calls = _fake_chat([junk, fenced])

    grade = le.grade_answer(_card(), "it is in the cell")

    assert grade["score"] == 2
    assert grade["correct"] is False
    assert grade["missed_concept"] == "ATP production"
    assert calls["n"] == 2, "should retry exactly once to repair bad JSON"
    print("ok repair retry recovers bad first reply")
def test_safe_default_when_never_valid():
    """If neither call yields JSON, a neutral default grade comes back."""
    llm.chat, calls = _fake_chat(["no json here", "still no json at all"])

    attempt = "it makes energy for the cell"  # a real attempt
    grade = le.grade_answer(_card(), attempt)

    assert grade["score"] == 2  # neutral safe default
    assert "reference" in grade["explanation"].lower()
    assert calls["n"] == 2, "tries once + one repair, then gives up"
    print("ok safe default when model never returns JSON")
def test_out_of_range_score_rejected():
    """A score outside 0-5 is treated as unusable rather than silently clamped."""
    out_of_range = '{"score": 99, "explanation": "x"}'
    unparseable = 'also not valid json'
    llm.chat, _ = _fake_chat([out_of_range, unparseable])

    grade = le.grade_answer(_card(), "whatever")

    assert grade["score"] == 2, "out-of-range score should fall through to default"
    print("ok out-of-range score rejected -> safe default")
def test_third_person_possessive_rewritten_to_second():
    """Third-person possessives in the grade text come back as "your"/"Your"."""
    # The model slips into "The student's answer/response" ~half the time; the
    # safe possessive swaps are applied to the returned explanation.
    reply = (
        '{"score": 1, "explanation": "The student\'s answer, \'magic\', is wrong.", '
        '"missed_concept": "the student\'s grasp of the mechanism"}'
    )
    llm.chat, _ = _fake_chat([reply])

    grade = le.grade_answer(_card(), "magic")

    explanation = grade["explanation"]
    assert explanation == "Your answer, 'magic', is wrong.", explanation
    missed = grade["missed_concept"]
    assert missed == "your grasp of the mechanism", missed
    print("ok third-person possessive rewritten to second person")
def test_second_person_leaves_safe_subject_form_alone():
    """Sentences with no possessive to swap pass through unchanged."""
    # Only possessives are swapped: a subject-form "The student identifies..."
    # is left untouched rather than mangled into "You identifies...".
    unchanged = (
        "The student identifies it.",
        "Your answer is close.",
    )
    for sentence in unchanged:
        assert le._to_second_person(sentence) == sentence
    print("ok subject-form third person left alone (no grammar mangling)")
def test_empty_answer_short_circuits_to_zero():
    """Blank or whitespace-only answers score 0 without any chat call."""
    # An empty answer is a miss. The fake would return a perfect score if it
    # were ever consulted (the real model otherwise ignores the blank input and
    # hallucinates a 4/5 "correct"), so a zero grade shows it was bypassed.
    llm.chat, calls = _fake_chat(['{"score": 5, "explanation": "x"}'])

    blanks = ("", " ", "\n\t")
    for blank in blanks:
        grade = le.grade_answer(_card(), blank)
        assert grade["score"] == 0, (blank, grade)
        assert grade["correct"] is False, (blank, grade)
        assert "reference answer" in grade["explanation"].lower()

    assert calls["n"] == 0, "empty answer must not call the model"
    print("ok empty answer short-circuits to score 0 (no model call)")
def test_non_answer_short_circuits_to_zero():
    """Give-up phrases score 0 with no chat call; real attempts still reach it."""
    # "idk" / "don't know" / "?" are misses too: the model otherwise ignores
    # them and grades the reference answer, hallucinating a 4/5 "correct".
    llm.chat, calls = _fake_chat(['{"score": 5, "explanation": "x"}'])

    non_answers = ("idk", "I don't know", "don know", "no idea", "?", "...")
    for non in non_answers:
        grade = le.grade_answer(_card(), non)
        assert grade["score"] == 0, (non, grade)
        assert grade["correct"] is False, (non, grade)

    assert calls["n"] == 0, "a non-answer must not call the model"

    # A real attempt that merely contains "no"/"don't know" still reaches the model.
    llm.chat, calls = _fake_chat(['{"score": 4, "explanation": "Close."}'])
    le.grade_answer(_card(), "no, it is the stroma")
    assert calls["n"] == 1, "a real attempt must still be graded by the model"
    print("ok non-answers ('idk', \"don't know\", '?') short-circuit to score 0")
if __name__ == "__main__":
    # Script entry point: run every test in file order, then report success.
    for run_test in (
        test_clean_json_first_try,
        test_repair_retry_recovers,
        test_safe_default_when_never_valid,
        test_out_of_range_score_rejected,
        test_third_person_possessive_rewritten_to_second,
        test_second_person_leaves_safe_subject_form_alone,
        test_empty_answer_short_circuits_to_zero,
        test_non_answer_short_circuits_to_zero,
    ):
        run_test()
    print("\nAll NAH-8 grade_answer tests passed.")
|