File size: 6,332 Bytes
7563305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
058157a
 
 
 
 
 
 
7563305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6505c2d
7563305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22402d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a7459e
 
 
 
 
 
 
 
 
 
 
 
6505c2d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7563305
 
 
 
 
22402d5
 
0a7459e
6505c2d
7563305
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""
NAH-8 — grade_answer strict-JSON grading + parser/repair retry.

Runs with the real (non-stub) code path but with llm.chat monkeypatched, so no
model/GPU is needed. Verifies: clean JSON parses, a bad first reply triggers one
repair retry, and a never-valid model gives a safe default instead of crashing.

    RECALL_STUB=0 python3 test_grade_answer.py
"""
import os

# Set before `llm` is imported below: per the note further down, `llm.STUB` is
# read once at import time, so flipping the variable afterwards has no effect.
os.environ["RECALL_STUB"] = "0"  # exercise the real model path, not the heuristic

import llm
import learning_engine as le
from schema import new_card

# `llm.STUB` is read once at import. Under the full pytest run another test file
# imports `llm` first (with stub on), so the env flip above wouldn't take — and
# grade_answer would silently use the heuristic instead of the patched chat.
# Pin it off here so these tests are robust to import order. (Downstream test
# files either reload llm or don't depend on STUB.)
llm.STUB = False


def _card():
    """Build the cell-biology flashcard used as the grading target in the tests."""
    fields = {
        "question": "What does mitochondria do?",
        "answer": "It produces ATP, the cell's energy.",
        "topic": "Cell Biology",
    }
    return new_card(**fields)


def _fake_chat(replies):
    """Return a chat() that yields the given replies in order."""
    calls = {"n": 0}

    def chat(messages, max_tokens=512):
        i = min(calls["n"], len(replies) - 1)
        calls["n"] += 1
        return replies[i]

    return chat, calls


def test_clean_json_first_try():
    """A well-formed JSON reply is used directly, with exactly one model call."""
    reply = '{"score": 5, "explanation": "Spot on.", "missed_concept": ""}'
    fake, counter = _fake_chat([reply])
    llm.chat = fake

    grade = le.grade_answer(_card(), "It makes ATP energy")

    assert grade["score"] == 5
    assert grade["correct"] is True
    assert grade["explanation"] == "Spot on."
    assert counter["n"] == 1, "should not retry when first reply is valid"
    print("ok  clean JSON on first try")


def test_repair_retry_recovers():
    """An unparseable first reply is followed by one repair call whose JSON is used."""
    junk_reply = "Sure! The student did okay I think, maybe a 2 or 3."
    # The repair pass answers with JSON wrapped in a markdown code fence.
    fenced_reply = (
        '```json\n{"score": 2, "explanation": "Missed the ATP detail.", '
        '"missed_concept": "ATP production"}\n```'
    )
    fake, counter = _fake_chat([junk_reply, fenced_reply])
    llm.chat = fake

    grade = le.grade_answer(_card(), "it is in the cell")

    assert grade["score"] == 2
    assert grade["correct"] is False
    assert grade["missed_concept"] == "ATP production"
    assert counter["n"] == 2, "should retry exactly once to repair bad JSON"
    print("ok  repair retry recovers bad first reply")


def test_safe_default_when_never_valid():
    """Two unparseable replies in a row yield the neutral default grade."""
    fake, counter = _fake_chat(["no json here", "still no json at all"])
    llm.chat = fake

    # The answer is a genuine attempt, so the model path is exercised.
    grade = le.grade_answer(_card(), "it makes energy for the cell")

    assert grade["score"] == 2  # neutral safe default
    explanation = grade["explanation"].lower()
    assert "reference" in explanation
    assert counter["n"] == 2, "tries once + one repair, then gives up"
    print("ok  safe default when model never returns JSON")


def test_out_of_range_score_rejected():
    """A score outside 0-5 is unusable: the result is the default, not a clamped value."""
    scripted = [
        '{"score": 99, "explanation": "x"}',
        'also not valid json',
    ]
    # The call counter is not inspected by this test.
    llm.chat, _ = _fake_chat(scripted)

    grade = le.grade_answer(_card(), "whatever")

    assert grade["score"] == 2, "out-of-range score should fall through to default"
    print("ok  out-of-range score rejected -> safe default")


def test_third_person_possessive_rewritten_to_second():
    """"The student's ..." in the model's reply comes back as "Your/your ..."."""
    # The model slips into "The student's answer/response" ~half the time; the
    # returned explanation and missed_concept must carry the possessive swap.
    reply = (
        '{"score": 1, "explanation": "The student\'s answer, \'magic\', is wrong.", '
        '"missed_concept": "the student\'s grasp of the mechanism"}'
    )
    fake, _ = _fake_chat([reply])
    llm.chat = fake

    grade = le.grade_answer(_card(), "magic")

    explanation = grade["explanation"]
    missed = grade["missed_concept"]
    assert explanation == "Your answer, 'magic', is wrong.", explanation
    assert missed == "your grasp of the mechanism", missed
    print("ok  third-person possessive rewritten to second person")


def test_second_person_leaves_safe_subject_form_alone():
    """Subject-form and already-second-person sentences pass through unchanged."""
    # Only possessives are swapped: rewriting a subject "The student
    # identifies..." would produce the ungrammatical "You identifies...".
    unchanged = (
        "The student identifies it.",
        "Your answer is close.",
    )
    for sentence in unchanged:
        assert le._to_second_person(sentence) == sentence
    print("ok  subject-form third person left alone (no grammar mangling)")


def test_empty_answer_short_circuits_to_zero():
    """Blank or whitespace-only answers score 0 without calling the model."""
    # The scripted reply is a decoy 5/5: if the model were consulted for a
    # blank answer, the score assertion below would fail.
    fake, counter = _fake_chat(['{"score": 5, "explanation": "x"}'])
    llm.chat = fake

    blanks = ("", "   ", "\n\t")
    for blank in blanks:
        grade = le.grade_answer(_card(), blank)
        assert grade["score"] == 0, (blank, grade)
        assert grade["correct"] is False, (blank, grade)
        assert "reference answer" in grade["explanation"].lower()

    assert counter["n"] == 0, "empty answer must not call the model"
    print("ok  empty answer short-circuits to score 0 (no model call)")


def test_non_answer_short_circuits_to_zero():
    """"idk"-style non-answers score 0 with no model call; real attempts still reach it."""
    # Decoy 5/5 reply: a non-answer that reached the model would not score 0.
    fake, counter = _fake_chat(['{"score": 5, "explanation": "x"}'])
    llm.chat = fake

    non_answers = ("idk", "I don't know", "don know", "no idea", "?", "...")
    for non in non_answers:
        grade = le.grade_answer(_card(), non)
        assert grade["score"] == 0, (non, grade)
        assert grade["correct"] is False, (non, grade)
    assert counter["n"] == 0, "a non-answer must not call the model"

    # A real attempt that merely contains "no"/"don't know" still reaches the model.
    fake, counter = _fake_chat(['{"score": 4, "explanation": "Close."}'])
    llm.chat = fake
    le.grade_answer(_card(), "no, it is the stroma")
    assert counter["n"] == 1, "a real attempt must still be graded by the model"
    print("ok  non-answers ('idk', \"don't know\", '?') short-circuit to score 0")


if __name__ == "__main__":
    # Script mode: run every test in declaration order. A failing assert
    # propagates and stops the run before the final summary line.
    for run_test in (
        test_clean_json_first_try,
        test_repair_retry_recovers,
        test_safe_default_when_never_valid,
        test_out_of_range_score_rejected,
        test_third_person_possessive_rewritten_to_second,
        test_second_person_leaves_safe_subject_form_alone,
        test_empty_answer_short_circuits_to_zero,
        test_non_answer_short_circuits_to_zero,
    ):
        run_test()
    print("\nAll NAH-8 grade_answer tests passed.")