File size: 6,185 Bytes
7563305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d0b820
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae15cb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7563305
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d0b820
 
ae15cb7
 
 
7563305
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
NAH-50 — extract_json must tolerate over-escaped JSON.

MiniCPM4.1-8B sometimes emits JSON with backslash-escaped quotes
({\\"k\\": \\"v\\"} instead of {"k": "v"}), which plain json.loads rejects.
That silently broke the adaptive follow-ups (extract_json -> None -> no cards).
These check the un-escape fallback, plus the concatenated- and truncated-JSON
recovery paths, without any model/GPU.

    python3 test_extract_json.py
"""
import os

os.environ["RECALL_STUB"] = "1"

import llm


def test_overescaped_object():
    """Check that a flat object with backslash-escaped quotes still parses."""
    # Raw literals: every quote reaches extract_json as the two characters \" .
    payload = (
        r'{\"score\": 0, \"explanation\": \"Wrong.\", '
        r'\"missed_concept\": \"stroma\"}'
    )
    expected = {
        "score": 0,
        "explanation": "Wrong.",
        "missed_concept": "stroma",
    }
    parsed = llm.extract_json(payload)
    assert parsed == expected, parsed
    print("ok  over-escaped object parses")


def test_overescaped_array_the_followups_case():
    """Check that an array of over-escaped objects parses into a two-item list."""
    # This is the exact shape the 8B returned that dropped the follow-ups:
    # real newlines for the layout, but backslash-escaped quotes throughout.
    first_card = (
        r'  {\"question\": \"What are the products of the light reactions?\", '
        r'\"answer\": \"ATP and NADPH\", \"topic\": \"Photosynthesis\"},'
    )
    second_card = (
        r'  {\"question\": \"Where do they happen?\", '
        r'\"answer\": \"Thylakoid membranes\", \"topic\": \"Photosynthesis\"}'
    )
    payload = "\n".join(["[", first_card, second_card, "]"])

    parsed = llm.extract_json(payload)
    assert isinstance(parsed, list), parsed
    assert len(parsed) == 2, parsed
    leading = parsed[0]
    assert leading["answer"] == "ATP and NADPH"
    print("ok  over-escaped array parses (the follow-ups case)")


def test_fully_escaped_with_newlines():
    """Check that a reply escaped wholesale like a string literal is decoded."""
    # The 8B sometimes escapes the WHOLE reply: not just \" but also the
    # two-character sequence backslash-n for the layout. This is the verbatim
    # shape that dropped the follow-ups even after the quote-only fix.
    segments = [
        '[',
        '    {',
        r'        \"question\": \"Where do the light reactions occur?\",',
        r'        \"answer\": \"Thylakoid membranes\",',
        r'        \"topic\": \"Photosynthesis\"',
        '    }',
        ']',
    ]
    # Joined with a literal backslash-n, never a real newline.
    payload = r'\n'.join(segments)

    parsed = llm.extract_json(payload)
    assert isinstance(parsed, list), parsed
    assert len(parsed) == 1, parsed
    only_card = parsed[0]
    assert only_card["answer"] == "Thylakoid membranes"
    print("ok  fully-escaped JSON (\\\" + \\n) decoded")


def test_overescaped_with_think_and_fence():
    """Check an over-escaped object behind a <think> block and a code fence."""
    payload = "\n".join([
        "<think>grade</think>",
        "```json",
        r'{\"score\": 3}',
        "```",
    ])
    parsed = llm.extract_json(payload)
    assert parsed == {"score": 3}
    print("ok  over-escaped + <think> + fence all handled together")


def test_overescaped_embedded_in_prose():
    """Check that an over-escaped object surrounded by prose is still found."""
    prefix = 'Here is the grade: '
    escaped_object = r'{\"score\": 5}'
    suffix = ' done.'
    parsed = llm.extract_json(prefix + escaped_object + suffix)
    assert parsed == {"score": 5}
    print("ok  over-escaped object found inside prose")


def test_concatenated_objects_no_array_wrapper():
    """Check that space-separated objects with no [ ] wrapper become a list."""
    # Verbatim shape from the OCR path: MiniCPM-V on image input returns
    # several objects separated by a space, plus a stray trailing quote/brace.
    light_reactions = (
        '{"question":"Where do the light reactions occur?",'
        '"answer":"Thylakoid membranes.",'
        '"topic":"Photosynthesis","difficulty":2}'
    )
    calvin_cycle = (
        '{"question":"Where does the Calvin cycle take place?",'
        '"answer":"The stroma.",'
        '"topic":"Photosynthesis","difficulty":1}'
    )
    stray_tail = '"}'
    payload = light_reactions + ' ' + calvin_cycle + stray_tail

    parsed = llm.extract_json(payload)
    assert isinstance(parsed, list), parsed
    assert len(parsed) == 2, parsed
    assert parsed[0]["answer"] == "Thylakoid membranes."
    assert parsed[1]["topic"] == "Photosynthesis"
    print("ok  concatenated objects (no array wrapper) -> list")


def test_concatenated_arrays_flattened():
    """Check that two back-to-back arrays are merged into one flat list."""
    array_a = '[{"question":"A","answer":"a","topic":"T"}]'
    array_b = '[{"question":"B","answer":"b","topic":"T"}]'
    parsed = llm.extract_json(" ".join((array_a, array_b)))

    assert isinstance(parsed, list), parsed
    assert len(parsed) == 2, parsed
    questions = [card["question"] for card in parsed]
    assert questions == ["A", "B"]
    print("ok  concatenated arrays flattened -> single list")


def test_truncated_single_object_midstring():
    """Check that an object cut off inside a string value is recovered."""
    # A MiniCPM-V grade reply cut off by the token limit mid-explanation.
    # Closing the dangling string and brace yields a usable grade; previously
    # this came back as None.
    explanation = "Close but you missed the key idea"
    payload = '{"score": 4, "explanation": "' + explanation
    expected = {"score": 4, "explanation": explanation}

    parsed = llm.extract_json(payload)
    assert parsed == expected, parsed
    print("ok  truncated object (mid-string) recovered")


def test_truncated_object_midkey_drops_partial_pair():
    """Check that an object cut off inside a key keeps its complete pairs."""
    # Closing the text as-is would be invalid JSON, so the incomplete pair is
    # expected to be dropped while the already-parsed pairs survive.
    complete_part = '{"score": 2, "explanation": "Partly right"'
    dangling_key = ', "missed_conc'
    parsed = llm.extract_json(complete_part + dangling_key)

    assert isinstance(parsed, dict), parsed
    assert parsed.get("score") == 2, parsed
    assert parsed.get("explanation") == "Partly right", parsed
    print("ok  truncated object (mid-key) drops the partial pair")


def test_truncated_array_salvages_complete_objects():
    """Check that a deck array cut off in its last object keeps the full ones."""
    complete_cards = [
        '{"question":"A","answer":"a","topic":"T","difficulty":1}',
        '{"question":"B","answer":"b","topic":"T","difficulty":2}',
    ]
    # The third card stops right after its "answer" key, before any value.
    cut_off_card = '{"question":"C","answer":'
    payload = '[' + ','.join(complete_cards + [cut_off_card])

    parsed = llm.extract_json(payload)
    assert isinstance(parsed, list), parsed
    assert len(parsed) == 2, parsed
    questions = [card["question"] for card in parsed]
    assert questions == ["A", "B"]
    print("ok  truncated array salvages the complete objects")


def test_valid_json_unaffected_regression():
    """Check that valid JSON parses unchanged and non-JSON yields None."""
    # Already-valid inputs, including a legitimately escaped quote inside a
    # string value, must come back exactly as plain JSON decoding would give.
    valid_cases = (
        ('{"a": 1}', {"a": 1}),
        (r'{"q": "she said \"hi\""}', {"q": 'she said "hi"'}),
    )
    for payload, expected in valid_cases:
        assert llm.extract_json(payload) == expected

    # Text with no JSON in it at all is reported as None.
    assert llm.extract_json("not json") is None
    print("ok  valid/escaped-in-string JSON unaffected (regression)")


if __name__ == "__main__":
    # Script entry point: run every check in its original order. The first
    # failing assertion propagates and stops the run.
    ordered_checks = (
        test_overescaped_object,
        test_overescaped_array_the_followups_case,
        test_fully_escaped_with_newlines,
        test_overescaped_with_think_and_fence,
        test_overescaped_embedded_in_prose,
        test_concatenated_objects_no_array_wrapper,
        test_concatenated_arrays_flattened,
        test_truncated_single_object_midstring,
        test_truncated_object_midkey_drops_partial_pair,
        test_truncated_array_salvages_complete_objects,
        test_valid_json_unaffected_regression,
    )
    for check in ordered_checks:
        check()
    print("\nAll NAH-50 extract_json robustness tests passed.")