"""
NAH-50 — extract_json must tolerate over-escaped JSON.

MiniCPM4.1-8B sometimes emits JSON with backslash-escaped quotes
({\\"k\\": \\"v\\"} instead of {"k": "v"}), which plain json.loads rejects. That
silently broke the adaptive follow-ups (extract_json -> None -> no cards). These
check the un-escape fallback without any model/GPU.

    python3 test_extract_json.py
"""
import os

# Set before `llm` is imported; presumably `llm` reads RECALL_STUB at import
# time to avoid loading a real model — confirm against llm.py.
os.environ["RECALL_STUB"] = "1"

import llm  # noqa: E402  (must follow the environment setup above)


def test_overescaped_object() -> None:
    """A single object whose quotes are all backslash-escaped still parses."""
    raw = '{\\"score\\": 0, \\"explanation\\": \\"Wrong.\\", \\"missed_concept\\": \\"stroma\\"}'
    data = llm.extract_json(raw)
    assert data == {"score": 0, "explanation": "Wrong.", "missed_concept": "stroma"}, data
    print("ok   over-escaped object parses")


def test_overescaped_array_the_followups_case() -> None:
    """An over-escaped array of cards parses into a two-element list."""
    # The exact shape the 8B returned that dropped the follow-ups.
    raw = ('[\n'
           ' {\\"question\\": \\"What are the products of the light reactions?\\", '
           '\\"answer\\": \\"ATP and NADPH\\", \\"topic\\": \\"Photosynthesis\\"},\n'
           ' {\\"question\\": \\"Where do they happen?\\", '
           '\\"answer\\": \\"Thylakoid membranes\\", \\"topic\\": \\"Photosynthesis\\"}\n'
           ']')
    data = llm.extract_json(raw)
    assert isinstance(data, list) and len(data) == 2, data
    assert data[0]["answer"] == "ATP and NADPH"
    print("ok   over-escaped array parses (the follow-ups case)")


def test_fully_escaped_with_newlines() -> None:
    """A reply escaped like a string literal (quotes and newlines) is decoded."""
    # The 8B sometimes escapes the WHOLE reply like a string literal: not just
    # \" but also \n (literal backslash-n) for the layout. Verbatim shape from
    # the model that dropped the follow-ups even after the quote-only fix.
    raw = ('[\\n {\\n \\"question\\": \\"Where do the light reactions occur?\\",\\n'
           ' \\"answer\\": \\"Thylakoid membranes\\",\\n \\"topic\\": \\"Photosynthesis\\"\\n'
           ' }\\n]')
    data = llm.extract_json(raw)
    assert isinstance(data, list) and len(data) == 1, data
    assert data[0]["answer"] == "Thylakoid membranes"
    print("ok   fully-escaped JSON (\\\" + \\n) decoded")


def test_overescaped_with_think_and_fence() -> None:
    """Over-escaped JSON inside a code fence, preceded by a think block, parses."""
    # NOTE(review): this fixture previously contained no think block at all and
    # the message below read "+ + fence", which suggests the literal tags were
    # stripped from the file by some text tooling — confirm against history.
    # The tags are assembled by concatenation so they cannot be scrubbed again.
    # Presumably extract_json discards the reasoning block before looking for
    # JSON, as the test name implies.
    think_open = "<" + "think>"
    think_close = "</" + "think>"
    raw = think_open + 'grade' + think_close + '\n```json\n{\\"score\\": 3}\n```'
    assert llm.extract_json(raw) == {"score": 3}
    print(f"ok   over-escaped + {think_open} + fence all handled together")


def test_overescaped_embedded_in_prose() -> None:
    """An over-escaped object surrounded by plain prose is still found."""
    raw = 'Here is the grade: {\\"score\\": 5} done.'
    assert llm.extract_json(raw) == {"score": 5}
    print("ok   over-escaped object found inside prose")


def test_concatenated_objects_no_array_wrapper() -> None:
    """Space-separated objects without [ ] come back as one list."""
    # MiniCPM-V on image input returns several objects space-separated with no
    # [ ] wrapper (+ a stray trailing quote) — verbatim shape from the OCR path.
    raw = ('{"question":"Where do the light reactions occur?","answer":"Thylakoid '
           'membranes.","topic":"Photosynthesis","difficulty":2} '
           '{"question":"Where does the Calvin cycle take place?","answer":"The '
           'stroma.","topic":"Photosynthesis","difficulty":1}"}')
    data = llm.extract_json(raw)
    assert isinstance(data, list) and len(data) == 2, data
    assert data[0]["answer"] == "Thylakoid membranes."
    assert data[1]["topic"] == "Photosynthesis"
    print("ok   concatenated objects (no array wrapper) -> list")


def test_concatenated_arrays_flattened() -> None:
    """Two back-to-back arrays are merged into a single flat list, in order."""
    raw = '[{"question":"A","answer":"a","topic":"T"}] [{"question":"B","answer":"b","topic":"T"}]'
    data = llm.extract_json(raw)
    assert isinstance(data, list) and len(data) == 2, data
    assert [d["question"] for d in data] == ["A", "B"]
    print("ok   concatenated arrays flattened -> single list")


def test_truncated_single_object_midstring() -> None:
    """An object cut off inside a string value is closed and recovered."""
    # MiniCPM-V grade reply cut off by the token limit mid-explanation. Closing
    # the dangling string + brace recovers a usable grade (was: None -> garbage).
    raw = '{"score": 4, "explanation": "Close but you missed the key idea'
    data = llm.extract_json(raw)
    assert data == {"score": 4, "explanation": "Close but you missed the key idea"}, data
    print("ok   truncated object (mid-string) recovered")


def test_truncated_object_midkey_drops_partial_pair() -> None:
    """An object cut off inside a key keeps only the complete pairs."""
    # Cut off mid-key: closing as-is is invalid, so we drop the incomplete pair
    # and keep what parsed — still a usable score.
    raw = '{"score": 2, "explanation": "Partly right", "missed_conc'
    data = llm.extract_json(raw)
    assert isinstance(data, dict) and data.get("score") == 2, data
    assert data.get("explanation") == "Partly right", data
    print("ok   truncated object (mid-key) drops the partial pair")


def test_truncated_array_salvages_complete_objects() -> None:
    """An array cut off inside its last object keeps the complete ones."""
    # A deck array cut off inside the last object: the complete ones survive.
    raw = ('[{"question":"A","answer":"a","topic":"T","difficulty":1},'
           '{"question":"B","answer":"b","topic":"T","difficulty":2},'
           '{"question":"C","answer":')
    data = llm.extract_json(raw)
    assert isinstance(data, list) and len(data) == 2, data
    assert [d["question"] for d in data] == ["A", "B"]
    print("ok   truncated array salvages the complete objects")


def test_valid_json_unaffected_regression() -> None:
    """Valid JSON parses unchanged and non-JSON text yields None."""
    # Already-valid JSON (incl. a legitimately escaped quote inside a string)
    # parses on the first try and never hits the un-escape fallback.
    assert llm.extract_json('{"a": 1}') == {"a": 1}
    assert llm.extract_json('{"q": "she said \\"hi\\""}') == {"q": 'she said "hi"'}
    assert llm.extract_json("not json") is None
    print("ok   valid/escaped-in-string JSON unaffected (regression)")


if __name__ == "__main__":
    # Run every check in file order; the first failing assert aborts the run.
    test_overescaped_object()
    test_overescaped_array_the_followups_case()
    test_fully_escaped_with_newlines()
    test_overescaped_with_think_and_fence()
    test_overescaped_embedded_in_prose()
    test_concatenated_objects_no_array_wrapper()
    test_concatenated_arrays_flattened()
    test_truncated_single_object_midstring()
    test_truncated_object_midkey_drops_partial_pair()
    test_truncated_array_salvages_complete_objects()
    test_valid_json_unaffected_regression()
    print("\nAll NAH-50 extract_json robustness tests passed.")