Spaces:

build-small-hackathon
/

study-partner

Running on Zero

File size: 6,185 Bytes

"""
NAH-50 — extract_json must tolerate over-escaped JSON.

MiniCPM4.1-8B sometimes emits JSON with backslash-escaped quotes
({\\"k\\": \\"v\\"} instead of {"k": "v"}), which plain json.loads rejects.
That silently broke the adaptive follow-ups (extract_json -> None -> no cards).
These tests exercise that un-escape fallback, plus the concatenated- and
truncated-output recoveries further down, without any model/GPU.

    python3 test_extract_json.py
"""
import os

# Keep this assignment ahead of `import llm`.
# NOTE(review): presumably llm reads RECALL_STUB when it is imported and, with
# "1", skips loading the real model so these tests need no GPU — confirm
# against llm.py before moving or removing this line.
os.environ["RECALL_STUB"] = "1"

import llm


def test_overescaped_object():
    """A flat object whose quotes are all backslash-escaped must still parse."""
    expected = {"score": 0, "explanation": "Wrong.", "missed_concept": "stroma"}
    payload = (
        '{\\"score\\": 0, \\"explanation\\": \\"Wrong.\\", '
        '\\"missed_concept\\": \\"stroma\\"}'
    )
    parsed = llm.extract_json(payload)
    assert parsed == expected, parsed
    print("ok  over-escaped object parses")


def test_overescaped_array_the_followups_case():
    """Array of over-escaped objects: the reply shape that lost the follow-ups."""
    # Real newlines for layout, but every quote is backslash-escaped — the
    # exact shape the 8B returned.
    first_card = (
        '  {\\"question\\": \\"What are the products of the light reactions?\\", '
        '\\"answer\\": \\"ATP and NADPH\\", \\"topic\\": \\"Photosynthesis\\"}'
    )
    second_card = (
        '  {\\"question\\": \\"Where do they happen?\\", '
        '\\"answer\\": \\"Thylakoid membranes\\", \\"topic\\": \\"Photosynthesis\\"}'
    )
    reply = '[\n' + first_card + ',\n' + second_card + '\n]'

    cards = llm.extract_json(reply)
    assert isinstance(cards, list), cards
    assert len(cards) == 2, cards
    leading = cards[0]
    assert leading["answer"] == "ATP and NADPH"
    print("ok  over-escaped array parses (the follow-ups case)")


def test_fully_escaped_with_newlines():
    """A reply escaped wholesale, like a string literal, must still decode."""
    # Here the 8B escaped everything: \" for the quotes AND a literal
    # backslash-n for the layout. Verbatim model output that still dropped the
    # follow-ups after the quote-only fix. One source line per escaped "line".
    reply = (
        '[\\n'
        '    {\\n'
        '        \\"question\\": \\"Where do the light reactions occur?\\",\\n'
        '        \\"answer\\": \\"Thylakoid membranes\\",\\n'
        '        \\"topic\\": \\"Photosynthesis\\"\\n'
        '    }\\n'
        ']'
    )
    cards = llm.extract_json(reply)
    assert isinstance(cards, list), cards
    assert len(cards) == 1, cards
    only_card = cards[0]
    assert only_card["answer"] == "Thylakoid membranes"
    print("ok  fully-escaped JSON (\\\" + \\n) decoded")


def test_overescaped_with_think_and_fence():
    """Over-escaping combined with a <think> block and a json code fence."""
    reply = '<think>grade</think>\n```json\n{\\"score\\": 3}\n```'
    grade = llm.extract_json(reply)
    assert grade == {"score": 3}
    print("ok  over-escaped + <think> + fence all handled together")


def test_overescaped_embedded_in_prose():
    """An over-escaped object surrounded by plain prose is still located."""
    reply = 'Here is the grade: {\\"score\\": 5} done.'
    grade = llm.extract_json(reply)
    assert grade == {"score": 5}
    print("ok  over-escaped object found inside prose")


def test_concatenated_objects_no_array_wrapper():
    """Space-separated objects with no [ ] wrapper come back as one list."""
    # Verbatim shape from the OCR path: MiniCPM-V on image input emits several
    # objects back to back, followed by a stray trailing quote + brace.
    light_reactions = (
        '{"question":"Where do the light reactions occur?",'
        '"answer":"Thylakoid membranes.","topic":"Photosynthesis","difficulty":2}'
    )
    calvin_cycle = (
        '{"question":"Where does the Calvin cycle take place?",'
        '"answer":"The stroma.","topic":"Photosynthesis","difficulty":1}'
    )
    stray_tail = '"}'
    reply = light_reactions + ' ' + calvin_cycle + stray_tail

    cards = llm.extract_json(reply)
    assert isinstance(cards, list), cards
    assert len(cards) == 2, cards
    first, second = cards
    assert first["answer"] == "Thylakoid membranes."
    assert second["topic"] == "Photosynthesis"
    print("ok  concatenated objects (no array wrapper) -> list")


def test_concatenated_arrays_flattened():
    """Two arrays emitted back to back are merged into a single flat list."""
    chunks = (
        '[{"question":"A","answer":"a","topic":"T"}]',
        '[{"question":"B","answer":"b","topic":"T"}]',
    )
    reply = " ".join(chunks)

    cards = llm.extract_json(reply)
    assert isinstance(cards, list), cards
    assert len(cards) == 2, cards
    questions = [card["question"] for card in cards]
    assert questions == ["A", "B"]
    print("ok  concatenated arrays flattened -> single list")


def test_truncated_single_object_midstring():
    """An object cut off inside a string value is closed and recovered."""
    # MiniCPM-V grade reply that hit the token limit mid-explanation. Closing
    # the open string and brace yields a usable grade (previously: None ->
    # garbage).
    expected = {"score": 4, "explanation": "Close but you missed the key idea"}
    reply = '{"score": 4, "explanation": "Close but you missed the key idea'
    grade = llm.extract_json(reply)
    assert grade == expected, grade
    print("ok  truncated object (mid-string) recovered")


def test_truncated_object_midkey_drops_partial_pair():
    """An object cut off inside a key keeps only its complete pairs."""
    # Simply closing the text here would be invalid JSON, so the unfinished
    # pair is discarded and the pairs that parsed remain — still a usable score.
    reply = '{"score": 2, "explanation": "Partly right", "missed_conc'
    grade = llm.extract_json(reply)
    assert isinstance(grade, dict), grade
    assert grade.get("score") == 2, grade
    assert grade.get("explanation") == "Partly right", grade
    print("ok  truncated object (mid-key) drops the partial pair")


def test_truncated_array_salvages_complete_objects():
    """A deck array cut off inside its last object keeps the finished ones."""
    finished = (
        '{"question":"A","answer":"a","topic":"T","difficulty":1}',
        '{"question":"B","answer":"b","topic":"T","difficulty":2}',
    )
    # The third object stops right after the "answer" key — no value, no brace.
    unfinished = '{"question":"C","answer":'
    reply = '[' + ','.join(finished) + ',' + unfinished

    cards = llm.extract_json(reply)
    assert isinstance(cards, list), cards
    assert len(cards) == 2, cards
    questions = [card["question"] for card in cards]
    assert questions == ["A", "B"]
    print("ok  truncated array salvages the complete objects")


def test_valid_json_unaffected_regression():
    """Well-formed input must keep parsing exactly as before."""
    # Valid JSON — including a string that holds a legitimately escaped quote —
    # parses on the first try and never reaches the un-escape fallback.
    plain = llm.extract_json('{"a": 1}')
    assert plain == {"a": 1}

    quoted = llm.extract_json('{"q": "she said \\"hi\\""}')
    assert quoted == {"q": 'she said "hi"'}

    # Text with no JSON in it at all yields None.
    nothing = llm.extract_json("not json")
    assert nothing is None
    print("ok  valid/escaped-in-string JSON unaffected (regression)")


if __name__ == "__main__":
    # Run every check in file order; the first failing assert aborts the run
    # before the final summary line is printed.
    for check in (
        test_overescaped_object,
        test_overescaped_array_the_followups_case,
        test_fully_escaped_with_newlines,
        test_overescaped_with_think_and_fence,
        test_overescaped_embedded_in_prose,
        test_concatenated_objects_no_array_wrapper,
        test_concatenated_arrays_flattened,
        test_truncated_single_object_midstring,
        test_truncated_object_midkey_drops_partial_pair,
        test_truncated_array_salvages_complete_objects,
        test_valid_json_unaffected_regression,
    ):
        check()
    print("\nAll NAH-50 extract_json robustness tests passed.")