Spaces:
Runtime error
Runtime error
| """ | |
| NAH-50 — extract_json must tolerate over-escaped JSON. | |
| MiniCPM4.1-8B sometimes emits JSON with backslash-escaped quotes | |
| ({\\"k\\": \\"v\\"} instead of {"k": "v"}), which plain json.loads rejects. | |
| That silently broke the adaptive follow-ups (extract_json -> None -> no cards). | |
| These check the un-escape fallback without any model/GPU. | |
| python3 test_extract_json.py | |
| """ | |
| import os | |
| os.environ["RECALL_STUB"] = "1" | |
| import llm | |
def test_overescaped_object():
    """A JSON object whose quotes are all backslash-escaped must still parse."""
    # Every double quote in the payload is preceded by a literal backslash,
    # which is the over-escaped shape this test file targets.
    payload = '{\\"score\\": 0, \\"explanation\\": \\"Wrong.\\", \\"missed_concept\\": \\"stroma\\"}'
    expected = {
        "score": 0,
        "explanation": "Wrong.",
        "missed_concept": "stroma",
    }
    parsed = llm.extract_json(payload)
    # The parsed value doubles as the failure message so a mismatch is visible.
    assert parsed == expected, parsed
    print("ok over-escaped object parses")
def test_overescaped_array_the_followups_case():
    """An array of over-escaped objects, separated by real newlines, must parse."""
    # The exact shape the 8B returned that dropped the follow-ups: real
    # newlines between rows, but backslash-escaped quotes inside each row.
    rows = (
        ' {\\"question\\": \\"What are the products of the light reactions?\\", '
        '\\"answer\\": \\"ATP and NADPH\\", \\"topic\\": \\"Photosynthesis\\"},',
        ' {\\"question\\": \\"Where do they happen?\\", '
        '\\"answer\\": \\"Thylakoid membranes\\", \\"topic\\": \\"Photosynthesis\\"}',
    )
    payload = "\n".join(("[", *rows, "]"))
    parsed = llm.extract_json(payload)
    # Type first, then length; either failure reports the parsed value.
    assert isinstance(parsed, list), parsed
    assert len(parsed) == 2, parsed
    first_card = parsed[0]
    assert first_card["answer"] == "ATP and NADPH"
    print("ok over-escaped array parses (the follow-ups case)")
def test_fully_escaped_with_newlines():
    """A reply escaped like a string literal (\\" and literal \\n) must parse."""
    # The 8B sometimes escapes the WHOLE reply like a string literal: not just
    # the quotes but also the layout, emitting a literal backslash followed by
    # "n" instead of a real newline. Verbatim shape from the model that dropped
    # the follow-ups even after the quote-only fix.
    escaped_newline = "\\n"  # two characters: backslash + "n", not a line break
    fragments = (
        "[",
        " {",
        ' \\"question\\": \\"Where do the light reactions occur?\\",',
        ' \\"answer\\": \\"Thylakoid membranes\\",',
        ' \\"topic\\": \\"Photosynthesis\\"',
        " }",
        "]",
    )
    payload = escaped_newline.join(fragments)
    parsed = llm.extract_json(payload)
    # Type first, then length; either failure reports the parsed value.
    assert isinstance(parsed, list), parsed
    assert len(parsed) == 1, parsed
    only_card = parsed[0]
    assert only_card["answer"] == "Thylakoid membranes"
    print("ok fully-escaped JSON (\\\" + \\n) decoded")
def test_overescaped_with_think_and_fence():
    """Over-escaped JSON inside a code fence, after a <think> block, must parse."""
    # Three wrappers at once: a <think>...</think> preamble, a ```json fence,
    # and backslash-escaped quotes inside the object itself.
    payload = '<think>grade</think>\n```json\n{\\"score\\": 3}\n```'
    parsed = llm.extract_json(payload)
    assert parsed == {"score": 3}
    print("ok over-escaped + <think> + fence all handled together")
def test_overescaped_embedded_in_prose():
    """An over-escaped object surrounded by ordinary prose must be found and parsed."""
    # Plain text on both sides of the object; only the object should come back.
    payload = 'Here is the grade: {\\"score\\": 5} done.'
    parsed = llm.extract_json(payload)
    assert parsed == {"score": 5}
    print("ok over-escaped object found inside prose")
def test_valid_json_unaffected_regression():
    """Already-valid JSON and non-JSON input must keep their existing results."""
    # Well-formed JSON, including a legitimately escaped quote inside a string
    # value, must decode to the same values as before. Per the original
    # author's note, such input is expected to parse on the first try without
    # reaching the un-escape fallback (not verifiable from this file alone).
    plain = llm.extract_json('{"a": 1}')
    assert plain == {"a": 1}

    quoted = llm.extract_json('{"q": "she said \\"hi\\""}')
    assert quoted == {"q": 'she said "hi"'}

    # Input with no JSON in it still yields None.
    garbage = llm.extract_json("not json")
    assert garbage is None

    print("ok valid/escaped-in-string JSON unaffected (regression)")
if __name__ == "__main__":
    # Script entry point: run every check in declaration order. A failing
    # assert raises and stops the run before the summary line is printed.
    for check in (
        test_overescaped_object,
        test_overescaped_array_the_followups_case,
        test_fully_escaped_with_newlines,
        test_overescaped_with_think_and_fence,
        test_overescaped_embedded_in_prose,
        test_valid_json_unaffected_regression,
    ):
        check()
    print("\nAll NAH-50 extract_json robustness tests passed.")