"""
NAH-48 — reasoning-model <think> blocks must not break strict-JSON parsing.

MiniCPM4.1/MiniCPM5 are hybrid reasoning models: they emit <think>…</think>
before the answer. extract_json() must look past that preamble (including the
template-prefill case where only the closing </think> appears) and still pull
out the JSON the grading / deck-gen callers depend on.

    python3 test_think_stripping.py
"""
import os

# Must be set before `import llm` below.
# NOTE(review): presumably selects llm's stubbed backend — confirm against llm.
os.environ["RECALL_STUB"] = "1"

import llm  # noqa: E402  (import deliberately follows the env-var setup)


def test_full_think_block_then_object():
    """A full <think>…</think> preamble followed by a JSON object yields that object."""
    reply = ('<think>\nThe student got the gist but missed ATP. I should score '
             'around 2.\n</think>\n{"score": 2, "explanation": "Missed ATP.", '
             '"missed_concept": "ATP production"}')
    data = llm.extract_json(reply)
    assert data["score"] == 2
    assert data["missed_concept"] == "ATP production"
    print("ok <think> preamble stripped, object parsed")


def test_closing_tag_only_template_prefill():
    """Only the closing </think> is present; the JSON after it must still parse."""
    # Template pre-filled the opening <think>, so only </think> is in the reply.
    reply = ('reasoning about the answer...\n</think>\n'
             '{"score": 5, "explanation": "Spot on."}')
    data = llm.extract_json(reply)
    assert data["score"] == 5
    print("ok closing-only </think> (template prefill) handled")


def test_braces_inside_reasoning_do_not_mislead():
    """A JSON-looking fragment inside the reasoning must not be returned."""
    # The reasoning mentions a brace/JSON-ish fragment; must not be matched.
    reply = ('<think>maybe return {"score": 9} ? no, that is wrong, '
             'the cap is 5</think>'
             '{"score": 3, "explanation": "Partial."}')
    data = llm.extract_json(reply)
    assert data["score"] == 3, f"matched the wrong brace: {data}"
    print("ok braces inside <think> do not produce a false match")


def test_think_then_json_array():
    """A top-level JSON array after the reasoning block parses to a list."""
    reply = ('<think>generate three cards</think>\n'
             '[{"question": "Q1?", "answer": "A1", "topic": "T", '
             '"difficulty": 1}]')
    data = llm.extract_json(reply)
    assert isinstance(data, list)
    assert data[0]["question"] == "Q1?"
    print("ok array after <think> parsed")


def test_truncated_unclosed_think_returns_none():
    """An unclosed reasoning block with no JSON at all must yield None."""
    # Reasoning ran out of tokens before ever closing </think> or emitting JSON.
    reply = "<think>Hmm, the student answer is close but I need to weigh whether"
    assert llm.extract_json(reply) is None
    print("ok truncated unclosed <think> -> None (caller falls back/retries)")


def test_no_think_is_unchanged_regression():
    """Replies without any reasoning block keep their previous results."""
    # Plain replies (non-reasoning models / fenced JSON) still work as before.
    assert llm.extract_json('{"a": 1}') == {"a": 1}
    assert llm.extract_json('```json\n{"b": 2}\n```') == {"b": 2}
    assert llm.extract_json('prose then {"c": 3} trailing') == {"c": 3}
    assert llm.extract_json("no json here") is None
    print("ok non-reasoning replies unchanged (regression)")


def test_chat_json_recovers_from_think_wrapped_reply():
    """chat_json returns the parsed object when chat() yields a <think>-wrapped reply."""
    # End-to-end through chat_json with chat() monkeypatched to a reasoning reply.
    original_chat = llm.chat
    llm.chat = lambda messages, max_tokens=512: (
        '<think>grade it</think>{"score": 4, "explanation": "Good."}'
    )
    try:
        data = llm.chat_json([{"role": "user", "content": "grade this"}])
    finally:
        # Put the real chat() back so the fake does not leak into other tests
        # running in the same process.
        llm.chat = original_chat
    assert data["score"] == 4
    print("ok chat_json parses a <think>-wrapped reply on the first try")


if __name__ == "__main__":
    test_full_think_block_then_object()
    test_closing_tag_only_template_prefill()
    test_braces_inside_reasoning_do_not_mislead()
    test_think_then_json_array()
    test_truncated_unclosed_think_returns_none()
    test_no_think_is_unchanged_regression()
    test_chat_json_recovers_from_think_wrapped_reply()
    print("\nAll NAH-48 think-stripping tests passed.")