Spaces:
Running on Zero
Running on Zero
| """ | |
| NAH-48 — reasoning-model <think> blocks must not break strict-JSON parsing. | |
| MiniCPM4.1/MiniCPM5 are hybrid reasoning models: they emit <think>…</think> | |
| before the answer. extract_json() must look past that preamble (including the | |
| template-prefill case where only the closing </think> appears) and still pull | |
| out the JSON the grading / deck-gen callers depend on. | |
| Run with: python3 test_think_stripping.py | |
| """ | |
| import os | |
| os.environ["RECALL_STUB"] = "1" | |
| import llm | |
def test_full_think_block_then_object():
    """A complete <think>…</think> preamble is skipped and the object after it is parsed."""
    # Reasoning preamble on its own lines, followed by the JSON verdict.
    raw_reply = (
        "<think>\n"
        "The student got the gist but missed ATP. I should score around 2.\n"
        "</think>\n"
        '{"score": 2, "explanation": "Missed ATP.", '
        '"missed_concept": "ATP production"}'
    )
    parsed = llm.extract_json(raw_reply)
    assert parsed["score"] == 2
    assert parsed["missed_concept"] == "ATP production"
    print("ok <think>…</think> preamble stripped, object parsed")
def test_closing_tag_only_template_prefill():
    """Only the closing </think> is present; the JSON after it must still be found."""
    # The chat template already emitted the opening <think>, so the reply
    # starts mid-reasoning and contains just the closing tag.
    raw_reply = (
        "reasoning about the answer...\n"
        "</think>\n"
        '{"score": 5, "explanation": "Spot on."}'
    )
    parsed = llm.extract_json(raw_reply)
    assert parsed["score"] == 5
    print("ok closing-only </think> (template prefill) handled")
def test_braces_inside_reasoning_do_not_mislead():
    """A JSON-looking fragment inside the reasoning must not be returned as the answer."""
    # The decoy object lives inside <think>; the real answer follows </think>.
    reasoning = (
        '<think>maybe return {"score": 9} ? no, that is wrong, the cap is 5'
        "</think>"
    )
    answer = '{"score": 3, "explanation": "Partial."}'
    data = llm.extract_json(reasoning + answer)
    assert data["score"] == 3, f"matched the wrong brace: {data}"
    print("ok braces inside <think> do not produce a false match")
def test_think_then_json_array():
    """A top-level JSON array following the reasoning block is parsed as a list."""
    raw_reply = (
        "<think>generate three cards</think>\n"
        '[{"question": "Q1?", "answer": "A1", "topic": "T", "difficulty": 1}]'
    )
    cards = llm.extract_json(raw_reply)
    assert isinstance(cards, list)
    assert cards[0]["question"] == "Q1?"
    print("ok array after <think> parsed")
def test_truncated_unclosed_think_returns_none():
    """An unclosed <think> with no JSON at all yields None."""
    # The model ran out of tokens mid-reasoning: no closing tag, no JSON.
    truncated = "<think>Hmm, the student answer is close but I need to weigh whether"
    outcome = llm.extract_json(truncated)
    assert outcome is None
    print("ok truncated unclosed <think> -> None (caller falls back/retries)")
def test_no_think_is_unchanged_regression():
    """Replies without any <think> tag keep parsing exactly as they did before."""
    # Bare object, fenced JSON, and JSON embedded in prose, checked in order.
    parseable_cases = (
        ('{"a": 1}', {"a": 1}),
        ('```json\n{"b": 2}\n```', {"b": 2}),
        ('prose then {"c": 3} trailing', {"c": 3}),
    )
    for raw_reply, expected in parseable_cases:
        assert llm.extract_json(raw_reply) == expected
    # A reply containing no JSON at all must still come back as None.
    assert llm.extract_json("no json here") is None
    print("ok non-reasoning replies unchanged (regression)")
def test_chat_json_recovers_from_think_wrapped_reply():
    """End-to-end: chat_json parses a <think>-wrapped reply produced by chat().

    ``llm.chat`` is temporarily replaced with a stub that returns a
    reasoning-style reply. The original callable is restored afterwards so
    the stub does not leak into tests that run later in the same process.
    """
    original_chat = llm.chat

    def fake_chat(messages, max_tokens=512):
        # Canned reasoning reply: a think block immediately followed by JSON.
        return '<think>grade it</think>{"score": 4, "explanation": "Good."}'

    llm.chat = fake_chat
    try:
        data = llm.chat_json([{"role": "user", "content": "grade this"}])
    finally:
        # Always undo the monkeypatch, even if chat_json raises.
        llm.chat = original_chat
    assert data["score"] == 4
    print("ok chat_json parses a <think>-wrapped reply on the first try")
if __name__ == "__main__":
    # Run every scenario in declaration order; the first failing assert
    # aborts the script with a traceback.
    for scenario in (
        test_full_think_block_then_object,
        test_closing_tag_only_template_prefill,
        test_braces_inside_reasoning_do_not_mislead,
        test_think_then_json_array,
        test_truncated_unclosed_think_returns_none,
        test_no_think_is_unchanged_regression,
        test_chat_json_recovers_from_think_wrapped_reply,
    ):
        scenario()
    print("\nAll NAH-48 think-stripping tests passed.")