study-partner / test_think_stripping.py
nz-nz's picture
Deploy Recall study-partner app (stub-mode demo)
7563305 verified
Raw
History Blame Contribute Delete
3.63 kB
"""
NAH-48 — reasoning-model <think> blocks must not break strict-JSON parsing.

MiniCPM4.1/MiniCPM5 are hybrid reasoning models: they emit <think>…</think>
before the answer. extract_json() must look past that preamble (including the
template-prefill case where only the closing </think> appears) and still pull
out the JSON that the grading / deck-gen callers depend on.

Run with:
    python3 test_think_stripping.py
"""
import os
os.environ["RECALL_STUB"] = "1"
import llm
def test_full_think_block_then_object():
    """A complete <think>…</think> preamble is followed by a JSON object."""
    reasoning = (
        '<think>\nThe student got the gist but missed ATP. I should score '
        'around 2.\n</think>\n'
    )
    payload = (
        '{"score": 2, "explanation": "Missed ATP.", '
        '"missed_concept": "ATP production"}'
    )
    parsed = llm.extract_json(reasoning + payload)
    # Checked separately so a failure points at the exact field that is off.
    assert parsed["score"] == 2
    assert parsed["missed_concept"] == "ATP production"
    print("ok <think>…</think> preamble stripped, object parsed")
def test_closing_tag_only_template_prefill():
    """Only the closing </think> tag is present in the reply."""
    # The chat template pre-filled the opening <think>, so the reply starts
    # mid-reasoning and contains just the closing tag before the JSON.
    leftover_reasoning = 'reasoning about the answer...\n</think>\n'
    answer = '{"score": 5, "explanation": "Spot on."}'
    parsed = llm.extract_json(leftover_reasoning + answer)
    assert parsed["score"] == 5
    print("ok closing-only </think> (template prefill) handled")
def test_braces_inside_reasoning_do_not_mislead():
    """A JSON-looking fragment inside the reasoning must not be picked up."""
    # The decoy object (score 9) sits inside the <think> block; the real
    # answer (score 3) follows the closing tag with no separator.
    decoy_reasoning = (
        '<think>maybe return {"score": 9} ? no, that is wrong, the cap is 5'
        '</think>'
    )
    real_answer = '{"score": 3, "explanation": "Partial."}'
    parsed = llm.extract_json(decoy_reasoning + real_answer)
    assert parsed["score"] == 3, f"matched the wrong brace: {parsed}"
    print("ok braces inside <think> do not produce a false match")
def test_think_then_json_array():
    """A top-level JSON array after the <think> block is returned as a list."""
    preamble = '<think>generate three cards</think>\n'
    cards_json = (
        '[{"question": "Q1?", "answer": "A1", "topic": "T", "difficulty": 1}]'
    )
    parsed = llm.extract_json(preamble + cards_json)
    # Type first, then content: indexing is only attempted on a real list.
    assert isinstance(parsed, list)
    assert parsed[0]["question"] == "Q1?"
    print("ok array after <think> parsed")
def test_truncated_unclosed_think_returns_none():
    """An unclosed <think> with no JSON after it yields None."""
    # The reply was cut off mid-reasoning: no closing tag and no JSON emitted.
    cut_off_reply = (
        "<think>Hmm, the student answer is close but I need to weigh whether"
    )
    result = llm.extract_json(cut_off_reply)
    assert result is None
    print("ok truncated unclosed <think> -> None (caller falls back/retries)")
def test_no_think_is_unchanged_regression():
    """Replies without any <think> block must parse exactly as before."""
    # (raw reply, expected parse): bare object, fenced JSON, JSON inside prose.
    parseable_cases = (
        ('{"a": 1}', {"a": 1}),
        ('```json\n{"b": 2}\n```', {"b": 2}),
        ('prose then {"c": 3} trailing', {"c": 3}),
    )
    for raw_reply, expected in parseable_cases:
        assert llm.extract_json(raw_reply) == expected
    # A reply with no JSON at all is reported as None.
    assert llm.extract_json("no json here") is None
    print("ok non-reasoning replies unchanged (regression)")
def test_chat_json_recovers_from_think_wrapped_reply():
    """End-to-end: chat_json() must parse a <think>-wrapped reply from chat().

    llm.chat is temporarily replaced with a stub returning a reasoning-style
    reply. The original callable is restored afterwards, even if chat_json()
    raises, so the stub cannot leak into code that runs later in the same
    process.
    """
    def fake_chat(messages, max_tokens=512):
        # Same signature as the stub it replaces; always the same reply.
        return '<think>grade it</think>{"score": 4, "explanation": "Good."}'

    original_chat = llm.chat
    llm.chat = fake_chat
    try:
        data = llm.chat_json([{"role": "user", "content": "grade this"}])
    finally:
        # Undo the monkeypatch regardless of the outcome.
        llm.chat = original_chat
    assert data["score"] == 4
    print("ok chat_json parses a <think>-wrapped reply on the first try")
if __name__ == "__main__":
    # Run every case in declaration order; the first failing assertion
    # aborts the run before the success banner is printed.
    for run_case in (
        test_full_think_block_then_object,
        test_closing_tag_only_template_prefill,
        test_braces_inside_reasoning_do_not_mislead,
        test_think_then_json_array,
        test_truncated_unclosed_think_returns_none,
        test_no_think_is_unchanged_regression,
        test_chat_json_recovers_from_think_wrapped_reply,
    ):
        run_case()
    print("\nAll NAH-48 think-stripping tests passed.")