Spaces:

build-small-hackathon
/

study-partner

Running on Zero

App Files Files Community

study-partner / test_think_stripping.py

nz-nz

Deploy Recall study-partner app (stub-mode demo)

7563305 verified 19 days ago

Raw

History Blame Contribute Delete

3.63 kB

	"""
	NAH-48 — reasoning-model <think> blocks must not break strict-JSON parsing.

	MiniCPM4.1/MiniCPM5 are hybrid reasoning models: they emit <think>…</think>
	before the answer. extract_json() must look past that preamble (including the
	template-prefill case where only the closing </think> appears) and still pull
	out the JSON the grading / deck-gen callers depend on.

	python3 test_think_stripping.py
	"""
	import os

	# The env var is set BEFORE `import llm` on purpose; keep this ordering.
	# NOTE(review): presumably llm reads RECALL_STUB at import time to select its
	# stub mode — confirm against llm.py before moving or removing this line.
	os.environ["RECALL_STUB"] = "1"

	import llm  # noqa: E402  (import intentionally follows the env-var assignment)


	def test_full_think_block_then_object():
	    """A complete <think>...</think> preamble precedes the JSON object.

	    extract_json() is expected to return the object that follows the
	    reasoning block, with its fields intact.
	    """
	    reasoning = ('<think>\nThe student got the gist but missed ATP. '
	                 'I should score around 2.\n</think>\n')
	    payload = ('{"score": 2, "explanation": "Missed ATP.", '
	               '"missed_concept": "ATP production"}')
	    parsed = llm.extract_json(reasoning + payload)
	    score_matches = parsed["score"] == 2
	    assert score_matches and parsed["missed_concept"] == "ATP production"
	    print("ok <think>…</think> preamble stripped, object parsed")


	def test_closing_tag_only_template_prefill():
	    """Only the closing </think> appears in the reply (template prefill case)."""
	    # The chat template already supplied the opening <think>, so the reply
	    # holds just the tail of the reasoning, the closing tag, then the JSON.
	    reasoning_tail = 'reasoning about the answer...\n</think>\n'
	    answer = '{"score": 5, "explanation": "Spot on."}'
	    parsed = llm.extract_json(reasoning_tail + answer)
	    assert parsed["score"] == 5
	    print("ok closing-only </think> (template prefill) handled")


	def test_braces_inside_reasoning_do_not_mislead():
	    """A JSON-looking fragment inside the reasoning must not be returned."""
	    # The decoy object (score 9) lives inside <think>; the real answer
	    # (score 3) comes right after the closing tag.
	    decoy_reasoning = ('<think>maybe return {"score": 9} ? no, that is wrong, '
	                       'the cap is 5</think>')
	    real_answer = '{"score": 3, "explanation": "Partial."}'
	    data = llm.extract_json(decoy_reasoning + real_answer)
	    assert data["score"] == 3, f"matched the wrong brace: {data}"
	    print("ok braces inside <think> do not produce a false match")


	def test_think_then_json_array():
	    """A top-level JSON array after a <think> block comes back as a list."""
	    reply = "\n".join([
	        '<think>generate three cards</think>',
	        '[{"question": "Q1?", "answer": "A1", "topic": "T", "difficulty": 1}]',
	    ])
	    cards = llm.extract_json(reply)
	    assert isinstance(cards, list) and cards[0]["question"] == "Q1?"
	    print("ok array after <think> parsed")


	def test_truncated_unclosed_think_returns_none():
	    """An unclosed <think> with no JSON after it must yield None."""
	    # Simulates the model running out of tokens mid-reasoning: there is no
	    # closing tag and no JSON anywhere in the reply.
	    truncated = "<think>Hmm, the student answer is close but I need to weigh whether"
	    result = llm.extract_json(truncated)
	    assert result is None
	    print("ok truncated unclosed <think> -> None (caller falls back/retries)")


	def test_no_think_is_unchanged_regression():
	    """Replies without any <think> block still parse the same way.

	    Covers a bare object, fenced JSON, JSON embedded in prose, and a reply
	    with no JSON at all.
	    """
	    parseable = (
	        ('{"a": 1}', {"a": 1}),
	        ('```json\n{"b": 2}\n```', {"b": 2}),
	        ('prose then {"c": 3} trailing', {"c": 3}),
	    )
	    for reply, expected in parseable:
	        assert llm.extract_json(reply) == expected
	    # Kept apart from the table so the identity check (`is None`) is preserved.
	    assert llm.extract_json("no json here") is None
	    print("ok non-reasoning replies unchanged (regression)")


	def test_chat_json_recovers_from_think_wrapped_reply():
	    """chat_json() must parse a <think>-wrapped reply end to end.

	    llm.chat is temporarily replaced with a stub that returns a
	    reasoning-style reply. The original callable is restored in a
	    ``finally`` so the stub cannot leak into other tests that share the
	    same ``llm`` module object, whether or not chat_json() raises.
	    """
	    def fake_chat(messages, max_tokens=512):
	        # Same signature as the stub this test has always installed.
	        return '<think>grade it</think>{"score": 4, "explanation": "Good."}'

	    original_chat = llm.chat
	    llm.chat = fake_chat
	    try:
	        data = llm.chat_json([{"role": "user", "content": "grade this"}])
	    finally:
	        # Always undo the monkeypatch, even when chat_json() fails.
	        llm.chat = original_chat

	    assert data["score"] == 4
	    print("ok chat_json parses a <think>-wrapped reply on the first try")


	if __name__ == "__main__":
	    # Script entry point: run each check in file order. A failing assert
	    # raises and stops the run before the summary line is printed.
	    for run_test in (
	        test_full_think_block_then_object,
	        test_closing_tag_only_template_prefill,
	        test_braces_inside_reasoning_do_not_mislead,
	        test_think_then_json_array,
	        test_truncated_unclosed_think_returns_none,
	        test_no_think_is_unchanged_regression,
	        test_chat_json_recovers_from_think_wrapped_reply,
	    ):
	        run_test()
	    print("\nAll NAH-48 think-stripping tests passed.")