"""
NAH-48 — reasoning-model <think> blocks must not break strict-JSON parsing.

MiniCPM4.1/MiniCPM5 are hybrid reasoning models: they emit <think>…</think>
before the answer. extract_json() must look past that preamble (including the
template-prefill case where only the closing </think> appears) and still pull
out the JSON the grading / deck-gen callers depend on.

    python3 test_think_stripping.py
"""
import os

# Set before `import llm` below so the flag is already in the environment when
# that module loads. NOTE(review): presumably RECALL_STUB=1 switches llm to a
# stubbed backend so no real model is needed — confirm against llm.py.
os.environ["RECALL_STUB"] = "1"

import llm


def test_full_think_block_then_object():
    """A complete <think>…</think> preamble precedes a JSON object.

    extract_json() is expected to return the object that follows the
    reasoning block, with its fields intact.
    """
    reasoning = (
        "<think>\n"
        "The student got the gist but missed ATP. I should score around 2.\n"
        "</think>\n"
    )
    answer = (
        '{"score": 2, "explanation": "Missed ATP.", '
        '"missed_concept": "ATP production"}'
    )

    parsed = llm.extract_json(reasoning + answer)

    assert parsed["score"] == 2
    assert parsed["missed_concept"] == "ATP production"
    print("ok  <think>…</think> preamble stripped, object parsed")


def test_closing_tag_only_template_prefill():
    """Only the closing </think> tag appears in the reply.

    Models the case where the chat template pre-filled the opening <think>,
    so the reply holds just the reasoning text, the closing tag, and the JSON.
    """
    reply_lines = [
        "reasoning about the answer...",
        "</think>",
        '{"score": 5, "explanation": "Spot on."}',
    ]

    parsed = llm.extract_json("\n".join(reply_lines))

    assert parsed["score"] == 5
    print("ok  closing-only </think> (template prefill) handled")


def test_braces_inside_reasoning_do_not_mislead():
    """A JSON-looking fragment inside the reasoning must not be returned.

    The <think> block contains a decoy object ({"score": 9}); the object
    after </think> ({"score": 3}) is the one that must be parsed.
    """
    decoy_reasoning = (
        '<think>maybe return {"score": 9} ? no, that is wrong, '
        "the cap is 5</think>"
    )
    real_answer = '{"score": 3, "explanation": "Partial."}'

    parsed = llm.extract_json(decoy_reasoning + real_answer)

    assert parsed["score"] == 3, f"matched the wrong brace: {parsed}"
    print("ok  braces inside <think> do not produce a false match")


def test_think_then_json_array():
    """A top-level JSON array after the <think> block is parsed as a list."""
    think_block = "<think>generate three cards</think>\n"
    cards_json = (
        '[{"question": "Q1?", "answer": "A1", '
        '"topic": "T", "difficulty": 1}]'
    )

    parsed = llm.extract_json(think_block + cards_json)

    assert isinstance(parsed, list)
    assert parsed[0]["question"] == "Q1?"
    print("ok  array after <think> parsed")


def test_truncated_unclosed_think_returns_none():
    """An unclosed <think> with no JSON at all yields None.

    Models a reply whose reasoning ran out of tokens before closing the tag
    or emitting any answer.
    """
    truncated = (
        "<think>Hmm, the student answer is close "
        "but I need to weigh whether"
    )

    result = llm.extract_json(truncated)

    assert result is None
    print("ok  truncated unclosed <think> -> None (caller falls back/retries)")


def test_no_think_is_unchanged_regression():
    """Replies without any <think> markup keep parsing as before.

    Covers a bare object, a fenced ```json block, an object embedded in
    prose, and a reply with no JSON at all (which must give None).
    """
    parseable_cases = (
        ('{"a": 1}', {"a": 1}),
        ('```json\n{"b": 2}\n```', {"b": 2}),
        ('prose then {"c": 3} trailing', {"c": 3}),
    )
    for raw_reply, expected in parseable_cases:
        assert llm.extract_json(raw_reply) == expected

    assert llm.extract_json("no json here") is None
    print("ok  non-reasoning replies unchanged (regression)")


def test_chat_json_recovers_from_think_wrapped_reply():
    """End-to-end: chat_json() parses a <think>-wrapped reply from chat().

    llm.chat is temporarily replaced with a fake that returns a fixed
    reasoning-model reply. The original llm.chat is restored afterwards, even
    if chat_json() raises, so the fake cannot leak into tests that run later
    in the same process.
    """
    def fake_chat(messages, max_tokens=512):
        # Same signature as the stub this test has always installed.
        return '<think>grade it</think>{"score": 4, "explanation": "Good."}'

    original_chat = llm.chat
    llm.chat = fake_chat
    try:
        data = llm.chat_json([{"role": "user", "content": "grade this"}])
    finally:
        # Undo the monkeypatch regardless of the outcome.
        llm.chat = original_chat

    assert data["score"] == 4
    print("ok  chat_json parses a <think>-wrapped reply on the first try")


if __name__ == "__main__":
    # Run every case in declaration order; the first failing assert aborts
    # the run before the summary line is printed.
    for run_case in (
        test_full_think_block_then_object,
        test_closing_tag_only_template_prefill,
        test_braces_inside_reasoning_do_not_mislead,
        test_think_then_json_array,
        test_truncated_unclosed_think_returns_none,
        test_no_think_is_unchanged_regression,
        test_chat_json_recovers_from_think_wrapped_reply,
    ):
        run_case()
    print("\nAll NAH-48 think-stripping tests passed.")