ml-intern

Sleeping

Aksel Joonas Reedi committed on Apr 27

Commit

d408a51

unverified ·

1 Parent(s): 59b2038

Preserve streamed thinking metadata with live model tests (#150)

* Add opt-in live thinking model tests

Add paid integration coverage for the concrete models requested for #143: Anthropic Opus 4.7 and OpenAI's current GPT-5.2 model. The tests load an explicit env file, run only behind ML_INTERN_LIVE_LLM_TESTS=1, and keep normal CI credential-free.

Constraint: Live provider calls require local credentials and should not run by default in CI.

Rejected: Make live provider tests unconditional | would fail or spend tokens anywhere credentials are absent.

Confidence: high

Scope-risk: narrow

Tested: ML_INTERN_LIVE_LLM_TESTS=1 ML_INTERN_LIVE_ENV_FILE=/Users/akseljoonas/Documents/ml-intern/.env UV_CACHE_DIR=/tmp/uv-cache uv run --extra dev pytest tests/integration/test_live_thinking_models.py -q -rs

Tested: UV_CACHE_DIR=/tmp/uv-cache uv run --extra dev pytest tests/unit/test_thinking_history.py tests/integration/test_live_thinking_models.py -q

* Preserve streamed Opus thinking metadata

Live Opus 4.7 exposed that LiteLLM surfaces streamed thinking blocks on deltas while stream_chunk_builder can drop them. Capture Anthropic thinking metadata directly during streaming and keep the chunk rebuild path as a fallback.

Constraint: #150 live test must prove real thinking metadata is present, not pass on None metadata.

Rejected: Switch the live Opus test to non-streaming only | would avoid the actual streaming replay gap.

Confidence: high

Scope-risk: narrow

Directive: Keep replay of provider reasoning fields gated to anthropic/* models; OpenAI-compatible providers must not receive echoed reasoning_content.

Tested: UV_CACHE_DIR=/tmp/uv-cache uv run --extra dev pytest tests/unit/test_thinking_history.py -q

Tested: ML_INTERN_LIVE_LLM_TESTS=1 ML_INTERN_LIVE_ENV_FILE=/Users/akseljoonas/Documents/ml-intern/.env UV_CACHE_DIR=/tmp/uv-cache uv run --extra dev pytest tests/integration/test_live_thinking_models.py -q -rs

Tested: UV_CACHE_DIR=/tmp/uv-cache uv run --extra dev pytest tests/unit/test_thinking_history.py tests/integration/test_live_thinking_models.py -q

Files changed (3) hide show

agent/core/agent_loop.py +27 -5
tests/integration/test_live_thinking_models.py +151 -0
tests/unit/test_thinking_history.py +68 -0

agent/core/agent_loop.py CHANGED Viewed

@@ -410,8 +410,20 @@ def _extract_thinking_state(
     message: Any,
 ) -> tuple[list[dict[str, Any]] | None, str | None]:
     """Return provider reasoning fields that must be replayed after tool calls."""
-    thinking_blocks = getattr(message, "thinking_blocks", None) or None
-    reasoning_content = getattr(message, "reasoning_content", None) or None
     return thinking_blocks, reasoning_content
@@ -492,6 +504,9 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
     finish_reason = None
     final_usage_chunk = None
     chunks = []
     async for chunk in response:
         chunks.append(chunk)
@@ -510,6 +525,13 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
         if choice.finish_reason:
             finish_reason = choice.finish_reason
         if delta.content:
             full_content += delta.content
             await session.send_event(
@@ -543,9 +565,9 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
         latency_ms=int((time.monotonic() - t_start) * 1000),
         finish_reason=finish_reason,
     )
-    thinking_blocks = None
-    reasoning_content = None
-    if chunks and _should_replay_thinking_state(llm_params.get("model")):
         try:
             rebuilt = stream_chunk_builder(chunks, messages=messages)
             if rebuilt and getattr(rebuilt, "choices", None):

     message: Any,
 ) -> tuple[list[dict[str, Any]] | None, str | None]:
     """Return provider reasoning fields that must be replayed after tool calls."""
+    provider_fields = getattr(message, "provider_specific_fields", None)
+    if not isinstance(provider_fields, dict):
+        provider_fields = {}
+    thinking_blocks = (
+        getattr(message, "thinking_blocks", None)
+        or provider_fields.get("thinking_blocks")
+        or None
+    )
+    reasoning_content = (
+        getattr(message, "reasoning_content", None)
+        or provider_fields.get("reasoning_content")
+        or None
+    )
     return thinking_blocks, reasoning_content
     finish_reason = None
     final_usage_chunk = None
     chunks = []
+    should_replay_thinking = _should_replay_thinking_state(llm_params.get("model"))
+    collected_thinking_blocks: list[dict[str, Any]] = []
+    collected_reasoning_content: list[str] = []
     async for chunk in response:
         chunks.append(chunk)
         if choice.finish_reason:
             finish_reason = choice.finish_reason
+        if should_replay_thinking:
+            delta_thinking_blocks, delta_reasoning_content = _extract_thinking_state(delta)
+            if delta_thinking_blocks:
+                collected_thinking_blocks.extend(delta_thinking_blocks)
+            if delta_reasoning_content:
+                collected_reasoning_content.append(delta_reasoning_content)
         if delta.content:
             full_content += delta.content
             await session.send_event(
         latency_ms=int((time.monotonic() - t_start) * 1000),
         finish_reason=finish_reason,
     )
+    thinking_blocks = collected_thinking_blocks or None
+    reasoning_content = "".join(collected_reasoning_content) or None
+    if chunks and should_replay_thinking and not (thinking_blocks or reasoning_content):
         try:
             rebuilt = stream_chunk_builder(chunks, messages=messages)
             if rebuilt and getattr(rebuilt, "choices", None):

tests/integration/test_live_thinking_models.py ADDED Viewed

	@@ -0,0 +1,151 @@

+"""Opt-in live provider checks for thinking metadata replay.
+These tests intentionally call paid model APIs and are skipped unless
+``ML_INTERN_LIVE_LLM_TESTS=1`` plus the relevant provider key are set.
+They cover the concrete model families involved in #87 without making
+default CI depend on external credentials or provider availability.
+"""
+from __future__ import annotations
+import os
+from pathlib import Path
+from types import SimpleNamespace
+import pytest
+from dotenv import load_dotenv
+from litellm import Message
+from agent.core.agent_loop import (
+    _assistant_message_from_result,
+    _call_llm_streaming,
+)
+from agent.core.llm_params import _resolve_llm_params
+if env_file := os.environ.get("ML_INTERN_LIVE_ENV_FILE"):
+    load_dotenv(Path(env_file))
+LIVE_TESTS_ENABLED = os.environ.get("ML_INTERN_LIVE_LLM_TESTS") == "1"
+OPUS_47_MODEL = "anthropic/claude-opus-4-7"
+LATEST_GPT_MODEL = "openai/gpt-5.2"
+REPORT_RESULT_TOOL = [
+    {
+        "type": "function",
+        "function": {
+            "name": "report_result",
+            "description": "Report the final test result.",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "answer": {
+                        "type": "string",
+                        "description": "The exact marker requested by the test.",
+                    }
+                },
+                "required": ["answer"],
+            },
+        },
+    }
+]
+def _skip_without_live_flag() -> None:
+    if not LIVE_TESTS_ENABLED:
+        pytest.skip("set ML_INTERN_LIVE_LLM_TESTS=1 to run paid live LLM tests")
+def _skip_without_env(name: str) -> None:
+    if not os.environ.get(name):
+        pytest.skip(f"set {name} to run this live provider test")
+def _session(model_name: str):
+    events = []
+    async def send_event(event):
+        events.append(event)
+    return SimpleNamespace(
+        config=SimpleNamespace(model_name=model_name),
+        is_cancelled=False,
+        send_event=send_event,
+        events=events,
+    )
+@pytest.mark.asyncio
+async def test_live_opus_47_preserves_thinking_metadata_for_replay():
+    _skip_without_live_flag()
+    _skip_without_env("ANTHROPIC_API_KEY")
+    session = _session(OPUS_47_MODEL)
+    llm_params = _resolve_llm_params(
+        OPUS_47_MODEL,
+        reasoning_effort="high",
+    )
+    result = await _call_llm_streaming(
+        session,
+        messages=[
+            Message(
+                role="user",
+                content=(
+                    "Use careful reasoning for this small check. "
+                    "If 17 * 19 = 323, call report_result with answer OPUS_OK."
+                ),
+            )
+        ],
+        tools=REPORT_RESULT_TOOL,
+        llm_params=llm_params,
+    )
+    replay = _assistant_message_from_result(
+        result,
+        model_name=OPUS_47_MODEL,
+    )
+    assert result.content or result.tool_calls_acc
+    assert result.thinking_blocks, (
+        "Opus returned no thinking_blocks with reasoning_effort='high' - "
+        "check that adaptive thinking params are being forwarded correctly"
+    )
+    assert getattr(replay, "thinking_blocks", None) == result.thinking_blocks
+    assert getattr(replay, "reasoning_content", None) == result.reasoning_content
+@pytest.mark.asyncio
+async def test_live_latest_gpt_does_not_replay_reasoning_metadata():
+    _skip_without_live_flag()
+    _skip_without_env("OPENAI_API_KEY")
+    session = _session(LATEST_GPT_MODEL)
+    llm_params = _resolve_llm_params(
+        LATEST_GPT_MODEL,
+        reasoning_effort="low",
+    )
+    result = await _call_llm_streaming(
+        session,
+        messages=[
+            Message(
+                role="user",
+                content="Call report_result with answer GPT_OK.",
+            )
+        ],
+        tools=REPORT_RESULT_TOOL,
+        llm_params=llm_params,
+    )
+    # Even if a GPT-family response carries provider reasoning internally,
+    # OpenAI-compatible history must not echo it back on the next tool turn.
+    # Force the non-None strip path when the live model omits reasoning details.
+    result.reasoning_content = result.reasoning_content or "synthetic-reasoning"
+    replay = _assistant_message_from_result(
+        result,
+        model_name=LATEST_GPT_MODEL,
+    )
+    assert result.content or result.tool_calls_acc
+    assert getattr(replay, "thinking_blocks", None) is None
+    assert getattr(replay, "reasoning_content", None) is None

tests/unit/test_thinking_history.py CHANGED Viewed

@@ -26,6 +26,20 @@ def test_extract_thinking_state_from_litellm_message():
     assert reasoning_content == "reasoned"
 def test_assistant_message_from_result_preserves_thinking_with_tool_calls():
     tool_call = ChatCompletionMessageToolCall(
         id="call_1",
@@ -144,6 +158,60 @@ async def test_streaming_call_rebuilds_anthropic_thinking_state(monkeypatch):
     assert result.reasoning_content == "reasoned"
 @pytest.mark.asyncio
 async def test_streaming_call_skips_chunk_rebuild_for_non_anthropic(monkeypatch):
     async def fake_stream():

     assert reasoning_content == "reasoned"
+def test_extract_thinking_state_from_provider_fields():
+    message = SimpleNamespace(
+        provider_specific_fields={
+            "thinking_blocks": [{"type": "thinking", "thinking": "reasoned"}],
+            "reasoning_content": "reasoned",
+        },
+    )
+    thinking_blocks, reasoning_content = _extract_thinking_state(message)
+    assert thinking_blocks == [{"type": "thinking", "thinking": "reasoned"}]
+    assert reasoning_content == "reasoned"
 def test_assistant_message_from_result_preserves_thinking_with_tool_calls():
     tool_call = ChatCompletionMessageToolCall(
         id="call_1",
     assert result.reasoning_content == "reasoned"
+@pytest.mark.asyncio
+async def test_streaming_call_collects_anthropic_delta_thinking_state(monkeypatch):
+    async def fake_stream():
+        yield SimpleNamespace(
+            choices=[
+                SimpleNamespace(
+                    delta=SimpleNamespace(
+                        content=None,
+                        tool_calls=None,
+                        thinking_blocks=[{"type": "thinking", "thinking": "reasoned"}],
+                    ),
+                    finish_reason=None,
+                )
+            ],
+        )
+        yield SimpleNamespace(
+            choices=[
+                SimpleNamespace(
+                    delta=SimpleNamespace(content="done", tool_calls=None),
+                    finish_reason="stop",
+                )
+            ],
+        )
+        yield SimpleNamespace(choices=[], usage=SimpleNamespace(total_tokens=3))
+    async def fake_acompletion(**_kwargs):
+        return fake_stream()
+    def fail_chunk_builder(*_args, **_kwargs):
+        raise AssertionError("stream_chunk_builder should not run when deltas include thinking")
+    events = []
+    async def send_event(event):
+        events.append(event)
+    session = SimpleNamespace(
+        config=SimpleNamespace(model_name="anthropic/claude-opus-4-7"),
+        is_cancelled=False,
+        send_event=send_event,
+    )
+    monkeypatch.setattr(agent_loop, "acompletion", fake_acompletion)
+    monkeypatch.setattr(agent_loop, "stream_chunk_builder", fail_chunk_builder)
+    result = await _call_llm_streaming(
+        session,
+        messages=[Message(role="user", content="hi")],
+        tools=[],
+        llm_params={"model": "anthropic/claude-opus-4-7"},
+    )
+    assert result.content == "done"
+    assert result.thinking_blocks == [{"type": "thinking", "thinking": "reasoned"}]
 @pytest.mark.asyncio
 async def test_streaming_call_skips_chunk_rebuild_for_non_anthropic(monkeypatch):
     async def fake_stream():