Spaces:

smolagents
/

ml-intern

Running on CPU Upgrade

Aksel Joonas Reedi commited on Apr 27

Commit

59b2038

unverified ·

1 Parent(s): e8ed637

Preserve thinking state across tool turns (#143)

* Preserve thinking state across tool turns

Anthropic thinking responses need their thinking_blocks and reasoning_content replayed with assistant tool-call messages. The loop was rebuilding assistant history from only content and tool calls, causing LiteLLM to strip thinking on continuation turns.

Constraint: Non-thinking providers and responses without reasoning fields must keep the existing message shape.

Rejected: Disable extended thinking for tool-using runs | avoids the warning by removing the feature that improves reasoning quality.

Confidence: high

Scope-risk: moderate

Directive: Any future assistant-message reconstruction must preserve provider reasoning fields when present.

Tested: UV_CACHE_DIR=/tmp/uv-cache uv run --extra dev pytest tests/unit/test_thinking_history.py tests/unit/test_dangling_tool_calls.py tests/unit/test_malformed_args_recovery.py

* Replay thinking metadata only for Anthropic

Review caught that reasoning_content is not safe to echo through OpenAI-compatible schemas such as the HF router. Gate replay and streaming chunk rebuilding to direct Anthropic models, where thinking metadata is required for tool continuations.

Constraint: HF router and OpenAI-compatible providers reject reasoning_content in assistant history.

Rejected: Preserve reasoning_content for all providers | reproduces the schema rejection already avoided in the research loop.

Confidence: high

Scope-risk: moderate

Tested: UV_CACHE_DIR=/tmp/uv-cache uv run --extra dev pytest tests/unit/test_thinking_history.py tests/unit/test_dangling_tool_calls.py tests/unit/test_malformed_args_recovery.py

Files changed (2) hide show

agent/core/agent_loop.py +73 -7
tests/unit/test_thinking_history.py +186 -0

agent/core/agent_loop.py CHANGED Viewed

@@ -8,8 +8,14 @@ import logging
 import os
 import time
 from dataclasses import dataclass, field
-from litellm import ChatCompletionMessageToolCall, Message, acompletion
 from litellm.exceptions import ContextWindowExceededError
 from agent.config import Config
@@ -396,6 +402,43 @@ class LLMResult:
     token_count: int
     finish_reason: str | None
     usage: dict = field(default_factory=dict)
 async def _call_llm_streaming(session: Session, messages, tools, llm_params) -> LLMResult:
@@ -448,8 +491,10 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
     token_count = 0
     finish_reason = None
     final_usage_chunk = None
     async for chunk in response:
         if session.is_cancelled:
             tool_calls_acc.clear()
             break
@@ -498,6 +543,16 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
         latency_ms=int((time.monotonic() - t_start) * 1000),
         finish_reason=finish_reason,
     )
     return LLMResult(
         content=full_content or None,
@@ -505,6 +560,8 @@ async def _call_llm_streaming(session: Session, messages, tools, llm_params) ->
         token_count=token_count,
         finish_reason=finish_reason,
         usage=usage,
     )
@@ -557,6 +614,7 @@ async def _call_llm_non_streaming(session: Session, messages, tools, llm_params)
     content = message.content or None
     finish_reason = choice.finish_reason
     token_count = response.usage.total_tokens if response.usage else 0
     # Build tool_calls_acc in the same format as streaming
     tool_calls_acc: dict[int, dict] = {}
@@ -591,6 +649,8 @@ async def _call_llm_non_streaming(session: Session, messages, tools, llm_params)
         token_count=token_count,
         finish_reason=finish_reason,
         usage=usage,
     )
@@ -754,7 +814,10 @@ class Handlers:
                         "  • For other tools: reduce the size of your arguments or use bash."
                     )
                     if content:
-                        assistant_msg = Message(role="assistant", content=content)
                         session.context_manager.add_message(assistant_msg, token_count)
                     session.context_manager.add_message(
                         Message(role="user", content=f"[SYSTEM: {truncation_hint}]")
@@ -810,7 +873,10 @@ class Handlers:
                         (content or "")[:500],
                     )
                     if content:
-                        assistant_msg = Message(role="assistant", content=content)
                         session.context_manager.add_message(assistant_msg, token_count)
                         final_response = content
                     break
@@ -832,9 +898,9 @@ class Handlers:
                         bad_tools.append(tc)
                 # Add assistant message with all tool calls to context
-                assistant_msg = Message(
-                    role="assistant",
-                    content=content,
                     tool_calls=tool_calls,
                 )
                 session.context_manager.add_message(assistant_msg, token_count)

 import os
 import time
 from dataclasses import dataclass, field
+from typing import Any
+from litellm import (
+    ChatCompletionMessageToolCall,
+    Message,
+    acompletion,
+    stream_chunk_builder,
+)
 from litellm.exceptions import ContextWindowExceededError
 from agent.config import Config
     token_count: int
     finish_reason: str | None
     usage: dict = field(default_factory=dict)
+    thinking_blocks: list[dict[str, Any]] | None = None
+    reasoning_content: str | None = None
+def _extract_thinking_state(
+    message: Any,
+) -> tuple[list[dict[str, Any]] | None, str | None]:
+    """Return provider reasoning fields that must be replayed after tool calls."""
+    thinking_blocks = getattr(message, "thinking_blocks", None) or None
+    reasoning_content = getattr(message, "reasoning_content", None) or None
+    return thinking_blocks, reasoning_content
+def _should_replay_thinking_state(model_name: str | None) -> bool:
+    """Only Anthropic's native adapter accepts replayed thinking metadata."""
+    return bool(model_name and model_name.startswith("anthropic/"))
+def _assistant_message_from_result(
+    llm_result: LLMResult,
+    *,
+    model_name: str | None,
+    tool_calls: list[ToolCall] | None = None,
+) -> Message:
+    """Build an assistant history message without dropping reasoning state."""
+    kwargs: dict[str, Any] = {
+        "role": "assistant",
+        "content": llm_result.content,
+    }
+    if tool_calls is not None:
+        kwargs["tool_calls"] = tool_calls
+    if _should_replay_thinking_state(model_name):
+        if llm_result.thinking_blocks:
+            kwargs["thinking_blocks"] = llm_result.thinking_blocks
+        if llm_result.reasoning_content:
+            kwargs["reasoning_content"] = llm_result.reasoning_content
+    return Message(**kwargs)
 async def _call_llm_streaming(session: Session, messages, tools, llm_params) -> LLMResult:
     token_count = 0
     finish_reason = None
     final_usage_chunk = None
+    chunks = []
     async for chunk in response:
+        chunks.append(chunk)
         if session.is_cancelled:
             tool_calls_acc.clear()
             break
         latency_ms=int((time.monotonic() - t_start) * 1000),
         finish_reason=finish_reason,
     )
+    thinking_blocks = None
+    reasoning_content = None
+    if chunks and _should_replay_thinking_state(llm_params.get("model")):
+        try:
+            rebuilt = stream_chunk_builder(chunks, messages=messages)
+            if rebuilt and getattr(rebuilt, "choices", None):
+                rebuilt_msg = rebuilt.choices[0].message
+                thinking_blocks, reasoning_content = _extract_thinking_state(rebuilt_msg)
+        except Exception:
+            logger.debug("Failed to rebuild streaming thinking state", exc_info=True)
     return LLMResult(
         content=full_content or None,
         token_count=token_count,
         finish_reason=finish_reason,
         usage=usage,
+        thinking_blocks=thinking_blocks,
+        reasoning_content=reasoning_content,
     )
     content = message.content or None
     finish_reason = choice.finish_reason
     token_count = response.usage.total_tokens if response.usage else 0
+    thinking_blocks, reasoning_content = _extract_thinking_state(message)
     # Build tool_calls_acc in the same format as streaming
     tool_calls_acc: dict[int, dict] = {}
         token_count=token_count,
         finish_reason=finish_reason,
         usage=usage,
+        thinking_blocks=thinking_blocks,
+        reasoning_content=reasoning_content,
     )
                         "  • For other tools: reduce the size of your arguments or use bash."
                     )
                     if content:
+                        assistant_msg = _assistant_message_from_result(
+                            llm_result,
+                            model_name=llm_params.get("model"),
+                        )
                         session.context_manager.add_message(assistant_msg, token_count)
                     session.context_manager.add_message(
                         Message(role="user", content=f"[SYSTEM: {truncation_hint}]")
                         (content or "")[:500],
                     )
                     if content:
+                        assistant_msg = _assistant_message_from_result(
+                            llm_result,
+                            model_name=llm_params.get("model"),
+                        )
                         session.context_manager.add_message(assistant_msg, token_count)
                         final_response = content
                     break
                         bad_tools.append(tc)
                 # Add assistant message with all tool calls to context
+                assistant_msg = _assistant_message_from_result(
+                    llm_result,
+                    model_name=llm_params.get("model"),
                     tool_calls=tool_calls,
                 )
                 session.context_manager.add_message(assistant_msg, token_count)

tests/unit/test_thinking_history.py ADDED Viewed

	@@ -0,0 +1,186 @@

+from types import SimpleNamespace
+import pytest
+from litellm import ChatCompletionMessageToolCall, Message
+from agent.core import agent_loop
+from agent.core.agent_loop import (
+    LLMResult,
+    _call_llm_streaming,
+    _assistant_message_from_result,
+    _extract_thinking_state,
+)
+def test_extract_thinking_state_from_litellm_message():
+    message = Message(
+        role="assistant",
+        content="working",
+        thinking_blocks=[{"type": "thinking", "thinking": "reasoned"}],
+        reasoning_content="reasoned",
+    )
+    thinking_blocks, reasoning_content = _extract_thinking_state(message)
+    assert thinking_blocks == [{"type": "thinking", "thinking": "reasoned"}]
+    assert reasoning_content == "reasoned"
+def test_assistant_message_from_result_preserves_thinking_with_tool_calls():
+    tool_call = ChatCompletionMessageToolCall(
+        id="call_1",
+        type="function",
+        function={"name": "bash", "arguments": '{"command": "date"}'},
+    )
+    result = LLMResult(
+        content=None,
+        tool_calls_acc={},
+        token_count=12,
+        finish_reason="tool_calls",
+        thinking_blocks=[{"type": "thinking", "thinking": "reasoned"}],
+        reasoning_content="reasoned",
+    )
+    message = _assistant_message_from_result(
+        result,
+        model_name="anthropic/claude-opus-4-6",
+        tool_calls=[tool_call],
+    )
+    assert message.tool_calls == [tool_call]
+    assert message.thinking_blocks == [{"type": "thinking", "thinking": "reasoned"}]
+    assert message.reasoning_content == "reasoned"
+def test_assistant_message_from_result_strips_non_anthropic_reasoning_content():
+    result = LLMResult(
+        content=None,
+        tool_calls_acc={},
+        token_count=12,
+        finish_reason="tool_calls",
+        thinking_blocks=[{"type": "thinking", "thinking": "reasoned"}],
+        reasoning_content="reasoned",
+    )
+    message = _assistant_message_from_result(
+        result,
+        model_name="openai/Qwen/Qwen3-Next-80B-A3B-Instruct",
+    )
+    assert getattr(message, "thinking_blocks", None) is None
+    assert getattr(message, "reasoning_content", None) is None
+def test_assistant_message_from_result_omits_absent_thinking_fields():
+    result = LLMResult(
+        content="done",
+        tool_calls_acc={},
+        token_count=12,
+        finish_reason="stop",
+    )
+    message = _assistant_message_from_result(
+        result,
+        model_name="anthropic/claude-opus-4-6",
+    )
+    assert message.content == "done"
+    assert getattr(message, "thinking_blocks", None) is None
+    assert getattr(message, "reasoning_content", None) is None
+@pytest.mark.asyncio
+async def test_streaming_call_rebuilds_anthropic_thinking_state(monkeypatch):
+    async def fake_stream():
+        yield SimpleNamespace(
+            choices=[
+                SimpleNamespace(
+                    delta=SimpleNamespace(content="done", tool_calls=None),
+                    finish_reason="stop",
+                )
+            ],
+        )
+        yield SimpleNamespace(choices=[], usage=SimpleNamespace(total_tokens=3))
+    async def fake_acompletion(**_kwargs):
+        return fake_stream()
+    def fake_chunk_builder(chunks, **_kwargs):
+        assert len(chunks) == 2
+        return SimpleNamespace(
+            choices=[
+                SimpleNamespace(
+                    message=Message(
+                        role="assistant",
+                        content="done",
+                        thinking_blocks=[{"type": "thinking", "thinking": "reasoned"}],
+                        reasoning_content="reasoned",
+                    )
+                )
+            ]
+        )
+    events = []
+    async def send_event(event):
+        events.append(event)
+    session = SimpleNamespace(
+        config=SimpleNamespace(model_name="anthropic/claude-opus-4-6"),
+        is_cancelled=False,
+        send_event=send_event,
+    )
+    monkeypatch.setattr(agent_loop, "acompletion", fake_acompletion)
+    monkeypatch.setattr(agent_loop, "stream_chunk_builder", fake_chunk_builder)
+    result = await _call_llm_streaming(
+        session,
+        messages=[Message(role="user", content="hi")],
+        tools=[],
+        llm_params={"model": "anthropic/claude-opus-4-6"},
+    )
+    assert result.content == "done"
+    assert result.thinking_blocks == [{"type": "thinking", "thinking": "reasoned"}]
+    assert result.reasoning_content == "reasoned"
+@pytest.mark.asyncio
+async def test_streaming_call_skips_chunk_rebuild_for_non_anthropic(monkeypatch):
+    async def fake_stream():
+        yield SimpleNamespace(
+            choices=[
+                SimpleNamespace(
+                    delta=SimpleNamespace(content="done", tool_calls=None),
+                    finish_reason="stop",
+                )
+            ],
+        )
+    async def fake_acompletion(**_kwargs):
+        return fake_stream()
+    def fail_chunk_builder(*_args, **_kwargs):
+        raise AssertionError("stream_chunk_builder should not run")
+    events = []
+    async def send_event(event):
+        events.append(event)
+    session = SimpleNamespace(
+        config=SimpleNamespace(model_name="openai/Qwen/Qwen3"),
+        is_cancelled=False,
+        send_event=send_event,
+    )
+    monkeypatch.setattr(agent_loop, "acompletion", fake_acompletion)
+    monkeypatch.setattr(agent_loop, "stream_chunk_builder", fail_chunk_builder)
+    result = await _call_llm_streaming(
+        session,
+        messages=[Message(role="user", content="hi")],
+        tools=[],
+        llm_params={"model": "openai/Qwen/Qwen3"},
+    )
+    assert result.content == "done"
+    assert result.thinking_blocks is None
+    assert result.reasoning_content is None