File size: 4,460 Bytes
d408a51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""Opt-in live provider checks for thinking metadata replay.

These tests intentionally call paid model APIs and are skipped unless
``ML_INTERN_LIVE_LLM_TESTS=1`` plus the relevant provider key are set.
They cover the concrete model families involved in #87 without making
default CI depend on external credentials or provider availability.
"""

from __future__ import annotations

import os
from pathlib import Path
from types import SimpleNamespace

import pytest
from dotenv import load_dotenv
from litellm import Message

from agent.core.agent_loop import (
    _assistant_message_from_result,
    _call_llm_streaming,
)
from agent.core.llm_params import _resolve_llm_params


# Optionally load credentials/flags from a dotenv file. This runs before the
# live-test flag below is read from the environment.
env_file = os.environ.get("ML_INTERN_LIVE_ENV_FILE")
if env_file:
    load_dotenv(Path(env_file))

# Live tests only run when the flag is exactly the string "1".
LIVE_TESTS_ENABLED = os.environ.get("ML_INTERN_LIVE_LLM_TESTS") == "1"

# Model identifiers exercised by the live checks.
OPUS_47_MODEL = "anthropic/claude-opus-4-7"
LATEST_GPT_MODEL = "openai/gpt-5.2"

# JSON-schema for the single argument accepted by the ``report_result`` tool.
_REPORT_RESULT_PARAMETERS = {
    "type": "object",
    "properties": {
        "answer": {
            "type": "string",
            "description": "The exact marker requested by the test.",
        }
    },
    "required": ["answer"],
}

# Tool list handed to the model: one function tool the prompts ask it to call.
REPORT_RESULT_TOOL = [
    {
        "type": "function",
        "function": {
            "name": "report_result",
            "description": "Report the final test result.",
            "parameters": _REPORT_RESULT_PARAMETERS,
        },
    }
]


def _skip_without_live_flag() -> None:
    """Skip the calling test unless ``LIVE_TESTS_ENABLED`` is true."""
    if LIVE_TESTS_ENABLED:
        return
    pytest.skip("set ML_INTERN_LIVE_LLM_TESTS=1 to run paid live LLM tests")


def _skip_without_env(name: str) -> None:
    """Skip the calling test when environment variable *name* is unset or empty.

    Args:
        name: Name of the environment variable that must hold a non-empty value.
    """
    configured = os.environ.get(name)
    if configured:
        return
    pytest.skip(f"set {name} to run this live provider test")


def _session(model_name: str) -> SimpleNamespace:
    """Build a lightweight stand-in session object for the streaming call.

    The returned namespace exposes ``config.model_name``, an ``is_cancelled``
    flag fixed to ``False``, an async ``send_event`` callable, and ``events``,
    the list that ``send_event`` appends to.

    Args:
        model_name: Model identifier stored on ``config.model_name``.

    Returns:
        A ``SimpleNamespace`` carrying the attributes described above.
    """
    recorded = []

    async def send_event(event):
        # Capture the event so a test can inspect what was emitted.
        recorded.append(event)

    stub = SimpleNamespace()
    stub.config = SimpleNamespace(model_name=model_name)
    stub.is_cancelled = False
    stub.send_event = send_event
    stub.events = recorded
    return stub


@pytest.mark.asyncio
async def test_live_opus_47_preserves_thinking_metadata_for_replay():
    """Live check that the Opus replay message keeps its thinking metadata.

    Streams one tool-enabled request to ``OPUS_47_MODEL`` with
    ``reasoning_effort="high"``, builds the assistant replay message from the
    result, and asserts that ``thinking_blocks`` and ``reasoning_content`` on
    the replay equal those on the streamed result. Skipped unless the live
    flag and ``ANTHROPIC_API_KEY`` are set.
    """
    _skip_without_live_flag()
    _skip_without_env("ANTHROPIC_API_KEY")

    model = OPUS_47_MODEL
    live_session = _session(model)
    params = _resolve_llm_params(model, reasoning_effort="high")
    prompt = Message(
        role="user",
        content=(
            "Use careful reasoning for this small check. "
            "If 17 * 19 = 323, call report_result with answer OPUS_OK."
        ),
    )

    result = await _call_llm_streaming(
        live_session,
        messages=[prompt],
        tools=REPORT_RESULT_TOOL,
        llm_params=params,
    )
    replayed = _assistant_message_from_result(result, model_name=model)

    # The call must have produced either text or an accumulated tool call.
    assert result.content or result.tool_calls_acc
    assert result.thinking_blocks, (
        "Opus returned no thinking_blocks with reasoning_effort='high' - "
        "check that adaptive thinking params are being forwarded correctly"
    )
    # The replay message must carry the metadata through unchanged.
    assert getattr(replayed, "thinking_blocks", None) == result.thinking_blocks
    assert getattr(replayed, "reasoning_content", None) == result.reasoning_content


@pytest.mark.asyncio
async def test_live_latest_gpt_does_not_replay_reasoning_metadata():
    """Live check that GPT replay messages carry no reasoning metadata.

    Streams one tool-enabled request to ``LATEST_GPT_MODEL`` with
    ``reasoning_effort="low"``, then builds the assistant replay message and
    asserts that neither ``thinking_blocks`` nor ``reasoning_content`` is
    present on it. Skipped unless the live flag and ``OPENAI_API_KEY`` are set.
    """
    _skip_without_live_flag()
    _skip_without_env("OPENAI_API_KEY")

    session = _session(LATEST_GPT_MODEL)
    llm_params = _resolve_llm_params(
        LATEST_GPT_MODEL,
        reasoning_effort="low",
    )

    result = await _call_llm_streaming(
        session,
        messages=[
            Message(
                role="user",
                content="Call report_result with answer GPT_OK.",
            )
        ],
        tools=REPORT_RESULT_TOOL,
        llm_params=llm_params,
    )

    # Validate the live response before injecting synthetic metadata, so an
    # empty provider response is reported as such rather than as a replay bug.
    assert result.content or result.tool_calls_acc

    # Even if a GPT-family response carries provider reasoning internally,
    # OpenAI-compatible history must not echo it back on the next tool turn.
    # Force the non-None strip path for BOTH metadata fields when the live
    # model omits reasoning details; otherwise the assertions below would pass
    # vacuously on a result that never had anything to strip.
    result.reasoning_content = result.reasoning_content or "synthetic-reasoning"
    # NOTE(review): assumes thinking_blocks is assignable like
    # reasoning_content and uses the type/thinking/signature dict shape -
    # confirm against the result type.
    result.thinking_blocks = result.thinking_blocks or [
        {
            "type": "thinking",
            "thinking": "synthetic-reasoning",
            "signature": "synthetic-signature",
        }
    ]
    replay = _assistant_message_from_result(
        result,
        model_name=LATEST_GPT_MODEL,
    )

    assert getattr(replay, "thinking_blocks", None) is None
    assert getattr(replay, "reasoning_content", None) is None