# Spaces: Running on CPU Upgrade
# File size: 4,460 Bytes
# d408a51
"""Opt-in live provider checks for thinking metadata replay.
These tests intentionally call paid model APIs and are skipped unless
``ML_INTERN_LIVE_LLM_TESTS=1`` plus the relevant provider key are set.
They cover the concrete model families involved in #87 without making
default CI depend on external credentials or provider availability.
"""
from __future__ import annotations
import os
from pathlib import Path
from types import SimpleNamespace
import pytest
from dotenv import load_dotenv
from litellm import Message
from agent.core.agent_loop import (
_assistant_message_from_result,
_call_llm_streaming,
)
from agent.core.llm_params import _resolve_llm_params
# Optional dotenv file. It is loaded before the flag below is read, so the
# file is processed ahead of the ML_INTERN_LIVE_LLM_TESTS lookup.
env_file = os.getenv("ML_INTERN_LIVE_ENV_FILE")
if env_file:
    load_dotenv(Path(env_file))

# Master switch: live tests run only when the flag is exactly "1".
LIVE_TESTS_ENABLED = os.getenv("ML_INTERN_LIVE_LLM_TESTS") == "1"

# Model identifiers exercised by the live tests in this module.
OPUS_47_MODEL = "anthropic/claude-opus-4-7"
LATEST_GPT_MODEL = "openai/gpt-5.2"
# Tool list passed as ``tools=`` to both live tests in this module.
# It declares a single function tool, ``report_result``, whose only
# (required) argument is the string ``answer``.
REPORT_RESULT_TOOL = [
    {
        "type": "function",
        "function": {
            "name": "report_result",
            "description": "Report the final test result.",
            # JSON-Schema-style description of the tool's arguments.
            "parameters": {
                "type": "object",
                "properties": {
                    "answer": {
                        "type": "string",
                        "description": "The exact marker requested by the test.",
                    }
                },
                "required": ["answer"],
            },
        },
    }
]
def _skip_without_live_flag() -> None:
    """Skip the calling test unless ``LIVE_TESTS_ENABLED`` is truthy."""
    if LIVE_TESTS_ENABLED:
        return
    pytest.skip("set ML_INTERN_LIVE_LLM_TESTS=1 to run paid live LLM tests")
def _skip_without_env(name: str) -> None:
    """Skip the calling test when environment variable ``name`` is unset or empty.

    Args:
        name: Environment variable that must hold a non-empty value for the
            test to proceed (for example a provider API key).
    """
    if os.environ.get(name):
        return
    pytest.skip(f"set {name} to run this live provider test")
def _session(model_name: str):
    """Build a minimal stand-in session object for the live tests.

    Args:
        model_name: Stored as ``config.model_name`` on the returned object.

    Returns:
        A ``SimpleNamespace`` with ``config``, ``is_cancelled`` (always
        ``False``), an async ``send_event`` callable, and ``events`` — the
        list that ``send_event`` appends every received event to.
    """
    recorded = []

    async def send_event(event):
        # Capture events in order so a test can inspect them afterwards.
        recorded.append(event)

    config = SimpleNamespace(model_name=model_name)
    return SimpleNamespace(
        config=config,
        is_cancelled=False,
        send_event=send_event,
        events=recorded,
    )
@pytest.mark.asyncio
async def test_live_opus_47_preserves_thinking_metadata_for_replay():
    """Live check that Opus thinking metadata survives onto the replay message.

    Skipped unless the live-test flag and ``ANTHROPIC_API_KEY`` are set.
    Streams one tool-enabled request with ``reasoning_effort="high"``,
    rebuilds the assistant message from the streamed result, and asserts the
    rebuilt message carries the same ``thinking_blocks`` and
    ``reasoning_content`` as the result.
    """
    _skip_without_live_flag()
    _skip_without_env("ANTHROPIC_API_KEY")

    prompt = (
        "Use careful reasoning for this small check. "
        "If 17 * 19 = 323, call report_result with answer OPUS_OK."
    )

    live_session = _session(OPUS_47_MODEL)
    params = _resolve_llm_params(OPUS_47_MODEL, reasoning_effort="high")
    conversation = [Message(role="user", content=prompt)]

    result = await _call_llm_streaming(
        live_session,
        messages=conversation,
        tools=REPORT_RESULT_TOOL,
        llm_params=params,
    )
    replay = _assistant_message_from_result(result, model_name=OPUS_47_MODEL)

    # The model must have produced something: text or an accumulated tool call.
    assert result.content or result.tool_calls_acc
    assert result.thinking_blocks, (
        "Opus returned no thinking_blocks with reasoning_effort='high' - "
        "check that adaptive thinking params are being forwarded correctly"
    )

    # The replayed assistant message must mirror the result's thinking metadata.
    replayed_blocks = getattr(replay, "thinking_blocks", None)
    assert replayed_blocks == result.thinking_blocks
    replayed_reasoning = getattr(replay, "reasoning_content", None)
    assert replayed_reasoning == result.reasoning_content
@pytest.mark.asyncio
async def test_live_latest_gpt_does_not_replay_reasoning_metadata():
    """Live check that GPT reasoning metadata is stripped from the replay message.

    Skipped unless the live-test flag and ``OPENAI_API_KEY`` are set. Streams
    one tool-enabled request with ``reasoning_effort="low"``, ensures the
    result has a non-empty ``reasoning_content``, rebuilds the assistant
    message, and asserts it exposes neither ``thinking_blocks`` nor
    ``reasoning_content``.
    """
    _skip_without_live_flag()
    _skip_without_env("OPENAI_API_KEY")

    live_session = _session(LATEST_GPT_MODEL)
    params = _resolve_llm_params(LATEST_GPT_MODEL, reasoning_effort="low")
    conversation = [
        Message(role="user", content="Call report_result with answer GPT_OK."),
    ]

    result = await _call_llm_streaming(
        live_session,
        messages=conversation,
        tools=REPORT_RESULT_TOOL,
        llm_params=params,
    )

    # Even if a GPT-family response carries provider reasoning internally,
    # OpenAI-compatible history must not echo it back on the next tool turn.
    # When the live model omits reasoning details, inject a placeholder so the
    # replay builder is exercised with a non-None value to strip.
    result.reasoning_content = result.reasoning_content or "synthetic-reasoning"

    replay = _assistant_message_from_result(result, model_name=LATEST_GPT_MODEL)

    # The model must have produced something: text or an accumulated tool call.
    assert result.content or result.tool_calls_acc

    # Neither kind of reasoning metadata may appear on the replayed message.
    replayed_blocks = getattr(replay, "thinking_blocks", None)
    assert replayed_blocks is None
    replayed_reasoning = getattr(replay, "reasoning_content", None)
    assert replayed_reasoning is None
|