"""
Tests for environments/agent_loop.py — HermesAgentLoop.
Tests the multi-turn agent engine using mocked servers, without needing
real API keys or running servers.
"""
import asyncio
import json
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Optional
from unittest.mock import MagicMock
import pytest
# Ensure repo root is importable
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
try:
from environments.agent_loop import (
AgentResult,
HermesAgentLoop,
ToolError,
_extract_reasoning_from_message,
resize_tool_pool,
)
except ImportError:
pytest.skip("atroposlib not installed", allow_module_level=True)
# ─── Mock server infrastructure ─────────────────────────────────────────
@dataclass
class MockFunction:
    """Function payload of a mocked tool call: a tool name and its arguments."""

    # Name of the tool being invoked.
    name: str
    # Tool arguments carried as a string (make_tool_response JSON-encodes them).
    arguments: str
@dataclass
class MockToolCall:
    """A single mocked tool call attached to an assistant message."""

    # Identifier of this tool call.
    id: str
    # The function (tool name + arguments) being called.
    function: MockFunction
    # Call kind; defaults to "function".
    type: str = "function"
@dataclass
class MockMessage:
    """Mocked chat message as found inside a completion choice."""

    # Text of the message; may be None.
    content: Optional[str]
    # Speaker role; defaults to "assistant".
    role: str = "assistant"
    # Tool calls requested by this message, if any.
    tool_calls: Optional[List[MockToolCall]] = None
    # Optional reasoning fields; each defaults to None and the tests in this
    # file populate them one at a time.
    reasoning_content: Optional[str] = None
    reasoning: Optional[str] = None
    reasoning_details: Optional[list] = None
@dataclass
class MockChoice:
    """One entry of a mocked completion's ``choices`` list."""

    # The message carried by this choice.
    message: MockMessage
    # Why generation ended; defaults to "stop".
    finish_reason: str = "stop"
    # Position of the choice; defaults to 0.
    index: int = 0
@dataclass
class MockChatCompletion:
    """Mocked chat-completion response object handed back by MockServer."""

    # Choices of the response; an empty list is allowed.
    choices: List[MockChoice]
    # Fixed placeholder identifiers.
    id: str = "chatcmpl-mock"
    model: str = "mock-model"
class MockServer:
    """
    Scripted stand-in for a chat server.

    Hands back the pre-configured responses one per call, in order, through
    the same ``chat_completion()`` coroutine interface.
    """

    def __init__(self, responses: List[MockChatCompletion]):
        self.responses = responses
        # Number of scripted responses handed out so far.
        self.call_count = 0
        # Keyword arguments of every chat_completion() call, in call order.
        self.call_history: List[Dict[str, Any]] = []

    async def chat_completion(self, **kwargs) -> MockChatCompletion:
        """Record ``kwargs`` and return the next scripted response."""
        self.call_history.append(kwargs)
        position = self.call_count
        if position < len(self.responses):
            scripted = self.responses[position]
            self.call_count = position + 1
            return scripted
        # Script exhausted: answer with a simple text response instead.
        # The counter is left untouched on this path.
        fallback = MockMessage(content="Done.")
        return MockChatCompletion(choices=[MockChoice(message=fallback)])
def make_text_response(content: str) -> MockChatCompletion:
    """Build a completion holding one plain-text message and no tool calls."""
    message = MockMessage(content=content)
    only_choice = MockChoice(message=message)
    return MockChatCompletion(choices=[only_choice])
def make_tool_response(
    tool_name: str,
    arguments: dict,
    content: str = "",
    tool_call_id: str = "call_001",
) -> MockChatCompletion:
    """Build a completion whose single choice requests exactly one tool call.

    ``arguments`` is JSON-encoded before being stored on the call, and the
    choice is marked with ``finish_reason="tool_calls"``.
    """
    function = MockFunction(name=tool_name, arguments=json.dumps(arguments))
    tool_call = MockToolCall(id=tool_call_id, function=function)
    message = MockMessage(content=content, tool_calls=[tool_call])
    choice = MockChoice(message=message, finish_reason="tool_calls")
    return MockChatCompletion(choices=[choice])
# ─── Tests ───────────────────────────────────────────────────────────────
class TestAgentResult:
    """Checks on AgentResult construction."""

    def test_defaults(self):
        """Only ``messages`` is supplied; every other field is checked for its default."""
        fresh = AgentResult(messages=[])

        assert fresh.messages == []
        assert fresh.managed_state is None
        assert fresh.turns_used == 0
        assert fresh.finished_naturally is False
        assert fresh.reasoning_per_turn == []
        assert fresh.tool_errors == []
class TestExtractReasoning:
    """Checks for _extract_reasoning_from_message across the reasoning fields."""

    def test_reasoning_content_field(self):
        """``reasoning_content`` is returned when it is the only field set."""
        message = MockMessage(content="hello", reasoning_content="I think...")
        extracted = _extract_reasoning_from_message(message)
        assert extracted == "I think..."

    def test_reasoning_field(self):
        """``reasoning`` is returned when it is the only field set."""
        message = MockMessage(content="hello", reasoning="Let me consider...")
        extracted = _extract_reasoning_from_message(message)
        assert extracted == "Let me consider..."

    def test_reasoning_details(self):
        """A detail object exposing ``.text`` supplies the reasoning."""
        detail = MagicMock(text="Detail reasoning")
        message = MockMessage(content="hello", reasoning_details=[detail])
        extracted = _extract_reasoning_from_message(message)
        assert extracted == "Detail reasoning"

    def test_reasoning_details_dict_format(self):
        """A detail given as a dict with a ``"text"`` key supplies the reasoning."""
        details = [{"text": "Dict reasoning"}]
        message = MockMessage(content="hello", reasoning_details=details)
        extracted = _extract_reasoning_from_message(message)
        assert extracted == "Dict reasoning"

    def test_no_reasoning(self):
        """With no reasoning field set the result is None."""
        message = MockMessage(content="hello")
        extracted = _extract_reasoning_from_message(message)
        assert extracted is None

    def test_reasoning_content_takes_priority(self):
        """``reasoning_content`` wins when ``reasoning`` is set as well."""
        message = MockMessage(
            content="hello",
            reasoning_content="First",
            reasoning="Second",
        )
        extracted = _extract_reasoning_from_message(message)
        assert extracted == "First"
class TestHermesAgentLoop:
    """Exercise HermesAgentLoop against mock servers."""

    @staticmethod
    def _make_agent(server, tool_schemas, valid_tool_names, **overrides):
        """Construct a HermesAgentLoop, using ``max_turns=10`` unless overridden."""
        options = {"max_turns": 10}
        options.update(overrides)
        return HermesAgentLoop(
            server=server,
            tool_schemas=tool_schemas,
            valid_tool_names=valid_tool_names,
            **options,
        )

    @pytest.fixture
    def basic_tools(self):
        """Two minimal tool schemas (``terminal`` and ``read_file``) for testing."""
        terminal_tool = {
            "type": "function",
            "function": {
                "name": "terminal",
                "description": "Run a command",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "command": {
                            "type": "string",
                            "description": "Command to run",
                        }
                    },
                    "required": ["command"],
                },
            },
        }
        read_file_tool = {
            "type": "function",
            "function": {
                "name": "read_file",
                "description": "Read a file",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "path": {"type": "string"},
                    },
                    "required": ["path"],
                },
            },
        }
        return [terminal_tool, read_file_tool]

    @pytest.fixture
    def valid_names(self):
        """Tool names the loop is allowed to dispatch in most tests."""
        return {"terminal", "read_file", "todo"}

    @pytest.mark.asyncio
    async def test_simple_text_response(self, basic_tools, valid_names):
        """Model responds with text only, no tool calls."""
        reply = "Hello! How can I help?"
        server = MockServer([make_text_response(reply)])
        agent = self._make_agent(server, basic_tools, valid_names)

        result = await agent.run([{"role": "user", "content": "Hi"}])

        assert result.finished_naturally is True
        assert result.turns_used == 1
        assert len(result.messages) >= 2  # user + assistant
        final = result.messages[-1]
        assert final["role"] == "assistant"
        assert final["content"] == reply

    @pytest.mark.asyncio
    async def test_tool_call_then_text(self, basic_tools, valid_names):
        """Model calls a tool, then responds with text."""
        todo_args = {"todos": [{"id": "1", "content": "test", "status": "pending"}]}
        server = MockServer([
            make_tool_response("todo", todo_args),
            make_text_response("I created a todo for you."),
        ])
        agent = self._make_agent(server, basic_tools, valid_names)

        result = await agent.run([{"role": "user", "content": "Create a todo"}])

        assert result.finished_naturally is True
        assert result.turns_used == 2
        # Expected transcript: user, assistant (tool_call), tool (result), assistant (text)
        seen_roles = [entry["role"] for entry in result.messages]
        assert seen_roles == ["user", "assistant", "tool", "assistant"]

    @pytest.mark.asyncio
    async def test_max_turns_reached(self, basic_tools, valid_names):
        """Model keeps calling tools until max_turns is hit."""
        # Every scripted response is a tool call, so the loop never ends on its own.
        scripted = []
        for i in range(10):
            todo = {"id": str(i), "content": f"task {i}", "status": "pending"}
            scripted.append(
                make_tool_response("todo", {"todos": [todo]}, tool_call_id=f"call_{i}")
            )
        server = MockServer(scripted)
        agent = self._make_agent(server, basic_tools, valid_names, max_turns=3)

        result = await agent.run([{"role": "user", "content": "Keep going"}])

        assert result.finished_naturally is False
        assert result.turns_used == 3

    @pytest.mark.asyncio
    async def test_unknown_tool_name(self, basic_tools, valid_names):
        """Model calls a tool not in valid_tool_names."""
        server = MockServer([
            make_tool_response("nonexistent_tool", {"arg": "val"}),
            make_text_response("OK, that didn't work."),
        ])
        agent = self._make_agent(server, basic_tools, valid_names)

        result = await agent.run([{"role": "user", "content": "Call something weird"}])

        # The bad call must be recorded as a tool error.
        errors = result.tool_errors
        assert len(errors) >= 1
        assert errors[0].tool_name == "nonexistent_tool"

    @pytest.mark.asyncio
    async def test_empty_response(self, basic_tools, valid_names):
        """Server returns empty response."""
        no_choices = MockChatCompletion(choices=[])
        server = MockServer([no_choices])
        agent = self._make_agent(server, basic_tools, valid_names)

        result = await agent.run([{"role": "user", "content": "Hi"}])

        assert result.finished_naturally is False
        assert result.turns_used == 1

    @pytest.mark.asyncio
    async def test_api_error_handling(self, basic_tools, valid_names):
        """Server raises an exception."""

        class FailingServer:
            async def chat_completion(self, **kwargs):
                raise ConnectionError("Server unreachable")

        agent = self._make_agent(FailingServer(), basic_tools, valid_names)

        result = await agent.run([{"role": "user", "content": "Hi"}])

        assert result.finished_naturally is False
        assert result.turns_used == 1

    @pytest.mark.asyncio
    async def test_tools_passed_to_server(self, basic_tools, valid_names):
        """Verify tools are passed in the chat_completion kwargs."""
        server = MockServer([make_text_response("OK")])
        agent = self._make_agent(server, basic_tools, valid_names)

        await agent.run([{"role": "user", "content": "Hi"}])

        assert len(server.call_history) == 1
        sent_kwargs = server.call_history[0]
        assert "tools" in sent_kwargs
        assert sent_kwargs["tools"] == basic_tools

    @pytest.mark.asyncio
    async def test_extra_body_forwarded(self, basic_tools, valid_names):
        """extra_body should be forwarded to server."""
        extra = {"provider": {"ignore": ["DeepInfra"]}}
        server = MockServer([make_text_response("OK")])
        agent = self._make_agent(server, basic_tools, valid_names, extra_body=extra)

        await agent.run([{"role": "user", "content": "Hi"}])

        first_call = server.call_history[0]
        assert first_call.get("extra_body") == extra

    @pytest.mark.asyncio
    async def test_managed_state_returned(self, basic_tools, valid_names):
        """If server has get_state(), result should include managed_state."""

        def get_state():
            return {"nodes": [{"test": True}]}

        server = MockServer([make_text_response("OK")])
        server.get_state = get_state
        agent = self._make_agent(server, basic_tools, valid_names)

        result = await agent.run([{"role": "user", "content": "Hi"}])

        assert result.managed_state is not None
        assert "nodes" in result.managed_state

    @pytest.mark.asyncio
    async def test_no_managed_state_without_get_state(self, basic_tools, valid_names):
        """Regular server without get_state() should return None managed_state."""
        server = MockServer([make_text_response("OK")])
        agent = self._make_agent(server, basic_tools, valid_names)

        result = await agent.run([{"role": "user", "content": "Hi"}])

        assert result.managed_state is None

    @pytest.mark.asyncio
    async def test_memory_tool_blocked(self, basic_tools):
        """Memory tool should return error in RL environments."""
        allowed = {"terminal", "read_file", "todo", "memory"}
        memory_args = {"action": "add", "target": "user", "content": "test"}
        server = MockServer([
            make_tool_response("memory", memory_args),
            make_text_response("Done"),
        ])
        agent = self._make_agent(server, basic_tools, allowed)

        result = await agent.run([{"role": "user", "content": "Remember this"}])

        # Locate the tool response and inspect its JSON payload.
        tool_replies = [entry for entry in result.messages if entry["role"] == "tool"]
        assert len(tool_replies) >= 1
        payload = json.loads(tool_replies[0]["content"])
        assert "error" in payload
        assert "not available" in payload["error"].lower()

    @pytest.mark.asyncio
    async def test_session_search_blocked(self, basic_tools):
        """session_search should return error in RL environments."""
        allowed = {"terminal", "read_file", "todo", "session_search"}
        server = MockServer([
            make_tool_response("session_search", {"query": "test"}),
            make_text_response("Done"),
        ])
        agent = self._make_agent(server, basic_tools, allowed)

        result = await agent.run([{"role": "user", "content": "Search sessions"}])

        tool_replies = [entry for entry in result.messages if entry["role"] == "tool"]
        assert len(tool_replies) >= 1
        payload = json.loads(tool_replies[0]["content"])
        assert "error" in payload

    @pytest.mark.asyncio
    async def test_reasoning_content_preserved(self, basic_tools, valid_names):
        """Reasoning content should be extracted and preserved."""
        thought = "Let me think about this step by step..."
        message = MockMessage(content="The answer is 42.", reasoning_content=thought)
        completion = MockChatCompletion(choices=[MockChoice(message=message)])
        server = MockServer([completion])
        agent = self._make_agent(server, basic_tools, valid_names)

        result = await agent.run(
            [{"role": "user", "content": "What is the meaning of life?"}]
        )

        assert len(result.reasoning_per_turn) == 1
        assert result.reasoning_per_turn[0] == thought
class TestResizeToolPool:
    """Checks for resize_tool_pool and the module-level tool executor."""

    def test_resize_works(self):
        """resize_tool_pool should not raise."""
        # A small pool for testing first, then 128 to restore the default.
        for worker_count in (16, 128):
            resize_tool_pool(worker_count)

    def test_resize_shuts_down_previous_executor(self, monkeypatch):
        """Replacing the global tool executor should shut down the old pool."""
        import environments.agent_loop as agent_loop_module

        previous_pool = MagicMock()
        replacement_pool = MagicMock()
        monkeypatch.setattr(agent_loop_module, "_tool_executor", previous_pool)
        # Any executor the module constructs from here on is the replacement mock.
        pool_factory = MagicMock(return_value=replacement_pool)
        monkeypatch.setattr(
            agent_loop_module.concurrent.futures,
            "ThreadPoolExecutor",
            pool_factory,
        )

        resize_tool_pool(16)

        previous_pool.shutdown.assert_called_once_with(wait=False)
        assert agent_loop_module._tool_executor is replacement_pool
|