"""Integration tests for HermesAgentLoop tool calling. Tests the full agent loop with real LLM calls via OpenRouter. Uses stepfun/step-3.5-flash:free by default (zero cost), falls back to anthropic/claude-sonnet-4 if the free model is unavailable. These tests verify: 1. Single tool call: model calls a tool, gets result, responds 2. Multi-tool call: model calls multiple tools in one turn 3. Multi-turn: model calls tools across multiple turns 4. Unknown tool rejection: model calling a non-existent tool gets an error 5. Max turns: loop stops when max_turns is reached 6. No tools: model responds without calling any tools 7. Tool error handling: tool execution errors are captured Run: pytest tests/test_agent_loop_tool_calling.py -v pytest tests/test_agent_loop_tool_calling.py -v -k "single" # run one test """ import asyncio import json import os import sys from pathlib import Path from typing import Any, Dict, List, Set from unittest.mock import patch import pytest pytestmark = pytest.mark.skip(reason="Live API integration test — hangs in batch runs") # Ensure repo root is importable _repo_root = Path(__file__).resolve().parent.parent if str(_repo_root) not in sys.path: sys.path.insert(0, str(_repo_root)) try: from environments.agent_loop import AgentResult, HermesAgentLoop from atroposlib.envs.server_handling.openai_server import OpenAIServer # noqa: F401 except ImportError: pytest.skip("atroposlib not installed", allow_module_level=True) # ========================================================================= # Test infrastructure # ========================================================================= # Models to try, in order of preference (free first) _MODELS = [ "stepfun/step-3.5-flash:free", "google/gemini-2.0-flash-001", "anthropic/claude-sonnet-4", ] def _get_api_key(): key = os.getenv("OPENROUTER_API_KEY", "") if not key: pytest.skip("OPENROUTER_API_KEY not set") return key def _make_server(model: str = None): """Create an OpenAI server for testing.""" from atroposlib.envs.server_handling.openai_server import OpenAIServer from atroposlib.envs.server_handling.server_manager import APIServerConfig config = APIServerConfig( base_url="https://openrouter.ai/api/v1", model_name=model or _MODELS[0], server_type="openai", api_key=_get_api_key(), health_check=False, ) return OpenAIServer(config) async def _try_models(test_fn): """Try running a test with each model until one works.""" last_error = None for model in _MODELS: try: server = _make_server(model) return await test_fn(server, model) except Exception as e: last_error = e if "rate" in str(e).lower() or "limit" in str(e).lower(): continue # Rate limited, try next model raise # Real error pytest.skip(f"All models failed. Last error: {last_error}") # ========================================================================= # Fake tools for testing # ========================================================================= # Simple calculator tool CALC_TOOL = { "type": "function", "function": { "name": "calculate", "description": "Calculate a math expression. Returns the numeric result.", "parameters": { "type": "object", "properties": { "expression": { "type": "string", "description": "Math expression to evaluate, e.g. '2 + 3'" } }, "required": ["expression"], }, }, } # Weather lookup tool WEATHER_TOOL = { "type": "function", "function": { "name": "get_weather", "description": "Get the current weather for a city. Returns temperature and conditions.", "parameters": { "type": "object", "properties": { "city": { "type": "string", "description": "City name, e.g. 'Tokyo'" } }, "required": ["city"], }, }, } # Lookup tool (always succeeds) LOOKUP_TOOL = { "type": "function", "function": { "name": "lookup", "description": "Look up a fact. Returns a short answer string.", "parameters": { "type": "object", "properties": { "query": { "type": "string", "description": "What to look up" } }, "required": ["query"], }, }, } # Error tool (always fails) ERROR_TOOL = { "type": "function", "function": { "name": "failing_tool", "description": "A tool that always fails with an error.", "parameters": { "type": "object", "properties": { "input": {"type": "string"} }, "required": ["input"], }, }, } def _fake_tool_handler(tool_name: str, args: Dict[str, Any], **kwargs) -> str: """Handle fake tool calls for testing.""" if tool_name == "calculate": expr = args.get("expression", "0") try: # Safe eval for simple math result = eval(expr, {"__builtins__": {}}, {}) return json.dumps({"result": result}) except Exception as e: return json.dumps({"error": str(e)}) elif tool_name == "get_weather": city = args.get("city", "Unknown") # Return canned weather return json.dumps({ "city": city, "temperature": 22, "conditions": "sunny", "humidity": 45, }) elif tool_name == "lookup": query = args.get("query", "") return json.dumps({"answer": f"The answer to '{query}' is 42."}) elif tool_name == "failing_tool": raise RuntimeError("This tool always fails!") return json.dumps({"error": f"Unknown tool: {tool_name}"}) # ========================================================================= # Tests # ========================================================================= @pytest.mark.asyncio async def test_single_tool_call(): """Model should call a single tool, get the result, and respond.""" async def _run(server, model): agent = HermesAgentLoop( server=server, tool_schemas=[WEATHER_TOOL], valid_tool_names={"get_weather"}, max_turns=5, temperature=0.0, max_tokens=500, ) messages = [ {"role": "user", "content": "What's the weather in Tokyo? Use the get_weather tool."}, ] with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler): result = await agent.run(messages) assert isinstance(result, AgentResult) assert result.turns_used >= 2, f"Expected at least 2 turns (tool call + response), got {result.turns_used}" # Verify a tool call happened tool_calls_found = False for msg in result.messages: if msg.get("role") == "assistant" and msg.get("tool_calls"): for tc in msg["tool_calls"]: if tc["function"]["name"] == "get_weather": tool_calls_found = True args = json.loads(tc["function"]["arguments"]) assert "city" in args assert tool_calls_found, "Model should have called get_weather" # Verify tool result is in conversation tool_results = [m for m in result.messages if m.get("role") == "tool"] assert len(tool_results) >= 1, "Should have at least one tool result" # Verify the final response references the weather final_msg = result.messages[-1] assert final_msg["role"] == "assistant" assert final_msg["content"], "Final response should have content" return result await _try_models(_run) @pytest.mark.asyncio async def test_multi_tool_single_turn(): """Model should call multiple tools in a single turn.""" async def _run(server, model): agent = HermesAgentLoop( server=server, tool_schemas=[WEATHER_TOOL, CALC_TOOL], valid_tool_names={"get_weather", "calculate"}, max_turns=5, temperature=0.0, max_tokens=500, ) messages = [ {"role": "user", "content": ( "I need two things at once: " "1) What's the weather in Paris? Use get_weather. " "2) What is 15 * 7? Use calculate. " "Call BOTH tools in a single response." )}, ] with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler): result = await agent.run(messages) # Count distinct tools called tools_called = set() for msg in result.messages: if msg.get("role") == "assistant" and msg.get("tool_calls"): for tc in msg["tool_calls"]: tools_called.add(tc["function"]["name"]) # At minimum, both tools should have been called (maybe in different turns) assert "get_weather" in tools_called, f"get_weather not called. Called: {tools_called}" assert "calculate" in tools_called, f"calculate not called. Called: {tools_called}" return result await _try_models(_run) @pytest.mark.asyncio async def test_multi_turn_conversation(): """Agent should handle multiple turns of tool calls.""" async def _run(server, model): agent = HermesAgentLoop( server=server, tool_schemas=[LOOKUP_TOOL, CALC_TOOL], valid_tool_names={"lookup", "calculate"}, max_turns=10, temperature=0.0, max_tokens=500, ) messages = [ {"role": "user", "content": ( "First, use the lookup tool to look up 'meaning of life'. " "Then use calculate to compute 6 * 7. " "Do these in separate tool calls, one at a time." )}, ] with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler): result = await agent.run(messages) # Should have used both tools tools_called = set() for msg in result.messages: if msg.get("role") == "assistant" and msg.get("tool_calls"): for tc in msg["tool_calls"]: tools_called.add(tc["function"]["name"]) assert "lookup" in tools_called, f"lookup not called. Called: {tools_called}" assert "calculate" in tools_called, f"calculate not called. Called: {tools_called}" # Should finish naturally assert result.finished_naturally, "Should finish naturally after answering" return result await _try_models(_run) @pytest.mark.asyncio async def test_unknown_tool_rejected(): """If the model calls a tool not in valid_tool_names, it gets an error.""" async def _run(server, model): # Only allow "calculate" but give schema for both agent = HermesAgentLoop( server=server, tool_schemas=[CALC_TOOL, WEATHER_TOOL], valid_tool_names={"calculate"}, # weather NOT allowed max_turns=5, temperature=0.0, max_tokens=500, ) messages = [ {"role": "user", "content": "What's the weather in London? Use get_weather."}, ] with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler): result = await agent.run(messages) # Check if get_weather was called and rejected if result.tool_errors: weather_errors = [e for e in result.tool_errors if e.tool_name == "get_weather"] assert len(weather_errors) > 0, "get_weather should have been rejected" assert "Unknown tool" in weather_errors[0].error return result await _try_models(_run) @pytest.mark.asyncio async def test_max_turns_limit(): """Agent should stop after max_turns even if model keeps calling tools.""" async def _run(server, model): agent = HermesAgentLoop( server=server, tool_schemas=[LOOKUP_TOOL], valid_tool_names={"lookup"}, max_turns=2, # Very low limit temperature=0.0, max_tokens=500, ) messages = [ {"role": "user", "content": ( "Keep looking up facts. Look up 'fact 1', then 'fact 2', " "then 'fact 3', then 'fact 4'. Do them one at a time." )}, ] with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler): result = await agent.run(messages) assert result.turns_used <= 2, f"Should stop at max_turns=2, used {result.turns_used}" assert not result.finished_naturally, "Should NOT finish naturally (hit max_turns)" return result await _try_models(_run) @pytest.mark.asyncio async def test_no_tools_direct_response(): """When no tools are useful, model should respond directly.""" async def _run(server, model): agent = HermesAgentLoop( server=server, tool_schemas=[WEATHER_TOOL], valid_tool_names={"get_weather"}, max_turns=5, temperature=0.0, max_tokens=200, ) messages = [ {"role": "user", "content": "What is 2 + 2? Just answer directly, no tools needed."}, ] with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler): result = await agent.run(messages) assert result.finished_naturally, "Should finish naturally with a direct response" assert result.turns_used == 1, f"Should take exactly 1 turn for a direct answer, took {result.turns_used}" final = result.messages[-1] assert final["role"] == "assistant" assert final["content"], "Should have text content" assert "4" in final["content"], "Should contain the answer '4'" return result await _try_models(_run) @pytest.mark.asyncio async def test_tool_error_handling(): """Tool execution errors should be captured and reported to the model.""" async def _run(server, model): agent = HermesAgentLoop( server=server, tool_schemas=[ERROR_TOOL], valid_tool_names={"failing_tool"}, max_turns=5, temperature=0.0, max_tokens=500, ) messages = [ {"role": "user", "content": "Please call the failing_tool with input 'test'."}, ] with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler): result = await agent.run(messages) # The tool error should be recorded assert len(result.tool_errors) >= 1, "Should have at least one tool error" assert "RuntimeError" in result.tool_errors[0].error or "always fails" in result.tool_errors[0].error # The error should be in the conversation as a tool result tool_results = [m for m in result.messages if m.get("role") == "tool"] assert len(tool_results) >= 1 error_result = json.loads(tool_results[0]["content"]) assert "error" in error_result return result await _try_models(_run) @pytest.mark.asyncio async def test_agent_result_structure(): """Verify the AgentResult has all expected fields populated.""" async def _run(server, model): agent = HermesAgentLoop( server=server, tool_schemas=[CALC_TOOL], valid_tool_names={"calculate"}, max_turns=5, temperature=0.0, max_tokens=300, ) messages = [ {"role": "user", "content": "What is 3 + 4? Use the calculate tool."}, ] with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler): result = await agent.run(messages) # Structural checks assert isinstance(result, AgentResult) assert isinstance(result.messages, list) assert len(result.messages) >= 3, "Should have user + assistant(tool) + tool_result + assistant(final)" assert isinstance(result.turns_used, int) assert result.turns_used > 0 assert isinstance(result.finished_naturally, bool) assert isinstance(result.tool_errors, list) assert isinstance(result.reasoning_per_turn, list) # Messages should follow OpenAI format for msg in result.messages: assert "role" in msg, f"Message missing 'role': {msg}" assert msg["role"] in ("system", "user", "assistant", "tool"), f"Invalid role: {msg['role']}" return result await _try_models(_run) @pytest.mark.asyncio async def test_conversation_history_preserved(): """The full conversation history should be in result.messages.""" async def _run(server, model): agent = HermesAgentLoop( server=server, tool_schemas=[WEATHER_TOOL], valid_tool_names={"get_weather"}, max_turns=5, temperature=0.0, max_tokens=500, ) messages = [ {"role": "system", "content": "You are a helpful weather assistant."}, {"role": "user", "content": "What's the weather in Berlin? Use get_weather."}, ] with patch("environments.agent_loop.handle_function_call", side_effect=_fake_tool_handler): result = await agent.run(messages) # System message should be preserved assert result.messages[0]["role"] == "system" assert "weather assistant" in result.messages[0]["content"] # User message should be preserved assert result.messages[1]["role"] == "user" assert "Berlin" in result.messages[1]["content"] # Should have assistant + tool + assistant sequence roles = [m["role"] for m in result.messages] assert "tool" in roles, "Should have tool results in conversation" return result await _try_models(_run)