# Source: agentbee — test/test_llm_integration.py
# Commit e7b4937: "fix: correct author name formatting in multiple files"
"""
LLM Integration Tests - Stage 3 Validation
Author: @mangubee
Date: 2026-01-02
Tests for Stage 3 LLM integration:
- Planning with LLM
- Tool selection via function calling
- Answer synthesis from evidence
- Full workflow with mocked LLM responses
"""
import pytest
from unittest.mock import patch, MagicMock
from src.agent.llm_client import (
plan_question,
select_tools_with_function_calling,
synthesize_answer
)
from src.tools import TOOLS
class TestPlanningFunction:
    """Unit tests for the LLM-backed planning step (no real API calls)."""

    @patch('src.agent.llm_client.Anthropic')
    def test_plan_question_basic(self, mock_anthropic):
        """Test planning with simple question."""
        # Stub the Anthropic client so plan_question never hits the network.
        fake_client = MagicMock()
        fake_response = MagicMock()
        fake_response.content = [MagicMock(text="1. Search for information\n2. Analyze results")]
        fake_client.messages.create.return_value = fake_response
        mock_anthropic.return_value = fake_client

        plan = plan_question(
            question="What is the capital of France?",
            available_tools=TOOLS,
        )

        assert isinstance(plan, str)
        assert len(plan) > 0
        print(f"✓ Generated plan: {plan[:50]}...")

    @patch('src.agent.llm_client.Anthropic')
    def test_plan_with_files(self, mock_anthropic):
        """Test planning with file context."""
        # Canned three-step plan for a file-based question.
        fake_client = MagicMock()
        fake_response = MagicMock()
        fake_response.content = [MagicMock(text="1. Parse file\n2. Extract data\n3. Calculate answer")]
        fake_client.messages.create.return_value = fake_response
        mock_anthropic.return_value = fake_client

        plan = plan_question(
            question="What is the total in the spreadsheet?",
            available_tools=TOOLS,
            file_paths=["data.xlsx"],
        )

        assert isinstance(plan, str)
        assert len(plan) > 0
        print(f"✓ Generated plan with files: {plan[:50]}...")
class TestToolSelection:
    """Test LLM function calling for tool selection."""

    @staticmethod
    def _make_tool_use(tool_name, tool_input, call_id):
        """Build a fake Anthropic ``tool_use`` content block.

        ``.name`` must be assigned after construction: passing ``name=`` to
        ``MagicMock()`` would configure the mock's own repr-name instead of
        creating a ``name`` attribute.
        """
        block = MagicMock()
        block.type = "tool_use"
        block.name = tool_name
        block.input = tool_input
        block.id = call_id
        return block

    @patch('src.agent.llm_client.Anthropic')
    def test_select_single_tool(self, mock_anthropic):
        """Test selecting single tool with parameters."""
        client = MagicMock()
        response = MagicMock()
        response.content = [
            self._make_tool_use("search", {"query": "capital of France"}, "call_001"),
        ]
        client.messages.create.return_value = response
        mock_anthropic.return_value = client

        calls = select_tools_with_function_calling(
            question="What is the capital of France?",
            plan="1. Search for capital of France",
            available_tools=TOOLS,
        )

        assert isinstance(calls, list)
        assert len(calls) == 1
        assert calls[0]["tool"] == "search"
        assert "query" in calls[0]["params"]
        print(f"✓ Selected tool: {calls[0]}")

    @patch('src.agent.llm_client.Anthropic')
    def test_select_multiple_tools(self, mock_anthropic):
        """Test selecting multiple tools in sequence."""
        client = MagicMock()
        response = MagicMock()
        # Two tool_use blocks returned in one response: parse, then compute.
        response.content = [
            self._make_tool_use("parse_file", {"file_path": "data.xlsx"}, "call_001"),
            self._make_tool_use("safe_eval", {"expression": "sum(values)"}, "call_002"),
        ]
        client.messages.create.return_value = response
        mock_anthropic.return_value = client

        calls = select_tools_with_function_calling(
            question="What is the sum in data.xlsx?",
            plan="1. Parse file\n2. Calculate sum",
            available_tools=TOOLS,
        )

        assert isinstance(calls, list)
        assert len(calls) == 2
        assert calls[0]["tool"] == "parse_file"
        assert calls[1]["tool"] == "safe_eval"
        print(f"✓ Selected {len(calls)} tools")
class TestAnswerSynthesis:
    """Test LLM-based answer synthesis.

    Every test stubs the Anthropic client so ``synthesize_answer`` receives a
    canned completion; the shared ``_install_text_response`` helper replaces
    the identical five-line mock setup that was duplicated in all three tests.
    """

    @staticmethod
    def _install_text_response(mock_anthropic, text):
        """Wire *mock_anthropic* so the mocked client returns one text block.

        Args:
            mock_anthropic: the patched ``Anthropic`` class mock.
            text: completion text the fake LLM should return.

        Returns:
            The mocked client, should a test want to inspect its call args.
        """
        client = MagicMock()
        response = MagicMock()
        response.content = [MagicMock(text=text)]
        client.messages.create.return_value = response
        mock_anthropic.return_value = client
        return client

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_simple_answer(self, mock_anthropic):
        """Test synthesizing answer from single evidence."""
        self._install_text_response(mock_anthropic, "Paris")

        answer = synthesize_answer(
            question="What is the capital of France?",
            evidence=["[search] Paris is the capital and most populous city of France"],
        )

        assert isinstance(answer, str)
        assert len(answer) > 0
        assert answer == "Paris"
        print(f"✓ Synthesized answer: {answer}")

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_from_multiple_evidence(self, mock_anthropic):
        """Test synthesizing answer from multiple evidence sources."""
        self._install_text_response(mock_anthropic, "42")

        evidence_items = [
            "[search] The answer to life is 42",
            "[safe_eval] 6 * 7 = 42",
            "[parse_file] Result: 42",
        ]
        answer = synthesize_answer(
            question="What is the answer?",
            evidence=evidence_items,
        )

        assert isinstance(answer, str)
        assert answer == "42"
        # len(evidence_items) == 3, matching the original hard-coded count.
        print(f"✓ Synthesized answer from {len(evidence_items)} evidence items: {answer}")

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_with_conflicts(self, mock_anthropic):
        """Test synthesizing answer when evidence conflicts."""
        # The stubbed LLM "resolves" the conflict in favor of Paris.
        self._install_text_response(mock_anthropic, "Paris")

        answer = synthesize_answer(
            question="What is the capital of France?",
            evidence=[
                "[search] Paris is the capital of France (source: Wikipedia, 2024)",
                "[search] Lyon was briefly capital during revolution (source: old text, 1793)",
            ],
        )

        assert isinstance(answer, str)
        assert answer == "Paris"  # Should pick more recent/credible source
        print(f"✓ Resolved conflict, answer: {answer}")
class TestEndToEndWorkflow:
    """Test full agent workflow with mocked LLM."""

    @patch('src.agent.llm_client.Anthropic')
    @patch('src.tools.web_search.tavily_search')
    def test_full_search_workflow(self, mock_tavily, mock_anthropic):
        """Test complete workflow: plan → search → answer."""
        from src.agent import GAIAAgent

        # The search tool returns canned text instead of hitting the web.
        mock_tavily.return_value = "Paris is the capital and most populous city of France"

        # Three sequential LLM calls: planning, tool selection, synthesis.
        plan_resp = MagicMock()
        plan_resp.content = [MagicMock(text="1. Search for capital of France")]

        tool_use = MagicMock()
        tool_use.type = "tool_use"
        tool_use.name = "web_search"
        tool_use.input = {"query": "capital of France"}
        tool_use.id = "call_001"
        tool_resp = MagicMock()
        tool_resp.content = [tool_use]

        answer_resp = MagicMock()
        answer_resp.content = [MagicMock(text="Paris")]

        # side_effect hands back one response per create() call, in order.
        client = MagicMock()
        client.messages.create.side_effect = [
            plan_resp,
            tool_resp,
            answer_resp,
        ]
        mock_anthropic.return_value = client

        agent = GAIAAgent()
        answer = agent("What is the capital of France?")

        assert isinstance(answer, str)
        assert answer == "Paris"
        print(f"✓ Full workflow completed, answer: {answer}")
if __name__ == "__main__":
    # Quick manual smoke run outside of pytest; the @patch decorators on each
    # method supply the mock arguments when the methods are called directly.
    divider = "=" * 70
    print("\n" + divider)
    print("GAIA Agent - Stage 3 LLM Integration Tests")
    print(divider + "\n")

    planning = TestPlanningFunction()
    planning.test_plan_question_basic()
    planning.test_plan_with_files()

    selection = TestToolSelection()
    selection.test_select_single_tool()
    selection.test_select_multiple_tools()

    synthesis = TestAnswerSynthesis()
    synthesis.test_synthesize_simple_answer()
    synthesis.test_synthesize_from_multiple_evidence()
    synthesis.test_synthesize_with_conflicts()

    workflow = TestEndToEndWorkflow()
    workflow.test_full_search_workflow()

    print("\n" + divider)
    print("✓ All Stage 3 LLM integration tests passed!")
    print(divider + "\n")