"""
LLM Integration Tests - Stage 3 Validation
Author: @mangubee
Date: 2026-01-02

Tests for Stage 3 LLM integration:
- Planning with LLM
- Tool selection via function calling
- Answer synthesis from evidence
- Full workflow with mocked LLM responses

Every test patches ``src.agent.llm_client.Anthropic`` so no real API call is
made; the mocked client's ``messages.create`` returns canned responses.
"""

import pytest
from unittest.mock import patch, MagicMock

from src.agent.llm_client import (
    plan_question,
    select_tools_with_function_calling,
    synthesize_answer
)
from src.tools import TOOLS


# ---------------------------------------------------------------------------
# Mock-building helpers (underscore-prefixed so pytest does not collect them)
# ---------------------------------------------------------------------------

def _text_response(text):
    """Return a mocked response whose ``content`` is one block with ``.text``."""
    response = MagicMock()
    response.content = [MagicMock(text=text)]
    return response


def _tool_use_block(name, tool_input, call_id):
    """Return a mocked ``tool_use`` content block.

    Args:
        name: Tool name exposed as ``block.name``.
        tool_input: Parameter dict exposed as ``block.input``.
        call_id: Identifier exposed as ``block.id``.
    """
    block = MagicMock()
    block.type = "tool_use"
    # ``name`` must be assigned after construction: MagicMock(name=...) names
    # the mock itself (for its repr) instead of creating a ``.name`` attribute.
    block.name = name
    block.input = tool_input
    block.id = call_id
    return block


def _response_with(*blocks):
    """Return a mocked response whose ``content`` is the given blocks, in order."""
    response = MagicMock()
    response.content = list(blocks)
    return response


def _install_client(mock_anthropic, *responses):
    """Make the patched ``Anthropic`` class hand out a client with canned replies.

    With exactly one response, every ``messages.create`` call returns it
    (``return_value``). With several, they are returned one per call, in order
    (``side_effect``), and a further call raises ``StopIteration``.

    Returns:
        The mocked client, so a test can inspect it if needed.
    """
    client = MagicMock()
    if len(responses) == 1:
        client.messages.create.return_value = responses[0]
    else:
        client.messages.create.side_effect = list(responses)
    mock_anthropic.return_value = client
    return client


class TestPlanningFunction:
    """Test LLM-based planning function."""

    @patch('src.agent.llm_client.Anthropic')
    def test_plan_question_basic(self, mock_anthropic):
        """Test planning with simple question."""
        # Mock LLM response
        _install_client(
            mock_anthropic,
            _text_response("1. Search for information\n2. Analyze results"),
        )

        # Test planning
        plan = plan_question(
            question="What is the capital of France?",
            available_tools=TOOLS
        )

        assert isinstance(plan, str)
        assert len(plan) > 0
        print(f"✓ Generated plan: {plan[:50]}...")

    @patch('src.agent.llm_client.Anthropic')
    def test_plan_with_files(self, mock_anthropic):
        """Test planning with file context."""
        # Mock LLM response
        _install_client(
            mock_anthropic,
            _text_response("1. Parse file\n2. Extract data\n3. Calculate answer"),
        )

        # Test planning with files
        plan = plan_question(
            question="What is the total in the spreadsheet?",
            available_tools=TOOLS,
            file_paths=["data.xlsx"]
        )

        assert isinstance(plan, str)
        assert len(plan) > 0
        print(f"✓ Generated plan with files: {plan[:50]}...")


class TestToolSelection:
    """Test LLM function calling for tool selection."""

    @patch('src.agent.llm_client.Anthropic')
    def test_select_single_tool(self, mock_anthropic):
        """Test selecting single tool with parameters."""
        # Mock LLM response with a single tool_use content block
        _install_client(
            mock_anthropic,
            _response_with(
                _tool_use_block("search", {"query": "capital of France"}, "call_001"),
            ),
        )

        # Test tool selection
        tool_calls = select_tools_with_function_calling(
            question="What is the capital of France?",
            plan="1. Search for capital of France",
            available_tools=TOOLS
        )

        assert isinstance(tool_calls, list)
        assert len(tool_calls) == 1
        assert tool_calls[0]["tool"] == "search"
        assert "query" in tool_calls[0]["params"]
        print(f"✓ Selected tool: {tool_calls[0]}")

    @patch('src.agent.llm_client.Anthropic')
    def test_select_multiple_tools(self, mock_anthropic):
        """Test selecting multiple tools in sequence."""
        # Mock LLM response with multiple tool_use blocks
        _install_client(
            mock_anthropic,
            _response_with(
                _tool_use_block("parse_file", {"file_path": "data.xlsx"}, "call_001"),
                _tool_use_block("safe_eval", {"expression": "sum(values)"}, "call_002"),
            ),
        )

        # Test tool selection
        tool_calls = select_tools_with_function_calling(
            question="What is the sum in data.xlsx?",
            plan="1. Parse file\n2. Calculate sum",
            available_tools=TOOLS
        )

        assert isinstance(tool_calls, list)
        assert len(tool_calls) == 2
        # Order of the returned calls must follow the order of the blocks
        assert tool_calls[0]["tool"] == "parse_file"
        assert tool_calls[1]["tool"] == "safe_eval"
        print(f"✓ Selected {len(tool_calls)} tools")


class TestAnswerSynthesis:
    """Test LLM-based answer synthesis."""

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_simple_answer(self, mock_anthropic):
        """Test synthesizing answer from single evidence."""
        # Mock LLM response
        _install_client(mock_anthropic, _text_response("Paris"))

        # Test answer synthesis
        answer = synthesize_answer(
            question="What is the capital of France?",
            evidence=["[search] Paris is the capital and most populous city of France"]
        )

        assert isinstance(answer, str)
        assert len(answer) > 0
        assert answer == "Paris"
        print(f"✓ Synthesized answer: {answer}")

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_from_multiple_evidence(self, mock_anthropic):
        """Test synthesizing answer from multiple evidence sources."""
        # Mock LLM response
        _install_client(mock_anthropic, _text_response("42"))

        evidence = [
            "[search] The answer to life is 42",
            "[safe_eval] 6 * 7 = 42",
            "[parse_file] Result: 42"
        ]

        # Test answer synthesis with multiple evidence
        answer = synthesize_answer(
            question="What is the answer?",
            evidence=evidence
        )

        assert isinstance(answer, str)
        assert answer == "42"
        # Count is derived from the list so the message cannot drift from the data
        print(f"✓ Synthesized answer from {len(evidence)} evidence items: {answer}")

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_with_conflicts(self, mock_anthropic):
        """Test synthesizing answer when evidence conflicts."""
        # Mock LLM response - should resolve conflict
        # NOTE(review): the LLM is mocked to return "Paris", so this presumably
        # checks that the mocked text is returned for conflicting evidence
        # rather than exercising real conflict resolution - confirm intent.
        _install_client(mock_anthropic, _text_response("Paris"))

        # Test answer synthesis with conflicting evidence
        answer = synthesize_answer(
            question="What is the capital of France?",
            evidence=[
                "[search] Paris is the capital of France (source: Wikipedia, 2024)",
                "[search] Lyon was briefly capital during revolution (source: old text, 1793)"
            ]
        )

        assert isinstance(answer, str)
        assert answer == "Paris"  # Should pick more recent/credible source
        print(f"✓ Resolved conflict, answer: {answer}")


class TestEndToEndWorkflow:
    """Test full agent workflow with mocked LLM."""

    # Decorators apply bottom-up: the tavily patch supplies the first mock
    # argument, the Anthropic patch the second.
    @patch('src.agent.llm_client.Anthropic')
    @patch('src.tools.web_search.tavily_search')
    def test_full_search_workflow(self, mock_tavily, mock_anthropic):
        """Test complete workflow: plan → search → answer."""
        # Imported inside the test, i.e. while both patches are active.
        from src.agent import GAIAAgent

        # Mock tool execution
        mock_tavily.return_value = "Paris is the capital and most populous city of France"

        # Mock LLM responses, consumed one per messages.create call:
        # Response 1: Planning
        # Response 2: Tool selection (function calling)
        # Response 3: Answer synthesis
        _install_client(
            mock_anthropic,
            _text_response("1. Search for capital of France"),
            _response_with(
                _tool_use_block("web_search", {"query": "capital of France"}, "call_001"),
            ),
            _text_response("Paris"),
        )

        # Test full workflow
        agent = GAIAAgent()
        answer = agent("What is the capital of France?")

        assert isinstance(answer, str)
        assert answer == "Paris"
        print(f"✓ Full workflow completed, answer: {answer}")


def _run_manual_validation():
    """Run every test once without pytest, for quick validation.

    The ``@patch`` decorators inject their mocks on a direct call, so each
    method is invoked with no arguments. Any failing assertion propagates and
    stops the run before the final success banner is printed.
    """
    banner = "=" * 70

    print("\n" + banner)
    print("GAIA Agent - Stage 3 LLM Integration Tests")
    print(banner + "\n")

    test_plan = TestPlanningFunction()
    test_plan.test_plan_question_basic()
    test_plan.test_plan_with_files()

    test_tools = TestToolSelection()
    test_tools.test_select_single_tool()
    test_tools.test_select_multiple_tools()

    test_answer = TestAnswerSynthesis()
    test_answer.test_synthesize_simple_answer()
    test_answer.test_synthesize_from_multiple_evidence()
    test_answer.test_synthesize_with_conflicts()

    test_e2e = TestEndToEndWorkflow()
    test_e2e.test_full_search_workflow()

    print("\n" + banner)
    print("✓ All Stage 3 LLM integration tests passed!")
    print(banner + "\n")


if __name__ == "__main__":
    _run_manual_validation()