"""
LLM Integration Tests - Stage 3 Validation
Author: @mangubee
Date: 2026-01-02

Tests for Stage 3 LLM integration:
- Planning with LLM
- Tool selection via function calling
- Answer synthesis from evidence
- Full workflow with mocked LLM responses
"""

import pytest
from unittest.mock import patch, MagicMock
from src.agent.llm_client import (
    plan_question,
    select_tools_with_function_calling,
    synthesize_answer
)
from src.tools import TOOLS
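
# The tests below exercise src.agent.llm_client against a fully mocked
# Anthropic client; no network calls are made. Assumed signatures, inferred
# from usage in these tests (not authoritative):
#
#   plan_question(question, available_tools, file_paths=None) -> str
#   select_tools_with_function_calling(question, plan, available_tools)
#       -> list[dict]  # e.g. [{"tool": "search", "params": {"query": "..."}}]
#   synthesize_answer(question, evidence) -> str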


class TestPlanningFunction:
    """Test LLM-based planning function."""

    @patch('src.agent.llm_client.Anthropic')
    def test_plan_question_basic(self, mock_anthropic):
        """Test planning with simple question."""
        # Mock LLM response
        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.content = [MagicMock(text="1. Search for information\n2. Analyze results")]
        mock_client.messages.create.return_value = mock_response
        mock_anthropic.return_value = mock_client

        # Test planning
        plan = plan_question(
            question="What is the capital of France?",
            available_tools=TOOLS
        )

        assert isinstance(plan, str)
        assert len(plan) > 0
        print(f"βœ“ Generated plan: {plan[:50]}...")

    @patch('src.agent.llm_client.Anthropic')
    def test_plan_with_files(self, mock_anthropic):
        """Test planning with file context."""
        # Mock LLM response
        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.content = [MagicMock(text="1. Parse file\n2. Extract data\n3. Calculate answer")]
        mock_client.messages.create.return_value = mock_response
        mock_anthropic.return_value = mock_client

        # Test planning with files
        plan = plan_question(
            question="What is the total in the spreadsheet?",
            available_tools=TOOLS,
            file_paths=["data.xlsx"]
        )

        assert isinstance(plan, str)
        assert len(plan) > 0
        print(f"βœ“ Generated plan with files: {plan[:50]}...")


class TestToolSelection:
    """Test LLM function calling for tool selection."""

    @patch('src.agent.llm_client.Anthropic')
    def test_select_single_tool(self, mock_anthropic):
        """Test selecting single tool with parameters."""
        # Mock LLM response with function call
        mock_client = MagicMock()
        mock_response = MagicMock()

        # Mock tool_use content block
        mock_tool_use = MagicMock()
        mock_tool_use.type = "tool_use"
        mock_tool_use.name = "search"
        mock_tool_use.input = {"query": "capital of France"}
        mock_tool_use.id = "call_001"

        mock_response.content = [mock_tool_use]
        mock_client.messages.create.return_value = mock_response
        mock_anthropic.return_value = mock_client

        # Test tool selection
        tool_calls = select_tools_with_function_calling(
            question="What is the capital of France?",
            plan="1. Search for capital of France",
            available_tools=TOOLS
        )

        assert isinstance(tool_calls, list)
        assert len(tool_calls) == 1
        assert tool_calls[0]["tool"] == "search"
        assert "query" in tool_calls[0]["params"]
        print(f"βœ“ Selected tool: {tool_calls[0]}")

    @patch('src.agent.llm_client.Anthropic')
    def test_select_multiple_tools(self, mock_anthropic):
        """Test selecting multiple tools in sequence."""
        # Mock LLM response with multiple function calls
        mock_client = MagicMock()
        mock_response = MagicMock()

        # Mock multiple tool_use blocks
        mock_tool1 = MagicMock()
        mock_tool1.type = "tool_use"
        mock_tool1.name = "parse_file"
        mock_tool1.input = {"file_path": "data.xlsx"}
        mock_tool1.id = "call_001"

        mock_tool2 = MagicMock()
        mock_tool2.type = "tool_use"
        mock_tool2.name = "safe_eval"
        mock_tool2.input = {"expression": "sum(values)"}
        mock_tool2.id = "call_002"

        mock_response.content = [mock_tool1, mock_tool2]
        mock_client.messages.create.return_value = mock_response
        mock_anthropic.return_value = mock_client

        # Test tool selection
        tool_calls = select_tools_with_function_calling(
            question="What is the sum in data.xlsx?",
            plan="1. Parse file\n2. Calculate sum",
            available_tools=TOOLS
        )

        assert isinstance(tool_calls, list)
        assert len(tool_calls) == 2
        assert tool_calls[0]["tool"] == "parse_file"
        assert tool_calls[1]["tool"] == "safe_eval"
        print(f"βœ“ Selected {len(tool_calls)} tools")


class TestAnswerSynthesis:
    """Test LLM-based answer synthesis."""

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_simple_answer(self, mock_anthropic):
        """Test synthesizing answer from single evidence."""
        # Mock LLM response
        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.content = [MagicMock(text="Paris")]
        mock_client.messages.create.return_value = mock_response
        mock_anthropic.return_value = mock_client

        # Test answer synthesis
        answer = synthesize_answer(
            question="What is the capital of France?",
            evidence=["[search] Paris is the capital and most populous city of France"]
        )

        assert isinstance(answer, str)
        assert len(answer) > 0
        assert answer == "Paris"
        print(f"βœ“ Synthesized answer: {answer}")

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_from_multiple_evidence(self, mock_anthropic):
        """Test synthesizing answer from multiple evidence sources."""
        # Mock LLM response
        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.content = [MagicMock(text="42")]
        mock_client.messages.create.return_value = mock_response
        mock_anthropic.return_value = mock_client

        # Test answer synthesis with multiple evidence
        answer = synthesize_answer(
            question="What is the answer?",
            evidence=[
                "[search] The answer to life is 42",
                "[safe_eval] 6 * 7 = 42",
                "[parse_file] Result: 42"
            ]
        )

        assert isinstance(answer, str)
        assert answer == "42"
        print(f"βœ“ Synthesized answer from {3} evidence items: {answer}")

    @patch('src.agent.llm_client.Anthropic')
    def test_synthesize_with_conflicts(self, mock_anthropic):
        """Test synthesizing answer when evidence conflicts."""
        # Mock LLM response - should resolve conflict
        mock_client = MagicMock()
        mock_response = MagicMock()
        mock_response.content = [MagicMock(text="Paris")]
        mock_client.messages.create.return_value = mock_response
        mock_anthropic.return_value = mock_client

        # Test answer synthesis with conflicting evidence
        answer = synthesize_answer(
            question="What is the capital of France?",
            evidence=[
                "[search] Paris is the capital of France (source: Wikipedia, 2024)",
                "[search] Lyon was briefly capital during revolution (source: old text, 1793)"
            ]
        )

        assert isinstance(answer, str)
        assert answer == "Paris"  # Should pick more recent/credible source
        print(f"βœ“ Resolved conflict, answer: {answer}")


class TestEndToEndWorkflow:
    """Test full agent workflow with mocked LLM."""

    @patch('src.agent.llm_client.Anthropic')
    @patch('src.tools.web_search.tavily_search')
    def test_full_search_workflow(self, mock_tavily, mock_anthropic):
        """Test complete workflow: plan β†’ search β†’ answer."""
        from src.agent import GAIAAgent

        # Mock tool execution
        mock_tavily.return_value = "Paris is the capital and most populous city of France"

        # Mock LLM responses
        mock_client = MagicMock()

        # Response 1: Planning
        # Response 2: Tool selection (function calling)
        # Response 3: Answer synthesis

        mock_plan_response = MagicMock()
        mock_plan_response.content = [MagicMock(text="1. Search for capital of France")]

        mock_tool_response = MagicMock()
        mock_tool_use = MagicMock()
        mock_tool_use.type = "tool_use"
        mock_tool_use.name = "web_search"
        mock_tool_use.input = {"query": "capital of France"}
        mock_tool_use.id = "call_001"
        mock_tool_response.content = [mock_tool_use]

        mock_answer_response = MagicMock()
        mock_answer_response.content = [MagicMock(text="Paris")]

        # Set up mock to return different responses for each call
        mock_client.messages.create.side_effect = [
            mock_plan_response,
            mock_tool_response,
            mock_answer_response
        ]

        mock_anthropic.return_value = mock_client

        # Test full workflow
        agent = GAIAAgent()
        answer = agent("What is the capital of France?")

        assert isinstance(answer, str)
        assert answer == "Paris"
        print(f"βœ“ Full workflow completed, answer: {answer}")


if __name__ == "__main__":
    print("\n" + "="*70)
    print("GAIA Agent - Stage 3 LLM Integration Tests")
    print("="*70 + "\n")

    # Run tests manually for quick validation
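    # (the @patch decorators still start their patches and inject mocks when
    # the test methods are called directly, so no pytest runner is required
    # for this quick pass)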
    test_plan = TestPlanningFunction()
    test_plan.test_plan_question_basic()
    test_plan.test_plan_with_files()

    test_tools = TestToolSelection()
    test_tools.test_select_single_tool()
    test_tools.test_select_multiple_tools()

    test_answer = TestAnswerSynthesis()
    test_answer.test_synthesize_simple_answer()
    test_answer.test_synthesize_from_multiple_evidence()
    test_answer.test_synthesize_with_conflicts()

    test_e2e = TestEndToEndWorkflow()
    test_e2e.test_full_search_workflow()

    print("\n" + "="*70)
    print("βœ“ All Stage 3 LLM integration tests passed!")
    print("="*70 + "\n")