Spaces:

minhtudragon
/

headroom

Running

File size: 9,744 Bytes

eb1c19a

"""Tests for the memory evaluation framework."""

from headroom.evals.memory.judge import _parse_judge_response, simple_judge
from headroom.evals.memory.locomo import (
    LOCOMO_CATEGORIES,
    DialogueTurn,
    LoCoMoCase,
    LoCoMoConversation,
    Session,
    get_locomo_stats,
)


class TestLoCoMoDataStructures:
    """Test LoCoMo data structures."""

    def test_dialogue_turn_from_dict(self):
        """Test DialogueTurn parsing."""
        data = {
            "speaker": "Alice",
            "text": "Hello Bob!",
            "dia_id": "D1:1",
        }
        turn = DialogueTurn.from_dict(data)

        assert turn.speaker == "Alice"
        assert turn.text == "Hello Bob!"
        assert turn.dia_id == "D1:1"
        assert turn.image_url is None

    def test_dialogue_turn_with_image(self):
        """Test DialogueTurn with image."""
        data = {
            "speaker": "Bob",
            "text": "Check this out",
            "dia_id": "D1:2",
            "img_file": "http://example.com/img.jpg",
            "blip_caption": "A beautiful sunset",
        }
        turn = DialogueTurn.from_dict(data)

        assert turn.image_url == "http://example.com/img.jpg"
        assert turn.image_caption == "A beautiful sunset"

    def test_dialogue_turn_to_message_format(self):
        """Test message format conversion."""
        turn = DialogueTurn(
            speaker="Alice",
            text="I love Python",
            dia_id="D1:1",
        )
        msg = turn.to_message_format()
        assert msg == "Alice: I love Python"

        # With image
        turn_img = DialogueTurn(
            speaker="Bob",
            text="Look at this",
            dia_id="D1:2",
            image_url="http://example.com/img.jpg",
            image_caption="A dog playing",
        )
        msg_img = turn_img.to_message_format()
        assert "[shares image: A dog playing]" in msg_img

    def test_session_properties(self):
        """Test Session properties."""
        dialogues = [
            DialogueTurn(speaker="Alice", text="Hi", dia_id="D1:1"),
            DialogueTurn(speaker="Bob", text="Hello", dia_id="D1:2"),
        ]
        session = Session(session_num=1, datetime="2024-01-15", dialogues=dialogues)

        assert session.num_turns == 2
        assert "Alice: Hi" in session.text
        assert "Bob: Hello" in session.text

    def test_locomo_case_properties(self):
        """Test LoCoMoCase properties."""
        case = LoCoMoCase(
            question="What is Alice's favorite color?",
            answer="Blue",
            category=1,
            evidence=["D1:5", "D2:3"],
            conversation_id="sample_1",
        )

        assert case.category_name == "single_hop"
        assert case.is_answerable is True

        # Test unanswerable case
        case_na = LoCoMoCase(
            question="What is unknown?",
            answer="N/A",
            category=5,
            evidence=[],
            conversation_id="sample_1",
        )
        assert case_na.is_answerable is False

    def test_locomo_categories(self):
        """Test category definitions."""
        assert LOCOMO_CATEGORIES[1] == "single_hop"
        assert LOCOMO_CATEGORIES[2] == "temporal"
        assert LOCOMO_CATEGORIES[3] == "multi_hop"
        assert LOCOMO_CATEGORIES[4] == "open_domain"
        assert LOCOMO_CATEGORIES[5] == "adversarial"


class TestLoCoMoStats:
    """Test LoCoMo statistics."""

    def test_get_stats_empty(self):
        """Test stats with empty list."""
        stats = get_locomo_stats([])
        assert stats["num_conversations"] == 0
        assert stats["num_qa_pairs"] == 0

    def test_get_stats_with_data(self):
        """Test stats calculation."""
        # Create mock conversation
        dialogues = [
            DialogueTurn(speaker="A", text="Hello", dia_id="D1:1"),
            DialogueTurn(speaker="B", text="Hi there", dia_id="D1:2"),
        ]
        session = Session(session_num=1, datetime="2024-01-15", dialogues=dialogues)

        qa_cases = [
            LoCoMoCase(question="Q1", answer="A1", category=1, evidence=[], conversation_id="s1"),
            LoCoMoCase(question="Q2", answer="A2", category=2, evidence=[], conversation_id="s1"),
        ]

        conv = LoCoMoConversation(
            sample_id="s1",
            speaker_a="Alice",
            speaker_b="Bob",
            sessions=[session],
            qa_cases=qa_cases,
        )

        stats = get_locomo_stats([conv])

        assert stats["num_conversations"] == 1
        assert stats["num_sessions"] == 1
        assert stats["num_turns"] == 2
        assert stats["num_qa_pairs"] == 2
        assert "single_hop" in stats["questions_by_category"]
        assert "temporal" in stats["questions_by_category"]


class TestJudge:
    """Test LLM judge functions."""

    def test_parse_judge_response_standard(self):
        """Test parsing standard judge response."""
        response = """Reasoning: The prediction captures the main point.
Score: 4"""

        score, reasoning = _parse_judge_response(response)
        assert score == 4.0
        assert "main point" in reasoning

    def test_parse_judge_response_with_decimal(self):
        """Test parsing score with decimal."""
        response = """Reasoning: Partially correct.
Score: 3.5"""

        score, reasoning = _parse_judge_response(response)
        assert score == 3.5

    def test_parse_judge_response_clamping(self):
        """Test score clamping to valid range."""
        # Score too high
        response = "Reasoning: Perfect\nScore: 10"
        score, _ = _parse_judge_response(response)
        assert score == 5.0

        # Score too low
        response = "Reasoning: Terrible\nScore: 0"
        score, _ = _parse_judge_response(response)
        assert score == 1.0

    def test_simple_judge_exact_match(self):
        """Test simple judge with exact match."""
        score, reasoning = simple_judge(
            "What color?",
            "Blue",
            "Blue",
        )
        assert score == 5.0
        assert "Exact match" in reasoning

    def test_simple_judge_high_overlap(self):
        """Test simple judge with high F1."""
        score, reasoning = simple_judge(
            "What happened?",
            "Alice went to the store to buy groceries",
            "Alice went to the store for groceries",
        )
        assert score >= 4.0
        assert "F1" in reasoning

    def test_simple_judge_no_overlap(self):
        """Test simple judge with no overlap."""
        score, reasoning = simple_judge(
            "What color?",
            "Blue",
            "The weather is nice",
        )
        assert score == 1.0
        assert "Very low" in reasoning


class TestMemoryEvalConfig:
    """Test MemoryEvalConfig."""

    def test_default_config(self):
        """Test default configuration."""
        from headroom.evals.memory import MemoryEvalConfig

        config = MemoryEvalConfig()

        assert config.n_conversations is None
        assert config.skip_adversarial is True
        assert config.top_k_memories == 10
        assert config.llm_judge_enabled is False
        assert config.f1_threshold == 0.5

    def test_custom_config(self):
        """Test custom configuration."""
        from headroom.evals.memory import MemoryEvalConfig

        config = MemoryEvalConfig(
            n_conversations=5,
            categories=[1, 2],
            top_k_memories=20,
            llm_judge_enabled=True,
            f1_threshold=0.7,
        )

        assert config.n_conversations == 5
        assert config.categories == [1, 2]
        assert config.top_k_memories == 20
        assert config.llm_judge_enabled is True
        assert config.f1_threshold == 0.7


class TestMemoryEvalResult:
    """Test MemoryEvalResult and MemoryEvalSuiteResult."""

    def test_eval_result_to_dict(self):
        """Test result serialization."""
        from headroom.evals.memory.runner import MemoryEvalResult

        case = LoCoMoCase(
            question="What color?",
            answer="Blue",
            category=1,
            evidence=[],
            conversation_id="s1",
        )

        result = MemoryEvalResult(
            case=case,
            predicted_answer="Blue",
            retrieved_memories=["Memory 1", "Memory 2"],
            retrieval_scores=[0.9, 0.8],
            f1_score=1.0,
            exact_match=True,
            is_correct=True,
        )

        d = result.to_dict()
        assert d["question"] == "What color?"
        assert d["ground_truth"] == "Blue"
        assert d["predicted"] == "Blue"
        assert d["f1_score"] == 1.0
        assert d["is_correct"] is True

    def test_suite_result_summary(self):
        """Test suite result summary generation."""
        from headroom.evals.memory.runner import MemoryEvalSuiteResult

        suite_result = MemoryEvalSuiteResult(
            total_cases=100,
            correct_cases=75,
            accuracy=0.75,
            avg_f1_score=0.82,
            exact_match_rate=0.5,
            avg_llm_judge_score=4.2,
            metrics_by_category={
                "single_hop": {"count": 30, "accuracy": 0.9, "avg_f1": 0.88, "correct": 27},
                "temporal": {"count": 25, "accuracy": 0.7, "avg_f1": 0.75, "correct": 18},
            },
            total_duration_seconds=120.5,
            avg_retrieval_latency_ms=15.3,
            avg_generation_latency_ms=250.0,
        )

        summary = suite_result.summary()
        assert "100" in summary
        assert "75" in summary  # Accuracy percentage
        assert "0.820" in summary  # F1 score
        assert "single_hop" in summary
        assert "temporal" in summary