File size: 8,404 Bytes
31a2688
 
 
 
a493f04
 
 
 
 
 
 
31a2688
 
 
 
 
a493f04
9612292
a493f04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31a2688
 
 
 
 
a493f04
9612292
a493f04
 
 
 
 
 
9612292
a493f04
 
 
 
31a2688
a493f04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31a2688
 
a493f04
 
 
 
31a2688
 
a493f04
31a2688
 
 
 
 
a493f04
 
31a2688
 
a493f04
 
 
 
 
 
 
31a2688
 
a493f04
31a2688
 
a493f04
 
 
 
 
 
 
31a2688
 
a493f04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31a2688
 
a493f04
 
 
31a2688
 
 
 
 
a493f04
31a2688
a493f04
31a2688
 
 
a493f04
31a2688
a493f04
 
31a2688
 
a493f04
 
 
 
 
 
 
31a2688
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
"""Tests for src.evaluation.evaluator."""

from unittest.mock import MagicMock, patch

import pandas as pd

from src.evaluation.evaluator import (
    _CORRECTNESS_REF_CHAIN,
    _GROUNDING_REF_CHAIN,
    RAGEvaluator,
)

# Dotted import path of the module under test. The tests in this file prefix it
# to attribute names to build ``unittest.mock.patch`` targets.
EVAL_MODULE = "src.evaluation.evaluator"


def _make_evaluator() -> RAGEvaluator:
    """Build a RAGEvaluator from MagicMock stand-ins for the LLM and embeddings.

    ``LangchainLLMWrapper`` and ``LangchainEmbeddingsWrapper`` are patched in the
    evaluator module only while the constructor runs.
    """
    llm_wrapper_patch = patch(f"{EVAL_MODULE}.LangchainLLMWrapper")
    embeddings_wrapper_patch = patch(f"{EVAL_MODULE}.LangchainEmbeddingsWrapper")
    with llm_wrapper_patch, embeddings_wrapper_patch:
        evaluator = RAGEvaluator(llm=MagicMock(), embeddings=MagicMock())
    return evaluator


def _make_grounding_df() -> pd.DataFrame:
    """Fake RAGAS dataframe for the grounding pass (Danish reference)."""
    return pd.DataFrame(
        [
            {
                "user_input": "What is KU?",
                "retrieved_contexts": ["KU er Københavns Universitet."],
                "reference": "KU er Københavns Universitet.",
                "response": "KU is a university.",
                "faithfulness": 0.85,
                "answer_relevancy": 0.90,
                "llm_context_precision_with_reference": 0.75,
                "context_recall": 0.80,
            }
        ]
    )


def _make_correctness_df() -> pd.DataFrame:
    """Fake RAGAS dataframe for the correctness pass (English reference)."""
    return pd.DataFrame(
        [
            {
                "user_input": "What is KU?",
                "retrieved_contexts": ["KU er Københavns Universitet."],
                "reference": "KU is the University of Copenhagen.",
                "response": "KU is a university.",
                "answer_correctness": 0.72,
                "factual_correctness(mode=f1)": 0.65,
            }
        ]
    )


def _make_retrieval_result_df() -> pd.DataFrame:
    """Build a fake RAGAS result dataframe with only retrieval metrics."""
    return pd.DataFrame(
        [
            {
                "user_input": "What is KU?",
                "retrieved_contexts": ["KU er Københavns Universitet."],
                "reference": "KU er Københavns Universitet.",
                "llm_context_precision_with_reference": 0.75,
                "context_recall": 0.80,
            }
        ]
    )


class TestRAGEvaluator:
    """Unit tests for RAGEvaluator: construction, reference lookup and scoring."""

    @patch(f"{EVAL_MODULE}.LangchainEmbeddingsWrapper")
    @patch(f"{EVAL_MODULE}.LangchainLLMWrapper")
    def test_init_stores_llm_and_embeddings(
        self,
        mock_llm_wrapper: MagicMock,
        mock_emb_wrapper: MagicMock,
    ) -> None:
        """The constructor hands the LLM and embeddings to their wrappers."""
        raw_llm, raw_embeddings = MagicMock(), MagicMock()

        subject = RAGEvaluator(llm=raw_llm, embeddings=raw_embeddings)

        # Each wrapper must have been built exactly once, from the raw object.
        mock_llm_wrapper.assert_called_once_with(raw_llm)
        mock_emb_wrapper.assert_called_once_with(raw_embeddings)
        assert subject._llm is not None
        assert subject._embeddings is not None

    def test_resolve_reference_with_string(self) -> None:
        """A plain-string ground truth is returned unchanged."""
        resolved = RAGEvaluator._resolve_reference("hello", _GROUNDING_REF_CHAIN)
        assert resolved == "hello"

    def test_resolve_reference_grounding_prefers_danish_quote(self) -> None:
        """With both keys present, the grounding chain picks source_quote_da."""
        ground_truth = {
            "source_quote_da": "Danish text",
            "reference_en": "English text",
        }
        resolved = RAGEvaluator._resolve_reference(
            ground_truth, _GROUNDING_REF_CHAIN
        )
        assert resolved == "Danish text"

    def test_resolve_reference_correctness_prefers_english(self) -> None:
        """With both keys present, the correctness chain picks reference_en."""
        ground_truth = {
            "source_quote_da": "Danish text",
            "reference_en": "English text",
        }
        resolved = RAGEvaluator._resolve_reference(
            ground_truth, _CORRECTNESS_REF_CHAIN
        )
        assert resolved == "English text"

    def test_resolve_reference_falls_back_to_other_key(self) -> None:
        """If the preferred key is absent, the next key in the chain is used."""
        # Only the English entry exists, so the grounding chain has to fall back.
        english_only = {"reference_en": "English text"}
        resolved = RAGEvaluator._resolve_reference(
            english_only, _GROUNDING_REF_CHAIN
        )
        assert resolved == "English text"

    @patch(f"{EVAL_MODULE}.evaluate")
    @patch(f"{EVAL_MODULE}.FactualCorrectness")
    @patch(f"{EVAL_MODULE}.AnswerCorrectness")
    @patch(f"{EVAL_MODULE}.LLMContextRecall")
    @patch(f"{EVAL_MODULE}.LLMContextPrecisionWithReference")
    @patch(f"{EVAL_MODULE}.AnswerRelevancy")
    @patch(f"{EVAL_MODULE}.Faithfulness")
    def test_evaluate_runs_two_passes_and_merges_results(
        self,
        mock_faith: MagicMock,
        mock_relevancy: MagicMock,
        mock_precision: MagicMock,
        mock_recall: MagicMock,
        mock_answer_correctness: MagicMock,
        mock_factual_correctness: MagicMock,
        mock_evaluate: MagicMock,
    ) -> None:
        """evaluate() performs two scoring passes and merges them per sample."""
        # The patched evaluate() yields the grounding result first, then the
        # correctness result.
        grounding_pass = MagicMock()
        correctness_pass = MagicMock()
        grounding_pass.to_pandas.return_value = _make_grounding_df()
        correctness_pass.to_pandas.return_value = _make_correctness_df()
        mock_evaluate.side_effect = [grounding_pass, correctness_pass]

        danish_reference = "KU er Københavns Universitet."
        english_reference = "KU is the University of Copenhagen."
        subject = _make_evaluator()
        result = subject.evaluate(
            questions=["What is KU?"],
            answers=["KU is a university."],
            contexts=[[danish_reference]],
            ground_truths=[
                {
                    "source_quote_da": danish_reference,
                    "reference_en": english_reference,
                }
            ],
        )

        # One evaluate() call per metric family.
        assert mock_evaluate.call_count == 2

        # Grounding metrics first, then correctness metrics.
        expected_aggregate = {
            "faithfulness": 0.85,
            "answer_relevancy": 0.90,
            "llm_context_precision_with_reference": 0.75,
            "context_recall": 0.80,
            "answer_correctness": 0.72,
            "factual_correctness(mode=f1)": 0.65,
        }
        aggregate = result["aggregate"]
        for metric_name, expected_score in expected_aggregate.items():
            assert aggregate[metric_name] == expected_score

        samples = result["per_sample"]
        assert len(samples) == 1
        merged_row = samples[0]
        assert merged_row["user_input"] == "What is KU?"
        # "reference" keeps the Danish text; the English text is exposed
        # separately under "reference_en".
        assert merged_row["reference"] == danish_reference
        assert merged_row["reference_en"] == english_reference
        # The merged row carries scores from both passes.
        assert merged_row["faithfulness"] == 0.85
        assert merged_row["answer_correctness"] == 0.72

    @patch(f"{EVAL_MODULE}.evaluate")
    @patch(f"{EVAL_MODULE}.LLMContextRecall")
    @patch(f"{EVAL_MODULE}.LLMContextPrecisionWithReference")
    def test_evaluate_retrieval_returns_only_retrieval_metrics(
        self,
        mock_precision: MagicMock,
        mock_recall: MagicMock,
        mock_evaluate: MagicMock,
    ) -> None:
        """evaluate_retrieval() reports context precision and recall only."""
        retrieval_pass = MagicMock()
        retrieval_pass.to_pandas.return_value = _make_retrieval_result_df()
        mock_evaluate.return_value = retrieval_pass

        danish_reference = "KU er Københavns Universitet."
        subject = _make_evaluator()
        result = subject.evaluate_retrieval(
            questions=["What is KU?"],
            contexts=[[danish_reference]],
            ground_truths=[danish_reference],
        )

        aggregate = result["aggregate"]
        for present in ("llm_context_precision_with_reference", "context_recall"):
            assert present in aggregate
        for absent in ("faithfulness", "answer_relevancy", "answer_correctness"):
            assert absent not in aggregate
        assert len(result["per_sample"]) == 1
        mock_evaluate.assert_called_once()