"""
Tests for the RAG evaluation system.
"""

import pytest
from src.evaluation import RAGEvaluator, RAGEvaluationResult


class TestRAGEvaluationResult:
    """Testes para classe RAGEvaluationResult."""

    def test_create_result(self):
        """Testa criacao de resultado."""
        result = RAGEvaluationResult(
            query="teste query",
            response="teste response",
            contexts=["contexto 1", "contexto 2"],
            ground_truth="verdade"
        )

        assert result.query == "teste query"
        assert result.response == "teste response"
        assert len(result.contexts) == 2
        assert result.ground_truth == "verdade"

    def test_to_dict(self):
        """Testa conversao para dicionario."""
        result = RAGEvaluationResult(
            query="query",
            response="response",
            contexts=["ctx"],
            faithfulness=0.8
        )

        data = result.to_dict()

        assert data['query'] == "query"
        assert data['response'] == "response"
        assert data['faithfulness'] == 0.8

    def test_get_overall_score_all_metrics(self):
        """Testa calculo de score geral com todas metricas."""
        result = RAGEvaluationResult(
            query="q",
            response="r",
            contexts=["c"],
            faithfulness=0.8,
            answer_relevancy=0.9,
            context_precision=0.7,
            context_recall=0.85
        )

        score = result.get_overall_score()

        assert 0 <= score <= 1
        assert abs(score - 0.8125) < 0.01  # Mean: (0.8 + 0.9 + 0.7 + 0.85) / 4

    def test_get_overall_score_partial_metrics(self):
        """Testa score geral com metricas parciais."""
        result = RAGEvaluationResult(
            query="q",
            response="r",
            contexts=["c"],
            faithfulness=0.8,
            answer_relevancy=0.9
        )

        score = result.get_overall_score()

        assert abs(score - 0.85) < 0.01  # Mean: (0.8 + 0.9) / 2

    def test_get_overall_score_no_metrics(self):
        """Testa score geral sem metricas."""
        result = RAGEvaluationResult(
            query="q",
            response="r",
            contexts=["c"]
        )

        score = result.get_overall_score()

        assert score == 0.0
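
    # A note on the assertions above: get_overall_score() is assumed to average
    # whichever of the four metrics (faithfulness, answer_relevancy,
    # context_precision, context_recall) are set, and to return 0.0 when none
    # are. A minimal sketch of that assumed behavior, not necessarily the
    # actual implementation in src.evaluation:
    #
    #     def get_overall_score(self) -> float:
    #         metrics = [self.faithfulness, self.answer_relevancy,
    #                    self.context_precision, self.context_recall]
    #         present = [m for m in metrics if m is not None]
    #         return sum(present) / len(present) if present else 0.0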


class TestRAGEvaluator:
    """Testes para classe RAGEvaluator."""

    @pytest.fixture
    def evaluator_simple(self):
        """Evaluador com metricas simplificadas."""
        return RAGEvaluator(use_ragas=False)

    def test_create_evaluator(self):
        """Testa criacao de evaluador."""
        evaluator = RAGEvaluator(use_ragas=False)
        assert evaluator is not None
        assert not evaluator.use_ragas

    def test_evaluate_single_simple(self, evaluator_simple):
        """Testa avaliacao simples de caso unico."""
        result = evaluator_simple.evaluate_single(
            query="O que e Python?",
            response="Python e uma linguagem de programacao.",
            contexts=["Python e uma linguagem de programacao moderna."]
        )

        assert result is not None
        assert result.faithfulness is not None
        assert result.answer_relevancy is not None
        assert result.context_precision is not None
        assert 0 <= result.faithfulness <= 1
        assert 0 <= result.answer_relevancy <= 1

    def test_calculate_faithfulness_simple(self, evaluator_simple):
        """Testa calculo de faithfulness."""
        response = "Python e uma linguagem de programacao"
        contexts = ["Python e uma linguagem de programacao moderna"]

        score = evaluator_simple._calculate_faithfulness_simple(response, contexts)

        assert 0 <= score <= 1
        assert score > 0  # There should be word overlap
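
    # _calculate_faithfulness_simple is assumed to be a bag-of-words proxy for
    # groundedness: the fraction of response tokens that also occur in the
    # retrieved contexts. A hypothetical sketch consistent with these tests
    # (the real method may tokenize differently):
    #
    #     def _calculate_faithfulness_simple(self, response, contexts):
    #         response_words = set(response.lower().split())
    #         context_words = set(" ".join(contexts).lower().split())
    #         if not response_words:
    #             return 0.0
    #         return len(response_words & context_words) / len(response_words)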

    def test_calculate_faithfulness_no_overlap(self, evaluator_simple):
        """Testa faithfulness sem overlap."""
        response = "Java e estaticamente tipada"
        contexts = ["Python e dinamicamente tipada"]

        score = evaluator_simple._calculate_faithfulness_simple(response, contexts)

        assert 0 <= score <= 1

    def test_calculate_relevancy_simple(self, evaluator_simple):
        """Testa calculo de relevancia."""
        query = "O que e Python"
        response = "Python e uma linguagem de programacao"

        score = evaluator_simple._calculate_relevancy_simple(query, response)

        assert 0 <= score <= 1
        assert score > 0  # "Python" appears in both query and response

    def test_calculate_precision_simple(self, evaluator_simple):
        """Testa calculo de precisao."""
        query = "Python linguagem"
        contexts = [
            "Python e uma linguagem",
            "Java e outra linguagem",
            "JavaScript nao tem nada"
        ]

        score = evaluator_simple._calculate_precision_simple(query, contexts)

        assert 0 <= score <= 1
        # Should match "Python" in context 1 and "language" in contexts 1 and 2
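
    # _calculate_precision_simple is assumed to measure how many of the
    # retrieved contexts share at least one token with the query. A sketch of
    # that assumption (again, not necessarily the actual implementation):
    #
    #     def _calculate_precision_simple(self, query, contexts):
    #         query_words = set(query.lower().split())
    #         relevant = sum(
    #             1 for ctx in contexts
    #             if query_words & set(ctx.lower().split())
    #         )
    #         return relevant / len(contexts) if contexts else 0.0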

    def test_calculate_recall_simple(self, evaluator_simple):
        """Testa calculo de recall."""
        ground_truth = "Python e uma linguagem de programacao"
        contexts = [
            "Python e uma linguagem",
            "Usada para programacao"
        ]

        score = evaluator_simple._calculate_recall_simple(ground_truth, contexts)

        assert 0 <= score <= 1

    def test_evaluate_batch(self, evaluator_simple):
        """Testa avaliacao em lote."""
        test_cases = [
            {
                'query': 'O que e Python?',
                'response': 'Python e uma linguagem.',
                'contexts': ['Python e uma linguagem moderna.']
            },
            {
                'query': 'O que e Java?',
                'response': 'Java e uma linguagem.',
                'contexts': ['Java e uma linguagem enterprise.']
            }
        ]

        results = evaluator_simple.evaluate_batch(test_cases)

        assert len(results) == 2
        assert all(r.faithfulness is not None for r in results)
        assert all(r.response_time is not None for r in results)
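
    # evaluate_batch is assumed to time each evaluate_single call and stamp the
    # elapsed seconds onto result.response_time, which is why the assertion
    # above expects it to be set. A rough sketch of that assumed loop:
    #
    #     import time
    #
    #     def evaluate_batch(self, test_cases):
    #         results = []
    #         for case in test_cases:
    #             start = time.perf_counter()
    #             result = self.evaluate_single(**case)
    #             result.response_time = time.perf_counter() - start
    #             results.append(result)
    #         return results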

    def test_generate_report(self, evaluator_simple):
        """Testa geracao de relatorio."""
        # Criar resultados fake
        results = [
            RAGEvaluationResult(
                query=f"query{i}",
                response=f"response{i}",
                contexts=[f"context{i}"],
                faithfulness=0.7 + i * 0.1,
                answer_relevancy=0.8,
                context_precision=0.75
            )
            for i in range(3)
        ]

        report = evaluator_simple.generate_report(results)

        assert report['total_cases'] == 3
        assert 'average_scores' in report
        assert 'min_scores' in report
        assert 'max_scores' in report
        assert 0 <= report['average_scores']['faithfulness'] <= 1

    def test_generate_report_empty(self, evaluator_simple):
        """Testa relatorio com resultados vazios."""
        report = evaluator_simple.generate_report([])
        assert report == {}

    def test_generate_report_worst_cases(self, evaluator_simple):
        """Testa identificacao de piores casos."""
        results = [
            RAGEvaluationResult(
                query=f"query{i}",
                response=f"response{i}",
                contexts=[f"context{i}"],
                # Strictly increasing scores, kept within the [0, 1] range
                faithfulness=0.3 + i * 0.05,
                answer_relevancy=0.4 + i * 0.05
            )
            for i in range(10)
        ]

        report = evaluator_simple.generate_report(results)

        assert 'worst_cases' in report
        assert len(report['worst_cases']) == 5
        # The first case should be the worst (lowest score)
        assert report['worst_cases'][0]['index'] == 0
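
    # The worst-case assertion assumes generate_report ranks results by their
    # overall score, ascending, and keeps the five lowest. Only the 'index' key
    # is confirmed by the test; the other keys below are hypothetical:
    #
    #     ranked = sorted(enumerate(results),
    #                     key=lambda pair: pair[1].get_overall_score())
    #     worst_cases = [{'index': i, 'query': r.query,
    #                     'score': r.get_overall_score()}
    #                    for i, r in ranked[:5]]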


if __name__ == "__main__":
    pytest.main([__file__, "-v"])