| """ | |
| Testes para sistema de avaliacao RAG. | |
| """ | |
| import pytest | |
| from src.evaluation import RAGEvaluator, RAGEvaluationResult | |


class TestRAGEvaluationResult:
    """Tests for the RAGEvaluationResult class."""

    def test_create_result(self):
        """Tests creating a result."""
        result = RAGEvaluationResult(
            query="teste query",
            response="teste response",
            contexts=["contexto 1", "contexto 2"],
            ground_truth="verdade"
        )
        assert result.query == "teste query"
        assert result.response == "teste response"
        assert len(result.contexts) == 2
        assert result.ground_truth == "verdade"

    def test_to_dict(self):
        """Tests conversion to a dictionary."""
        result = RAGEvaluationResult(
            query="query",
            response="response",
            contexts=["ctx"],
            faithfulness=0.8
        )
        data = result.to_dict()
        assert data['query'] == "query"
        assert data['response'] == "response"
        assert data['faithfulness'] == 0.8

    def test_get_overall_score_all_metrics(self):
        """Tests overall score calculation with all metrics."""
        result = RAGEvaluationResult(
            query="q",
            response="r",
            contexts=["c"],
            faithfulness=0.8,
            answer_relevancy=0.9,
            context_precision=0.7,
            context_recall=0.85
        )
        score = result.get_overall_score()
        assert 0 <= score <= 1
        assert abs(score - 0.8125) < 0.01  # Mean: (0.8 + 0.9 + 0.7 + 0.85) / 4

    def test_get_overall_score_partial_metrics(self):
        """Tests overall score with only some metrics set."""
        result = RAGEvaluationResult(
            query="q",
            response="r",
            contexts=["c"],
            faithfulness=0.8,
            answer_relevancy=0.9
        )
        score = result.get_overall_score()
        assert abs(score - 0.85) < 0.01  # Mean: (0.8 + 0.9) / 2

    def test_get_overall_score_no_metrics(self):
        """Tests overall score when no metrics are set."""
        result = RAGEvaluationResult(
            query="q",
            response="r",
            contexts=["c"]
        )
        score = result.get_overall_score()
        assert score == 0.0


class TestRAGEvaluator:
    """Tests for the RAGEvaluator class."""

    @pytest.fixture
    def evaluator_simple(self):
        """Fixture providing an evaluator that uses the simplified metrics."""
        return RAGEvaluator(use_ragas=False)

    def test_create_evaluator(self):
        """Tests evaluator creation."""
        evaluator = RAGEvaluator(use_ragas=False)
        assert evaluator is not None
        assert not evaluator.use_ragas

    def test_evaluate_single_simple(self, evaluator_simple):
        """Tests simple evaluation of a single case."""
        result = evaluator_simple.evaluate_single(
            query="O que e Python?",
            response="Python e uma linguagem de programacao.",
            contexts=["Python e uma linguagem de programacao moderna."]
        )
        assert result is not None
        assert result.faithfulness is not None
        assert result.answer_relevancy is not None
        assert result.context_precision is not None
        assert 0 <= result.faithfulness <= 1
        assert 0 <= result.answer_relevancy <= 1

    def test_calculate_faithfulness_simple(self, evaluator_simple):
        """Tests faithfulness calculation."""
        response = "Python e uma linguagem de programacao"
        contexts = ["Python e uma linguagem de programacao moderna"]
        score = evaluator_simple._calculate_faithfulness_simple(response, contexts)
        assert 0 <= score <= 1
        assert score > 0  # There should be word overlap with the context

    def test_calculate_faithfulness_no_overlap(self, evaluator_simple):
        """Tests faithfulness when there is no overlap."""
        response = "Java e estaticamente tipada"
        contexts = ["Python e dinamicamente tipada"]
        score = evaluator_simple._calculate_faithfulness_simple(response, contexts)
        assert 0 <= score <= 1

    def test_calculate_relevancy_simple(self, evaluator_simple):
        """Tests relevancy calculation."""
        query = "O que e Python"
        response = "Python e uma linguagem de programacao"
        score = evaluator_simple._calculate_relevancy_simple(query, response)
        assert 0 <= score <= 1
        assert score > 0  # Should overlap on "Python"

    def test_calculate_precision_simple(self, evaluator_simple):
        """Tests precision calculation."""
        query = "Python linguagem"
        contexts = [
            "Python e uma linguagem",
            "Java e outra linguagem",
            "JavaScript nao tem nada"
        ]
        score = evaluator_simple._calculate_precision_simple(query, contexts)
        assert 0 <= score <= 1
        # Should match "Python" in the first context and "linguagem" in the first two

    def test_calculate_recall_simple(self, evaluator_simple):
        """Tests recall calculation."""
        ground_truth = "Python e uma linguagem de programacao"
        contexts = [
            "Python e uma linguagem",
            "Usada para programacao"
        ]
        score = evaluator_simple._calculate_recall_simple(ground_truth, contexts)
        assert 0 <= score <= 1

    def test_evaluate_batch(self, evaluator_simple):
        """Tests batch evaluation."""
        test_cases = [
            {
                'query': 'O que e Python?',
                'response': 'Python e uma linguagem.',
                'contexts': ['Python e uma linguagem moderna.']
            },
            {
                'query': 'O que e Java?',
                'response': 'Java e uma linguagem.',
                'contexts': ['Java e uma linguagem enterprise.']
            }
        ]
        results = evaluator_simple.evaluate_batch(test_cases)
        assert len(results) == 2
        assert all(r.faithfulness is not None for r in results)
        assert all(r.response_time is not None for r in results)

    def test_generate_report(self, evaluator_simple):
        """Tests report generation."""
        # Build fake evaluation results
        results = [
            RAGEvaluationResult(
                query=f"query{i}",
                response=f"response{i}",
                contexts=[f"context{i}"],
                faithfulness=0.7 + i * 0.1,
                answer_relevancy=0.8,
                context_precision=0.75
            )
            for i in range(3)
        ]
        report = evaluator_simple.generate_report(results)
        assert report['total_cases'] == 3
        assert 'average_scores' in report
        assert 'min_scores' in report
        assert 'max_scores' in report
        assert 0 <= report['average_scores']['faithfulness'] <= 1

    def test_generate_report_empty(self, evaluator_simple):
        """Tests report generation with an empty result list."""
        report = evaluator_simple.generate_report([])
        assert report == {}

    def test_generate_report_worst_cases(self, evaluator_simple):
        """Tests identification of the worst cases."""
        results = [
            RAGEvaluationResult(
                query=f"query{i}",
                response=f"response{i}",
                contexts=[f"context{i}"],
                faithfulness=0.3 + i * 0.1,
                answer_relevancy=0.4 + i * 0.1
            )
            for i in range(10)
        ]
        report = evaluator_simple.generate_report(results)
        assert 'worst_cases' in report
        assert len(report['worst_cases']) == 5
        # The first entry should be the worst case (lowest score)
        assert report['worst_cases'][0]['index'] == 0


if __name__ == "__main__":
    pytest.main([__file__, "-v"])