""" Testes para sistema de avaliacao RAG. """ import pytest from src.evaluation import RAGEvaluator, RAGEvaluationResult class TestRAGEvaluationResult: """Testes para classe RAGEvaluationResult.""" def test_create_result(self): """Testa criacao de resultado.""" result = RAGEvaluationResult( query="teste query", response="teste response", contexts=["contexto 1", "contexto 2"], ground_truth="verdade" ) assert result.query == "teste query" assert result.response == "teste response" assert len(result.contexts) == 2 assert result.ground_truth == "verdade" def test_to_dict(self): """Testa conversao para dicionario.""" result = RAGEvaluationResult( query="query", response="response", contexts=["ctx"], faithfulness=0.8 ) data = result.to_dict() assert data['query'] == "query" assert data['response'] == "response" assert data['faithfulness'] == 0.8 def test_get_overall_score_all_metrics(self): """Testa calculo de score geral com todas metricas.""" result = RAGEvaluationResult( query="q", response="r", contexts=["c"], faithfulness=0.8, answer_relevancy=0.9, context_precision=0.7, context_recall=0.85 ) score = result.get_overall_score() assert 0 <= score <= 1 assert abs(score - 0.8125) < 0.01 # Media: (0.8+0.9+0.7+0.85)/4 def test_get_overall_score_partial_metrics(self): """Testa score geral com metricas parciais.""" result = RAGEvaluationResult( query="q", response="r", contexts=["c"], faithfulness=0.8, answer_relevancy=0.9 ) score = result.get_overall_score() assert abs(score - 0.85) < 0.01 # Media: (0.8+0.9)/2 def test_get_overall_score_no_metrics(self): """Testa score geral sem metricas.""" result = RAGEvaluationResult( query="q", response="r", contexts=["c"] ) score = result.get_overall_score() assert score == 0.0 class TestRAGEvaluator: """Testes para classe RAGEvaluator.""" @pytest.fixture def evaluator_simple(self): """Evaluador com metricas simplificadas.""" return RAGEvaluator(use_ragas=False) def test_create_evaluator(self): """Testa criacao de evaluador.""" evaluator = RAGEvaluator(use_ragas=False) assert evaluator is not None assert not evaluator.use_ragas def test_evaluate_single_simple(self, evaluator_simple): """Testa avaliacao simples de caso unico.""" result = evaluator_simple.evaluate_single( query="O que e Python?", response="Python e uma linguagem de programacao.", contexts=["Python e uma linguagem de programacao moderna."] ) assert result is not None assert result.faithfulness is not None assert result.answer_relevancy is not None assert result.context_precision is not None assert 0 <= result.faithfulness <= 1 assert 0 <= result.answer_relevancy <= 1 def test_calculate_faithfulness_simple(self, evaluator_simple): """Testa calculo de faithfulness.""" response = "Python e uma linguagem de programacao" contexts = ["Python e uma linguagem de programacao moderna"] score = evaluator_simple._calculate_faithfulness_simple(response, contexts) assert 0 <= score <= 1 assert score > 0 # Deve ter overlap def test_calculate_faithfulness_no_overlap(self, evaluator_simple): """Testa faithfulness sem overlap.""" response = "Java e estaticamente tipada" contexts = ["Python e dinamicamente tipada"] score = evaluator_simple._calculate_faithfulness_simple(response, contexts) assert 0 <= score <= 1 def test_calculate_relevancy_simple(self, evaluator_simple): """Testa calculo de relevancia.""" query = "O que e Python" response = "Python e uma linguagem de programacao" score = evaluator_simple._calculate_relevancy_simple(query, response) assert 0 <= score <= 1 assert score > 0 # Deve ter overlap em "Python" def test_calculate_precision_simple(self, evaluator_simple): """Testa calculo de precisao.""" query = "Python linguagem" contexts = [ "Python e uma linguagem", "Java e outra linguagem", "JavaScript nao tem nada" ] score = evaluator_simple._calculate_precision_simple(query, contexts) assert 0 <= score <= 1 # Deve encontrar "Python" em contexto 1 e "linguagem" em 1 e 2 def test_calculate_recall_simple(self, evaluator_simple): """Testa calculo de recall.""" ground_truth = "Python e uma linguagem de programacao" contexts = [ "Python e uma linguagem", "Usada para programacao" ] score = evaluator_simple._calculate_recall_simple(ground_truth, contexts) assert 0 <= score <= 1 def test_evaluate_batch(self, evaluator_simple): """Testa avaliacao em lote.""" test_cases = [ { 'query': 'O que e Python?', 'response': 'Python e uma linguagem.', 'contexts': ['Python e uma linguagem moderna.'] }, { 'query': 'O que e Java?', 'response': 'Java e uma linguagem.', 'contexts': ['Java e uma linguagem enterprise.'] } ] results = evaluator_simple.evaluate_batch(test_cases) assert len(results) == 2 assert all(r.faithfulness is not None for r in results) assert all(r.response_time is not None for r in results) def test_generate_report(self, evaluator_simple): """Testa geracao de relatorio.""" # Criar resultados fake results = [ RAGEvaluationResult( query=f"query{i}", response=f"response{i}", contexts=[f"context{i}"], faithfulness=0.7 + i * 0.1, answer_relevancy=0.8, context_precision=0.75 ) for i in range(3) ] report = evaluator_simple.generate_report(results) assert report['total_cases'] == 3 assert 'average_scores' in report assert 'min_scores' in report assert 'max_scores' in report assert 0 <= report['average_scores']['faithfulness'] <= 1 def test_generate_report_empty(self, evaluator_simple): """Testa relatorio com resultados vazios.""" report = evaluator_simple.generate_report([]) assert report == {} def test_generate_report_worst_cases(self, evaluator_simple): """Testa identificacao de piores casos.""" results = [ RAGEvaluationResult( query=f"query{i}", response=f"response{i}", contexts=[f"context{i}"], faithfulness=0.3 + i * 0.1, answer_relevancy=0.4 + i * 0.1 ) for i in range(10) ] report = evaluator_simple.generate_report(results) assert 'worst_cases' in report assert len(report['worst_cases']) == 5 # Primeiro caso deve ser o pior (score mais baixo) assert report['worst_cases'][0]['index'] == 0 if __name__ == "__main__": pytest.main([__file__, "-v"])