# rag_template/tests/test_evaluation.py
"""
Testes para sistema de avaliacao RAG.
"""
import pytest
from src.evaluation import RAGEvaluator, RAGEvaluationResult
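
# NOTE: these tests run the evaluator with use_ragas=False, which (judging by
# the _calculate_*_simple helpers exercised below) presumably falls back to
# lightweight lexical-overlap metrics, so no LLM calls are needed. This is an
# assumption based on the tested API, not a documented guarantee.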


class TestRAGEvaluationResult:
    """Tests for the RAGEvaluationResult class."""

    def test_create_result(self):
        """Tests creating a result."""
        result = RAGEvaluationResult(
            query="teste query",
            response="teste response",
            contexts=["contexto 1", "contexto 2"],
            ground_truth="verdade"
        )
        assert result.query == "teste query"
        assert result.response == "teste response"
        assert len(result.contexts) == 2
        assert result.ground_truth == "verdade"

    def test_to_dict(self):
        """Tests conversion to a dictionary."""
        result = RAGEvaluationResult(
            query="query",
            response="response",
            contexts=["ctx"],
            faithfulness=0.8
        )
        data = result.to_dict()
        assert data['query'] == "query"
        assert data['response'] == "response"
        assert data['faithfulness'] == 0.8

    def test_get_overall_score_all_metrics(self):
        """Tests the overall score with all metrics set."""
        result = RAGEvaluationResult(
            query="q",
            response="r",
            contexts=["c"],
            faithfulness=0.8,
            answer_relevancy=0.9,
            context_precision=0.7,
            context_recall=0.85
        )
        score = result.get_overall_score()
        assert 0 <= score <= 1
        assert abs(score - 0.8125) < 0.01  # Mean: (0.8 + 0.9 + 0.7 + 0.85) / 4

    def test_get_overall_score_partial_metrics(self):
        """Tests the overall score with only some metrics set."""
        result = RAGEvaluationResult(
            query="q",
            response="r",
            contexts=["c"],
            faithfulness=0.8,
            answer_relevancy=0.9
        )
        score = result.get_overall_score()
        assert abs(score - 0.85) < 0.01  # Mean: (0.8 + 0.9) / 2

    def test_get_overall_score_no_metrics(self):
        """Tests the overall score with no metrics set."""
        result = RAGEvaluationResult(
            query="q",
            response="r",
            contexts=["c"]
        )
        score = result.get_overall_score()
        assert score == 0.0
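
    # Sketch of one more boundary case, assuming get_overall_score() averages
    # only the metrics that are set (as the partial-metrics test above
    # suggests): with a single metric, the overall score should equal it.
    def test_get_overall_score_single_metric(self):
        """Overall score with a single metric (sketch; assumes a mean over set metrics)."""
        result = RAGEvaluationResult(
            query="q",
            response="r",
            contexts=["c"],
            context_recall=0.6
        )
        assert abs(result.get_overall_score() - 0.6) < 0.01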


class TestRAGEvaluator:
    """Tests for the RAGEvaluator class."""

    @pytest.fixture
    def evaluator_simple(self):
        """Evaluator with simplified metrics."""
        return RAGEvaluator(use_ragas=False)

    def test_create_evaluator(self):
        """Tests creating an evaluator."""
        evaluator = RAGEvaluator(use_ragas=False)
        assert evaluator is not None
        assert not evaluator.use_ragas

    def test_evaluate_single_simple(self, evaluator_simple):
        """Tests simple evaluation of a single case."""
        result = evaluator_simple.evaluate_single(
            query="O que e Python?",
            response="Python e uma linguagem de programacao.",
            contexts=["Python e uma linguagem de programacao moderna."]
        )
        assert result is not None
        assert result.faithfulness is not None
        assert result.answer_relevancy is not None
        assert result.context_precision is not None
        assert 0 <= result.faithfulness <= 1
        assert 0 <= result.answer_relevancy <= 1

    def test_calculate_faithfulness_simple(self, evaluator_simple):
        """Tests the faithfulness computation."""
        response = "Python e uma linguagem de programacao"
        contexts = ["Python e uma linguagem de programacao moderna"]
        score = evaluator_simple._calculate_faithfulness_simple(response, contexts)
        assert 0 <= score <= 1
        assert score > 0  # There is word overlap, so the score must be positive

    def test_calculate_faithfulness_no_overlap(self, evaluator_simple):
        """Tests faithfulness with no meaningful overlap."""
        response = "Java e estaticamente tipada"
        contexts = ["Python e dinamicamente tipada"]
        score = evaluator_simple._calculate_faithfulness_simple(response, contexts)
        assert 0 <= score <= 1
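
    # The two tests above treat faithfulness as a black box. A plausible
    # reading (an assumption, not confirmed against src/evaluation.py) is
    # token overlap: score = |response tokens found in contexts| / |response tokens|,
    # i.e. the fraction of the response that is grounded in the contexts.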

    def test_calculate_relevancy_simple(self, evaluator_simple):
        """Tests the relevancy computation."""
        query = "O que e Python"
        response = "Python e uma linguagem de programacao"
        score = evaluator_simple._calculate_relevancy_simple(query, response)
        assert 0 <= score <= 1
        assert score > 0  # "Python" overlaps, so the score must be positive

    def test_calculate_precision_simple(self, evaluator_simple):
        """Tests the precision computation."""
        query = "Python linguagem"
        contexts = [
            "Python e uma linguagem",
            "Java e outra linguagem",
            "JavaScript nao tem nada"
        ]
        score = evaluator_simple._calculate_precision_simple(query, contexts)
        assert 0 <= score <= 1
        # Should find "Python" in context 1 and "linguagem" in contexts 1 and 2

    def test_calculate_recall_simple(self, evaluator_simple):
        """Tests the recall computation."""
        ground_truth = "Python e uma linguagem de programacao"
        contexts = [
            "Python e uma linguagem",
            "Usada para programacao"
        ]
        score = evaluator_simple._calculate_recall_simple(ground_truth, contexts)
        assert 0 <= score <= 1
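
    # Precision and recall are presumably lexical as well (an assumption):
    # precision as the share of retrieved contexts that overlap the query,
    # recall as the share of ground-truth tokens covered by the contexts.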

    def test_evaluate_batch(self, evaluator_simple):
        """Tests batch evaluation."""
        test_cases = [
            {
                'query': 'O que e Python?',
                'response': 'Python e uma linguagem.',
                'contexts': ['Python e uma linguagem moderna.']
            },
            {
                'query': 'O que e Java?',
                'response': 'Java e uma linguagem.',
                'contexts': ['Java e uma linguagem enterprise.']
            }
        ]
        results = evaluator_simple.evaluate_batch(test_cases)
        assert len(results) == 2
        assert all(r.faithfulness is not None for r in results)
        assert all(r.response_time is not None for r in results)
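        # response_time is asserted non-None, so evaluate_batch is expected to
        # time each case itself; the evaluate_single test above makes no such
        # check.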

    def test_generate_report(self, evaluator_simple):
        """Tests report generation."""
        # Create fake results
        results = [
            RAGEvaluationResult(
                query=f"query{i}",
                response=f"response{i}",
                contexts=[f"context{i}"],
                faithfulness=0.7 + i * 0.1,
                answer_relevancy=0.8,
                context_precision=0.75
            )
            for i in range(3)
        ]
        report = evaluator_simple.generate_report(results)
        assert report['total_cases'] == 3
        assert 'average_scores' in report
        assert 'min_scores' in report
        assert 'max_scores' in report
        assert 0 <= report['average_scores']['faithfulness'] <= 1

    def test_generate_report_empty(self, evaluator_simple):
        """Tests the report with no results."""
        report = evaluator_simple.generate_report([])
        assert report == {}

    def test_generate_report_worst_cases(self, evaluator_simple):
        """Tests identification of the worst cases."""
        results = [
            RAGEvaluationResult(
                query=f"query{i}",
                response=f"response{i}",
                contexts=[f"context{i}"],
                faithfulness=0.3 + i * 0.1,
                answer_relevancy=0.4 + i * 0.1
            )
            for i in range(10)
        ]
        report = evaluator_simple.generate_report(results)
        assert 'worst_cases' in report
        assert len(report['worst_cases']) == 5
        # The first case must be the worst one (lowest score)
        assert report['worst_cases'][0]['index'] == 0


if __name__ == "__main__":
    pytest.main([__file__, "-v"])