# rag_template/src/evaluation.py
"""
Sistema de avaliacao automatica de qualidade RAG usando RAGAS.
RAGAS (RAG Assessment) e um framework para avaliar sistemas RAG usando metricas objetivas.
"""
from typing import List, Dict, Optional, Any
from dataclasses import dataclass
import time
@dataclass
class RAGEvaluationResult:
"""Resultado de avaliacao RAG."""
query: str
response: str
contexts: List[str]
ground_truth: Optional[str] = None
    # RAGAS metrics
faithfulness: Optional[float] = None
answer_relevancy: Optional[float] = None
context_precision: Optional[float] = None
context_recall: Optional[float] = None
    # Additional metrics
response_time: Optional[float] = None
num_contexts: Optional[int] = None
def to_dict(self) -> Dict[str, Any]:
"""Converte para dicionario."""
return {
'query': self.query,
'response': self.response,
'contexts': self.contexts,
'ground_truth': self.ground_truth,
'faithfulness': self.faithfulness,
'answer_relevancy': self.answer_relevancy,
'context_precision': self.context_precision,
'context_recall': self.context_recall,
'response_time': self.response_time,
'num_contexts': self.num_contexts or len(self.contexts)
}
def get_overall_score(self) -> float:
"""Calcula score geral (media das metricas)."""
scores = []
if self.faithfulness is not None:
scores.append(self.faithfulness)
if self.answer_relevancy is not None:
scores.append(self.answer_relevancy)
if self.context_precision is not None:
scores.append(self.context_precision)
if self.context_recall is not None:
scores.append(self.context_recall)
return sum(scores) / len(scores) if scores else 0.0
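
# Minimal usage sketch (hypothetical values): a result can be constructed
# directly, and only the metrics that were actually computed enter the mean.
#
#     result = RAGEvaluationResult(
#         query="What is RAG?",
#         response="RAG combines retrieval with generation.",
#         contexts=["RAG combines retrieval with generation."],
#         faithfulness=0.9,
#         answer_relevancy=0.8,
#     )
#     result.get_overall_score()  # (0.9 + 0.8) / 2 = 0.85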
class RAGEvaluator:
"""Avaliador de sistemas RAG usando RAGAS."""
def __init__(self, use_ragas: bool = True):
"""
Inicializa avaliador.
Args:
use_ragas: Se True, usa biblioteca RAGAS (requer instalacao)
"""
self.use_ragas = use_ragas
self.ragas_metrics = None
if use_ragas:
try:
from ragas import evaluate
from ragas.metrics import (
faithfulness,
answer_relevancy,
context_precision,
context_recall
)
self.ragas_metrics = {
'faithfulness': faithfulness,
'answer_relevancy': answer_relevancy,
'context_precision': context_precision,
'context_recall': context_recall
}
self.evaluate_fn = evaluate
except ImportError:
print("Aviso: RAGAS nao instalado. Usando metricas simplificadas.")
print("Instale com: pip install ragas")
self.use_ragas = False
def evaluate_single(
self,
query: str,
response: str,
contexts: List[str],
ground_truth: Optional[str] = None
) -> RAGEvaluationResult:
"""
Avalia uma unica query-response.
Args:
query: Pergunta do usuario
response: Resposta gerada
contexts: Contextos recuperados
ground_truth: Resposta esperada (opcional)
Returns:
Resultado de avaliacao
"""
result = RAGEvaluationResult(
query=query,
response=response,
contexts=contexts,
ground_truth=ground_truth,
num_contexts=len(contexts)
)
        if self.use_ragas and self.ragas_metrics:
            # Evaluate with the RAGAS library
            result = self._evaluate_with_ragas(result)
        else:
            # Fall back to simplified lexical-overlap metrics
            result = self._evaluate_simple(result)
return result
def _evaluate_with_ragas(self, result: RAGEvaluationResult) -> RAGEvaluationResult:
"""Avalia usando biblioteca RAGAS."""
try:
from datasets import Dataset
            # Build a single-row dataset in the format RAGAS expects
data = {
'question': [result.query],
'answer': [result.response],
'contexts': [result.contexts],
}
if result.ground_truth:
data['ground_truth'] = [result.ground_truth]
dataset = Dataset.from_dict(data)
            # Run the evaluation. context_recall requires a ground truth, so
            # drop it when none was provided (depending on the RAGAS version,
            # context_precision may require one as well).
            metrics = dict(self.ragas_metrics)
            if not result.ground_truth:
                metrics.pop('context_recall', None)
            eval_result = self.evaluate_fn(
                dataset,
                metrics=list(metrics.values())
            )
            # Extract the aggregate scores
if 'faithfulness' in eval_result:
result.faithfulness = eval_result['faithfulness']
if 'answer_relevancy' in eval_result:
result.answer_relevancy = eval_result['answer_relevancy']
if 'context_precision' in eval_result:
result.context_precision = eval_result['context_precision']
if 'context_recall' in eval_result and result.ground_truth:
result.context_recall = eval_result['context_recall']
        except Exception as e:
            print(f"Error while evaluating with RAGAS: {e}")
            # Fall back to the simplified metrics
            result = self._evaluate_simple(result)
return result
def _evaluate_simple(self, result: RAGEvaluationResult) -> RAGEvaluationResult:
"""Avalia usando metricas simplificadas (sem RAGAS)."""
# Faithfulness: Resposta menciona informacoes dos contextos?
result.faithfulness = self._calculate_faithfulness_simple(
result.response,
result.contexts
)
        # Answer relevancy: is the response relevant to the query?
result.answer_relevancy = self._calculate_relevancy_simple(
result.query,
result.response
)
        # Context precision: are the retrieved contexts relevant to the query?
result.context_precision = self._calculate_precision_simple(
result.query,
result.contexts
)
        # Context recall: were all relevant contexts retrieved? (requires ground truth)
if result.ground_truth:
result.context_recall = self._calculate_recall_simple(
result.ground_truth,
result.contexts
)
return result
def _calculate_faithfulness_simple(
self,
response: str,
contexts: List[str]
) -> float:
"""
Calcula faithfulness simplificado.
Verifica se resposta menciona informacoes dos contextos.
"""
if not contexts:
return 0.0
        # Count how many response words appear in the contexts
response_words = set(response.lower().split())
context_words = set()
for ctx in contexts:
context_words.update(ctx.lower().split())
if not response_words:
return 0.0
        # Fraction of response words that appear in the contexts
overlap = len(response_words & context_words)
return overlap / len(response_words)
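
    # Hypothetical worked example for the overlap heuristic above:
    # response = "cats sleep a lot", contexts = ["cats sleep all day"]
    # response words {cats, sleep, a, lot}; overlap {cats, sleep} -> 2/4 = 0.5.
    # The same word-overlap idea drives the relevancy, precision, and recall
    # helpers below, with different numerators and denominators.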
def _calculate_relevancy_simple(self, query: str, response: str) -> float:
"""
Calcula relevancia simplificada.
Verifica overlap entre query e resposta.
"""
query_words = set(query.lower().split())
response_words = set(response.lower().split())
if not query_words:
return 0.0
overlap = len(query_words & response_words)
return overlap / len(query_words)
def _calculate_precision_simple(
self,
query: str,
contexts: List[str]
) -> float:
"""
Calcula precisao simplificada.
Verifica se contextos contem palavras da query.
"""
if not contexts:
return 0.0
query_words = set(query.lower().split())
relevant_contexts = 0
for ctx in contexts:
ctx_words = set(ctx.lower().split())
overlap = len(query_words & ctx_words)
if overlap > 0:
relevant_contexts += 1
return relevant_contexts / len(contexts)
def _calculate_recall_simple(
self,
ground_truth: str,
contexts: List[str]
) -> float:
"""
Calcula recall simplificado.
Verifica se contextos contem informacoes da ground truth.
"""
if not contexts:
return 0.0
ground_truth_words = set(ground_truth.lower().split())
context_words = set()
for ctx in contexts:
context_words.update(ctx.lower().split())
if not ground_truth_words:
return 0.0
overlap = len(ground_truth_words & context_words)
return overlap / len(ground_truth_words)
def evaluate_batch(
self,
test_cases: List[Dict[str, Any]]
) -> List[RAGEvaluationResult]:
"""
Avalia multiplos casos de teste.
Args:
test_cases: Lista de dicts com keys: query, response, contexts, ground_truth
Returns:
Lista de resultados
"""
results = []
for i, test_case in enumerate(test_cases):
print(f"Avaliando caso {i+1}/{len(test_cases)}...")
start_time = time.time()
result = self.evaluate_single(
query=test_case['query'],
response=test_case['response'],
contexts=test_case['contexts'],
ground_truth=test_case.get('ground_truth')
)
            # Note: this times the evaluation itself, not the RAG pipeline's
            # answer generation; set response_time upstream if that is needed.
            result.response_time = time.time() - start_time
results.append(result)
return results
def generate_report(
self,
results: List[RAGEvaluationResult]
) -> Dict[str, Any]:
"""
Gera relatorio de avaliacao.
Args:
results: Lista de resultados
Returns:
Dicionario com estatisticas
"""
if not results:
return {}
        # Compute per-metric averages
faithfulness_scores = [r.faithfulness for r in results if r.faithfulness is not None]
relevancy_scores = [r.answer_relevancy for r in results if r.answer_relevancy is not None]
precision_scores = [r.context_precision for r in results if r.context_precision is not None]
recall_scores = [r.context_recall for r in results if r.context_recall is not None]
overall_scores = [r.get_overall_score() for r in results]
report = {
'total_cases': len(results),
'average_scores': {
'faithfulness': sum(faithfulness_scores) / len(faithfulness_scores) if faithfulness_scores else 0.0,
'answer_relevancy': sum(relevancy_scores) / len(relevancy_scores) if relevancy_scores else 0.0,
'context_precision': sum(precision_scores) / len(precision_scores) if precision_scores else 0.0,
'context_recall': sum(recall_scores) / len(recall_scores) if recall_scores else 0.0,
'overall': sum(overall_scores) / len(overall_scores) if overall_scores else 0.0
},
'min_scores': {
'faithfulness': min(faithfulness_scores) if faithfulness_scores else 0.0,
'answer_relevancy': min(relevancy_scores) if relevancy_scores else 0.0,
'context_precision': min(precision_scores) if precision_scores else 0.0,
'context_recall': min(recall_scores) if recall_scores else 0.0
},
'max_scores': {
'faithfulness': max(faithfulness_scores) if faithfulness_scores else 0.0,
'answer_relevancy': max(relevancy_scores) if relevancy_scores else 0.0,
'context_precision': max(precision_scores) if precision_scores else 0.0,
'context_recall': max(recall_scores) if recall_scores else 0.0
}
}
        # Identify the worst cases (for error analysis)
if overall_scores:
worst_cases = sorted(
[(i, score) for i, score in enumerate(overall_scores)],
key=lambda x: x[1]
            )[:5]  # keep the five lowest-scoring cases
report['worst_cases'] = [
{
'index': idx,
'query': results[idx].query,
'score': score
}
for idx, score in worst_cases
]
return report
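
if __name__ == "__main__":
    # Minimal demo on hypothetical data: evaluate one test case with the
    # simplified metrics and print the aggregate report. Pass use_ragas=True
    # to use the RAGAS library instead, if it is installed.
    evaluator = RAGEvaluator(use_ragas=False)
    demo_cases = [
        {
            'query': 'What is RAG?',
            'response': 'RAG combines retrieval with text generation.',
            'contexts': ['RAG combines retrieval with text generation.'],
            'ground_truth': 'RAG augments generation with retrieved context.'
        }
    ]
    demo_results = evaluator.evaluate_batch(demo_cases)
    print(evaluator.generate_report(demo_results))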