""" Sistema de avaliacao automatica de qualidade RAG usando RAGAS. RAGAS (RAG Assessment) e um framework para avaliar sistemas RAG usando metricas objetivas. """ from typing import List, Dict, Optional, Any from dataclasses import dataclass import time @dataclass class RAGEvaluationResult: """Resultado de avaliacao RAG.""" query: str response: str contexts: List[str] ground_truth: Optional[str] = None # Metricas RAGAS faithfulness: Optional[float] = None answer_relevancy: Optional[float] = None context_precision: Optional[float] = None context_recall: Optional[float] = None # Metricas adicionais response_time: Optional[float] = None num_contexts: Optional[int] = None def to_dict(self) -> Dict[str, Any]: """Converte para dicionario.""" return { 'query': self.query, 'response': self.response, 'contexts': self.contexts, 'ground_truth': self.ground_truth, 'faithfulness': self.faithfulness, 'answer_relevancy': self.answer_relevancy, 'context_precision': self.context_precision, 'context_recall': self.context_recall, 'response_time': self.response_time, 'num_contexts': self.num_contexts or len(self.contexts) } def get_overall_score(self) -> float: """Calcula score geral (media das metricas).""" scores = [] if self.faithfulness is not None: scores.append(self.faithfulness) if self.answer_relevancy is not None: scores.append(self.answer_relevancy) if self.context_precision is not None: scores.append(self.context_precision) if self.context_recall is not None: scores.append(self.context_recall) return sum(scores) / len(scores) if scores else 0.0 class RAGEvaluator: """Avaliador de sistemas RAG usando RAGAS.""" def __init__(self, use_ragas: bool = True): """ Inicializa avaliador. Args: use_ragas: Se True, usa biblioteca RAGAS (requer instalacao) """ self.use_ragas = use_ragas self.ragas_metrics = None if use_ragas: try: from ragas import evaluate from ragas.metrics import ( faithfulness, answer_relevancy, context_precision, context_recall ) self.ragas_metrics = { 'faithfulness': faithfulness, 'answer_relevancy': answer_relevancy, 'context_precision': context_precision, 'context_recall': context_recall } self.evaluate_fn = evaluate except ImportError: print("Aviso: RAGAS nao instalado. Usando metricas simplificadas.") print("Instale com: pip install ragas") self.use_ragas = False def evaluate_single( self, query: str, response: str, contexts: List[str], ground_truth: Optional[str] = None ) -> RAGEvaluationResult: """ Avalia uma unica query-response. Args: query: Pergunta do usuario response: Resposta gerada contexts: Contextos recuperados ground_truth: Resposta esperada (opcional) Returns: Resultado de avaliacao """ result = RAGEvaluationResult( query=query, response=response, contexts=contexts, ground_truth=ground_truth, num_contexts=len(contexts) ) if self.use_ragas and self.ragas_metrics: # Avaliar com RAGAS result = self._evaluate_with_ragas(result) else: # Avaliar com metricas simplificadas result = self._evaluate_simple(result) return result def _evaluate_with_ragas(self, result: RAGEvaluationResult) -> RAGEvaluationResult: """Avalia usando biblioteca RAGAS.""" try: from datasets import Dataset # Preparar dataset data = { 'question': [result.query], 'answer': [result.response], 'contexts': [result.contexts], } if result.ground_truth: data['ground_truth'] = [result.ground_truth] dataset = Dataset.from_dict(data) # Avaliar eval_result = self.evaluate_fn( dataset, metrics=list(self.ragas_metrics.values()) ) # Extrair scores if 'faithfulness' in eval_result: result.faithfulness = eval_result['faithfulness'] if 'answer_relevancy' in eval_result: result.answer_relevancy = eval_result['answer_relevancy'] if 'context_precision' in eval_result: result.context_precision = eval_result['context_precision'] if 'context_recall' in eval_result and result.ground_truth: result.context_recall = eval_result['context_recall'] except Exception as e: print(f"Erro ao avaliar com RAGAS: {e}") result = self._evaluate_simple(result) return result def _evaluate_simple(self, result: RAGEvaluationResult) -> RAGEvaluationResult: """Avalia usando metricas simplificadas (sem RAGAS).""" # Faithfulness: Resposta menciona informacoes dos contextos? result.faithfulness = self._calculate_faithfulness_simple( result.response, result.contexts ) # Answer relevancy: Resposta e relevante para a query? result.answer_relevancy = self._calculate_relevancy_simple( result.query, result.response ) # Context precision: Contextos sao relevantes? result.context_precision = self._calculate_precision_simple( result.query, result.contexts ) # Context recall: Todos contextos relevantes foram recuperados? if result.ground_truth: result.context_recall = self._calculate_recall_simple( result.ground_truth, result.contexts ) return result def _calculate_faithfulness_simple( self, response: str, contexts: List[str] ) -> float: """ Calcula faithfulness simplificado. Verifica se resposta menciona informacoes dos contextos. """ if not contexts: return 0.0 # Conta quantas palavras da resposta aparecem nos contextos response_words = set(response.lower().split()) context_words = set() for ctx in contexts: context_words.update(ctx.lower().split()) if not response_words: return 0.0 # Proporcao de palavras da resposta que aparecem nos contextos overlap = len(response_words & context_words) return overlap / len(response_words) def _calculate_relevancy_simple(self, query: str, response: str) -> float: """ Calcula relevancia simplificada. Verifica overlap entre query e resposta. """ query_words = set(query.lower().split()) response_words = set(response.lower().split()) if not query_words: return 0.0 overlap = len(query_words & response_words) return overlap / len(query_words) def _calculate_precision_simple( self, query: str, contexts: List[str] ) -> float: """ Calcula precisao simplificada. Verifica se contextos contem palavras da query. """ if not contexts: return 0.0 query_words = set(query.lower().split()) relevant_contexts = 0 for ctx in contexts: ctx_words = set(ctx.lower().split()) overlap = len(query_words & ctx_words) if overlap > 0: relevant_contexts += 1 return relevant_contexts / len(contexts) def _calculate_recall_simple( self, ground_truth: str, contexts: List[str] ) -> float: """ Calcula recall simplificado. Verifica se contextos contem informacoes da ground truth. """ if not contexts: return 0.0 ground_truth_words = set(ground_truth.lower().split()) context_words = set() for ctx in contexts: context_words.update(ctx.lower().split()) if not ground_truth_words: return 0.0 overlap = len(ground_truth_words & context_words) return overlap / len(ground_truth_words) def evaluate_batch( self, test_cases: List[Dict[str, Any]] ) -> List[RAGEvaluationResult]: """ Avalia multiplos casos de teste. Args: test_cases: Lista de dicts com keys: query, response, contexts, ground_truth Returns: Lista de resultados """ results = [] for i, test_case in enumerate(test_cases): print(f"Avaliando caso {i+1}/{len(test_cases)}...") start_time = time.time() result = self.evaluate_single( query=test_case['query'], response=test_case['response'], contexts=test_case['contexts'], ground_truth=test_case.get('ground_truth') ) result.response_time = time.time() - start_time results.append(result) return results def generate_report( self, results: List[RAGEvaluationResult] ) -> Dict[str, Any]: """ Gera relatorio de avaliacao. Args: results: Lista de resultados Returns: Dicionario com estatisticas """ if not results: return {} # Calcular medias faithfulness_scores = [r.faithfulness for r in results if r.faithfulness is not None] relevancy_scores = [r.answer_relevancy for r in results if r.answer_relevancy is not None] precision_scores = [r.context_precision for r in results if r.context_precision is not None] recall_scores = [r.context_recall for r in results if r.context_recall is not None] overall_scores = [r.get_overall_score() for r in results] report = { 'total_cases': len(results), 'average_scores': { 'faithfulness': sum(faithfulness_scores) / len(faithfulness_scores) if faithfulness_scores else 0.0, 'answer_relevancy': sum(relevancy_scores) / len(relevancy_scores) if relevancy_scores else 0.0, 'context_precision': sum(precision_scores) / len(precision_scores) if precision_scores else 0.0, 'context_recall': sum(recall_scores) / len(recall_scores) if recall_scores else 0.0, 'overall': sum(overall_scores) / len(overall_scores) if overall_scores else 0.0 }, 'min_scores': { 'faithfulness': min(faithfulness_scores) if faithfulness_scores else 0.0, 'answer_relevancy': min(relevancy_scores) if relevancy_scores else 0.0, 'context_precision': min(precision_scores) if precision_scores else 0.0, 'context_recall': min(recall_scores) if recall_scores else 0.0 }, 'max_scores': { 'faithfulness': max(faithfulness_scores) if faithfulness_scores else 0.0, 'answer_relevancy': max(relevancy_scores) if relevancy_scores else 0.0, 'context_precision': max(precision_scores) if precision_scores else 0.0, 'context_recall': max(recall_scores) if recall_scores else 0.0 } } # Identificar piores casos (para analise) if overall_scores: worst_cases = sorted( [(i, score) for i, score in enumerate(overall_scores)], key=lambda x: x[1] )[:5] # Top 5 piores report['worst_cases'] = [ { 'index': idx, 'query': results[idx].query, 'score': score } for idx, score in worst_cases ] return report