| """ | |
| Sistema de avaliacao automatica de qualidade RAG usando RAGAS. | |
| RAGAS (RAG Assessment) e um framework para avaliar sistemas RAG usando metricas objetivas. | |
| """ | |
| from typing import List, Dict, Optional, Any | |
| from dataclasses import dataclass | |
| import time | |


@dataclass
class RAGEvaluationResult:
    """Result of a single RAG evaluation."""
    query: str
    response: str
    contexts: List[str]
    ground_truth: Optional[str] = None
    # RAGAS metrics
    faithfulness: Optional[float] = None
    answer_relevancy: Optional[float] = None
    context_precision: Optional[float] = None
    context_recall: Optional[float] = None
    # Additional metrics
    response_time: Optional[float] = None
    num_contexts: Optional[int] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert the result to a dictionary."""
        return {
            'query': self.query,
            'response': self.response,
            'contexts': self.contexts,
            'ground_truth': self.ground_truth,
            'faithfulness': self.faithfulness,
            'answer_relevancy': self.answer_relevancy,
            'context_precision': self.context_precision,
            'context_recall': self.context_recall,
            'response_time': self.response_time,
            'num_contexts': self.num_contexts or len(self.contexts)
        }

    def get_overall_score(self) -> float:
        """Compute the overall score (mean of the available metrics)."""
        scores = []
        if self.faithfulness is not None:
            scores.append(self.faithfulness)
        if self.answer_relevancy is not None:
            scores.append(self.answer_relevancy)
        if self.context_precision is not None:
            scores.append(self.context_precision)
        if self.context_recall is not None:
            scores.append(self.context_recall)
        return sum(scores) / len(scores) if scores else 0.0


class RAGEvaluator:
    """Evaluates RAG systems using RAGAS."""

    def __init__(self, use_ragas: bool = True):
        """
        Initialize the evaluator.

        Args:
            use_ragas: If True, use the RAGAS library (requires installation)
        """
        self.use_ragas = use_ragas
        self.ragas_metrics = None

        if use_ragas:
            try:
                from ragas import evaluate
                from ragas.metrics import (
                    faithfulness,
                    answer_relevancy,
                    context_precision,
                    context_recall
                )
                self.ragas_metrics = {
                    'faithfulness': faithfulness,
                    'answer_relevancy': answer_relevancy,
                    'context_precision': context_precision,
                    'context_recall': context_recall
                }
                self.evaluate_fn = evaluate
            except ImportError:
                print("Warning: RAGAS is not installed. Falling back to simplified metrics.")
                print("Install it with: pip install ragas")
                self.use_ragas = False

    def evaluate_single(
        self,
        query: str,
        response: str,
        contexts: List[str],
        ground_truth: Optional[str] = None
    ) -> RAGEvaluationResult:
        """
        Evaluate a single query-response pair.

        Args:
            query: User question
            response: Generated answer
            contexts: Retrieved contexts
            ground_truth: Expected answer (optional)

        Returns:
            Evaluation result
        """
        result = RAGEvaluationResult(
            query=query,
            response=response,
            contexts=contexts,
            ground_truth=ground_truth,
            num_contexts=len(contexts)
        )

        if self.use_ragas and self.ragas_metrics:
            # Evaluate with RAGAS
            result = self._evaluate_with_ragas(result)
        else:
            # Evaluate with simplified metrics
            result = self._evaluate_simple(result)

        return result

    def _evaluate_with_ragas(self, result: RAGEvaluationResult) -> RAGEvaluationResult:
        """Evaluate using the RAGAS library."""
        try:
            from datasets import Dataset

            # Build a single-row dataset
            data = {
                'question': [result.query],
                'answer': [result.response],
                'contexts': [result.contexts],
            }
            if result.ground_truth:
                data['ground_truth'] = [result.ground_truth]
            dataset = Dataset.from_dict(data)

            # Run the evaluation
            eval_result = self.evaluate_fn(
                dataset,
                metrics=list(self.ragas_metrics.values())
            )

            # Extract scores
            if 'faithfulness' in eval_result:
                result.faithfulness = eval_result['faithfulness']
            if 'answer_relevancy' in eval_result:
                result.answer_relevancy = eval_result['answer_relevancy']
            if 'context_precision' in eval_result:
                result.context_precision = eval_result['context_precision']
            if 'context_recall' in eval_result and result.ground_truth:
                result.context_recall = eval_result['context_recall']
        except Exception as e:
            print(f"Error while evaluating with RAGAS: {e}")
            result = self._evaluate_simple(result)

        return result

    def _evaluate_simple(self, result: RAGEvaluationResult) -> RAGEvaluationResult:
        """Evaluate using simplified metrics (no RAGAS)."""
        # Faithfulness: does the response mention information from the contexts?
        result.faithfulness = self._calculate_faithfulness_simple(
            result.response,
            result.contexts
        )

        # Answer relevancy: is the response relevant to the query?
        result.answer_relevancy = self._calculate_relevancy_simple(
            result.query,
            result.response
        )

        # Context precision: are the retrieved contexts relevant?
        result.context_precision = self._calculate_precision_simple(
            result.query,
            result.contexts
        )

        # Context recall: were all relevant contexts retrieved?
        if result.ground_truth:
            result.context_recall = self._calculate_recall_simple(
                result.ground_truth,
                result.contexts
            )

        return result

    def _calculate_faithfulness_simple(
        self,
        response: str,
        contexts: List[str]
    ) -> float:
        """
        Simplified faithfulness.

        Checks whether the response mentions information from the contexts.
        """
        if not contexts:
            return 0.0

        # Count how many response words also appear in the contexts
        response_words = set(response.lower().split())
        context_words = set()
        for ctx in contexts:
            context_words.update(ctx.lower().split())

        if not response_words:
            return 0.0

        # Fraction of response words that appear in the contexts
        overlap = len(response_words & context_words)
        return overlap / len(response_words)

    def _calculate_relevancy_simple(self, query: str, response: str) -> float:
        """
        Simplified relevancy.

        Checks the word overlap between the query and the response.
        """
        query_words = set(query.lower().split())
        response_words = set(response.lower().split())

        if not query_words:
            return 0.0

        overlap = len(query_words & response_words)
        return overlap / len(query_words)

    def _calculate_precision_simple(
        self,
        query: str,
        contexts: List[str]
    ) -> float:
        """
        Simplified precision.

        Checks whether the contexts contain words from the query.
        """
        if not contexts:
            return 0.0

        query_words = set(query.lower().split())
        relevant_contexts = 0
        for ctx in contexts:
            ctx_words = set(ctx.lower().split())
            overlap = len(query_words & ctx_words)
            if overlap > 0:
                relevant_contexts += 1

        return relevant_contexts / len(contexts)

    def _calculate_recall_simple(
        self,
        ground_truth: str,
        contexts: List[str]
    ) -> float:
        """
        Simplified recall.

        Checks whether the contexts contain information from the ground truth.
        """
        if not contexts:
            return 0.0

        ground_truth_words = set(ground_truth.lower().split())
        context_words = set()
        for ctx in contexts:
            context_words.update(ctx.lower().split())

        if not ground_truth_words:
            return 0.0

        overlap = len(ground_truth_words & context_words)
        return overlap / len(ground_truth_words)

    def evaluate_batch(
        self,
        test_cases: List[Dict[str, Any]]
    ) -> List[RAGEvaluationResult]:
        """
        Evaluate multiple test cases.

        Args:
            test_cases: List of dicts with keys: query, response, contexts, ground_truth

        Returns:
            List of results
        """
        results = []
        for i, test_case in enumerate(test_cases):
            print(f"Evaluating case {i+1}/{len(test_cases)}...")

            start_time = time.time()
            result = self.evaluate_single(
                query=test_case['query'],
                response=test_case['response'],
                contexts=test_case['contexts'],
                ground_truth=test_case.get('ground_truth')
            )
            result.response_time = time.time() - start_time

            results.append(result)

        return results

    def generate_report(
        self,
        results: List[RAGEvaluationResult]
    ) -> Dict[str, Any]:
        """
        Generate an evaluation report.

        Args:
            results: List of results

        Returns:
            Dictionary with aggregate statistics
        """
        if not results:
            return {}

        # Compute averages
        faithfulness_scores = [r.faithfulness for r in results if r.faithfulness is not None]
        relevancy_scores = [r.answer_relevancy for r in results if r.answer_relevancy is not None]
        precision_scores = [r.context_precision for r in results if r.context_precision is not None]
        recall_scores = [r.context_recall for r in results if r.context_recall is not None]
        overall_scores = [r.get_overall_score() for r in results]

        report = {
            'total_cases': len(results),
            'average_scores': {
                'faithfulness': sum(faithfulness_scores) / len(faithfulness_scores) if faithfulness_scores else 0.0,
                'answer_relevancy': sum(relevancy_scores) / len(relevancy_scores) if relevancy_scores else 0.0,
                'context_precision': sum(precision_scores) / len(precision_scores) if precision_scores else 0.0,
                'context_recall': sum(recall_scores) / len(recall_scores) if recall_scores else 0.0,
                'overall': sum(overall_scores) / len(overall_scores) if overall_scores else 0.0
            },
            'min_scores': {
                'faithfulness': min(faithfulness_scores) if faithfulness_scores else 0.0,
                'answer_relevancy': min(relevancy_scores) if relevancy_scores else 0.0,
                'context_precision': min(precision_scores) if precision_scores else 0.0,
                'context_recall': min(recall_scores) if recall_scores else 0.0
            },
            'max_scores': {
                'faithfulness': max(faithfulness_scores) if faithfulness_scores else 0.0,
                'answer_relevancy': max(relevancy_scores) if relevancy_scores else 0.0,
                'context_precision': max(precision_scores) if precision_scores else 0.0,
                'context_recall': max(recall_scores) if recall_scores else 0.0
            }
        }

        # Identify the worst cases (for error analysis)
        if overall_scores:
            worst_cases = sorted(
                [(i, score) for i, score in enumerate(overall_scores)],
                key=lambda x: x[1]
            )[:5]  # 5 lowest-scoring cases
            report['worst_cases'] = [
                {
                    'index': idx,
                    'query': results[idx].query,
                    'score': score
                }
                for idx, score in worst_cases
            ]

        return report
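

# --- Minimal usage sketch (illustrative only) --------------------------------
# The test case below is hypothetical example data, not part of the module; it
# only shows how evaluate_batch and generate_report fit together. With
# use_ragas=False the evaluator relies solely on the simplified word-overlap
# metrics, so no RAGAS installation or LLM credentials are needed.
if __name__ == "__main__":
    evaluator = RAGEvaluator(use_ragas=False)

    test_cases = [
        {
            'query': "What is RAG?",
            'response': "RAG combines retrieval with generation to ground answers in documents.",
            'contexts': [
                "RAG (retrieval-augmented generation) retrieves documents and uses them to ground generation."
            ],
            'ground_truth': "RAG augments generation with retrieved documents."
        }
    ]

    results = evaluator.evaluate_batch(test_cases)
    report = evaluator.generate_report(results)
    print(report)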