""" Evaluation module for assessing RAG system performance. Provides metrics like RAGAS, ROUGE, BLEU, and BERTScore. """ from typing import List, Dict, Any, Optional import numpy as np class RAGEvaluator: """Evaluator for RAG system performance metrics.""" def __init__(self): """Initialize the evaluator with optional metric libraries.""" self.metrics_available = self._check_available_metrics() print(f"✓ RAG Evaluator initialized") print(f" Available metrics: {', '.join(self.metrics_available)}") def _check_available_metrics(self) -> List[str]: """Check which evaluation metrics are available.""" available = [] try: from rouge_score import rouge_scorer available.append("ROUGE") except ImportError: pass try: from bert_score import score available.append("BERTScore") except ImportError: pass try: from ragas import evaluate available.append("RAGAS") except ImportError: pass # Basic metrics are always available available.extend(["Length", "Similarity"]) return available def evaluate_response( self, query: str, generated_answer: str, reference_answer: Optional[str] = None, retrieved_contexts: Optional[List[str]] = None ) -> Dict[str, Any]: """ Evaluate a single response. Args: query: Original question generated_answer: Generated answer reference_answer: Optional ground truth answer retrieved_contexts: Optional retrieved contexts Returns: Dictionary of evaluation metrics """ results = { "query": query, "generated_answer": generated_answer, "metrics": {} } # Basic metrics results["metrics"]["answer_length"] = len(generated_answer) results["metrics"]["word_count"] = len(generated_answer.split()) # Reference-based metrics (if reference answer provided) if reference_answer: results["reference_answer"] = reference_answer # ROUGE scores if "ROUGE" in self.metrics_available: rouge_scores = self._calculate_rouge(generated_answer, reference_answer) results["metrics"]["rouge"] = rouge_scores # BERTScore if "BERTScore" in self.metrics_available: bert_score = self._calculate_bertscore(generated_answer, reference_answer) results["metrics"]["bertscore"] = bert_score # Context-based metrics (if contexts provided) if retrieved_contexts: results["num_contexts"] = len(retrieved_contexts) # Calculate answer-context relevance if "BERTScore" in self.metrics_available: context_relevance = self._calculate_context_relevance( generated_answer, retrieved_contexts ) results["metrics"]["context_relevance"] = context_relevance return results def _calculate_rouge( self, generated: str, reference: str ) -> Dict[str, float]: """Calculate ROUGE scores.""" try: from rouge_score import rouge_scorer scorer = rouge_scorer.RougeScorer( ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True ) scores = scorer.score(reference, generated) return { "rouge1": scores['rouge1'].fmeasure, "rouge2": scores['rouge2'].fmeasure, "rougeL": scores['rougeL'].fmeasure } except Exception as e: print(f"Error calculating ROUGE: {e}") return {} def _calculate_bertscore( self, generated: str, reference: str ) -> Dict[str, float]: """Calculate BERTScore.""" try: from bert_score import score P, R, F1 = score( [generated], [reference], lang="en", verbose=False ) return { "precision": float(P[0]), "recall": float(R[0]), "f1": float(F1[0]) } except Exception as e: print(f"Error calculating BERTScore: {e}") return {} def _calculate_context_relevance( self, answer: str, contexts: List[str] ) -> float: """Calculate relevance between answer and contexts using BERTScore.""" try: from bert_score import score # Calculate BERTScore between answer and each context scores = [] 
            for context in contexts:
                _, _, F1 = score(
                    [answer],
                    [context],
                    lang="en",
                    verbose=False
                )
                scores.append(float(F1[0]))

            # Return average score
            return float(np.mean(scores)) if scores else 0.0
        except Exception as e:
            print(f"Error calculating context relevance: {e}")
            return 0.0

    def evaluate_batch(
        self,
        evaluations: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        Evaluate multiple responses and aggregate results.

        Args:
            evaluations: List of evaluation dictionaries with query,
                generated_answer, etc.

        Returns:
            Aggregated evaluation results with statistics
        """
        print(f"\n{'='*60}")
        print(f"📊 BATCH EVALUATION - {len(evaluations)} responses")
        print(f"{'='*60}\n")

        all_results = []

        for i, eval_data in enumerate(evaluations, 1):
            print(f"[{i}/{len(evaluations)}] Evaluating: {eval_data['query'][:50]}...")

            result = self.evaluate_response(
                query=eval_data["query"],
                generated_answer=eval_data["generated_answer"],
                reference_answer=eval_data.get("reference_answer"),
                retrieved_contexts=eval_data.get("retrieved_contexts")
            )
            all_results.append(result)

        # Aggregate metrics
        aggregated = self._aggregate_metrics(all_results)

        print(f"\n{'='*60}")
        print("✅ BATCH EVALUATION COMPLETE")
        print(f"{'='*60}\n")

        self._print_summary(aggregated)

        return aggregated

    def _aggregate_metrics(
        self,
        results: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        Aggregate metrics from multiple evaluations.

        Args:
            results: List of evaluation results

        Returns:
            Aggregated metrics dictionary
        """
        aggregated = {
            "total_evaluations": len(results),
            "metrics": {},
            "individual_results": results
        }

        # Aggregate basic metrics
        lengths = [r["metrics"]["answer_length"] for r in results]
        word_counts = [r["metrics"]["word_count"] for r in results]
        aggregated["metrics"]["avg_answer_length"] = float(np.mean(lengths))
        aggregated["metrics"]["avg_word_count"] = float(np.mean(word_counts))

        # Aggregate ROUGE scores if available (skip entries where the
        # calculation failed and returned an empty dict)
        rouge_results = [
            r["metrics"]["rouge"] for r in results if r["metrics"].get("rouge")
        ]
        if rouge_results:
            rouge_agg = {
                "rouge1": float(np.mean([r["rouge1"] for r in rouge_results])),
                "rouge2": float(np.mean([r["rouge2"] for r in rouge_results])),
                "rougeL": float(np.mean([r["rougeL"] for r in rouge_results]))
            }
            aggregated["metrics"]["avg_rouge"] = rouge_agg

        # Aggregate BERTScore if available (same empty-dict guard as above)
        bert_results = [
            r["metrics"]["bertscore"] for r in results if r["metrics"].get("bertscore")
        ]
        if bert_results:
            bert_agg = {
                "precision": float(np.mean([r["precision"] for r in bert_results])),
                "recall": float(np.mean([r["recall"] for r in bert_results])),
                "f1": float(np.mean([r["f1"] for r in bert_results]))
            }
            aggregated["metrics"]["avg_bertscore"] = bert_agg

        # Aggregate context relevance if available
        context_scores = [
            r["metrics"]["context_relevance"]
            for r in results
            if "context_relevance" in r["metrics"]
        ]
        if context_scores:
            aggregated["metrics"]["avg_context_relevance"] = float(np.mean(context_scores))

        return aggregated

    def _print_summary(self, aggregated: Dict[str, Any]) -> None:
        """
        Print a formatted summary of aggregated metrics.

        Args:
            aggregated: Aggregated metrics dictionary
        """
        print("📊 EVALUATION SUMMARY")
        print("-" * 60)
        print(f"Total Evaluations: {aggregated['total_evaluations']}")
        print()

        metrics = aggregated["metrics"]

        print("Basic Metrics:")
        print(f"  Average Answer Length: {metrics.get('avg_answer_length', 0):.1f} characters")
        print(f"  Average Word Count: {metrics.get('avg_word_count', 0):.1f} words")
        print()

        if "avg_rouge" in metrics:
            rouge = metrics["avg_rouge"]
            print("ROUGE Scores:")
            print(f"  ROUGE-1: {rouge['rouge1']:.3f}")
            print(f"  ROUGE-2: {rouge['rouge2']:.3f}")
            print(f"  ROUGE-L: {rouge['rougeL']:.3f}")
            print()

        if "avg_bertscore" in metrics:
            bert = metrics["avg_bertscore"]
            print("BERTScore:")
            print(f"  Precision: {bert['precision']:.3f}")
            print(f"  Recall: {bert['recall']:.3f}")
            print(f"  F1: {bert['f1']:.3f}")
            print()

        if "avg_context_relevance" in metrics:
            print(f"Context Relevance: {metrics['avg_context_relevance']:.3f}")
            print()

    def save_results(
        self,
        results: Dict[str, Any],
        output_path: str
    ) -> None:
        """
        Save evaluation results to a JSON file.

        Args:
            results: Evaluation results dictionary
            output_path: Path to save the results
        """
        import json

        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)
            print(f"✓ Results saved to: {output_path}")
        except Exception as e:
            print(f"❌ Error saving results: {e}")


# Convenience function to create evaluator
def create_evaluator() -> RAGEvaluator:
    """Create and return a RAG evaluator instance."""
    return RAGEvaluator()


# Example usage
if __name__ == "__main__":
    # Example batch evaluation
    evaluator = create_evaluator()

    sample_evaluations = [
        {
            "query": "What is machine learning?",
            "generated_answer": "Machine learning is a subset of AI that enables systems to learn from data.",
            "reference_answer": "Machine learning is a type of artificial intelligence that allows software applications to learn from data.",
            "retrieved_contexts": ["ML is part of AI...", "Data-driven learning..."]
        },
        {
            "query": "Explain Python programming",
            "generated_answer": "Python is a high-level programming language known for its readability.",
            "reference_answer": "Python is an interpreted, high-level programming language.",
            "retrieved_contexts": ["Python was created...", "Python is versatile..."]
        }
    ]

    results = evaluator.evaluate_batch(sample_evaluations)
    evaluator.save_results(results, "evaluation_results.json")
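
    # Additional illustrative sketch: evaluating a single response without a
    # reference answer or retrieved contexts. In that case only the basic
    # length/word-count metrics are populated; ROUGE and BERTScore fields
    # appear only when a reference is supplied and the libraries are installed.
    # The query and answer strings below are placeholders, not real data.
    single_result = evaluator.evaluate_response(
        query="What is retrieval-augmented generation?",
        generated_answer=(
            "Retrieval-augmented generation retrieves relevant documents and "
            "passes them to a language model to ground its answer."
        ),
    )
    print(f"Single-response metrics: {single_result['metrics']}")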