| """ | |
| Evaluation module for assessing RAG system performance. | |
| Provides metrics like RAGAS, ROUGE, BLEU, and BERTScore. | |
| """ | |
import json
from typing import Any, Dict, List, Optional

import numpy as np

class RAGEvaluator:
    """Evaluator for RAG system performance metrics."""

    def __init__(self):
        """Initialize the evaluator and detect optional metric libraries."""
        self.metrics_available = self._check_available_metrics()
        print("✓ RAG Evaluator initialized")
        print(f"  Available metrics: {', '.join(self.metrics_available)}")
    def _check_available_metrics(self) -> List[str]:
        """Check which evaluation metrics are available.

        Optional dependencies (pip package names): rouge-score, bert-score, ragas.
        """
        available = []
        try:
            from rouge_score import rouge_scorer  # noqa: F401
            available.append("ROUGE")
        except ImportError:
            pass
        try:
            from bert_score import score  # noqa: F401
            available.append("BERTScore")
        except ImportError:
            pass
        try:
            from ragas import evaluate  # noqa: F401
            available.append("RAGAS")
        except ImportError:
            pass
        # Length-based metrics need no extra dependencies
        available.append("Length")
        return available
    def evaluate_response(
        self,
        query: str,
        generated_answer: str,
        reference_answer: Optional[str] = None,
        retrieved_contexts: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """
        Evaluate a single response.

        Args:
            query: Original question
            generated_answer: Generated answer
            reference_answer: Optional ground-truth answer
            retrieved_contexts: Optional retrieved contexts

        Returns:
            Dictionary of evaluation metrics
        """
        results = {
            "query": query,
            "generated_answer": generated_answer,
            "metrics": {}
        }

        # Basic metrics
        results["metrics"]["answer_length"] = len(generated_answer)
        results["metrics"]["word_count"] = len(generated_answer.split())

        # Reference-based metrics (if a reference answer is provided)
        if reference_answer:
            results["reference_answer"] = reference_answer

            # ROUGE scores
            if "ROUGE" in self.metrics_available:
                rouge_scores = self._calculate_rouge(generated_answer, reference_answer)
                results["metrics"]["rouge"] = rouge_scores

            # BERTScore
            if "BERTScore" in self.metrics_available:
                bert_score = self._calculate_bertscore(generated_answer, reference_answer)
                results["metrics"]["bertscore"] = bert_score

        # Context-based metrics (if contexts are provided)
        if retrieved_contexts:
            results["num_contexts"] = len(retrieved_contexts)

            # Calculate answer-context relevance
            if "BERTScore" in self.metrics_available:
                context_relevance = self._calculate_context_relevance(
                    generated_answer,
                    retrieved_contexts
                )
                results["metrics"]["context_relevance"] = context_relevance

        return results
    def _calculate_rouge(
        self,
        generated: str,
        reference: str
    ) -> Dict[str, float]:
        """Calculate ROUGE F-measures (unigram, bigram, and LCS overlap)."""
        try:
            from rouge_score import rouge_scorer
            scorer = rouge_scorer.RougeScorer(
                ['rouge1', 'rouge2', 'rougeL'],
                use_stemmer=True
            )
            # RougeScorer.score expects (target, prediction) in that order
            scores = scorer.score(reference, generated)
            return {
                "rouge1": scores['rouge1'].fmeasure,
                "rouge2": scores['rouge2'].fmeasure,
                "rougeL": scores['rougeL'].fmeasure
            }
        except Exception as e:
            print(f"Error calculating ROUGE: {e}")
            return {}
    def _calculate_bertscore(
        self,
        generated: str,
        reference: str
    ) -> Dict[str, float]:
        """Calculate BERTScore."""
        try:
            from bert_score import score
            P, R, F1 = score(
                [generated],
                [reference],
                lang="en",
                verbose=False
            )
            return {
                "precision": float(P[0]),
                "recall": float(R[0]),
                "f1": float(F1[0])
            }
        except Exception as e:
            print(f"Error calculating BERTScore: {e}")
            return {}
    def _calculate_context_relevance(
        self,
        answer: str,
        contexts: List[str]
    ) -> float:
        """Calculate relevance between answer and contexts using BERTScore."""
        try:
            from bert_score import score
            # Calculate BERTScore between the answer and each context
            scores = []
            for context in contexts:
                _, _, F1 = score(
                    [answer],
                    [context],
                    lang="en",
                    verbose=False
                )
                scores.append(float(F1[0]))
            # Return the average score
            return float(np.mean(scores)) if scores else 0.0
        except Exception as e:
            print(f"Error calculating context relevance: {e}")
            return 0.0
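
    # bert_score.score also accepts equal-length candidate/reference lists and
    # scores them pairwise, so the per-context loop above could be collapsed
    # into a single model pass. A minimal sketch of such a variant follows
    # (hypothetical helper, not part of the original module):
    def _calculate_context_relevance_batched(
        self,
        answer: str,
        contexts: List[str]
    ) -> float:
        """Batched variant of _calculate_context_relevance."""
        try:
            from bert_score import score
            if not contexts:
                return 0.0
            # One call scores the answer against every context; F1 is a tensor
            # with one entry per (candidate, reference) pair
            _, _, F1 = score(
                [answer] * len(contexts),
                contexts,
                lang="en",
                verbose=False
            )
            return float(F1.mean())
        except Exception as e:
            print(f"Error calculating batched context relevance: {e}")
            return 0.0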
    def evaluate_batch(
        self,
        evaluations: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        Evaluate multiple responses and aggregate results.

        Args:
            evaluations: List of evaluation dictionaries with query, generated_answer, etc.

        Returns:
            Aggregated evaluation results with statistics
        """
        print(f"\n{'='*60}")
        print(f"📊 BATCH EVALUATION - {len(evaluations)} responses")
        print(f"{'='*60}\n")

        all_results = []
        for i, eval_data in enumerate(evaluations, 1):
            print(f"[{i}/{len(evaluations)}] Evaluating: {eval_data['query'][:50]}...")
            result = self.evaluate_response(
                query=eval_data["query"],
                generated_answer=eval_data["generated_answer"],
                reference_answer=eval_data.get("reference_answer"),
                retrieved_contexts=eval_data.get("retrieved_contexts")
            )
            all_results.append(result)

        # Aggregate metrics
        aggregated = self._aggregate_metrics(all_results)

        print(f"\n{'='*60}")
        print("✅ BATCH EVALUATION COMPLETE")
        print(f"{'='*60}\n")
        self._print_summary(aggregated)
        return aggregated
    def _aggregate_metrics(
        self,
        results: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """
        Aggregate metrics from multiple evaluations.

        Args:
            results: List of evaluation results

        Returns:
            Aggregated metrics dictionary
        """
        aggregated = {
            "total_evaluations": len(results),
            "metrics": {},
            "individual_results": results
        }

        # Aggregate basic metrics
        lengths = [r["metrics"]["answer_length"] for r in results]
        word_counts = [r["metrics"]["word_count"] for r in results]
        aggregated["metrics"]["avg_answer_length"] = float(np.mean(lengths))
        aggregated["metrics"]["avg_word_count"] = float(np.mean(word_counts))

        # Aggregate ROUGE scores if available (skip empty dicts from failed runs)
        rouge_results = [r["metrics"]["rouge"] for r in results if r["metrics"].get("rouge")]
        if rouge_results:
            aggregated["metrics"]["avg_rouge"] = {
                "rouge1": float(np.mean([r["rouge1"] for r in rouge_results])),
                "rouge2": float(np.mean([r["rouge2"] for r in rouge_results])),
                "rougeL": float(np.mean([r["rougeL"] for r in rouge_results]))
            }

        # Aggregate BERTScore if available (skip empty dicts from failed runs)
        bert_results = [r["metrics"]["bertscore"] for r in results if r["metrics"].get("bertscore")]
        if bert_results:
            aggregated["metrics"]["avg_bertscore"] = {
                "precision": float(np.mean([r["precision"] for r in bert_results])),
                "recall": float(np.mean([r["recall"] for r in bert_results])),
                "f1": float(np.mean([r["f1"] for r in bert_results]))
            }

        # Aggregate context relevance if available
        context_scores = [r["metrics"]["context_relevance"] for r in results
                          if "context_relevance" in r["metrics"]]
        if context_scores:
            aggregated["metrics"]["avg_context_relevance"] = float(np.mean(context_scores))

        return aggregated
    def _print_summary(self, aggregated: Dict[str, Any]) -> None:
        """
        Print a formatted summary of aggregated metrics.

        Args:
            aggregated: Aggregated metrics dictionary
        """
        print("📊 EVALUATION SUMMARY")
        print("-" * 60)
        print(f"Total Evaluations: {aggregated['total_evaluations']}")
        print()

        metrics = aggregated["metrics"]
        print("Basic Metrics:")
        print(f"  Average Answer Length: {metrics.get('avg_answer_length', 0):.1f} characters")
        print(f"  Average Word Count: {metrics.get('avg_word_count', 0):.1f} words")
        print()

        if "avg_rouge" in metrics:
            rouge = metrics["avg_rouge"]
            print("ROUGE Scores:")
            print(f"  ROUGE-1: {rouge['rouge1']:.3f}")
            print(f"  ROUGE-2: {rouge['rouge2']:.3f}")
            print(f"  ROUGE-L: {rouge['rougeL']:.3f}")
            print()

        if "avg_bertscore" in metrics:
            bert = metrics["avg_bertscore"]
            print("BERTScore:")
            print(f"  Precision: {bert['precision']:.3f}")
            print(f"  Recall: {bert['recall']:.3f}")
            print(f"  F1: {bert['f1']:.3f}")
            print()

        if "avg_context_relevance" in metrics:
            print(f"Context Relevance: {metrics['avg_context_relevance']:.3f}")
            print()
    def save_results(
        self,
        results: Dict[str, Any],
        output_path: str
    ) -> None:
        """
        Save evaluation results to a JSON file.

        Args:
            results: Evaluation results dictionary
            output_path: Path to save the results
        """
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2, ensure_ascii=False)
            print(f"✓ Results saved to: {output_path}")
        except Exception as e:
            print(f"❌ Error saving results: {e}")


# Convenience function to create an evaluator
def create_evaluator() -> RAGEvaluator:
    """Create and return a RAG evaluator instance."""
    return RAGEvaluator()


# Example usage
if __name__ == "__main__":
    # Example batch evaluation
    evaluator = create_evaluator()
    sample_evaluations = [
        {
            "query": "What is machine learning?",
            "generated_answer": "Machine learning is a subset of AI that enables systems to learn from data.",
            "reference_answer": "Machine learning is a type of artificial intelligence that allows software applications to learn from data.",
            "retrieved_contexts": ["ML is part of AI...", "Data-driven learning..."]
        },
        {
            "query": "Explain Python programming",
            "generated_answer": "Python is a high-level programming language known for its readability.",
            "reference_answer": "Python is an interpreted, high-level programming language.",
            "retrieved_contexts": ["Python was created...", "Python is versatile..."]
        }
    ]
    results = evaluator.evaluate_batch(sample_evaluations)
    evaluator.save_results(results, "evaluation_results.json")
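
    # Single responses can also be scored directly with the same public API;
    # reference_answer and retrieved_contexts are optional, so only the basic
    # metrics may be produced here.
    single_result = evaluator.evaluate_response(
        query="What is machine learning?",
        generated_answer="Machine learning lets systems learn patterns from data."
    )
    print(single_result["metrics"])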