| """ | |
| Comprehensive Evaluation System for Summarization Models | |
| Implements ROUGE metrics, comparison analysis, and statistical testing | |
| """ | |
# Handle different ROUGE library installations: the `rouge` package provides
# Rouge.get_scores(), while `rouge-score` provides rouge_scorer.RougeScorer.
try:
    from rouge import Rouge
    ROUGE_AVAILABLE = True
    ROUGE_TYPE = "rouge"
except ImportError:
    try:
        from rouge_score import rouge_scorer
        ROUGE_AVAILABLE = True
        ROUGE_TYPE = "rouge_score"
    except ImportError:
        ROUGE_AVAILABLE = False
        ROUGE_TYPE = None
        print("Warning: No ROUGE library found. Install with: pip install rouge-score")
import json
import logging
import time
from typing import Dict, List

import pandas as pd
from scipy import stats

logger = logging.getLogger(__name__)
class SummarizerEvaluator:
    """
    Evaluation system for summarization models.

    Metrics implemented:
    - ROUGE-1: unigram overlap
    - ROUGE-2: bigram overlap
    - ROUGE-L: longest common subsequence

    Additional analysis:
    - Compression ratio (generated length relative to reference length)
    - Processing time
    - Statistical significance testing
    - Model comparison
    """
    def __init__(self):
        """Initialize evaluator with whichever ROUGE backend is installed."""
        if ROUGE_AVAILABLE:
            if ROUGE_TYPE == "rouge":
                self.rouge = Rouge()
                self.rouge_scorer = None
            else:  # rouge_score
                self.rouge = None
                self.rouge_scorer = rouge_scorer.RougeScorer(
                    ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True
                )
            logger.info(f"Evaluator initialized with {ROUGE_TYPE} library")
        else:
            self.rouge = None
            self.rouge_scorer = None
            logger.warning("ROUGE library not available - only basic metrics will be computed")
        self.evaluation_history = []
    def _calculate_rouge_scores(self, generated: str, reference: str) -> Dict:
        """Calculate ROUGE scores using whichever library is available."""
        if not ROUGE_AVAILABLE:
            return {
                'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}
            }
        if ROUGE_TYPE == "rouge":
            # The `rouge` library returns a list with one score dict per input pair
            scores = self.rouge.get_scores(generated, reference)[0]
            return scores
        else:
            # The `rouge_score` library expects (target, prediction) argument order
            # and returns Score namedtuples; normalize them to the same nested-dict
            # format the `rouge` library uses
            scores = self.rouge_scorer.score(reference, generated)
            return {
                'rouge-1': {
                    'f': scores['rouge1'].fmeasure,
                    'p': scores['rouge1'].precision,
                    'r': scores['rouge1'].recall
                },
                'rouge-2': {
                    'f': scores['rouge2'].fmeasure,
                    'p': scores['rouge2'].precision,
                    'r': scores['rouge2'].recall
                },
                'rouge-l': {
                    'f': scores['rougeL'].fmeasure,
                    'p': scores['rougeL'].precision,
                    'r': scores['rougeL'].recall
                }
            }
    def evaluate_single(self,
                        generated: str,
                        reference: str,
                        model_name: str = "Unknown") -> Dict:
        """
        Evaluate a single summary against a reference.

        ROUGE metrics explained:
        - Precision: fraction of the generated summary's n-grams found in the reference
        - Recall: fraction of the reference's n-grams found in the generated summary
        - F1 score: harmonic mean of precision and recall

        Worked example (ROUGE-1): for generated "the cat sat" and reference
        "the cat sat down", precision is 3/3 = 1.00 and recall is 3/4 = 0.75,
        giving F1 = 2 * (1.00 * 0.75) / (1.00 + 0.75) ≈ 0.86.

        Args:
            generated: Generated summary
            reference: Human reference summary
            model_name: Name of the model

        Returns:
            Dictionary containing all metrics
        """
        if not generated or not reference:
            logger.warning("Empty summary or reference provided")
            return self._empty_scores(model_name)
        try:
            # Calculate ROUGE scores
            scores = self._calculate_rouge_scores(generated, reference)
            # Compression ratio: generated length relative to reference length, in words
            reference_length = len(reference.split())
            compression_ratio = (
                len(generated.split()) / reference_length if reference_length > 0 else 0.0
            )
            result = {
                'model_name': model_name,
                'rouge_1_f1': scores['rouge-1']['f'],
                'rouge_1_precision': scores['rouge-1']['p'],
                'rouge_1_recall': scores['rouge-1']['r'],
                'rouge_2_f1': scores['rouge-2']['f'],
                'rouge_2_precision': scores['rouge-2']['p'],
                'rouge_2_recall': scores['rouge-2']['r'],
                'rouge_l_f1': scores['rouge-l']['f'],
                'rouge_l_precision': scores['rouge-l']['p'],
                'rouge_l_recall': scores['rouge-l']['r'],
                'compression_ratio': compression_ratio,
                'generated_length': len(generated.split()),
                'reference_length': reference_length
            }
            return result
        except Exception as e:
            logger.error(f"Error evaluating summary: {e}")
            return self._empty_scores(model_name)
    def _empty_scores(self, model_name: str = "Unknown") -> Dict:
        """Return zeroed scores for empty inputs or evaluation errors."""
        return {
            'model_name': model_name,
            'rouge_1_f1': 0.0,
            'rouge_1_precision': 0.0,
            'rouge_1_recall': 0.0,
            'rouge_2_f1': 0.0,
            'rouge_2_precision': 0.0,
            'rouge_2_recall': 0.0,
            'rouge_l_f1': 0.0,
            'rouge_l_precision': 0.0,
            'rouge_l_recall': 0.0,
            'compression_ratio': 0.0,
            'generated_length': 0,
            'reference_length': 0
        }
    def evaluate_batch(self,
                       generated_summaries: List[str],
                       reference_summaries: List[str],
                       model_name: str = "Unknown") -> Dict:
        """
        Evaluate multiple summaries and aggregate the results.

        Args:
            generated_summaries: List of generated summaries
            reference_summaries: List of reference summaries
            model_name: Name of the model

        Returns:
            Dictionary with aggregated statistics
        """
        assert len(generated_summaries) == len(reference_summaries), \
            "Generated and reference lists must have the same length"
        logger.info(f"Evaluating {len(generated_summaries)} summaries for {model_name}")

        results = []
        for gen, ref in zip(generated_summaries, reference_summaries):
            scores = self.evaluate_single(gen, ref, model_name)
            results.append(scores)

        # Aggregate statistics
        df = pd.DataFrame(results)
        aggregated = {
            'model_name': model_name,
            'num_samples': len(results),
            'rouge_1_f1_mean': df['rouge_1_f1'].mean(),
            'rouge_1_f1_std': df['rouge_1_f1'].std(),
            'rouge_2_f1_mean': df['rouge_2_f1'].mean(),
            'rouge_2_f1_std': df['rouge_2_f1'].std(),
            'rouge_l_f1_mean': df['rouge_l_f1'].mean(),
            'rouge_l_f1_std': df['rouge_l_f1'].std(),
            'compression_ratio_mean': df['compression_ratio'].mean(),
            'compression_ratio_std': df['compression_ratio'].std(),
            'individual_scores': results
        }

        # Store in history
        self.evaluation_history.append(aggregated)
        return aggregated
    def compare_models(self,
                       models_dict: Dict,
                       test_texts: List[str],
                       reference_summaries: List[str],
                       **summarize_kwargs) -> pd.DataFrame:
        """
        Compare multiple models on the same dataset.

        Args:
            models_dict: Dictionary {model_name: model_instance}; each instance
                must expose a summarize(text, **kwargs) method
            test_texts: List of texts to summarize
            reference_summaries: List of reference summaries
            **summarize_kwargs: Additional parameters passed to summarize()

        Returns:
            DataFrame with comparison results
        """
        logger.info(f"Comparing {len(models_dict)} models on {len(test_texts)} texts")
        comparison_results = []

        for model_name, model in models_dict.items():
            logger.info(f"Evaluating {model_name}...")
            start_time = time.time()

            # Generate summaries, substituting an empty string on failure
            generated_summaries = []
            for text in test_texts:
                try:
                    summary = model.summarize(text, **summarize_kwargs)
                    generated_summaries.append(summary)
                except Exception as e:
                    logger.error(f"Error with {model_name}: {e}")
                    generated_summaries.append("")
            total_time = time.time() - start_time

            # Evaluate
            eval_results = self.evaluate_batch(
                generated_summaries,
                reference_summaries,
                model_name
            )

            # Add timing information
            eval_results['total_time'] = total_time
            eval_results['avg_time_per_summary'] = total_time / len(test_texts)
            comparison_results.append(eval_results)

        # Create comparison DataFrame
        df = pd.DataFrame([
            {
                'Model': r['model_name'],
                'ROUGE-1': f"{r['rouge_1_f1_mean']:.4f} ± {r['rouge_1_f1_std']:.4f}",
                'ROUGE-2': f"{r['rouge_2_f1_mean']:.4f} ± {r['rouge_2_f1_std']:.4f}",
                'ROUGE-L': f"{r['rouge_l_f1_mean']:.4f} ± {r['rouge_l_f1_std']:.4f}",
                'Compression': f"{r['compression_ratio_mean']:.2f}x",
                'Avg Time (s)': f"{r['avg_time_per_summary']:.3f}"
            }
            for r in comparison_results
        ])
        logger.info("Model comparison completed")
        return df
    def statistical_significance_test(self,
                                      model1_scores: List[float],
                                      model2_scores: List[float],
                                      test_name: str = "paired t-test") -> Dict:
        """
        Test whether the difference between two models is statistically significant.

        Args:
            model1_scores: Per-sample scores from the first model
            model2_scores: Per-sample scores from the second model
            test_name: "paired t-test" (assumes roughly normal score differences)
                or "wilcoxon" (non-parametric alternative)

        Returns:
            Dictionary with test results
        """
        if test_name == "paired t-test":
            statistic, p_value = stats.ttest_rel(model1_scores, model2_scores)
        elif test_name == "wilcoxon":
            statistic, p_value = stats.wilcoxon(model1_scores, model2_scores)
        else:
            raise ValueError(f"Unknown test: {test_name}")

        is_significant = p_value < 0.05
        return {
            'test_name': test_name,
            'statistic': statistic,
            'p_value': p_value,
            'is_significant': is_significant,
            'significance_level': 0.05,
            'interpretation': (
                f"The difference is "
                f"{'statistically significant' if is_significant else 'not statistically significant'} "
                f"(p={p_value:.4f})"
            )
        }
    def get_detailed_report(self, evaluation_result: Dict) -> str:
        """
        Generate a detailed text report.

        Args:
            evaluation_result: Results from evaluate_batch

        Returns:
            Formatted report string
        """
        report = []
        report.append("=" * 70)
        report.append(f"EVALUATION REPORT: {evaluation_result['model_name']}")
        report.append("=" * 70)
        report.append(f"\nDataset Size: {evaluation_result['num_samples']} samples\n")
        report.append("ROUGE Scores (F1):")
        report.append(f"  ROUGE-1: {evaluation_result['rouge_1_f1_mean']:.4f} (±{evaluation_result['rouge_1_f1_std']:.4f})")
        report.append(f"  ROUGE-2: {evaluation_result['rouge_2_f1_mean']:.4f} (±{evaluation_result['rouge_2_f1_std']:.4f})")
        report.append(f"  ROUGE-L: {evaluation_result['rouge_l_f1_mean']:.4f} (±{evaluation_result['rouge_l_f1_std']:.4f})")
        report.append(f"\nCompression Ratio: {evaluation_result['compression_ratio_mean']:.2f}x")
        report.append(f"  (Standard Deviation: {evaluation_result['compression_ratio_std']:.2f})")
        report.append("\n" + "=" * 70)
        return "\n".join(report)
    def export_results(self,
                       evaluation_result: Dict,
                       filename: str = "evaluation_results.json"):
        """
        Export evaluation results to a JSON file.

        Args:
            evaluation_result: Results to export
            filename: Output filename
        """
        with open(filename, 'w') as f:
            json.dump(evaluation_result, f, indent=2)
        logger.info(f"Results exported to {filename}")
# Test the evaluator
if __name__ == "__main__":
    print("=" * 70)
    print("EVALUATOR SYSTEM TEST")
    print("=" * 70)

    # Sample data
    generated = "Machine learning revolutionizes AI. Neural networks perform complex tasks."
    reference = "Machine learning has transformed artificial intelligence. Deep neural networks can now handle complicated tasks with high accuracy."

    # Initialize evaluator
    evaluator = SummarizerEvaluator()

    # Evaluate single summary
    scores = evaluator.evaluate_single(generated, reference, "TestModel")
    print("\nSingle Summary Evaluation:")
    print(f"ROUGE-1 F1: {scores['rouge_1_f1']:.4f}")
    print(f"ROUGE-2 F1: {scores['rouge_2_f1']:.4f}")
    print(f"ROUGE-L F1: {scores['rouge_l_f1']:.4f}")
    print(f"Compression Ratio: {scores['compression_ratio']:.2f}x")
    # Test batch evaluation (identical pairs repeated, so std will be 0)
    generated_list = [generated] * 5
    reference_list = [reference] * 5
    batch_scores = evaluator.evaluate_batch(generated_list, reference_list, "TestModel")
    print("\n" + evaluator.get_detailed_report(batch_scores))