| """ | |
| Evaluation pipeline for Multilingual News Article Summarizer. | |
| Evaluates model performance on: | |
| - CNN/DailyMail (English) | |
| - MLSUM French | |
| - MLSUM German | |
| Uses 100 test samples per language and calculates ROUGE-1, ROUGE-2, and ROUGE-L scores. | |
| """ | |

import csv
import os
from typing import List, Dict, Any

from datasets import load_dataset
from rouge_score import rouge_scorer

from summariser import Summarizer
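
# Expected interface of summariser.Summarizer, as inferred from its use in
# evaluate_sample below (a sketch of the assumed contract, not a definitive spec):
#
#   result = Summarizer().summarize(article)
#   # result is a dict with at least:
#   #   "error"                 -> error message, or None on success
#   #   "final_summary"         -> generated summary text (str), possibly empty
#   #   "detected_language_ld"  -> detected language code, if available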


class SummarizerEvaluator:
    """Evaluator for multilingual news article summarization."""

    def __init__(self):
        """Initialize the evaluator with required models."""
        print("Initializing Summarizer...")
        self.summarizer = Summarizer()
        self.rouge_scorer = rouge_scorer.RougeScorer(
            ["rouge1", "rouge2", "rougeL"], use_stemmer=True
        )
        print("Evaluator initialized successfully.\n")

    def load_dataset_samples(
        self, dataset_name: str, language: str, num_samples: int = 100
    ) -> List[Dict[str, Any]]:
        """
        Load samples from a dataset using streaming to avoid memory issues.

        Args:
            dataset_name: Name of the dataset to load
            language: Language identifier
            num_samples: Number of samples to load (default: 100)

        Returns:
            List of sample dictionaries with 'article' and 'summary' keys
        """
        print(f"Loading {num_samples} samples from {dataset_name} ({language})...")
        samples = []
        successful_loads = 0

        try:
            # Configure dataset loading
            if dataset_name == "cnn_dailymail":
                dataset = load_dataset(
                    dataset_name, "3.0.0", streaming=True, split="test"
                )
                article_key = "article"
                summary_key = "highlights"
            elif dataset_name == "mlsum":
                # MLSUM needs trust_remote_code=True
                dataset = load_dataset(
                    dataset_name,
                    language,
                    streaming=True,
                    split="test",
                    trust_remote_code=True,
                )
                article_key = "text"
                summary_key = "summary"
            else:
                raise ValueError(f"Unsupported dataset: {dataset_name}")

            # Stream and collect samples until we get exactly num_samples successful ones
            for sample in dataset:
                try:
                    # Extract article and summary
                    article = sample.get(article_key, "").strip()
                    summary = sample.get(summary_key, "").strip()

                    # Skip if either article or summary is empty
                    if not article or not summary:
                        continue

                    samples.append({"article": article, "summary": summary})
                    successful_loads += 1

                    # Progress logging every 10 samples
                    if successful_loads % 10 == 0:
                        print(f" Loaded {successful_loads}/{num_samples} samples...")

                    # Stop when we have enough samples
                    if successful_loads >= num_samples:
                        break
                except Exception as e:
                    print(f" Warning: Skipping sample due to error: {e}")
                    continue

            print(
                f"Successfully loaded {len(samples)} samples from {dataset_name} ({language})\n"
            )
            return samples
        except Exception as e:
            print(f"Error loading dataset {dataset_name} ({language}): {e}")
            return []

    def evaluate_sample(self, article: str, reference_summary: str) -> Dict[str, Any]:
        """
        Evaluate a single article-summary pair.

        Args:
            article: Input article text
            reference_summary: Ground truth summary

        Returns:
            Dictionary with evaluation results
        """
        try:
            # Generate summary using the summarizer
            result = self.summarizer.summarize(article)

            if result["error"]:
                return {
                    "success": False,
                    "error": result["error"],
                    "rouge1_f": 0.0,
                    "rouge2_f": 0.0,
                    "rougeL_f": 0.0,
                    "generated_summary": None,
                    "detected_language": result.get("detected_language_ld"),
                }

            generated_summary = result["final_summary"]
            if not generated_summary:
                return {
                    "success": False,
                    "error": "No summary generated",
                    "rouge1_f": 0.0,
                    "rouge2_f": 0.0,
                    "rougeL_f": 0.0,
                    "generated_summary": None,
                    "detected_language": result.get("detected_language_ld"),
                }

            # Calculate ROUGE scores
            scores = self.rouge_scorer.score(reference_summary, generated_summary)

            return {
                "success": True,
                "error": None,
                "rouge1_f": scores["rouge1"].fmeasure,
                "rouge2_f": scores["rouge2"].fmeasure,
                "rougeL_f": scores["rougeL"].fmeasure,
                "generated_summary": generated_summary,
                "detected_language": result.get("detected_language_ld"),
            }
        except Exception as e:
            return {
                "success": False,
                "error": str(e),
                "rouge1_f": 0.0,
                "rouge2_f": 0.0,
                "rougeL_f": 0.0,
                "generated_summary": None,
                "detected_language": None,
            }

    def evaluate_dataset(
        self, dataset_name: str, language: str, num_samples: int = 25
    ) -> Dict[str, Any]:
        """
        Evaluate summarizer on a complete dataset.

        Args:
            dataset_name: Name of the dataset
            language: Language identifier
            num_samples: Number of samples to evaluate

        Returns:
            Dictionary with aggregated results
        """
        print(f"=== Evaluating {dataset_name} ({language}) ===")

        # Load samples
        samples = self.load_dataset_samples(dataset_name, language, num_samples)
        if not samples:
            return {
                "dataset": dataset_name,
                "language": language,
                "total_samples": 0,
                "successful_evaluations": 0,
                "avg_rouge1_f": 0.0,
                "avg_rouge2_f": 0.0,
                "avg_rougeL_f": 0.0,
                "individual_results": [],
                "error": "Failed to load samples",
            }

        # Evaluate each sample
        individual_results = []
        successful_evaluations = 0
        total_rouge1 = 0.0
        total_rouge2 = 0.0
        total_rougeL = 0.0

        for i, sample in enumerate(samples):
            try:
                # Progress logging every 10 evaluations
                if (i + 1) % 10 == 0:
                    print(f" Evaluating sample {i + 1}/{len(samples)}...")

                # Evaluate single sample
                eval_result = self.evaluate_sample(sample["article"], sample["summary"])

                # Store individual result
                individual_result = {
                    "sample_id": i + 1,
                    "dataset": dataset_name,
                    "language": language,
                    "success": eval_result["success"],
                    "error": eval_result["error"],
                    "rouge1_f": eval_result["rouge1_f"],
                    "rouge2_f": eval_result["rouge2_f"],
                    "rougeL_f": eval_result["rougeL_f"],
                    "detected_language": eval_result["detected_language"],
                    "reference_summary": (
                        sample["summary"][:200] + "..."
                        if len(sample["summary"]) > 200
                        else sample["summary"]
                    ),
                    "generated_summary": (
                        eval_result["generated_summary"][:200] + "..."
                        if eval_result["generated_summary"]
                        and len(eval_result["generated_summary"]) > 200
                        else eval_result["generated_summary"]
                    ),
                }
                individual_results.append(individual_result)

                # Accumulate scores for successful evaluations
                if eval_result["success"]:
                    successful_evaluations += 1
                    total_rouge1 += eval_result["rouge1_f"]
                    total_rouge2 += eval_result["rouge2_f"]
                    total_rougeL += eval_result["rougeL_f"]
            except Exception as e:
                print(f" Warning: Error evaluating sample {i + 1}: {e}")
                individual_results.append(
                    {
                        "sample_id": i + 1,
                        "dataset": dataset_name,
                        "language": language,
                        "success": False,
                        "error": str(e),
                        "rouge1_f": 0.0,
                        "rouge2_f": 0.0,
                        "rougeL_f": 0.0,
                        "detected_language": None,
                        "reference_summary": (
                            sample["summary"][:200] + "..."
                            if len(sample["summary"]) > 200
                            else sample["summary"]
                        ),
                        "generated_summary": None,
                    }
                )
                continue

        # Calculate averages
        avg_rouge1 = (
            total_rouge1 / successful_evaluations if successful_evaluations > 0 else 0.0
        )
        avg_rouge2 = (
            total_rouge2 / successful_evaluations if successful_evaluations > 0 else 0.0
        )
        avg_rougeL = (
            total_rougeL / successful_evaluations if successful_evaluations > 0 else 0.0
        )

        print(
            f"Completed evaluation: {successful_evaluations}/{len(samples)} successful"
        )
        print(f"Average ROUGE-1: {avg_rouge1:.4f}")
        print(f"Average ROUGE-2: {avg_rouge2:.4f}")
        print(f"Average ROUGE-L: {avg_rougeL:.4f}\n")

        return {
            "dataset": dataset_name,
            "language": language,
            "total_samples": len(samples),
            "successful_evaluations": successful_evaluations,
            "avg_rouge1_f": avg_rouge1,
            "avg_rouge2_f": avg_rouge2,
            "avg_rougeL_f": avg_rougeL,
            "individual_results": individual_results,
            "error": None,
        }

    def run_full_evaluation(self, num_samples: int = 25) -> str:
        """
        Run complete evaluation pipeline on all datasets.

        Args:
            num_samples: Number of samples per dataset

        Returns:
            Path to the saved results CSV file
        """
        print("Starting Full Multilingual Evaluation Pipeline\n")

        # Define datasets to evaluate
        datasets_config = [
            {"dataset_name": "cnn_dailymail", "language": "en"},
            {"dataset_name": "mlsum", "language": "fr"},
            {"dataset_name": "mlsum", "language": "de"},
        ]

        all_summary_results = []

        # Evaluate each dataset
        for config in datasets_config:
            dataset_result = self.evaluate_dataset(
                config["dataset_name"], config["language"], num_samples
            )

            # Save individual CSV immediately after each language evaluation
            individual_csv_path = self.save_individual_results_to_csv(
                dataset_result["individual_results"],
                dataset_result["dataset"],
                dataset_result["language"],
            )
            print(
                f"Saved {dataset_result['language'].upper()} results to: {individual_csv_path}"
            )

            # Store summary results
            all_summary_results.append(
                {
                    "dataset": dataset_result["dataset"],
                    "language": dataset_result["language"],
                    "total_samples": dataset_result["total_samples"],
                    "successful_evaluations": dataset_result["successful_evaluations"],
                    "success_rate": (
                        dataset_result["successful_evaluations"]
                        / dataset_result["total_samples"]
                        if dataset_result["total_samples"] > 0
                        else 0.0
                    ),
                    "avg_rouge1_f": dataset_result["avg_rouge1_f"],
                    "avg_rouge2_f": dataset_result["avg_rouge2_f"],
                    "avg_rougeL_f": dataset_result["avg_rougeL_f"],
                    "error": dataset_result["error"],
                }
            )

        # Save combined summary CSV
        summary_csv_path = self.save_summary_results_to_csv(all_summary_results)

        # Print final summary
        print("FINAL EVALUATION SUMMARY")
        print("=" * 50)
        for result in all_summary_results:
            print(f"{result['dataset']} ({result['language'].upper()}):")
            print(
                f" Success Rate: {result['success_rate']:.1%} ({result['successful_evaluations']}/{result['total_samples']})"
            )
            print(f" ROUGE-1: {result['avg_rouge1_f']:.4f}")
            print(f" ROUGE-2: {result['avg_rouge2_f']:.4f}")
            print(f" ROUGE-L: {result['avg_rougeL_f']:.4f}")
            print()

        print(f"Combined summary saved to: {summary_csv_path}")
        return summary_csv_path

    def save_individual_results_to_csv(
        self, individual_results: List[Dict], dataset_name: str, language: str
    ) -> str:
        """
        Save individual evaluation results to a language-specific CSV file with summary row.

        Args:
            individual_results: List of individual sample results
            dataset_name: Name of the dataset
            language: Language identifier

        Returns:
            Path to the saved CSV file
        """
        # Create filename
        csv_path = f"evaluation_results_{language}.csv"

        with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
            if individual_results:
                fieldnames = individual_results[0].keys()
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(individual_results)

                # Add summary row
                successful_results = [r for r in individual_results if r["success"]]
                if successful_results:
                    avg_rouge1 = sum(r["rouge1_f"] for r in successful_results) / len(
                        successful_results
                    )
                    avg_rouge2 = sum(r["rouge2_f"] for r in successful_results) / len(
                        successful_results
                    )
                    avg_rougeL = sum(r["rougeL_f"] for r in successful_results) / len(
                        successful_results
                    )
                    summary_row = {
                        "sample_id": "SUMMARY",
                        "dataset": dataset_name,
                        "language": language,
                        "success": f"{len(successful_results)}/{len(individual_results)}",
                        "error": None,
                        "rouge1_f": avg_rouge1,
                        "rouge2_f": avg_rouge2,
                        "rougeL_f": avg_rougeL,
                        "detected_language": None,
                        "reference_summary": "AVERAGE SCORES",
                        "generated_summary": f"Success Rate: {len(successful_results)/len(individual_results):.1%}",
                    }
                    writer.writerow(summary_row)

        return csv_path
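
    # The per-language CSV columns follow the key order of each individual result
    # dict built in evaluate_dataset: sample_id, dataset, language, success, error,
    # rouge1_f, rouge2_f, rougeL_f, detected_language, reference_summary,
    # generated_summary. The trailing SUMMARY row reuses these columns to carry
    # the average scores and overall success rate.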

    def save_summary_results_to_csv(self, summary_results: List[Dict]) -> str:
        """
        Save aggregated summary results to CSV file.

        Args:
            summary_results: List of aggregated dataset results

        Returns:
            Path to the saved CSV file
        """
        summary_csv_path = "evaluation_results_summary.csv"

        with open(summary_csv_path, "w", newline="", encoding="utf-8") as csvfile:
            if summary_results:
                fieldnames = summary_results[0].keys()
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(summary_results)

        return summary_csv_path


def main():
    """Main evaluation function."""
    try:
        evaluator = SummarizerEvaluator()
        csv_path = evaluator.run_full_evaluation(num_samples=25)

        print("\nEvaluation completed successfully!")
        print(f"Results available in: {csv_path}")
    except Exception as e:
        print(f"Evaluation failed: {e}")
        import traceback

        traceback.print_exc()


if __name__ == "__main__":
    main()
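
# A minimal sketch of interactive use, assuming this file is importable as a module
# (the module name "evaluation" below is an assumption) and that the summariser
# package and its models are available in the environment:
#
#   from evaluation import SummarizerEvaluator
#   evaluator = SummarizerEvaluator()
#   result = evaluator.evaluate_dataset("mlsum", "fr", num_samples=5)
#   print(result["avg_rouge1_f"], result["successful_evaluations"])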