Spaces:
Running
Running
Add comprehensive CNN/DailyMail evaluation system - dataset loading, model evaluation, topic analysis, and comparison
Browse files- .ipynb_checkpoints/railway-checkpoint.json +10 -0
- EVALUATION_GUIDE.md +174 -0
- evaluation/__init__.py +1 -0
- evaluation/dataset_loader.py +140 -0
- evaluation/model_evaluator.py +226 -0
- evaluation/results_analyzer.py +202 -0
- evaluation/run_evaluation.py +123 -0
- notebooks/.ipynb_checkpoints/01_data_exploration-checkpoint.ipynb +0 -0
- notebooks/.ipynb_checkpoints/02_model_testing-checkpoint.ipynb +0 -0
- notebooks/.ipynb_checkpoints/03_evaluation_analysis-checkpoint.ipynb +478 -0
- notebooks/.ipynb_checkpoints/03_evaluation_analysis_cnn_dailymail-checkpoint.ipynb +0 -0
- notebooks/.ipynb_checkpoints/Smart-Summarizer-checkpoint.ipynb +6 -0
- notebooks/01_data_exploration.ipynb +1 -1
- notebooks/02_model_testing.ipynb +0 -0
- notebooks/03_evaluation_analysis.ipynb +1 -1
- notebooks/03_evaluation_analysis_cnn_dailymail.ipynb +0 -0
- notebooks/Smart-Summarizer.ipynb +0 -0
- results/cnn_dailymail_evaluation_export.json +109 -0
- results/cnn_dailymail_evaluation_results.csv +4 -0
- results/cnn_dailymail_report_summary.md +29 -0
- run_evaluation.py +130 -0
.ipynb_checkpoints/railway-checkpoint.json
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"build": {
|
| 3 |
+
"builder": "NIXPACKS"
|
| 4 |
+
},
|
| 5 |
+
"deploy": {
|
| 6 |
+
"startCommand": "cd webapp && gunicorn app:app --bind 0.0.0.0:$PORT --timeout 120 --workers 2",
|
| 7 |
+
"restartPolicyType": "ON_FAILURE",
|
| 8 |
+
"restartPolicyMaxRetries": 10
|
| 9 |
+
}
|
| 10 |
+
}
|
EVALUATION_GUIDE.md
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Model Evaluation Guide
|
| 2 |
+
|
| 3 |
+
This guide explains how to run comprehensive evaluation of the summarization models using the CNN/DailyMail dataset.
|
| 4 |
+
|
| 5 |
+
## Quick Start
|
| 6 |
+
|
| 7 |
+
### 1. Install Dependencies
|
| 8 |
+
```bash
|
| 9 |
+
pip install -r requirements.txt
|
| 10 |
+
```
|
| 11 |
+
|
| 12 |
+
### 2. Run Evaluation
|
| 13 |
+
```bash
|
| 14 |
+
python run_evaluation.py
|
| 15 |
+
```
|
| 16 |
+
|
| 17 |
+
This will:
|
| 18 |
+
- Download CNN/DailyMail dataset
|
| 19 |
+
- Evaluate all three models (TextRank, BART, PEGASUS)
|
| 20 |
+
- Generate comparison reports and visualizations
|
| 21 |
+
- Save results to `evaluation_results/` directory
|
| 22 |
+
|
| 23 |
+
## What Gets Evaluated
|
| 24 |
+
|
| 25 |
+
### Models
|
| 26 |
+
- **TextRank**: Extractive summarization using graph-based ranking
|
| 27 |
+
- **BART**: Abstractive summarization using transformer encoder-decoder
|
| 28 |
+
- **PEGASUS**: Abstractive summarization specialized for summarization tasks
|
| 29 |
+
|
| 30 |
+
### Metrics
|
| 31 |
+
- **ROUGE-1**: Overlap of unigrams between generated and reference summaries
|
| 32 |
+
- **ROUGE-2**: Overlap of bigrams between generated and reference summaries
|
| 33 |
+
- **ROUGE-L**: Longest common subsequence between generated and reference summaries
|
| 34 |
+
- **Processing Time**: Average time to generate each summary
|
| 35 |
+
|
| 36 |
+
### Topic Categories
|
| 37 |
+
Articles are automatically categorized into:
|
| 38 |
+
- Politics
|
| 39 |
+
- Business
|
| 40 |
+
- Technology
|
| 41 |
+
- Sports
|
| 42 |
+
- Health
|
| 43 |
+
- Entertainment
|
| 44 |
+
- Other
|
| 45 |
+
|
| 46 |
+
## Advanced Usage
|
| 47 |
+
|
| 48 |
+
### Custom Evaluation
|
| 49 |
+
```bash
|
| 50 |
+
# Evaluate specific number of samples
|
| 51 |
+
python evaluation/run_evaluation.py --samples 200
|
| 52 |
+
|
| 53 |
+
# Evaluate by topic categories
|
| 54 |
+
python evaluation/run_evaluation.py --by-topic --samples 100
|
| 55 |
+
|
| 56 |
+
# Evaluate specific models only
|
| 57 |
+
python evaluation/run_evaluation.py --models textrank bart --samples 50
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### Using Individual Components
|
| 61 |
+
|
| 62 |
+
#### Load Dataset
|
| 63 |
+
```python
|
| 64 |
+
from evaluation.dataset_loader import CNNDailyMailLoader
|
| 65 |
+
|
| 66 |
+
loader = CNNDailyMailLoader()
|
| 67 |
+
dataset = loader.load_dataset()
|
| 68 |
+
eval_data = loader.create_evaluation_subset(size=100)
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
#### Evaluate Single Model
|
| 72 |
+
```python
|
| 73 |
+
from evaluation.model_evaluator import ModelEvaluator
|
| 74 |
+
|
| 75 |
+
evaluator = ModelEvaluator()
|
| 76 |
+
evaluator.initialize_models()
|
| 77 |
+
results = evaluator.evaluate_single_model('bart', eval_data, max_samples=50)
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
#### Analyze Results
|
| 81 |
+
```python
|
| 82 |
+
from evaluation.results_analyzer import ResultsAnalyzer
|
| 83 |
+
|
| 84 |
+
analyzer = ResultsAnalyzer()
|
| 85 |
+
analyzer.create_performance_charts(results, 'output_dir')
|
| 86 |
+
analyzer.create_detailed_report(results, 'output_dir')
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
## Output Files
|
| 90 |
+
|
| 91 |
+
After running evaluation, you'll find these files in `evaluation_results/`:
|
| 92 |
+
|
| 93 |
+
### Data Files
|
| 94 |
+
- `eval_data.json` - Evaluation dataset
|
| 95 |
+
- `data_[topic].json` - Topic-specific datasets
|
| 96 |
+
|
| 97 |
+
### Results Files
|
| 98 |
+
- `results_overall.json` - Detailed evaluation results
|
| 99 |
+
- `comparison_overall.csv` - Summary comparison table
|
| 100 |
+
- `results_[topic].json` - Topic-specific results
|
| 101 |
+
|
| 102 |
+
### Visualizations
|
| 103 |
+
- `performance_comparison.png` - Model performance charts
|
| 104 |
+
- `topic_performance_heatmap.png` - Topic analysis heatmap
|
| 105 |
+
|
| 106 |
+
### Reports
|
| 107 |
+
- `evaluation_report.md` - Detailed evaluation report
|
| 108 |
+
- `topic_summary.csv` - Topic performance breakdown
|
| 109 |
+
|
| 110 |
+
## Understanding Results
|
| 111 |
+
|
| 112 |
+
### ROUGE Scores
|
| 113 |
+
- **Higher is better** (range: 0.0 to 1.0)
|
| 114 |
+
- ROUGE-1: Measures content overlap
|
| 115 |
+
- ROUGE-2: Measures fluency and coherence
|
| 116 |
+
- ROUGE-L: Measures structural similarity
|
| 117 |
+
|
| 118 |
+
### Processing Time
|
| 119 |
+
- **Lower is better**
|
| 120 |
+
- Measured in seconds per summary
|
| 121 |
+
- Important for real-time applications
|
| 122 |
+
|
| 123 |
+
### Model Characteristics
|
| 124 |
+
- **TextRank**: Fast, extractive, good for quick summaries
|
| 125 |
+
- **BART**: Balanced performance, good fluency
|
| 126 |
+
- **PEGASUS**: Best quality, slower processing
|
| 127 |
+
|
| 128 |
+
## Troubleshooting
|
| 129 |
+
|
| 130 |
+
### Memory Issues
|
| 131 |
+
If you encounter memory issues:
|
| 132 |
+
```bash
|
| 133 |
+
# Reduce sample size
|
| 134 |
+
python run_evaluation.py --samples 20
|
| 135 |
+
|
| 136 |
+
# Evaluate models individually
|
| 137 |
+
python evaluation/run_evaluation.py --models textrank --samples 50
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
### Dataset Download Issues
|
| 141 |
+
The CNN/DailyMail dataset is large (~1.3GB). Ensure you have:
|
| 142 |
+
- Stable internet connection
|
| 143 |
+
- Sufficient disk space
|
| 144 |
+
- Proper HuggingFace datasets cache directory
|
| 145 |
+
|
| 146 |
+
### Model Loading Issues
|
| 147 |
+
If models fail to load:
|
| 148 |
+
- Check PyTorch installation
|
| 149 |
+
- Verify transformers library version
|
| 150 |
+
- Ensure sufficient RAM (8GB+ recommended)
|
| 151 |
+
|
| 152 |
+
## Configuration
|
| 153 |
+
|
| 154 |
+
### Sample Sizes
|
| 155 |
+
- **Development**: 20-50 samples
|
| 156 |
+
- **Testing**: 100-200 samples
|
| 157 |
+
- **Full evaluation**: 500+ samples
|
| 158 |
+
|
| 159 |
+
### Topic Evaluation
|
| 160 |
+
Minimum 5 articles per topic for meaningful results. Topics with fewer articles are skipped.
|
| 161 |
+
|
| 162 |
+
## Performance Expectations
|
| 163 |
+
|
| 164 |
+
### Processing Times (CPU)
|
| 165 |
+
- TextRank: ~0.1 seconds per summary
|
| 166 |
+
- BART: ~10-15 seconds per summary
|
| 167 |
+
- PEGASUS: ~8-12 seconds per summary
|
| 168 |
+
|
| 169 |
+
### Typical ROUGE Scores
|
| 170 |
+
- TextRank: ROUGE-1 ~0.35, ROUGE-2 ~0.15
|
| 171 |
+
- BART: ROUGE-1 ~0.42, ROUGE-2 ~0.20
|
| 172 |
+
- PEGASUS: ROUGE-1 ~0.44, ROUGE-2 ~0.21
|
| 173 |
+
|
| 174 |
+
Results may vary based on dataset and configuration.
|
evaluation/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# Evaluation package for Smart Summarizer
|
evaluation/dataset_loader.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Dataset Loader for CNN/DailyMail Dataset
|
| 3 |
+
Handles loading, splitting, and preprocessing of evaluation data
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from datasets import load_dataset
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
from typing import Dict, List, Tuple
|
| 11 |
+
import logging
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
class CNNDailyMailLoader:
    """Load and manage the CNN/DailyMail dataset for summarization evaluation.

    Downloads the dataset from HuggingFace on demand, caches evaluation
    subsets as JSON files under ``cache_dir``, and provides keyword-based
    topic categorization of articles.
    """

    def __init__(self, cache_dir: str = "data/cache"):
        # Directory for cached evaluation subsets; created eagerly so
        # save_evaluation_data never fails on a missing directory.
        self.cache_dir = cache_dir
        self.dataset = None  # lazily populated by load_dataset()
        os.makedirs(cache_dir, exist_ok=True)

    def load_dataset(self, version: str = "3.0.0") -> Dict:
        """Load the CNN/DailyMail dataset from HuggingFace.

        Args:
            version: Dataset config version string (e.g. "3.0.0").

        Returns:
            The loaded dataset (with train/validation/test splits).

        Raises:
            Exception: Re-raises any download/load failure after logging it.
        """
        logger.info(f"Loading CNN/DailyMail dataset version {version}")

        try:
            self.dataset = load_dataset("abisee/cnn_dailymail", version)
            logger.info("Dataset loaded successfully")
            return self.dataset
        except Exception as e:
            logger.error(f"Failed to load dataset: {e}")
            raise

    def get_splits(self) -> Tuple[List[Dict], List[Dict], List[Dict]]:
        """Return the (train, validation, test) splits as lists of dicts."""
        # `is None` rather than truthiness: an already-loaded (even empty)
        # dataset must not trigger a second download.
        if self.dataset is None:
            self.load_dataset()

        train_data = list(self.dataset['train'])
        val_data = list(self.dataset['validation'])
        test_data = list(self.dataset['test'])

        logger.info(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")
        return train_data, val_data, test_data

    def create_evaluation_subset(self, split: str = "test", size: int = 100) -> List[Dict]:
        """Create a smaller, cleaned subset of one split for evaluation.

        Args:
            split: Split name ('train', 'validation', or 'test').
            size: Maximum number of items to include (takes the first
                *size* items in split order).

        Returns:
            List of dicts with 'id', 'article', 'highlights', 'url' keys.
        """
        if self.dataset is None:
            self.load_dataset()

        subset = list(self.dataset[split])[:size]

        # Normalize each record to a fixed schema; optional fields
        # default to empty strings.
        return [
            {
                'id': item.get('id', ''),
                'article': item['article'],
                'highlights': item['highlights'],
                'url': item.get('url', ''),
            }
            for item in subset
        ]

    def save_evaluation_data(self, data: List[Dict], filename: str):
        """Save evaluation data as pretty-printed JSON in the cache dir."""
        filepath = os.path.join(self.cache_dir, filename)
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved {len(data)} items to {filepath}")

    def load_evaluation_data(self, filename: str) -> List[Dict]:
        """Load evaluation data from the cache dir; [] if the file is missing."""
        filepath = os.path.join(self.cache_dir, filename)
        if not os.path.exists(filepath):
            logger.warning(f"File not found: {filepath}")
            return []

        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        logger.info(f"Loaded {len(data)} items from {filepath}")
        return data

    def get_topic_categories(self) -> Dict[str, List[str]]:
        """Return the keyword lists that define each topic category."""
        # NOTE(review): the 'AI' keyword can never match because articles
        # are lowercased before matching in categorize_by_topic; lowering
        # it to 'ai' would over-match ("said", "against") — needs a
        # word-boundary match to fix properly.
        return {
            'politics': ['election', 'government', 'president', 'congress', 'senate', 'political'],
            'business': ['company', 'market', 'stock', 'economy', 'financial', 'business'],
            'technology': ['tech', 'computer', 'software', 'internet', 'digital', 'AI'],
            'sports': ['game', 'team', 'player', 'sport', 'match', 'championship'],
            'health': ['medical', 'health', 'doctor', 'hospital', 'disease', 'treatment'],
            'entertainment': ['movie', 'actor', 'celebrity', 'film', 'music', 'entertainment'],
        }

    def categorize_by_topic(self, data: List[Dict]) -> Dict[str, List[Dict]]:
        """Assign each article to the first topic whose keywords match.

        Matching is a case-insensitive substring test against the full
        article text; articles matching no topic fall into 'other'.
        Dict insertion order of get_topic_categories() defines priority.
        """
        categories = self.get_topic_categories()
        categorized = {topic: [] for topic in categories}
        categorized['other'] = []

        for item in data:
            article_text = item['article'].lower()

            # First matching topic wins; fall through to 'other'.
            for topic, keywords in categories.items():
                if any(keyword in article_text for keyword in keywords):
                    categorized[topic].append(item)
                    break
            else:
                categorized['other'].append(item)

        # Log the resulting distribution for sanity-checking.
        for topic, items in categorized.items():
            logger.info(f"{topic}: {len(items)} articles")

        return categorized
if __name__ == "__main__":
    # Demo: build a 200-article evaluation subset, group it by topic,
    # and cache everything as JSON under the loader's cache directory.
    loader = CNNDailyMailLoader()
    loader.load_dataset()

    eval_data = loader.create_evaluation_subset(size=200)
    categorized = loader.categorize_by_topic(eval_data)

    loader.save_evaluation_data(eval_data, "cnn_dailymail_eval_200.json")

    # Persist only the non-empty topic buckets.
    for topic, articles in categorized.items():
        if articles:
            loader.save_evaluation_data(articles, f"cnn_dailymail_{topic}.json")
evaluation/model_evaluator.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Model Evaluator for Summarization Models
|
| 3 |
+
Evaluates individual models and compares their performance
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import sys
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
project_root = Path(__file__).parent.parent
|
| 9 |
+
if str(project_root) not in sys.path:
|
| 10 |
+
sys.path.insert(0, str(project_root))
|
| 11 |
+
|
| 12 |
+
import time
|
| 13 |
+
import json
|
| 14 |
+
import pandas as pd
|
| 15 |
+
from typing import Dict, List, Any
|
| 16 |
+
import logging
|
| 17 |
+
from rouge_score import rouge_scorer
|
| 18 |
+
from models.textrank import TextRankSummarizer
|
| 19 |
+
from models.bart import BARTSummarizer
|
| 20 |
+
from models.pegasus import PEGASUSSummarizer
|
| 21 |
+
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
class ModelEvaluator:
    """Evaluate summarization models on the CNN/DailyMail dataset.

    Models are initialized independently so that one model failing to
    load (e.g. missing weights) does not prevent evaluating the others.
    ROUGE F-measures and wall-clock processing time are collected per
    sample and aggregated per model.
    """

    def __init__(self):
        self.models = {}  # model name -> summarizer instance
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.results = {}

    @staticmethod
    def _mean(values: List[float]) -> float:
        """Arithmetic mean of *values*, or 0 for an empty list."""
        return sum(values) / len(values) if values else 0

    def initialize_models(self):
        """Initialize all summarization models.

        Each model is wrapped in its own try/except so a single failure
        is logged but does not block the remaining models.
        """
        logger.info("Initializing models...")

        try:
            self.models['textrank'] = TextRankSummarizer()
            logger.info("TextRank model initialized")
        except Exception as e:
            logger.error(f"Failed to initialize TextRank: {e}")

        try:
            self.models['bart'] = BARTSummarizer(device='cpu')
            logger.info("BART model initialized")
        except Exception as e:
            logger.error(f"Failed to initialize BART: {e}")

        try:
            self.models['pegasus'] = PEGASUSSummarizer(device='cpu')
            logger.info("PEGASUS model initialized")
        except Exception as e:
            logger.error(f"Failed to initialize PEGASUS: {e}")

    def evaluate_single_model(self, model_name: str, data: List[Dict], max_samples: int = None) -> Dict:
        """Evaluate a single model on the dataset.

        Args:
            model_name: Key into ``self.models`` ('textrank', 'bart', 'pegasus').
            data: List of dicts with 'article' and 'highlights' keys.
            max_samples: Optional cap on the number of samples evaluated.

        Returns:
            Dict with per-sample ROUGE scores/timings/summaries and the
            aggregate 'avg_rouge1/2/L' and 'avg_processing_time' fields.

        Raises:
            ValueError: If *model_name* has not been initialized.
        """
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not initialized")

        model = self.models[model_name]
        results = {
            'model': model_name,
            'total_samples': len(data),
            'processed_samples': 0,
            'rouge_scores': {'rouge1': [], 'rouge2': [], 'rougeL': []},
            'processing_times': [],
            'summaries': [],
            'errors': 0
        }

        if max_samples:
            data = data[:max_samples]

        logger.info(f"Evaluating {model_name} on {len(data)} samples")

        for i, item in enumerate(data):
            try:
                start_time = time.time()

                if model_name == 'textrank':
                    # Extractive: keep ~30% of the article's sentences,
                    # with sentence count approximated by terminal
                    # punctuation marks.
                    sentences = item['article'].count('.') + item['article'].count('!') + item['article'].count('?')
                    num_sentences = max(2, int(sentences * 0.3))
                    summary = model.summarize(item['article'], num_sentences=num_sentences)
                else:
                    # Abstractive (BART/PEGASUS): target ~22% of the
                    # input word count, clamped to [30, 150].
                    input_words = len(item['article'].split())
                    target_length = max(30, min(150, int(input_words * 0.22)))
                    summary = model.summarize(
                        item['article'],
                        max_length=target_length,
                        min_length=max(20, int(target_length * 0.5))
                    )

                processing_time = time.time() - start_time

                # Score the generated summary against the human-written
                # highlights (reference first, candidate second).
                rouge_scores = self.rouge_scorer.score(item['highlights'], summary)

                results['rouge_scores']['rouge1'].append(rouge_scores['rouge1'].fmeasure)
                results['rouge_scores']['rouge2'].append(rouge_scores['rouge2'].fmeasure)
                results['rouge_scores']['rougeL'].append(rouge_scores['rougeL'].fmeasure)
                results['processing_times'].append(processing_time)
                results['summaries'].append({
                    'id': item.get('id', i),
                    'original': item['article'][:200] + '...',
                    'reference': item['highlights'],
                    'generated': summary,
                    'rouge1': rouge_scores['rouge1'].fmeasure,
                    'rouge2': rouge_scores['rouge2'].fmeasure,
                    'rougeL': rouge_scores['rougeL'].fmeasure,
                    'processing_time': processing_time
                })

                results['processed_samples'] += 1

                if (i + 1) % 10 == 0:
                    logger.info(f"{model_name}: Processed {i + 1}/{len(data)} samples")

            except Exception as e:
                # Per-sample failures are tallied, not fatal, so one bad
                # article cannot sink a long evaluation run.
                logger.error(f"Error processing sample {i} with {model_name}: {e}")
                results['errors'] += 1

        # Aggregate averages (0 when nothing was processed).
        results['avg_rouge1'] = self._mean(results['rouge_scores']['rouge1'])
        results['avg_rouge2'] = self._mean(results['rouge_scores']['rouge2'])
        results['avg_rougeL'] = self._mean(results['rouge_scores']['rougeL'])
        results['avg_processing_time'] = self._mean(results['processing_times'])

        logger.info(f"{model_name} evaluation complete:")
        logger.info(f"  ROUGE-1: {results['avg_rouge1']:.4f}")
        logger.info(f"  ROUGE-2: {results['avg_rouge2']:.4f}")
        logger.info(f"  ROUGE-L: {results['avg_rougeL']:.4f}")
        logger.info(f"  Avg Time: {results['avg_processing_time']:.4f}s")

        return results

    def evaluate_all_models(self, data: List[Dict], max_samples: int = None) -> Dict:
        """Evaluate every initialized model on the same dataset."""
        if not self.models:
            self.initialize_models()

        all_results = {}
        for model_name in self.models.keys():
            logger.info(f"Starting evaluation for {model_name}")
            all_results[model_name] = self.evaluate_single_model(model_name, data, max_samples)

        return all_results

    def compare_models(self, results: Dict) -> pd.DataFrame:
        """Create a comparison table of model performance.

        ROUGE/time columns are formatted as 4-decimal strings for display.
        """
        comparison_data = [
            {
                'Model': model_name.upper(),
                'ROUGE-1': f"{result['avg_rouge1']:.4f}",
                'ROUGE-2': f"{result['avg_rouge2']:.4f}",
                'ROUGE-L': f"{result['avg_rougeL']:.4f}",
                'Avg Time (s)': f"{result['avg_processing_time']:.4f}",
                'Samples': result['processed_samples'],
                'Errors': result['errors']
            }
            for model_name, result in results.items()
        ]

        return pd.DataFrame(comparison_data)

    def save_results(self, results: Dict, filename: str):
        """Save evaluation results to a JSON file.

        Aggregate metrics are cast to float for JSON serialization, and
        only the first 10 per-sample summaries are kept per model to
        bound the file size.
        """
        serializable_results = {}
        for model_name, result in results.items():
            serializable_results[model_name] = {
                'model': result['model'],
                'total_samples': result['total_samples'],
                'processed_samples': result['processed_samples'],
                'errors': result['errors'],
                'avg_rouge1': float(result['avg_rouge1']),
                'avg_rouge2': float(result['avg_rouge2']),
                'avg_rougeL': float(result['avg_rougeL']),
                'avg_processing_time': float(result['avg_processing_time']),
                'summaries': result['summaries'][:10]  # cap for file size
            }

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(serializable_results, f, indent=2, ensure_ascii=False)

        # Bug fix: previously logged the literal "(unknown)" instead of
        # the destination path.
        logger.info(f"Results saved to {filename}")

    def evaluate_by_topic(self, categorized_data: Dict[str, List[Dict]], max_samples_per_topic: int = 20) -> Dict:
        """Evaluate all models on each non-empty topic category."""
        topic_results = {}

        for topic, data in categorized_data.items():
            if not data:
                continue  # skip topics with no articles

            logger.info(f"Evaluating topic: {topic} ({len(data)} samples)")
            topic_results[topic] = self.evaluate_all_models(data, max_samples_per_topic)

        return topic_results
| 204 |
+
|
| 205 |
+
if __name__ == "__main__":
    from evaluation.dataset_loader import CNNDailyMailLoader

    # Demo: score all available models on a small CNN/DailyMail subset,
    # print a comparison table, and persist the full results.
    loader = CNNDailyMailLoader()
    eval_data = loader.create_evaluation_subset(size=50)

    evaluator = ModelEvaluator()
    evaluator.initialize_models()
    results = evaluator.evaluate_all_models(eval_data, max_samples=20)

    print("\nModel Comparison:")
    print(evaluator.compare_models(results).to_string(index=False))

    evaluator.save_results(results, "evaluation_results.json")
|
evaluation/results_analyzer.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Results Analyzer
|
| 3 |
+
Analyzes and visualizes evaluation results
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import matplotlib.pyplot as plt
|
| 9 |
+
import seaborn as sns
|
| 10 |
+
from typing import Dict, List
|
| 11 |
+
import os
|
| 12 |
+
import logging
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
class ResultsAnalyzer:
|
| 17 |
+
"""Analyze and visualize evaluation results"""
|
| 18 |
+
|
| 19 |
+
    def __init__(self):
        # Reset to matplotlib's default style as a clean baseline, then
        # apply seaborn's "husl" palette so all charts produced by this
        # analyzer share one color scheme.
        plt.style.use('default')
        sns.set_palette("husl")
| 22 |
+
|
| 23 |
+
def load_results(self, filepath: str) -> Dict:
|
| 24 |
+
"""Load results from JSON file"""
|
| 25 |
+
with open(filepath, 'r', encoding='utf-8') as f:
|
| 26 |
+
return json.load(f)
|
| 27 |
+
|
| 28 |
+
def create_performance_charts(self, results: Dict, output_dir: str):
|
| 29 |
+
"""Create performance comparison charts"""
|
| 30 |
+
# Prepare data for plotting
|
| 31 |
+
models = list(results.keys())
|
| 32 |
+
rouge1_scores = [results[model]['avg_rouge1'] for model in models]
|
| 33 |
+
rouge2_scores = [results[model]['avg_rouge2'] for model in models]
|
| 34 |
+
rougeL_scores = [results[model]['avg_rougeL'] for model in models]
|
| 35 |
+
processing_times = [results[model]['avg_processing_time'] for model in models]
|
| 36 |
+
|
| 37 |
+
# Create subplots
|
| 38 |
+
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
|
| 39 |
+
|
| 40 |
+
# ROUGE scores comparison
|
| 41 |
+
x_pos = range(len(models))
|
| 42 |
+
width = 0.25
|
| 43 |
+
|
| 44 |
+
ax1.bar([x - width for x in x_pos], rouge1_scores, width, label='ROUGE-1', alpha=0.8)
|
| 45 |
+
ax1.bar(x_pos, rouge2_scores, width, label='ROUGE-2', alpha=0.8)
|
| 46 |
+
ax1.bar([x + width for x in x_pos], rougeL_scores, width, label='ROUGE-L', alpha=0.8)
|
| 47 |
+
ax1.set_xlabel('Models')
|
| 48 |
+
ax1.set_ylabel('ROUGE Score')
|
| 49 |
+
ax1.set_title('ROUGE Scores Comparison')
|
| 50 |
+
ax1.set_xticks(x_pos)
|
| 51 |
+
ax1.set_xticklabels([m.upper() for m in models])
|
| 52 |
+
ax1.legend()
|
| 53 |
+
ax1.grid(True, alpha=0.3)
|
| 54 |
+
|
| 55 |
+
# Processing time comparison
|
| 56 |
+
ax2.bar(models, processing_times, alpha=0.8, color='orange')
|
| 57 |
+
ax2.set_xlabel('Models')
|
| 58 |
+
ax2.set_ylabel('Processing Time (seconds)')
|
| 59 |
+
ax2.set_title('Average Processing Time')
|
| 60 |
+
ax2.set_xticklabels([m.upper() for m in models])
|
| 61 |
+
ax2.grid(True, alpha=0.3)
|
| 62 |
+
|
| 63 |
+
# ROUGE-1 vs ROUGE-2 scatter
|
| 64 |
+
ax3.scatter(rouge1_scores, rouge2_scores, s=100, alpha=0.7)
|
| 65 |
+
for i, model in enumerate(models):
|
| 66 |
+
ax3.annotate(model.upper(), (rouge1_scores[i], rouge2_scores[i]),
|
| 67 |
+
xytext=(5, 5), textcoords='offset points')
|
| 68 |
+
ax3.set_xlabel('ROUGE-1')
|
| 69 |
+
ax3.set_ylabel('ROUGE-2')
|
| 70 |
+
ax3.set_title('ROUGE-1 vs ROUGE-2')
|
| 71 |
+
ax3.grid(True, alpha=0.3)
|
| 72 |
+
|
| 73 |
+
# Performance vs Speed
|
| 74 |
+
ax4.scatter(processing_times, rouge1_scores, s=100, alpha=0.7, color='green')
|
| 75 |
+
for i, model in enumerate(models):
|
| 76 |
+
ax4.annotate(model.upper(), (processing_times[i], rouge1_scores[i]),
|
| 77 |
+
xytext=(5, 5), textcoords='offset points')
|
| 78 |
+
ax4.set_xlabel('Processing Time (seconds)')
|
| 79 |
+
ax4.set_ylabel('ROUGE-1 Score')
|
| 80 |
+
ax4.set_title('Performance vs Speed Trade-off')
|
| 81 |
+
ax4.grid(True, alpha=0.3)
|
| 82 |
+
|
| 83 |
+
plt.tight_layout()
|
| 84 |
+
plt.savefig(f"{output_dir}/performance_comparison.png", dpi=300, bbox_inches='tight')
|
| 85 |
+
plt.close()
|
| 86 |
+
|
| 87 |
+
logger.info(f"Performance charts saved to {output_dir}/performance_comparison.png")
|
| 88 |
+
|
| 89 |
+
def analyze_topic_performance(self, topic_results: Dict, output_dir: str):
|
| 90 |
+
"""Analyze performance across different topics"""
|
| 91 |
+
# Prepare data
|
| 92 |
+
topics = list(topic_results.keys())
|
| 93 |
+
models = list(topic_results[topics[0]].keys()) if topics else []
|
| 94 |
+
|
| 95 |
+
# Create topic performance matrix
|
| 96 |
+
rouge1_matrix = []
|
| 97 |
+
rouge2_matrix = []
|
| 98 |
+
rougeL_matrix = []
|
| 99 |
+
|
| 100 |
+
for topic in topics:
|
| 101 |
+
rouge1_row = [topic_results[topic][model]['avg_rouge1'] for model in models]
|
| 102 |
+
rouge2_row = [topic_results[topic][model]['avg_rouge2'] for model in models]
|
| 103 |
+
rougeL_row = [topic_results[topic][model]['avg_rougeL'] for model in models]
|
| 104 |
+
|
| 105 |
+
rouge1_matrix.append(rouge1_row)
|
| 106 |
+
rouge2_matrix.append(rouge2_row)
|
| 107 |
+
rougeL_matrix.append(rougeL_row)
|
| 108 |
+
|
| 109 |
+
# Create heatmaps
|
| 110 |
+
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 6))
|
| 111 |
+
|
| 112 |
+
# ROUGE-1 heatmap
|
| 113 |
+
sns.heatmap(rouge1_matrix, annot=True, fmt='.3f',
|
| 114 |
+
xticklabels=[m.upper() for m in models],
|
| 115 |
+
yticklabels=[t.upper() for t in topics],
|
| 116 |
+
ax=ax1, cmap='YlOrRd')
|
| 117 |
+
ax1.set_title('ROUGE-1 Scores by Topic')
|
| 118 |
+
|
| 119 |
+
# ROUGE-2 heatmap
|
| 120 |
+
sns.heatmap(rouge2_matrix, annot=True, fmt='.3f',
|
| 121 |
+
xticklabels=[m.upper() for m in models],
|
| 122 |
+
yticklabels=[t.upper() for t in topics],
|
| 123 |
+
ax=ax2, cmap='YlOrRd')
|
| 124 |
+
ax2.set_title('ROUGE-2 Scores by Topic')
|
| 125 |
+
|
| 126 |
+
# ROUGE-L heatmap
|
| 127 |
+
sns.heatmap(rougeL_matrix, annot=True, fmt='.3f',
|
| 128 |
+
xticklabels=[m.upper() for m in models],
|
| 129 |
+
yticklabels=[t.upper() for t in topics],
|
| 130 |
+
ax=ax3, cmap='YlOrRd')
|
| 131 |
+
ax3.set_title('ROUGE-L Scores by Topic')
|
| 132 |
+
|
| 133 |
+
plt.tight_layout()
|
| 134 |
+
plt.savefig(f"{output_dir}/topic_performance_heatmap.png", dpi=300, bbox_inches='tight')
|
| 135 |
+
plt.close()
|
| 136 |
+
|
| 137 |
+
# Create topic summary table
|
| 138 |
+
topic_summary = []
|
| 139 |
+
for topic in topics:
|
| 140 |
+
for model in models:
|
| 141 |
+
topic_summary.append({
|
| 142 |
+
'Topic': topic.upper(),
|
| 143 |
+
'Model': model.upper(),
|
| 144 |
+
'ROUGE-1': f"{topic_results[topic][model]['avg_rouge1']:.4f}",
|
| 145 |
+
'ROUGE-2': f"{topic_results[topic][model]['avg_rouge2']:.4f}",
|
| 146 |
+
'ROUGE-L': f"{topic_results[topic][model]['avg_rougeL']:.4f}",
|
| 147 |
+
'Samples': topic_results[topic][model]['processed_samples']
|
| 148 |
+
})
|
| 149 |
+
|
| 150 |
+
df = pd.DataFrame(topic_summary)
|
| 151 |
+
df.to_csv(f"{output_dir}/topic_summary.csv", index=False)
|
| 152 |
+
|
| 153 |
+
logger.info(f"Topic analysis saved to {output_dir}")
|
| 154 |
+
logger.info("\nTopic Performance Summary:")
|
| 155 |
+
logger.info(df.to_string(index=False))
|
| 156 |
+
|
| 157 |
+
def create_detailed_report(self, results: Dict, output_dir: str):
|
| 158 |
+
"""Create detailed evaluation report"""
|
| 159 |
+
report_lines = []
|
| 160 |
+
report_lines.append("# Summarization Model Evaluation Report")
|
| 161 |
+
report_lines.append("")
|
| 162 |
+
|
| 163 |
+
# Overall statistics
|
| 164 |
+
report_lines.append("## Overall Performance")
|
| 165 |
+
report_lines.append("")
|
| 166 |
+
|
| 167 |
+
for model_name, result in results.items():
|
| 168 |
+
report_lines.append(f"### {model_name.upper()}")
|
| 169 |
+
report_lines.append(f"- Samples Processed: {result['processed_samples']}")
|
| 170 |
+
report_lines.append(f"- ROUGE-1: {result['avg_rouge1']:.4f}")
|
| 171 |
+
report_lines.append(f"- ROUGE-2: {result['avg_rouge2']:.4f}")
|
| 172 |
+
report_lines.append(f"- ROUGE-L: {result['avg_rougeL']:.4f}")
|
| 173 |
+
report_lines.append(f"- Average Processing Time: {result['avg_processing_time']:.4f}s")
|
| 174 |
+
report_lines.append(f"- Errors: {result['errors']}")
|
| 175 |
+
report_lines.append("")
|
| 176 |
+
|
| 177 |
+
# Best performing model
|
| 178 |
+
best_rouge1 = max(results.items(), key=lambda x: x[1]['avg_rouge1'])
|
| 179 |
+
best_rouge2 = max(results.items(), key=lambda x: x[1]['avg_rouge2'])
|
| 180 |
+
fastest = min(results.items(), key=lambda x: x[1]['avg_processing_time'])
|
| 181 |
+
|
| 182 |
+
report_lines.append("## Summary")
|
| 183 |
+
report_lines.append(f"- Best ROUGE-1: {best_rouge1[0].upper()} ({best_rouge1[1]['avg_rouge1']:.4f})")
|
| 184 |
+
report_lines.append(f"- Best ROUGE-2: {best_rouge2[0].upper()} ({best_rouge2[1]['avg_rouge2']:.4f})")
|
| 185 |
+
report_lines.append(f"- Fastest: {fastest[0].upper()} ({fastest[1]['avg_processing_time']:.4f}s)")
|
| 186 |
+
report_lines.append("")
|
| 187 |
+
|
| 188 |
+
# Save report
|
| 189 |
+
with open(f"{output_dir}/evaluation_report.md", 'w', encoding='utf-8') as f:
|
| 190 |
+
f.write('\n'.join(report_lines))
|
| 191 |
+
|
| 192 |
+
logger.info(f"Detailed report saved to {output_dir}/evaluation_report.md")
|
| 193 |
+
|
| 194 |
+
if __name__ == "__main__":
|
| 195 |
+
# Example usage
|
| 196 |
+
analyzer = ResultsAnalyzer()
|
| 197 |
+
|
| 198 |
+
# Load and analyze results
|
| 199 |
+
if os.path.exists("evaluation_results.json"):
|
| 200 |
+
results = analyzer.load_results("evaluation_results.json")
|
| 201 |
+
analyzer.create_performance_charts(results, ".")
|
| 202 |
+
analyzer.create_detailed_report(results, ".")
|
evaluation/run_evaluation.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Main Evaluation Script
|
| 3 |
+
Runs comprehensive evaluation of all models on CNN/DailyMail dataset
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import logging
|
| 9 |
+
import argparse
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
# Add project root to path
|
| 13 |
+
project_root = Path(__file__).parent.parent
|
| 14 |
+
sys.path.insert(0, str(project_root))
|
| 15 |
+
|
| 16 |
+
from evaluation.dataset_loader import CNNDailyMailLoader
|
| 17 |
+
from evaluation.model_evaluator import ModelEvaluator
|
| 18 |
+
from evaluation.results_analyzer import ResultsAnalyzer
|
| 19 |
+
|
| 20 |
+
# Setup logging
|
| 21 |
+
logging.basicConfig(
|
| 22 |
+
level=logging.INFO,
|
| 23 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
| 24 |
+
handlers=[
|
| 25 |
+
logging.FileHandler('evaluation.log'),
|
| 26 |
+
logging.StreamHandler()
|
| 27 |
+
]
|
| 28 |
+
)
|
| 29 |
+
logger = logging.getLogger(__name__)
|
| 30 |
+
|
| 31 |
+
def main():
|
| 32 |
+
parser = argparse.ArgumentParser(description='Evaluate summarization models')
|
| 33 |
+
parser.add_argument('--samples', type=int, default=100, help='Number of samples to evaluate')
|
| 34 |
+
parser.add_argument('--by-topic', action='store_true', help='Evaluate by topic categories')
|
| 35 |
+
parser.add_argument('--output-dir', type=str, default='evaluation/results', help='Output directory')
|
| 36 |
+
parser.add_argument('--models', nargs='+', default=['textrank', 'bart', 'pegasus'],
|
| 37 |
+
help='Models to evaluate')
|
| 38 |
+
|
| 39 |
+
args = parser.parse_args()
|
| 40 |
+
|
| 41 |
+
# Create output directory
|
| 42 |
+
os.makedirs(args.output_dir, exist_ok=True)
|
| 43 |
+
|
| 44 |
+
logger.info("Starting comprehensive model evaluation")
|
| 45 |
+
logger.info(f"Samples: {args.samples}")
|
| 46 |
+
logger.info(f"Models: {args.models}")
|
| 47 |
+
logger.info(f"By topic: {args.by_topic}")
|
| 48 |
+
|
| 49 |
+
# Initialize components
|
| 50 |
+
loader = CNNDailyMailLoader()
|
| 51 |
+
evaluator = ModelEvaluator()
|
| 52 |
+
analyzer = ResultsAnalyzer()
|
| 53 |
+
|
| 54 |
+
try:
|
| 55 |
+
# Load dataset
|
| 56 |
+
logger.info("Loading CNN/DailyMail dataset...")
|
| 57 |
+
dataset = loader.load_dataset()
|
| 58 |
+
|
| 59 |
+
# Create evaluation subset
|
| 60 |
+
logger.info(f"Creating evaluation subset of {args.samples} samples...")
|
| 61 |
+
eval_data = loader.create_evaluation_subset(size=args.samples)
|
| 62 |
+
|
| 63 |
+
# Save evaluation data
|
| 64 |
+
loader.save_evaluation_data(eval_data, f"eval_data_{args.samples}.json")
|
| 65 |
+
|
| 66 |
+
# Initialize models
|
| 67 |
+
logger.info("Initializing models...")
|
| 68 |
+
evaluator.initialize_models()
|
| 69 |
+
|
| 70 |
+
if args.by_topic:
|
| 71 |
+
# Evaluate by topic
|
| 72 |
+
logger.info("Categorizing data by topics...")
|
| 73 |
+
categorized_data = loader.categorize_by_topic(eval_data)
|
| 74 |
+
|
| 75 |
+
# Save categorized data
|
| 76 |
+
for topic, data in categorized_data.items():
|
| 77 |
+
if data:
|
| 78 |
+
loader.save_evaluation_data(data, f"eval_data_{topic}.json")
|
| 79 |
+
|
| 80 |
+
# Run topic-based evaluation
|
| 81 |
+
logger.info("Running topic-based evaluation...")
|
| 82 |
+
topic_results = evaluator.evaluate_by_topic(categorized_data, max_samples_per_topic=20)
|
| 83 |
+
|
| 84 |
+
# Save topic results
|
| 85 |
+
for topic, results in topic_results.items():
|
| 86 |
+
evaluator.save_results(results, f"{args.output_dir}/results_{topic}.json")
|
| 87 |
+
|
| 88 |
+
# Create topic comparison
|
| 89 |
+
comparison_df = evaluator.compare_models(results)
|
| 90 |
+
comparison_df.to_csv(f"{args.output_dir}/comparison_{topic}.csv", index=False)
|
| 91 |
+
|
| 92 |
+
logger.info(f"\n{topic.upper()} Topic Results:")
|
| 93 |
+
logger.info(comparison_df.to_string(index=False))
|
| 94 |
+
|
| 95 |
+
# Analyze topic results
|
| 96 |
+
analyzer.analyze_topic_performance(topic_results, args.output_dir)
|
| 97 |
+
|
| 98 |
+
else:
|
| 99 |
+
# Standard evaluation
|
| 100 |
+
logger.info("Running standard evaluation...")
|
| 101 |
+
results = evaluator.evaluate_all_models(eval_data, max_samples=args.samples)
|
| 102 |
+
|
| 103 |
+
# Save results
|
| 104 |
+
evaluator.save_results(results, f"{args.output_dir}/results_overall.json")
|
| 105 |
+
|
| 106 |
+
# Create comparison
|
| 107 |
+
comparison_df = evaluator.compare_models(results)
|
| 108 |
+
comparison_df.to_csv(f"{args.output_dir}/comparison_overall.csv", index=False)
|
| 109 |
+
|
| 110 |
+
logger.info("\nOverall Results:")
|
| 111 |
+
logger.info(comparison_df.to_string(index=False))
|
| 112 |
+
|
| 113 |
+
# Analyze results
|
| 114 |
+
analyzer.create_performance_charts(results, args.output_dir)
|
| 115 |
+
|
| 116 |
+
logger.info(f"Evaluation complete. Results saved to {args.output_dir}")
|
| 117 |
+
|
| 118 |
+
except Exception as e:
|
| 119 |
+
logger.error(f"Evaluation failed: {e}")
|
| 120 |
+
raise
|
| 121 |
+
|
| 122 |
+
if __name__ == "__main__":
|
| 123 |
+
main()
|
notebooks/.ipynb_checkpoints/01_data_exploration-checkpoint.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebooks/.ipynb_checkpoints/02_model_testing-checkpoint.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebooks/.ipynb_checkpoints/03_evaluation_analysis-checkpoint.ipynb
ADDED
|
@@ -0,0 +1,478 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 3,
|
| 6 |
+
"id": "0c688166",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [
|
| 9 |
+
{
|
| 10 |
+
"name": "stdout",
|
| 11 |
+
"output_type": "stream",
|
| 12 |
+
"text": [
|
| 13 |
+
"⚠ rouge library not found. Installing rouge-score...\n",
|
| 14 |
+
"✓ Successfully installed rouge-score\n",
|
| 15 |
+
"✗ Installation succeeded but import still fails.\n",
|
| 16 |
+
" Please restart the kernel and run this cell again.\n"
|
| 17 |
+
]
|
| 18 |
+
}
|
| 19 |
+
],
|
| 20 |
+
"source": [
|
| 21 |
+
"# FIX: Install and verify rouge-score package\n",
|
| 22 |
+
"# Run this cell FIRST if you get \"ModuleNotFoundError: No module named 'rouge'\"\n",
|
| 23 |
+
"\n",
|
| 24 |
+
"import sys\n",
|
| 25 |
+
"import subprocess\n",
|
| 26 |
+
"\n",
|
| 27 |
+
"def install_package(package_name):\n",
|
| 28 |
+
" \"\"\"Install package using pip\"\"\"\n",
|
| 29 |
+
" try:\n",
|
| 30 |
+
" subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", package_name, \"--quiet\"])\n",
|
| 31 |
+
" return True\n",
|
| 32 |
+
" except subprocess.CalledProcessError:\n",
|
| 33 |
+
" return False\n",
|
| 34 |
+
"\n",
|
| 35 |
+
"# Check if rouge is available\n",
|
| 36 |
+
"try:\n",
|
| 37 |
+
" from rouge import Rouge\n",
|
| 38 |
+
" print(\"✓ rouge library is already installed\")\n",
|
| 39 |
+
"except ImportError:\n",
|
| 40 |
+
" print(\"⚠ rouge library not found. Installing rouge-score...\")\n",
|
| 41 |
+
" if install_package(\"rouge-score\"):\n",
|
| 42 |
+
" print(\"✓ Successfully installed rouge-score\")\n",
|
| 43 |
+
" # Try importing again\n",
|
| 44 |
+
" try:\n",
|
| 45 |
+
" from rouge import Rouge\n",
|
| 46 |
+
" print(\"✓ rouge library now available\")\n",
|
| 47 |
+
" except ImportError:\n",
|
| 48 |
+
" print(\"✗ Installation succeeded but import still fails.\")\n",
|
| 49 |
+
" print(\" Please restart the kernel and run this cell again.\")\n",
|
| 50 |
+
" else:\n",
|
| 51 |
+
" print(\"✗ Failed to install rouge-score\")\n",
|
| 52 |
+
" print(\" Please run manually: pip install rouge-score\")\n",
|
| 53 |
+
" print(\" Then restart the kernel.\")\n"
|
| 54 |
+
]
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"cell_type": "code",
|
| 58 |
+
"execution_count": 1,
|
| 59 |
+
"id": "1aa43993",
|
| 60 |
+
"metadata": {},
|
| 61 |
+
"outputs": [
|
| 62 |
+
{
|
| 63 |
+
"name": "stdout",
|
| 64 |
+
"output_type": "stream",
|
| 65 |
+
"text": [
|
| 66 |
+
"✗ Import error: No module named 'rouge'\n",
|
| 67 |
+
" Make sure you've run the previous cell to install dependencies\n"
|
| 68 |
+
]
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"ename": "ModuleNotFoundError",
|
| 72 |
+
"evalue": "No module named 'rouge'",
|
| 73 |
+
"output_type": "error",
|
| 74 |
+
"traceback": [
|
| 75 |
+
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
| 76 |
+
"\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
|
| 77 |
+
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 10\u001b[39m\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mmodels\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mbart\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m BARTSummarizer\n\u001b[32m 9\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mmodels\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mpegasus\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m PEGASUSSummarizer\n\u001b[32m---> \u001b[39m\u001b[32m10\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mutils\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mevaluator\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m SummarizerEvaluator\n\u001b[32m 11\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mutils\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdata_loader\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m DataLoader\n\u001b[32m 12\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m✓ All imports successful\u001b[39m\u001b[33m\"\u001b[39m)\n",
|
| 78 |
+
"\u001b[36mFile \u001b[39m\u001b[32m~/Downloads/smart-summarizer/notebooks/../utils/evaluator.py:6\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 2\u001b[39m \u001b[33;03mComprehensive Evaluation System for Summarization Models\u001b[39;00m\n\u001b[32m 3\u001b[39m \u001b[33;03mImplements ROUGE metrics, comparison analysis, and statistical testing\u001b[39;00m\n\u001b[32m 4\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mrouge\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Rouge\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnp\u001b[39;00m\n\u001b[32m 8\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtyping\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Dict, List, Tuple, Optional\n",
|
| 79 |
+
"\u001b[31mModuleNotFoundError\u001b[39m: No module named 'rouge'"
|
| 80 |
+
]
|
| 81 |
+
}
|
| 82 |
+
],
|
| 83 |
+
"source": [
|
| 84 |
+
"# Add project root to path\n",
|
| 85 |
+
"import sys\n",
|
| 86 |
+
"sys.path.append('..')\n",
|
| 87 |
+
"\n",
|
| 88 |
+
"# Import models and utilities\n",
|
| 89 |
+
"try:\n",
|
| 90 |
+
" from models.textrank import TextRankSummarizer\n",
|
| 91 |
+
" from models.bart import BARTSummarizer\n",
|
| 92 |
+
" from models.pegasus import PEGASUSSummarizer\n",
|
| 93 |
+
" from utils.evaluator import SummarizerEvaluator\n",
|
| 94 |
+
" from utils.data_loader import DataLoader\n",
|
| 95 |
+
" print(\"✓ All imports successful\")\n",
|
| 96 |
+
"except ImportError as e:\n",
|
| 97 |
+
" print(f\"✗ Import error: {e}\")\n",
|
| 98 |
+
" print(\" Make sure you've run the previous cell to install dependencies\")\n",
|
| 99 |
+
" raise\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"# Import standard libraries\n",
|
| 102 |
+
"import pandas as pd\n",
|
| 103 |
+
"import numpy as np\n",
|
| 104 |
+
"import matplotlib.pyplot as plt\n",
|
| 105 |
+
"import seaborn as sns\n",
|
| 106 |
+
"from scipy import stats\n",
|
| 107 |
+
"import json\n",
|
| 108 |
+
"\n",
|
| 109 |
+
"plt.style.use('seaborn-v0_8')"
|
| 110 |
+
]
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"cell_type": "code",
|
| 114 |
+
"execution_count": null,
|
| 115 |
+
"id": "e28695c0",
|
| 116 |
+
"metadata": {},
|
| 117 |
+
"outputs": [],
|
| 118 |
+
"source": [
|
| 119 |
+
"print(\"Loading test dataset...\")\n",
|
| 120 |
+
"loader = DataLoader()\n",
|
| 121 |
+
"\n",
|
| 122 |
+
"# Load your saved samples (or load fresh)\n",
|
| 123 |
+
"try:\n",
|
| 124 |
+
" test_data = loader.load_samples('../data/samples/test_50.json')\n",
|
| 125 |
+
" print(f\"✓ Loaded {len(test_data)} test samples\")\n",
|
| 126 |
+
"except:\n",
|
| 127 |
+
" print(\"Downloading test data...\")\n",
|
| 128 |
+
" test_data = loader.load_cnn_dailymail(split='test', num_samples=50)\n",
|
| 129 |
+
" loader.save_samples(test_data, '../data/samples/test_50.json')\n",
|
| 130 |
+
" print(f\"✓ Downloaded and saved {len(test_data)} samples\")\n",
|
| 131 |
+
"\n",
|
| 132 |
+
"# Extract texts and references\n",
|
| 133 |
+
"texts = [item['article'] for item in test_data]\n",
|
| 134 |
+
"references = [item['reference_summary'] for item in test_data]\n",
|
| 135 |
+
"\n",
|
| 136 |
+
"print(f\"\\nDataset Statistics:\")\n",
|
| 137 |
+
"print(f\" - Number of samples: {len(texts)}\")\n",
|
| 138 |
+
"print(f\" - Avg article length: {np.mean([len(t.split()) for t in texts]):.0f} words\")\n",
|
| 139 |
+
"print(f\" - Avg reference length: {np.mean([len(r.split()) for r in references]):.0f}words\")"
|
| 140 |
+
]
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"cell_type": "code",
|
| 144 |
+
"execution_count": null,
|
| 145 |
+
"id": "3b7dc004",
|
| 146 |
+
"metadata": {},
|
| 147 |
+
"outputs": [],
|
| 148 |
+
"source": [
|
| 149 |
+
"print(\"\\nInitializing models...\")\n",
|
| 150 |
+
"\n",
|
| 151 |
+
"models = {\n",
|
| 152 |
+
" 'TextRank': TextRankSummarizer(),\n",
|
| 153 |
+
" 'BART': BARTSummarizer(device='cpu'),\n",
|
| 154 |
+
" 'PEGASUS': PEGASUSSummarizer(device='cpu')\n",
|
| 155 |
+
"}\n",
|
| 156 |
+
"\n",
|
| 157 |
+
"print(\"✓ All models ready\")\n",
|
| 158 |
+
"\n",
|
| 159 |
+
"# Cell 4: Generate Summaries (Takes ~10-20 minutes for 50 samples)\n",
|
| 160 |
+
"print(\"\\nGenerating summaries for all models...\")\n",
|
| 161 |
+
"print(\"This will take 10-20 minutes. Grab a coffee! ☕\")\n",
|
| 162 |
+
"\n",
|
| 163 |
+
"all_summaries = {}\n",
|
| 164 |
+
"all_times = {}\n",
|
| 165 |
+
"\n",
|
| 166 |
+
"for model_name, model in models.items():\n",
|
| 167 |
+
" print(f\"\\n{model_name}:\")\n",
|
| 168 |
+
" summaries = []\n",
|
| 169 |
+
" times = []\n",
|
| 170 |
+
" \n",
|
| 171 |
+
" for i, text in enumerate(texts[:10], 1): # Start with 10 samples\n",
|
| 172 |
+
" print(f\" Processing {i}/10...\", end='\\r')\n",
|
| 173 |
+
" \n",
|
| 174 |
+
" if model_name == 'TextRank':\n",
|
| 175 |
+
" result = model.summarize_with_metrics(text)\n",
|
| 176 |
+
" else:\n",
|
| 177 |
+
" result = model.summarize_with_metrics(text, max_length=100, min_length=30)\n",
|
| 178 |
+
" \n",
|
| 179 |
+
" summaries.append(result['summary'])\n",
|
| 180 |
+
" times.append(result['metadata']['processing_time'])\n",
|
| 181 |
+
" \n",
|
| 182 |
+
" all_summaries[model_name] = summaries\n",
|
| 183 |
+
" all_times[model_name] = times\n",
|
| 184 |
+
" print(f\" ✓ Completed {model_name} \")\n",
|
| 185 |
+
"\n",
|
| 186 |
+
"print(\"\\n✓ All summaries generated!\")"
|
| 187 |
+
]
|
| 188 |
+
},
|
| 189 |
+
{
|
| 190 |
+
"cell_type": "code",
|
| 191 |
+
"execution_count": null,
|
| 192 |
+
"id": "bf78630d",
|
| 193 |
+
"metadata": {},
|
| 194 |
+
"outputs": [],
|
| 195 |
+
"source": [
|
| 196 |
+
"print(\"\\nEvaluating models...\")\n",
|
| 197 |
+
"\n",
|
| 198 |
+
"evaluator = SummarizerEvaluator()\n",
|
| 199 |
+
"evaluation_results = {}\n",
|
| 200 |
+
"\n",
|
| 201 |
+
"for model_name in models.keys():\n",
|
| 202 |
+
" print(f\"\\nEvaluating {model_name}...\")\n",
|
| 203 |
+
" results = evaluator.evaluate_batch(\n",
|
| 204 |
+
" all_summaries[model_name],\n",
|
| 205 |
+
" references[:len(all_summaries[model_name])],\n",
|
| 206 |
+
" model_name\n",
|
| 207 |
+
" )\n",
|
| 208 |
+
" results['avg_time'] = np.mean(all_times[model_name])\n",
|
| 209 |
+
" results['std_time'] = np.std(all_times[model_name])\n",
|
| 210 |
+
" evaluation_results[model_name] = results\n",
|
| 211 |
+
"\n",
|
| 212 |
+
"print(\"✓ Evaluation complete\")"
|
| 213 |
+
]
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"cell_type": "code",
|
| 217 |
+
"execution_count": null,
|
| 218 |
+
"id": "c7ebcf59",
|
| 219 |
+
"metadata": {},
|
| 220 |
+
"outputs": [],
|
| 221 |
+
"source": [
|
| 222 |
+
"print(\"\\n\" + \"=\"*70)\n",
|
| 223 |
+
"print(\"EVALUATION RESULTS\")\n",
|
| 224 |
+
"print(\"=\"*70)\n",
|
| 225 |
+
"\n",
|
| 226 |
+
"results_table = []\n",
|
| 227 |
+
"\n",
|
| 228 |
+
"for model_name, results in evaluation_results.items():\n",
|
| 229 |
+
" results_table.append({\n",
|
| 230 |
+
" 'Model': model_name,\n",
|
| 231 |
+
" 'Type': 'Extractive' if model_name == 'TextRank' else 'Abstractive',\n",
|
| 232 |
+
" 'ROUGE-1': f\"{results['rouge_1_f1_mean']:.4f} ± {results['rouge_1_f1_std']:.4f}\",\n",
|
| 233 |
+
" 'ROUGE-2': f\"{results['rouge_2_f1_mean']:.4f} ± {results['rouge_2_f1_std']:.4f}\",\n",
|
| 234 |
+
" 'ROUGE-L': f\"{results['rouge_l_f1_mean']:.4f} ± {results['rouge_l_f1_std']:.4f}\",\n",
|
| 235 |
+
" 'Avg Time (s)': f\"{results['avg_time']:.3f} ± {results['std_time']:.3f}\",\n",
|
| 236 |
+
" 'Samples': results['num_samples']\n",
|
| 237 |
+
" })\n",
|
| 238 |
+
"\n",
|
| 239 |
+
"results_df = pd.DataFrame(results_table)\n",
|
| 240 |
+
"print(results_df.to_string(index=False))\n",
|
| 241 |
+
"\n",
|
| 242 |
+
"# Save to CSV for report\n",
|
| 243 |
+
"results_df.to_csv('../results/evaluation_results.csv', index=False)\n",
|
| 244 |
+
"print(\"\\n✓ Results saved to results/evaluation_results.csv\")\n"
|
| 245 |
+
]
|
| 246 |
+
},
|
| 247 |
+
{
|
| 248 |
+
"cell_type": "code",
|
| 249 |
+
"execution_count": null,
|
| 250 |
+
"id": "a65fac0c",
|
| 251 |
+
"metadata": {},
|
| 252 |
+
"outputs": [],
|
| 253 |
+
"source": [
|
| 254 |
+
"print(\"\\n\" + \"=\"*70)\n",
|
| 255 |
+
"print(\"STATISTICAL SIGNIFICANCE TESTS\")\n",
|
| 256 |
+
"print(\"=\"*70)\n",
|
| 257 |
+
"\n",
|
| 258 |
+
"# Compare BART vs PEGASUS (both abstractive)\n",
|
| 259 |
+
"bart_rouge1 = [s['rouge_1_f1'] for s in evaluation_results['BART']['individual_scores']]\n",
|
| 260 |
+
"peg_rouge1 = [s['rouge_1_f1'] for s in evaluation_results['PEGASUS']['individual_scores']]\n",
|
| 261 |
+
"\n",
|
| 262 |
+
"sig_test = evaluator.statistical_significance_test(\n",
|
| 263 |
+
" bart_rouge1,\n",
|
| 264 |
+
" peg_rouge1,\n",
|
| 265 |
+
" test_name='paired t-test'\n",
|
| 266 |
+
")\n",
|
| 267 |
+
"\n",
|
| 268 |
+
"print(f\"\\nBART vs PEGASUS (ROUGE-1):\")\n",
|
| 269 |
+
"print(f\" Test: {sig_test['test_name']}\")\n",
|
| 270 |
+
"print(f\" p-value: {sig_test['p_value']:.6f}\")\n",
|
| 271 |
+
"print(f\" {sig_test['interpretation']}\")"
|
| 272 |
+
]
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"cell_type": "code",
|
| 276 |
+
"execution_count": null,
|
| 277 |
+
"id": "ae272f7a",
|
| 278 |
+
"metadata": {},
|
| 279 |
+
"outputs": [],
|
| 280 |
+
"source": [
|
| 281 |
+
"fig = plt.figure(figsize=(16, 12))\n",
|
| 282 |
+
"\n",
|
| 283 |
+
"# Create grid\n",
|
| 284 |
+
"gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)\n",
|
| 285 |
+
"\n",
|
| 286 |
+
"# 1. ROUGE Scores Comparison\n",
|
| 287 |
+
"ax1 = fig.add_subplot(gs[0, :2])\n",
|
| 288 |
+
"rouge_data = pd.DataFrame({\n",
|
| 289 |
+
" 'Model': list(evaluation_results.keys()) * 3,\n",
|
| 290 |
+
" 'Metric': ['ROUGE-1']*3 + ['ROUGE-2']*3 + ['ROUGE-L']*3,\n",
|
| 291 |
+
" 'Score': [\n",
|
| 292 |
+
" evaluation_results['TextRank']['rouge_1_f1_mean'],\n",
|
| 293 |
+
" evaluation_results['BART']['rouge_1_f1_mean'],\n",
|
| 294 |
+
" evaluation_results['PEGASUS']['rouge_1_f1_mean'],\n",
|
| 295 |
+
" evaluation_results['TextRank']['rouge_2_f1_mean'],\n",
|
| 296 |
+
" evaluation_results['BART']['rouge_2_f1_mean'],\n",
|
| 297 |
+
" evaluation_results['PEGASUS']['rouge_2_f1_mean'],\n",
|
| 298 |
+
" evaluation_results['TextRank']['rouge_l_f1_mean'],\n",
|
| 299 |
+
" evaluation_results['BART']['rouge_l_f1_mean'],\n",
|
| 300 |
+
" evaluation_results['PEGASUS']['rouge_l_f1_mean']\n",
|
| 301 |
+
" ]\n",
|
| 302 |
+
"})\n",
|
| 303 |
+
"\n",
|
| 304 |
+
"sns.barplot(data=rouge_data, x='Metric', y='Score', hue='Model', ax=ax1)\n",
|
| 305 |
+
"ax1.set_title('ROUGE Score Comparison', fontsize=14, fontweight='bold')\n",
|
| 306 |
+
"ax1.set_ylabel('F1 Score')\n",
|
| 307 |
+
"ax1.set_ylim([0, 0.5])\n",
|
| 308 |
+
"ax1.legend(title='Model')\n",
|
| 309 |
+
"ax1.grid(axis='y', alpha=0.3)\n",
|
| 310 |
+
"\n",
|
| 311 |
+
"# 2. Processing Time\n",
|
| 312 |
+
"ax2 = fig.add_subplot(gs[0, 2])\n",
|
| 313 |
+
"times = [evaluation_results[m]['avg_time'] for m in models.keys()]\n",
|
| 314 |
+
"colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']\n",
|
| 315 |
+
"ax2.bar(models.keys(), times, color=colors)\n",
|
| 316 |
+
"ax2.set_title('Processing Time', fontsize=12, fontweight='bold')\n",
|
| 317 |
+
"ax2.set_ylabel('Time (seconds)')\n",
|
| 318 |
+
"ax2.grid(axis='y', alpha=0.3)\n",
|
| 319 |
+
"\n",
|
| 320 |
+
"# 3. ROUGE-1 Distribution\n",
|
| 321 |
+
"ax3 = fig.add_subplot(gs[1, 0])\n",
|
| 322 |
+
"for model_name, color in zip(models.keys(), colors):\n",
|
| 323 |
+
" rouge1_scores = [s['rouge_1_f1'] for s in evaluation_results[model_name]['individual_scores']]\n",
|
| 324 |
+
" ax3.hist(rouge1_scores, alpha=0.6, label=model_name, bins=10, color=color)\n",
|
| 325 |
+
"ax3.set_title('ROUGE-1 Score Distribution', fontsize=12, fontweight='bold')\n",
|
| 326 |
+
"ax3.set_xlabel('ROUGE-1 F1 Score')\n",
|
| 327 |
+
"ax3.set_ylabel('Frequency')\n",
|
| 328 |
+
"ax3.legend()\n",
|
| 329 |
+
"ax3.grid(axis='y', alpha=0.3)\n",
|
| 330 |
+
"\n",
|
| 331 |
+
"# 4. ROUGE-2 Distribution\n",
|
| 332 |
+
"ax4 = fig.add_subplot(gs[1, 1])\n",
|
| 333 |
+
"for model_name, color in zip(models.keys(), colors):\n",
|
| 334 |
+
" rouge2_scores = [s['rouge_2_f1'] for s in evaluation_results[model_name]['individual_scores']]\n",
|
| 335 |
+
" ax4.hist(rouge2_scores, alpha=0.6, label=model_name, bins=10, color=color)\n",
|
| 336 |
+
"ax4.set_title('ROUGE-2 Score Distribution', fontsize=12, fontweight='bold')\n",
|
| 337 |
+
"ax4.set_xlabel('ROUGE-2 F1 Score')\n",
|
| 338 |
+
"ax4.set_ylabel('Frequency')\n",
|
| 339 |
+
"ax4.legend()\n",
|
| 340 |
+
"ax4.grid(axis='y', alpha=0.3)\n",
|
| 341 |
+
"\n",
|
| 342 |
+
"# 5. ROUGE-L Distribution\n",
|
| 343 |
+
"ax5 = fig.add_subplot(gs[1, 2])\n",
|
| 344 |
+
"for model_name, color in zip(models.keys(), colors):\n",
|
| 345 |
+
" rougel_scores = [s['rouge_l_f1'] for s in evaluation_results[model_name]['individual_scores']]\n",
|
| 346 |
+
" ax5.hist(rougel_scores, alpha=0.6, label=model_name, bins=10, color=color)\n",
|
| 347 |
+
"ax5.set_title('ROUGE-L Score Distribution', fontsize=12, fontweight='bold')\n",
|
| 348 |
+
"ax5.set_xlabel('ROUGE-L F1 Score')\n",
|
| 349 |
+
"ax5.set_ylabel('Frequency')\n",
|
| 350 |
+
"ax5.legend()\n",
|
| 351 |
+
"ax5.grid(axis='y', alpha=0.3)\n",
|
| 352 |
+
"\n",
|
| 353 |
+
"# 6. Box Plot Comparison\n",
|
| 354 |
+
"ax6 = fig.add_subplot(gs[2, :])\n",
|
| 355 |
+
"box_data = []\n",
|
| 356 |
+
"for model_name in models.keys():\n",
|
| 357 |
+
" rouge1_scores = [s['rouge_1_f1'] for s in evaluation_results[model_name]['individual_scores']]\n",
|
| 358 |
+
" for score in rouge1_scores:\n",
|
| 359 |
+
" box_data.append({'Model': model_name, 'ROUGE-1': score})\n",
|
| 360 |
+
"\n",
|
| 361 |
+
"box_df = pd.DataFrame(box_data)\n",
|
| 362 |
+
"sns.boxplot(data=box_df, x='Model', y='ROUGE-1', ax=ax6, palette=colors)\n",
|
| 363 |
+
"ax6.set_title('ROUGE-1 Score Distribution (Box Plot)', fontsize=14, fontweight='bold')\n",
|
| 364 |
+
"ax6.grid(axis='y', alpha=0.3)\n",
|
| 365 |
+
"\n",
|
| 366 |
+
"plt.savefig('../results/comprehensive_evaluation.png', dpi=300, bbox_inches='tight')\n",
|
| 367 |
+
"print(\"\\n✓ Comprehensive visualization saved!\")\n",
|
| 368 |
+
"plt.show()"
|
| 369 |
+
]
|
| 370 |
+
},
|
| 371 |
+
{
|
| 372 |
+
"cell_type": "code",
|
| 373 |
+
"execution_count": null,
|
| 374 |
+
"id": "3e24f94c",
|
| 375 |
+
"metadata": {},
|
| 376 |
+
"outputs": [],
|
| 377 |
+
"source": [
|
| 378 |
+
"print(\"\\n\" + \"=\"*70)\n",
|
| 379 |
+
"print(\"EXPORTING RESULTS FOR REPORT\")\n",
|
| 380 |
+
"print(\"=\"*70)\n",
|
| 381 |
+
"\n",
|
| 382 |
+
"# Create comprehensive export\n",
|
| 383 |
+
"export_data = {\n",
|
| 384 |
+
" 'evaluation_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),\n",
|
| 385 |
+
" 'dataset': {\n",
|
| 386 |
+
" 'name': 'CNN/DailyMail',\n",
|
| 387 |
+
" 'samples_evaluated': len(all_summaries['TextRank']),\n",
|
| 388 |
+
" 'split': 'test'\n",
|
| 389 |
+
" },\n",
|
| 390 |
+
" 'models': {\n",
|
| 391 |
+
" model_name: {\n",
|
| 392 |
+
" 'type': results_table[i]['Type'],\n",
|
| 393 |
+
" 'rouge_1': {\n",
|
| 394 |
+
" 'mean': evaluation_results[model_name]['rouge_1_f1_mean'],\n",
|
| 395 |
+
" 'std': evaluation_results[model_name]['rouge_1_f1_std']\n",
|
| 396 |
+
" },\n",
|
| 397 |
+
" 'rouge_2': {\n",
|
| 398 |
+
" 'mean': evaluation_results[model_name]['rouge_2_f1_mean'],\n",
|
| 399 |
+
" 'std': evaluation_results[model_name]['rouge_2_f1_std']\n",
|
| 400 |
+
" },\n",
|
| 401 |
+
" 'rouge_l': {\n",
|
| 402 |
+
" 'mean': evaluation_results[model_name]['rouge_l_f1_mean'],\n",
|
| 403 |
+
" 'std': evaluation_results[model_name]['rouge_l_f1_std']\n",
|
| 404 |
+
" },\n",
|
| 405 |
+
" 'processing_time': {\n",
|
| 406 |
+
" 'mean': evaluation_results[model_name]['avg_time'],\n",
|
| 407 |
+
" 'std': evaluation_results[model_name]['std_time']\n",
|
| 408 |
+
" }\n",
|
| 409 |
+
" }\n",
|
| 410 |
+
" for i, model_name in enumerate(models.keys())\n",
|
| 411 |
+
" },\n",
|
| 412 |
+
" 'statistical_tests': {\n",
|
| 413 |
+
" 'bart_vs_pegasus': sig_test\n",
|
| 414 |
+
" }\n",
|
| 415 |
+
"}\n",
|
| 416 |
+
"\n",
|
| 417 |
+
"with open('../results/final_evaluation.json', 'w') as f:\n",
|
| 418 |
+
" json.dump(export_data, f, indent=2)\n",
|
| 419 |
+
"\n",
|
| 420 |
+
"print(\"✓ Exported to results/final_evaluation.json\")\n",
|
| 421 |
+
"print(\"\\nFiles created for your report:\")\n",
|
| 422 |
+
"print(\" 1. results/evaluation_results.csv - Table for report\")\n",
|
| 423 |
+
"print(\" 2. results/comprehensive_evaluation.png - Main figure\")\n",
|
| 424 |
+
"print(\" 3. results/final_evaluation.json - All data\")\n",
|
| 425 |
+
"\n",
|
| 426 |
+
"# Cell 10: Summary for Report\n",
|
| 427 |
+
"print(\"\\n\" + \"=\"*70)\n",
|
| 428 |
+
"print(\"KEY FINDINGS FOR YOUR REPORT\")\n",
|
| 429 |
+
"print(\"=\"*70)\n",
|
| 430 |
+
"\n",
|
| 431 |
+
"best_model = max(evaluation_results.keys(), \n",
|
| 432 |
+
" key=lambda x: evaluation_results[x]['rouge_1_f1_mean'])\n",
|
| 433 |
+
"fastest_model = min(evaluation_results.keys(),\n",
|
| 434 |
+
" key=lambda x: evaluation_results[x]['avg_time'])\n",
|
| 435 |
+
"\n",
|
| 436 |
+
"print(f\"\\n1. Best Overall Performance: {best_model}\")\n",
|
| 437 |
+
"print(f\" - ROUGE-1: {evaluation_results[best_model]['rouge_1_f1_mean']:.4f}\")\n",
|
| 438 |
+
"print(f\" - ROUGE-2: {evaluation_results[best_model]['rouge_2_f1_mean']:.4f}\")\n",
|
| 439 |
+
"print(f\" - ROUGE-L: {evaluation_results[best_model]['rouge_l_f1_mean']:.4f}\")\n",
|
| 440 |
+
"\n",
|
| 441 |
+
"print(f\"\\n2. Fastest Processing: {fastest_model}\")\n",
|
| 442 |
+
"print(f\" - Avg time: {evaluation_results[fastest_model]['avg_time']:.3f}s\")\n",
|
| 443 |
+
"print(f\" - {evaluation_results[max(evaluation_results.keys(), key=lambda x: evaluation_results[x]['avg_time'])]['avg_time'] / evaluation_results[fastest_model]['avg_time']:.1f}x faster than slowest\")\n",
|
| 444 |
+
"\n",
|
| 445 |
+
"print(f\"\\n3. Extractive vs Abstractive:\")\n",
|
| 446 |
+
"print(f\" - TextRank (Extractive): ROUGE-1 = {evaluation_results['TextRank']['rouge_1_f1_mean']:.4f}\")\n",
|
| 447 |
+
"print(f\" - BART (Abstractive): ROUGE-1 = {evaluation_results['BART']['rouge_1_f1_mean']:.4f}\")\n",
|
| 448 |
+
"print(f\" - PEGASUS (Abstractive): ROUGE-1 = {evaluation_results['PEGASUS']['rouge_1_f1_mean']:.4f}\")\n",
|
| 449 |
+
"print(f\" - Abstractive models outperform extractive by {(evaluation_results[best_model]['rouge_1_f1_mean'] / evaluation_results['TextRank']['rouge_1_f1_mean'] - 1) * 100:.1f}%\")\n",
|
| 450 |
+
"\n",
|
| 451 |
+
"print(\"\\n\" + \"=\"*70)\n",
|
| 452 |
+
"print(\"✓ Evaluation complete! Use these results in your report.\")\n",
|
| 453 |
+
"print(\"=\"*70)"
|
| 454 |
+
]
|
| 455 |
+
}
|
| 456 |
+
],
|
| 457 |
+
"metadata": {
|
| 458 |
+
"kernelspec": {
|
| 459 |
+
"display_name": "Workshop2",
|
| 460 |
+
"language": "python",
|
| 461 |
+
"name": "python3"
|
| 462 |
+
},
|
| 463 |
+
"language_info": {
|
| 464 |
+
"codemirror_mode": {
|
| 465 |
+
"name": "ipython",
|
| 466 |
+
"version": 3
|
| 467 |
+
},
|
| 468 |
+
"file_extension": ".py",
|
| 469 |
+
"mimetype": "text/x-python",
|
| 470 |
+
"name": "python",
|
| 471 |
+
"nbconvert_exporter": "python",
|
| 472 |
+
"pygments_lexer": "ipython3",
|
| 473 |
+
"version": "3.13.9"
|
| 474 |
+
}
|
| 475 |
+
},
|
| 476 |
+
"nbformat": 4,
|
| 477 |
+
"nbformat_minor": 5
|
| 478 |
+
}
|
notebooks/.ipynb_checkpoints/03_evaluation_analysis_cnn_dailymail-checkpoint.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebooks/.ipynb_checkpoints/Smart-Summarizer-checkpoint.ipynb
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [],
|
| 3 |
+
"metadata": {},
|
| 4 |
+
"nbformat": 4,
|
| 5 |
+
"nbformat_minor": 5
|
| 6 |
+
}
|
notebooks/01_data_exploration.ipynb
CHANGED
|
@@ -240,7 +240,7 @@
|
|
| 240 |
],
|
| 241 |
"metadata": {
|
| 242 |
"kernelspec": {
|
| 243 |
-
"display_name": "
|
| 244 |
"language": "python",
|
| 245 |
"name": "python3"
|
| 246 |
},
|
|
|
|
| 240 |
],
|
| 241 |
"metadata": {
|
| 242 |
"kernelspec": {
|
| 243 |
+
"display_name": "Python 3 (ipykernel)",
|
| 244 |
"language": "python",
|
| 245 |
"name": "python3"
|
| 246 |
},
|
notebooks/02_model_testing.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebooks/03_evaluation_analysis.ipynb
CHANGED
|
@@ -456,7 +456,7 @@
|
|
| 456 |
],
|
| 457 |
"metadata": {
|
| 458 |
"kernelspec": {
|
| 459 |
-
"display_name": "
|
| 460 |
"language": "python",
|
| 461 |
"name": "python3"
|
| 462 |
},
|
|
|
|
| 456 |
],
|
| 457 |
"metadata": {
|
| 458 |
"kernelspec": {
|
| 459 |
+
"display_name": "Python 3 (ipykernel)",
|
| 460 |
"language": "python",
|
| 461 |
"name": "python3"
|
| 462 |
},
|
notebooks/03_evaluation_analysis_cnn_dailymail.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebooks/Smart-Summarizer.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
results/cnn_dailymail_evaluation_export.json
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"evaluation_metadata": {
|
| 3 |
+
"date": "2026-01-05 11:07:16",
|
| 4 |
+
"dataset": "CNN/DailyMail",
|
| 5 |
+
"dataset_version": "3.0.0",
|
| 6 |
+
"dataset_source": "abisee/cnn_dailymail",
|
| 7 |
+
"split": "test",
|
| 8 |
+
"samples_evaluated": 20,
|
| 9 |
+
"student_id": "23049149",
|
| 10 |
+
"module": "CU6051NI Artificial Intelligence"
|
| 11 |
+
},
|
| 12 |
+
"models_evaluated": {
|
| 13 |
+
"TextRank": {
|
| 14 |
+
"model_type": "Extractive",
|
| 15 |
+
"samples_processed": 20,
|
| 16 |
+
"rouge_scores": {
|
| 17 |
+
"rouge_1": {
|
| 18 |
+
"mean": 0.2506712046180422,
|
| 19 |
+
"std": 0.12486457066759726,
|
| 20 |
+
"interpretation": "Unigram overlap with reference"
|
| 21 |
+
},
|
| 22 |
+
"rouge_2": {
|
| 23 |
+
"mean": 0.10349963104031257,
|
| 24 |
+
"std": 0.0726814983362185,
|
| 25 |
+
"interpretation": "Bigram overlap with reference"
|
| 26 |
+
},
|
| 27 |
+
"rouge_l": {
|
| 28 |
+
"mean": 0.16371538220717308,
|
| 29 |
+
"std": 0.07898183592353582,
|
| 30 |
+
"interpretation": "Longest common subsequence"
|
| 31 |
+
}
|
| 32 |
+
},
|
| 33 |
+
"performance_metrics": {
|
| 34 |
+
"avg_processing_time": 0.005225980281829834,
|
| 35 |
+
"std_processing_time": 0.013506362618507201,
|
| 36 |
+
"total_processing_time": 0.10451960563659668,
|
| 37 |
+
"compression_ratio_mean": 4.669225541097807,
|
| 38 |
+
"compression_ratio_std": 2.4839200547893845
|
| 39 |
+
}
|
| 40 |
+
},
|
| 41 |
+
"BART": {
|
| 42 |
+
"model_type": "Abstractive",
|
| 43 |
+
"samples_processed": 20,
|
| 44 |
+
"rouge_scores": {
|
| 45 |
+
"rouge_1": {
|
| 46 |
+
"mean": 0.35022025793945055,
|
| 47 |
+
"std": 0.09190543055324636,
|
| 48 |
+
"interpretation": "Unigram overlap with reference"
|
| 49 |
+
},
|
| 50 |
+
"rouge_2": {
|
| 51 |
+
"mean": 0.1478972899837078,
|
| 52 |
+
"std": 0.08392194073728265,
|
| 53 |
+
"interpretation": "Bigram overlap with reference"
|
| 54 |
+
},
|
| 55 |
+
"rouge_l": {
|
| 56 |
+
"mean": 0.2604310393319945,
|
| 57 |
+
"std": 0.10189025331501939,
|
| 58 |
+
"interpretation": "Longest common subsequence"
|
| 59 |
+
}
|
| 60 |
+
},
|
| 61 |
+
"performance_metrics": {
|
| 62 |
+
"avg_processing_time": 6.735281562805175,
|
| 63 |
+
"std_processing_time": 1.252485361304747,
|
| 64 |
+
"total_processing_time": 134.70563125610352,
|
| 65 |
+
"compression_ratio_mean": 1.4679364788911673,
|
| 66 |
+
"compression_ratio_std": 0.3564447507091954
|
| 67 |
+
}
|
| 68 |
+
},
|
| 69 |
+
"PEGASUS": {
|
| 70 |
+
"model_type": "Abstractive",
|
| 71 |
+
"samples_processed": 20,
|
| 72 |
+
"rouge_scores": {
|
| 73 |
+
"rouge_1": {
|
| 74 |
+
"mean": 0.3530379619461269,
|
| 75 |
+
"std": 0.10720945707466437,
|
| 76 |
+
"interpretation": "Unigram overlap with reference"
|
| 77 |
+
},
|
| 78 |
+
"rouge_2": {
|
| 79 |
+
"mean": 0.1531830157168635,
|
| 80 |
+
"std": 0.08764155739126663,
|
| 81 |
+
"interpretation": "Bigram overlap with reference"
|
| 82 |
+
},
|
| 83 |
+
"rouge_l": {
|
| 84 |
+
"mean": 0.25491739595110097,
|
| 85 |
+
"std": 0.09604101774475897,
|
| 86 |
+
"interpretation": "Longest common subsequence"
|
| 87 |
+
}
|
| 88 |
+
},
|
| 89 |
+
"performance_metrics": {
|
| 90 |
+
"avg_processing_time": 8.351530861854553,
|
| 91 |
+
"std_processing_time": 0.8606459954310681,
|
| 92 |
+
"total_processing_time": 167.03061723709106,
|
| 93 |
+
"compression_ratio_mean": 1.268746653225481,
|
| 94 |
+
"compression_ratio_std": 0.34500943090569686
|
| 95 |
+
}
|
| 96 |
+
}
|
| 97 |
+
},
|
| 98 |
+
"summary_statistics": {
|
| 99 |
+
"total_models": 3,
|
| 100 |
+
"successful_evaluations": 3,
|
| 101 |
+
"best_rouge1_model": "PEGASUS",
|
| 102 |
+
"fastest_model": "TextRank"
|
| 103 |
+
},
|
| 104 |
+
"dataset_characteristics": {
|
| 105 |
+
"avg_article_length": 530.35,
|
| 106 |
+
"avg_reference_length": 36.7,
|
| 107 |
+
"avg_compression_ratio": 0.09504919499509967
|
| 108 |
+
}
|
| 109 |
+
}
|
results/cnn_dailymail_evaluation_results.csv
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,Type,ROUGE-1,ROUGE-2,ROUGE-L,Avg Time (s),Total Time (s),Samples
|
| 2 |
+
TextRank,Extractive,0.2507 ± 0.1249,0.1035 ± 0.0727,0.1637 ± 0.0790,0.005 ± 0.014,0.1,20
|
| 3 |
+
BART,Abstractive,0.3502 ± 0.0919,0.1479 ± 0.0839,0.2604 ± 0.1019,6.735 ± 1.252,134.7,20
|
| 4 |
+
PEGASUS,Abstractive,0.3530 ± 0.1072,0.1532 ± 0.0876,0.2549 ± 0.0960,8.352 ± 0.861,167.0,20
|
results/cnn_dailymail_report_summary.md
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# CNN/DailyMail Evaluation Report Summary
|
| 3 |
+
|
| 4 |
+
## Dataset Information
|
| 5 |
+
- **Dataset**: CNN/DailyMail v3.0.0 (abisee/cnn_dailymail)
|
| 6 |
+
- **Split**: Test set
|
| 7 |
+
- **Samples**: 20 articles evaluated
|
| 8 |
+
- **Average Article Length**: 530 words
|
| 9 |
+
- **Average Reference Length**: 37 words
|
| 10 |
+
|
| 11 |
+
## Model Performance (ROUGE-1 F1 Scores)
|
| 12 |
+
|
| 13 |
+
### PEGASUS (Abstractive)
|
| 14 |
+
- **ROUGE-1**: 0.3530 ± 0.1072
|
| 15 |
+
- **ROUGE-2**: 0.1532 ± 0.0876
|
| 16 |
+
- **ROUGE-L**: 0.2549 ± 0.0960
|
| 17 |
+
- **Avg Processing Time**: 8.352s per sample
|
| 18 |
+
|
| 19 |
+
### BART (Abstractive)
|
| 20 |
+
- **ROUGE-1**: 0.3502 ± 0.0919
|
| 21 |
+
- **ROUGE-2**: 0.1479 ± 0.0839
|
| 22 |
+
- **ROUGE-L**: 0.2604 ± 0.1019
|
| 23 |
+
- **Avg Processing Time**: 6.735s per sample
|
| 24 |
+
|
| 25 |
+
### TextRank (Extractive)
|
| 26 |
+
- **ROUGE-1**: 0.2507 ± 0.1249
|
| 27 |
+
- **ROUGE-2**: 0.1035 ± 0.0727
|
| 28 |
+
- **ROUGE-L**: 0.1637 ± 0.0790
|
| 29 |
+
- **Avg Processing Time**: 0.005s per sample
|
run_evaluation.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Simple script to run model evaluation on CNN/DailyMail dataset
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
import logging
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
# Add project root to path
|
| 12 |
+
project_root = Path(__file__).parent
|
| 13 |
+
sys.path.insert(0, str(project_root))
|
| 14 |
+
|
| 15 |
+
from evaluation.dataset_loader import CNNDailyMailLoader
|
| 16 |
+
from evaluation.model_evaluator import ModelEvaluator
|
| 17 |
+
from evaluation.results_analyzer import ResultsAnalyzer
|
| 18 |
+
|
| 19 |
+
# Setup logging
|
| 20 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
def main():
    """Run the full CNN/DailyMail evaluation pipeline.

    Loads the dataset, draws an evaluation subset, categorizes articles by
    topic, evaluates every registered model both overall and per topic, and
    writes results (JSON + CSV), charts, and a markdown report to OUTPUT_DIR.

    Raises:
        Exception: re-raised after logging if any pipeline step fails.
    """
    # Configuration
    SAMPLE_SIZE = 50  # number of articles to evaluate overall
    OUTPUT_DIR = "evaluation_results"

    # Create output directory (idempotent)
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    logger.info("Starting Smart Summarizer Evaluation")
    logger.info("Sample size: %d", SAMPLE_SIZE)  # lazy %-formatting for logging

    try:
        # Step 1: Load dataset (side effect: loader caches the dataset internally)
        logger.info("Step 1: Loading CNN/DailyMail dataset...")
        loader = CNNDailyMailLoader()
        loader.load_dataset()  # return value unused; subset is drawn via the loader below

        # Step 2: Create evaluation subset and persist it for reproducibility
        logger.info("Step 2: Creating evaluation subset...")
        eval_data = loader.create_evaluation_subset(size=SAMPLE_SIZE)
        loader.save_evaluation_data(eval_data, f"{OUTPUT_DIR}/eval_data.json")

        # Step 3: Categorize articles by topic
        logger.info("Step 3: Categorizing by topics...")
        categorized_data = loader.categorize_by_topic(eval_data)

        # Save each non-empty topic bucket to its own file
        for topic, data in categorized_data.items():
            if data:
                loader.save_evaluation_data(data, f"{OUTPUT_DIR}/data_{topic}.json")
                logger.info("  %s: %d articles", topic, len(data))

        # Step 4: Initialize summarization models
        logger.info("Step 4: Initializing models...")
        evaluator = ModelEvaluator()
        evaluator.initialize_models()

        # Step 5: Run overall evaluation across all models
        logger.info("Step 5: Running overall evaluation...")
        overall_results = evaluator.evaluate_all_models(eval_data, max_samples=SAMPLE_SIZE)

        # Save overall results (detailed JSON + summary CSV)
        evaluator.save_results(overall_results, f"{OUTPUT_DIR}/results_overall.json")
        comparison_df = evaluator.compare_models(overall_results)
        comparison_df.to_csv(f"{OUTPUT_DIR}/comparison_overall.csv", index=False)

        print("\n" + "=" * 60)
        print("OVERALL EVALUATION RESULTS")
        print("=" * 60)
        print(comparison_df.to_string(index=False))

        # Step 6: Run topic-based evaluation (only topics with enough data)
        logger.info("Step 6: Running topic-based evaluation...")
        topic_results = {}

        for topic, data in categorized_data.items():
            if len(data) >= 5:  # skip topics too small for meaningful metrics
                logger.info("  Evaluating topic: %s", topic)
                topic_results[topic] = evaluator.evaluate_all_models(data, max_samples=20)

                # Save per-topic results and comparison table
                evaluator.save_results(topic_results[topic], f"{OUTPUT_DIR}/results_{topic}.json")
                topic_comparison = evaluator.compare_models(topic_results[topic])
                topic_comparison.to_csv(f"{OUTPUT_DIR}/comparison_{topic}.csv", index=False)

                print(f"\n{topic.upper()} TOPIC RESULTS:")
                print("-" * 40)
                print(topic_comparison.to_string(index=False))

        # Step 7: Create visualizations and analysis artifacts
        logger.info("Step 7: Creating analysis and visualizations...")
        analyzer = ResultsAnalyzer()

        # Overall performance charts
        analyzer.create_performance_charts(overall_results, OUTPUT_DIR)

        # Topic analysis only if at least one topic was evaluated
        if topic_results:
            analyzer.analyze_topic_performance(topic_results, OUTPUT_DIR)

        # Detailed markdown report
        analyzer.create_detailed_report(overall_results, OUTPUT_DIR)

        print("\n" + "=" * 60)
        print("EVALUATION COMPLETE")
        print("=" * 60)
        print(f"Results saved to: {OUTPUT_DIR}/")
        print("Files created:")
        print("  - results_overall.json (detailed results)")
        print("  - comparison_overall.csv (summary table)")
        print("  - performance_comparison.png (charts)")
        print("  - evaluation_report.md (detailed report)")
        if topic_results:
            print("  - topic_performance_heatmap.png (topic analysis)")
            print("  - topic_summary.csv (topic breakdown)")

    except Exception as e:
        # logger.exception records the full traceback, unlike logger.error
        logger.exception("Evaluation failed: %s", e)
        raise

if __name__ == "__main__":
    main()
|