Spaces:

Rajak13
/

smart-summarizer

Running

App Files Files Community

smart-summarizer / models /pegasus.py

Rajak13

Upload folder using huggingface_hub (#1)

634567d verified 23 days ago

raw

history blame contribute delete

14.6 kB

	"""
	PEGASUS (Pre-training with Extracted Gap-sentences for Abstractive SUmmarization)
	State-of-the-art model specifically designed for summarization tasks
	Professional implementation with Gap Sentence Generation pre-training
	"""

	# Handle imports when running directly (python models/pegasus.py)
	# For proper package usage, run as: python -m models.pegasus
	import sys
	from pathlib import Path
	# Make the repository root importable so `models.base_summarizer` resolves
	# when this file is executed directly (python models/pegasus.py).
	# resolve() yields an absolute path, so the sys.path entry stays valid even
	# if the working directory changes after import.
	project_root = Path(__file__).resolve().parent.parent
	if str(project_root) not in sys.path:
	    sys.path.insert(0, str(project_root))

	from transformers import PegasusForConditionalGeneration, PegasusTokenizer
	import torch
	import logging
	from typing import Dict, List, Optional
	from models.base_summarizer import BaseSummarizer

	logger = logging.getLogger(__name__)


	class PEGASUSSummarizer(BaseSummarizer):
	    r"""
	    PEGASUS implementation for abstractive text summarization.

	    Innovation: Gap Sentence Generation (GSG)
	    - Pre-training task: predict important sentences removed from the input
	    - Directly aligned with the summarization objective

	    Model architecture (default checkpoint):
	    - Transformer encoder-decoder (16 layers each)
	    - Pre-trained on C4 and HugeNews datasets
	    - Fine-tuned on domain-specific summarization data

	    Performance highlights quoted for CNN/DailyMail:
	    - ROUGE-1: 44.17
	    - ROUGE-2: 21.47
	    - ROUGE-L: 41.11

	    Mathematical foundation:
	    Sentence importance: ROUGE-F1(Si, D\Si)
	    where Si = sentence i and D\Si = the document without sentence i.

	    Attributes set by ``__init__``:
	        device: Device the model was moved to (e.g. "cuda", "cuda:0", "cpu").
	        tokenizer: ``PegasusTokenizer`` loaded for the checkpoint.
	        model: ``PegasusForConditionalGeneration`` in eval mode.
	        config: The loaded model's configuration object.
	        model_name_full: The checkpoint identifier passed to ``__init__``.
	        is_initialized: Set to True once loading succeeded.
	    """

	    # Upper bound on encoder input tokens; also the fallback when the loaded
	    # checkpoint's config does not expose its own positional limit.
	    DEFAULT_MAX_INPUT_TOKENS = 1024

	    def __init__(self,
	                 model_name: str = "google/pegasus-cnn_dailymail",
	                 device: Optional[str] = None,
	                 use_fp16: bool = False):
	        """
	        Load the tokenizer and model weights and prepare them for inference.

	        Args:
	            model_name: HuggingFace model identifier, e.g.
	                'google/pegasus-cnn_dailymail' (recommended),
	                'google/pegasus-xsum' (for extreme summarization),
	                'google/pegasus-large' (base model).
	            device: Computing device ('cuda', 'cuda:N', 'cpu', or None to
	                pick CUDA when available and CPU otherwise).
	            use_fp16: Convert the model to 16-bit floats. Only applied when
	                the model lives on a CUDA device.

	        Raises:
	            Exception: Whatever the underlying loaders raise; the error is
	                logged and re-raised unchanged.
	        """
	        super().__init__(model_name="PEGASUS", model_type="Abstractive")

	        logger.info(f"Loading PEGASUS model: {model_name}")
	        logger.info("PEGASUS is a large model. Initial loading may take 3-5 minutes...")

	        # Determine device
	        if device is None:
	            self.device = "cuda" if torch.cuda.is_available() else "cpu"
	        else:
	            self.device = device

	        logger.info(f"Using device: {self.device}")

	        # Load tokenizer and model
	        try:
	            logger.info("Loading tokenizer...")
	            self.tokenizer = PegasusTokenizer.from_pretrained(model_name)

	            logger.info("Loading model weights...")
	            self.model = PegasusForConditionalGeneration.from_pretrained(model_name)

	            # Move to device
	            self.model.to(self.device)

	            # Enable FP16 if requested. The check accepts indexed devices
	            # such as "cuda:0", not just the bare string "cuda".
	            if use_fp16 and self._on_cuda():
	                self.model.half()
	                logger.info("Using FP16 precision")

	            # Set to evaluation mode
	            self.model.eval()

	            self.model_name_full = model_name
	            self.config = self.model.config
	            self.is_initialized = True

	            logger.info("PEGASUS model loaded successfully!")
	            logger.info(f"Model size: {self._count_parameters() / 1e6:.1f}M parameters")

	        except Exception as e:
	            logger.error(f"Failed to load PEGASUS model: {e}")
	            raise

	    def _on_cuda(self) -> bool:
	        """Return True when the configured device is any CUDA device."""
	        # str() also covers a torch.device instance; the getattr default
	        # keeps this safe on a partially constructed object (see __del__).
	        return str(getattr(self, "device", "")).startswith("cuda")

	    def _max_input_length(self) -> int:
	        """
	        Return the number of input tokens to truncate to.

	        Uses the checkpoint's ``max_position_embeddings`` when the config
	        exposes it, capped at ``DEFAULT_MAX_INPUT_TOKENS`` so checkpoints
	        with a 1024-token limit behave exactly as before.
	        """
	        limit = getattr(getattr(self, "config", None), "max_position_embeddings", None)
	        if isinstance(limit, int) and limit > 0:
	            return min(limit, self.DEFAULT_MAX_INPUT_TOKENS)
	        return self.DEFAULT_MAX_INPUT_TOKENS

	    def _encode(self, texts, padding):
	        """
	        Tokenize ``texts`` and move the tensors to the model's device.

	        Args:
	            texts: A single string or a list of strings.
	            padding: Padding strategy forwarded to the tokenizer.

	        Returns:
	            ``(input_ids, attention_mask)`` tensors on ``self.device``.
	        """
	        encoded = self.tokenizer(
	            texts,
	            max_length=self._max_input_length(),
	            truncation=True,
	            padding=padding,
	            return_tensors="pt"
	        )
	        return (
	            encoded["input_ids"].to(self.device),
	            encoded["attention_mask"].to(self.device),
	        )

	    def _count_parameters(self) -> int:
	        """Count total number of trainable parameters"""
	        return sum(p.numel() for p in self.model.parameters() if p.requires_grad)

	    def summarize(self,
	                  text: str,
	                  max_length: int = 128,
	                  min_length: int = 32,
	                  num_beams: int = 4,
	                  length_penalty: float = 2.0,
	                  no_repeat_ngram_size: int = 3,
	                  early_stopping: bool = True,
	                  do_sample: bool = False,
	                  temperature: float = 1.0) -> str:
	        """
	        Generate an abstractive summary of ``text`` using PEGASUS.

	        Args:
	            text: Input text to summarize
	            max_length: Maximum summary length in tokens
	            min_length: Minimum summary length in tokens
	            num_beams: Beam search width (used when do_sample is False)
	            length_penalty: Beam-search length penalty (used when
	                do_sample is False)
	            no_repeat_ngram_size: Prevent n-gram repetition
	            early_stopping: Stop when beams complete (used when do_sample
	                is False)
	            do_sample: Use sampling (top_k=50, top_p=0.95) instead of beam
	                search
	            temperature: Sampling randomness (used when do_sample is True)

	        Returns:
	            Generated summary string
	        """
	        # Validate input
	        self.validate_input(text)

	        # A single sequence needs no padding. Padding it to the maximum
	        # length only adds positions that attention_mask masks out again.
	        input_ids, attention_mask = self._encode(text, padding=False)

	        # Arguments shared by both decoding strategies
	        generation_args = {
	            "attention_mask": attention_mask,
	            "max_length": max_length,
	            "min_length": min_length,
	            "no_repeat_ngram_size": no_repeat_ngram_size,
	        }
	        if do_sample:
	            # Sampling-based generation
	            generation_args.update(
	                do_sample=True,
	                temperature=temperature,
	                top_k=50,
	                top_p=0.95,
	            )
	        else:
	            # Beam search generation (recommended for PEGASUS)
	            generation_args.update(
	                num_beams=num_beams,
	                length_penalty=length_penalty,
	                early_stopping=early_stopping,
	            )

	        # Generate summary
	        with torch.no_grad():
	            summary_ids = self.model.generate(input_ids, **generation_args)

	        # Decode summary
	        summary = self.tokenizer.decode(
	            summary_ids[0],
	            skip_special_tokens=True,
	            clean_up_tokenization_spaces=True
	        )

	        return summary

	    def batch_summarize(self,
	                        texts: List[str],
	                        batch_size: int = 2,
	                        max_length: int = 128,
	                        **kwargs) -> List[str]:
	        """
	        Summarize several texts, ``batch_size`` at a time, with beam search.

	        Args:
	            texts: List of texts to summarize
	            batch_size: Texts per batch (2-4 recommended for PEGASUS)
	            max_length: Maximum summary length
	            **kwargs: Optional generation parameters. Recognized keys:
	                num_beams (default 4), length_penalty (default 2.0),
	                early_stopping (default True), and min_length /
	                no_repeat_ngram_size, which are forwarded only when
	                supplied so the default behavior is unchanged.

	        Returns:
	            List of generated summaries, in the same order as ``texts``

	        Raises:
	            ValueError: If batch_size is smaller than 1.
	        """
	        # range() would fail obscurely on 0 and silently yield nothing on a
	        # negative step, so reject both up front.
	        if batch_size < 1:
	            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

	        logger.info(f"Batch summarizing {len(texts)} texts (batch_size={batch_size})")

	        generation_args = {
	            "max_length": max_length,
	            "num_beams": kwargs.get('num_beams', 4),
	            "length_penalty": kwargs.get('length_penalty', 2.0),
	            "early_stopping": kwargs.get('early_stopping', True),
	        }
	        # Forward these only on request; otherwise generate() keeps using
	        # whatever defaults it used before.
	        for optional_key in ('min_length', 'no_repeat_ngram_size'):
	            if optional_key in kwargs:
	                generation_args[optional_key] = kwargs[optional_key]

	        total_batches = (len(texts) - 1) // batch_size + 1
	        summaries = []

	        for batch_number, start in enumerate(range(0, len(texts), batch_size), start=1):
	            batch = texts[start:start + batch_size]

	            # Tokenize; pad to the longest text in this batch
	            input_ids, attention_mask = self._encode(batch, padding=True)

	            # Generate
	            with torch.no_grad():
	                summary_ids = self.model.generate(
	                    input_ids,
	                    attention_mask=attention_mask,
	                    **generation_args
	                )

	            # Decode
	            summaries.extend(
	                self.tokenizer.decode(ids, skip_special_tokens=True)
	                for ids in summary_ids
	            )

	            logger.info(f"Completed batch {batch_number}/{total_batches}")

	        return summaries

	    def get_model_info(self) -> Dict:
	        """
	        Return the base-class model info extended with PEGASUS details.

	        The 'architecture' numbers are read from the loaded model's config,
	        falling back to the values of the default checkpoint when an
	        attribute is missing. The 'fine_tuning' and 'performance' entries
	        are fixed descriptions of the CNN/DailyMail checkpoint and do not
	        change with the loaded model.
	        """
	        config = getattr(self, "config", None)
	        info = super().get_model_info()
	        info.update({
	            'algorithm': 'Gap Sentence Generation (GSG) + Transformer',
	            'innovation': 'Pre-training specifically designed for summarization',
	            'architecture': {
	                'encoder_layers': getattr(config, 'encoder_layers', 16),
	                'decoder_layers': getattr(config, 'decoder_layers', 16),
	                'attention_heads': getattr(config, 'encoder_attention_heads', 16),
	                'hidden_size': getattr(config, 'd_model', 1024),
	                'parameters': f'{self._count_parameters() / 1e6:.1f}M',
	                'vocabulary_size': self.tokenizer.vocab_size
	            },
	            'pre_training': {
	                'objective': 'Gap Sentence Generation (GSG)',
	                'method': 'Mask and predict important sentences',
	                'datasets': ['C4 corpus', 'HugeNews dataset'],
	                'sentence_selection': 'ROUGE-based importance scoring'
	            },
	            'fine_tuning': {
	                'dataset': 'CNN/DailyMail',
	                'task': 'Abstractive summarization'
	            },
	            'performance': {
	                'rouge_1': '44.17',
	                'rouge_2': '21.47',
	                'rouge_l': '41.11',
	                'benchmark': 'CNN/DailyMail test set',
	                'ranking': 'State-of-the-art (as of 2020)'
	            },
	            'advantages': [
	                'Highest ROUGE scores on benchmarks',
	                'Excellent zero-shot performance',
	                'Generates highly coherent summaries',
	                'Pre-training aligned with summarization',
	                'Strong transfer learning capabilities'
	            ],
	            'limitations': [
	                'Very large model (high memory requirements)',
	                'Slower inference than smaller models',
	                'May hallucinate facts',
	                'Less interpretable (black-box)',
	                'Requires powerful GPU for real-time use'
	            ],
	            'optimal_use_cases': [
	                'High-quality abstractive summaries needed',
	                'News article summarization',
	                'Long document summarization',
	                'Multi-document summarization',
	                'Research paper abstracts'
	            ]
	        })
	        return info

	    def get_special_tokens(self) -> Dict:
	        """Return the tokenizer's pad/eos/unk/mask tokens and vocabulary size"""
	        return {
	            'pad_token': self.tokenizer.pad_token,
	            'eos_token': self.tokenizer.eos_token,
	            'unk_token': self.tokenizer.unk_token,
	            'mask_token': self.tokenizer.mask_token,
	            'vocab_size': self.tokenizer.vocab_size
	        }

	    def __del__(self):
	        """Best-effort release of cached GPU memory when the object dies"""
	        # Finalizers may run on a half-built instance or during interpreter
	        # shutdown, when module globals such as torch/logger can already be
	        # torn down; cleanup is optional, so failures are ignored here.
	        try:
	            if self._on_cuda():
	                torch.cuda.empty_cache()
	                logger.info("Cleared GPU cache")
	        except Exception:
	            pass


	# Test the implementation
	if __name__ == "__main__":
	    # Manual smoke test: load the default checkpoint, summarize a short
	    # passage and print the metrics plus a few model facts.
	    sample_text = """
	Climate change poses one of the greatest challenges to humanity in the 21st century.
	Rising global temperatures are causing ice caps to melt and sea levels to rise.
	Extreme weather events like hurricanes, droughts, and floods are becoming more frequent.
	Scientists warn that without immediate action, the consequences could be catastrophic.
	Renewable energy sources like solar and wind power offer sustainable alternatives to
	fossil fuels. Many countries have committed to reducing carbon emissions through the
	Paris Agreement. However, implementing these changes requires unprecedented international
	cooperation and technological innovation. The transition to a green economy will create
	new jobs while protecting the environment for future generations.
	"""

	    banner = "=" * 70
	    print(banner)
	    print("PEGASUS SUMMARIZER - PROFESSIONAL TEST")
	    print(banner)

	    # Build the summarizer with its default checkpoint and device
	    summarizer = PEGASUSSummarizer()

	    # Summarize and collect the accompanying metrics
	    outcome = summarizer.summarize_with_metrics(
	        sample_text,
	        max_length=100,
	        min_length=30,
	        num_beams=4,
	        length_penalty=2.0
	    )
	    meta = outcome['metadata']

	    print(f"\nModel: {meta['model_name']}")
	    print(f"Type: {meta['model_type']}")
	    print(f"Device: {summarizer.device}")
	    print(f"Input Length: {meta['input_length']} words")
	    print(f"Summary Length: {meta['summary_length']} words")
	    print(f"Compression Ratio: {meta['compression_ratio']:.2%}")
	    print(f"Processing Time: {meta['processing_time']:.4f} seconds")

	    print(f"\n{'Generated Summary:':-^70}")
	    print(outcome['summary'])

	    print(f"\n{'Model Architecture:':-^70}")
	    info = summarizer.get_model_info()
	    scores = info['performance']
	    print(f"Parameters: {info['architecture']['parameters']}")
	    print(f"Pre-training: {info['pre_training']['objective']}")
	    print(
	        f"Performance (CNN/DM): ROUGE-1={scores['rouge_1']}, "
	        f"ROUGE-2={scores['rouge_2']}, "
	        f"ROUGE-L={scores['rouge_l']}"
	    )

	    print("\n" + banner)