""" TextRank Extractive Summarization Graph-based ranking algorithm inspired by PageRank Professional implementation with extensive documentation """ # Handle imports when running directly (python models/textrank.py) # For proper package usage, run as: python -m models.textrank import sys from pathlib import Path project_root = Path(__file__).parent.parent if str(project_root) not in sys.path: sys.path.insert(0, str(project_root)) import numpy as np import networkx as nx from nltk.tokenize import sent_tokenize, word_tokenize from nltk.corpus import stopwords from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity import logging from typing import Dict, List, Tuple, Optional from models.base_summarizer import BaseSummarizer # Setup logging logger = logging.getLogger(__name__) class TextRankSummarizer(BaseSummarizer): """ TextRank implementation for extractive text summarization. Algorithm Overview: 1. Split text into sentences 2. Create TF-IDF vectors for each sentence 3. Calculate cosine similarity between all sentence pairs 4. Build weighted graph (sentences as nodes, similarities as edges) 5. Apply PageRank algorithm to rank sentences 6. Select top-ranked sentences for summary Advantages: - Fast and efficient (no neural networks) - Language-agnostic (works on any language) - Interpretable results - No training required Limitations: - Cannot generate new sentences - May select redundant information - Limited semantic understanding """ def __init__(self, damping: float = 0.85, max_iter: int = 100, tol: float = 1e-4, summary_ratio: float = 0.3, min_sentence_length: int = 5): """ Initialize TextRank Summarizer Args: damping: PageRank damping factor (0-1). Higher = more weight to neighbors max_iter: Maximum iterations for PageRank convergence tol: Convergence tolerance for PageRank summary_ratio: Proportion of sentences to include (0-1) min_sentence_length: Minimum words per sentence to consider """ super().__init__(model_name="TextRank", model_type="Extractive") self.damping = damping self.max_iter = max_iter self.tol = tol self.summary_ratio = summary_ratio self.min_sentence_length = min_sentence_length # Initialize stopwords try: self.stop_words = set(stopwords.words('english')) except LookupError: logger.warning("NLTK stopwords not found. Downloading...") import nltk nltk.download('stopwords') self.stop_words = set(stopwords.words('english')) self.is_initialized = True logger.info("TextRank summarizer initialized successfully") def preprocess(self, text: str) -> Tuple[List[str], List[str]]: """ Preprocess text into sentences Args: text: Input text string Returns: Tuple of (original_sentences, cleaned_sentences) """ # Split into sentences with error handling for NLTK data try: sentences = sent_tokenize(text) except LookupError: logger.warning("NLTK punkt tokenizer not found. Downloading...") import nltk nltk.download('punkt') nltk.download('punkt_tab') # Download both punkt versions sentences = sent_tokenize(text) # Filter out very short sentences filtered_sentences = [ s for s in sentences if len(s.split()) >= self.min_sentence_length ] if not filtered_sentences: filtered_sentences = sentences # Keep all if filtering removes everything # Clean sentences for similarity calculation cleaned_sentences = [] for sent in filtered_sentences: # Tokenize and lowercase with error handling try: words = word_tokenize(sent.lower()) except LookupError: logger.warning("NLTK punkt tokenizer not found for word tokenization. Downloading...") import nltk nltk.download('punkt') nltk.download('punkt_tab') words = word_tokenize(sent.lower()) # Remove stopwords and non-alphanumeric tokens words = [w for w in words if w.isalnum() and w not in self.stop_words] cleaned_sentences.append(' '.join(words)) return filtered_sentences, cleaned_sentences def build_similarity_matrix(self, sentences: List[str]) -> np.ndarray: """ Build sentence similarity matrix using TF-IDF and cosine similarity Mathematical Foundation: - TF-IDF: Term Frequency-Inverse Document Frequency - Cosine Similarity: cos(θ) = (A·B) / (||A|| × ||B||) Args: sentences: List of cleaned sentences Returns: Similarity matrix (numpy array) of shape [n_sentences, n_sentences] """ # Edge case handling n_sentences = len(sentences) if n_sentences < 2: return np.zeros((n_sentences, n_sentences)) # Remove empty sentences valid_sentences = [s for s in sentences if s.strip()] if not valid_sentences: return np.zeros((n_sentences, n_sentences)) try: # Create TF-IDF vectors vectorizer = TfidfVectorizer( max_features=1000, # Limit features for efficiency ngram_range=(1, 2) # Use unigrams and bigrams ) tfidf_matrix = vectorizer.fit_transform(valid_sentences) # Calculate cosine similarity similarity_matrix = cosine_similarity(tfidf_matrix) # Set diagonal to 0 (sentence shouldn't be similar to itself) np.fill_diagonal(similarity_matrix, 0) return similarity_matrix except ValueError as e: logger.error(f"Error building similarity matrix: {e}") return np.zeros((n_sentences, n_sentences)) def calculate_pagerank(self, similarity_matrix: np.ndarray) -> Dict[int, float]: """ Apply PageRank algorithm to rank sentences PageRank Formula: WS(Vi) = (1-d) + d × Σ(wji / Σwjk) × WS(Vj) Where: - WS(Vi) = Score of sentence i - d = damping factor - wji = weight of edge from sentence j to i Args: similarity_matrix: Sentence similarity matrix Returns: Dictionary mapping sentence index to score """ # Create graph from similarity matrix nx_graph = nx.from_numpy_array(similarity_matrix) try: # Calculate PageRank scores scores = nx.pagerank( nx_graph, alpha=self.damping, # damping factor max_iter=self.max_iter, tol=self.tol ) return scores except Exception as e: logger.error(f"PageRank calculation failed: {e}") # Return uniform scores as fallback n_nodes = similarity_matrix.shape[0] return {i: 1.0/n_nodes for i in range(n_nodes)} def summarize(self, text: str, num_sentences: Optional[int] = None, return_scores: bool = False) -> str: """ Generate extractive summary using TextRank Args: text: Input text to summarize num_sentences: Number of sentences in summary (overrides ratio) return_scores: If True, return tuple of (summary, scores) Returns: Summary string, or tuple of (summary, scores) if return_scores=True """ # Validate input self.validate_input(text) # Preprocess original_sentences, cleaned_sentences = self.preprocess(text) # Edge cases if len(original_sentences) == 0: return "" if not return_scores else ("", {}) if len(original_sentences) == 1: summary = original_sentences[0] return summary if not return_scores else (summary, {0: 1.0}) # Build similarity matrix similarity_matrix = self.build_similarity_matrix(cleaned_sentences) # Calculate sentence scores using PageRank scores = self.calculate_pagerank(similarity_matrix) # Determine number of sentences for summary if num_sentences is None: num_sentences = max(1, int(len(original_sentences) * self.summary_ratio)) num_sentences = min(num_sentences, len(original_sentences)) # Rank sentences by score ranked_sentences = sorted( ((scores[i], i, s) for i, s in enumerate(original_sentences)), reverse=True ) # Select top sentences and maintain original order top_sentences = sorted( ranked_sentences[:num_sentences], key=lambda x: x[1] # Sort by original position ) # Build summary summary = ' '.join([sent for _, _, sent in top_sentences]) if return_scores: return summary, { 'sentence_scores': scores, 'selected_indices': [idx for _, idx, _ in top_sentences], 'num_sentences_original': len(original_sentences), 'num_sentences_summary': num_sentences } return summary def get_sentence_importance(self, text: str) -> List[Tuple[str, float]]: """ Get all sentences with their importance scores Args: text: Input text Returns: List of (sentence, score) tuples sorted by importance """ original_sentences, cleaned_sentences = self.preprocess(text) if len(original_sentences) < 2: return [(s, 1.0) for s in original_sentences] similarity_matrix = self.build_similarity_matrix(cleaned_sentences) scores = self.calculate_pagerank(similarity_matrix) # Combine sentences with scores sentence_importance = [ (original_sentences[i], scores[i]) for i in range(len(original_sentences)) ] # Sort by importance sentence_importance.sort(key=lambda x: x[1], reverse=True) return sentence_importance def get_model_info(self) -> Dict: """Return detailed model information""" info = super().get_model_info() info.update({ 'algorithm': 'Graph-based PageRank', 'parameters': { 'damping_factor': self.damping, 'max_iterations': self.max_iter, 'tolerance': self.tol, 'summary_ratio': self.summary_ratio, 'min_sentence_length': self.min_sentence_length }, 'complexity': 'O(V²) where V = number of sentences', 'advantages': [ 'Fast and efficient', 'No training required', 'Language-agnostic', 'Interpretable results' ], 'limitations': [ 'Cannot generate new sentences', 'Limited semantic understanding', 'May miss context' ] }) return info # Test the implementation if __name__ == "__main__": # Sample academic text sample_text = """ Artificial intelligence has become one of the most transformative technologies of the 21st century. Machine learning, a subset of AI, enables computers to learn from data without explicit programming. Deep learning uses neural networks with multiple layers to process complex patterns. Natural language processing allows machines to understand and generate human language. Computer vision enables machines to interpret visual information from the world. AI applications span healthcare, finance, education, transportation, and entertainment. Ethical considerations around AI include privacy, bias, and job displacement. The future of AI promises both unprecedented opportunities and significant challenges that society must navigate carefully. """ # Initialize summarizer summarizer = TextRankSummarizer(summary_ratio=0.3) print("=" * 70) print("TEXTRANK SUMMARIZER - PROFESSIONAL TEST") print("=" * 70) # Generate summary with metrics result = summarizer.summarize_with_metrics(sample_text) print(f"\nModel: {result['metadata']['model_name']}") print(f"Type: {result['metadata']['model_type']}") print(f"Input Length: {result['metadata']['input_length']} words") print(f"Summary Length: {result['metadata']['summary_length']} words") print(f"Compression Ratio: {result['metadata']['compression_ratio']:.2%}") print(f"Processing Time: {result['metadata']['processing_time']:.4f} seconds") print(f"\n{'Summary:':-^70}") print(result['summary']) print(f"\n{'Sentence Importance Ranking:':-^70}") importance = summarizer.get_sentence_importance(sample_text) for i, (sent, score) in enumerate(importance[:5], 1): print(f"{i}. [Score: {score:.4f}] {sent[:80]}...") print("\n" + "=" * 70) print(summarizer.get_model_info())