"""
TextRank Extractive Summarization
Graph-based ranking algorithm inspired by PageRank
Professional implementation with extensive documentation
"""

# Handle imports when running directly (python models/textrank.py)
# For proper package usage, run as: python -m models.textrank
import sys
from pathlib import Path
project_root = Path(__file__).parent.parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import numpy as np
import networkx as nx
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import logging
from typing import Dict, List, Tuple, Optional
from models.base_summarizer import BaseSummarizer

# Setup logging
logger = logging.getLogger(__name__)


class TextRankSummarizer(BaseSummarizer):
    """
    TextRank implementation for extractive text summarization.
    
    Algorithm Overview:
    1. Split text into sentences
    2. Create TF-IDF vectors for each sentence
    3. Calculate cosine similarity between all sentence pairs
    4. Build weighted graph (sentences as nodes, similarities as edges)
    5. Apply PageRank algorithm to rank sentences
    6. Select top-ranked sentences for summary
    
    Advantages:
    - Fast and efficient (no neural networks)
    - Language-agnostic (works on any language)
    - Interpretable results
    - No training required
    
    Limitations:
    - Cannot generate new sentences
    - May select redundant information
    - Limited semantic understanding
    """
    
    def __init__(self, 
                 damping: float = 0.85,
                 max_iter: int = 100,
                 tol: float = 1e-4,
                 summary_ratio: float = 0.3,
                 min_sentence_length: int = 5):
        """
        Initialize TextRank Summarizer
        
        Args:
            damping: PageRank damping factor (0-1). Higher = more weight to neighbors
            max_iter: Maximum iterations for PageRank convergence
            tol: Convergence tolerance for PageRank
            summary_ratio: Proportion of sentences to include (0-1)
            min_sentence_length: Minimum words per sentence to consider
        """
        super().__init__(model_name="TextRank", model_type="Extractive")
        
        self.damping = damping
        self.max_iter = max_iter
        self.tol = tol
        self.summary_ratio = summary_ratio
        self.min_sentence_length = min_sentence_length
        
        # Initialize stopwords
        try:
            self.stop_words = set(stopwords.words('english'))
        except LookupError:
            logger.warning("NLTK stopwords not found. Downloading...")
            import nltk
            nltk.download('stopwords')
            self.stop_words = set(stopwords.words('english'))
        
        self.is_initialized = True
        logger.info("TextRank summarizer initialized successfully")
    
    def preprocess(self, text: str) -> Tuple[List[str], List[str]]:
        """
        Preprocess text into sentences
        
        Args:
            text: Input text string
            
        Returns:
            Tuple of (original_sentences, cleaned_sentences)
        """
        # Split into sentences with error handling for NLTK data
        try:
            sentences = sent_tokenize(text)
        except LookupError:
            logger.warning("NLTK punkt tokenizer not found. Downloading...")
            import nltk
            nltk.download('punkt')
            nltk.download('punkt_tab')  # Download both punkt versions
            sentences = sent_tokenize(text)
        
        # Filter out very short sentences
        filtered_sentences = [
            s for s in sentences 
            if len(s.split()) >= self.min_sentence_length
        ]
        
        if not filtered_sentences:
            filtered_sentences = sentences  # Keep all if filtering removes everything
        
        # Clean sentences for similarity calculation
        cleaned_sentences = []
        for sent in filtered_sentences:
            # Tokenize and lowercase with error handling
            try:
                words = word_tokenize(sent.lower())
            except LookupError:
                logger.warning("NLTK punkt tokenizer not found for word tokenization. Downloading...")
                import nltk
                nltk.download('punkt')
                nltk.download('punkt_tab')
                words = word_tokenize(sent.lower())
            
            # Remove stopwords and non-alphanumeric tokens
            words = [w for w in words if w.isalnum() and w not in self.stop_words]
            cleaned_sentences.append(' '.join(words))
        
        return filtered_sentences, cleaned_sentences
    
    def build_similarity_matrix(self, sentences: List[str]) -> np.ndarray:
        """
        Build sentence similarity matrix using TF-IDF and cosine similarity
        
        Mathematical Foundation:
        - TF-IDF: Term Frequency-Inverse Document Frequency
        - Cosine Similarity: cos(θ) = (A·B) / (||A|| × ||B||)
        
        Args:
            sentences: List of cleaned sentences
            
        Returns:
            Similarity matrix (numpy array) of shape [n_sentences, n_sentences]
        """
        # Edge case handling
        n_sentences = len(sentences)
        if n_sentences < 2:
            return np.zeros((n_sentences, n_sentences))
        
        # Remove empty sentences
        valid_sentences = [s for s in sentences if s.strip()]
        if not valid_sentences:
            return np.zeros((n_sentences, n_sentences))
        
        try:
            # Create TF-IDF vectors
            vectorizer = TfidfVectorizer(
                max_features=1000,  # Limit features for efficiency
                ngram_range=(1, 2)  # Use unigrams and bigrams
            )
            tfidf_matrix = vectorizer.fit_transform(valid_sentences)
            
            # Calculate cosine similarity
            similarity_matrix = cosine_similarity(tfidf_matrix)
            
            # Set diagonal to 0 (sentence shouldn't be similar to itself)
            np.fill_diagonal(similarity_matrix, 0)
            
            return similarity_matrix
            
        except ValueError as e:
            logger.error(f"Error building similarity matrix: {e}")
            return np.zeros((n_sentences, n_sentences))
    
    def calculate_pagerank(self, similarity_matrix: np.ndarray) -> Dict[int, float]:
        """
        Apply PageRank algorithm to rank sentences
        
        PageRank Formula:
        WS(Vi) = (1-d) + d × Σ(wji / Σwjk) × WS(Vj)
        
        Where:
        - WS(Vi) = Score of sentence i
        - d = damping factor
        - wji = weight of edge from sentence j to i
        
        Args:
            similarity_matrix: Sentence similarity matrix
            
        Returns:
            Dictionary mapping sentence index to score
        """
        # Create graph from similarity matrix
        nx_graph = nx.from_numpy_array(similarity_matrix)
        
        try:
            # Calculate PageRank scores
            scores = nx.pagerank(
                nx_graph,
                alpha=self.damping,  # damping factor
                max_iter=self.max_iter,
                tol=self.tol
            )
            return scores
            
        except Exception as e:
            logger.error(f"PageRank calculation failed: {e}")
            # Return uniform scores as fallback
            n_nodes = similarity_matrix.shape[0]
            return {i: 1.0/n_nodes for i in range(n_nodes)}
    
    def summarize(self, 
                  text: str, 
                  num_sentences: Optional[int] = None,
                  return_scores: bool = False) -> str:
        """
        Generate extractive summary using TextRank
        
        Args:
            text: Input text to summarize
            num_sentences: Number of sentences in summary (overrides ratio)
            return_scores: If True, return tuple of (summary, scores)
            
        Returns:
            Summary string, or tuple of (summary, scores) if return_scores=True
        """
        # Validate input
        self.validate_input(text)
        
        # Preprocess
        original_sentences, cleaned_sentences = self.preprocess(text)
        
        # Edge cases
        if len(original_sentences) == 0:
            return "" if not return_scores else ("", {})
        if len(original_sentences) == 1:
            summary = original_sentences[0]
            return summary if not return_scores else (summary, {0: 1.0})
        
        # Build similarity matrix
        similarity_matrix = self.build_similarity_matrix(cleaned_sentences)
        
        # Calculate sentence scores using PageRank
        scores = self.calculate_pagerank(similarity_matrix)
        
        # Determine number of sentences for summary
        if num_sentences is None:
            num_sentences = max(1, int(len(original_sentences) * self.summary_ratio))
        num_sentences = min(num_sentences, len(original_sentences))
        
        # Rank sentences by score
        ranked_sentences = sorted(
            ((scores[i], i, s) for i, s in enumerate(original_sentences)),
            reverse=True
        )
        
        # Select top sentences and maintain original order
        top_sentences = sorted(
            ranked_sentences[:num_sentences],
            key=lambda x: x[1]  # Sort by original position
        )
        
        # Build summary
        summary = ' '.join([sent for _, _, sent in top_sentences])
        
        if return_scores:
            return summary, {
                'sentence_scores': scores,
                'selected_indices': [idx for _, idx, _ in top_sentences],
                'num_sentences_original': len(original_sentences),
                'num_sentences_summary': num_sentences
            }
        
        return summary
    
    def get_sentence_importance(self, text: str) -> List[Tuple[str, float]]:
        """
        Get all sentences with their importance scores
        
        Args:
            text: Input text
            
        Returns:
            List of (sentence, score) tuples sorted by importance
        """
        original_sentences, cleaned_sentences = self.preprocess(text)
        
        if len(original_sentences) < 2:
            return [(s, 1.0) for s in original_sentences]
        
        similarity_matrix = self.build_similarity_matrix(cleaned_sentences)
        scores = self.calculate_pagerank(similarity_matrix)
        
        # Combine sentences with scores
        sentence_importance = [
            (original_sentences[i], scores[i])
            for i in range(len(original_sentences))
        ]
        
        # Sort by importance
        sentence_importance.sort(key=lambda x: x[1], reverse=True)
        
        return sentence_importance
    
    def get_model_info(self) -> Dict:
        """Return detailed model information"""
        info = super().get_model_info()
        info.update({
            'algorithm': 'Graph-based PageRank',
            'parameters': {
                'damping_factor': self.damping,
                'max_iterations': self.max_iter,
                'tolerance': self.tol,
                'summary_ratio': self.summary_ratio,
                'min_sentence_length': self.min_sentence_length
            },
            'complexity': 'O(V²) where V = number of sentences',
            'advantages': [
                'Fast and efficient',
                'No training required',
                'Language-agnostic',
                'Interpretable results'
            ],
            'limitations': [
                'Cannot generate new sentences',
                'Limited semantic understanding',
                'May miss context'
            ]
        })
        return info


# Test the implementation
if __name__ == "__main__":
    # Sample academic text
    sample_text = """
    Artificial intelligence has become one of the most transformative technologies 
    of the 21st century. Machine learning, a subset of AI, enables computers to 
    learn from data without explicit programming. Deep learning uses neural networks 
    with multiple layers to process complex patterns. Natural language processing 
    allows machines to understand and generate human language. Computer vision enables 
    machines to interpret visual information from the world. AI applications span 
    healthcare, finance, education, transportation, and entertainment. Ethical 
    considerations around AI include privacy, bias, and job displacement. The future 
    of AI promises both unprecedented opportunities and significant challenges that 
    society must navigate carefully.
    """
    
    # Initialize summarizer
    summarizer = TextRankSummarizer(summary_ratio=0.3)
    
    print("=" * 70)
    print("TEXTRANK SUMMARIZER - PROFESSIONAL TEST")
    print("=" * 70)
    
    # Generate summary with metrics
    result = summarizer.summarize_with_metrics(sample_text)
    
    print(f"\nModel: {result['metadata']['model_name']}")
    print(f"Type: {result['metadata']['model_type']}")
    print(f"Input Length: {result['metadata']['input_length']} words")
    print(f"Summary Length: {result['metadata']['summary_length']} words")
    print(f"Compression Ratio: {result['metadata']['compression_ratio']:.2%}")
    print(f"Processing Time: {result['metadata']['processing_time']:.4f} seconds")
    
    print(f"\n{'Summary:':-^70}")
    print(result['summary'])
    
    print(f"\n{'Sentence Importance Ranking:':-^70}")
    importance = summarizer.get_sentence_importance(sample_text)
    for i, (sent, score) in enumerate(importance[:5], 1):
        print(f"{i}. [Score: {score:.4f}] {sent[:80]}...")
    
    print("\n" + "=" * 70)
    print(summarizer.get_model_info())