"""
LSTM Model for Sentiment Analysis
=================================
Basic LSTM model for 3-class sentiment classification.
This serves as a baseline model for comparison.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F


class LSTMModel(nn.Module):
    """
    Basic LSTM model for sentiment classification.

    Architecture:
        Embedding -> Dropout -> LSTM -> Dropout -> Fully Connected

    ``forward`` returns raw logits; softmax is applied only by
    ``predict_proba``.

    Args:
        vocab_size: Size of vocabulary
        embedding_dim: Dimension of word embeddings
        hidden_dim: Dimension of LSTM hidden state
        output_dim: Number of output classes (3 for sentiment)
        n_layers: Number of LSTM layers
        dropout: Dropout probability
        bidirectional: Whether to use bidirectional LSTM
        pad_idx: Index of padding token
    """

    def __init__(
        self,
        vocab_size,
        embedding_dim=128,
        hidden_dim=256,
        output_dim=3,
        n_layers=2,
        dropout=0.5,
        bidirectional=False,
        pad_idx=0
    ):
        super().__init__()

        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=pad_idx
        )

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            # Inter-layer dropout is only passed when there is more than
            # one layer; a single-layer LSTM gets dropout=0.
            dropout=dropout if n_layers > 1 else 0,
            bidirectional=bidirectional
        )

        # A bidirectional LSTM yields one final state per direction, and
        # forward() concatenates them, doubling the classifier input width.
        lstm_output_dim = hidden_dim * 2 if bidirectional else hidden_dim

        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(lstm_output_dim, output_dim)

        # Store config for saving/loading
        self.config = {
            'vocab_size': vocab_size,
            'embedding_dim': embedding_dim,
            'hidden_dim': hidden_dim,
            'output_dim': output_dim,
            'n_layers': n_layers,
            'dropout': dropout,
            'bidirectional': bidirectional,
            'pad_idx': pad_idx
        }

    def forward(self, input_ids, lengths=None):
        """
        Forward pass.

        Args:
            input_ids: Tensor of shape (batch_size, seq_length)
            lengths: Actual sequence lengths, one per batch element, as a
                tensor or a list of ints (optional). Each length should be
                at least 1, since pack_padded_sequence rejects empty
                sequences. When omitted, the LSTM runs over every position
                of ``input_ids``, padding included.

        Returns:
            logits: Tensor of shape (batch_size, output_dim)
        """
        # (batch_size, seq_length) -> (batch_size, seq_length, embedding_dim)
        embedded = self.dropout(self.embedding(input_ids))

        if lengths is not None:
            # pack_padded_sequence needs the lengths as a CPU int64 tensor;
            # as_tensor also lets callers pass a plain list.
            cpu_lengths = torch.as_tensor(
                lengths, dtype=torch.int64, device="cpu"
            )
            lstm_input = nn.utils.rnn.pack_padded_sequence(
                embedded,
                cpu_lengths,
                batch_first=True,
                enforce_sorted=False
            )
        else:
            lstm_input = embedded

        # Only the final hidden state feeds the classifier, so the
        # per-timestep outputs (and unpacking them) are not needed.
        _, (hidden, _) = self.lstm(lstm_input)

        # hidden: (n_layers * num_directions, batch_size, hidden_dim)
        if self.lstm.bidirectional:
            # The last two entries are the top layer's forward and
            # backward final states; concatenate them feature-wise.
            final_state = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            final_state = hidden[-1]

        return self.fc(self.dropout(final_state))

    def _inference_logits(self, input_ids, lengths=None):
        """Run forward with dropout disabled and no gradient tracking.

        The module is put in eval mode for the call and its previous
        training/eval mode is restored afterwards, even on error.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                return self(input_ids, lengths)
        finally:
            self.train(was_training)

    def predict(self, input_ids, lengths=None):
        """Get predicted class indices, shape (batch_size,).

        Dropout is disabled for the call regardless of the module's
        current training mode, so results are deterministic.
        """
        logits = self._inference_logits(input_ids, lengths)
        return torch.argmax(logits, dim=1)

    def predict_proba(self, input_ids, lengths=None):
        """Get class probabilities, shape (batch_size, output_dim).

        Dropout is disabled for the call regardless of the module's
        current training mode, so results are deterministic.
        """
        logits = self._inference_logits(input_ids, lengths)
        return F.softmax(logits, dim=1)


class BiLSTMModel(LSTMModel):
    """
    Bidirectional variant of LSTMModel.

    Accepts the same constructor arguments as the parent except
    ``bidirectional``, which is always passed as True.
    """

    def __init__(
        self,
        vocab_size,
        embedding_dim=128,
        hidden_dim=256,
        output_dim=3,
        n_layers=2,
        dropout=0.5,
        pad_idx=0
    ):
        # Collect the pass-through arguments once, then pin the direction flag.
        parent_kwargs = {
            'vocab_size': vocab_size,
            'embedding_dim': embedding_dim,
            'hidden_dim': hidden_dim,
            'output_dim': output_dim,
            'n_layers': n_layers,
            'dropout': dropout,
            'pad_idx': pad_idx,
        }
        super().__init__(bidirectional=True, **parent_kwargs)


# ==================== TESTING ====================

if __name__ == "__main__":
    # Smoke test: push one random batch through each model and report shapes.
    vocab_size = 10000
    batch_size, seq_length = 32, 50

    # Random token ids, plus per-sequence lengths drawn from [10, seq_length)
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_length))
    lengths = torch.randint(10, seq_length, (batch_size,))

    # The BiLSTM banner carries a leading newline to separate the two reports.
    for banner, model_cls in (
        ("Testing LSTM Model:", LSTMModel),
        ("\nTesting BiLSTM Model:", BiLSTMModel),
    ):
        print(banner)
        net = model_cls(vocab_size=vocab_size)
        logits = net(input_ids, lengths)
        n_params = sum(p.numel() for p in net.parameters())
        print(f"  Input shape: {input_ids.shape}")
        print(f"  Output shape: {logits.shape}")
        print(f"  Parameters: {n_params:,}")