"""HuggingFace compatible Namer model."""

from __future__ import annotations

import math
from typing import Optional, Union

import torch
import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig
from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
from transformers.generation import GenerationMixin


class NamerConfig(PretrainedConfig):
    """Configuration class for NamerModel."""
    
    model_type = "custom"
    
    def __init__(
        self,
        vocab_size: int = 41,
        max_output_len: int = 20,
        d_model: int = 128,
        nhead: int = 4,
        num_encoder_layers: int = 4,
        dim_feedforward: int = 512,
        dropout: float = 0.1,
        pad_token_id: int = 10,
        eos_token_id: int = 40,  # <EOS> token index
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_output_len = max_output_len
        self.d_model = d_model
        self.nhead = nhead
        self.num_encoder_layers = num_encoder_layers
        self.dim_feedforward = dim_feedforward
        self.dropout = dropout
        
        super().__init__(
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )


class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding for transformer."""

    def __init__(self, d_model: int, max_len: int = 5000) -> None:
        super().__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float()
            * (-math.log(10000.0) / d_model)
        )

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Add positional encoding to input."""
        return x + self.pe[: x.size(1)]


class NamerModel(PreTrainedModel, GenerationMixin):
    """HuggingFace compatible Namer transformer model.
    
    Converts integer digit sequences to English number names.
    """
    
    config_class = NamerConfig
    base_model_prefix = "namer"
    
    def __init__(self, config: NamerConfig):
        super().__init__(config)
        
        self.vocab_size = config.vocab_size
        self.max_output_len = config.max_output_len
        self.d_model = config.d_model
        
        # Digit embedding (10 digits + 1 padding token = 11)
        self.digit_embedding = nn.Embedding(11, config.d_model, padding_idx=config.pad_token_id)
        
        # Positional encoding
        self.pos_encoder = PositionalEncoding(config.d_model, max_len=100)
        
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config.d_model,
            nhead=config.nhead,
            dim_feedforward=config.dim_feedforward,
            dropout=config.dropout,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=config.num_encoder_layers
        )
        
        # Output projection
        self.output_projection = nn.Linear(config.d_model, config.vocab_size)
        
        # Learned queries for each output position
        self.output_queries = nn.Parameter(torch.randn(config.max_output_len, config.d_model))
        
        # Cross-attention from output positions to encoded input
        self.cross_attention = nn.MultiheadAttention(
            config.d_model, config.nhead, dropout=config.dropout, batch_first=True
        )
        
        # Final output layers
        self.output_norm = nn.LayerNorm(config.d_model)
        
        self.post_init()
    
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> CausalLMOutputWithCrossAttentions:
        """Forward pass for HF compatibility.
        
        Args:
            input_ids: (batch_size, seq_len) tensor of digit indices (0-9), padding=10
            attention_mask: Optional mask for padding
            labels: Optional target labels for training
            
        Returns:
            CausalLMOutputWithCrossAttentions with logits
        """
        if input_ids is None:
            raise ValueError("input_ids must be provided")
        
        batch_size, seq_len = input_ids.shape
        
        # Handle padding: convert -1 padding to 10 (our padding index)
        digits = input_ids.clone()
        digits[digits == -1] = self.config.pad_token_id
        
        # Create padding mask for transformer (True = padding)
        if attention_mask is None:
            src_key_padding_mask = digits == self.config.pad_token_id
        else:
            src_key_padding_mask = ~attention_mask.bool()
        
        # Embed digits: (batch, seq_len, d_model)
        embedded = self.digit_embedding(digits)
        
        # Add positional encoding
        embedded = self.pos_encoder(embedded)
        
        # Transformer encoder: (batch, seq_len, d_model)
        memory = self.transformer_encoder(
            embedded, src_key_padding_mask=src_key_padding_mask
        )
        
        # Expand queries for batch: (batch, max_output_len, d_model)
        queries = self.output_queries.unsqueeze(0).expand(batch_size, -1, -1)
        
        # Cross-attention from queries to encoded input
        attn_output, _ = self.cross_attention(
            queries, memory, memory, key_padding_mask=src_key_padding_mask
        )
        
        # Normalize and project to vocab
        output = self.output_norm(attn_output)
        logits = self.output_projection(output)
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(logits.view(-1, self.vocab_size), labels.view(-1))
        
        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None,
            cross_attentions=None,
        )
    
    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        """Prepare inputs for text generation."""
        return {"input_ids": input_ids}
    
    def _reorder_cache(self, past_key_values, beam_idx):
        """Reorder cache for beam search."""
        return past_key_values


class NamerPipeline:
    """Simple pipeline for Namer model inference.
    
    Usage:
        from transformers import AutoModel
        
        # Load model
        model = AutoModel.from_pretrained(
            "edwinhere/namer", 
            trust_remote_code=True
        )
        
        # Create pipeline
        pipe = NamerPipeline(model)
        
        # Generate
        result = pipe.generate(42)  # "forty two"
        result = pipe(42)  # {"generated_text": "forty two"}
    """
    
    def __init__(self, model: NamerModel, tokenizer=None, device: str = None):
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = model.to(device)
        self.model.eval()
        self.device = device
        self.tokenizer = tokenizer  # Placeholder if we add a tokenizer later
        
        # Vocabulary mapping (index -> word)
        # Must match utils.py vocabulary exactly
        self.id2word = {
            0: "zero", 1: "one", 2: "two", 3: "three", 4: "four",
            5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine",
            10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen",
            15: "fifteen", 16: "sixteen", 17: "seventeen", 18: "eighteen", 19: "nineteen",
            20: "twenty", 21: "thirty", 22: "forty", 23: "fifty",
            24: "sixty", 25: "seventy", 26: "eighty", 27: "ninety",
            28: "hundred",
            29: "thousand", 30: "million", 31: "billion", 32: "trillion",
            33: "quadrillion", 34: "quintillion", 35: "sextillion",
            36: "septillion", 37: "octillion", 38: "nonillion", 39: "decillion",
            40: "<EOS>"
        }
        
        # Reverse mapping
        self.word2id = {v: k for k, v in self.id2word.items()}
    
    def _int_to_digits(self, n: int) -> list[int]:
        """Convert integer to list of digit indices."""
        if n == 0:
            return [0]
        digits = []
        while n > 0:
            digits.append(n % 10)
            n //= 10
        return digits[::-1]  # Reverse to get most significant digit first
    
    def _decode(self, token_ids: list[int]) -> str:
        """Decode token IDs to text, stopping at first EOS."""
        words = []
        eos_idx = self.model.config.eos_token_id  # Should be 40
        
        for idx in token_ids:
            if idx == eos_idx:  # Stop at EOS
                break
            if idx in self.id2word:
                word = self.id2word[idx]
                if word != "<EOS>":  # Skip EOS token itself
                    words.append(word)
        
        return " ".join(words) if words else "zero"
    
    def generate(self, text: Union[str, int], **kwargs) -> str:
        """Generate English name for a number.
        
        Args:
            text: Integer or string representation of integer
            
        Returns:
            English name of the number
        """
        # Parse input
        if isinstance(text, str):
            n = int(text.strip())
        else:
            n = int(text)
        
        # Convert to digits
        digits = self._int_to_digits(n)
        
        # Pad to max length (20)
        while len(digits) < 20:
            digits.append(10)  # padding token
        
        # Create tensor
        input_ids = torch.tensor([digits], dtype=torch.long).to(self.device)
        
        # Forward pass
        with torch.no_grad():
            outputs = self.model(input_ids)
            logits = outputs.logits
            predictions = logits.argmax(dim=-1)[0].cpu().tolist()
        
        # Decode
        return self._decode(predictions)
    
    def __call__(self, text: Union[str, int], **kwargs) -> dict:
        """Callable interface for pipeline.
        
        Returns dict with 'generated_text' key for HF pipeline compatibility.
        """
        result = self.generate(text, **kwargs)
        return {"generated_text": result}


def load_namer_pipeline(model_name_or_path: str = "edwinhere/namer", device: str = None, **kwargs):
    """Load a Namer pipeline with model.
    
    This is a convenience function that loads both the model and creates
    a pipeline for easy inference.
    
    Args:
        model_name_or_path: HuggingFace model ID or local path
        device: Device to run on ('cuda', 'cpu', or None for auto)
        **kwargs: Additional args passed to from_pretrained
        
    Returns:
        NamerPipeline instance ready for inference
        
    Example:
        >>> pipe = load_namer_pipeline("edwinhere/namer")
        >>> pipe.generate(42)
        'forty two'
        >>> pipe(123)
        {'generated_text': 'one hundred twenty three'}
    """
    from transformers import AutoModel
    
    model = AutoModel.from_pretrained(
        model_name_or_path,
        trust_remote_code=True,
        **kwargs
    )
    
    return NamerPipeline(model, device=device)