"""
VicAI Model Architecture
A 5B parameter decoder-only transformer language model.
"""

import math
from typing import Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F


class RMSNorm(nn.Module):
    """Root Mean Square Layer Normalization."""
    
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))
    
    def forward(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) * self.weight


class RotaryPositionalEmbedding(nn.Module):
    """Rotary Position Embedding (RoPE)."""
    
    def __init__(self, dim: int, max_seq_len: int = 8192, base: float = 10000.0):
        super().__init__()
        self.dim = dim
        self.max_seq_len = max_seq_len
        self.base = base
        
        inv_freq = 1.0 / (self.base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)
        
        t = torch.arange(max_seq_len)
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos()[None, None, :, :])
        self.register_buffer("sin_cached", emb.sin()[None, None, :, :])
    
    def rotate_half(self, x):
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat((-x2, x1), dim=-1)
    
    def apply_rotary_pos_emb(self, q, k, cos, sin):
        q_embed = (q * cos) + (self.rotate_half(q) * sin)
        k_embed = (k * cos) + (self.rotate_half(k) * sin)
        return q_embed, k_embed
    
    def forward(self, q, k, seq_len: int):
        cos = self.cos_cached[:, :, :seq_len, :]
        sin = self.sin_cached[:, :, :seq_len, :]
        return self.apply_rotary_pos_emb(q, k, cos, sin)


class GroupedQueryAttention(nn.Module):
    """Grouped Query Attention (GQA) for efficient inference."""
    
    def __init__(
        self,
        dim: int,
        n_heads: int,
        n_kv_heads: int,
        dropout: float = 0.0,
    ):
        super().__init__()
        self.dim = dim
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.head_dim = dim // n_heads
        self.n_rep = n_heads // n_kv_heads
        
        self.wq = nn.Linear(dim, n_heads * self.head_dim, bias=False)
        self.wk = nn.Linear(dim, n_kv_heads * self.head_dim, bias=False)
        self.wv = nn.Linear(dim, n_kv_heads * self.head_dim, bias=False)
        self.wo = nn.Linear(n_heads * self.head_dim, dim, bias=False)
        
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        
        self.rope = RotaryPositionalEmbedding(self.head_dim)
        
    def forward(
        self,
        x: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ):
        bsz, seq_len, _ = x.shape
        
        q = self.wq(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        k = self.wk(x).view(bsz, seq_len, self.n_kv_heads, self.head_dim).transpose(1, 2)
        v = self.wv(x).view(bsz, seq_len, self.n_kv_heads, self.head_dim).transpose(1, 2)
        
        q, k = self.rope(q, k, seq_len)
        
        if past_key_value is not None:
            past_k, past_v = past_key_value
            k = torch.cat([past_k, k], dim=2)
            v = torch.cat([past_v, v], dim=2)
        
        past_key_value = (k, v)
        
        # Repeat k/v for grouped query attention
        k = k.repeat_interleave(self.n_rep, dim=1)
        v = v.repeat_interleave(self.n_rep, dim=1)
        
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        
        if mask is not None:
            scores = scores + mask
        
        attn = F.softmax(scores, dim=-1)
        attn = self.attn_dropout(attn)
        
        out = torch.matmul(attn, v)
        out = out.transpose(1, 2).contiguous().view(bsz, seq_len, self.dim)
        out = self.wo(out)
        out = self.resid_dropout(out)
        
        return out, past_key_value


class FeedForward(nn.Module):
    """SwiGLU Feed-Forward Network."""
    
    def __init__(self, dim: int, hidden_dim: int, dropout: float = 0.0):
        super().__init__()
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        return self.w2(F.silu(self.w1(x)) * self.w3(x))


class TransformerBlock(nn.Module):
    """Single transformer block with pre-normalization."""
    
    def __init__(
        self,
        dim: int,
        n_heads: int,
        n_kv_heads: int,
        hidden_dim: int,
        dropout: float = 0.0,
    ):
        super().__init__()
        self.attention_norm = RMSNorm(dim)
        self.attention = GroupedQueryAttention(dim, n_heads, n_kv_heads, dropout)
        self.ffn_norm = RMSNorm(dim)
        self.feed_forward = FeedForward(dim, hidden_dim, dropout)
    
    def forward(
        self,
        x: torch.Tensor,
        mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    ):
        # Attention with residual
        attn_out, past_key_value = self.attention(
            self.attention_norm(x), mask, past_key_value
        )
        x = x + attn_out
        
        # FFN with residual
        x = x + self.feed_forward(self.ffn_norm(x))
        
        return x, past_key_value


class VicAIConfig:
    """Configuration for VicAI model."""
    
    def __init__(
        self,
        vocab_size: int = 32000,
        dim: int = 4096,
        n_layers: int = 32,
        n_heads: int = 32,
        n_kv_heads: int = 8,
        hidden_dim: int = 14336,
        max_seq_len: int = 8192,
        dropout: float = 0.0,
        tie_weights: bool = False,
    ):
        self.vocab_size = vocab_size
        self.dim = dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.hidden_dim = hidden_dim
        self.max_seq_len = max_seq_len
        self.dropout = dropout
        self.tie_weights = tie_weights
    
    @property
    def num_parameters(self):
        """Calculate approximate parameter count."""
        # Embedding
        params = self.vocab_size * self.dim
        # Attention per layer
        attn_params = 4 * self.dim * self.dim  # q, k, v, o projections
        # FFN per layer
        ffn_params = 3 * self.dim * self.hidden_dim  # w1, w2, w3
        # Layers
        params += self.n_layers * (attn_params + ffn_params)
        # Output
        params += self.vocab_size * self.dim
        return params


class VicAIModel(nn.Module):
    """
    VicAI: A 5B parameter decoder-only transformer language model.
    
    Architecture details:
    - 32 layers
    - 4096 model dimension
    - 32 attention heads (8 key-value heads for GQA)
    - SwiGLU FFN with 14336 hidden dimension
    - RoPE positional embeddings
    - RMSNorm pre-normalization
    - ~5.1B total parameters
    """
    
    def __init__(self, config: VicAIConfig):
        super().__init__()
        self.config = config
        
        self.token_embedding = nn.Embedding(config.vocab_size, config.dim)
        self.dropout = nn.Dropout(config.dropout)
        
        self.layers = nn.ModuleList([
            TransformerBlock(
                config.dim,
                config.n_heads,
                config.n_kv_heads,
                config.hidden_dim,
                config.dropout,
            )
            for _ in range(config.n_layers)
        ])
        
        self.norm = RMSNorm(config.dim)
        self.lm_head = nn.Linear(config.dim, config.vocab_size, bias=False)
        
        if config.tie_weights:
            self.lm_head.weight = self.token_embedding.weight
        
        self.apply(self._init_weights)
        
        # Print model info
        total_params = self.get_num_params()
        print(f"VicAI Model initialized with {total_params / 1e9:.2f}B parameters")
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    
    def get_num_params(self, non_embedding=True):
        n_params = sum(p.numel() for p in self.parameters())
        if non_embedding:
            n_params -= self.token_embedding.weight.numel()
        return n_params
    
    def forward(
        self,
        input_ids: torch.Tensor,
        targets: Optional[torch.Tensor] = None,
        past_key_values: Optional[list] = None,
    ):
        bsz, seq_len = input_ids.shape
        
        # Create causal mask
        mask = torch.triu(
            torch.ones(seq_len, seq_len, device=input_ids.device),
            diagonal=1
        ).bool()
        mask = mask.unsqueeze(0).unsqueeze(0)
        mask = mask.to(input_ids.device)
        mask = torch.where(mask, float('-inf'), 0.0)
        
        x = self.token_embedding(input_ids)
        x = self.dropout(x)
        
        new_key_values = []
        for i, layer in enumerate(self.layers):
            past_kv = past_key_values[i] if past_key_values is not None else None
            x, kv = layer(x, mask, past_kv)
            new_key_values.append(kv)
        
        x = self.norm(x)
        logits = self.lm_head(x)
        
        loss = None
        if targets is not None:
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)),
                targets.view(-1),
                ignore_index=-100
            )
        
        return {
            'logits': logits,
            'loss': loss,
            'past_key_values': new_key_values,
        }
    
    @torch.no_grad()
    def generate(
        self,
        input_ids: torch.Tensor,
        max_new_tokens: int = 100,
        temperature: float = 1.0,
        top_k: int = 50,
        top_p: float = 0.9,
        repetition_penalty: float = 1.0,
        eos_token_id: Optional[int] = None,
    ):
        """Generate text autoregressively."""
        self.eval()
        
        batch_size = input_ids.shape[0]
        device = input_ids.device
        past_key_values = None
        
        for _ in range(max_new_tokens):
            outputs = self(input_ids, past_key_values=past_key_values)
            logits = outputs['logits']
            past_key_values = outputs['past_key_values']
            
            # Get logits for last token
            logits = logits[:, -1, :] / temperature
            
            # Apply repetition penalty
            if repetition_penalty != 1.0:
                for i in range(batch_size):
                    for token_id in set(input_ids[i].tolist()):
                        logits[i, token_id] /= repetition_penalty
            
            # Top-k filtering
            if top_k > 0:
                indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
                logits[indices_to_remove] = float('-inf')
            
            # Top-p (nucleus) filtering
            if top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
                sorted_indices_to_remove[..., 0] = 0
                indices_to_remove = sorted_indices_to_remove.scatter(
                    1, sorted_indices, sorted_indices_to_remove
                )
                logits[indices_to_remove] = float('-inf')
            
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            
            input_ids = torch.cat([input_ids, next_token], dim=1)
            
            # Early stopping if EOS token generated
            if eos_token_id is not None and (next_token == eos_token_id).all():
                break
        
        return input_ids


def create_vicai_5b(vocab_size: int = 32000) -> VicAIModel:
    """Create a 5B parameter VicAI model."""
    config = VicAIConfig(
        vocab_size=vocab_size,
        dim=4096,
        n_layers=32,
        n_heads=32,
        n_kv_heads=8,
        hidden_dim=14336,
        max_seq_len=8192,
        dropout=0.0,
    )
    return VicAIModel(config)


if __name__ == "__main__":
    # Test model creation
    model = create_vicai_5b()
    print(f"Total parameters: {model.get_num_params() / 1e9:.2f}B")
    
    # Test forward pass
    x = torch.randint(0, 32000, (2, 128))
    outputs = model(x)
    print(f"Output shape: {outputs['logits'].shape}")
    print(f"Loss: {outputs['loss']}")