"""
Latent Attention Implementation for nanoKimi

This module implements the Latent Attention mechanism used in Kimi-K2,
which compresses attention representations to reduce memory footprint
while maintaining performance on long sequences.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class LatentAttention(nn.Module):
    """
    Latent Attention mechanism that compresses attention representations
    
    The key idea is to project keys and values into a lower-dimensional
    latent space, reducing memory usage while preserving attention quality.
    
    Args:
        n_embd: embedding dimension
        n_head: number of attention heads
        latent_dim: dimension of the latent space
        dropout: dropout probability
        bias: whether to use bias in linear layers
    """
    
    def __init__(self, n_embd, n_head, latent_dim=64, dropout=0.0, bias=True):
        super().__init__()
        assert n_embd % n_head == 0
        
        self.n_embd = n_embd
        self.n_head = n_head
        self.latent_dim = latent_dim
        self.head_dim = n_embd // n_head
        
        # Query projection (full dimension)
        self.q_proj = nn.Linear(n_embd, n_embd, bias=bias)
        
        # Key and Value projections to latent space
        self.k_proj = nn.Linear(n_embd, n_head * latent_dim, bias=bias)
        self.v_proj = nn.Linear(n_embd, n_head * latent_dim, bias=bias)
        
        # Output projection
        self.o_proj = nn.Linear(n_head * latent_dim, n_embd, bias=bias)
        
        # Dropout
        self.dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        
        # Scale factor for attention
        self.scale = 1.0 / math.sqrt(latent_dim)
        
    def forward(self, x, mask=None):
        B, T, C = x.size()  # batch, sequence length, embedding dim
        
        # Project to query, key, value
        q = self.q_proj(x)  # (B, T, n_embd)
        k = self.k_proj(x)  # (B, T, n_head * latent_dim)
        v = self.v_proj(x)  # (B, T, n_head * latent_dim)
        
        # Reshape for multi-head attention
        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)  # (B, n_head, T, head_dim)
        k = k.view(B, T, self.n_head, self.latent_dim).transpose(1, 2)  # (B, n_head, T, latent_dim)
        v = v.view(B, T, self.n_head, self.latent_dim).transpose(1, 2)  # (B, n_head, T, latent_dim)
        
        # Compress queries to latent dimension for attention computation
        # We use a learnable compression matrix
        if not hasattr(self, 'q_compress'):
            self.q_compress = nn.Linear(self.head_dim, self.latent_dim, bias=False).to(x.device)
        
        q_compressed = self.q_compress(q)  # (B, n_head, T, latent_dim)
        
        # Compute attention scores in latent space
        att = torch.matmul(q_compressed, k.transpose(-2, -1)) * self.scale  # (B, n_head, T, T)
        
        # Apply causal mask
        if mask is not None:
            att = att.masked_fill(mask == 0, float('-inf'))
        else:
            # Create causal mask
            causal_mask = torch.tril(torch.ones(T, T, device=x.device)).view(1, 1, T, T)
            att = att.masked_fill(causal_mask == 0, float('-inf'))
        
        # Apply softmax
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)
        
        # Apply attention to values
        y = torch.matmul(att, v)  # (B, n_head, T, latent_dim)
        
        # Reshape and project back
        y = y.transpose(1, 2).contiguous().view(B, T, self.n_head * self.latent_dim)
        y = self.o_proj(y)
        y = self.resid_dropout(y)
        
        return y


class MultiHeadAttention(nn.Module):
    """
    Standard multi-head attention for comparison
    """
    
    def __init__(self, n_embd, n_head, dropout=0.0, bias=True):
        super().__init__()
        assert n_embd % n_head == 0
        
        self.n_embd = n_embd
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        
        # QKV projection
        self.qkv_proj = nn.Linear(n_embd, 3 * n_embd, bias=bias)
        
        # Output projection
        self.o_proj = nn.Linear(n_embd, n_embd, bias=bias)
        
        # Dropout
        self.dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        
        # Scale factor
        self.scale = 1.0 / math.sqrt(self.head_dim)
        
    def forward(self, x, mask=None):
        B, T, C = x.size()
        
        # Compute QKV
        qkv = self.qkv_proj(x)
        q, k, v = qkv.chunk(3, dim=-1)
        
        # Reshape for multi-head attention
        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        
        # Compute attention
        att = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        
        # Apply causal mask
        if mask is not None:
            att = att.masked_fill(mask == 0, float('-inf'))
        else:
            causal_mask = torch.tril(torch.ones(T, T, device=x.device)).view(1, 1, T, T)
            att = att.masked_fill(causal_mask == 0, float('-inf'))
        
        att = F.softmax(att, dim=-1)
        att = self.dropout(att)
        
        # Apply attention to values
        y = torch.matmul(att, v)
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        y = self.o_proj(y)
        y = self.resid_dropout(y)
        
        return y