"""
Mamba-2 SSD — OPTIMIZED: intra-chunk parallelism via matrix multiply.

The key Mamba-2 insight (State Space Duality):
Within each chunk of size T, the SSM can be computed as a MATRIX MULTIPLY:

    Y_chunk = (L ⊙ (C B^T)) @ (Δ ⊙ X)

Where L is a lower-triangular mask with cumulative A products.
This replaces the T sequential steps with a single matmul of size T×T.

For L=256, T=16, num_chunks=16:
- Within chunk: parallel matmul (T×T = 16×16) 
- Across chunks: 16 sequential state carries (unavoidable, but trivial)

Total: 16 sequential state carries + 16 parallel matmuls = FAST.

NO in-place ops. Fully autograd safe. Works on CPU and GPU.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class Mamba2SSD(nn.Module):
    """
    Mamba-2 SSD with intra-chunk matrix-multiply parallelism.
    
    Args:
        dim: Input/output dimension
        d_state: SSM state dimension (default 16)
        d_conv: Conv1d kernel size (default 4)
        expand: Inner dimension expansion (default 2)
        chunk_size: Chunk size for scan (default 64 — larger = more parallel)
    """
    
    def __init__(self, dim, d_state=16, d_conv=4, expand=2, chunk_size=64):
        super().__init__()
        self.dim = dim
        self.d_state = d_state
        self.chunk_size = chunk_size
        self.inner_dim = dim * expand
        
        # Input projection: x and gate
        self.in_proj = nn.Linear(dim, self.inner_dim * 2, bias=False)
        
        # Short causal conv for local context
        self.conv1d = nn.Conv1d(
            self.inner_dim, self.inner_dim,
            kernel_size=d_conv, padding=d_conv - 1,
            groups=self.inner_dim, bias=True
        )
        
        # SSM parameter projections
        self.dt_proj = nn.Linear(self.inner_dim, self.inner_dim, bias=True)
        self.B_proj = nn.Linear(self.inner_dim, d_state, bias=False)
        self.C_proj = nn.Linear(self.inner_dim, d_state, bias=False)
        
        # A: fixed decay rates (log-space, negative for stability)
        A = torch.arange(1, d_state + 1, dtype=torch.float32)
        self.A_log = nn.Parameter(torch.log(A))
        
        # D: residual skip
        self.D = nn.Parameter(torch.ones(self.inner_dim))
        
        # Output
        self.norm = nn.LayerNorm(self.inner_dim)
        self.out_proj = nn.Linear(self.inner_dim, dim, bias=False)
        
        self._init_weights()
    
    def _init_weights(self):
        nn.init.constant_(self.dt_proj.bias, -4.0)  # softplus(-4) ≈ 0.018
        nn.init.xavier_uniform_(self.in_proj.weight, gain=0.1)
        nn.init.xavier_uniform_(self.out_proj.weight, gain=0.1)
    
    def forward(self, x):
        """x: [B, L, dim] → [B, L, dim]"""
        return self._process(x)
    
    def _process(self, x):
        B, L, D = x.shape
        
        # Input projection
        xz = self.in_proj(x)
        x_inner, z = xz.chunk(2, dim=-1)
        
        # Causal conv
        x_conv = self.conv1d(x_inner.transpose(1, 2))[:, :, :L].transpose(1, 2)
        x_conv = F.silu(x_conv)
        
        # SSM params
        dt = F.softplus(self.dt_proj(x_conv))  # [B, L, inner_dim], positive
        B_mat = self.B_proj(x_conv)             # [B, L, d_state]
        C_mat = self.C_proj(x_conv)             # [B, L, d_state]
        A = -torch.exp(self.A_log)              # [d_state], negative
        
        # Chunk-parallel scan
        y = self._chunk_ssm(x_conv, dt, A, B_mat, C_mat)
        
        # Skip + norm + gate
        y = y + x_conv * self.D.unsqueeze(0).unsqueeze(0)
        y = self.norm(y) * F.silu(z)
        
        return self.out_proj(y)
    
    def _chunk_ssm(self, u, dt, A, B, C):
        """
        Chunk-parallel SSM computation.
        
        Within each chunk: compute via cumulative decay matrix (parallel).
        Across chunks: propagate final state (sequential, only num_chunks steps).
        
        The intra-chunk computation uses the identity:
            h_t = sum_{s=0}^{t} (prod_{k=s+1}^{t} dA_k) * dB_s * u_s
        
        This is a lower-triangular matrix-vector product, computable in parallel.
        """
        batch, L, d_inner = u.shape
        d_state = A.shape[0]
        T = min(self.chunk_size, L)
        
        # Pad to multiple of T
        pad = (T - L % T) % T
        if pad > 0:
            u = F.pad(u, (0, 0, 0, pad))
            dt = F.pad(dt, (0, 0, 0, pad))
            B = F.pad(B, (0, 0, 0, pad))
            C = F.pad(C, (0, 0, 0, pad))
        
        L_pad = u.shape[1]
        n_chunks = L_pad // T
        
        # Reshape: [B, n_chunks, T, ...]
        u_c = u.reshape(batch, n_chunks, T, d_inner)
        dt_c = dt.reshape(batch, n_chunks, T, d_inner)
        B_c = B.reshape(batch, n_chunks, T, d_state)
        C_c = C.reshape(batch, n_chunks, T, d_state)
        
        # Mean dt per position for state decay (simplification for scalar-A)
        dt_mean = dt_c.mean(dim=-1)  # [B, n_chunks, T]
        
        # Compute log(dA) per position: log_dA = dt_mean * A
        # A is [d_state], dt_mean is [B, nc, T]
        log_dA = dt_mean.unsqueeze(-1) * A.unsqueeze(0).unsqueeze(0).unsqueeze(0)
        # log_dA: [B, nc, T, d_state]
        
        # Cumulative sum for decay within chunk: cumsum along T dimension
        # For position t, decay from position s is: exp(sum_{k=s+1}^{t} log_dA_k)
        log_dA_cumsum = torch.cumsum(log_dA, dim=2)  # [B, nc, T, d_state]
        
        # Lower-triangular decay matrix: L[t,s] = exp(cumsum[t] - cumsum[s])
        # L[t,s,n] = exp(sum_{k=s+1}^{t} log_dA_k_n) for t >= s, else 0
        # Shape: [B, nc, T, T, d_state]
        decay_matrix = log_dA_cumsum.unsqueeze(3) - log_dA_cumsum.unsqueeze(2)
        # decay_matrix[..., t, s, :] = cumsum[t] - cumsum[s]
        
        # Apply causal mask (t >= s only)
        causal_mask = torch.tril(torch.ones(T, T, device=u.device))  # [T, T]
        decay_matrix = decay_matrix * causal_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1)
        decay_matrix = torch.exp(decay_matrix) * causal_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1)
        # [B, nc, T, T, d_state]
        
        # Compute dBu: dt * B * u → state input at each position
        # dt_c: [B, nc, T, d_inner], B_c: [B, nc, T, d_state], u_c: [B, nc, T, d_inner]
        # We need [B, nc, T, d_state, d_inner]
        dBu = dt_c.unsqueeze(-2) * B_c.unsqueeze(-1) * u_c.unsqueeze(-2)
        # dBu: [B, nc, T, d_state, d_inner]
        
        # Intra-chunk SSM via matrix multiply:
        # h[t] = sum_s decay[t,s] * dBu[s]
        # h: [B, nc, T, d_state, d_inner]
        # decay_matrix: [B, nc, T, T, d_state]
        # dBu: [B, nc, T, d_state, d_inner]
        
        # Einsum: h[b,c,t,n,d] = sum_s decay[b,c,t,s,n] * dBu[b,c,s,n,d]
        h_intra = torch.einsum('bctsn,bcsnd->bctnd', decay_matrix, dBu)
        # h_intra: [B, nc, T, d_state, d_inner]
        
        # Inter-chunk state propagation
        # Decay of previous chunk's final state into current chunk
        # Total decay for a full chunk: exp(sum of all T log_dA values)
        chunk_decay = torch.exp(log_dA_cumsum[:, :, -1, :])  # [B, nc, d_state]
        # Decay from chunk start to each position within chunk:
        # position_decay[t] = exp(cumsum[t]) (from position 0)
        position_decay = torch.exp(log_dA_cumsum)  # [B, nc, T, d_state]
        
        # Propagate states across chunks
        h_carry = torch.zeros(batch, d_state, d_inner, device=u.device)
        h_chunks = []
        
        for c_idx in range(n_chunks):
            # Decay carry state to each position in this chunk
            # h_from_prev[t] = position_decay[t] * h_carry
            h_from_prev = position_decay[:, c_idx, :, :].unsqueeze(-1) * h_carry.unsqueeze(1)
            # h_from_prev: [B, T, d_state, d_inner]
            
            # Total hidden state
            h_total = h_intra[:, c_idx] + h_from_prev  # [B, T, d_state, d_inner]
            h_chunks.append(h_total)
            
            # Update carry: final state of this chunk
            h_carry = h_total[:, -1, :, :]  # [B, d_state, d_inner]
        
        # Stack chunks: [B, nc, T, d_state, d_inner]
        h_all = torch.stack(h_chunks, dim=1)
        
        # Output: y[t] = C[t]^T @ h[t]
        # C_c: [B, nc, T, d_state], h_all: [B, nc, T, d_state, d_inner]
        y = torch.einsum('bctn,bctnd->bctd', C_c, h_all)
        # y: [B, nc, T, d_inner]
        
        # Reshape back
        y = y.reshape(batch, L_pad, d_inner)
        return y[:, :L, :]


class Mamba2Block(nn.Module):
    """
    Mamba-2 block with bidirectional scanning for 2D images.
    Forward + backward raster scan, merged via learned projection.
    """
    
    def __init__(self, dim, d_state=16, d_conv=4, expand=2, dropout=0.0):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        
        self.ssd_fwd = Mamba2SSD(dim, d_state, d_conv, expand)
        self.ssd_bwd = Mamba2SSD(dim, d_state, d_conv, expand)
        self.merge = nn.Linear(dim * 2, dim, bias=False)
        
        ff_dim = dim * expand
        self.ff = nn.Sequential(
            nn.Linear(dim, ff_dim), nn.GELU(), nn.Dropout(dropout),
            nn.Linear(ff_dim, dim), nn.Dropout(dropout),
        )
    
    def forward(self, x):
        """x: [B, C, H, W] or [B, L, C]"""
        is_2d = x.dim() == 4
        if is_2d:
            B, C, H, W = x.shape
            x = x.flatten(2).transpose(1, 2)
        
        residual = x
        x_norm = self.norm1(x)
        
        fwd = self.ssd_fwd(x_norm)
        bwd = torch.flip(self.ssd_bwd(torch.flip(x_norm, [1])), [1])
        merged = self.merge(torch.cat([fwd, bwd], dim=-1))
        
        x = residual + merged
        x = x + self.ff(self.norm2(x))
        
        if is_2d:
            x = x.transpose(1, 2).reshape(B, C, H, W)
        return x