"""
COLM Model Components
=====================
Complex Oscillating Language Model — all neural network modules.

Components:
  - ComplexRMSNorm: magnitude normalization preserving phase
  - ComplexOscillator: sin(W⊙Z+B)·tanh(Z) oscillating neuron
  - ComplexMixer: fixed unitary cross-dimension routing
  - OscillatingCausalScanner: O(N) causal sequence scanner
  - SparseGate: smooth sigmoid voltage-spike gate
  - ZeroLinearBlock: scanner + oscillating MLP block
  - COLM: full autoregressive model
"""

import math
import torch
import torch.nn as nn
from torch.nn import functional as F


# =============================================================================
# COMPLEX RMSNORM — norm the magnitude, preserve the angle
# =============================================================================

class ComplexRMSNorm(nn.Module):
    """Root-mean-square normalisation for complex-valued activations.

    Every element along the last axis is multiplied by one shared real
    factor, ``1 / sqrt(mean(|z|^2) + eps)``, and then by a learnable real
    per-feature gain. Since both factors are real, the phase of each
    element is left untouched as long as the gain stays positive.

    Args:
        dim: length of the learnable gain vector (size of the last axis).
        eps: constant added to the mean squared magnitude before the
            reciprocal square root.
    """

    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        # Real-valued gain, initialised to the identity scaling.
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, Z):
        """Return ``Z`` rescaled to (approximately) unit RMS magnitude."""
        # |z|^2 computed from the real/imag parts directly.
        squared_mag = Z.real.square() + Z.imag.square()
        mean_power = squared_mag.mean(-1, keepdim=True)
        inv_rms = torch.rsqrt(mean_power + self.eps)
        return Z * (inv_rms * self.weight)


# =============================================================================
# COMPLEX OSCILLATOR — sin(W⊙Z+B)·tanh(Z), W,B ∈ ℂ
# =============================================================================

def _softcap_imag(z, limit=6.0):
    """Smoothly bound the imaginary part of ``z`` to the open range (-limit, limit).

    The real part passes through unchanged; the imaginary part is squashed
    with ``limit * tanh(imag / limit)``, which is close to the identity for
    small values and saturates at +/- ``limit``.
    """
    bounded_imag = limit * torch.tanh(z.imag / limit)
    return torch.complex(z.real, bounded_imag)


def safe_abs(Z, eps=1e-12):
    """Complex magnitude with a well-defined gradient at zero.

    Computes ``sqrt(re^2 + im^2 + eps)``. The plain magnitude
    ``sqrt(re^2 + im^2)`` has an infinite derivative at the origin; the
    small ``eps`` under the root keeps the derivative finite there. The
    price is a floor of ``sqrt(eps)`` on the result (1e-6 by default), so
    an exactly-zero input does not return exactly zero.
    """
    power = Z.real.square() + Z.imag.square()
    return torch.sqrt(power + eps)


class ComplexOscillator(nn.Module):
    """Element-wise complex oscillating activation ``sin(W * Z + B) * tanh(Z)``.

    Parameters (both complex, one entry per feature):
        W: real part initialised near 1 (acts as a frequency), imaginary
           part initialised near 0 (acts as a phase term).
        B: complex offset, initialised to zero.

    Args:
        dim: number of features; length of ``W`` and ``B``.
    """

    def __init__(self, dim):
        super().__init__()
        # Draw the real part first, then the imaginary part, so the random
        # stream is consumed in a fixed order.
        freq = torch.randn(dim) * 0.1 + 1.0
        phase = torch.randn(dim) * 0.1
        self.W = nn.Parameter(torch.complex(freq, phase))

        # Complex offset starts at 0 + 0i.
        self.B = nn.Parameter(torch.complex(torch.zeros(dim), torch.zeros(dim)))

    def forward(self, Z):
        """Apply the oscillator to a complex tensor whose last axis matches ``dim``."""
        # tanh has poles where the imaginary part is an odd multiple of pi/2;
        # capping |imag| below pi/2 - 0.2 keeps the input clear of the first one.
        envelope_in = _softcap_imag(Z, limit=math.pi/2 - 0.2)
        # |sin| grows with the imaginary part of its argument, so that part
        # is capped as well before taking the sine.
        carrier_in = _softcap_imag(self.W * envelope_in + self.B, limit=6.0)
        carrier = torch.sin(carrier_in)
        envelope = torch.tanh(envelope_in)
        return carrier * envelope


# =============================================================================
# COMPLEX MIXER — fixed unitary matrix, zero learnable params
# =============================================================================

class ComplexMixer(nn.Module):
    """Parameter-free mixing across the feature axis with a fixed unitary matrix.

    A random complex Gaussian matrix is orthonormalised with a QR
    decomposition at construction time and stored as a (non-trainable)
    buffer. Multiplying by a unitary matrix leaves the per-token energy
    (sum of squared magnitudes) unchanged.

    NOTE: the dense matrix product costs O(D^2) per token. According to the
    original author's note, a fast Walsh-Hadamard transform (O(D log D)) was
    used previously and the dense form was chosen for torch.compile
    compatibility; swap back if FWHT compiles well on your hardware.

    Args:
        dim: feature dimension D; the buffer has shape (D, D).
    """

    def __init__(self, dim):
        super().__init__()
        # Real part is sampled before the imaginary part.
        gauss_re = torch.randn(dim, dim)
        gauss_im = torch.randn(dim, dim)
        factorisation = torch.linalg.qr(torch.complex(gauss_re, gauss_im))
        # Keep only the unitary factor; it travels with the state_dict.
        self.register_buffer('mix_matrix', factorisation.Q)

    def forward(self, Z):
        """Mix features: (..., D) @ (D, D) -> (..., D)."""
        routing = self.mix_matrix.T
        return Z @ routing


# =============================================================================
# O(N) COMPLEX OSCILLATOR CAUSAL SCANNER — replaces O(N²) attention
# =============================================================================

class OscillatingCausalScanner(nn.Module):
    """Causal sequence mixer built from cumulative sums instead of attention.

    Three ComplexOscillator modules produce, per position and feature:
      - a gate, turned into a complex multiplier whose magnitude is a
        retention factor in (0, 1) and whose angle is a bounded rotation;
      - a value signal that is accumulated;
      - an output non-linearity applied to the accumulated state.

    The accumulation realises the linear recurrence
    ``H[t] = gate[t] * H[t-1] + val[t]`` in closed form using two cumsum
    calls along dim=1, the same family of mechanism as linear attention /
    state-space models. The recurrence is exact only while the cumulative
    log-retention stays within ``clamp``; beyond that the exponents are
    clipped to keep ``exp`` finite.

    Args:
        dim: feature dimension.
        clamp: bound applied to the real exponents before ``exp``.
    """

    def __init__(self, dim, clamp=70.0):
        super().__init__()
        self.clamp = clamp
        self.osc_gate = ComplexOscillator(dim)
        self.osc_val = ComplexOscillator(dim)
        self.osc_out = ComplexOscillator(dim)

        # Re-initialise the gate oscillator's W with small values so the
        # initial gates are mild. Real part is drawn before the imaginary part.
        with torch.no_grad():
            small_re = torch.empty(dim).uniform_(-0.05, 0.05)
            small_im = torch.empty(dim).uniform_(-0.05, 0.05)
            self.osc_gate.W.data = torch.complex(small_re, small_im)

    def forward(self, Z):
        """Scan a complex tensor along dim=1 (comments assume (B, T, D))."""
        gate_raw = self.osc_gate(Z)
        values = self.osc_val(Z)

        # Retention in (0, 1) and rotation in (-pi, pi).
        retention = torch.sigmoid(gate_raw.real)
        rotation = math.pi * torch.tanh(gate_raw.imag / math.pi)

        # The log of the complex gate is assembled from its parts rather than
        # via torch.polar / .angle(); the original author notes this avoids
        # the atan2(0, 0) NaN gradient when the retention approaches zero.
        log_retention = torch.log(retention.clamp(min=1e-8))
        log_gate = torch.complex(log_retention, rotation)

        # Running sum of log-gates: c[t] = sum_{s<=t} log gate[s].
        running_log = torch.cumsum(log_gate, dim=1)

        bound = self.clamp
        # exp(c[t]) with the real exponent clipped from below.
        grow_exponent = running_log.real.clamp(min=-bound)
        forward_factor = torch.exp(torch.complex(grow_exponent, running_log.imag))

        # exp(-c[s]) with the real exponent clipped from above.
        shrink_exponent = (-running_log.real).clamp(max=bound)
        inverse_factor = torch.exp(torch.complex(shrink_exponent, -running_log.imag))

        # H[t] = exp(c[t]) * sum_{s<=t} val[s] * exp(-c[s])
        state = forward_factor * torch.cumsum(values * inverse_factor, dim=1)

        # Soft magnitude squash: rescale each element so its magnitude
        # becomes tanh(|H| / 8) while the phase is kept.
        state_mag = safe_abs(state).clamp(min=1e-8)
        state = state * (torch.tanh(state_mag / 8.0) / state_mag)
        return self.osc_out(state)


# =============================================================================
# SMOOTH SPARSE GATE — proper sigmoid
# =============================================================================

class SparseGate(nn.Module):
    """Per-feature soft gate with learnable slope, threshold and temperature.

    Forward computation:
        voltage = sigmoid(gate_w * x)
        spike   = sigmoid((voltage - threshold) * temperature)
        output  = x * spike

    Args:
        num_features: length of each learnable per-feature vector.
        threshold_init: initial value of every threshold entry.
    """

    def __init__(self, num_features, threshold_init=0.3):
        super().__init__()
        # Registration order (gate_w, threshold, temperature) is kept stable
        # so parameter iteration order does not change.
        self.gate_w = nn.Parameter(torch.ones(num_features) * 0.25)
        self.threshold = nn.Parameter(torch.full((num_features,), threshold_init))
        self.temperature = nn.Parameter(torch.ones(num_features) * 10.0)

    def forward(self, x):
        """Return ``x`` multiplied element-wise by its soft spike in (0, 1)."""
        membrane = torch.sigmoid(self.gate_w * x)
        margin = membrane - self.threshold
        openness = torch.sigmoid(margin * self.temperature)
        return x * openness

    @torch.no_grad()
    def get_sparsity(self, x=None):
        """Return the fraction of entries whose voltage exceeds the threshold.

        Note that this is the share of entries *above* threshold (a hard
        comparison, not the soft spike). Returns 0.0 when no input is given.
        """
        if x is not None:
            membrane = torch.sigmoid(self.gate_w * x)
            above = membrane > self.threshold
            return above.float().mean().item()
        return 0.0


# =============================================================================
# ZERO-LINEAR BLOCK — scanner + complex mixer/oscillator MLP
# =============================================================================

class ZeroLinearBlock(nn.Module):
    """One residual block: a causal scanner followed by an oscillating "MLP".

    Sub-block 1: ComplexRMSNorm -> OscillatingCausalScanner, added to the input.
    Sub-block 2: ComplexRMSNorm -> Mixer -> Oscillator -> Mixer -> Oscillator
                 -> magnitude-driven spike gate -> sinc coupling, scaled by a
                 learnable complex ``coupling_alpha`` and added to the input.

    Both sub-blocks are pre-norm residual branches.

    Args:
        layer_idx: index of this block; selects the entry of
            ``cfg.coupling_alpha_init`` used to initialise ``coupling_alpha``.
        cfg: configuration object; this class reads ``n_embd``,
            ``scanner_clamp`` and ``coupling_alpha_init``.

    Attributes set during ``forward`` (for logging only, both detached):
        last_mlp_mag: magnitude of the MLP branch before gating.
        last_gate_open: spike value of the gate in (0, 1).
    """

    def __init__(self, layer_idx, cfg):
        super().__init__()
        dim = cfg.n_embd

        self.norm1 = ComplexRMSNorm(dim)
        self.scanner = OscillatingCausalScanner(dim, clamp=cfg.scanner_clamp)

        self.norm2 = ComplexRMSNorm(dim)
        self.mix1 = ComplexMixer(dim)
        self.osc1 = ComplexOscillator(dim)
        self.mix2 = ComplexMixer(dim)
        self.osc2 = ComplexOscillator(dim)
        self.sparse_gate = SparseGate(dim)
        self.last_mlp_mag = None
        self.last_gate_open = None

        # Coerce to a Python float first: torch.complex needs both operands
        # to be floating tensors of the same dtype, so an int entry (int64
        # tensor) or a NumPy float64 entry would otherwise raise here.
        alpha_init = float(cfg.coupling_alpha_init[layer_idx])
        self.coupling_alpha = nn.Parameter(
            torch.complex(torch.tensor(alpha_init), torch.tensor(0.0))
        )
        print(f"  Layer {layer_idx}: α = {alpha_init:.4f} (complex: {self.coupling_alpha.item()})")

    def forward(self, Z):
        """Apply both residual sub-blocks to a complex tensor and return the result."""
        # Sub-block 1: causal scanner on the normalised input.
        Z = Z + self.scanner(self.norm1(Z))

        # Sub-block 2: mixer/oscillator stack on the normalised input.
        Z_res = Z
        Z_mlp = self.norm2(Z)
        Z_mlp = self.osc1(self.mix1(Z_mlp))
        Z_mlp = self.osc2(self.mix2(Z_mlp))

        # Spike gate driven by the magnitude. The gate's parameters are used
        # inline (rather than calling self.sparse_gate) because the spike
        # itself is needed: it is logged and applied to the complex value.
        mag = safe_abs(Z_mlp)
        self.last_mlp_mag = mag.detach()
        sg = self.sparse_gate
        voltage = torch.sigmoid(sg.gate_w * mag)
        spike = torch.sigmoid((voltage - sg.threshold) * sg.temperature)
        self.last_gate_open = spike.detach()
        Z_mlp = spike * Z_mlp

        # Sinc coupling: torch.sinc(m / pi) equals sin(m) / m, so the branch
        # is scaled by sin(|z|) / |z| of its own (gated) magnitude.
        mag = safe_abs(Z_mlp)
        sinc_coupling = torch.sinc(mag / math.pi) * Z_mlp

        return Z_res + self.coupling_alpha * sinc_coupling


# =============================================================================
# COLM — Complex Oscillating Language Model
# =============================================================================

class COLM(nn.Module):
    """Complex Oscillating Language Model.

    Pipeline implemented by ``forward``:
      1. token ids -> thin real embedding -> bias-free linear up-projection;
      2. a real position embedding is added;
      3. the result becomes the real part of a complex tensor (imag = 0);
      4. ComplexOscillator -> ComplexRMSNorm;
      5. ``cfg.n_layer`` ZeroLinearBlock modules;
      6. final ComplexRMSNorm, then real and imaginary parts are
         concatenated and fed to a bias-free linear head.

    Args:
        cfg: configuration object; this class reads ``vocab_size``,
            ``embed_dim``, ``n_embd``, ``block_size`` and ``n_layer``
            (the blocks read further fields).
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        # Embedding: token ids -> thin embedding -> linear up-projection.
        self.thin_embed = nn.Embedding(cfg.vocab_size, cfg.embed_dim)
        self.embed_up = nn.Linear(cfg.embed_dim, cfg.n_embd, bias=False)
        # Initial oscillation, applied in forward() right after the real
        # features have been converted to a complex tensor.
        self.embed_osc = ComplexOscillator(cfg.n_embd)

        # Position embedding (real-valued, added to the real features).
        self.position_emb = nn.Embedding(cfg.block_size, cfg.n_embd)

        self.ln_pre = ComplexRMSNorm(cfg.n_embd)
        self.blocks = nn.ModuleList([ZeroLinearBlock(i, cfg) for i in range(cfg.n_layer)])
        self.ln_f = ComplexRMSNorm(cfg.n_embd)

        # Output head: real and imaginary parts are concatenated, hence 2x width.
        self.lm_head = nn.Linear(2 * cfg.n_embd, cfg.vocab_size, bias=False)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear and Embedding weights from N(0, 0.02^2)."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: integer token ids of shape (B, T).
            targets: optional integer class ids with B * T elements.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (B, T, vocab_size)
            and ``loss`` is ``None`` when ``targets`` is ``None``.

        Raises:
            IndexError: if T exceeds the number of learned positions.
        """
        B, Tseq = idx.size()

        # Fail early with a clear message; an over-long sequence would
        # otherwise fail inside the position-embedding lookup with an
        # opaque out-of-range index error.
        max_positions = self.position_emb.num_embeddings
        if Tseq > max_positions:
            raise IndexError(
                f"Sequence length {Tseq} exceeds the position table size {max_positions}"
            )

        # Real embedding path.
        x_real = self.embed_up(self.thin_embed(idx))  # (B, T, n_embd) real

        # Add position embeddings (real); broadcasts over the batch axis.
        pos = torch.arange(0, Tseq, dtype=torch.long, device=idx.device)
        x_real = x_real + self.position_emb(pos)

        # Convert to complex: real part = features, imaginary part = 0.
        Z = torch.complex(x_real, torch.zeros_like(x_real))

        # Initial complex oscillation, then normalisation.
        Z = self.embed_osc(Z)
        Z = self.ln_pre(Z)

        for block in self.blocks:
            Z = block(Z)

        Z = self.ln_f(Z)

        # Keep both channels for the classifier head.
        x_out = torch.cat([Z.real, Z.imag], dim=-1)  # (B, T, 2*n_embd)
        logits = self.lm_head(x_out)

        loss = None
        if targets is not None:
            # reshape (not view) so non-contiguous targets are accepted.
            loss = F.cross_entropy(
                logits.reshape(B * Tseq, -1), targets.reshape(B * Tseq)
            )

        return logits, loss