Update modeling_neollm.py

modeling_neollm.py  +370 −743  CHANGED
@@ -1,8 +1,8 @@
 #!/usr/bin/env python3
 """
 NeoLLM Model with FANformer Integration in both Attention and FFN, Dropout Regularization,
-SeeDNorm (Self-Rescaled Dynamic Normalization),
-for enhanced information flow through deep layers.
 
 Updated to include:
 - Fourier Analysis Network (FAN) layer for effective periodicity modeling in attention (relational space)

@@ -10,7 +10,8 @@ Updated to include:
 - SeeDNorm: Dynamic normalization with input-dependent scaling for better adaptability
 - Dropout regularization at strategic locations
 - ResFormer: Feature residual connections from first layer (applied before projections)
-
 """
 
 import math
@@ -27,33 +28,130 @@ from transformers.masking_utils import create_causal_mask
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 from transformers.modeling_layers import GradientCheckpointingLayer
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
 from transformers.utils import TransformersKwargs, logging
 from transformers.utils.generic import check_model_inputs
-from transformers.utils.import_utils import (
-    is_causal_conv1d_available,
-    is_flash_linear_attention_available,
-)
-from .configuration_neollm import NeoLLMConfig
-
-
-if is_causal_conv1d_available():
-    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
-else:
-    causal_conv1d_update, causal_conv1d_fn = None, None
-
-if is_flash_linear_attention_available():
-    from fla.modules import FusedRMSNormGated
-    from fla.ops.gated_delta_rule import chunk_gated_delta_rule, fused_recurrent_gated_delta_rule
-else:
-    chunk_gated_delta_rule, fused_recurrent_gated_delta_rule = None, None
-    FusedRMSNormGated = None
 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
 
 logger = logging.get_logger(__name__)
 
 class FANLayer(nn.Module):
     """
     Fourier Analysis Network (FAN) layer for effective periodicity modeling.
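Only the header of `FANLayer` is shown in this view. For orientation, a transform with the dimension bookkeeping used throughout this file (projections consume `fan_output_dim = hidden_size + hidden_size * fan_ratio`) can be sketched as below; this is a hedged illustration assuming the input is concatenated with cos/sin of a low-rank periodic projection, and the repo's actual `FANLayer` may differ in details such as gating or activations:

```python
import torch
import torch.nn as nn

class FANSketch(nn.Module):
    """Illustrative FAN-style transform: output dim = hidden + hidden * fan_ratio."""
    def __init__(self, hidden_size: int, fan_ratio: float = 0.125):
        super().__init__()
        p_dim = int(hidden_size * fan_ratio) // 2   # periodic half-dimension
        self.p_proj = nn.Linear(hidden_size, p_dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        p = self.p_proj(x)
        # Append periodic features; cos/sin each contribute p_dim channels.
        return torch.cat([x, torch.cos(p), torch.sin(p)], dim=-1)

x = torch.randn(2, 16, 512)
print(FANSketch(512)(x).shape)  # torch.Size([2, 16, 576]) = 512 + 512 * 0.125
```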
@@ -217,200 +315,67 @@ class SeeDNorm(nn.Module):
         return f"dim={self.dim}, eps={self.eps}"
 
 
-class NeoLLMRMSNormGated(nn.Module):
-    """
-    Gated RMSNorm variant used in specific contexts.
-    """
-    def __init__(self, hidden_size, eps=1e-6, **kwargs):
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-
-    def forward(self, hidden_states, gate=None):
-        input_dtype = hidden_states.dtype
-        hidden_states = hidden_states.to(torch.float32)
-        if gate is not None:
-            hidden_states = hidden_states * F.silu(gate.to(torch.float32))
-        variance = hidden_states.pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-        return self.weight * hidden_states.to(input_dtype)
-
-
-class PolarPositionalEmbedding(nn.Module):
-    """
-    Polar Coordinate Position Embedding (PoPE) - FlashAttention2-compatible implementation
-
-    From "Decoupling the 'What' and 'Where' with Polar Coordinate Positional Embedding":
-
-    THEORETICAL FORMULATION (from paper):
-    - Magnitudes: μ_q̃tc = softplus(qtc), μ_k̃sc = softplus(ksc) (content only)
-    - Phases: φ_q̃tc = t*θc, φ_k̃sc = s*θc (position only)
-    - Attention score: a^PoPE_ts = Re[q̃^H @ k̃] = Σ (x_q * x_k + y_q * y_k)
-
-    Where x = μ*cos(φ), y = μ*sin(φ) are Cartesian coordinates.
-
-    PRACTICAL IMPLEMENTATION (this code):
-    To enable FlashAttention2 compatibility without custom kernels, we use the
-    mathematically equivalent formulation:
-
-        Q' = [x_q; y_q] ∈ ℝ^(2d)  (concatenation of real and imaginary parts)
-        K' = [x_k; y_k] ∈ ℝ^(2d)
-
-    This doubles head_dim (d → 2d) but allows:
-    - Standard FlashAttention2 kernel usage
-    - Q'·K' = Σ(x_q*x_k + y_q*y_k) = a^PoPE_ts (mathematically equivalent)
-    - ~2× overhead in attention computation (acceptable tradeoff vs custom kernels)
-
-    Benefits retained:
-    - Superior length extrapolation without fine-tuning
-    - Decoupled 'what' and 'where' information
-    - Better performance on content/position independent matching tasks
-
-    Args:
-        dim: Original dimension per attention head (will be doubled to 2d internally)
-        max_position_embeddings: Maximum sequence length
-        base: Base wavelength (theta) for frequency components
-        device: Device to place tensors on
-    """
-
-    def __init__(
-        self,
-        dim: int,
-        max_position_embeddings: int = 2048,
-        base: float = 10000.0,
-        device=None
-    ):
-        super().__init__()
-        self.dim = dim  # Original head_dim (d)
-        self.max_position_embeddings = max_position_embeddings
-        self.base = base
-
-        # Compute frequency components: θc = base^(-(c-1)/d) for c = 1, ..., d
-        # PoPE uses d frequencies (not d/2 like RoPE)
-        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 1, dtype=torch.float32) / self.dim))
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-    def forward(self, q, k, position_ids):
-        # Step 1: Magnitudes from content only: μ = softplus(·)
-        q_mag = F.softplus(q)
-        k_mag = F.softplus(k)
-
-        # Step 2: Phases from position only: φ_tc = t*θc
-        # position_ids: (batch, seq_len) -> phases: (batch, 1, seq_len, head_dim)
-        phases = position_ids[:, None, :, None].float() * self.inv_freq[None, None, None, :]
-        cos = phases.cos().to(q.dtype)
-        sin = phases.sin().to(q.dtype)
-
-        # Step 3: Cartesian coordinates x = μ*cos(φ), y = μ*sin(φ)
-        q_real, q_imag = q_mag * cos, q_mag * sin
-        k_real, k_imag = k_mag * cos, k_mag * sin
-
-        # Step 4: Concatenate [real; imag] to create 2d dimensional vectors
-        # This enables Q'·K' = Σ(x_q*x_k + y_q*y_k) via standard dot product
-        q_pope = torch.cat([q_real, q_imag], dim=-1)  # (batch, num_heads, seq_len, 2*head_dim)
-        k_pope = torch.cat([k_real, k_imag], dim=-1)  # (batch, num_kv_heads, seq_len, 2*head_dim)
-
-        return q_pope, k_pope
-
-
-def apply_pope_embedding(
-    q_pope: torch.Tensor,
-    k_pope: torch.Tensor,
-    delta_bias: Optional[torch.Tensor] = None,
-    num_key_value_groups: int = 1
-) -> tuple[torch.Tensor, torch.Tensor]:
-    """
-    Apply learnable phase bias δc to PoPE embeddings (Equation 6 from paper).
-
-    With phase bias: a^PoPE_ts = Σ μ_q μ_k cos((s-t)θc + δc)
-
-    This is implemented by rotating k by exp(i*δ) in the concatenated representation.
-
-    Args:
-        q_pope: Query with PoPE applied, shape (batch, num_heads, seq_len, 2*head_dim)
-            Format: [x_q; y_q] where first head_dim is real, second head_dim is imaginary
-        k_pope: Key with PoPE applied, shape (batch, num_kv_heads, seq_len, 2*head_dim)
-            Format: [x_k; y_k]
-        delta_bias: Learnable phase bias per head/dim, shape (num_attention_heads, head_dim)
-            Bounded to [-2π, 0] as per paper. Applied only to keys.
-        num_key_value_groups: Number of query groups per key/value head for GQA
-
-    Returns:
-        Tuple of (q_out, k_out) with delta_bias applied:
-        - q_out: Query unchanged (phase bias only affects keys)
-        - k_out: Key rotated by delta_bias
-        Both maintain shape with 2*head_dim
-    """
-    # Query passes through unchanged (phase bias only affects keys)
-    q_out = q_pope
-
-    # Apply learnable phase bias to key if provided
-    if delta_bias is not None:
-        # Get head_dim (original dimension, half of current last dim)
-        head_dim = k_pope.shape[-1] // 2
-
-        # Split k into real and imaginary components
-        k_real, k_imag = k_pope[..., :head_dim], k_pope[..., head_dim:]
-
-        # Clamp delta_bias to [-2π, 0] as specified in paper Section 3
-        delta_clamped = torch.clamp(delta_bias, min=-2*math.pi, max=0)
-
-        # Adapt delta_bias for GQA: (num_attention_heads, head_dim) -> (num_kv_heads, head_dim)
-        # Group the attention heads' biases by averaging/selecting
-        if num_key_value_groups > 1:
-            # Reshape: (num_attention_heads, head_dim) -> (num_kv_heads, num_key_value_groups, head_dim)
-            num_kv_heads = delta_clamped.shape[0] // num_key_value_groups
-            delta_clamped = delta_clamped.view(num_kv_heads, num_key_value_groups, head_dim)
-            # Average across the groups to get one bias per kv_head
-            delta_clamped = delta_clamped.mean(dim=1)  # (num_kv_heads, head_dim)
-
-        # Reshape for broadcasting: (num_kv_heads, head_dim) -> (1, num_kv_heads, 1, head_dim)
-        delta_clamped = delta_clamped.unsqueeze(0).unsqueeze(2)
-
-        # Compute rotation components: exp(i*δ) = cos(δ) + i*sin(δ)
-        cos_delta = torch.cos(delta_clamped)
-        sin_delta = torch.sin(delta_clamped)
-
-        # Apply complex multiplication: k * exp(i*δ)
-        # Real part: k_real*cos(δ) - k_imag*sin(δ)
-        # Imag part: k_real*sin(δ) + k_imag*cos(δ)
-        k_real_rotated = k_real * cos_delta - k_imag * sin_delta
-        k_imag_rotated = k_real * sin_delta + k_imag * cos_delta
-
-        # Recombine into concatenated form [real; imag]
-        k_out = torch.cat([k_real_rotated, k_imag_rotated], dim=-1)
-    else:
-        k_out = k_pope
-
-    return q_out, k_out
 
 
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
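The docstring of the removed `PolarPositionalEmbedding` claims the concatenated form Q' = [x_q; y_q], K' = [x_k; y_k] reproduces the complex PoPE score via a plain dot product. That identity is easy to verify numerically; here is a minimal self-contained sketch (all names are local to the snippet):

```python
import torch

d = 8
mu_q, mu_k = torch.rand(d), torch.rand(d)        # magnitudes (softplus outputs)
phi_q, phi_k = torch.randn(d), torch.randn(d)    # phases (t*theta_c, s*theta_c)

# Complex form: Re[q~^H k~] = sum_c mu_q mu_k cos(phi_k - phi_q)
complex_score = (mu_q * mu_k * torch.cos(phi_k - phi_q)).sum()

# Concatenated form: Q' = [x_q; y_q], K' = [x_k; y_k] with x = mu*cos, y = mu*sin
q_cat = torch.cat([mu_q * torch.cos(phi_q), mu_q * torch.sin(phi_q)])
k_cat = torch.cat([mu_k * torch.cos(phi_k), mu_k * torch.sin(phi_k)])
dot_score = q_cat @ k_cat

assert torch.allclose(complex_score, dot_score, atol=1e-5)
```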
@@ -435,18 +400,10 @@ def eager_attention_forward(
     dropout: float = 0.0,
     **kwargs: Unpack[TransformersKwargs],
 ):
-    """
-    Standard eager attention implementation for PoPE.
-
-    Note: query and key have 2*head_dim due to PoPE concatenation [real; imag].
-    Value is padded to match this dimension for kernel compatibility.
-    """
     key_states = repeat_kv(key, module.num_key_value_groups)
     value_states = repeat_kv(value, module.num_key_value_groups)
 
-    # Standard attention computation
     attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
-
     if attention_mask is not None:
         causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
         attn_weights = attn_weights + causal_mask
@@ -462,14 +419,16 @@ def eager_attention_forward(
 class NeoLLMAttention(nn.Module):
     """
     Multi-headed attention with FANformer integration, SeeDNorm for Q/K normalization,
-    and ResFormer feature residual connections for enhanced information flow.
 
     ResFormer enhancement: Applies learnable feature residual connections from the first layer
     BEFORE QKV projections: H'_fan_n = λ_1 * H_fan_1 + λ_2 * H_fan_n
 
-
-
-
     """
 
     def __init__(self, config: NeoLLMConfig, layer_idx: int):
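The formula in the docstring above, H'_fan_n = λ_1 * H_fan_1 + λ_2 * H_fan_n, is a learnable two-term blend of the first layer's FAN features with the current layer's. An illustrative sketch (tensor sizes arbitrary; only `lambda_1`/`lambda_2` mirror the module's actual attributes):

```python
import torch
import torch.nn as nn

lambda_1 = nn.Parameter(torch.tensor(0.5))  # weight for H_fan_1 (first layer)
lambda_2 = nn.Parameter(torch.tensor(0.5))  # weight for H_fan_n (current layer)

h_fan_1 = torch.randn(2, 16, 576)  # first layer's FAN features
h_fan_n = torch.randn(2, 16, 576)  # current layer's FAN features
h_mixed = lambda_1 * h_fan_1 + lambda_2 * h_fan_n  # fed to the QKV projections
```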
@@ -477,11 +436,7 @@ class NeoLLMAttention(nn.Module):
         self.config = config
         self.layer_idx = layer_idx
         self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
-        self.num_attention_heads = config.num_attention_heads
-        self.num_key_value_heads = config.num_key_value_heads
-        self.num_key_value_groups = self.num_attention_heads // self.num_key_value_heads
-
-        # PoPE uses original head_dim for scaling (not 2*head_dim)
         self.scaling = self.head_dim**-0.5
         self.attention_dropout = config.attention_dropout
         self.is_causal = True
@@ -495,36 +450,36 @@ class NeoLLMAttention(nn.Module):
         # Calculate the output dimension after FAN transformation
         fan_output_dim = config.hidden_size + int(config.hidden_size * getattr(config, 'fan_ratio', 0.125))
 
-        # QKV projections operate on the FAN-transformed features
-        self.q_proj = nn.Linear(
-            fan_output_dim, config.num_attention_heads * self.head_dim * 2, bias=config.attention_bias
         )
         self.k_proj = nn.Linear(
-            fan_output_dim, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
         )
         self.v_proj = nn.Linear(
-            fan_output_dim, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
         )
-
-        self.o_proj = nn.Linear(
-            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
         )
 
         # SeeDNorm for Q/K normalization (replaces RMSNorm)
         self.q_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
         self.k_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
 
-        # PoPE: Learnable phase bias δc for each head and dimension
-        # Initialized based on pope_bias_init config: 'zero' or 'uniform'
-        pope_bias_init = getattr(config, 'pope_bias_init', 'zero')
-        if pope_bias_init == 'uniform':
-            # Uniform initialization in [-2π, 0]
-            delta_init = torch.empty(self.num_attention_heads, self.head_dim).uniform_(-2 * math.pi, 0)
-        else:
-            # Zero initialization (better for length extrapolation)
-            delta_init = torch.zeros(self.num_attention_heads, self.head_dim)
-
-        self.delta_bias = nn.Parameter(delta_init)
-
         # Dropout for attention output
         self.dropout = nn.Dropout(config.dropout_rate)
@@ -541,61 +496,35 @@ class NeoLLMAttention(nn.Module):
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
         input_shape = hidden_states.shape[:-1]
-        batch_size, seq_len = input_shape
 
         # Apply FANformer transformation first
         hidden_states_fan = self.fan_layer(hidden_states)
 
         # ResFormer: Apply feature residual connection BEFORE projections
         if first_layer_fan is not None:
             hidden_states_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * hidden_states_fan
 
         # Store current FAN features for potential use as first_layer_fan in subsequent layers
         current_layer_fan = hidden_states_fan.clone()
 
-        # Project Q (doubled along the last dim to carry the gate)
         query_states, gate = torch.chunk(
-            self.q_proj(hidden_states_fan).view(batch_size, seq_len, -1, self.head_dim * 2),
-            2, dim=-1
-        )
-        gate = gate.reshape(batch_size, seq_len, -1)
-
-        key_states = self.k_proj(hidden_states_fan).view(
-            batch_size, seq_len, self.num_key_value_heads, self.head_dim
-        )
-        value_states = self.v_proj(hidden_states_fan).view(
-            batch_size, seq_len, self.num_key_value_heads, self.head_dim
         )
-
-        # SeeDNorm on Q/K, then move heads to dim 1: (b, s, h, d) -> (b, h, s, d)
-        query_states = self.q_norm(query_states).transpose(1, 2)
-        key_states = self.k_norm(key_states).transpose(1, 2)
-        value_states = value_states.transpose(1, 2)
-
-        # Apply PoPE: position_embeddings is (pope_emb, position_ids)
-        pope_emb, position_ids = position_embeddings
-
-        # Get PoPE embeddings with concatenated [real; imag] representation
-        # Returns Q', K' with shape (..., 2*head_dim)
-        query_states, key_states = pope_emb(query_states, key_states, position_ids)
-
-        # Apply learnable phase bias δc
-        query_states, key_states = apply_pope_embedding(
-            query_states,
-            key_states,
-            self.delta_bias,
-            num_key_value_groups=self.num_key_value_groups  # pass this parameter for GQA
-        )
-        # Pad value to 2*head_dim for dimension compatibility
-        # Only first head_dim components are used in output
-        value_states = F.pad(value_states, (0, self.head_dim), value=0.0)
-
-        # Call attention with doubled head_dim
         attention_interface: Callable = eager_attention_forward
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
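The `F.pad(value_states, (0, self.head_dim), value=0.0)` trick removed above is lossless: zero columns in V contribute exact zeros to the attention output, so slicing the first `head_dim` channels afterwards recovers the unpadded result. A quick standalone check:

```python
import torch

attn = torch.softmax(torch.randn(5, 5), dim=-1)   # row-stochastic attention weights
v = torch.randn(5, 4)
v_padded = torch.nn.functional.pad(v, (0, 4), value=0.0)  # pad value channels with zeros

out = attn @ v_padded
assert torch.allclose(out[..., :4], attn @ v, atol=1e-6)  # first channels unchanged
assert torch.all(out[..., 4:] == 0)                        # padded channels stay zero
```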
@@ -611,391 +540,16 @@ class NeoLLMAttention(nn.Module):
             **kwargs,
         )
 
-
-        attn_output = attn_output[..., :self.head_dim]
-
-        attn_output = attn_output.reshape(batch_size, seq_len, -1).contiguous()
         attn_output = attn_output * torch.sigmoid(gate)
 
         attn_output = self.o_proj(attn_output)
         attn_output = self.dropout(attn_output)
 
         return attn_output, attn_weights, current_layer_fan
 
 
-def apply_mask_to_padding_states(hidden_states, attention_mask):
-    """
-    Tunes out the hidden states for padding tokens
-    """
-    if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
-        dtype = hidden_states.dtype
-        hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
-
-    return hidden_states
-
-
-is_fast_path_available = all(
-    (causal_conv1d_fn, causal_conv1d_update, chunk_gated_delta_rule, fused_recurrent_gated_delta_rule)
-)
-
-
-def torch_causal_conv1d_update(
-    hidden_states,
-    conv_state,
-    weight,
-    bias=None,
-    activation=None,
-):
-    _, hidden_size, seq_len = hidden_states.shape
-    state_len = conv_state.shape[-1]
-
-    hidden_states_new = torch.cat([conv_state, hidden_states], dim=-1).to(weight.dtype)
-    conv_state.copy_(hidden_states_new[:, :, -state_len:])
-    out = F.conv1d(hidden_states_new, weight.unsqueeze(1), bias, padding=0, groups=hidden_size)
-    out = F.silu(out[:, :, -seq_len:])
-    out = out.to(hidden_states.dtype)
-    return out
-
-
-def l2norm(x: torch.FloatTensor, dim: int = -1, eps: float = 1e-6):
-    """This function is intended to align with the l2norm implementation in the FLA library."""
-    inv_norm = 1 / torch.sqrt((x * x).sum(dim=dim, keepdim=True) + eps)
-    return x * inv_norm
-
-
-def torch_chunk_gated_delta_rule(
-    query,
-    key,
-    value,
-    g,
-    beta,
-    chunk_size=64,
-    initial_state=None,
-    output_final_state=False,
-    use_qk_l2norm_in_kernel=False,
-):
-    initial_dtype = query.dtype
-    if use_qk_l2norm_in_kernel:
-        query = l2norm(query, dim=-1, eps=1e-6)
-        key = l2norm(key, dim=-1, eps=1e-6)
-    query, key, value, beta, g = [
-        x.transpose(1, 2).contiguous().to(torch.float32) for x in (query, key, value, beta, g)
-    ]
-
-    batch_size, sequence_length, num_heads, k_head_dim = key.shape
-    v_head_dim = value.shape[-1]
-    pad_size = (chunk_size - num_heads % chunk_size) % chunk_size
-    query = F.pad(query, (0, 0, 0, pad_size))
-    key = F.pad(key, (0, 0, 0, pad_size))
-    value = F.pad(value, (0, 0, 0, pad_size))
-    beta = F.pad(beta, (0, pad_size))
-    g = F.pad(g, (0, pad_size))
-    tot_heads = num_heads + pad_size
-    scale = 1 / (query.shape[-1] ** 0.5)
-    query = query * scale
-
-    v_beta = value * beta.unsqueeze(-1)
-    k_beta = key * beta.unsqueeze(-1)
-    # reshape to chunks
-    query, key, value, k_beta, v_beta = [
-        x.reshape(x.shape[0], x.shape[1], -1, chunk_size, x.shape[-1]) for x in (query, key, value, k_beta, v_beta)
-    ]
-    g = g.reshape(g.shape[0], g.shape[1], -1, chunk_size)
-    mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), diagonal=0)
-
-    # chunk decay
-    g = g.cumsum(dim=-1)
-    decay_mask = ((g.unsqueeze(-1) - g.unsqueeze(-2)).tril().exp().float()).tril()
-    attn = -((k_beta @ key.transpose(-1, -2)) * decay_mask).masked_fill(mask, 0)
-    for i in range(1, chunk_size):
-        row = attn[..., i, :i].clone()
-        sub = attn[..., :i, :i].clone()
-        attn[..., i, :i] = row + (row.unsqueeze(-1) * sub).sum(-2)
-    attn = attn + torch.eye(chunk_size, dtype=attn.dtype, device=attn.device)
-    value = attn @ v_beta
-    k_cumdecay = attn @ (k_beta * g.exp().unsqueeze(-1))
-    last_recurrent_state = (
-        torch.zeros(batch_size, sequence_length, k_head_dim, v_head_dim).to(value)
-        if initial_state is None
-        else initial_state.to(value)
-    )
-    core_attn_out = torch.zeros_like(value)
-    mask = torch.triu(torch.ones(chunk_size, chunk_size, dtype=torch.bool, device=query.device), diagonal=1)
-
-    # for each chunk
-    for i in range(0, tot_heads // chunk_size):
-        q_i, k_i, v_i = query[:, :, i], key[:, :, i], value[:, :, i]
-        attn = (q_i @ k_i.transpose(-1, -2) * decay_mask[:, :, i]).masked_fill_(mask, 0)
-        v_prime = (k_cumdecay[:, :, i]) @ last_recurrent_state
-        v_new = v_i - v_prime
-        attn_inter = (q_i * g[:, :, i, :, None].exp()) @ last_recurrent_state
-        core_attn_out[:, :, i] = attn_inter + attn @ v_new
-        last_recurrent_state = (
-            last_recurrent_state * g[:, :, i, -1, None, None].exp()
-            + (k_i * (g[:, :, i, -1, None] - g[:, :, i]).exp()[..., None]).transpose(-1, -2) @ v_new
-        )
-
-    if not output_final_state:
-        last_recurrent_state = None
-    core_attn_out = core_attn_out.reshape(core_attn_out.shape[0], core_attn_out.shape[1], -1, core_attn_out.shape[-1])
-    core_attn_out = core_attn_out[:, :, :num_heads]
-    core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype)
-    return core_attn_out, last_recurrent_state
-
-
-def torch_recurrent_gated_delta_rule(
-    query, key, value, g, beta, initial_state, output_final_state, use_qk_l2norm_in_kernel=False
-):
-    initial_dtype = query.dtype
-    if use_qk_l2norm_in_kernel:
-        query = l2norm(query, dim=-1, eps=1e-6)
-        key = l2norm(key, dim=-1, eps=1e-6)
-    query, key, value, beta, g = [
-        x.transpose(1, 2).contiguous().to(torch.float32) for x in (query, key, value, beta, g)
-    ]
-
-    batch_size, sequence_length, num_heads, k_head_dim = key.shape
-    v_head_dim = value.shape[-1]
-    scale = 1 / (query.shape[-1] ** 0.5)
-    query = query * scale
-
-    core_attn_out = torch.zeros(batch_size, sequence_length, num_heads, v_head_dim).to(value)
-    last_recurrent_state = (
-        torch.zeros(batch_size, sequence_length, k_head_dim, v_head_dim).to(value)
-        if initial_state is None
-        else initial_state.to(value)
-    )
-
-    for i in range(num_heads):
-        q_t = query[:, :, i]
-        k_t = key[:, :, i]
-        v_t = value[:, :, i]
-        g_t = g[:, :, i].exp().unsqueeze(-1).unsqueeze(-1)
-        beta_t = beta[:, :, i].unsqueeze(-1)
-
-        last_recurrent_state = last_recurrent_state * g_t
-        kv_mem = (last_recurrent_state * k_t.unsqueeze(-1)).sum(dim=-2)
-        delta = (v_t - kv_mem) * beta_t
-        last_recurrent_state = last_recurrent_state + k_t.unsqueeze(-1) * delta.unsqueeze(-2)
-        core_attn_out[:, :, i] = (last_recurrent_state * q_t.unsqueeze(-1)).sum(dim=-2)
-
-    if not output_final_state:
-        last_recurrent_state = None
-    core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype)
-    return core_attn_out, last_recurrent_state
-
-
-class NeoLLMGatedDeltaNet(nn.Module):
-    """
-    Linear attention with FANformer integration, SeeDNorm for normalization,
-    and ResFormer feature residual connections for enhanced information flow.
-
-    ResFormer enhancement: Applies learnable feature residual connections from the first layer
-    BEFORE QKV projections: H'_fan_n = λ_1 * H_fan_1 + λ_2 * H_fan_n
-    """
-
-    def __init__(self, config: NeoLLMConfig, layer_idx: int):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        self.num_v_heads = config.linear_num_value_heads
-        self.num_k_heads = config.linear_num_key_heads
-        self.head_k_dim = config.linear_key_head_dim
-        self.head_v_dim = config.linear_value_head_dim
-        self.key_dim = self.head_k_dim * self.num_k_heads
-        self.value_dim = self.head_v_dim * self.num_v_heads
-
-        self.conv_kernel_size = config.linear_conv_kernel_dim
-        self.layer_idx = layer_idx
-        self.activation = config.hidden_act
-        self.act = ACT2FN[config.hidden_act]
-        self.layer_norm_epsilon = config.rms_norm_eps
-
-        # FANformer integration: FAN layer before projections
-        self.fan_layer = FANLayer(
-            hidden_size=config.hidden_size,
-            fan_ratio=getattr(config, 'fan_ratio', 0.125)
-        )
-
-        # Calculate the output dimension after FAN transformation
-        fan_output_dim = config.hidden_size + int(config.hidden_size * getattr(config, 'fan_ratio', 0.125))
-
-        # QKV - operates on FAN-transformed features
-        self.conv_dim = self.key_dim * 2 + self.value_dim
-        self.conv1d = nn.Conv1d(
-            in_channels=self.conv_dim,
-            out_channels=self.conv_dim,
-            bias=False,
-            kernel_size=self.conv_kernel_size,
-            groups=self.conv_dim,
-            padding=self.conv_kernel_size - 1,
-        )
-
-        # projection of the FAN-transformed hidden states
-        projection_size_qkvz = self.key_dim * 2 + self.value_dim * 2
-        projection_size_ba = self.num_v_heads * 2
-        self.in_proj_qkvz = nn.Linear(fan_output_dim, projection_size_qkvz, bias=False)
-        self.in_proj_ba = nn.Linear(fan_output_dim, projection_size_ba, bias=False)
-
-        # time step projection (discretization)
-        self.dt_bias = nn.Parameter(torch.ones(self.num_v_heads))
-
-        A = torch.empty(self.num_v_heads).uniform_(0, 16)
-        self.A_log = nn.Parameter(torch.log(A))
-
-        # FLA compatibility: use "silu" for FusedRMSNormGated, original activation elsewhere
-        fla_compatible_activation = "silu" if self.activation not in ['swish', 'silu', 'sigmoid'] else self.activation
-
-        self.norm = (
-            NeoLLMRMSNormGated(self.head_v_dim, eps=self.layer_norm_epsilon)
-            if FusedRMSNormGated is None
-            else FusedRMSNormGated(
-                self.head_v_dim,
-                eps=self.layer_norm_epsilon,
-                activation=fla_compatible_activation,
-                device=torch.cuda.current_device(),
-                dtype=config.dtype if config.dtype is not None else torch.get_default_dtype(),
-            )
-        )
-
-        self.out_proj = nn.Linear(self.value_dim, self.hidden_size, bias=False)
-
-        # Dropout for attention output
-        self.dropout = nn.Dropout(config.dropout_rate)
-
-        self.causal_conv1d_fn = causal_conv1d_fn
-        self.causal_conv1d_update = causal_conv1d_update or torch_causal_conv1d_update
-        self.chunk_gated_delta_rule = chunk_gated_delta_rule or torch_chunk_gated_delta_rule
-        self.recurrent_gated_delta_rule = fused_recurrent_gated_delta_rule or torch_recurrent_gated_delta_rule
-
-        # ResFormer: learnable feature residual parameters (initialized to 0.5)
-        self.lambda_1 = nn.Parameter(torch.tensor(0.5))  # Weight for H_fan_1 (first layer features)
-        self.lambda_2 = nn.Parameter(torch.tensor(0.5))  # Weight for H_fan_n (current layer features)
-
-        if not is_fast_path_available:
-            logger.warning_once(
-                "The fast path is not available because one of the required libraries is not installed. Falling back to "
-                "torch implementation. To install follow https://github.com/fla-org/flash-linear-attention#installation and"
-                " https://github.com/Dao-AILab/causal-conv1d"
-            )
-
-    def fix_query_key_value_ordering(self, mixed_qkvz, mixed_ba):
-        """
-        Derives `query`, `key` and `value` tensors from `mixed_qkvz` and `mixed_ba`.
-        """
-        new_tensor_shape_qkvz = mixed_qkvz.size()[:-1] + (
-            self.num_k_heads,
-            2 * self.head_k_dim + 2 * self.head_v_dim * self.num_v_heads // self.num_k_heads,
-        )
-        new_tensor_shape_ba = mixed_ba.size()[:-1] + (self.num_k_heads, 2 * self.num_v_heads // self.num_k_heads)
-
-        mixed_qkvz = mixed_qkvz.view(*new_tensor_shape_qkvz)
-        mixed_ba = mixed_ba.view(*new_tensor_shape_ba)
-        split_arg_list_qkvz = [
-            self.head_k_dim,
-            self.head_k_dim,
-            (self.num_v_heads // self.num_k_heads * self.head_v_dim),
-            (self.num_v_heads // self.num_k_heads * self.head_v_dim),
-        ]
-        split_arg_list_ba = [self.num_v_heads // self.num_k_heads, self.num_v_heads // self.num_k_heads]
-        query, key, value, z = torch.split(mixed_qkvz, split_arg_list_qkvz, dim=3)
-        b, a = torch.split(mixed_ba, split_arg_list_ba, dim=3)
-        # [b, sq, ng, np/ng * hn] -> [b, sq, np, hn]
-        value = value.reshape(value.size(0), value.size(1), -1, self.head_v_dim)
-        z = z.reshape(z.size(0), z.size(1), -1, self.head_v_dim)
-        b = b.reshape(b.size(0), b.size(1), self.num_v_heads)
-        a = a.reshape(a.size(0), a.size(1), self.num_v_heads)
-        return query, key, value, z, b, a
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        first_layer_fan: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)
-
-        # Set up dimensions for reshapes later
-        batch_size, seq_len, _ = hidden_states.shape
-
-        # Apply FANformer transformation first
-        hidden_states_fan = self.fan_layer(hidden_states)
-
-        # ResFormer: Apply feature residual connection BEFORE projections
-        # This ensures dimensional compatibility across all layer types
-        if first_layer_fan is not None:
-            hidden_states_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * hidden_states_fan
-
-        # Store current FAN features for potential use as first_layer_fan in subsequent layers
-        current_layer_fan = hidden_states_fan.clone()
-
-        # Use FAN-transformed features (with residual applied) for projections
-        projected_states_qkvz = self.in_proj_qkvz(hidden_states_fan)
-        projected_states_ba = self.in_proj_ba(hidden_states_fan)
-        query, key, value, z, b, a = self.fix_query_key_value_ordering(projected_states_qkvz, projected_states_ba)
-        query, key, value = (x.reshape(x.shape[0], x.shape[1], -1) for x in (query, key, value))
-
-        mixed_qkv = torch.cat((query, key, value), dim=-1)
-        mixed_qkv = mixed_qkv.transpose(1, 2)
-
-        # Simple convolution without cache
-        if self.causal_conv1d_fn is not None:
-            mixed_qkv = self.causal_conv1d_fn(
-                x=mixed_qkv,
-                weight=self.conv1d.weight.squeeze(1),
-                bias=self.conv1d.bias,
-                activation="silu",  # Keep original activation for conv1d
-                seq_idx=None,
-            )
-        else:
-            mixed_qkv = F.silu(self.conv1d(mixed_qkv)[:, :, :seq_len])
-
-        mixed_qkv = mixed_qkv.transpose(1, 2)
-        query, key, value = torch.split(
-            mixed_qkv,
-            [
-                self.key_dim,
-                self.key_dim,
-                self.value_dim,
-            ],
-            dim=-1,
-        )
-        query = query.reshape(query.shape[0], query.shape[1], -1, self.head_k_dim)
-        key = key.reshape(key.shape[0], key.shape[1], -1, self.head_k_dim)
-        value = value.reshape(value.shape[0], value.shape[1], -1, self.head_v_dim)
-
-        beta = b.sigmoid()
-        # If the model is loaded in fp16, without the .float() here, A might be -inf
-        g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
-        if self.num_v_heads // self.num_k_heads > 1:
-            query = query.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)
-            key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)
-
-        # Use chunk-based implementation without cache
-        core_attn_out, _ = self.chunk_gated_delta_rule(
-            query,
-            key,
-            value,
-            g=g,
-            beta=beta,
-            initial_state=None,
-            output_final_state=False,
-            use_qk_l2norm_in_kernel=True,
-        )
-
-        z_shape_og = z.shape
-        # reshape input data into 2D tensor
-        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
-        z = z.reshape(-1, z.shape[-1])
-        core_attn_out = self.norm(core_attn_out, z)
-        core_attn_out = core_attn_out.reshape(z_shape_og)
-        core_attn_out = core_attn_out.reshape(core_attn_out.shape[0], core_attn_out.shape[1], -1)
-
-        output = self.out_proj(core_attn_out)
-        output = self.dropout(output)  # Apply dropout after output projection
-
-        return output, current_layer_fan
-
-
 class PolyNorm(torch.nn.Module):
     def __init__(self, eps=1e-6):
         super(PolyNorm, self).__init__()
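The kernels deleted in the hunk above (`torch_chunk_gated_delta_rule`, `torch_recurrent_gated_delta_rule`) both realize the same per-token recurrence: a matrix memory is decayed by a gate and corrected with a β-scaled rank-1 delta. One step in minimal single-head form, with illustrative names:

```python
import torch

# One step of the gated delta rule:
#   S_t = g_t * S_{t-1} + k_t ⊗ (beta_t * (v_t - k_t @ (g_t * S_{t-1})))
d_k, d_v = 4, 6
S = torch.zeros(d_k, d_v)                                  # recurrent matrix state
k = torch.nn.functional.normalize(torch.randn(d_k), dim=-1)  # l2-normalized key
v = torch.randn(d_v)
q = torch.randn(d_k) / d_k**0.5                            # scaled query
g = torch.tensor(0.9)                                      # decay gate
beta = torch.tensor(0.5)                                   # write strength

S = g * S                              # decay old memory
kv_mem = k @ S                         # what the memory predicts for v
delta = beta * (v - kv_mem)            # gated prediction error
S = S + torch.outer(k, delta)          # rank-1 correction
out = q @ S                            # read-out for this step
```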
@@ -1012,11 +566,17 @@ class PolyNorm(torch.nn.Module):
 
 class NeoLLMMLP(nn.Module):
     """
-    MLP with FANformer integration for featural periodicity modeling
 
     This captures periodicities in the feature space (semantic/embedding dimensions)
     complementary to the relational periodicities captured by attention mechanisms.
     Works in conjunction with ResFormer for comprehensive information flow.
     """
     def __init__(self, config):
         super().__init__()
@@ -1024,7 +584,7 @@ class NeoLLMMLP(nn.Module):
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size
 
-        #
         self.fan_layer = FANLayer(
             hidden_size=config.hidden_size,
             fan_ratio=getattr(config, 'fan_ratio_ffn', 0.0625)  # Half of attention's fan_ratio
@@ -1033,17 +593,35 @@ class NeoLLMMLP(nn.Module):
         # Calculate the output dimension after FAN transformation
         fan_output_dim = config.hidden_size + int(config.hidden_size * getattr(config, 'fan_ratio_ffn', 0.0625))
 
-        # SwiGLU/Gated architecture
-        self.gate_proj = nn.Linear(fan_output_dim, self.intermediate_size, bias=False)
         self.up_proj = nn.Linear(fan_output_dim, self.intermediate_size, bias=False)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
         self.act_fn = PolyNorm()
 
         # Dropout for MLP hidden layer
         self.dropout = nn.Dropout(config.dropout_rate)
 
     def forward(self, x):
-        #
         x_fan = self.fan_layer(x)
 
         # Use FAN-transformed features for gate and up projections
@@ -1055,19 +633,27 @@ class NeoLLMMLP(nn.Module):
 
 
 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     def __init__(self, config: NeoLLMConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
         self.layer_idx = layer_idx
 
-        # Layer type determines the attention mechanism
-        self.layer_type = config.layer_types[layer_idx]
-        if self.layer_type == "linear_attention":
-            self.linear_attn = NeoLLMGatedDeltaNet(config, layer_idx)
-        elif self.layer_type == "full_attention":
-            self.self_attn = NeoLLMAttention(config, layer_idx)
 
-        # MLP with FANformer integration
         self.mlp = NeoLLMMLP(config)
 
         # SeeDNorm for input and post-attention normalization (replaces RMSNorm)
@@ -1093,6 +679,9 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         first_layer_fan: Optional[torch.Tensor] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> torch.FloatTensor:
         residual = hidden_states
 
         # Apply SeeDNorm normalization
@@ -1101,22 +690,14 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         # Apply LNS scaling after normalization
         hidden_states = self.lns_attn(hidden_states)
 
-        # Attention: dispatch on layer type
-        if self.layer_type == "linear_attention":
-            hidden_states, self.current_layer_fan = self.linear_attn(
-                hidden_states,
-                attention_mask=attention_mask,
-                first_layer_fan=first_layer_fan,
-            )
-        elif self.layer_type == "full_attention":
-            # Self Attention
-            hidden_states, _, self.current_layer_fan = self.self_attn(
-                hidden_states=hidden_states,
-                attention_mask=attention_mask,
-                position_embeddings=position_embeddings,
-                first_layer_fan=first_layer_fan,
-                **kwargs,
-            )
 
         # Standard residual connection
         hidden_states = residual + hidden_states
@@ -1124,14 +705,16 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         # Apply GPAS after attention residual connection
         hidden_states = self.gpas_attn(hidden_states)
 
-        #
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
 
         # Apply LNS scaling after normalization
         hidden_states = self.lns_mlp(hidden_states)
 
-        # MLP now includes FAN transformation internally
         hidden_states = self.mlp(hidden_states)
 
         # Standard residual connection
@@ -1144,6 +727,16 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
 
 
 class NeoLLMPreTrainedModel(PreTrainedModel):
     config: NeoLLMConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
@@ -1153,59 +746,88 @@ class NeoLLMPreTrainedModel(PreTrainedModel):
     _is_stateful = True
 
     def _init_weights(self, module):
         super()._init_weights(module)
-        if isinstance(module, NeoLLMGatedDeltaNet):
-            module.dt_bias.data.fill_(1.0)
-            module.A_log.data.uniform_(0, 16).log_()
-            # ResFormer: initialize lambda parameters for linear attention
-            if hasattr(module, 'lambda_1'):
-                module.lambda_1.data.fill_(0.5)
-            if hasattr(module, 'lambda_2'):
-                module.lambda_2.data.fill_(0.5)
-        elif isinstance(module, NeoLLMAttention):
             # ResFormer: initialize lambda parameters for full attention
             if hasattr(module, 'lambda_1'):
                 module.lambda_1.data.fill_(0.5)
             if hasattr(module, 'lambda_2'):
                 module.lambda_2.data.fill_(0.5)
-
         elif isinstance(module, GPAS):
             # Initialize GPAS alpha to 0 as per paper
             module.alpha.data.fill_(0.0)
         elif isinstance(module, FANLayer):
-            # FANLayer initialization is handled within the class
             pass
         elif isinstance(module, SeeDNorm):
-            # SeeDNorm initialization:
-            # gamma (γ) initialized to 1 (
-            # beta (β) initialized to 0 (
-            # alpha (α) initialized to 1 (
             pass
-
-
-
 
 class NeoLLMModel(NeoLLMPreTrainedModel):
     def __init__(self, config: NeoLLMConfig):
         super().__init__(config)
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
 
         # Each layer creates its own components (no shared parameters)
         self.layers = nn.ModuleList(
             [NeoLLMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
         # SeeDNorm for final output normalization (replaces RMSNorm)
         self.norm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-        # PoPE positional embedding (replaces RoPE)
-        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
-        self.pope_emb = PolarPositionalEmbedding(
-            dim=head_dim,
-            max_position_embeddings=config.max_position_embeddings,
-            base=getattr(config, 'rope_theta', 10000.0),  # Use rope_theta for backward compatibility
-        )
-
         self.gradient_checkpointing = False
 
         # ResFormer: storage for first layer's FAN features (H_fan_1)
@@ -1226,6 +848,10 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
         if position_ids is None:
@@ -1239,24 +865,20 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             past_key_values=None,
             position_ids=position_ids,
         )
-        linear_attn_mask = self._update_linear_attn_mask(attention_mask, position_ids.squeeze(0))
 
         hidden_states = inputs_embeds
 
-        #
-        position_embeddings = (self.pope_emb, position_ids)
 
         # ResFormer: reset first_layer_fan at the start of each forward pass
         self.first_layer_fan = None
 
         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
-            layer_mask = linear_attn_mask if decoder_layer.layer_type == "linear_attention" else causal_mask
-
             hidden_states = decoder_layer(
                 hidden_states,
                 position_embeddings=position_embeddings,
-                attention_mask=layer_mask,
                 first_layer_fan=self.first_layer_fan,  # Pass H_fan_1 to all layers
                 **kwargs,
             )
@@ -1273,16 +895,6 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             past_key_values=None,
         )
 
-    def _update_linear_attn_mask(self, attention_mask, cache_position):
-        """
-        NOTE: Left-padding is used for linear attention mask.
-        No need for zeroing states when attending to all inputs
-        """
-        linear_attn_mask = attention_mask
-        if attention_mask is not None and torch.all(attention_mask == 1):
-            linear_attn_mask = None
-        return linear_attn_mask
-
 
 @torch.compiler.disable
 def compute_cce_loss(hidden_states, labels, lm_head_weight, lm_head_bias=None, pad_token_id=None):
@@ -1313,13 +925,26 @@ def compute_cce_loss(hidden_states, labels, lm_head_weight, lm_head_bias=None, pad_token_id=None):
 
 
 class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
 
     def __init__(self, config):
         super().__init__(config)
         self.model = NeoLLMModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.post_init()
 
     def forward(
@@ -1376,7 +1001,9 @@ __all__ = [
     "NeoLLMConfig",
     "FANLayer",
     "SeeDNorm",
-    "
 ]
 
 # Register the configuration and model for AutoClass support

 #!/usr/bin/env python3
 """
 NeoLLM Model with FANformer Integration in both Attention and FFN, Dropout Regularization,
+SeeDNorm (Self-Rescaled Dynamic Normalization), ResFormer Value Residual Learning,
+and Learnable Multipliers for enhanced scale adaptation and information flow through deep layers.
 
 Updated to include:
 - Fourier Analysis Network (FAN) layer for effective periodicity modeling in attention (relational space)
 
 - SeeDNorm: Dynamic normalization with input-dependent scaling for better adaptability
 - Dropout regularization at strategic locations
 - ResFormer: Feature residual connections from first layer (applied before projections)
+- Learnable Multipliers: Frees weight matrix scale from WD-noise equilibrium for data-adaptive scaling
+- Full Attention only (linear attention removed)
 """
 
 import math
 
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 from transformers.modeling_layers import GradientCheckpointingLayer
 from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
 from transformers.utils import TransformersKwargs, logging
 from transformers.utils.generic import check_model_inputs
+from configuration_neollm import NeoLLMConfig
+
 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
 
 logger = logging.get_logger(__name__)
 
 
+# ==================== LEARNABLE MULTIPLIERS ====================
+
+class ScalarMultiplier(nn.Module):
+    """
+    Scalar Learnable Multiplier: W̃ = s·W
+
+    From "Learnable Multipliers: Freeing the Scale of Language Model Matrix Layers":
+    Allows the effective matrix norm ||W̃|| = s·||W|| to adapt to data, escaping the
+    WD-noise equilibrium that constrains ||W|| ∝ √(η/λ).
+
+    Args:
+        initial_value: Initial multiplier value (default: 1.0 for identity)
+    """
+    def __init__(self, initial_value: float = 1.0):
+        super().__init__()
+        self.multiplier = nn.Parameter(torch.tensor(initial_value))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.multiplier * x
+
+
+class VectorMultiplier(nn.Module):
+    """
+    Vector Learnable Multipliers: W̃ = diag(r)·W·diag(c)
+
+    From "Learnable Multipliers: Freeing the Scale of Language Model Matrix Layers":
+    Frees not only the overall matrix norm but also individual row/column norms from
+    the WD-noise equilibrium, enabling richer feature scale diversity.
+
+    Args:
+        dim: Dimension size for the multiplier vector
+        multiplier_type: Either "row" or "column"
+        initial_value: Initial multiplier value (default: 1.0)
+    """
+    def __init__(self, dim: int, multiplier_type: str = "row", initial_value: float = 1.0):
+        super().__init__()
+        self.multiplier_type = multiplier_type
+        self.multiplier = nn.Parameter(torch.ones(dim) * initial_value)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Apply row or column multiplier.
+
+        For row multipliers: x shape is (batch, seq, out_features) or (batch, heads, seq, head_dim)
+        For column multipliers: applied before matrix multiplication
+        """
+        if self.multiplier_type == "row":
+            # Broadcast along the last dimension (output features)
+            return x * self.multiplier
+        else:  # column
+            # For column multipliers, typically applied before the linear layer
+            return x * self.multiplier
+
+
+class LinearWithMultipliers(nn.Module):
+    """
+    Linear layer with optional row and/or column learnable multipliers.
+
+    Implements: y = (r ⊙ (W @ (c ⊙ x))) + b
+    where r and c are learnable multipliers and W is the base weight matrix.
+
+    From "Learnable Multipliers: Freeing the Scale of Language Model Matrix Layers":
+    The base matrix W remains subject to WD-noise equilibrium with ||W|| ∝ √(η/λ),
+    while multipliers r, c learn freely to adapt the effective scale to data.
+
+    Args:
+        in_features: Input feature dimension
+        out_features: Output feature dimension
+        bias: Whether to include bias term
+        use_row_multiplier: Enable row (output) multipliers
+        use_column_multiplier: Enable column (input) multipliers
+    """
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = True,
+        use_row_multiplier: bool = False,
+        use_column_multiplier: bool = False
+    ):
+        super().__init__()
+
+        # Base weight matrix (subject to WD)
+        self.linear = nn.Linear(in_features, out_features, bias=bias)
+
+        # Learnable multipliers (NOT subject to WD)
+        self.use_row_multiplier = use_row_multiplier
+        self.use_column_multiplier = use_column_multiplier
+
+        if use_row_multiplier:
+            self.row_multiplier = VectorMultiplier(out_features, multiplier_type="row")
+
+        if use_column_multiplier:
+            self.column_multiplier = VectorMultiplier(in_features, multiplier_type="column")
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Apply column multiplier before linear transformation
+        if self.use_column_multiplier:
+            x = self.column_multiplier(x)
+
+        # Linear transformation with base weights
+        x = self.linear(x)
+
+        # Apply row multiplier after linear transformation
+        if self.use_row_multiplier:
+            x = self.row_multiplier(x)
+
+        return x
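The comments in `LinearWithMultipliers` distinguish the base weights ("subject to WD") from the multipliers ("NOT subject to WD"). That split is an optimizer-side convention rather than something the module enforces; a hedged sketch of one way to honor it (the substring test is illustrative and assumes the attribute names used in these classes):

```python
import torch

def param_groups(model, weight_decay=0.1):
    """Apply weight decay to base weights only; exempt the learnable multipliers."""
    decay, no_decay = [], []
    for name, p in model.named_parameters():
        if not p.requires_grad:
            continue
        (no_decay if "multiplier" in name else decay).append(p)
    return [
        {"params": decay, "weight_decay": weight_decay},
        {"params": no_decay, "weight_decay": 0.0},
    ]

# optimizer = torch.optim.AdamW(param_groups(model), lr=3e-4)
```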
 
 
 # ==================== ORIGINAL COMPONENTS ====================
 
 class FANLayer(nn.Module):
     """
     Fourier Analysis Network (FAN) layer for effective periodicity modeling.
 
     return f"dim={self.dim}, eps={self.eps}"
 
 
+class NeoLLMRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: NeoLLMConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
         self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors."""
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+
+    # Keep half or full tensor for later concatenation
+    rotary_dim = cos.shape[-1]
+    q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
+    k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
+
+    # Apply rotary embeddings on the first half or full tensor
+    q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
+    k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
+
+    # Concatenate back to full shape
+    q_embed = torch.cat([q_embed, q_pass], dim=-1)
+    k_embed = torch.cat([k_embed, k_pass], dim=-1)
+    return q_embed, k_embed
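Since `cos`/`sin` encode a planar rotation per coordinate pair, `apply_rotary_pos_emb` preserves the norm of each rotated sub-vector. A standalone check of that property (the helper is restated locally so the snippet runs on its own):

```python
import torch

def rotate_half(x):
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

d = 8
theta = torch.rand(d // 2)                      # one angle per coordinate pair
cos = torch.cat([theta.cos(), theta.cos()])
sin = torch.cat([theta.sin(), theta.sin()])

q = torch.randn(d)
q_rot = q * cos + rotate_half(q) * sin          # rotary application
assert torch.allclose(q.norm(), q_rot.norm(), atol=1e-6)  # rotation preserves norm
```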
| 379 |
| 380 |
| 381 | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
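The body of repeat_kv is collapsed in this hunk. For reference, the standard Hugging Face implementation this signature matches expands key/value heads from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim):

    def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
        """Equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep)."""
        batch, num_key_value_heads, slen, head_dim = hidden_states.shape
        if n_rep == 1:
            return hidden_states
        hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
        return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)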
| 400 |     dropout: float = 0.0,
| 401 |     **kwargs: Unpack[TransformersKwargs],
| 402 | ):
| 403 |     key_states = repeat_kv(key, module.num_key_value_groups)
| 404 |     value_states = repeat_kv(value, module.num_key_value_groups)
| 405 |
| 406 |     attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
| 407 |     if attention_mask is not None:
| 408 |         causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
| 409 |         attn_weights = attn_weights + causal_mask
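The tail of eager_attention_forward is collapsed here. In recent transformers models the standard continuation is the following (shown for orientation; not necessarily the exact elided lines):

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights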
| 419 | class NeoLLMAttention(nn.Module):
| 420 |     """
| 421 |     Multi-headed attention with FANformer integration, SeeDNorm for Q/K normalization,
| 422 | +     ResFormer feature residual connections, and Learnable Multipliers for enhanced
| 423 | +     information flow and scale adaptation.
| 424 |
| 425 |     ResFormer enhancement: applies a learnable feature residual connection from the first layer
| 426 |     BEFORE the QKV projections: H'_fan_n = λ_1 * H_fan_1 + λ_2 * H_fan_n
| 427 |
| 428 | +     Learnable Multipliers placement (from the "Learnable Multipliers" paper, Appendix C):
| 429 | +     - Q projection: row multipliers only (enables per-head attention scaling in GQA)
| 430 | +     - K, V projections: no multipliers (avoids redundancy with the Q multipliers)
| 431 | +     - Output projection: row + column multipliers (maximally expressive without symmetries)
| 432 |     """
| 433 |
| 434 |     def __init__(self, config: NeoLLMConfig, layer_idx: int):
| 436 |         self.config = config
| 437 |         self.layer_idx = layer_idx
| 438 |         self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
| 439 | +         self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
| 440 |         self.scaling = self.head_dim**-0.5
| 441 |         self.attention_dropout = config.attention_dropout
| 442 |         self.is_causal = True
| 450 |         # Calculate the output dimension after the FAN transformation
| 451 |         fan_output_dim = config.hidden_size + int(config.hidden_size * getattr(config, 'fan_ratio', 0.125))
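For example, with a hypothetical hidden_size of 4096 and the default fan_ratio of 0.125, fan_output_dim = 4096 + int(4096 * 0.125) = 4608: the FAN layer appends its periodic features to the hidden representation rather than replacing it, so every projection below takes the widened input.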
| 452 |
| 453 | +         # Q projection with row multipliers (per-head scaling capability)
| 454 | +         self.q_proj = LinearWithMultipliers(
| 455 | +             fan_output_dim,
| 456 | +             config.num_attention_heads * self.head_dim * 2,
| 457 | +             bias=config.attention_bias,
| 458 | +             use_row_multiplier=True,
| 459 | +             use_column_multiplier=False
| 460 |         )
| 461 | +
| 462 | +         # K, V projections without multipliers (avoids Q-K symmetry)
| 463 |         self.k_proj = nn.Linear(
| 464 | +             fan_output_dim, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
| 465 |         )
| 466 |         self.v_proj = nn.Linear(
| 467 | +             fan_output_dim, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
| 468 |         )
| 469 | +
| 470 | +         # Output projection with row + column multipliers (maximally expressive)
| 471 | +         self.o_proj = LinearWithMultipliers(
| 472 | +             config.num_attention_heads * self.head_dim,
| 473 | +             config.hidden_size,
| 474 | +             bias=config.attention_bias,
| 475 | +             use_row_multiplier=True,
| 476 | +             use_column_multiplier=True
| 477 |         )
| 478 |
| 479 |         # SeeDNorm for Q/K normalization (replaces RMSNorm)
| 480 |         self.q_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
| 481 |         self.k_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
| 483 |         # Dropout for the attention output
| 484 |         self.dropout = nn.Dropout(config.dropout_rate)
| 485 |
| 496 |         **kwargs: Unpack[FlashAttentionKwargs],
| 497 |     ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
| 498 |         input_shape = hidden_states.shape[:-1]
| 499 |
| 500 |         # Apply the FANformer transformation first
| 501 |         hidden_states_fan = self.fan_layer(hidden_states)
| 502 |
| 503 |         # ResFormer: apply the feature residual connection BEFORE the projections
| 504 | +         # This ensures dimensional compatibility across all layer types
| 505 |         if first_layer_fan is not None:
| 506 |             hidden_states_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * hidden_states_fan
| 507 |
| 508 |         # Store the current FAN features for use as first_layer_fan in subsequent layers
| 509 |         current_layer_fan = hidden_states_fan.clone()
| 510 |
| 511 | +         hidden_shape = (*input_shape, -1, self.head_dim)
| 512 | +
| 513 | +         # Use the FAN-transformed features (with the residual applied) for the projections.
| 514 | +         # Q projection with learnable row multipliers; half of its channels become a gate
| 515 |         query_states, gate = torch.chunk(
| 516 | +             self.q_proj(hidden_states_fan).view(*input_shape, -1, self.head_dim * 2), 2, dim=-1
| 517 |         )
| 518 | +         gate = gate.reshape(*input_shape, -1)
| 519 | +
| 520 | +         # Apply SeeDNorm to Q and K
| 521 | +         query_states = self.q_norm(query_states.view(hidden_shape)).transpose(1, 2)
| 522 | +         key_states = self.k_norm(self.k_proj(hidden_states_fan).view(hidden_shape)).transpose(1, 2)
| 523 | +         value_states = self.v_proj(hidden_states_fan).view(hidden_shape).transpose(1, 2)
| 524 | +
| 525 | +         cos, sin = position_embeddings
| 526 | +         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
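The q_proj above emits 2 × head_dim channels per head; torch.chunk splits them into the query proper and a gate that later modulates the attention output through a sigmoid (line 544 below). A toy shape trace with hypothetical sizes (batch=2, seq=5, heads=4, head_dim=8):

    import torch

    fused = torch.randn(2, 5, 4, 8 * 2)          # q_proj output, viewed per head
    query, gate = torch.chunk(fused, 2, dim=-1)  # each: (2, 5, 4, 8)
    gate = gate.reshape(2, 5, -1)                # flattened to (2, 5, 32) to gate attn_output later
    print(query.shape, gate.shape)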
| 527 | +
| 528 |         attention_interface: Callable = eager_attention_forward
| 529 |         if self.config._attn_implementation != "eager":
| 530 |             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
| 540 |             **kwargs,
| 541 |         )
| 542 |
| 543 | +         attn_output = attn_output.reshape(*input_shape, -1).contiguous()
| 544 |         attn_output = attn_output * torch.sigmoid(gate)
| 545 |
| 546 | +         # Output projection with learnable row + column multipliers
| 547 |         attn_output = self.o_proj(attn_output)
| 548 |         attn_output = self.dropout(attn_output)
| 549 |
| 550 |         return attn_output, attn_weights, current_layer_fan
| 551 |
| 552 |
| 553 | class PolyNorm(torch.nn.Module):
| 554 |     def __init__(self, eps=1e-6):
| 555 |         super(PolyNorm, self).__init__()
| 566 |
| 567 | class NeoLLMMLP(nn.Module):
| 568 |     """
| 569 | +     MLP with FANformer integration for featural periodicity modeling and
| 570 | +     Learnable Multipliers for adaptive scale control.
| 571 |
| 572 |     This captures periodicities in the feature space (semantic/embedding dimensions),
| 573 |     complementary to the relational periodicities captured by the attention mechanism.
| 574 |     Works in conjunction with ResFormer for comprehensive information flow.
| 575 | +
| 576 | +     Learnable Multipliers placement (from the "Learnable Multipliers" paper, Appendix C):
| 577 | +     - gate_proj: row multipliers only (controls the gating mechanism's scale)
| 578 | +     - up_proj: no multipliers (avoids redundancy with down_proj)
| 579 | +     - down_proj: row + column multipliers (maximally expressive output scaling)
| 580 |     """
| 581 |     def __init__(self, config):
| 582 |         super().__init__()
| 584 |         self.hidden_size = config.hidden_size
| 585 |         self.intermediate_size = config.intermediate_size
| 586 |
| 587 | +         # FANformer integration for feature-space periodicity
| 588 |         self.fan_layer = FANLayer(
| 589 |             hidden_size=config.hidden_size,
| 590 |             fan_ratio=getattr(config, 'fan_ratio_ffn', 0.0625)  # Half of the attention fan_ratio
| 593 |         # Calculate the output dimension after the FAN transformation
| 594 |         fan_output_dim = config.hidden_size + int(config.hidden_size * getattr(config, 'fan_ratio_ffn', 0.0625))
| 595 |
| 596 | +         # SwiGLU-style gated architecture with learnable multipliers
| 597 | +         # gate_proj: row multipliers for gating scale control
| 598 | +         self.gate_proj = LinearWithMultipliers(
| 599 | +             fan_output_dim,
| 600 | +             self.intermediate_size,
| 601 | +             bias=False,
| 602 | +             use_row_multiplier=True,
| 603 | +             use_column_multiplier=False
| 604 | +         )
| 605 | +
| 606 | +         # up_proj: no multipliers (avoids redundancy)
| 607 |         self.up_proj = nn.Linear(fan_output_dim, self.intermediate_size, bias=False)
| 608 | +
| 609 | +         # down_proj: row + column multipliers (maximally expressive)
| 610 | +         self.down_proj = LinearWithMultipliers(
| 611 | +             self.intermediate_size,
| 612 | +             self.hidden_size,
| 613 | +             bias=False,
| 614 | +             use_row_multiplier=True,
| 615 | +             use_column_multiplier=True
| 616 | +         )
| 617 | +
| 618 |         self.act_fn = PolyNorm()
| 619 |
| 620 |         # Dropout for the MLP hidden layer
| 621 |         self.dropout = nn.Dropout(config.dropout_rate)
| 622 |
| 623 |     def forward(self, x):
| 624 | +         # Apply the FAN transformation before the projections
| 625 |         x_fan = self.fan_layer(x)
| 626 |
| 627 |         # Use the FAN-transformed features for the gate and up projections
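The rest of this forward is collapsed in the diff. Given the SwiGLU-style modules declared in __init__, the elided lines presumably follow the usual gated pattern, roughly:

    gate = self.act_fn(self.gate_proj(x_fan))  # PolyNorm-activated gate (row multipliers inside gate_proj)
    up = self.up_proj(x_fan)
    hidden = self.dropout(gate * up)           # elementwise gating, then dropout
    return self.down_proj(hidden)              # row + column multipliers inside down_proj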
| 633 |
| 634 |
| 635 | class NeoLLMDecoderLayer(GradientCheckpointingLayer):
| 636 | +     """
| 637 | +     Decoder layer with standard residual connections.
| 638 | +
| 639 | +     Architecture:
| 640 | +     1. Pre-norm (SeeDNorm) → LNS scaling → self-attention with ResFormer and Learnable Multipliers
| 641 | +     2. Standard residual connection (simple sum)
| 642 | +     3. GPAS activation scaling
| 643 | +     4. Pre-norm (SeeDNorm) → LNS scaling → MLP with FANformer and Learnable Multipliers
| 644 | +     5. Standard residual connection (simple sum)
| 645 | +     6. GPAS activation scaling
| 646 | +     """
| 647 | +
| 648 |     def __init__(self, config: NeoLLMConfig, layer_idx: int):
| 649 |         super().__init__()
| 650 |         self.hidden_size = config.hidden_size
| 651 |         self.layer_idx = layer_idx
| 652 |
| 653 | +         # Full attention with learnable multipliers
| 654 | +         self.self_attn = NeoLLMAttention(config, layer_idx)
| 655 |
| 656 | +         # MLP with FANformer integration and learnable multipliers
| 657 |         self.mlp = NeoLLMMLP(config)
| 658 |
| 659 |         # SeeDNorm for input and post-attention normalization (replaces RMSNorm)
| 679 |         first_layer_fan: Optional[torch.Tensor] = None,
| 680 |         **kwargs: Unpack[FlashAttentionKwargs],
| 681 |     ) -> torch.FloatTensor:
| 682 | +         # ============================================================
| 683 | +         # Attention block with standard residual connection
| 684 | +         # ============================================================
| 685 |         residual = hidden_states
| 686 |
| 687 |         # Apply SeeDNorm normalization
| 690 |         # Apply LNS scaling after normalization
| 691 |         hidden_states = self.lns_attn(hidden_states)
| 692 |
| 693 | +         # Self-attention with ResFormer feature residual connections and learnable multipliers
| 694 | +         hidden_states, _, self.current_layer_fan = self.self_attn(
| 695 | +             hidden_states=hidden_states,
| 696 | +             attention_mask=attention_mask,
| 697 | +             position_embeddings=position_embeddings,
| 698 | +             first_layer_fan=first_layer_fan,
| 699 | +             **kwargs,
| 700 | +         )
| 701 |
| 702 |         # Standard residual connection
| 703 |         hidden_states = residual + hidden_states
| 705 |         # Apply GPAS after the attention residual connection
| 706 |         hidden_states = self.gpas_attn(hidden_states)
| 707 |
| 708 | +         # ============================================================
| 709 | +         # MLP block with standard residual connection
| 710 | +         # ============================================================
| 711 |         residual = hidden_states
| 712 |         hidden_states = self.post_attention_layernorm(hidden_states)
| 713 |
| 714 |         # Apply LNS scaling after normalization
| 715 |         hidden_states = self.lns_mlp(hidden_states)
| 716 |
| 717 | +         # The MLP applies the FAN transformation and learnable multipliers internally
| 718 |         hidden_states = self.mlp(hidden_states)
| 719 |
| 720 |         # Standard residual connection
| 727 |
| 728 |
| 729 | class NeoLLMPreTrainedModel(PreTrainedModel):
| 730 | +     """
| 731 | +     Base class for NeoLLM models with custom weight initialization.
| 732 | +
| 733 | +     Handles initialization for:
| 734 | +     - NeoLLMAttention (ResFormer lambda parameters)
| 735 | +     - GPAS (Gradient-Preserving Activation Scaling)
| 736 | +     - FANLayer (Fourier Analysis Network)
| 737 | +     - SeeDNorm (Self-Rescaled Dynamic Normalization)
| 738 | +     - Learnable Multipliers (ScalarMultiplier, VectorMultiplier)
| 739 | +     """
| 740 |     config: NeoLLMConfig
| 741 |     base_model_prefix = "model"
| 742 |     supports_gradient_checkpointing = True
| 746 |     _is_stateful = True
| 747 |
| 748 |     def _init_weights(self, module):
| 749 | +         """
| 750 | +         Initialize weights for all custom modules in NeoLLM.
| 751 | +
| 752 | +         Strategy:
| 753 | +         - Standard layers (Linear, Embedding): handled by the parent class
| 754 | +         - Custom modules: specialized initialization per component
| 755 | +         - Learnable Multipliers: initialized to 1.0 for an identity transformation
| 756 | +         """
| 757 |         super()._init_weights(module)
| 758 | +
| 759 | +         if isinstance(module, NeoLLMAttention):
| 760 |             # ResFormer: initialize the lambda parameters for full attention.
| 761 | +             # The lambda values control the interpolation between first-layer and current-layer features;
| 762 | +             # starting at 0.5 gives a balanced contribution from both sources
| 763 |             if hasattr(module, 'lambda_1'):
| 764 |                 module.lambda_1.data.fill_(0.5)
| 765 |             if hasattr(module, 'lambda_2'):
| 766 |                 module.lambda_2.data.fill_(0.5)
| 767 | +
| 768 |         elif isinstance(module, GPAS):
| 769 |             # Initialize the GPAS alpha to 0, as in the paper:
| 770 | +             # training starts with no activation scaling and the model learns it gradually
| 771 |             module.alpha.data.fill_(0.0)
| 772 | +
| 773 |         elif isinstance(module, FANLayer):
| 774 | +             # FANLayer initialization is handled within the class __init__
| 775 | +             # (normal initialization with std=0.02 for the weights)
| 776 |             pass
| 777 | +
| 778 |         elif isinstance(module, SeeDNorm):
| 779 | +             # SeeDNorm parameters are already initialized correctly in __init__:
| 780 | +             # gamma (γ) initialized to 1 (static scaling component, as in RMSNorm),
| 781 | +             # beta (β) initialized to 0 (self-rescaling starts disabled),
| 782 | +             # alpha (α) initialized to 1 (dynamic modulation at full strength)
| 783 |             pass
| 784 | +
| 785 | +         elif isinstance(module, (ScalarMultiplier, VectorMultiplier)):
| 786 | +             # Learnable Multipliers: initialize to 1.0 for an identity transformation,
| 787 | +             # so the model starts from standard behavior and learns
| 788 | +             # scale adaptations from the data without an initial bias
| 789 | +             if hasattr(module, 'multiplier'):
| 790 | +                 module.multiplier.data.fill_(1.0)
| 791 |
| 792 | class NeoLLMModel(NeoLLMPreTrainedModel):
| 793 | +     """
| 794 | +     NeoLLM base model with a transformer decoder architecture.
| 795 | +
| 796 | +     Note on embeddings and weight tying: this model ties the weights of
| 797 | +     embed_tokens and lm_head (shared weights). Following the "Learnable Multipliers"
| 798 | +     paper's analysis, we do NOT add multipliers to the embeddings because:
| 799 | +
| 800 | +     1. Weight tying creates conflicting gradient paths: multipliers would scale
| 801 | +        gradients from the embedding lookup but not from the lm_head projection,
| 802 | +        so the multiplier would receive incomplete optimization signals.
| 803 | +
| 804 | +     2. The paper explicitly warns against multipliers in lm_head (they create shortcuts
| 805 | +        for learning the marginal token distribution), and with weight tying this
| 806 | +        restriction propagates to the embeddings.
| 807 | +
| 808 | +     3. Compensating mechanisms provide scale adaptation immediately after the embedding:
| 809 | +        - the first layer's attention has multipliers in its Q/O projections
| 810 | +        - FANformer transforms the representation space
| 811 | +        - SeeDNorm provides input-dependent dynamic scaling
| 812 | +        - ResFormer propagates first-layer features with learnable scaling
| 813 | +     """
| 814 | +
| 815 |     def __init__(self, config: NeoLLMConfig):
| 816 |         super().__init__(config)
| 817 | +
| 818 | +         # Standard embedding without learnable multipliers.
| 819 | +         # Due to the weight tying with lm_head, multipliers would create
| 820 | +         # conflicting optimization dynamics (see the class docstring)
| 821 |         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
| 822 |
| 823 |         # Each layer creates its own components (no shared parameters)
| 824 |         self.layers = nn.ModuleList(
| 825 |             [NeoLLMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
| 826 |         )
| 827 | +
| 828 |         # SeeDNorm for final output normalization (replaces RMSNorm)
| 829 |         self.norm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
| 830 | +         self.rotary_emb = NeoLLMRotaryEmbedding(config=config)
| 831 |         self.gradient_checkpointing = False
| 832 |
| 833 |         # ResFormer: storage for the first layer's FAN features (H_fan_1)
| 848 |             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
| 849 |
| 850 |         if inputs_embeds is None:
| 851 | +             # Standard embedding lookup without multipliers.
| 852 | +             # Scale adaptation happens in the subsequent layers via:
| 853 | +             # (1) first-layer attention multipliers, (2) the FANformer transformation,
| 854 | +             # (3) SeeDNorm dynamic scaling, (4) ResFormer feature propagation
| 855 |             inputs_embeds = self.embed_tokens(input_ids)
| 856 |
| 857 |         if position_ids is None:
| 865 |             past_key_values=None,
| 866 |             position_ids=position_ids,
| 867 |         )
| 868 |
| 869 |         hidden_states = inputs_embeds
| 870 |
| 871 | +         # Create the position embeddings once and share them across the decoder layers
| 872 | +         position_embeddings = self.rotary_emb(hidden_states, position_ids)
| 873 |
| 874 |         # ResFormer: reset first_layer_fan at the start of each forward pass
| 875 |         self.first_layer_fan = None
| 876 |
| 877 |         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
| 878 |             hidden_states = decoder_layer(
| 879 |                 hidden_states,
| 880 |                 position_embeddings=position_embeddings,
| 881 | +                 attention_mask=causal_mask,
| 882 |                 first_layer_fan=self.first_layer_fan,  # Pass H_fan_1 to all layers
| 883 |                 **kwargs,
| 884 |             )
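The loop lines that capture H_fan_1 are collapsed here. Since NeoLLMDecoderLayer stores self.current_layer_fan during its forward, the elided update is presumably along these lines:

    if self.first_layer_fan is None:
        # cache the first layer's FAN features (H_fan_1) for all subsequent layers
        self.first_layer_fan = decoder_layer.current_layer_fan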
| 895 |             past_key_values=None,
| 896 |         )
| 897 |
| 898 |
| 899 | @torch.compiler.disable
| 900 | def compute_cce_loss(hidden_states, labels, lm_head_weight, lm_head_bias=None, pad_token_id=None):
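The loss body is collapsed in this diff; the name suggests a Cut-Cross-Entropy-style computation that avoids materializing the full logit matrix. As a plain, memory-unoptimized reference with the same signature and the usual causal shift (the function name and all details here are assumptions, not the elided code):

    import torch.nn.functional as F

    def reference_cce_loss(hidden_states, labels, lm_head_weight, lm_head_bias=None, pad_token_id=None):
        # project to logits explicitly (the optimized version would avoid this)
        logits = F.linear(hidden_states[:, :-1, :], lm_head_weight, lm_head_bias)
        targets = labels[:, 1:].contiguous()  # tokens < n predict token n
        ignore_index = pad_token_id if pad_token_id is not None else -100
        return F.cross_entropy(
            logits.reshape(-1, logits.size(-1)).float(),
            targets.reshape(-1),
            ignore_index=ignore_index,
        )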
| 925 |
| 926 |
| 927 | class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
| 928 | +     """
| 929 | +     Causal language model with the NeoLLM architecture.
| 930 | +
| 931 | +     Note on the LM head: following the "Learnable Multipliers" paper's recommendations,
| 932 | +     the output projection (lm_head) does NOT include learnable multipliers because:
| 933 | +     1. The preceding norm layer (self.model.norm, a SeeDNorm) already acts as column multipliers
| 934 | +     2. Adding row multipliers to lm_head can create shortcuts where the model
| 935 | +        learns the marginal token distribution without updating its internal features
| 936 | +     """
| 937 |     _tied_weights_keys = ["lm_head.weight"]
| 938 |
| 939 |     def __init__(self, config):
| 940 |         super().__init__(config)
| 941 |         self.model = NeoLLMModel(config)
| 942 |         self.vocab_size = config.vocab_size
| 943 | +
| 944 | +         # LM head without learnable multipliers (a standard linear layer);
| 945 | +         # the preceding norm layer provides sufficient scale adaptation
| 946 |         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
| 947 | +
| 948 |         self.post_init()
| 949 |
| 950 |     def forward(
| 1001 |     "NeoLLMConfig",
| 1002 |     "FANLayer",
| 1003 |     "SeeDNorm",
| 1004 | +     "ScalarMultiplier",
| 1005 | +     "VectorMultiplier",
| 1006 | +     "LinearWithMultipliers",
| 1007 | ]
| 1008 |
| 1009 | # Register the configuration and model for AutoClass support
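With the AutoClass registration in place, a checkpoint that ships this file loads through the standard API (the repo id below is a placeholder):

    from transformers import AutoModelForCausalLM, AutoTokenizer

    # hypothetical repo id; custom modeling code requires trust_remote_code=True
    model = AutoModelForCausalLM.from_pretrained("your-org/NeoLLM", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained("your-org/NeoLLM")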