Update modeling_neollm.py
modeling_neollm.py CHANGED (+327 −504)
@@ -1,18 +1,7 @@
 #!/usr/bin/env python3
 """
-NeoLLM
-
-Learnable Multipliers for enhanced scale adaptation and information flow through deep layers,
-and StackMemory for hierarchical pattern modeling.
-Updated to include:
-- Fourier Analysis Network (FAN) layer for effective periodicity modeling in attention (relational space)
-- FAN layer in FFN for featural periodicity modeling (complementary coverage)
-- SeeDNorm: Dynamic normalization with input-dependent scaling for better adaptability
-- Dropout regularization at strategic locations
-- ResFormer: Feature residual connections from first layer (applied before projections)
-- Learnable Multipliers: Frees weight matrix scale from WD-noise equilibrium for data-adaptive scaling
-- StackMemory: Differentiable hidden state stack for modeling Chomsky hierarchy grammars
-- Full Attention only (linear attention removed)
+NeoLLM model with FANformer, SeeDNorm, ResFormer, Learnable Multipliers,
+and full attention augmented with optional Momentum, MEA, and LUCID operators.
 """
 
 import math
@@ -36,7 +25,6 @@ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
 from transformers.utils import TransformersKwargs, logging
-from transformers.utils.generic import check_model_inputs
 from configuration_neollm import NeoLLMConfig
 
 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
@@ -259,6 +247,7 @@ class SeeDNorm(nn.Module):
     Self-Rescaled Dynamic Normalization (SeeDNorm) with dual dropout regularization.
 
     SeeDNorm(x) = [σ(x·β^T)·α + γ] ⊙ x/RMS(x)
+
 
     Args:
         dim: Hidden dimension size
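As an editorial aid, here is a minimal runnable sketch of the recipe above. The class internals are not part of this hunk, so the parameter names (alpha, beta, gamma), their initializations, and the module layout are assumptions; the rescale nonlinearity follows the tanh used in the forward pass shown below, where the docstring writes σ.

import torch
import torch.nn as nn

class SeeDNormSketch(nn.Module):
    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.zeros(1))   # strength of the dynamic rescale term
        self.beta = nn.Parameter(torch.zeros(dim))  # projection producing the rescale factor
        self.gamma = nn.Parameter(torch.ones(dim))  # static gain, as in plain RMSNorm

    def forward(self, x):
        # Input-dependent rescale factor from the x·β^T inner product
        rescale = torch.tanh((x * self.beta).sum(dim=-1, keepdim=True))
        # RMS normalization of the original input
        x_normed = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)
        # [rescale·α + γ] ⊙ x/RMS(x)
        return (rescale * self.alpha + self.gamma) * x_normed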
@@ -300,7 +289,7 @@ class SeeDNorm(nn.Module):
         Normalized and dynamically scaled tensor of same shape
         """
 
-        x_for_dynamic = F.dropout(x, p=self.dropout_input
+        x_for_dynamic = F.dropout(x, p=self.dropout_input)
        rescale_factor = torch.tanh(torch.sum(x_for_dynamic * self.beta,
                                              dim=-1, keepdim=True))
@@ -310,7 +299,7 @@ class SeeDNorm(nn.Module):
         # Apply RMS normalization on ORIGINAL input (not dropped version)
         x_normalized = self._rms_norm(x.float())
 
-        x_normalized = F.dropout(x_normalized, p=self.dropout_hidden
+        x_normalized = F.dropout(x_normalized, p=self.dropout_hidden)
 
         # Apply dynamic scaling
         output = x_normalized * dynamic_scale.float()
@@ -320,263 +309,6 @@ class SeeDNorm(nn.Module):
     def extra_repr(self) -> str:
         return (f"dim={self.dim}, eps={self.eps}, "
                 f"dropout_input={self.dropout_input}, dropout_hidden={self.dropout_hidden}")
-
-
-# ==================== STACK MEMORY MODULE ====================
-class StackMemory(nn.Module):
-    """
-    From "Improving Formal Reasoning of Transformer with State Stack":
-    Implements a multi-head differentiable stack with soft push, pop, and no-op operations.
-    Each head maintains its own stack and mask, which are updated based on learned action
-    probabilities. Global reading is performed via query-over-stack attention.
-
-    This module is inserted between Transformer layers to augment information flow with
-    stack-like memory operations, enabling the model to better capture hierarchical and
-    recursive patterns characteristic of regular expressions and context-free grammars.
-
-    Note: StackMemory uses standard nn.Linear to maintain architectural
-    independence and avoid introducing additional complexity in the memory operations.
-
-    Args:
-        config: Model configuration containing stack-related hyperparameters
-    """
-
-    def __init__(self, config: NeoLLMConfig):
-        super().__init__()
-        self.config = config
-        self.num_stack_heads = getattr(config, 'num_stack_heads', 4)
-        self.stack_slots = getattr(config, 'stack_slots', 24)
-        self.stack_d_model = getattr(config, 'stack_d_model', 128)
-
-        self.head_dim = self.stack_d_model // self.num_stack_heads
-
-        # Dimension reduction projections for efficiency
-        # Uses standard nn.Linear
-        self.down_proj = nn.Linear(config.hidden_size, self.stack_d_model, bias=True)
-        self.up_proj = nn.Linear(self.stack_d_model, config.hidden_size, bias=True)
-
-        # Action prediction: generates push/pop/no-op probabilities for each head
-        self.action_head = nn.Linear(self.stack_d_model, 3 * self.num_stack_heads, bias=True)
-
-        # Query projection for global reading (one per head)
-        self.gate_proj = nn.Linear(self.head_dim, 1, bias=True)
-
-        # Residual weight for gating stack contribution
-        self.res_weight = nn.Parameter(torch.ones(1))
-
-        # Cache for autoregressive generation (matches OLMo reference)
-        self.cache_size = getattr(config, "cache_size", 2048)
-        # Initialization fix: Register buffers for cache
-        # Default to batch_size=1 if forward_bs is not in config (standard inference)
-        forward_bs = getattr(config, 'forward_bs', 1)
-        self.register_buffer("k_cache", torch.zeros(forward_bs, self.cache_size, self.num_stack_heads, self.head_dim))
-        self.register_buffer("action_cache", torch.zeros(forward_bs, self.cache_size, self.num_stack_heads, 3))
-
-        self.cache_position = 0
-        self.enable_cache = False
-
-    def reset_cache(self):
-        self.cache_position = 0
-
-    def _vectorized_update(
-        self,
-        stack: torch.Tensor,
-        mask: torch.Tensor,
-        actions: torch.Tensor,
-        k_values: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Vectorized stack update mechanism applying soft push/pop/no-op operations.
-
-        Implements the differentiable stack operations from the paper:
-        - Push: shifts all elements down and places k_values at top
-        - Pop: shifts all elements up and removes top
-        - No-op: maintains current stack state
-
-        Args:
-            stack: Current stack state [batch, seq, num_heads, stack_slots, head_dim]
-            mask: Current stack mask [batch, seq, num_heads, stack_slots]
-            actions: Action probabilities [batch, seq, num_heads, 3] (push/pop/no-op)
-            k_values: New values to push [batch, seq, num_heads, head_dim]
-
-        Returns:
-            Tuple of (updated_stack, updated_mask)
-        """
-        batch_size, seq_len = actions.shape[:2]
-
-        # Expand stack and mask along sequence dimension for parallel processing
-        # Only expand if checking against initial state dimensions (4D)
-        if stack.dim() == 4:
-            stack = stack.unsqueeze(1).expand(-1, seq_len, -1, -1, -1)
-            mask = mask.unsqueeze(1).expand(-1, seq_len, -1, -1)
-
-        # Generate pushed stack: new value at top, shift others down
-        push_stack = torch.cat([
-            k_values.unsqueeze(3),  # New value at position 0
-            stack[:, :, :, :-1]     # Shift existing elements down
-        ], dim=3)
-        push_mask = torch.cat([
-            torch.ones_like(mask[:, :, :, :1]),
-            mask[:, :, :, :-1]
-        ], dim=3)
-
-        # Generate popped stack: shift all up, zero at bottom
-        pop_stack = torch.cat([
-            stack[:, :, :, 1:],
-            torch.zeros_like(stack[:, :, :, :1])
-        ], dim=3)
-        pop_mask = torch.cat([
-            mask[:, :, :, 1:],
-            torch.zeros_like(mask[:, :, :, :1])
-        ], dim=3)
-
-        # Combine operations weighted by action probabilities
-        action_weights = actions.unsqueeze(-1).unsqueeze(-1)         # [batch, seq, heads, 3, 1, 1]
-        stacks = torch.stack([push_stack, pop_stack, stack], dim=3)  # [batch, seq, heads, 3, slots, dim]
-        masks = torch.stack([push_mask, pop_mask, mask], dim=3)      # [batch, seq, heads, 3, slots]
-
-        # Weighted combination of all operations
-        new_stack = (stacks * action_weights).sum(dim=3)
-        new_mask = (masks * action_weights.squeeze(-1)).sum(dim=3)
-
-        return new_stack, new_mask
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        stack: Optional[torch.Tensor] = None,
-        mask: Optional[torch.Tensor] = None
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        """
-        Apply differentiable stack operations to hidden states.
-
-        Args:
-            hidden_states: Input hidden states [batch, seq, hidden_size]
-            stack: Previous stack state [batch, num_heads, stack_slots, head_dim] or None
-            mask: Previous stack mask [batch, num_heads, stack_slots] or None
-
-        Returns:
-            Tuple of (output_hidden_states, updated_stack, updated_mask)
-        """
-        batch_size, seq_len, _ = hidden_states.shape
-        device = hidden_states.device
-
-        # Initialize stack and mask if not provided
-        if stack is None:
-            stack = torch.zeros(
-                batch_size, self.num_stack_heads, self.stack_slots, self.head_dim,
-                device=device, dtype=hidden_states.dtype
-            )
-        if mask is None:
-            mask = torch.zeros(
-                batch_size, self.num_stack_heads, self.stack_slots,
-                device=device, dtype=hidden_states.dtype
-            )
-
-        # Project to lower dimension for efficiency
-        new_hidden_states = self.down_proj(hidden_states)
-
-        # Generate action probabilities: [batch, seq, num_heads, 3]
-        action_logits = self.action_head(new_hidden_states) / math.sqrt(self.head_dim)
-        actions = F.softmax(
-            action_logits.view(batch_size, seq_len, self.num_stack_heads, 3),
-            dim=-1
-        )
-
-        # Prepare values to push (split into heads)
-        k_values = new_hidden_states.view(batch_size, seq_len, self.num_stack_heads, self.head_dim)
-
-        # Update stack and mask using vectorized operations
-        new_stack, new_mask = self._vectorized_update(stack, mask, actions, k_values)
-
-        # Global reading via query-over-stack attention
-        gate_scores = self.gate_proj(new_stack).squeeze(-1)  # [batch, seq, heads, slots]
-
-        gate_weights = F.softmax(gate_scores + (1 - new_mask) * -1e9, dim=-1)
-
-        # Weighted sum over stack slots
-        memory_output = (new_stack * gate_weights.unsqueeze(-1)).sum(dim=3)
-        memory_output = memory_output.view(batch_size, seq_len, -1)
-
-        memory_output = self.up_proj(memory_output)
-
-        # Residual Connection
-        output = memory_output * self.res_weight + hidden_states
-
-        # Update Cache Logic
-        if self.enable_cache:
-            self._update_cache(k_values.detach(), actions.detach())
-
-        return output, new_stack[:, -1], new_mask[:, -1]
-
-    def _update_cache(self, k_values: torch.Tensor, actions: torch.Tensor):
-        seq_len = k_values.shape[1]
-        if self.cache_position + seq_len <= self.cache_size:
-            # Assumes standard batch processing for inference (usually batch_size=1)
-            self.k_cache[:, self.cache_position:self.cache_position+seq_len] = k_values
-            self.action_cache[:, self.cache_position:self.cache_position+seq_len] = actions
-            self.cache_position += seq_len
-        else:
-            self.reset_cache()
-
-    def step(self, hidden_state: torch.Tensor, stack: torch.Tensor, mask: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        if not self.enable_cache:
-            return self.forward(hidden_state.unsqueeze(1), stack, mask)
-
-        batch_size = hidden_state.shape[0]
-
-        # Compute features for current token
-        new_hidden_states = self.down_proj(hidden_state)
-
-        action_logits = self.action_head(new_hidden_states) / math.sqrt(self.head_dim)
-        current_actions = F.softmax(
-            action_logits.view(batch_size, 1, self.num_stack_heads, 3),
-            dim=-1
-        )
-
-        current_k = new_hidden_states.view(batch_size, 1, self.num_stack_heads, self.head_dim)
-
-        # Reconstruct History
-        if self.cache_position > 0:
-            cached_k = self.k_cache[:, :self.cache_position]
-            cached_actions = self.action_cache[:, :self.cache_position]
-
-            k_values = torch.cat([cached_k, current_k], dim=1)
-            actions = torch.cat([cached_actions, current_actions], dim=1)
-        else:
-            k_values = current_k
-            actions = current_actions
-
-        # Dimension Fix: Pass sequences directly without unsqueeze(0)
-        # k_values is [batch, seq_len_total, heads, dim]
-        # actions is [batch, seq_len_total, heads, 3]
-
-        new_stack_seq, new_mask_seq = self._vectorized_update(
-            stack,  # Initial stack [batch, heads, slots, dim]
-            mask,
-            actions,
-            k_values
-        )
-
-        # Extract last step
-        current_stack = new_stack_seq[:, -1]
-        current_mask = new_mask_seq[:, -1]
-
-        gate_scores = self.gate_proj(current_stack).squeeze(-1)
-        gate_weights = F.softmax(gate_scores + (1 - current_mask) * -1e9, dim=-1)
-
-        memory_output = (current_stack * gate_weights.unsqueeze(-1)).sum(dim=2)
-        memory_output = memory_output.view(batch_size, -1)
-
-        memory_output_proj = self.up_proj(memory_output)
-
-        self._update_cache(current_k, current_actions)
-
-        return (
-            memory_output_proj * self.res_weight + hidden_state,
-            current_stack,
-            current_mask
-        )
 # ==================== ROTARY EMBEDDING ====================
 class NeoLLMRotaryEmbedding(nn.Module):
     inv_freq: torch.Tensor  # fix linting for `register_buffer`
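The deleted class is long, so here is the soft push/pop/no-op update at its core, reduced to one head and one timestep. This is an illustrative reconstruction rather than the removed module; the names are invented for the sketch.

import torch

def soft_stack_update(stack: torch.Tensor, new_top: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
    """stack: [slots, dim]; new_top: [dim]; actions: softmax probs over (push, pop, no-op)."""
    push = torch.cat([new_top.unsqueeze(0), stack[:-1]])       # shift down, new value on top
    pop = torch.cat([stack[1:], torch.zeros_like(stack[:1])])  # shift up, zero fills the bottom
    noop = stack
    return actions[0] * push + actions[1] * pop + actions[2] * noop

stack = torch.zeros(4, 8)
stack = soft_stack_update(stack, torch.ones(8), torch.tensor([1.0, 0.0, 0.0]))  # hard push
print(stack[0].sum())  # tensor(8.): the pushed value sits on top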
@@ -662,9 +394,6 @@ class NeoLLMRotaryEmbedding(nn.Module):
         sin = emb.sin() * self.attention_scaling
 
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
-
-
-
 def rotate_half(x):
     """Rotates half the hidden dims of the input."""
     x1 = x[..., : x.shape[-1] // 2]
@@ -677,16 +406,13 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     cos = cos.unsqueeze(unsqueeze_dim)
     sin = sin.unsqueeze(unsqueeze_dim)
 
-    # Keep half or full tensor for later concatenation
     rotary_dim = cos.shape[-1]
     q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
     k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
 
-    # Apply rotary embeddings on the first half or full tensor
     q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
     k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
 
-    # Concatenate back to full shape
     q_embed = torch.cat([q_embed, q_pass], dim=-1)
     k_embed = torch.cat([k_embed, k_pass], dim=-1)
     return q_embed, k_embed
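The function above applies RoPE only to the first rotary_dim channels and concatenates the untouched tail back on. A small self-contained check of that partial-rotary split, with rotate_half restated and cos=1, sin=0 chosen so the rotation is the identity:

import torch

def rotate_half(x):
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

q = torch.randn(1, 2, 5, 8)        # [batch, heads, seq, head_dim]
cos = torch.ones(1, 1, 5, 4)       # rotary_dim = 4 < head_dim = 8
sin = torch.zeros(1, 1, 5, 4)

q_rot, q_pass = q[..., :4], q[..., 4:]
q_embed = torch.cat([(q_rot * cos) + (rotate_half(q_rot) * sin), q_pass], dim=-1)
assert torch.allclose(q_embed, q)  # identity rotation leaves q untouched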
@@ -704,6 +430,98 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 
 
+def causal_first_difference(x: torch.Tensor) -> torch.Tensor:
+    """Causal first difference along sequence length without Python loops."""
+    previous = F.pad(x[..., :-1, :], (0, 0, 1, 0))
+    return x - previous
+
+
+def rms_key_unit_norm(x: torch.Tensor, eps: float) -> torch.Tensor:
+    """RMS-style key normalization used by the LUCID preconditioner."""
+    scale = math.sqrt(x.shape[-1])
+    return F.normalize(x.float(), p=2, dim=-1, eps=eps) * scale
+
+
+def infer_key_validity(attention_mask: Optional[torch.Tensor], seq_len: int, num_heads: int) -> Optional[torch.Tensor]:
+    """Infer valid key positions from a square additive attention mask when available."""
+    if attention_mask is None or attention_mask.ndim != 4:
+        return None
+    if attention_mask.shape[-2] != seq_len or attention_mask.shape[-1] != seq_len:
+        return None
+
+    diag = attention_mask.diagonal(dim1=-2, dim2=-1)
+    valid = torch.isfinite(diag) & (diag == 0)
+
+    if valid.shape[1] == 1 and num_heads != 1:
+        valid = valid.expand(-1, num_heads, -1)
+    elif valid.shape[1] != num_heads:
+        valid = valid[:, :1, :].expand(-1, num_heads, -1)
+
+    return valid
+
+
+def head_linear_compose(hidden_states: torch.Tensor, mixing_matrix: torch.Tensor) -> torch.Tensor:
+    """Head-level linear composition over the head axis without Python loops."""
+    return torch.einsum("bhtd,hk->bktd", hidden_states, mixing_matrix.to(device=hidden_states.device, dtype=hidden_states.dtype))
+
+
+def build_mea_reconstruction_matrix(num_component_heads: int, num_output_heads: int) -> torch.Tensor:
+    """Build an identity-preserving MEA reconstruction initializer from component heads to output heads."""
+    if num_component_heads <= 0 or num_output_heads <= 0:
+        raise ValueError("MEA head counts must be positive")
+    matrix = torch.zeros(num_component_heads, num_output_heads, dtype=torch.float32)
+
+    output_indices = torch.arange(num_output_heads, dtype=torch.long)
+    component_indices = torch.div(output_indices * num_component_heads, num_output_heads, rounding_mode="floor")
+    matrix[component_indices, output_indices] = 1.0
+    return matrix
+
+
+class MEAHeadSeeDNorm(nn.Module):
+    """
+    MEA head-level normalization using SeeDNorm grouped by KV structure (GQA-aware).
+
+    In GQA, query heads that share the same K and V are structurally correlated —
+    they received identical values and only differ in their Q projection. Normalizing
+    them independently (as the original MEA paper assumes for MHA) ignores this
+    correlation. Instead, we normalize per KV group: all query heads sharing the
+    same KV head are flattened together and normalized as a single unit.
+
+    With num_attention_heads=8 and num_key_value_heads=2 (num_kv_groups=4):
+    - 2 independent SeeDNorm groups
+    - each group covers 4 query heads × head_dim = 256 dims
+    - SeeDNorm's dynamic scale operates over the group's full 256-dim space
+
+    This allows SeeDNorm's dynamic scale to detect and compensate for
+    LUCID decorrelation magnitude within each KV-coherent group of heads,
+    while respecting the GQA structural dependency between heads.
+    """
+
+    def __init__(self, num_heads: int, head_dim: int, num_kv_groups: int, eps: float = 1e-6):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.num_kv_groups = num_kv_groups
+        self.num_kv_heads = num_heads // num_kv_groups  # number of KV groups = num_key_value_heads
+        self.group_dim = num_kv_groups * head_dim       # dims per KV group
+        # One SeeDNorm instance shared across all KV groups, operating over group_dim
+        self.norm = SeeDNorm(self.group_dim, eps=eps)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        batch, seq_len, num_heads, head_dim = hidden_states.shape
+        if num_heads != self.num_heads or head_dim != self.head_dim:
+            raise ValueError(
+                f"MEAHeadSeeDNorm expected ({self.num_heads}, {self.head_dim}) heads, "
+                f"received ({num_heads}, {head_dim})"
+            )
+        # Reshape into KV groups: (batch, seq, num_kv_heads, num_kv_groups * head_dim)
+        # heads within each KV group are contiguous after attention_interface transpose
+        grouped = hidden_states.reshape(batch, seq_len, self.num_kv_heads, self.group_dim)
+        # SeeDNorm operates over last dim → independently per KV group
+        normed = self.norm(grouped)
+        return normed.reshape(batch, seq_len, num_heads, head_dim)
+
+
 def eager_attention_forward(
     module: nn.Module,
     query: torch.Tensor,
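Two quick behavioral checks of the helpers added above, assuming they are in scope as defined; the expected outputs in the comments were worked out by hand:

import torch

# causal_first_difference: position t sees x_t - x_{t-1}, with x_{-1} treated as zero
x = torch.arange(4.0).view(1, 1, 4, 1)       # [batch, heads, seq, dim]
print(causal_first_difference(x).flatten())  # tensor([0., 1., 1., 1.])

# build_mea_reconstruction_matrix: each output head copies exactly one component head
print(build_mea_reconstruction_matrix(2, 4))
# tensor([[1., 1., 0., 0.],
#         [0., 0., 1., 1.]])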
@@ -732,17 +550,9 @@ def eager_attention_forward(
 
 class NeoLLMAttention(nn.Module):
     """
-
-
-    ResFormer enhancement: Applies learnable feature residual connections from first layer
-    BEFORE QKV projections: H'_fan_n = λ_1 * H_fan_1 + λ_2 * H_fan_n
-
-    Learnable Multipliers placement (from "Learnable Multipliers" paper Appendix C):
-    - Q projection: row multipliers only (enables per-head attention scaling in GQA)
-    - K, V projections: no multipliers (avoids redundancy with Q multipliers)
-    - Output projection: row + column multipliers (maximally expressive without symmetries)
+    Full attention with FANformer, SeeDNorm, ResFormer, Learnable Multipliers,
+    optional post-RoPE Momentum attention, full MEA head-level composition over
+    K/V, and optional LUCID value preconditioning.
     """
 
     def __init__(self, config: NeoLLMConfig, layer_idx: int):
@@ -752,54 +562,141 @@ class NeoLLMAttention(nn.Module):
         self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
         self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
         self.scaling = self.head_dim**-0.5
+        self.sqrt_head_dim = math.sqrt(self.head_dim)
         self.attention_dropout = config.attention_dropout
         self.is_causal = True
-
-
+
+        self.use_momentum_attention = getattr(config, "use_momentum_attention", False)
+        self.momentum_gamma = float(getattr(config, "momentum_gamma", 0.0))
+        self.use_mea_attention = getattr(config, "use_mea_attention", False)
+        self.mea_component_key_value_heads = int(
+            getattr(config, "mea_component_key_value_heads", config.num_key_value_heads)
+        )
+        self.mea_groupnorm_eps = float(getattr(config, "mea_groupnorm_eps", config.rms_norm_eps))
+        self.use_lucid_attention = getattr(config, "use_lucid_attention", False)
+        self.lucid_attention_eps = float(getattr(config, "lucid_attention_eps", config.rms_norm_eps))
+
         self.fan_layer = FANLayer(
             hidden_size=config.hidden_size,
-            fan_ratio=getattr(config,
+            fan_ratio=getattr(config, "fan_ratio", 0.125),
         )
-
-        # Q projection with row multipliers (per-head scaling capability)
+
+        fan_output_dim = config.hidden_size + int(config.hidden_size * getattr(config, "fan_ratio", 0.125))
+
         self.q_proj = LinearWithMultipliers(
             fan_output_dim,
             config.num_attention_heads * self.head_dim * 2,
             bias=config.attention_bias,
             use_row_multiplier=True,
-            use_column_multiplier=False
+            use_column_multiplier=False,
         )
-
-        # K, V projections without multipliers (avoids Q-K symmetry)
+        self.num_mea_component_heads = (
+            self.mea_component_key_value_heads if self.use_mea_attention else config.num_key_value_heads
+        )
         self.k_proj = nn.Linear(
-            fan_output_dim,
+            fan_output_dim, self.num_mea_component_heads * self.head_dim, bias=config.attention_bias
         )
         self.v_proj = nn.Linear(
-            fan_output_dim,
+            fan_output_dim, self.num_mea_component_heads * self.head_dim, bias=config.attention_bias
         )
-
-        # Output projection with row + column multipliers (maximally expressive)
         self.o_proj = LinearWithMultipliers(
             config.num_attention_heads * self.head_dim,
             config.hidden_size,
             bias=config.attention_bias,
             use_row_multiplier=True,
-            use_column_multiplier=True
+            use_column_multiplier=True,
         )
-
-        # SeeDNorm for Q/K normalization (replaces RMSNorm)
+
         self.q_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
         self.k_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
+
+        if self.use_mea_attention:
+            self.mea_key_mix = nn.Parameter(
+                build_mea_reconstruction_matrix(self.num_mea_component_heads, config.num_key_value_heads)
+            )
+            self.mea_value_mix = nn.Parameter(
+                build_mea_reconstruction_matrix(self.num_mea_component_heads, config.num_key_value_heads)
+            )
+            self.mea_output_norm = MEAHeadSeeDNorm(
+                num_heads=config.num_attention_heads,
+                head_dim=self.head_dim,
+                num_kv_groups=self.num_key_value_groups,
+                eps=self.mea_groupnorm_eps,
+            )
+        else:
+            self.mea_key_mix = None
+            self.mea_value_mix = None
+            self.mea_output_norm = None
+
         self.dropout = nn.Dropout(config.dropout_rate)
+        self.lambda_1 = nn.Parameter(torch.tensor(0.5))
+        self.lambda_2 = nn.Parameter(torch.tensor(0.5))
+
+    def _apply_momentum_attention(
+        self,
+        query_states: torch.Tensor,
+        key_states: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Apply post-RoPE momentum shear to Q and K only."""
+        if not self.use_momentum_attention or self.momentum_gamma == 0.0:
+            return query_states, key_states
+
+        query_states = query_states + self.momentum_gamma * causal_first_difference(query_states)
+        key_states = key_states + self.momentum_gamma * causal_first_difference(key_states)
+        return query_states, key_states
+
+    def _apply_mea_head_mixing(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Apply explicit KV head interaction before repeat_kv and attention."""
+        if not self.use_mea_attention:
+            return key_states, value_states
+
+        mixed_keys = head_linear_compose(key_states, self.mea_key_mix).contiguous()
+        mixed_values = head_linear_compose(value_states, self.mea_value_mix).contiguous()
+        return mixed_keys, mixed_values
+
+    def _apply_lucid_preconditioner(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        """Compute LUCID preconditioned values via a batched lower-triangular solve."""
+        if not self.use_lucid_attention:
+            return value_states
+
+        key_rn = rms_key_unit_norm(key_states, eps=self.lucid_attention_eps)
+        precondition_logits = torch.matmul(key_rn, key_rn.transpose(-1, -2)) * self.scaling - self.sqrt_head_dim
+        preconditioner = torch.tril(torch.exp(precondition_logits))
+
+        key_validity = infer_key_validity(attention_mask, key_states.shape[-2], key_states.shape[1])
+        if key_validity is not None:
+            pair_valid = key_validity.unsqueeze(-1) & key_validity.unsqueeze(-2)
+            preconditioner = preconditioner * pair_valid.to(preconditioner.dtype)
+
+        eye = torch.eye(
+            preconditioner.shape[-1],
+            device=preconditioner.device,
+            dtype=preconditioner.dtype,
+        ).view(1, 1, preconditioner.shape[-1], preconditioner.shape[-1])
+        preconditioner = preconditioner * (1.0 - eye) + eye
+
+        lucid_values = torch.linalg.solve_triangular(
+            preconditioner,
+            value_states.float(),
+            upper=False,
+            unitriangular=True,
+        )
+        return lucid_values.to(value_states.dtype).contiguous()
+
+    def _apply_mea_output_norm(self, attn_output: torch.Tensor) -> torch.Tensor:
+        """Apply MEA GQA-grouped SeeDNorm on the per-head attention output."""
+        if not self.use_mea_attention:
+            return attn_output
+        return self.mea_output_norm(attn_output)
 
     def forward(
         self,
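A standalone sketch of the LUCID value-preconditioning step from _apply_lucid_preconditioner above, with the attention-mask handling omitted for brevity. The shapes, scaling, and unit-diagonal trick mirror the method; the random inputs are illustrative only:

import torch

head_dim, seq = 8, 4
k = torch.nn.functional.normalize(torch.randn(1, 1, seq, head_dim), dim=-1) * head_dim**0.5
v = torch.randn(1, 1, seq, head_dim)

logits = (k @ k.transpose(-1, -2)) * head_dim**-0.5 - head_dim**0.5
P = torch.tril(torch.exp(logits))
eye = torch.eye(seq).view(1, 1, seq, seq)
P = P * (1.0 - eye) + eye                     # force a unit diagonal

# Unit-lower-triangular solve: each value is decorrelated from the values of
# earlier, similar keys; with near-orthogonal keys the couplings shrink and
# v_lucid stays close to v.
v_lucid = torch.linalg.solve_triangular(P, v, upper=False, unitriangular=True)
print(v_lucid.shape)                          # torch.Size([1, 1, 4, 8])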
@@ -809,45 +706,31 @@ class NeoLLMAttention(nn.Module):
         first_layer_fan: Optional[torch.Tensor] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
-        """
-        Forward pass with ResFormer feature residual connections.
-
-        Args:
-            hidden_states: Current layer input [batch, seq, hidden_size]
-            position_embeddings: Tuple of (cos, sin) for RoPE
-            attention_mask: Causal attention mask
-            first_layer_fan: First layer FAN features (for ResFormer)
-
-        Returns:
-            Tuple of (attn_output, attn_weights, current_layer_fan)
-        """
+        """Forward pass for the full attention block."""
         input_shape = hidden_states.shape[:-1]
-
-        # Apply FANformer transformation
+
         hidden_states_fan = self.fan_layer(hidden_states)
-
-        # ResFormer: Apply feature residual connection BEFORE projections
         if first_layer_fan is not None:
             hidden_states_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * hidden_states_fan
-
-        # Store current FAN features for ResFormer
+
         current_layer_fan = hidden_states_fan.clone()
+        query_shape = (*input_shape, self.config.num_attention_heads, self.head_dim)
+        key_value_shape = (*input_shape, self.num_mea_component_heads, self.head_dim)
 
-        # Q projection with learnable row multipliers
         query_states, gate = torch.chunk(
-            self.q_proj(hidden_states_fan).view(*input_shape,
+            self.q_proj(hidden_states_fan).view(*input_shape, self.config.num_attention_heads, self.head_dim * 2), 2, dim=-1
         )
         gate = gate.reshape(*input_shape, -1)
 
-        value_states = self.v_proj(hidden_states_fan).view(hidden_shape).transpose(1, 2)
+        query_states = self.q_norm(query_states.view(query_shape)).transpose(1, 2)
+        key_states = self.k_norm(self.k_proj(hidden_states_fan).view(key_value_shape)).transpose(1, 2)
+        value_states = self.v_proj(hidden_states_fan).view(key_value_shape).transpose(1, 2)
 
         cos, sin = position_embeddings
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        query_states, key_states = self._apply_momentum_attention(query_states, key_states)
+        key_states, value_states = self._apply_mea_head_mixing(key_states, value_states)
+        value_states = self._apply_lucid_preconditioner(key_states, value_states, attention_mask)
 
         attention_interface: Callable = eager_attention_forward
         if self.config._attn_implementation != "eager":
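The momentum step inserted after RoPE is a first-difference shear, q_t <- q_t + gamma * (q_t - q_{t-1}), applied to Q and K only. A compact sketch with an illustrative gamma:

import torch
import torch.nn.functional as F

def causal_first_difference(x):
    return x - F.pad(x[..., :-1, :], (0, 0, 1, 0))

gamma = 0.3
q = torch.randn(1, 2, 6, 8)                        # [batch, heads, seq, head_dim]
q_momentum = q + gamma * causal_first_difference(q)
print(q_momentum.shape)                            # torch.Size([1, 2, 6, 8])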
@@ -864,15 +747,14 @@ class NeoLLMAttention(nn.Module):
             **kwargs,
         )
 
+        attn_output = attn_output.reshape(*input_shape, -1, self.head_dim)
+        attn_output = self._apply_mea_output_norm(attn_output)
         attn_output = attn_output.reshape(*input_shape, -1).contiguous()
         attn_output = attn_output * torch.sigmoid(gate)
-
-        # Output projection with learnable row + column multipliers
         attn_output = self.o_proj(attn_output)
         attn_output = self.dropout(attn_output)
 
         return attn_output, attn_weights, current_layer_fan
 
 
 class PolyNorm(torch.nn.Module):
     def __init__(self, eps=1e-6):
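The doubled Q projection above packs a per-channel gate alongside the query: the tensor is chunked in two, attention runs on the query half, and the output is multiplied by sigmoid(gate). A shape-only sketch with made-up dimensions:

import torch

batch, seq, heads, head_dim = 2, 5, 4, 8
qg = torch.randn(batch, seq, heads, head_dim * 2)   # doubled Q projection output
query, gate = torch.chunk(qg, 2, dim=-1)            # each [batch, seq, heads, head_dim]
gate = gate.reshape(batch, seq, -1)                 # flattened to match attn_output

attn_output = torch.randn(batch, seq, heads * head_dim)
attn_output = attn_output * torch.sigmoid(gate)     # per-channel output gating
print(attn_output.shape)                            # torch.Size([2, 5, 32])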
@@ -957,16 +839,15 @@ class NeoLLMMLP(nn.Module):
 
 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     """
-    Decoder layer with standard residual connections
+    Decoder layer with standard residual connections.
 
-    1.
-    2.
-    3.
-    4.
-    5.
-    6.
-    7. GPAS activation scaling
+    Architecture:
+    1. Pre-norm (SeeDNorm) → LNS scaling → Self-Attention with ResFormer and Learnable Multipliers
+    2. Standard residual connection (plain sum)
+    3. GPAS activation scaling
+    4. Pre-norm (SeeDNorm) → LNS scaling → MLP with FANformer and Learnable Multipliers
+    5. Standard residual connection (plain sum)
+    6. GPAS activation scaling
     """
 
     def __init__(self, config: NeoLLMConfig, layer_idx: int):
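The numbered list in the docstring above compresses the block layout; a pseudocode sketch of the same ordering (function and argument names are illustrative, not the layer's actual attributes):

def decoder_block(x, attn, mlp, norm1, norm2, lns1, lns2, gpas1, gpas2):
    residual = x
    x = lns1(norm1(x))        # 1. pre-norm (SeeDNorm) + LNS scaling, then attention
    x = residual + attn(x)    # 2. standard residual connection
    x = gpas1(x)              # 3. GPAS after the residual
    residual = x
    x = lns2(norm2(x))        # 4. pre-norm + LNS, then MLP
    x = residual + mlp(x)     # 5. standard residual connection
    return gpas2(x)           # 6. GPAS after the residual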
@@ -980,7 +861,7 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         # MLP with FANformer integration and learnable multipliers
         self.mlp = NeoLLMMLP(config)
 
-        # SeeDNorm for input and post-attention normalization
+        # SeeDNorm for input and post-attention normalization (replaces RMSNorm)
         self.input_layernorm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -988,15 +869,10 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         self.lns_attn = LNS(layer_idx)
         self.lns_mlp = LNS(layer_idx)
 
-        # GPAS (Gradient-Preserving Activation Scaling)
+        # GPAS (Gradient-Preserving Activation Scaling) - applied after residual connections
         self.gpas_attn = GPAS(config.hidden_size)
         self.gpas_mlp = GPAS(config.hidden_size)
 
-        # StackMemory: Differentiable hidden state stack
-        self.use_stack = getattr(config, 'use_stack', False)
-        if self.use_stack:
-            self.stack_memory = StackMemory(config)
-
         # ResFormer: storage for current layer's FAN features
         self.current_layer_fan = None
@@ -1006,39 +882,11 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
         attention_mask: Optional[torch.Tensor] = None,
         first_layer_fan: Optional[torch.Tensor] = None,
-        stack_state: Optional[torch.Tensor] = None,
-        stack_mask: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = False,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
-        """
-        Forward pass with ResFormer and optional StackMemory.
-
-        Args:
-            hidden_states: Current layer input [batch, seq, hidden_size]
-            position_embeddings: Tuple of (cos, sin) for RoPE
-            attention_mask: Causal attention mask
-            first_layer_fan: First layer FAN features (for ResFormer)
-            stack_state: StackMemory state (optional)
-            stack_mask: StackMemory mask (optional)
-            output_attentions: Whether to return attention weights
-
-        Returns:
-            Tuple of (hidden_states, attn_weights, stack_state, stack_mask)
-        """
-
+    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
         # ============================================================
-        #
-        # ============================================================
-        # We process memory first so the Attention layer can "see" the
-        # retrieved context. This eliminates the 1-layer lag.
-        if self.use_stack:
-            hidden_states, stack_state, stack_mask = self.stack_memory(
-                hidden_states, stack_state, stack_mask
-            )
-
-        # ============================================================
-        # 2. Attention Block with Standard Residual Connection
+        # Attention Block with standard residual connection
         # ============================================================
         residual = hidden_states
@@ -1048,23 +896,24 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         # Apply LNS scaling after normalization
         hidden_states = self.lns_attn(hidden_states)
 
-        # Self Attention with ResFormer
+        # Self Attention with ResFormer feature residual connections and learnable multipliers
             hidden_states=hidden_states,
-            position_embeddings=position_embeddings,
             attention_mask=attention_mask,
             first_layer_fan=first_layer_fan,
             **kwargs,
         )
 
-        # Standard
-        hidden_states = residual + hidden_states
 
-        # Apply GPAS after residual connection
         hidden_states = self.gpas_attn(hidden_states)
 
         # ============================================================
-        #
         # ============================================================
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
@@ -1072,20 +921,20 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         # Apply LNS scaling after normalization
         hidden_states = self.lns_mlp(hidden_states)
 
-        # MLP
-
 
-        # Standard
-        hidden_states = residual + hidden_states
 
-        # Apply GPAS after residual connection
         hidden_states = self.gpas_mlp(hidden_states)
 
-
-        if
-
 
 
 class NeoLLMPreTrainedModel(PreTrainedModel):
|
|
| 1098 |
- FANLayer (Fourier Analysis Network)
|
| 1099 |
- SeeDNorm (Self-Rescaled Dynamic Normalization)
|
| 1100 |
- Learnable Multipliers (ScalarMultiplier, VectorMultiplier)
|
| 1101 |
-
- StackMemory (Differentiable Hidden State Stack)
|
| 1102 |
"""
|
| 1103 |
config: NeoLLMConfig
|
| 1104 |
base_model_prefix = "model"
|
|
@@ -1111,58 +959,90 @@ class NeoLLMPreTrainedModel(PreTrainedModel):
     def _init_weights(self, module):
         """
         Initialize weights for all custom modules in NeoLLM.
         """
         super()._init_weights(module)
 
         if isinstance(module, NeoLLMAttention):
             if hasattr(module, 'lambda_1'):
                 module.lambda_1.data.fill_(0.5)
             if hasattr(module, 'lambda_2'):
                 module.lambda_2.data.fill_(0.5)
 
         elif isinstance(module, GPAS):
             module.alpha.data.fill_(0.0)
 
         elif isinstance(module, (ScalarMultiplier, VectorMultiplier)):
             if hasattr(module, 'multiplier'):
                 module.multiplier.data.fill_(1.0)
-
-        elif isinstance(module, StackMemory):
-            std = self.config.initializer_range if hasattr(self.config, 'initializer_range') else 0.02
-            if hasattr(module, 'down_proj'):
-                module.down_proj.weight.data.normal_(mean=0.0, std=std)
-            if hasattr(module, 'up_proj'):
-                module.up_proj.weight.data.normal_(mean=0.0, std=std)
-            if hasattr(module, 'action_head'):
-                module.action_head.weight.data.normal_(mean=0.0, std=std)
-                if module.action_head.bias is not None:
-                    module.action_head.bias.data.zero_()
-            if hasattr(module, 'gate_proj'):
-                module.gate_proj.weight.data.normal_(mean=0.0, std=std)
-            if hasattr(module, 'res_weight'):
-                module.res_weight.data.fill_(1.0)
-
 
 class NeoLLMModel(NeoLLMPreTrainedModel):
     """
     NeoLLM base model with transformer decoder architecture.
 
-    Uses ResFormer for first-layer feature propagation with standard residual connections
-    and optional StackMemory for hierarchical pattern modeling.
-
     Note on embeddings and weight tying: This model uses weight tying between
     embed_tokens and lm_head (shared weights). Following "Learnable Multipliers"
     paper analysis, we do NOT add multipliers to embeddings because:
 
-    1. Weight tying creates conflicting gradient paths
-
-
     """
 
     def __init__(self, config: NeoLLMConfig):
         super().__init__(config)
 
         # Standard embedding without learnable multipliers
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
 
         # Each layer creates its own components (no shared parameters)
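At these initial values the custom operators start out (near-)inert, which appears to be the intent: multipliers of 1.0 leave their Linear unchanged, GPAS alpha = 0.0 contributes no scaling (the exact GPAS form is not shown in this diff), and lambda_1 = lambda_2 = 0.5 averages first-layer and current-layer FAN features. A trivial check for the multiplier case:

import torch

w = torch.randn(4, 4)
row_multiplier = torch.ones(4)      # multiplier init value from above
x = torch.randn(2, 4)
assert torch.allclose((x @ w.T) * row_multiplier, x @ w.T)  # identity at init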
@@ -1175,10 +1055,7 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         self.rotary_emb = NeoLLMRotaryEmbedding(config=config)
         self.gradient_checkpointing = False
 
-        #
-        self.use_stack = getattr(config, 'use_stack', False)
-
         # ResFormer: storage for first layer's FAN features
         self.first_layer_fan = None
 
         # Initialize weights and apply final processing
@@ -1193,8 +1070,6 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        use_cache: Optional[bool] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> BaseModelOutputWithPast:
         output_hidden_states = (
@@ -1211,6 +1086,10 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
 
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
         if position_ids is None:
@@ -1226,29 +1105,16 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         )
 
         hidden_states = inputs_embeds
-        next_decoder_cache = None
         all_hidden_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
 
-        #
         position_embeddings = self.rotary_emb(hidden_states, position_ids)
 
-        # ResFormer
         self.first_layer_fan = None
-
-        stack_state = None
-        stack_mask = None
-
-        # Propagate use_cache and reset if starting a new sequence
-        if self.use_stack:
-            for layer in self.layers:
-                if hasattr(layer, 'stack_memory'):
-                    layer.stack_memory.enable_cache = use_cache if use_cache is not None else False
-                    if past_key_values is None:
-                        layer.stack_memory.reset_cache()
-
         for decoder_layer in self.layers:
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
@@ -1256,9 +1122,7 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
                 hidden_states,
                 position_embeddings=position_embeddings,
                 attention_mask=causal_mask,
                 first_layer_fan=self.first_layer_fan,
-                stack_state=stack_state,
-                stack_mask=stack_mask,
                 output_attentions=output_attentions,
                 **kwargs,
             )
@@ -1268,15 +1132,7 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             if output_attentions:
                 all_attentions = all_attentions + (layer_outputs[1],)
 
-            if self.use_stack:
-                # Vertical memory logic:
-                # The layer returns updated stack for the next layer to use (Vertical passing)
-                # But we do NOT persist it temporally here. The Module's internal cache handles temporal.
-                stack_state = layer_outputs[2]
-                stack_mask = layer_outputs[3]
-
             # ResFormer: capture H_fan_1 from the first layer
-            # Dynamically capture for the current pass
             if self.first_layer_fan is None and hasattr(decoder_layer, 'current_layer_fan'):
                 self.first_layer_fan = decoder_layer.current_layer_fan
@@ -1287,11 +1143,11 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             all_hidden_states = all_hidden_states + (hidden_states,)
 
         if not return_dict:
-            return tuple(v for v in [hidden_states,
 
         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
-            past_key_values=
             hidden_states=all_hidden_states,
             attentions=all_attentions,
         )
@@ -1346,37 +1202,6 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
 
         self.post_init()
 
-    def prepare_inputs_for_generation(
-        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
-    ):
-        if past_key_values:
-            past_length = past_key_values[0][0].shape[2]
-
-            # If past_length > input_ids length, we are likely generating token by token
-            if input_ids.shape[1] > past_length:
-                remove_prefix_length = past_length
-            else:
-                # Default standard HF behavior
-                remove_prefix_length = input_ids.shape[1] - 1
-
-            input_ids = input_ids[:, remove_prefix_length:]
-
-        position_ids = kwargs.get("position_ids", None)
-        if attention_mask is not None and position_ids is None:
-            # create position_ids on the fly for batch generation
-            position_ids = attention_mask.long().cumsum(-1) - 1
-            position_ids.masked_fill_(attention_mask == 0, 1)
-            if past_key_values:
-                position_ids = position_ids[:, -input_ids.shape[1] :]
-
-        return {
-            "input_ids": input_ids,
-            "past_key_values": past_key_values,
-            "use_cache": kwargs.get("use_cache"),
-            "position_ids": position_ids,
-            "attention_mask": attention_mask,
-            "inputs_embeds": inputs_embeds,
-        }
 
     def forward(
         self,
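The deleted override mostly reproduced stock HF generation behavior, which GenerationMixin now supplies. For reference, the cumsum pattern it used to derive position_ids from a padding mask, as a runnable snippet:

import torch

attention_mask = torch.tensor([[0, 1, 1, 1]])         # one left-padded row
position_ids = attention_mask.long().cumsum(-1) - 1   # [[-1, 0, 1, 2]]
position_ids.masked_fill_(attention_mask == 0, 1)     # pad slots -> 1
print(position_ids)                                   # tensor([[1, 0, 1, 2]])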
@@ -1388,7 +1213,6 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
         logits_to_keep: Union[int, torch.Tensor] = 0,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-
         **kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         outputs: BaseModelOutputWithPast = self.model(
@@ -1398,7 +1222,6 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
             inputs_embeds=inputs_embeds,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-
             **kwargs,
         )
@@ -1423,7 +1246,7 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
         return CausalLMOutputWithPast(
             loss=loss,
             logits=logits,
-            past_key_values=
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
@@ -1440,7 +1263,7 @@ __all__ = [
     "ScalarMultiplier",
     "VectorMultiplier",
     "LinearWithMultipliers",
-    "StackMemory",
 ]
 
 # Register the configuration and model for AutoClass support
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
NeoLLM model with FANformer, SeeDNorm, ResFormer, Learnable Multipliers,
|
| 4 |
+
and full attention augmented with optional Momentum, MEA, and LUCID operators.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
import math
|
|
|
|
| 25 |
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
|
| 26 |
from transformers.processing_utils import Unpack
|
| 27 |
from transformers.utils import TransformersKwargs, logging
|
|
|
|
| 28 |
from configuration_neollm import NeoLLMConfig
|
| 29 |
|
| 30 |
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
|
|
|
|
| 247 |
Self-Rescaled Dynamic Normalization (SeeDNorm) with dual dropout regularization.
|
| 248 |
|
| 249 |
SeeDNorm(x) = [σ(x·β^T)·α + γ] ⊙ x/RMS(x)
|
| 250 |
+
|
| 251 |
|
| 252 |
Args:
|
| 253 |
dim: Hidden dimension size
|
|
|
|
| 289 |
Normalized and dynamically scaled tensor of same shape
|
| 290 |
"""
|
| 291 |
|
| 292 |
+
x_for_dynamic = F.dropout(x, p=self.dropout_input)
|
| 293 |
rescale_factor = torch.tanh(torch.sum(x_for_dynamic * self.beta,
|
| 294 |
dim=-1, keepdim=True))
|
| 295 |
|
|
|
|
| 299 |
# Apply RMS normalization on ORIGINAL input (not dropped version)
|
| 300 |
x_normalized = self._rms_norm(x.float())
|
| 301 |
|
| 302 |
+
x_normalized = F.dropout(x_normalized, p=self.dropout_hidden)
|
| 303 |
|
| 304 |
# Apply dynamic scaling
|
| 305 |
output = x_normalized * dynamic_scale.float()
|
|
|
|
| 309 |
def extra_repr(self) -> str:
|
| 310 |
return (f"dim={self.dim}, eps={self.eps}, "
|
| 311 |
f"dropout_input={self.dropout_input}, dropout_hidden={self.dropout_hidden}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
# ==================== ROTARY EMBEDDING ====================
|
| 313 |
class NeoLLMRotaryEmbedding(nn.Module):
|
| 314 |
inv_freq: torch.Tensor # fix linting for `register_buffer`
|
|
|
|
| 394 |
sin = emb.sin() * self.attention_scaling
|
| 395 |
|
| 396 |
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
|
|
|
|
|
|
|
|
|
| 397 |
def rotate_half(x):
|
| 398 |
"""Rotates half the hidden dims of the input."""
|
| 399 |
x1 = x[..., : x.shape[-1] // 2]
|
|
|
|
| 406 |
cos = cos.unsqueeze(unsqueeze_dim)
|
| 407 |
sin = sin.unsqueeze(unsqueeze_dim)
|
| 408 |
|
|
|
|
| 409 |
rotary_dim = cos.shape[-1]
|
| 410 |
q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:]
|
| 411 |
k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:]
|
| 412 |
|
|
|
|
| 413 |
q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin)
|
| 414 |
k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin)
|
| 415 |
|
|
|
|
| 416 |
q_embed = torch.cat([q_embed, q_pass], dim=-1)
|
| 417 |
k_embed = torch.cat([k_embed, k_pass], dim=-1)
|
| 418 |
return q_embed, k_embed
|
|
|
|
| 430 |
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
|
| 431 |
|
| 432 |
|
| 433 |
+
def causal_first_difference(x: torch.Tensor) -> torch.Tensor:
|
| 434 |
+
"""Causal first difference along sequence length without Python loops."""
|
| 435 |
+
previous = F.pad(x[..., :-1, :], (0, 0, 1, 0))
|
| 436 |
+
return x - previous
|
| 437 |
+
|
| 438 |
+
|
| 439 |
+
def rms_key_unit_norm(x: torch.Tensor, eps: float) -> torch.Tensor:
|
| 440 |
+
"""RMS-style key normalization used by the LUCID preconditioner."""
|
| 441 |
+
scale = math.sqrt(x.shape[-1])
|
| 442 |
+
return F.normalize(x.float(), p=2, dim=-1, eps=eps) * scale
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
def infer_key_validity(attention_mask: Optional[torch.Tensor], seq_len: int, num_heads: int) -> Optional[torch.Tensor]:
|
| 446 |
+
"""Infer valid key positions from a square additive attention mask when available."""
|
| 447 |
+
if attention_mask is None or attention_mask.ndim != 4:
|
| 448 |
+
return None
|
| 449 |
+
if attention_mask.shape[-2] != seq_len or attention_mask.shape[-1] != seq_len:
|
| 450 |
+
return None
|
| 451 |
+
|
| 452 |
+
diag = attention_mask.diagonal(dim1=-2, dim2=-1)
|
| 453 |
+
valid = torch.isfinite(diag) & (diag == 0)
|
| 454 |
+
|
| 455 |
+
if valid.shape[1] == 1 and num_heads != 1:
|
| 456 |
+
valid = valid.expand(-1, num_heads, -1)
|
| 457 |
+
elif valid.shape[1] != num_heads:
|
| 458 |
+
valid = valid[:, :1, :].expand(-1, num_heads, -1)
|
| 459 |
+
|
| 460 |
+
return valid
|
| 461 |
+
|
| 462 |
+
|
| 463 |
+
def head_linear_compose(hidden_states: torch.Tensor, mixing_matrix: torch.Tensor) -> torch.Tensor:
|
| 464 |
+
"""Head-level linear composition over head axis without Python loops."""
|
| 465 |
+
return torch.einsum("bhtd,hk->bktd", hidden_states, mixing_matrix.to(device=hidden_states.device, dtype=hidden_states.dtype))
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
def build_mea_reconstruction_matrix(num_component_heads: int, num_output_heads: int) -> torch.Tensor:
|
| 469 |
+
"""Build an identity-preserving MEA reconstruction initializer from component heads to output heads."""
|
| 470 |
+
matrix = torch.zeros(num_component_heads, num_output_heads, dtype=torch.float32)
|
| 471 |
+
if num_component_heads <= 0 or num_output_heads <= 0:
|
| 472 |
+
raise ValueError("MEA head counts must be positive")
|
| 473 |
+
|
| 474 |
+
output_indices = torch.arange(num_output_heads, dtype=torch.long)
|
| 475 |
+
component_indices = torch.div(output_indices * num_component_heads, num_output_heads, rounding_mode="floor")
|
| 476 |
+
matrix[component_indices, output_indices] = 1.0
|
| 477 |
+
return matrix
|
| 478 |
+
|
| 479 |
+
|
| 480 |
+
class MEAHeadSeeDNorm(nn.Module):
|
| 481 |
+
"""
|
| 482 |
+
MEA head-level normalization using SeeDNorm grouped by KV structure (GQA-aware).
|
| 483 |
+
|
| 484 |
+
In GQA, query heads that share the same K and V are structurally correlated —
|
| 485 |
+
they received identical values and only differ in their Q projection. Normalizing
|
| 486 |
+
them independently (as the original MEA paper assumes for MHA) ignores this
|
| 487 |
+
correlation. Instead, we normalize per KV group: all query heads sharing the
|
| 488 |
+
same KV head are flattened together and normalized as a single unit.
|
| 489 |
+
|
| 490 |
+
With num_attention_heads=8 and num_key_value_heads=2 (num_kv_groups=4):
|
| 491 |
+
- 2 independent SeeDNorm groups
|
| 492 |
+
- each group covers 4 query heads × head_dim = 256 dims
|
| 493 |
+
- SeeDNorm's dynamic scale operates over the group's full 256-dim space
|
| 494 |
+
|
| 495 |
+
This allows SeeDNorm's dynamic scale to detect and compensate for
|
| 496 |
+
LUCID decorrelation magnitude within each KV-coherent group of heads,
|
| 497 |
+
while respecting the GQA structural dependency between heads.
|
| 498 |
+
"""
|
| 499 |
+
|
| 500 |
+
def __init__(self, num_heads: int, head_dim: int, num_kv_groups: int, eps: float = 1e-6):
|
| 501 |
+
super().__init__()
|
| 502 |
+
self.num_heads = num_heads
|
| 503 |
+
self.head_dim = head_dim
|
| 504 |
+
self.num_kv_groups = num_kv_groups
|
| 505 |
+
self.num_kv_heads = num_heads // num_kv_groups # number of KV groups = num_key_value_heads
|
| 506 |
+
self.group_dim = num_kv_groups * head_dim # dims per KV group
|
| 507 |
+
# One SeeDNorm instance shared across all KV groups, operating over group_dim
|
| 508 |
+
self.norm = SeeDNorm(self.group_dim, eps=eps)
|
| 509 |
+
|
| 510 |
+
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
| 511 |
+
batch, seq_len, num_heads, head_dim = hidden_states.shape
|
| 512 |
+
if num_heads != self.num_heads or head_dim != self.head_dim:
|
| 513 |
+
raise ValueError(
|
| 514 |
+
f"MEAHeadSeeDNorm expected ({self.num_heads}, {self.head_dim}) heads, "
|
| 515 |
+
f"received ({num_heads}, {head_dim})"
|
| 516 |
+
)
|
| 517 |
+
# Reshape into KV groups: (batch, seq, num_kv_heads, num_kv_groups * head_dim)
|
| 518 |
+
# heads within each KV group are contiguous after attention_interface transpose
|
| 519 |
+
grouped = hidden_states.reshape(batch, seq_len, self.num_kv_heads, self.group_dim)
|
| 520 |
+
# SeeDNorm operates over last dim → independently per KV group
|
| 521 |
+
normed = self.norm(grouped)
|
| 522 |
+
return normed.reshape(batch, seq_len, num_heads, head_dim)
|
| 523 |
+
|
| 524 |
+
|
| 525 |
def eager_attention_forward(
|
| 526 |
module: nn.Module,
|
| 527 |
query: torch.Tensor,
|
|
|
|
| 550 |
|
| 551 |
class NeoLLMAttention(nn.Module):
|
| 552 |
"""
|
| 553 |
+
Full attention with FANformer, SeeDNorm, ResFormer, Learnable Multipliers,
|
| 554 |
+
optional post-RoPE Momentum attention, full MEA head-level composition over
|
| 555 |
+
K/V, and optional LUCID value preconditioning.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 556 |
"""
|
| 557 |
|
| 558 |
def __init__(self, config: NeoLLMConfig, layer_idx: int):
|
|
|
|
| 562 |
self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
|
| 563 |
self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
|
| 564 |
self.scaling = self.head_dim**-0.5
|
| 565 |
+
self.sqrt_head_dim = math.sqrt(self.head_dim)
|
| 566 |
self.attention_dropout = config.attention_dropout
|
| 567 |
self.is_causal = True
|
| 568 |
+
|
| 569 |
+
self.use_momentum_attention = getattr(config, "use_momentum_attention", False)
|
| 570 |
+
self.momentum_gamma = float(getattr(config, "momentum_gamma", 0.0))
|
| 571 |
+
self.use_mea_attention = getattr(config, "use_mea_attention", False)
|
| 572 |
+
self.mea_component_key_value_heads = int(
|
| 573 |
+
getattr(config, "mea_component_key_value_heads", config.num_key_value_heads)
|
| 574 |
+
)
|
| 575 |
+
self.mea_groupnorm_eps = float(getattr(config, "mea_groupnorm_eps", config.rms_norm_eps))
|
| 576 |
+
self.use_lucid_attention = getattr(config, "use_lucid_attention", False)
|
| 577 |
+
self.lucid_attention_eps = float(getattr(config, "lucid_attention_eps", config.rms_norm_eps))
|
| 578 |
+
|
| 579 |
self.fan_layer = FANLayer(
|
| 580 |
+
hidden_size=config.hidden_size,
|
| 581 |
+
fan_ratio=getattr(config, "fan_ratio", 0.125),
|
| 582 |
)
|
| 583 |
+
|
| 584 |
+
fan_output_dim = config.hidden_size + int(config.hidden_size * getattr(config, "fan_ratio", 0.125))
|
| 585 |
+
|
|
|
|
|
|
|
| 586 |
self.q_proj = LinearWithMultipliers(
|
| 587 |
+
fan_output_dim,
|
| 588 |
+
config.num_attention_heads * self.head_dim * 2,
|
| 589 |
bias=config.attention_bias,
|
| 590 |
use_row_multiplier=True,
|
| 591 |
+
use_column_multiplier=False,
|
| 592 |
+
)
|
| 593 |
+
self.num_mea_component_heads = (
|
| 594 |
+
self.mea_component_key_value_heads if self.use_mea_attention else config.num_key_value_heads
|
| 595 |
)
|
|
|
|
|
|
|
| 596 |
self.k_proj = nn.Linear(
|
| 597 |
+
fan_output_dim, self.num_mea_component_heads * self.head_dim, bias=config.attention_bias
|
| 598 |
)
|
| 599 |
self.v_proj = nn.Linear(
|
| 600 |
+
fan_output_dim, self.num_mea_component_heads * self.head_dim, bias=config.attention_bias
|
| 601 |
)
|
|
|
|
|
|
|
| 602 |
self.o_proj = LinearWithMultipliers(
|
| 603 |
config.num_attention_heads * self.head_dim,
|
| 604 |
config.hidden_size,
|
| 605 |
bias=config.attention_bias,
|
| 606 |
use_row_multiplier=True,
|
| 607 |
+
use_column_multiplier=True,
|
| 608 |
)
|
| 609 |
+
|
|
|
|
| 610 |
self.q_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
|
| 611 |
self.k_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
|
| 612 |
+
|
| 613 |
+
if self.use_mea_attention:
|
| 614 |
+
self.mea_key_mix = nn.Parameter(
|
| 615 |
+
build_mea_reconstruction_matrix(self.num_mea_component_heads, config.num_key_value_heads)
|
| 616 |
+
)
|
| 617 |
+
self.mea_value_mix = nn.Parameter(
|
| 618 |
+
build_mea_reconstruction_matrix(self.num_mea_component_heads, config.num_key_value_heads)
|
| 619 |
+
)
|
| 620 |
+
self.mea_output_norm = MEAHeadSeeDNorm(
|
| 621 |
+
num_heads=config.num_attention_heads,
|
| 622 |
+
head_dim=self.head_dim,
|
| 623 |
+
num_kv_groups=self.num_key_value_groups,
|
| 624 |
+
eps=self.mea_groupnorm_eps,
|
| 625 |
+
)
|
| 626 |
+
else:
|
| 627 |
+
self.mea_key_mix = None
|
| 628 |
+
self.mea_value_mix = None
|
| 629 |
+
self.mea_output_norm = None
|
| 630 |
+
|
| 631 |
self.dropout = nn.Dropout(config.dropout_rate)
|
| 632 |
+
self.lambda_1 = nn.Parameter(torch.tensor(0.5))
|
| 633 |
+
self.lambda_2 = nn.Parameter(torch.tensor(0.5))
|
| 634 |
+
|
| 635 |
+
def _apply_momentum_attention(
|
| 636 |
+
self,
|
| 637 |
+
query_states: torch.Tensor,
|
| 638 |
+
key_states: torch.Tensor,
|
| 639 |
+
) -> tuple[torch.Tensor, torch.Tensor]:
|
| 640 |
+
"""Apply post-RoPE momentum shear to Q and K only."""
|
| 641 |
+
if not self.use_momentum_attention or self.momentum_gamma == 0.0:
|
| 642 |
+
return query_states, key_states
|
| 643 |
+
|
| 644 |
+
query_states = query_states + self.momentum_gamma * causal_first_difference(query_states)
|
| 645 |
+
key_states = key_states + self.momentum_gamma * causal_first_difference(key_states)
|
| 646 |
+
return query_states, key_states
|
| 647 |
+
|
| 648 |
+
def _apply_mea_head_mixing(
|
| 649 |
+
self,
|
| 650 |
+
key_states: torch.Tensor,
|
| 651 |
+
value_states: torch.Tensor,
|
| 652 |
+
) -> tuple[torch.Tensor, torch.Tensor]:
|
| 653 |
+
"""Apply explicit KV head interaction before repeat_kv and attention."""
|
| 654 |
+
if not self.use_mea_attention:
|
| 655 |
+
return key_states, value_states
|
| 656 |
+
|
| 657 |
+
mixed_keys = head_linear_compose(key_states, self.mea_key_mix).contiguous()
|
| 658 |
+
mixed_values = head_linear_compose(value_states, self.mea_value_mix).contiguous()
|
| 659 |
+
return mixed_keys, mixed_values
|
| 660 |
+
|
| 661 |
+
def _apply_lucid_preconditioner(
|
| 662 |
+
self,
|
| 663 |
+
key_states: torch.Tensor,
|
| 664 |
+
value_states: torch.Tensor,
|
| 665 |
+
attention_mask: Optional[torch.Tensor],
|
| 666 |
+
) -> torch.Tensor:
|
| 667 |
+
"""Compute LUCID preconditioned values via a batched lower-triangular solve."""
|
| 668 |
+
if not self.use_lucid_attention:
|
| 669 |
+
return value_states
|
| 670 |
+
|
| 671 |
+
key_rn = rms_key_unit_norm(key_states, eps=self.lucid_attention_eps)
|
| 672 |
+
precondition_logits = torch.matmul(key_rn, key_rn.transpose(-1, -2)) * self.scaling - self.sqrt_head_dim
|
| 673 |
+
preconditioner = torch.tril(torch.exp(precondition_logits))
|
| 674 |
+
|
| 675 |
+
key_validity = infer_key_validity(attention_mask, key_states.shape[-2], key_states.shape[1])
|
| 676 |
+
if key_validity is not None:
|
| 677 |
+
pair_valid = key_validity.unsqueeze(-1) & key_validity.unsqueeze(-2)
|
| 678 |
+
preconditioner = preconditioner * pair_valid.to(preconditioner.dtype)
|
| 679 |
+
|
| 680 |
+
eye = torch.eye(
|
| 681 |
+
preconditioner.shape[-1],
|
| 682 |
+
device=preconditioner.device,
|
| 683 |
+
dtype=preconditioner.dtype,
|
| 684 |
+
).view(1, 1, preconditioner.shape[-1], preconditioner.shape[-1])
|
| 685 |
+
preconditioner = preconditioner * (1.0 - eye) + eye
|
| 686 |
+
|
| 687 |
+
lucid_values = torch.linalg.solve_triangular(
|
| 688 |
+
preconditioner,
|
| 689 |
+
value_states.float(),
|
| 690 |
+
upper=False,
|
| 691 |
+
unitriangular=True,
|
| 692 |
+
)
|
| 693 |
+
return lucid_values.to(value_states.dtype).contiguous()
|
| 694 |
+
|
| 695 |
+
def _apply_mea_output_norm(self, attn_output: torch.Tensor) -> torch.Tensor:
|
| 696 |
+
"""Apply MEA GQA-grouped SeeDNorm on the per-head attention output."""
|
| 697 |
+
if not self.use_mea_attention:
|
| 698 |
+
return attn_output
|
| 699 |
+
return self.mea_output_norm(attn_output)
|
| 700 |
|
| 701 |
def forward(
|
| 702 |
self,
|
|
|
|
| 706 |
first_layer_fan: Optional[torch.Tensor] = None,
|
| 707 |
**kwargs: Unpack[FlashAttentionKwargs],
|
| 708 |
) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
|
| 709 |
+
"""Forward pass for the full attention block."""
         input_shape = hidden_states.shape[:-1]
+
         hidden_states_fan = self.fan_layer(hidden_states)
         if first_layer_fan is not None:
             hidden_states_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * hidden_states_fan
+
         current_layer_fan = hidden_states_fan.clone()
+        query_shape = (*input_shape, self.config.num_attention_heads, self.head_dim)
+        key_value_shape = (*input_shape, self.num_mea_component_heads, self.head_dim)

         query_states, gate = torch.chunk(
+            self.q_proj(hidden_states_fan).view(*input_shape, self.config.num_attention_heads, self.head_dim * 2), 2, dim=-1
         )
         gate = gate.reshape(*input_shape, -1)

+        query_states = self.q_norm(query_states.view(query_shape)).transpose(1, 2)
+        key_states = self.k_norm(self.k_proj(hidden_states_fan).view(key_value_shape)).transpose(1, 2)
+        value_states = self.v_proj(hidden_states_fan).view(key_value_shape).transpose(1, 2)

         cos, sin = position_embeddings
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        query_states, key_states = self._apply_momentum_attention(query_states, key_states)
+        key_states, value_states = self._apply_mea_head_mixing(key_states, value_states)
+        value_states = self._apply_lucid_preconditioner(key_states, value_states, attention_mask)

         attention_interface: Callable = eager_attention_forward
         if self.config._attn_implementation != "eager":
             attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

         attn_output, attn_weights = attention_interface(
             self,
             query_states,
             key_states,
             value_states,
             attention_mask,
             ...
             **kwargs,
         )

+        attn_output = attn_output.reshape(*input_shape, -1, self.head_dim)
+        attn_output = self._apply_mea_output_norm(attn_output)
         attn_output = attn_output.reshape(*input_shape, -1).contiguous()
         attn_output = attn_output * torch.sigmoid(gate)
         attn_output = self.o_proj(attn_output)
         attn_output = self.dropout(attn_output)

+        return attn_output, attn_weights, current_layer_fan
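Two pieces of this forward pass are easy to miss: the lambda-interpolation between first-layer and current FAN features, and the sigmoid gate split off from q_proj's doubled output width. A toy sketch of both; shapes are illustrative, and 0.5 is the lambda initialization used in `_init_weights` further down:

import torch

hidden_fan = torch.randn(2, 5, 16)       # current layer's FAN features
first_layer_fan = torch.randn(2, 5, 16)  # H_fan_1 cached from layer 0

lambda_1 = lambda_2 = 0.5
mixed_fan = lambda_1 * first_layer_fan + lambda_2 * hidden_fan

# Output gating: the extra half of q_proj's width acts as a per-feature gate.
attn_out = torch.randn(2, 5, 16)
gate = torch.randn(2, 5, 16)
gated = attn_out * torch.sigmoid(gate)   # elementwise; shape is preserved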

 class PolyNorm(torch.nn.Module):
     def __init__(self, eps=1e-6):
         ...

 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     """
+    Decoder layer with standard residual connections.

+    Architecture:
+    1. Pre-norm (SeeDNorm) → LNS scaling → Self-Attention with ResFormer and Learnable Multipliers
+    2. Standard residual connection (plain addition)
+    3. GPAS activation scaling
+    4. Pre-norm (SeeDNorm) → LNS scaling → MLP with FANformer and Learnable Multipliers
+    5. Standard residual connection (plain addition)
+    6. GPAS activation scaling
     """

     def __init__(self, config: NeoLLMConfig, layer_idx: int):
         ...
         # MLP with FANformer integration and learnable multipliers
         self.mlp = NeoLLMMLP(config)

+        # SeeDNorm for input and post-attention normalization (replaces RMSNorm)
         self.input_layernorm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)

         self.lns_attn = LNS(layer_idx)
         self.lns_mlp = LNS(layer_idx)

+        # GPAS (Gradient-Preserving Activation Scaling), applied after the residual connections
         self.gpas_attn = GPAS(config.hidden_size)
         self.gpas_mlp = GPAS(config.hidden_size)

         # ResFormer: storage for the current layer's FAN features
         self.current_layer_fan = None

     def forward(
         self,
         hidden_states: torch.Tensor,
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
         attention_mask: Optional[torch.Tensor] = None,
         first_layer_fan: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = False,
         **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
         # ============================================================
+        # Attention Block with standard residual connection
         # ============================================================
         residual = hidden_states
         hidden_states = self.input_layernorm(hidden_states)

         # Apply LNS scaling after normalization
         hidden_states = self.lns_attn(hidden_states)

+        # Self Attention with ResFormer feature residual connections and learnable multipliers.
+        # We capture attn_weights here instead of ignoring them.
+        hidden_states, attn_weights, self.current_layer_fan = self.self_attn(
             hidden_states=hidden_states,
             attention_mask=attention_mask,
+            position_embeddings=position_embeddings,
             first_layer_fan=first_layer_fan,
             **kwargs,
         )

+        # Standard residual connection
+        hidden_states = residual + hidden_states

+        # Apply GPAS after the attention residual connection
         hidden_states = self.gpas_attn(hidden_states)

         # ============================================================
+        # MLP Block with standard residual connection
         # ============================================================
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)

         # Apply LNS scaling after normalization
         hidden_states = self.lns_mlp(hidden_states)

+        # The MLP applies the FAN transformation and learnable multipliers internally
+        hidden_states = self.mlp(hidden_states)

+        # Standard residual connection
+        hidden_states = residual + hidden_states

+        # Apply GPAS after the MLP residual connection
         hidden_states = self.gpas_mlp(hidden_states)

+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs


 class NeoLLMPreTrainedModel(PreTrainedModel):
     """
     ...
     - FANLayer (Fourier Analysis Network)
     - SeeDNorm (Self-Rescaled Dynamic Normalization)
     - Learnable Multipliers (ScalarMultiplier, VectorMultiplier)
     """

     config: NeoLLMConfig
     base_model_prefix = "model"
     ...

     def _init_weights(self, module):
         """
         Initialize weights for all custom modules in NeoLLM.
+
+        Strategy:
+        - Standard layers (Linear, Embedding): handled by the parent class
+        - Custom modules: specialized initialization per component
+        - Learnable Multipliers: initialized to 1.0 for an identity transformation
         """
         super()._init_weights(module)

         if isinstance(module, NeoLLMAttention):
+            # ResFormer: initialize the lambda parameters for full attention.
+            # They control the interpolation between first-layer and current-layer
+            # features; starting at 0.5 gives a balanced contribution from both.
             if hasattr(module, 'lambda_1'):
                 module.lambda_1.data.fill_(0.5)
             if hasattr(module, 'lambda_2'):
                 module.lambda_2.data.fill_(0.5)
+            if hasattr(module, 'mea_key_mix') and module.mea_key_mix is not None:
+                module.mea_key_mix.data.copy_(
+                    build_mea_reconstruction_matrix(
+                        module.mea_key_mix.shape[0],
+                        module.mea_key_mix.shape[1],
+                    ).to(device=module.mea_key_mix.device, dtype=module.mea_key_mix.dtype)
+                )
+            if hasattr(module, 'mea_value_mix') and module.mea_value_mix is not None:
+                module.mea_value_mix.data.copy_(
+                    build_mea_reconstruction_matrix(
+                        module.mea_value_mix.shape[0],
+                        module.mea_value_mix.shape[1],
+                    ).to(device=module.mea_value_mix.device, dtype=module.mea_value_mix.dtype)
+                )

         elif isinstance(module, GPAS):
+            # Initialize GPAS alpha to 0, as in the paper: training starts with
+            # no activation scaling and the model learns the scale gradually.
             module.alpha.data.fill_(0.0)

+        elif isinstance(module, FANLayer):
+            # FANLayer initialization is handled in the class __init__
+            # (normal initialization with std=0.02 for the weights).
+            pass
+
+        elif isinstance(module, SeeDNorm):
+            # SeeDNorm parameters are already initialized correctly in __init__:
+            # gamma (γ) starts at 1 (static scaling, as in RMSNorm),
+            # beta (β) starts at 0 (self-rescaling disabled at first),
+            # alpha (α) starts at 1 (dynamic modulation at full strength).
+            pass
+
         elif isinstance(module, (ScalarMultiplier, VectorMultiplier)):
+            # Learnable Multipliers: initialize to 1.0 (identity) so the model
+            # starts from standard behavior and learns scale adaptations from
+            # the data without an initial bias.
             if hasattr(module, 'multiplier'):
                 module.multiplier.data.fill_(1.0)
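A minimal check of the identity-initialization idea, with a hypothetical stand-in for ScalarMultiplier (one learnable scalar multiplying its input):

import torch
import torch.nn as nn

class ToyScalarMultiplier(nn.Module):
    """Hypothetical stand-in: scales its input by one learnable scalar."""
    def __init__(self):
        super().__init__()
        self.multiplier = nn.Parameter(torch.empty(1))

    def forward(self, x):
        return x * self.multiplier

m = ToyScalarMultiplier()
m.multiplier.data.fill_(1.0)   # identity at init, as in _init_weights above
x = torch.randn(3, 4)
assert torch.equal(m(x), x)    # a no-op until training moves the scale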

 class NeoLLMModel(NeoLLMPreTrainedModel):
     """
     NeoLLM base model with transformer decoder architecture.

     Note on embeddings and weight tying: This model uses weight tying between
     embed_tokens and lm_head (shared weights). Following the "Learnable Multipliers"
     paper's analysis, we do NOT add multipliers to the embeddings, because:

+    1. Weight tying creates conflicting gradient paths: a multiplier would scale
+       gradients from the embedding lookup but not from the lm_head projection,
+       so it would receive incomplete optimization signals.
+
+    2. The paper explicitly warns against a multiplier on lm_head (it creates a
+       shortcut for learning the marginal token distribution), and with weight
+       tying this restriction propagates to the embeddings.
+
+    3. Compensating mechanisms provide scale adaptation immediately after embedding:
+       - First-layer attention has multipliers in its Q/O projections
+       - FANformer transforms the representation space
+       - SeeDNorm provides input-dependent dynamic scaling
+       - ResFormer propagates first-layer features with learnable scaling
     """

     def __init__(self, config: NeoLLMConfig):
         super().__init__(config)

         # Standard embedding without learnable multipliers.
+        # Due to weight tying with lm_head, multipliers would create
+        # conflicting optimization dynamics (see the class docstring).
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)

         # Each layer creates its own components (no shared parameters)
         ...
         self.rotary_emb = NeoLLMRotaryEmbedding(config=config)
         self.gradient_checkpointing = False

+        # ResFormer: storage for the first layer's FAN features (H_fan_1)
         self.first_layer_fan = None

         # Initialize weights and apply final processing
         ...

     def forward(
         self,
         ...
         output_hidden_states: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> BaseModelOutputWithPast:
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         ...
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

         if inputs_embeds is None:
+            # Standard embedding lookup without multipliers.
+            # Scale adaptation occurs in subsequent layers via:
+            # (1) first-layer attention multipliers, (2) the FANformer transformation,
+            # (3) SeeDNorm dynamic scaling, (4) ResFormer feature propagation.
             inputs_embeds = self.embed_tokens(input_ids)

         if position_ids is None:
             ...
         )

         hidden_states = inputs_embeds
         all_hidden_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None

+        # Create position embeddings to be shared across the decoder layers
         position_embeddings = self.rotary_emb(hidden_states, position_ids)

+        # ResFormer: reset first_layer_fan at the start of each forward pass
         self.first_layer_fan = None
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)

             layer_outputs = decoder_layer(
                 hidden_states,
                 position_embeddings=position_embeddings,
                 attention_mask=causal_mask,
+                first_layer_fan=self.first_layer_fan,  # Pass H_fan_1 to all layers
                 output_attentions=output_attentions,
                 **kwargs,
             )
             hidden_states = layer_outputs[0]

             if output_attentions:
                 all_attentions = all_attentions + (layer_outputs[1],)

             # ResFormer: capture H_fan_1 from the first layer
             if self.first_layer_fan is None and hasattr(decoder_layer, 'current_layer_fan'):
                 self.first_layer_fan = decoder_layer.current_layer_fan
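The capture-and-propagate pattern reduces to the loop below: the first iteration runs without first-layer features, its FAN output is cached once, and every later layer receives it. `toy_layer` is a made-up stand-in for NeoLLMDecoderLayer:

import torch

def toy_layer(x, first_layer_fan=None):
    fan = torch.sin(x)                  # stand-in for the FAN features
    if first_layer_fan is not None:
        fan = 0.5 * first_layer_fan + 0.5 * fan
    return x + fan, fan                 # (hidden_states, current_layer_fan)

hidden = torch.randn(2, 3, 4)
first_fan = None
for _ in range(4):                      # four toy "layers"
    hidden, fan = toy_layer(hidden, first_layer_fan=first_fan)
    if first_fan is None:               # cache H_fan_1 from layer 0 only
        first_fan = fan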

         ...
         if output_hidden_states:
             all_hidden_states = all_hidden_states + (hidden_states,)

         if not return_dict:
+            return tuple(v for v in [hidden_states, None, all_hidden_states, all_attentions] if v is not None)

         return BaseModelOutputWithPast(
             last_hidden_state=hidden_states,
+            past_key_values=None,
             hidden_states=all_hidden_states,
             attentions=all_attentions,
         )

 ...

         self.post_init()

 ...

     def forward(
         self,
         ...
         logits_to_keep: Union[int, torch.Tensor] = 0,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> CausalLMOutputWithPast:
         outputs: BaseModelOutputWithPast = self.model(
             ...
             inputs_embeds=inputs_embeds,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
             **kwargs,
         )

         ...
         return CausalLMOutputWithPast(
             loss=loss,
             logits=logits,
+            past_key_values=None,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )

 ...
     "ScalarMultiplier",
     "VectorMultiplier",
     "LinearWithMultipliers",
+    "MEAHeadRMSNorm",
 ]

 # Register the configuration and model for AutoClass support
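The diff ends at this comment, so the registration calls themselves are not shown. Given the AutoConfig/AutoModel/AutoModelForCausalLM imports at the top of the file, the standard pattern would look like the sketch below; the "neollm" model-type string and the NeoLLMForCausalLM class name are assumptions:

# Sketch only: the actual lines follow the comment above but are cut off here.
AutoConfig.register("neollm", NeoLLMConfig)                      # assumed model type
AutoModel.register(NeoLLMConfig, NeoLLMModel)
AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)   # assumed class name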
|