Update modeling_neollm.py
modeling_neollm.py  (+313 -95)  CHANGED
@@ -2,7 +2,9 @@
 """
 NeoLLM Model with FANformer Integration in both Attention and FFN, Dropout Regularization,
 SeeDNorm (Self-Rescaled Dynamic Normalization), ResFormer Value Residual Learning,
-
+Learnable Multipliers for enhanced scale adaptation and information flow through deep layers,
+and StackMemory for hierarchical pattern modeling.
+
 Updated to include:
 - Fourier Analysis Network (FAN) layer for effective periodicity modeling in attention (relational space)
 - FAN layer in FFN for featural periodicity modeling (complementary coverage)
@@ -10,16 +12,20 @@ Updated to include:
 - Dropout regularization at strategic locations
 - ResFormer: Feature residual connections from first layer (applied before projections)
 - Learnable Multipliers: Frees weight matrix scale from WD-noise equilibrium for data-adaptive scaling
+- StackMemory: Differentiable hidden state stack for modeling Chomsky hierarchy grammars
 - Full Attention only (linear attention removed)
 """

 import math
-from typing import Any, Callable, Optional, Union, Tuple
+from typing import Any, Callable, Optional, Union, Tuple, List

 import torch
 import torch.nn.functional as F
 from torch import nn
 from cut_cross_entropy import linear_cross_entropy
+import torch.nn.functional as F
+from torch.utils.checkpoint import checkpoint
+from typing import Optional, Tuple

 from transformers.activations import ACT2FN
 from transformers.generation import GenerationMixin
@@ -296,7 +302,7 @@ class SeeDNorm(nn.Module):
             Normalized and dynamically scaled tensor of same shape
         """

-        x_for_dynamic = F.dropout(x, p=self.dropout_input)
+        x_for_dynamic = F.dropout(x, p=self.dropout_input, training=self.training)
         rescale_factor = torch.tanh(torch.sum(x_for_dynamic * self.beta,
                                               dim=-1, keepdim=True))

@@ -306,7 +312,7 @@ class SeeDNorm(nn.Module):
         # Apply RMS normalization on ORIGINAL input (not dropped version)
         x_normalized = self._rms_norm(x.float())

-        x_normalized = F.dropout(x_normalized, p=self.dropout_hidden)
+        x_normalized = F.dropout(x_normalized, p=self.dropout_hidden, training=self.training)

         # Apply dynamic scaling
         output = x_normalized * dynamic_scale.float()
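The `training=self.training` flag is the substance of these two SeeDNorm changes: the functional
F.dropout defaults to training=True, so the previous code kept dropping activations even in eval
mode. A minimal standalone sketch (not part of this file) of the difference:

    import torch
    import torch.nn.functional as F

    x = torch.ones(4)
    y_old = F.dropout(x, p=0.5)                  # training defaults to True: still drops at inference
    y_new = F.dropout(x, p=0.5, training=False)  # what training=self.training passes in eval mode
    assert torch.equal(y_new, x)                 # dropout becomes a no-op once training is False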
@@ -317,6 +323,189 @@ class SeeDNorm(nn.Module):
         return (f"dim={self.dim}, eps={self.eps}, "
                 f"dropout_input={self.dropout_input}, dropout_hidden={self.dropout_hidden}")

+
+# ==================== STACK MEMORY MODULE ====================
+
+class StackMemory(nn.Module):
+    """
+    Differentiable Hidden State Stack for modeling Chomsky hierarchy grammars.
+
+    From "Improving Formal Reasoning of Transformer with State Stack":
+    Implements a multi-head differentiable stack with soft push, pop, and no-op operations.
+    Each head maintains its own stack and mask, which are updated based on learned action
+    probabilities. Global reading is performed via query-over-stack attention.
+
+    This module is inserted between Transformer layers to augment information flow with
+    stack-like memory operations, enabling the model to better capture hierarchical and
+    recursive patterns characteristic of regular expressions and context-free grammars.
+
+    Note: StackMemory uses standard nn.Linear to maintain architectural
+    independence and avoid introducing additional complexity in the memory operations.
+
+    Args:
+        config: Model configuration containing stack-related hyperparameters
+    """
+
+    def __init__(self, config: NeoLLMConfig):
+        super().__init__()
+        self.config = config
+        self.num_stack_heads = getattr(config, 'num_stack_heads', 4)
+        self.stack_slots = getattr(config, 'stack_slots', 24)
+        self.stack_d_model = getattr(config, 'stack_d_model', 128)
+
+        self.head_dim = self.stack_d_model // self.num_stack_heads
+
+        # Dimension reduction projections for efficiency
+        # Uses standard nn.Linear
+        self.down_proj = nn.Linear(config.hidden_size, self.stack_d_model, bias=False)
+        self.up_proj = nn.Linear(self.stack_d_model, config.hidden_size, bias=False)
+
+        # Action prediction: generates push/pop/no-op probabilities for each head
+        self.action_head = nn.Linear(self.stack_d_model, 3 * self.num_stack_heads, bias=True)
+
+        # Query projection for global reading (one per head)
+        self.gate_proj = nn.Linear(self.head_dim, 1, bias=False)
+
+        # Residual weight for gating stack contribution
+        self.res_weight = nn.Parameter(torch.ones(1))
+
+    def _vectorized_update(
+        self,
+        stack: torch.Tensor,
+        mask: torch.Tensor,
+        actions: torch.Tensor,
+        k_values: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Vectorized stack update mechanism applying soft push/pop/no-op operations.
+
+        Implements the differentiable stack operations from the paper:
+        - Push: shifts all elements down and places k_values at top
+        - Pop: shifts all elements up and removes top
+        - No-op: maintains current stack state
+
+        Args:
+            stack: Current stack state [batch, seq, num_heads, stack_slots, head_dim]
+            mask: Current stack mask [batch, seq, num_heads, stack_slots]
+            actions: Action probabilities [batch, seq, num_heads, 3] (push/pop/no-op)
+            k_values: New values to push [batch, seq, num_heads, head_dim]
+
+        Returns:
+            Tuple of (updated_stack, updated_mask)
+        """
+        batch_size, seq_len = actions.shape[:2]
+
+        # Expand stack and mask along sequence dimension for parallel processing
+        stack = stack.unsqueeze(1).expand(-1, seq_len, -1, -1, -1)
+        mask = mask.unsqueeze(1).expand(-1, seq_len, -1, -1)
+
+        # Generate pushed stack: new value at top, shift others down
+        push_stack = torch.cat([
+            k_values.unsqueeze(3),  # New value at position 0
+            stack[:, :, :, :-1]  # Shift existing elements down
+        ], dim=3)
+        push_mask = torch.cat([
+            torch.ones_like(mask[:, :, :, :1]),
+            mask[:, :, :, :-1]
+        ], dim=3)
+
+        # Generate popped stack: shift all up, zero at bottom
+        pop_stack = torch.cat([
+            stack[:, :, :, 1:],
+            torch.zeros_like(stack[:, :, :, :1])
+        ], dim=3)
+        pop_mask = torch.cat([
+            mask[:, :, :, 1:],
+            torch.zeros_like(mask[:, :, :, :1])
+        ], dim=3)
+
+        # Combine operations weighted by action probabilities
+        action_weights = actions.unsqueeze(-1).unsqueeze(-1)  # [batch, seq, heads, 3, 1, 1]
+        stacks = torch.stack([push_stack, pop_stack, stack], dim=3)  # [batch, seq, heads, 3, slots, dim]
+        masks = torch.stack([push_mask, pop_mask, mask], dim=3)  # [batch, seq, heads, 3, slots]
+
+        # Weighted combination of all operations
+        new_stack = (stacks * action_weights).sum(dim=3)
+        new_mask = (masks * action_weights.squeeze(-1)).sum(dim=3)
+
+        return new_stack, new_mask
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        stack: Optional[torch.Tensor] = None,
+        mask: Optional[torch.Tensor] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Apply differentiable stack operations to hidden states.
+
+        Args:
+            hidden_states: Input hidden states [batch, seq, hidden_size]
+            stack: Previous stack state [batch, num_heads, stack_slots, head_dim] or None
+            mask: Previous stack mask [batch, num_heads, stack_slots] or None
+
+        Returns:
+            Tuple of (output_hidden_states, updated_stack, updated_mask)
+        """
+        batch_size, seq_len, _ = hidden_states.shape
+        device = hidden_states.device
+
+        # Initialize stack and mask if not provided
+        if stack is None:
+            stack = torch.zeros(
+                batch_size, self.num_stack_heads, self.stack_slots, self.head_dim,
+                device=device, dtype=hidden_states.dtype
+            )
+        if mask is None:
+            mask = torch.zeros(
+                batch_size, self.num_stack_heads, self.stack_slots,
+                device=device, dtype=hidden_states.dtype
+            )
+
+        # Project to lower dimension for efficiency
+        new_hidden_states = self.down_proj(hidden_states)
+
+        # Generate action probabilities: [batch, seq, num_heads, 3]
+        action_logits = self.action_head(new_hidden_states) / math.sqrt(self.head_dim)
+        actions = F.softmax(
+            action_logits.view(batch_size, seq_len, self.num_stack_heads, 3),
+            dim=-1
+        )

+        # Prepare values to push (split into heads)
+        k_values = new_hidden_states.view(batch_size, seq_len, self.num_stack_heads, self.head_dim)
+
+        # Update stack and mask using vectorized operations
+        new_stack, new_mask = self._vectorized_update(stack, mask, actions, k_values)
+
+        # Global reading via query-over-stack attention
+        # Apply mask before attention computation
+        masked_stack = new_stack * new_mask.unsqueeze(-1)
+
+        # Compute attention scores for each head
+        gate_scores = self.gate_proj(masked_stack).squeeze(-1)  # [batch, seq, heads, slots]
+
+        # Mask out invalid positions (add large negative value)
+        gate_scores = gate_scores + (1 - new_mask) * -1e9
+
+        # Softmax to get attention weights
+        gate_weights = F.softmax(gate_scores, dim=-1)
+
+        # Weighted sum over stack slots
+        memory_output = (new_stack * gate_weights.unsqueeze(-1)).sum(dim=3)
+        memory_output = memory_output.view(batch_size, seq_len, -1)
+
+        # Project back to original dimension
+        memory_output = self.up_proj(memory_output)
+
+        # Gated residual connection
+        output = memory_output * self.res_weight + hidden_states
+
+        # Return output and updated stack state (use last timestep's state)
+        return output, new_stack[:, -1], new_mask[:, -1]
+
+# ==================== ROTARY EMBEDDING ====================
+
 class NeoLLMRotaryEmbedding(nn.Module):
     inv_freq: torch.Tensor  # fix linting for `register_buffer`

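For orientation, a minimal standalone sketch of how the new module is driven (hypothetical, not
part of the diff): the config object below is a stand-in that only carries the fields StackMemory
reads via getattr, and the returned state is meant to be threaded back in on the next call, as
NeoLLMModel.forward does across decoder layers.

    import torch
    from types import SimpleNamespace

    cfg = SimpleNamespace(hidden_size=512, num_stack_heads=4, stack_slots=24, stack_d_model=128)
    stack_memory = StackMemory(cfg)

    hidden = torch.randn(2, 16, cfg.hidden_size)   # [batch, seq, hidden_size]
    out, stack, mask = stack_memory(hidden)        # first call: zero-initialized stack and mask
    # out:   [2, 16, 512]   (same shape as the input, with the gated residual added)
    # stack: [2, 4, 24, 32] (batch, num_stack_heads, stack_slots, head_dim)
    # mask:  [2, 4, 24]
    out2, stack2, mask2 = stack_memory(hidden, stack, mask)   # later layers reuse the state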
@@ -424,7 +613,7 @@ class NeoLLMAttention(nn.Module):
     ResFormer feature residual connections, and Learnable Multipliers for enhanced
     information flow and scale adaptation.

-    ResFormer enhancement: Applies learnable feature residual connections from
+    ResFormer enhancement: Applies learnable feature residual connections from first layer
     BEFORE QKV projections: H'_fan_n = λ_1 * H_fan_1 + λ_2 * H_fan_n
 
     Learnable Multipliers placement (from "Learnable Multipliers" paper Appendix C):
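The feature residual described here reduces to a λ-weighted sum over FAN outputs, computed before
the Q/K/V projections. A standalone sketch of the formula with the 0.5 initial values used below
(illustrative tensors only):

    import torch

    lambda_1 = torch.tensor(0.5)       # weight for H_fan_1 (first layer's FAN features)
    lambda_2 = torch.tensor(0.5)       # weight for H_fan_n (current layer's FAN features)
    h_fan_1 = torch.randn(2, 16, 512)  # captured once from layer 1
    h_fan_n = torch.randn(2, 16, 512)  # current layer's fan_layer output

    # H'_fan_n = λ_1 * H_fan_1 + λ_2 * H_fan_n, applied before the QKV projections
    h_fan_mixed = lambda_1 * h_fan_1 + lambda_2 * h_fan_n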
@@ -486,33 +675,43 @@ class NeoLLMAttention(nn.Module):
         self.dropout = nn.Dropout(config.dropout_rate)

         # ResFormer: learnable feature residual parameters (initialized to 0.5)
-        self.lambda_1 = nn.Parameter(torch.tensor(0.5))  # Weight for H_fan_1
-        self.lambda_2 = nn.Parameter(torch.tensor(0.5))  # Weight for H_fan_n
+        self.lambda_1 = nn.Parameter(torch.tensor(0.5))  # Weight for H_fan_1
+        self.lambda_2 = nn.Parameter(torch.tensor(0.5))  # Weight for H_fan_n

     def forward(
         self,
         hidden_states: torch.Tensor,
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
-        attention_mask: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
         first_layer_fan: Optional[torch.Tensor] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+        """
+        Forward pass with ResFormer feature residual connections.
+
+        Args:
+            hidden_states: Current layer input [batch, seq, hidden_size]
+            position_embeddings: Tuple of (cos, sin) for RoPE
+            attention_mask: Causal attention mask
+            first_layer_fan: First layer FAN features (for ResFormer)
+
+        Returns:
+            Tuple of (attn_output, attn_weights, current_layer_fan)
+        """
         input_shape = hidden_states.shape[:-1]

-        # Apply FANformer transformation
+        # Apply FANformer transformation
         hidden_states_fan = self.fan_layer(hidden_states)

         # ResFormer: Apply feature residual connection BEFORE projections
-        # This ensures dimensional compatibility across all layer types
         if first_layer_fan is not None:
             hidden_states_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * hidden_states_fan

-        # Store current FAN features for
+        # Store current FAN features for ResFormer
         current_layer_fan = hidden_states_fan.clone()

         hidden_shape = (*input_shape, -1, self.head_dim)

-        # Use FAN-transformed features (with residual applied) for projections
         # Q projection with learnable row multipliers
         query_states, gate = torch.chunk(
             self.q_proj(hidden_states_fan).view(*input_shape, -1, self.head_dim * 2), 2, dim=-1
@@ -633,17 +832,19 @@ class NeoLLMMLP(nn.Module):
         hidden = self.dropout(hidden)
         return self.down_proj(hidden)

+
 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     """
-    Decoder layer with standard residual connections.
+    Decoder layer with standard residual connections and optional StackMemory.

-
-    1. Pre-norm (SeeDNorm) → LNS scaling → Self-Attention
-    2. Standard Residual Connection
+    Architecture:
+    1. Pre-norm (SeeDNorm) → LNS scaling → Self-Attention with ResFormer and Learnable Multipliers
+    2. Standard Residual Connection
     3. GPAS activation scaling
-    4. Pre-norm (SeeDNorm) → LNS scaling → MLP
-    5. Standard Residual Connection
+    4. Pre-norm (SeeDNorm) → LNS scaling → MLP with FANformer and Learnable Multipliers
+    5. Standard Residual Connection
     6. GPAS activation scaling
+    7. Optional: StackMemory module
     """

     def __init__(self, config: NeoLLMConfig, layer_idx: int):
@@ -657,7 +858,7 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         # MLP with FANformer integration and learnable multipliers
         self.mlp = NeoLLMMLP(config)

-        # SeeDNorm for input and post-attention normalization
+        # SeeDNorm for input and post-attention normalization
         self.input_layernorm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)

@@ -665,10 +866,15 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         self.lns_attn = LNS(layer_idx)
         self.lns_mlp = LNS(layer_idx)

-        # GPAS (Gradient-Preserving Activation Scaling)
+        # GPAS (Gradient-Preserving Activation Scaling)
         self.gpas_attn = GPAS(config.hidden_size)
         self.gpas_mlp = GPAS(config.hidden_size)

+        # StackMemory: Differentiable hidden state stack
+        self.use_stack = getattr(config, 'use_stack', False)
+        if self.use_stack:
+            self.stack_memory = StackMemory(config)
+
         # ResFormer: storage for current layer's FAN features
         self.current_layer_fan = None

@@ -678,11 +884,28 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
         attention_mask: Optional[torch.Tensor] = None,
         first_layer_fan: Optional[torch.Tensor] = None,
+        stack_state: Optional[torch.Tensor] = None,
+        stack_mask: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = False,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
+    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
+        """
+        Forward pass with ResFormer and optional StackMemory.
+
+        Args:
+            hidden_states: Current layer input [batch, seq, hidden_size]
+            position_embeddings: Tuple of (cos, sin) for RoPE
+            attention_mask: Causal attention mask
+            first_layer_fan: First layer FAN features (for ResFormer)
+            stack_state: StackMemory state (optional)
+            stack_mask: StackMemory mask (optional)
+            output_attentions: Whether to return attention weights
+
+        Returns:
+            Tuple of (hidden_states, attn_weights, stack_state, stack_mask)
+        """
         # ============================================================
-        # Attention Block with
+        # Attention Block with Standard Residual Connection
         # ============================================================
         residual = hidden_states

@@ -692,24 +915,23 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         # Apply LNS scaling after normalization
         hidden_states = self.lns_attn(hidden_states)

-        # Self Attention with ResFormer
-
-        hidden_states, attn_weights, self.current_layer_fan = self.self_attn(
+        # Self Attention with ResFormer
+        attn_output, attn_weights, self.current_layer_fan = self.self_attn(
             hidden_states=hidden_states,
-            attention_mask=attention_mask,
             position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
             first_layer_fan=first_layer_fan,
             **kwargs,
         )

-        # Standard
-        hidden_states = residual +
+        # Standard Residual Connection
+        hidden_states = residual + attn_output

-        # Apply GPAS after
+        # Apply GPAS after residual connection
         hidden_states = self.gpas_attn(hidden_states)

         # ============================================================
-        # MLP Block with
+        # MLP Block with Standard Residual Connection
         # ============================================================
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
@@ -717,20 +939,27 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         # Apply LNS scaling after normalization
         hidden_states = self.lns_mlp(hidden_states)

-        # MLP
-
+        # MLP with FANformer
+        mlp_output = self.mlp(hidden_states)

-        # Standard
-        hidden_states = residual +
+        # Standard Residual Connection
+        hidden_states = residual + mlp_output

-        # Apply GPAS after
+        # Apply GPAS after residual connection
         hidden_states = self.gpas_mlp(hidden_states)

-
-
-
+        # ============================================================
+        # Stack Memory Module
+        # ============================================================
+        if self.use_stack:
+            hidden_states, stack_state, stack_mask = self.stack_memory(
+                hidden_states, stack_state, stack_mask
+            )

-
+        if self.use_stack:
+            return (hidden_states, attn_weights, stack_state, stack_mask)
+        else:
+            return (hidden_states, attn_weights, None, None)


 class NeoLLMPreTrainedModel(PreTrainedModel):
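Because the decoder layer and model read the new stack hyperparameters through getattr() with
defaults (use_stack=False, num_stack_heads=4, stack_slots=24, stack_d_model=128), existing configs
keep working unchanged. A hypothetical sketch of enabling the stack path, assuming NeoLLMConfig
follows the usual PretrainedConfig pattern of accepting extra keyword arguments:

    config = NeoLLMConfig(
        use_stack=True,        # turn on the StackMemory block at the end of each decoder layer
        num_stack_heads=4,     # independent differentiable stacks per layer
        stack_slots=24,        # depth of each stack
        stack_d_model=128,     # reduced width used inside StackMemory
    )
    model = NeoLLMForCausalLM(config)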
@@ -743,6 +972,7 @@ class NeoLLMPreTrainedModel(PreTrainedModel):
     - FANLayer (Fourier Analysis Network)
     - SeeDNorm (Self-Rescaled Dynamic Normalization)
     - Learnable Multipliers (ScalarMultiplier, VectorMultiplier)
+    - StackMemory (Differentiable Hidden State Stack)
     """
     config: NeoLLMConfig
     base_model_prefix = "model"
@@ -755,76 +985,58 @@ class NeoLLMPreTrainedModel(PreTrainedModel):
     def _init_weights(self, module):
         """
         Initialize weights for all custom modules in NeoLLM.
-
-        Strategy:
-        - Standard layers (Linear, Embedding): handled by parent class
-        - Custom modules: specialized initialization per component
-        - Learnable Multipliers: initialized to 1.0 for identity transformation
         """
         super()._init_weights(module)

         if isinstance(module, NeoLLMAttention):
-            # ResFormer: initialize lambda parameters for full attention
-            # Lambda values control the interpolation between first layer and current layer features
-            # Starting at 0.5 provides balanced contribution from both sources
             if hasattr(module, 'lambda_1'):
                 module.lambda_1.data.fill_(0.5)
             if hasattr(module, 'lambda_2'):
                 module.lambda_2.data.fill_(0.5)

         elif isinstance(module, GPAS):
-            # Initialize GPAS alpha to 0 as per paper
-            # This starts with no activation scaling, allowing the model to learn gradually
             module.alpha.data.fill_(0.0)

-        elif isinstance(module, FANLayer):
-            # FANLayer initialization is handled within the class __init__
-            # Uses normal initialization with std=0.02 for weights
-            pass
-
-        elif isinstance(module, SeeDNorm):
-            # SeeDNorm initialization (parameters already initialized correctly in __init__):
-            # gamma (γ) initialized to 1 (static scaling component, like RMSNorm)
-            # beta (β) initialized to 0 (self-rescaling starts disabled)
-            # alpha (α) initialized to 1 (dynamic modulation at full strength)
-            pass
-
         elif isinstance(module, (ScalarMultiplier, VectorMultiplier)):
-            # Learnable Multipliers: initialize to 1.0 for identity transformation
-            # This allows the model to start from the standard behavior and learn
-            # scale adaptations from data without initial bias
             if hasattr(module, 'multiplier'):
                 module.multiplier.data.fill_(1.0)
+
+        elif isinstance(module, StackMemory):
+            std = self.config.initializer_range if hasattr(self.config, 'initializer_range') else 0.02
+            if hasattr(module, 'down_proj'):
+                module.down_proj.weight.data.normal_(mean=0.0, std=std)
+            if hasattr(module, 'up_proj'):
+                module.up_proj.weight.data.normal_(mean=0.0, std=std)
+            if hasattr(module, 'action_head'):
+                module.action_head.weight.data.normal_(mean=0.0, std=std)
+                if module.action_head.bias is not None:
+                    module.action_head.bias.data.zero_()
+            if hasattr(module, 'gate_proj'):
+                module.gate_proj.weight.data.normal_(mean=0.0, std=std)
+            if hasattr(module, 'res_weight'):
+                module.res_weight.data.fill_(1.0)
+

 class NeoLLMModel(NeoLLMPreTrainedModel):
     """
     NeoLLM base model with transformer decoder architecture.

+    Uses ResFormer for first-layer feature propagation with standard residual connections
+    and optional StackMemory for hierarchical pattern modeling.
+
     Note on embeddings and weight tying: This model uses weight tying between
     embed_tokens and lm_head (shared weights). Following "Learnable Multipliers"
     paper analysis, we do NOT add multipliers to embeddings because:

-    1. Weight tying creates conflicting gradient paths
-
-
-
-    2. The paper explicitly warns against multipliers in lm_head (creates shortcuts
-       for learning marginal token distribution), and with weight tying this
-       restriction propagates to embeddings.
-
-    3. Compensating mechanisms provide scale adaptation immediately after embedding:
-       - First layer attention has multipliers in Q/O projections
-       - FANformer transforms the representation space
-       - SeeDNorm provides input-dependent dynamic scaling
-       - ResFormer propagates first-layer features with learnable scaling
+    1. Weight tying creates conflicting gradient paths
+    2. The paper explicitly warns against multipliers in lm_head
+    3. Compensating mechanisms provide scale adaptation immediately after embedding
     """

     def __init__(self, config: NeoLLMConfig):
         super().__init__(config)

         # Standard embedding without learnable multipliers
-        # Due to weight tying with lm_head, multipliers would create
-        # conflicting optimization dynamics (see class docstring)
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)

         # Each layer creates its own components (no shared parameters)
@@ -837,7 +1049,10 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         self.rotary_emb = NeoLLMRotaryEmbedding(config=config)
         self.gradient_checkpointing = False

-        #
+        # Configuration
+        self.use_stack = getattr(config, 'use_stack', False)
+
+        # ResFormer: storage for first layer's FAN features
         self.first_layer_fan = None

         # Initialize weights and apply final processing
@@ -868,10 +1083,6 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

         if inputs_embeds is None:
-            # Standard embedding lookup without multipliers
-            # Scale adaptation occurs in subsequent layers via:
-            # (1) First layer attention multipliers, (2) FANformer transformation,
-            # (3) SeeDNorm dynamic scaling, (4) ResFormer feature propagation
             inputs_embeds = self.embed_tokens(input_ids)

         if position_ids is None:
@@ -890,13 +1101,15 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         all_hidden_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None

-        #
+        # Create position embeddings to be shared across the decoder layers
         position_embeddings = self.rotary_emb(hidden_states, position_ids)

-        # ResFormer
+        # ResFormer with first-layer feature propagation
         self.first_layer_fan = None
-
-
+        stack_state = None
+        stack_mask = None
+
+        for decoder_layer in self.layers:
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)

@@ -904,7 +1117,9 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
                 hidden_states,
                 position_embeddings=position_embeddings,
                 attention_mask=causal_mask,
-                first_layer_fan=self.first_layer_fan,
+                first_layer_fan=self.first_layer_fan,
+                stack_state=stack_state,
+                stack_mask=stack_mask,
                 output_attentions=output_attentions,
                 **kwargs,
             )
@@ -914,6 +1129,10 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             if output_attentions:
                 all_attentions = all_attentions + (layer_outputs[1],)

+            if self.use_stack:
+                stack_state = layer_outputs[2]
+                stack_mask = layer_outputs[3]
+
             # ResFormer: capture H_fan_1 from the first layer
             if self.first_layer_fan is None and hasattr(decoder_layer, 'current_layer_fan'):
                 self.first_layer_fan = decoder_layer.current_layer_fan
@@ -967,11 +1186,10 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
     """
     Causal Language Model with NeoLLM architecture.

+    Supports ResFormer with standard residuals and optional StackMemory.
+
     Note on LM head: Following "Learnable Multipliers" paper recommendations,
-    the output projection (lm_head) does NOT include learnable multipliers
-    1. The preceding RMSNorm (self.model.norm) already acts as column multipliers
-    2. Adding row multipliers to lm_head can create shortcuts where the model
-       learns marginal token distribution without updating internal features
+    the output projection (lm_head) does NOT include learnable multipliers.
     """
     _tied_weights_keys = ["lm_head.weight"]

@@ -981,7 +1199,6 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
         self.vocab_size = config.vocab_size

         # LM head without learnable multipliers (standard linear layer)
-        # Preceding norm layer provides sufficient scale adaptation
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

         self.post_init()
@@ -1046,6 +1263,7 @@ __all__ = [
     "ScalarMultiplier",
     "VectorMultiplier",
     "LinearWithMultipliers",
+    "StackMemory",
 ]

 # Register the configuration and model for AutoClass support