KitsuVp
/

NeoLLM

@@ -4,7 +4,6 @@ NeoLLM Model with FANformer Integration in both Attention and FFN, Dropout Regul
 SeeDNorm (Self-Rescaled Dynamic Normalization), ResFormer Value Residual Learning,
 Learnable Multipliers for enhanced scale adaptation and information flow through deep layers,
 and StackMemory for hierarchical pattern modeling.
 Updated to include:
 - Fourier Analysis Network (FAN) layer for effective periodicity modeling in attention (relational space)
 - FAN layer in FFN for featural periodicity modeling (complementary coverage)
@@ -260,7 +259,6 @@ class SeeDNorm(nn.Module):
     Self-Rescaled Dynamic Normalization (SeeDNorm) with dual dropout regularization.
     SeeDNorm(x) = [σ(x·β^T)·α + γ] ⊙ x/RMS(x)
     Args:
         dim: Hidden dimension size
@@ -325,7 +323,6 @@ class SeeDNorm(nn.Module):
 # ==================== STACK MEMORY MODULE ====================
 class StackMemory(nn.Module):
     """
     Differentiable Hidden State Stack for modeling Chomsky hierarchy grammars.
@@ -364,7 +361,7 @@ class StackMemory(nn.Module):
         self.action_head = nn.Linear(self.stack_d_model, 3 * self.num_stack_heads, bias=True)
         # Gate projection for global reading: maps each slot's head_dim features to one score (weights shared across heads)
-        self.gate_proj = nn.Linear(self.head_dim, 1, bias=False)
         # Residual weight for gating stack contribution
         self.res_weight = nn.Parameter(torch.ones(1))
@@ -479,19 +476,21 @@ class StackMemory(nn.Module):
         new_stack, new_mask = self._vectorized_update(stack, mask, actions, k_values)
         # Global reading via query-over-stack attention
-        # Apply mask before attention computation
-        masked_stack = new_stack * new_mask.unsqueeze(-1)
-        # Compute attention scores for each head
-        gate_scores = self.gate_proj(masked_stack).squeeze(-1)  # [batch, seq, heads, slots]
-        # Mask out invalid positions (add large negative value)
         gate_scores = gate_scores + (1 - new_mask) * -1e9
         # Softmax to get attention weights
         gate_weights = F.softmax(gate_scores, dim=-1)
         # Weighted sum over stack slots
         memory_output = (new_stack * gate_weights.unsqueeze(-1)).sum(dim=3)
         memory_output = memory_output.view(batch_size, seq_len, -1)
@@ -882,19 +881,18 @@ class NeoLLMMLP(nn.Module):
         hidden = self.dropout(hidden)
         return self.down_proj(hidden)
 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     """
     Decoder layer with standard residual connections and optional StackMemory.
-    Architecture:
-    1. Pre-norm (SeeDNorm) → LNS scaling → Self-Attention with ResFormer and Learnable Multipliers
-    2. Standard Residual Connection
-    3. GPAS activation scaling
-    4. Pre-norm (SeeDNorm) → LNS scaling → MLP with FANformer and Learnable Multipliers
-    5. Standard Residual Connection
-    6. GPAS activation scaling
-    7. Optional: StackMemory module
     """
     def __init__(self, config: NeoLLMConfig, layer_idx: int):
@@ -954,8 +952,19 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         Returns:
             Tuple of (hidden_states, attn_weights, stack_state, stack_mask)
         """
         # ============================================================
-        # Attention Block with Standard Residual Connection
         # ============================================================
         residual = hidden_states
@@ -981,7 +990,7 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         hidden_states = self.gpas_attn(hidden_states)
         # ============================================================
-        # MLP Block with Standard Residual Connection
         # ============================================================
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
@@ -998,14 +1007,7 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         # Apply GPAS after residual connection
         hidden_states = self.gpas_mlp(hidden_states)
-        # ============================================================
-        # Stack Memory Module
-        # ============================================================
-        if self.use_stack:
-            hidden_states, stack_state, stack_mask = self.stack_memory(
-                hidden_states, stack_state, stack_mask
-            )
         if self.use_stack:
             return (hidden_states, attn_weights, stack_state, stack_mask)
         else:

 SeeDNorm (Self-Rescaled Dynamic Normalization), ResFormer Value Residual Learning,
 Learnable Multipliers for enhanced scale adaptation and information flow through deep layers,
 and StackMemory for hierarchical pattern modeling.
 Updated to include:
 - Fourier Analysis Network (FAN) layer for effective periodicity modeling in attention (relational space)
 - FAN layer in FFN for featural periodicity modeling (complementary coverage)
     Self-Rescaled Dynamic Normalization (SeeDNorm) with dual dropout regularization.
     SeeDNorm(x) = [σ(x·β^T)·α + γ] ⊙ x/RMS(x)
     Args:
         dim: Hidden dimension size
 # ==================== STACK MEMORY MODULE ====================
 class StackMemory(nn.Module):
     """
     Differentiable Hidden State Stack for modeling Chomsky hierarchy grammars.
         self.action_head = nn.Linear(self.stack_d_model, 3 * self.num_stack_heads, bias=True)
         # Gate projection for global reading: maps each slot's head_dim features to one score (weights shared across heads)
+        self.gate_proj = nn.Linear(self.head_dim, 1, bias=True)
         # Residual weight for gating stack contribution
         self.res_weight = nn.Parameter(torch.ones(1))
         new_stack, new_mask = self._vectorized_update(stack, mask, actions, k_values)
         # Global reading via query-over-stack attention
+        # FIX: Project the raw stack content directly.
+        # Previously, masking before projection killed gradients for "empty" slots,
+        # preventing them from ever becoming "full".
+        gate_scores = self.gate_proj(new_stack).squeeze(-1)  # [batch, seq, heads, slots]
+        # Apply mask to the SCORES, not the features.
+        # Mask out invalid positions (add large negative value where mask is 0)
         gate_scores = gate_scores + (1 - new_mask) * -1e9
         # Softmax to get attention weights
         gate_weights = F.softmax(gate_scores, dim=-1)
         # Weighted sum over stack slots
+        # new_stack holds the slot features; gate_weights holds the per-slot attention weights (masked slots get ~0 weight)
         memory_output = (new_stack * gate_weights.unsqueeze(-1)).sum(dim=3)
         memory_output = memory_output.view(batch_size, seq_len, -1)
         hidden = self.dropout(hidden)
         return self.down_proj(hidden)
 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     """
     Decoder layer with standard residual connections and optional StackMemory.
+    Architecture (Updated Flow):
+    1. Optional: StackMemory module (Pre-processing context injection)
+    2. Pre-norm (SeeDNorm) → LNS scaling → Self-Attention with ResFormer and Learnable Multipliers
+    3. Standard Residual Connection
+    4. GPAS activation scaling
+    5. Pre-norm (SeeDNorm) → LNS scaling → MLP with FANformer and Learnable Multipliers
+    6. Standard Residual Connection
+    7. GPAS activation scaling
     """
     def __init__(self, config: NeoLLMConfig, layer_idx: int):
         Returns:
             Tuple of (hidden_states, attn_weights, stack_state, stack_mask)
         """
         # ============================================================
+        # 1. Stack Memory Module (MOVED TO START)
+        # ============================================================
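+        # We process memory first so the attention layer in this same block can
+        # "see" the retrieved context. Previously the stack ran after the MLP, so
+        # its output only reached the next layer's attention (a one-layer lag).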
+        if self.use_stack:
+            hidden_states, stack_state, stack_mask = self.stack_memory(
+                hidden_states, stack_state, stack_mask
+            )
+        # ============================================================
+        # 2. Attention Block with Standard Residual Connection
         # ============================================================
         residual = hidden_states
         hidden_states = self.gpas_attn(hidden_states)
         # ============================================================
+        # 3. MLP Block with Standard Residual Connection
         # ============================================================
         residual = hidden_states
         hidden_states = self.post_attention_layernorm(hidden_states)
         # Apply GPAS after residual connection
         hidden_states = self.gpas_mlp(hidden_states)
+        # Return tuple matching the expected signature
         if self.use_stack:
             return (hidden_states, attn_weights, stack_state, stack_mask)
         else: