KitsuVp committed on
Commit
965d461
verified
1 Parent(s): e8269b7

Update modeling_neollm.py

Browse files
Files changed (1) hide show
  1. modeling_neollm.py +97 -16
modeling_neollm.py CHANGED
@@ -1,8 +1,12 @@
1
  #!/usr/bin/env python3
2
  """
3
- NeoLLM Model with FANformer Integration, Dropout Regularization, and Selective Self-Attention (SSA)
4
- Updated to include Fourier Analysis Network (FAN) layer for effective periodicity modeling,
5
- dropout regularization at strategic locations
 
 
 
 
6
  """
7
 
8
  import math
@@ -28,7 +32,7 @@ from transformers.utils.import_utils import (
28
  is_causal_conv1d_available,
29
  is_flash_linear_attention_available,
30
  )
31
- from .configuration_neollm import NeoLLMConfig
32
 
33
 
34
  if is_causal_conv1d_available():
@@ -45,6 +49,8 @@ else:
45
  from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
46
 
47
  logger = logging.get_logger(__name__)
 
 
48
  class FANLayer(nn.Module):
49
  """
50
  Fourier Analysis Network (FAN) layer for effective periodicity modeling.
@@ -283,7 +289,13 @@ def eager_attention_forward(
283
 
284
 
285
  class NeoLLMAttention(nn.Module):
286
- """Multi-headed attention with FANformer integration and Selective Self-Attention for periodicity modeling"""
 
 
 
 
 
 
287
 
288
  def __init__(self, config: NeoLLMConfig, layer_idx: int):
289
  super().__init__()
@@ -322,22 +334,35 @@ class NeoLLMAttention(nn.Module):
322
 
323
  # Dropout for attention output
324
  self.dropout = nn.Dropout(config.dropout_rate)
 
 
 
 
325
 
326
  def forward(
327
  self,
328
  hidden_states: torch.Tensor,
329
  position_embeddings: tuple[torch.Tensor, torch.Tensor],
330
  attention_mask: Optional[torch.Tensor],
 
331
  **kwargs: Unpack[FlashAttentionKwargs],
332
- ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
333
  input_shape = hidden_states.shape[:-1]
334
 
335
  # Apply FANformer transformation first
336
  hidden_states_fan = self.fan_layer(hidden_states)
337
 
 
 
 
 
 
 
 
 
338
  hidden_shape = (*input_shape, -1, self.head_dim)
339
 
340
- # Use FAN-transformed features directly for projections
341
  query_states, gate = torch.chunk(
342
  self.q_proj(hidden_states_fan).view(*input_shape, -1, self.head_dim * 2), 2, dim=-1
343
  )
@@ -369,8 +394,9 @@ class NeoLLMAttention(nn.Module):
369
  attn_output = attn_output * torch.sigmoid(gate)
370
 
371
  attn_output = self.o_proj(attn_output)
372
- attn_output = self.dropout(attn_output) # Apply dropout after output projection
373
- return attn_output, attn_weights
 
374
 
375
 
376
  def apply_mask_to_padding_states(hidden_states, attention_mask):
@@ -534,8 +560,15 @@ def torch_recurrent_gated_delta_rule(
534
  core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype)
535
  return core_attn_out, last_recurrent_state
536
 
 
537
  class NeoLLMGatedDeltaNet(nn.Module):
538
- """Linear attention with FANformer integration and Selective Self-Attention for periodicity modeling"""
 
 
 
 
 
 
539
 
540
  def __init__(self, config: NeoLLMConfig, layer_idx: int):
541
  super().__init__()
@@ -610,6 +643,10 @@ class NeoLLMGatedDeltaNet(nn.Module):
610
  self.chunk_gated_delta_rule = chunk_gated_delta_rule or torch_chunk_gated_delta_rule
611
  self.recurrent_gated_delta_rule = fused_recurrent_gated_delta_rule or torch_recurrent_gated_delta_rule
612
 
 
 
 
 
613
  if not is_fast_path_available:
614
  logger.warning_once(
615
  "The fast path is not available because one of the required library is not installed. Falling back to "
@@ -649,7 +686,8 @@ class NeoLLMGatedDeltaNet(nn.Module):
649
  self,
650
  hidden_states: torch.Tensor,
651
  attention_mask: Optional[torch.Tensor] = None,
652
- ):
 
653
  hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)
654
 
655
  # Set up dimensions for reshapes later
@@ -658,7 +696,15 @@ class NeoLLMGatedDeltaNet(nn.Module):
658
  # Apply FANformer transformation first
659
  hidden_states_fan = self.fan_layer(hidden_states)
660
 
661
- # Use FAN-transformed features directly for projections
 
 
 
 
 
 
 
 
662
  projected_states_qkvz = self.in_proj_qkvz(hidden_states_fan)
663
  projected_states_ba = self.in_proj_ba(hidden_states_fan)
664
  query, key, value, z, b, a = self.fix_query_key_value_ordering(projected_states_qkvz, projected_states_ba)
@@ -722,7 +768,9 @@ class NeoLLMGatedDeltaNet(nn.Module):
722
 
723
  output = self.out_proj(core_attn_out)
724
  output = self.dropout(output) # Apply dropout after output projection
725
- return output
 
 
726
 
727
  class PolyNorm(torch.nn.Module):
728
  def __init__(self, eps=1e-6):
@@ -737,6 +785,7 @@ class PolyNorm(torch.nn.Module):
737
  def forward(self, x):
738
  return self.weight[0] * self._norm(x**3) + self.weight[1] * self._norm(x**2) + self.weight[2] * self._norm(x) + self.bias
739
 
 
740
  class NeoLLMMLP(nn.Module):
741
  def __init__(self, config):
742
  super().__init__()
@@ -760,6 +809,7 @@ class NeoLLMMLP(nn.Module):
760
  hidden = self.dropout(hidden)
761
  return self.down_proj(hidden)
762
 
 
763
  class NeoLLMDecoderLayer(GradientCheckpointingLayer):
764
  def __init__(self, config: NeoLLMConfig, layer_idx: int):
765
  super().__init__()
@@ -786,12 +836,16 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
786
  # GPAS (Gradient-Preserving Activation Scaling) - applied after residual connections
787
  self.gpas_attn = GPAS(config.hidden_size)
788
  self.gpas_mlp = GPAS(config.hidden_size)
 
 
 
789
 
790
  def forward(
791
  self,
792
  hidden_states: torch.Tensor,
793
  position_embeddings: tuple[torch.Tensor, torch.Tensor],
794
  attention_mask: Optional[torch.Tensor] = None,
 
795
  **kwargs: Unpack[FlashAttentionKwargs],
796
  ) -> torch.FloatTensor:
797
  residual = hidden_states
@@ -802,18 +856,20 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
802
  # Apply LNS scaling after normalization
803
  hidden_states = self.lns_attn(hidden_states)
804
 
805
- # Token Mixer
806
  if self.layer_type == "linear_attention":
807
- hidden_states = self.linear_attn(
808
  hidden_states=hidden_states,
809
  attention_mask=attention_mask,
 
810
  )
811
  elif self.layer_type == "full_attention":
812
  # Self Attention
813
- hidden_states, _ = self.self_attn(
814
  hidden_states=hidden_states,
815
  attention_mask=attention_mask,
816
  position_embeddings=position_embeddings,
 
817
  **kwargs,
818
  )
819
 
@@ -855,6 +911,17 @@ class NeoLLMPreTrainedModel(PreTrainedModel):
855
  if isinstance(module, NeoLLMGatedDeltaNet):
856
  module.dt_bias.data.fill_(1.0)
857
  module.A_log.data.uniform_(0, 16).log_()
 
 
 
 
 
 
 
 
 
 
 
858
  elif isinstance(module, GPAS):
859
  # Initialize GPAS alpha to 0 as per paper
860
  module.alpha.data.fill_(0.0)
@@ -875,6 +942,10 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
875
  self.norm = NeoLLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
876
  self.rotary_emb = NeoLLMRotaryEmbedding(config=config)
877
  self.gradient_checkpointing = False
 
 
 
 
878
  # Initialize weights and apply final processing
879
  self.post_init()
880
 
@@ -910,6 +981,9 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
910
  # create position embeddings to be shared across the decoder layers
911
  position_embeddings = self.rotary_emb(hidden_states, position_ids)
912
 
 
 
 
913
  for decoder_layer in self.layers[: self.config.num_hidden_layers]:
914
  layer_mask = linear_attn_mask if decoder_layer.layer_type == "linear_attention" else causal_mask
915
 
@@ -917,8 +991,13 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
917
  hidden_states,
918
  position_embeddings=position_embeddings,
919
  attention_mask=layer_mask,
 
920
  **kwargs,
921
  )
 
 
 
 
922
 
923
  hidden_states = self.norm(hidden_states)
924
 
@@ -937,6 +1016,7 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
937
  linear_attn_mask = None
938
  return linear_attn_mask
939
 
 
940
  @torch.compiler.disable
941
  def compute_cce_loss(hidden_states, labels, lm_head_weight, lm_head_bias=None, pad_token_id=None):
942
  """
@@ -1019,6 +1099,7 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
1019
  attentions=outputs.attentions,
1020
  )
1021
 
 
1022
  # ==================== AUTOMODEL REGISTRATION ====================
1023
 
1024
  __all__ = [
 
1
  #!/usr/bin/env python3
2
  """
3
+ NeoLLM Model with FANformer Integration, Dropout Regularization, Selective Self-Attention (SSA),
4
+ and ResFormer Value Residual Learning for enhanced information flow through deep layers.
5
+
6
+ Updated to include:
7
+ - Fourier Analysis Network (FAN) layer for effective periodicity modeling
8
+ - Dropout regularization at strategic locations
9
+ - ResFormer: Feature residual connections from first layer (applied before projections)
10
  """
11
 
12
  import math
 
32
  is_causal_conv1d_available,
33
  is_flash_linear_attention_available,
34
  )
35
+ from configuration_neollm import NeoLLMConfig
36
 
37
 
38
  if is_causal_conv1d_available():
 
49
  from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
50
 
51
  logger = logging.get_logger(__name__)
52
+
53
+
54
  class FANLayer(nn.Module):
55
  """
56
  Fourier Analysis Network (FAN) layer for effective periodicity modeling.
 
289
 
290
 
291
  class NeoLLMAttention(nn.Module):
292
+ """
293
+ Multi-headed attention with FANformer integration, Selective Self-Attention for periodicity modeling,
294
+ and ResFormer feature residual connections for enhanced information flow.
295
+
296
+ ResFormer enhancement: Applies learnable feature residual connections from the first layer
297
+ BEFORE QKV projections: H'_fan_n = λ_1 * H_fan_1 + λ_2 * H_fan_n
298
+ """
299
 
300
  def __init__(self, config: NeoLLMConfig, layer_idx: int):
301
  super().__init__()
 
334
 
335
  # Dropout for attention output
336
  self.dropout = nn.Dropout(config.dropout_rate)
337
+
338
+ # ResFormer: learnable feature residual parameters (initialized to 0.5)
339
+ self.lambda_1 = nn.Parameter(torch.tensor(0.5)) # Weight for H_fan_1 (first layer features)
340
+ self.lambda_2 = nn.Parameter(torch.tensor(0.5)) # Weight for H_fan_n (current layer features)
341
 
342
  def forward(
343
  self,
344
  hidden_states: torch.Tensor,
345
  position_embeddings: tuple[torch.Tensor, torch.Tensor],
346
  attention_mask: Optional[torch.Tensor],
347
+ first_layer_fan: Optional[torch.Tensor] = None,
348
  **kwargs: Unpack[FlashAttentionKwargs],
349
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
350
  input_shape = hidden_states.shape[:-1]
351
 
352
  # Apply FANformer transformation first
353
  hidden_states_fan = self.fan_layer(hidden_states)
354
 
355
+ # ResFormer: Apply feature residual connection BEFORE projections
356
+ # This ensures dimensional compatibility across all layer types
357
+ if first_layer_fan is not None:
358
+ hidden_states_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * hidden_states_fan
359
+
360
+ # Store current FAN features for potential use as first_layer_fan in subsequent layers
361
+ current_layer_fan = hidden_states_fan.clone()
362
+
363
  hidden_shape = (*input_shape, -1, self.head_dim)
364
 
365
+ # Use FAN-transformed features (with residual applied) for projections
366
  query_states, gate = torch.chunk(
367
  self.q_proj(hidden_states_fan).view(*input_shape, -1, self.head_dim * 2), 2, dim=-1
368
  )
 
394
  attn_output = attn_output * torch.sigmoid(gate)
395
 
396
  attn_output = self.o_proj(attn_output)
397
+ attn_output = self.dropout(attn_output)
398
+
399
+ return attn_output, attn_weights, current_layer_fan
400
 
401
 
402
  def apply_mask_to_padding_states(hidden_states, attention_mask):
 
560
  core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype)
561
  return core_attn_out, last_recurrent_state
562
 
563
+
564
  class NeoLLMGatedDeltaNet(nn.Module):
565
+ """
566
+ Linear attention with FANformer integration, Selective Self-Attention for periodicity modeling,
567
+ and ResFormer feature residual connections for enhanced information flow.
568
+
569
+ ResFormer enhancement: Applies learnable feature residual connections from the first layer
570
+ BEFORE QKV projections: H'_fan_n = λ_1 * H_fan_1 + λ_2 * H_fan_n
571
+ """
572
 
573
  def __init__(self, config: NeoLLMConfig, layer_idx: int):
574
  super().__init__()
 
643
  self.chunk_gated_delta_rule = chunk_gated_delta_rule or torch_chunk_gated_delta_rule
644
  self.recurrent_gated_delta_rule = fused_recurrent_gated_delta_rule or torch_recurrent_gated_delta_rule
645
 
646
+ # ResFormer: learnable feature residual parameters (initialized to 0.5)
647
+ self.lambda_1 = nn.Parameter(torch.tensor(0.5)) # Weight for H_fan_1 (first layer features)
648
+ self.lambda_2 = nn.Parameter(torch.tensor(0.5)) # Weight for H_fan_n (current layer features)
649
+
650
  if not is_fast_path_available:
651
  logger.warning_once(
652
  "The fast path is not available because one of the required library is not installed. Falling back to "
 
686
  self,
687
  hidden_states: torch.Tensor,
688
  attention_mask: Optional[torch.Tensor] = None,
689
+ first_layer_fan: Optional[torch.Tensor] = None,
690
+ ) -> tuple[torch.Tensor, torch.Tensor]:
691
  hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)
692
 
693
  # Set up dimensions for reshapes later
 
696
  # Apply FANformer transformation first
697
  hidden_states_fan = self.fan_layer(hidden_states)
698
 
699
+ # ResFormer: Apply feature residual connection BEFORE projections
700
+ # This ensures dimensional compatibility across all layer types
701
+ if first_layer_fan is not None:
702
+ hidden_states_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * hidden_states_fan
703
+
704
+ # Store current FAN features for potential use as first_layer_fan in subsequent layers
705
+ current_layer_fan = hidden_states_fan.clone()
706
+
707
+ # Use FAN-transformed features (with residual applied) for projections
708
  projected_states_qkvz = self.in_proj_qkvz(hidden_states_fan)
709
  projected_states_ba = self.in_proj_ba(hidden_states_fan)
710
  query, key, value, z, b, a = self.fix_query_key_value_ordering(projected_states_qkvz, projected_states_ba)
 
768
 
769
  output = self.out_proj(core_attn_out)
770
  output = self.dropout(output) # Apply dropout after output projection
771
+
772
+ return output, current_layer_fan
773
+
774
 
775
  class PolyNorm(torch.nn.Module):
776
  def __init__(self, eps=1e-6):
 
785
  def forward(self, x):
786
  return self.weight[0] * self._norm(x**3) + self.weight[1] * self._norm(x**2) + self.weight[2] * self._norm(x) + self.bias
787
 
788
+
789
  class NeoLLMMLP(nn.Module):
790
  def __init__(self, config):
791
  super().__init__()
 
809
  hidden = self.dropout(hidden)
810
  return self.down_proj(hidden)
811
 
812
+
813
  class NeoLLMDecoderLayer(GradientCheckpointingLayer):
814
  def __init__(self, config: NeoLLMConfig, layer_idx: int):
815
  super().__init__()
 
836
  # GPAS (Gradient-Preserving Activation Scaling) - applied after residual connections
837
  self.gpas_attn = GPAS(config.hidden_size)
838
  self.gpas_mlp = GPAS(config.hidden_size)
839
+
840
+ # ResFormer: storage for current layer's FAN features
841
+ self.current_layer_fan = None
842
 
843
  def forward(
844
  self,
845
  hidden_states: torch.Tensor,
846
  position_embeddings: tuple[torch.Tensor, torch.Tensor],
847
  attention_mask: Optional[torch.Tensor] = None,
848
+ first_layer_fan: Optional[torch.Tensor] = None,
849
  **kwargs: Unpack[FlashAttentionKwargs],
850
  ) -> torch.FloatTensor:
851
  residual = hidden_states
 
856
  # Apply LNS scaling after normalization
857
  hidden_states = self.lns_attn(hidden_states)
858
 
859
+ # Token Mixer with ResFormer feature residual connections
860
  if self.layer_type == "linear_attention":
861
+ hidden_states, self.current_layer_fan = self.linear_attn(
862
  hidden_states=hidden_states,
863
  attention_mask=attention_mask,
864
+ first_layer_fan=first_layer_fan,
865
  )
866
  elif self.layer_type == "full_attention":
867
  # Self Attention
868
+ hidden_states, _, self.current_layer_fan = self.self_attn(
869
  hidden_states=hidden_states,
870
  attention_mask=attention_mask,
871
  position_embeddings=position_embeddings,
872
+ first_layer_fan=first_layer_fan,
873
  **kwargs,
874
  )
875
 
 
911
  if isinstance(module, NeoLLMGatedDeltaNet):
912
  module.dt_bias.data.fill_(1.0)
913
  module.A_log.data.uniform_(0, 16).log_()
914
+ # ResFormer: initialize lambda parameters for linear attention
915
+ if hasattr(module, 'lambda_1'):
916
+ module.lambda_1.data.fill_(0.5)
917
+ if hasattr(module, 'lambda_2'):
918
+ module.lambda_2.data.fill_(0.5)
919
+ elif isinstance(module, NeoLLMAttention):
920
+ # ResFormer: initialize lambda parameters for full attention
921
+ if hasattr(module, 'lambda_1'):
922
+ module.lambda_1.data.fill_(0.5)
923
+ if hasattr(module, 'lambda_2'):
924
+ module.lambda_2.data.fill_(0.5)
925
  elif isinstance(module, GPAS):
926
  # Initialize GPAS alpha to 0 as per paper
927
  module.alpha.data.fill_(0.0)
 
942
  self.norm = NeoLLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
943
  self.rotary_emb = NeoLLMRotaryEmbedding(config=config)
944
  self.gradient_checkpointing = False
945
+
946
+ # ResFormer: storage for first layer's FAN features (H_fan_1)
947
+ self.first_layer_fan = None
948
+
949
  # Initialize weights and apply final processing
950
  self.post_init()
951
 
 
981
  # create position embeddings to be shared across the decoder layers
982
  position_embeddings = self.rotary_emb(hidden_states, position_ids)
983
 
984
+ # ResFormer: reset first_layer_fan at the start of each forward pass
985
+ self.first_layer_fan = None
986
+
987
  for decoder_layer in self.layers[: self.config.num_hidden_layers]:
988
  layer_mask = linear_attn_mask if decoder_layer.layer_type == "linear_attention" else causal_mask
989
 
 
991
  hidden_states,
992
  position_embeddings=position_embeddings,
993
  attention_mask=layer_mask,
994
+ first_layer_fan=self.first_layer_fan, # Pass H_fan_1 to all layers
995
  **kwargs,
996
  )
997
+
998
+ # ResFormer: capture H_fan_1 from the first layer
999
+ if self.first_layer_fan is None and hasattr(decoder_layer, 'current_layer_fan'):
1000
+ self.first_layer_fan = decoder_layer.current_layer_fan
1001
 
1002
  hidden_states = self.norm(hidden_states)
1003
 
 
1016
  linear_attn_mask = None
1017
  return linear_attn_mask
1018
 
1019
+
1020
  @torch.compiler.disable
1021
  def compute_cce_loss(hidden_states, labels, lm_head_weight, lm_head_bias=None, pad_token_id=None):
1022
  """
 
1099
  attentions=outputs.attentions,
1100
  )
1101
 
1102
+
1103
  # ==================== AUTOMODEL REGISTRATION ====================
1104
 
1105
  __all__ = [