KitsuVp
/

NeoLLM

@@ -1,12 +1,8 @@
 #!/usr/bin/env python3
 """
-NeoLLM Model with FANformer Integration, Dropout Regularization, Selective Self-Attention (SSA),
-and ResFormer Value Residual Learning for enhanced information flow through deep layers.
-Updated to include:
-- Fourier Analysis Network (FAN) layer for effective periodicity modeling
-- Dropout regularization at strategic locations
-- ResFormer: Feature residual connections from first layer (applied before projections)
 """
 import math
@@ -32,7 +28,7 @@ from transformers.utils.import_utils import (
     is_causal_conv1d_available,
     is_flash_linear_attention_available,
 )
-from .configuration_neollm import NeoLLMConfig
 if is_causal_conv1d_available():
@@ -49,8 +45,6 @@ else:
 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
 logger = logging.get_logger(__name__)
 class FANLayer(nn.Module):
     """
     Fourier Analysis Network (FAN) layer for effective periodicity modeling.
@@ -289,13 +283,7 @@ def eager_attention_forward(
 class NeoLLMAttention(nn.Module):
-    """
-    Multi-headed attention with FANformer integration, Selective Self-Attention for periodicity modeling,
-    and ResFormer feature residual connections for enhanced information flow.
-    ResFormer enhancement: Applies learnable feature residual connections from the first layer
-    BEFORE QKV projections: H'_fan_n = λ_1 * H_fan_1 + λ_2 * H_fan_n
-    """
     def __init__(self, config: NeoLLMConfig, layer_idx: int):
         super().__init__()
@@ -334,35 +322,22 @@ class NeoLLMAttention(nn.Module):
         # Dropout for attention output
         self.dropout = nn.Dropout(config.dropout_rate)
-        # ResFormer: learnable feature residual parameters (initialized to 0.5)
-        self.lambda_1 = nn.Parameter(torch.tensor(0.5))  # Weight for H_fan_1 (first layer features)
-        self.lambda_2 = nn.Parameter(torch.tensor(0.5))  # Weight for H_fan_n (current layer features)
     def forward(
         self,
         hidden_states: torch.Tensor,
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
         attention_mask: Optional[torch.Tensor],
-        first_layer_fan: Optional[torch.Tensor] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], torch.Tensor]:
         input_shape = hidden_states.shape[:-1]
         # Apply FANformer transformation first
         hidden_states_fan = self.fan_layer(hidden_states)
-        # ResFormer: Apply feature residual connection BEFORE projections
-        # This ensures dimensional compatibility across all layer types
-        if first_layer_fan is not None:
-            hidden_states_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * hidden_states_fan
-        # Store current FAN features for potential use as first_layer_fan in subsequent layers
-        current_layer_fan = hidden_states_fan.clone()
         hidden_shape = (*input_shape, -1, self.head_dim)
-        # Use FAN-transformed features (with residual applied) for projections
         query_states, gate = torch.chunk(
             self.q_proj(hidden_states_fan).view(*input_shape, -1, self.head_dim * 2), 2, dim=-1
         )
@@ -394,9 +369,8 @@ class NeoLLMAttention(nn.Module):
         attn_output = attn_output * torch.sigmoid(gate)
         attn_output = self.o_proj(attn_output)
-        attn_output = self.dropout(attn_output)
-        return attn_output, attn_weights, current_layer_fan
 def apply_mask_to_padding_states(hidden_states, attention_mask):
@@ -560,15 +534,8 @@ def torch_recurrent_gated_delta_rule(
     core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype)
     return core_attn_out, last_recurrent_state
 class NeoLLMGatedDeltaNet(nn.Module):
-    """
-    Linear attention with FANformer integration, Selective Self-Attention for periodicity modeling,
-    and ResFormer feature residual connections for enhanced information flow.
-    ResFormer enhancement: Applies learnable feature residual connections from the first layer
-    BEFORE QKV projections: H'_fan_n = λ_1 * H_fan_1 + λ_2 * H_fan_n
-    """
     def __init__(self, config: NeoLLMConfig, layer_idx: int):
         super().__init__()
@@ -643,10 +610,6 @@ class NeoLLMGatedDeltaNet(nn.Module):
         self.chunk_gated_delta_rule = chunk_gated_delta_rule or torch_chunk_gated_delta_rule
         self.recurrent_gated_delta_rule = fused_recurrent_gated_delta_rule or torch_recurrent_gated_delta_rule
-        # ResFormer: learnable feature residual parameters (initialized to 0.5)
-        self.lambda_1 = nn.Parameter(torch.tensor(0.5))  # Weight for H_fan_1 (first layer features)
-        self.lambda_2 = nn.Parameter(torch.tensor(0.5))  # Weight for H_fan_n (current layer features)
         if not is_fast_path_available:
             logger.warning_once(
                 "The fast path is not available because one of the required library is not installed. Falling back to "
@@ -686,8 +649,7 @@ class NeoLLMGatedDeltaNet(nn.Module):
         self,
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
-        first_layer_fan: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
         hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)
         # Set up dimensions for reshapes later
@@ -696,15 +658,7 @@ class NeoLLMGatedDeltaNet(nn.Module):
         # Apply FANformer transformation first
         hidden_states_fan = self.fan_layer(hidden_states)
-        # ResFormer: Apply feature residual connection BEFORE projections
-        # This ensures dimensional compatibility across all layer types
-        if first_layer_fan is not None:
-            hidden_states_fan = self.lambda_1 * first_layer_fan + self.lambda_2 * hidden_states_fan
-        # Store current FAN features for potential use as first_layer_fan in subsequent layers
-        current_layer_fan = hidden_states_fan.clone()
-        # Use FAN-transformed features (with residual applied) for projections
         projected_states_qkvz = self.in_proj_qkvz(hidden_states_fan)
         projected_states_ba = self.in_proj_ba(hidden_states_fan)
         query, key, value, z, b, a = self.fix_query_key_value_ordering(projected_states_qkvz, projected_states_ba)
@@ -768,9 +722,7 @@ class NeoLLMGatedDeltaNet(nn.Module):
         output = self.out_proj(core_attn_out)
         output = self.dropout(output)  # Apply dropout after output projection
-        return output, current_layer_fan
 class PolyNorm(torch.nn.Module):
     def __init__(self, eps=1e-6):
@@ -785,7 +737,6 @@ class PolyNorm(torch.nn.Module):
     def forward(self, x):
         """Combine normalized powers of ``x`` into a single output.

         Applies ``self._norm`` to ``x**3``, ``x**2`` and ``x``, scales the results by
         ``self.weight[0]``, ``self.weight[1]`` and ``self.weight[2]`` respectively,
         sums the three terms and adds ``self.bias``.
         """
         return self.weight[0] * self._norm(x**3) + self.weight[1] * self._norm(x**2) + self.weight[2] * self._norm(x) + self.bias
 class NeoLLMMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -809,7 +760,6 @@ class NeoLLMMLP(nn.Module):
         hidden = self.dropout(hidden)
         return self.down_proj(hidden)
 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     def __init__(self, config: NeoLLMConfig, layer_idx: int):
         super().__init__()
@@ -836,16 +786,12 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         # GPAS (Gradient-Preserving Activation Scaling) - applied after residual connections
         self.gpas_attn = GPAS(config.hidden_size)
         self.gpas_mlp = GPAS(config.hidden_size)
-        # ResFormer: storage for current layer's FAN features
-        self.current_layer_fan = None
     def forward(
         self,
         hidden_states: torch.Tensor,
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
         attention_mask: Optional[torch.Tensor] = None,
-        first_layer_fan: Optional[torch.Tensor] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> torch.FloatTensor:
         residual = hidden_states
@@ -856,20 +802,18 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         # Apply LNS scaling after normalization
         hidden_states = self.lns_attn(hidden_states)
-        # Token Mixer with ResFormer feature residual connections
         if self.layer_type == "linear_attention":
-            hidden_states, self.current_layer_fan = self.linear_attn(
                 hidden_states=hidden_states,
                 attention_mask=attention_mask,
-                first_layer_fan=first_layer_fan,
             )
         elif self.layer_type == "full_attention":
             # Self Attention
-            hidden_states, _, self.current_layer_fan = self.self_attn(
                 hidden_states=hidden_states,
                 attention_mask=attention_mask,
                 position_embeddings=position_embeddings,
-                first_layer_fan=first_layer_fan,
                 **kwargs,
             )
@@ -911,17 +855,6 @@ class NeoLLMPreTrainedModel(PreTrainedModel):
         if isinstance(module, NeoLLMGatedDeltaNet):
             module.dt_bias.data.fill_(1.0)
             module.A_log.data.uniform_(0, 16).log_()
-            # ResFormer: initialize lambda parameters for linear attention
-            if hasattr(module, 'lambda_1'):
-                module.lambda_1.data.fill_(0.5)
-            if hasattr(module, 'lambda_2'):
-                module.lambda_2.data.fill_(0.5)
-        elif isinstance(module, NeoLLMAttention):
-            # ResFormer: initialize lambda parameters for full attention
-            if hasattr(module, 'lambda_1'):
-                module.lambda_1.data.fill_(0.5)
-            if hasattr(module, 'lambda_2'):
-                module.lambda_2.data.fill_(0.5)
         elif isinstance(module, GPAS):
             # Initialize GPAS alpha to 0 as per paper
             module.alpha.data.fill_(0.0)
@@ -942,10 +875,6 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         self.norm = NeoLLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.rotary_emb = NeoLLMRotaryEmbedding(config=config)
         self.gradient_checkpointing = False
-        # ResFormer: storage for first layer's FAN features (H_fan_1)
-        self.first_layer_fan = None
         # Initialize weights and apply final processing
         self.post_init()
@@ -981,9 +910,6 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         # create position embeddings to be shared across the decoder layers
         position_embeddings = self.rotary_emb(hidden_states, position_ids)
-        # ResFormer: reset first_layer_fan at the start of each forward pass
-        self.first_layer_fan = None
         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
             layer_mask = linear_attn_mask if decoder_layer.layer_type == "linear_attention" else causal_mask
@@ -991,13 +917,8 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
                 hidden_states,
                 position_embeddings=position_embeddings,
                 attention_mask=layer_mask,
-                first_layer_fan=self.first_layer_fan,  # Pass H_fan_1 to all layers
                 **kwargs,
             )
-            # ResFormer: capture H_fan_1 from the first layer
-            if self.first_layer_fan is None and hasattr(decoder_layer, 'current_layer_fan'):
-                self.first_layer_fan = decoder_layer.current_layer_fan
         hidden_states = self.norm(hidden_states)
@@ -1016,7 +937,6 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
             linear_attn_mask = None
         return linear_attn_mask
 @torch.compiler.disable
 def compute_cce_loss(hidden_states, labels, lm_head_weight, lm_head_bias=None, pad_token_id=None):
     """
@@ -1099,7 +1019,6 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
             attentions=outputs.attentions,
         )
 # ==================== AUTOMODEL REGISTRATION ====================
 __all__ = [

 #!/usr/bin/env python3
 """
+NeoLLM Model with FANformer Integration, Dropout Regularization, and Selective Self-Attention (SSA).
+Updated to include a Fourier Analysis Network (FAN) layer for effective periodicity modeling
+and dropout regularization at strategic locations.
 """
 import math
     is_causal_conv1d_available,
     is_flash_linear_attention_available,
 )
+from configuration_neollm import NeoLLMConfig
 if is_causal_conv1d_available():
 from transformers import AutoConfig, AutoModel, AutoModelForCausalLM
 logger = logging.get_logger(__name__)
 class FANLayer(nn.Module):
     """
     Fourier Analysis Network (FAN) layer for effective periodicity modeling.
 class NeoLLMAttention(nn.Module):
+    """Multi-headed attention with FANformer integration and Selective Self-Attention for periodicity modeling"""
     def __init__(self, config: NeoLLMConfig, layer_idx: int):
         super().__init__()
         # Dropout for attention output
         self.dropout = nn.Dropout(config.dropout_rate)
     def forward(
         self,
         hidden_states: torch.Tensor,
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
         attention_mask: Optional[torch.Tensor],
         **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
         # Apply FANformer transformation first
         hidden_states_fan = self.fan_layer(hidden_states)
         hidden_shape = (*input_shape, -1, self.head_dim)
+        # Use FAN-transformed features directly for projections
         query_states, gate = torch.chunk(
             self.q_proj(hidden_states_fan).view(*input_shape, -1, self.head_dim * 2), 2, dim=-1
         )
         attn_output = attn_output * torch.sigmoid(gate)
         attn_output = self.o_proj(attn_output)
+        attn_output = self.dropout(attn_output)  # Apply dropout after output projection
+        return attn_output, attn_weights
 def apply_mask_to_padding_states(hidden_states, attention_mask):
     core_attn_out = core_attn_out.transpose(1, 2).contiguous().to(initial_dtype)
     return core_attn_out, last_recurrent_state
 class NeoLLMGatedDeltaNet(nn.Module):
+    """Linear attention with FANformer integration and Selective Self-Attention for periodicity modeling"""
     def __init__(self, config: NeoLLMConfig, layer_idx: int):
         super().__init__()
         self.chunk_gated_delta_rule = chunk_gated_delta_rule or torch_chunk_gated_delta_rule
         self.recurrent_gated_delta_rule = fused_recurrent_gated_delta_rule or torch_recurrent_gated_delta_rule
         if not is_fast_path_available:
             logger.warning_once(
                 "The fast path is not available because one of the required library is not installed. Falling back to "
         self,
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
+    ):
         hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)
         # Set up dimensions for reshapes later
         # Apply FANformer transformation first
         hidden_states_fan = self.fan_layer(hidden_states)
+        # Use FAN-transformed features directly for projections
         projected_states_qkvz = self.in_proj_qkvz(hidden_states_fan)
         projected_states_ba = self.in_proj_ba(hidden_states_fan)
         query, key, value, z, b, a = self.fix_query_key_value_ordering(projected_states_qkvz, projected_states_ba)
         output = self.out_proj(core_attn_out)
         output = self.dropout(output)  # Apply dropout after output projection
+        return output
 class PolyNorm(torch.nn.Module):
     def __init__(self, eps=1e-6):
     def forward(self, x):
         """Combine normalized powers of ``x`` into a single output.

         Applies ``self._norm`` to ``x**3``, ``x**2`` and ``x``, scales the results by
         ``self.weight[0]``, ``self.weight[1]`` and ``self.weight[2]`` respectively,
         sums the three terms and adds ``self.bias``.
         """
         return self.weight[0] * self._norm(x**3) + self.weight[1] * self._norm(x**2) + self.weight[2] * self._norm(x) + self.bias
 class NeoLLMMLP(nn.Module):
     def __init__(self, config):
         super().__init__()
         hidden = self.dropout(hidden)
         return self.down_proj(hidden)
 class NeoLLMDecoderLayer(GradientCheckpointingLayer):
     def __init__(self, config: NeoLLMConfig, layer_idx: int):
         super().__init__()
         # GPAS (Gradient-Preserving Activation Scaling) - applied after residual connections
         self.gpas_attn = GPAS(config.hidden_size)
         self.gpas_mlp = GPAS(config.hidden_size)
     def forward(
         self,
         hidden_states: torch.Tensor,
         position_embeddings: tuple[torch.Tensor, torch.Tensor],
         attention_mask: Optional[torch.Tensor] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> torch.FloatTensor:
         residual = hidden_states
         # Apply LNS scaling after normalization
         hidden_states = self.lns_attn(hidden_states)
+        # Token Mixer
         if self.layer_type == "linear_attention":
+            hidden_states = self.linear_attn(
                 hidden_states=hidden_states,
                 attention_mask=attention_mask,
             )
         elif self.layer_type == "full_attention":
             # Self Attention
+            hidden_states, _ = self.self_attn(
                 hidden_states=hidden_states,
                 attention_mask=attention_mask,
                 position_embeddings=position_embeddings,
                 **kwargs,
             )
         if isinstance(module, NeoLLMGatedDeltaNet):
             module.dt_bias.data.fill_(1.0)
             module.A_log.data.uniform_(0, 16).log_()
         elif isinstance(module, GPAS):
             # Initialize GPAS alpha to 0 as per paper
             module.alpha.data.fill_(0.0)
         self.norm = NeoLLMRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.rotary_emb = NeoLLMRotaryEmbedding(config=config)
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
         self.post_init()
         # create position embeddings to be shared across the decoder layers
         position_embeddings = self.rotary_emb(hidden_states, position_ids)
         for decoder_layer in self.layers[: self.config.num_hidden_layers]:
             layer_mask = linear_attn_mask if decoder_layer.layer_type == "linear_attention" else causal_mask
                 hidden_states,
                 position_embeddings=position_embeddings,
                 attention_mask=layer_mask,
                 **kwargs,
             )
         hidden_states = self.norm(hidden_states)
             linear_attn_mask = None
         return linear_attn_mask
 @torch.compiler.disable
 def compute_cce_loss(hidden_states, labels, lm_head_weight, lm_head_bias=None, pad_token_id=None):
     """
             attentions=outputs.attentions,
         )
 # ==================== AUTOMODEL REGISTRATION ====================
 __all__ = [