Update modeling_neollm.py
Browse files- modeling_neollm.py +94 -33
modeling_neollm.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
| 1 |
-
# ==================== modeling_neollm.py ====================
|
| 2 |
#!/usr/bin/env python3
|
| 3 |
"""
|
| 4 |
NeoLLM Model with FANformer Integration in both Attention and FFN, Dropout Regularization,
|
| 5 |
-
and ResFormer Value Residual Learning
|
| 6 |
-
for enhanced information flow through deep layers.
|
| 7 |
|
| 8 |
Updated to include:
|
| 9 |
- Fourier Analysis Network (FAN) layer for effective periodicity modeling in attention (relational space)
|
| 10 |
-
-
|
|
|
|
| 11 |
- Dropout regularization at strategic locations
|
| 12 |
- ResFormer: Feature residual connections from first layer (applied before projections)
|
| 13 |
"""
|
|
@@ -35,7 +35,7 @@ from transformers.utils.import_utils import (
|
|
| 35 |
is_causal_conv1d_available,
|
| 36 |
is_flash_linear_attention_available,
|
| 37 |
)
|
| 38 |
-
from
|
| 39 |
|
| 40 |
|
| 41 |
if is_causal_conv1d_available():
|
|
@@ -153,7 +153,74 @@ class GPAS(nn.Module):
|
|
| 153 |
return x_scaled
|
| 154 |
|
| 155 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
class NeoLLMRMSNormGated(nn.Module):
|
|
|
|
|
|
|
|
|
|
| 157 |
def __init__(self, hidden_size, eps=1e-6, **kwargs):
|
| 158 |
super().__init__()
|
| 159 |
self.weight = nn.Parameter(torch.ones(hidden_size))
|
|
@@ -207,25 +274,6 @@ class NeoLLMRotaryEmbedding(nn.Module):
|
|
| 207 |
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
| 208 |
|
| 209 |
|
| 210 |
-
class NeoLLMRMSNorm(nn.Module):
    """RMS normalization with a zero-initialized, (1 + weight) scale.

    Statistics are computed in float32 for stability and the result is cast
    back to the input dtype after scaling. With the default zero init the
    effective scale is exactly 1, i.e. plain RMS normalization.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        # Zero init: effective multiplier is (1 + weight), identity at start.
        self.weight = nn.Parameter(torch.zeros(dim))

    def _norm(self, x):
        # x / RMS(x), with eps inside the rsqrt for numerical stability.
        mean_sq = x.pow(2).mean(dim=-1, keepdim=True)
        return x * torch.rsqrt(mean_sq + self.eps)

    def forward(self, x):
        # Normalize in float32, scale by (1 + weight), then cast back.
        # Llama does x.to(float16) * w whilst NeoLLM is (x * w).to(float16)
        normalized = self._norm(x.float()) * (1.0 + self.weight.float())
        return normalized.type_as(x)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.eps}"
|
| 227 |
-
|
| 228 |
-
|
| 229 |
def rotate_half(x):
|
| 230 |
"""Rotates half the hidden dims of the input."""
|
| 231 |
x1 = x[..., : x.shape[-1] // 2]
|
|
@@ -293,7 +341,7 @@ def eager_attention_forward(
|
|
| 293 |
|
| 294 |
class NeoLLMAttention(nn.Module):
|
| 295 |
"""
|
| 296 |
-
Multi-headed attention with FANformer integration,
|
| 297 |
and ResFormer feature residual connections for enhanced information flow.
|
| 298 |
|
| 299 |
ResFormer enhancement: Applies learnable feature residual connections from the first layer
|
|
@@ -332,8 +380,10 @@ class NeoLLMAttention(nn.Module):
|
|
| 332 |
self.o_proj = nn.Linear(
|
| 333 |
config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
|
| 334 |
)
|
| 335 |
-
|
| 336 |
-
|
|
|
|
|
|
|
| 337 |
|
| 338 |
# Dropout for attention output
|
| 339 |
self.dropout = nn.Dropout(config.dropout_rate)
|
|
@@ -371,6 +421,7 @@ class NeoLLMAttention(nn.Module):
|
|
| 371 |
)
|
| 372 |
gate = gate.reshape(*input_shape, -1)
|
| 373 |
|
|
|
|
| 374 |
query_states = self.q_norm(query_states.view(hidden_shape)).transpose(1, 2)
|
| 375 |
key_states = self.k_norm(self.k_proj(hidden_states_fan).view(hidden_shape)).transpose(1, 2)
|
| 376 |
value_states = self.v_proj(hidden_states_fan).view(hidden_shape).transpose(1, 2)
|
|
@@ -566,7 +617,7 @@ def torch_recurrent_gated_delta_rule(
|
|
| 566 |
|
| 567 |
class NeoLLMGatedDeltaNet(nn.Module):
|
| 568 |
"""
|
| 569 |
-
Linear attention with FANformer integration,
|
| 570 |
and ResFormer feature residual connections for enhanced information flow.
|
| 571 |
|
| 572 |
ResFormer enhancement: Applies learnable feature residual connections from the first layer
|
|
@@ -630,7 +681,7 @@ class NeoLLMGatedDeltaNet(nn.Module):
|
|
| 630 |
else FusedRMSNormGated(
|
| 631 |
self.head_v_dim,
|
| 632 |
eps=self.layer_norm_epsilon,
|
| 633 |
-
activation=fla_compatible_activation,
|
| 634 |
device=torch.cuda.current_device(),
|
| 635 |
dtype=config.dtype if config.dtype is not None else torch.get_default_dtype(),
|
| 636 |
)
|
|
@@ -849,8 +900,9 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
|
|
| 849 |
# MLP with FANformer integration
|
| 850 |
self.mlp = NeoLLMMLP(config)
|
| 851 |
|
| 852 |
-
|
| 853 |
-
self.
|
|
|
|
| 854 |
|
| 855 |
# LNS (LayerNorm Scaling) - applies 1/√ℓ scaling
|
| 856 |
self.lns_attn = LNS(layer_idx)
|
|
@@ -873,7 +925,7 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
|
|
| 873 |
) -> torch.FloatTensor:
|
| 874 |
residual = hidden_states
|
| 875 |
|
| 876 |
-
# Apply
|
| 877 |
hidden_states = self.input_layernorm(hidden_states)
|
| 878 |
|
| 879 |
# Apply LNS scaling after normalization
|
|
@@ -952,6 +1004,12 @@ class NeoLLMPreTrainedModel(PreTrainedModel):
|
|
| 952 |
elif isinstance(module, FANLayer):
|
| 953 |
# FANLayer initialization is handled within the class
|
| 954 |
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 955 |
|
| 956 |
|
| 957 |
class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
@@ -963,7 +1021,8 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
| 963 |
self.layers = nn.ModuleList(
|
| 964 |
[NeoLLMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
|
| 965 |
)
|
| 966 |
-
|
|
|
|
| 967 |
self.rotary_emb = NeoLLMRotaryEmbedding(config=config)
|
| 968 |
self.gradient_checkpointing = False
|
| 969 |
|
|
@@ -1023,6 +1082,7 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
| 1023 |
if self.first_layer_fan is None and hasattr(decoder_layer, 'current_layer_fan'):
|
| 1024 |
self.first_layer_fan = decoder_layer.current_layer_fan
|
| 1025 |
|
|
|
|
| 1026 |
hidden_states = self.norm(hidden_states)
|
| 1027 |
|
| 1028 |
return BaseModelOutputWithPast(
|
|
@@ -1132,6 +1192,7 @@ __all__ = [
|
|
| 1132 |
"NeoLLMPreTrainedModel",
|
| 1133 |
"NeoLLMConfig",
|
| 1134 |
"FANLayer",
|
|
|
|
| 1135 |
]
|
| 1136 |
|
| 1137 |
# Register the configuration and model for AutoClass support
|
|
|
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
NeoLLM Model with FANformer Integration in both Attention and FFN, Dropout Regularization,
|
| 4 |
+
SeeDNorm (Self-Rescaled Dynamic Normalization), and ResFormer Value Residual Learning
|
| 5 |
+
for enhanced information flow through deep layers.
|
| 6 |
|
| 7 |
Updated to include:
|
| 8 |
- Fourier Analysis Network (FAN) layer for effective periodicity modeling in attention (relational space)
|
| 9 |
+
- FAN layer in FFN for featural periodicity modeling (complementary coverage)
|
| 10 |
+
- SeeDNorm: Dynamic normalization with input-dependent scaling for better adaptability
|
| 11 |
- Dropout regularization at strategic locations
|
| 12 |
- ResFormer: Feature residual connections from first layer (applied before projections)
|
| 13 |
"""
|
|
|
|
| 35 |
is_causal_conv1d_available,
|
| 36 |
is_flash_linear_attention_available,
|
| 37 |
)
|
| 38 |
+
from configuration_neollm import NeoLLMConfig
|
| 39 |
|
| 40 |
|
| 41 |
if is_causal_conv1d_available():
|
|
|
|
| 153 |
return x_scaled
|
| 154 |
|
| 155 |
|
| 156 |
+
class SeeDNorm(nn.Module):
    """
    Self-Rescaled Dynamic Normalization (SeeDNorm).

    From "SeeDNorm: Self-Rescaled Dynamic Normalization":
        SeeDNorm(x) = [sigma(x . beta^T) * alpha + gamma] (*) x / RMS(x)

    Dynamically adjusts the scaling coefficient based on the current input,
    preserving input norm information and enabling data-dependent normalization.

    Key features:
        - gamma: static scaling factor (like RMSNorm), initialized to 1
        - beta:  self-rescaling parameter, initialized to 0
        - alpha: dynamic modulation parameter, initialized to 1
        - sigma: tanh activation constraining the dynamic rescale to [-1, 1]

    With the default initialization (beta = 0) the module reduces exactly to
    RMSNorm, so it is a drop-in replacement at the start of training.

    Args:
        dim: Hidden dimension size (size of the last axis of the input).
        eps: Small constant for numerical stability.
    """

    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.dim = dim
        self.eps = eps

        # Learnable parameters (roles / init values in the class docstring).
        self.gamma = nn.Parameter(torch.ones(dim))   # static scaling (RMSNorm-like)
        self.beta = nn.Parameter(torch.zeros(dim))   # self-rescaling parameter
        self.alpha = nn.Parameter(torch.ones(dim))   # dynamic modulation parameter

    def _rms_norm(self, x: torch.Tensor) -> torch.Tensor:
        """Compute RMS normalization: x / RMS(x)."""
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply Self-Rescaled Dynamic Normalization.

        Args:
            x: Input tensor of shape (..., dim).

        Returns:
            Normalized and dynamically scaled tensor of the same shape and dtype.
        """
        # Do the whole computation in float32: the token-wise reduction
        # x . beta^T can overflow / lose precision in fp16/bf16, and the RMS
        # statistics were already upcast — keep the dynamic scale consistent.
        x_fp32 = x.float()

        # Input-dependent rescaling: sigma(x . beta^T), one scalar per token.
        rescale_factor = torch.tanh(
            torch.sum(x_fp32 * self.beta.float(), dim=-1, keepdim=True)
        )

        # Dynamic scaling coefficient: sigma(x . beta^T) * alpha + gamma.
        dynamic_scale = rescale_factor * self.alpha.float() + self.gamma.float()

        # RMS-normalize, apply the dynamic scale, and cast back to input dtype.
        output = self._rms_norm(x_fp32) * dynamic_scale
        return output.type_as(x)

    def extra_repr(self) -> str:
        return f"dim={self.dim}, eps={self.eps}"
|
| 218 |
+
|
| 219 |
+
|
| 220 |
class NeoLLMRMSNormGated(nn.Module):
|
| 221 |
+
"""
|
| 222 |
+
Gated RMSNorm variant used in specific contexts.
|
| 223 |
+
"""
|
| 224 |
def __init__(self, hidden_size, eps=1e-6, **kwargs):
|
| 225 |
super().__init__()
|
| 226 |
self.weight = nn.Parameter(torch.ones(hidden_size))
|
|
|
|
| 274 |
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
| 275 |
|
| 276 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
def rotate_half(x):
|
| 278 |
"""Rotates half the hidden dims of the input."""
|
| 279 |
x1 = x[..., : x.shape[-1] // 2]
|
|
|
|
| 341 |
|
| 342 |
class NeoLLMAttention(nn.Module):
|
| 343 |
"""
|
| 344 |
+
Multi-headed attention with FANformer integration, SeeDNorm for Q/K normalization,
|
| 345 |
and ResFormer feature residual connections for enhanced information flow.
|
| 346 |
|
| 347 |
ResFormer enhancement: Applies learnable feature residual connections from the first layer
|
|
|
|
| 380 |
self.o_proj = nn.Linear(
|
| 381 |
config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
|
| 382 |
)
|
| 383 |
+
|
| 384 |
+
# SeeDNorm for Q/K normalization (replaces RMSNorm)
|
| 385 |
+
self.q_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
|
| 386 |
+
self.k_norm = SeeDNorm(self.head_dim, eps=config.rms_norm_eps)
|
| 387 |
|
| 388 |
# Dropout for attention output
|
| 389 |
self.dropout = nn.Dropout(config.dropout_rate)
|
|
|
|
| 421 |
)
|
| 422 |
gate = gate.reshape(*input_shape, -1)
|
| 423 |
|
| 424 |
+
# Apply SeeDNorm to Q and K
|
| 425 |
query_states = self.q_norm(query_states.view(hidden_shape)).transpose(1, 2)
|
| 426 |
key_states = self.k_norm(self.k_proj(hidden_states_fan).view(hidden_shape)).transpose(1, 2)
|
| 427 |
value_states = self.v_proj(hidden_states_fan).view(hidden_shape).transpose(1, 2)
|
|
|
|
| 617 |
|
| 618 |
class NeoLLMGatedDeltaNet(nn.Module):
|
| 619 |
"""
|
| 620 |
+
Linear attention with FANformer integration, SeeDNorm for normalization,
|
| 621 |
and ResFormer feature residual connections for enhanced information flow.
|
| 622 |
|
| 623 |
ResFormer enhancement: Applies learnable feature residual connections from the first layer
|
|
|
|
| 681 |
else FusedRMSNormGated(
|
| 682 |
self.head_v_dim,
|
| 683 |
eps=self.layer_norm_epsilon,
|
| 684 |
+
activation=fla_compatible_activation,
|
| 685 |
device=torch.cuda.current_device(),
|
| 686 |
dtype=config.dtype if config.dtype is not None else torch.get_default_dtype(),
|
| 687 |
)
|
|
|
|
| 900 |
# MLP with FANformer integration
|
| 901 |
self.mlp = NeoLLMMLP(config)
|
| 902 |
|
| 903 |
+
# SeeDNorm for input and post-attention normalization (replaces RMSNorm)
|
| 904 |
+
self.input_layernorm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
|
| 905 |
+
self.post_attention_layernorm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
|
| 906 |
|
| 907 |
# LNS (LayerNorm Scaling) - applies 1/√ℓ scaling
|
| 908 |
self.lns_attn = LNS(layer_idx)
|
|
|
|
| 925 |
) -> torch.FloatTensor:
|
| 926 |
residual = hidden_states
|
| 927 |
|
| 928 |
+
# Apply SeeDNorm normalization
|
| 929 |
hidden_states = self.input_layernorm(hidden_states)
|
| 930 |
|
| 931 |
# Apply LNS scaling after normalization
|
|
|
|
| 1004 |
elif isinstance(module, FANLayer):
|
| 1005 |
# FANLayer initialization is handled within the class
|
| 1006 |
pass
|
| 1007 |
+
elif isinstance(module, SeeDNorm):
|
| 1008 |
+
# SeeDNorm initialization:
|
| 1009 |
+
# gamma (γ) initialized to 1 (default in Parameter definition)
|
| 1010 |
+
# beta (β) initialized to 0 (default in Parameter definition)
|
| 1011 |
+
# alpha (α) initialized to 1 (default in Parameter definition)
|
| 1012 |
+
pass
|
| 1013 |
|
| 1014 |
|
| 1015 |
class NeoLLMModel(NeoLLMPreTrainedModel):
|
|
|
|
| 1021 |
self.layers = nn.ModuleList(
|
| 1022 |
[NeoLLMDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
|
| 1023 |
)
|
| 1024 |
+
# SeeDNorm for final output normalization (replaces RMSNorm)
|
| 1025 |
+
self.norm = SeeDNorm(config.hidden_size, eps=config.rms_norm_eps)
|
| 1026 |
self.rotary_emb = NeoLLMRotaryEmbedding(config=config)
|
| 1027 |
self.gradient_checkpointing = False
|
| 1028 |
|
|
|
|
| 1082 |
if self.first_layer_fan is None and hasattr(decoder_layer, 'current_layer_fan'):
|
| 1083 |
self.first_layer_fan = decoder_layer.current_layer_fan
|
| 1084 |
|
| 1085 |
+
# Apply SeeDNorm for final normalization
|
| 1086 |
hidden_states = self.norm(hidden_states)
|
| 1087 |
|
| 1088 |
return BaseModelOutputWithPast(
|
|
|
|
| 1192 |
"NeoLLMPreTrainedModel",
|
| 1193 |
"NeoLLMConfig",
|
| 1194 |
"FANLayer",
|
| 1195 |
+
"SeeDNorm",
|
| 1196 |
]
|
| 1197 |
|
| 1198 |
# Register the configuration and model for AutoClass support
|