Update modeling_neollm.py

modeling_neollm.py  CHANGED  (+28 −26)
@@ -3,7 +3,6 @@
 NeoLLM Model with FANformer Integration in both Attention and FFN, Dropout Regularization,
 SeeDNorm (Self-Rescaled Dynamic Normalization), ResFormer Value Residual Learning,
 and Learnable Multipliers for enhanced scale adaptation and information flow through deep layers.
-
 Updated to include:
 - Fourier Analysis Network (FAN) layer for effective periodicity modeling in attention (relational space)
 - FAN layer in FFN for featural periodicity modeling (complementary coverage)
@@ -250,37 +249,37 @@ class GPAS(nn.Module):
 
         return x_scaled
 
-
 class SeeDNorm(nn.Module):
     """
-    Self-Rescaled Dynamic Normalization (SeeDNorm)
+    Self-Rescaled Dynamic Normalization (SeeDNorm) with dual dropout regularization.
 
-    From "SeeDNorm: Self-Rescaled Dynamic Normalization":
     SeeDNorm(x) = [σ(x·β^T)·α + γ] ⊙ x/RMS(x)
 
-    Dynamically adjusts the scaling coefficient based on the current input,
-    preserving input norm information and enabling data-dependent normalization.
-
-    Key features:
-    - γ: Static scaling factor (like RMSNorm), initialized to 1
-    - β: Self-rescaling parameter, initialized to 0
-    - α: Dynamic modulation parameter, initialized to 1
-    - σ: tanh activation to constrain dynamic scaling range [-1, 1]
 
     Args:
         dim: Hidden dimension size
         eps: Small constant for numerical stability
+        dropout_input: Dropout on input features for the dynamic mechanism (default: 0.01)
+        dropout_hidden: Dropout on normalized hidden states (default: 0.01)
     """
 
-    def __init__(self, dim: int, eps: float = 1e-6):
+    def __init__(
+        self,
+        dim: int,
+        eps: float = 1e-6,
+        dropout_input: float = 0.01,
+        dropout_hidden: float = 0.01,
+    ):
         super().__init__()
         self.dim = dim
         self.eps = eps
+        self.dropout_input = dropout_input
+        self.dropout_hidden = dropout_hidden
 
         # Learnable parameters
-        self.gamma = nn.Parameter(torch.ones(dim))
-        self.beta = nn.Parameter(torch.zeros(dim))
-        self.alpha = nn.Parameter(torch.ones(dim))
+        self.gamma = nn.Parameter(torch.ones(dim))   # γ: static scaling, init 1
+        self.beta = nn.Parameter(torch.zeros(dim))   # β: self-rescaling, init 0
+        self.alpha = nn.Parameter(torch.ones(dim))   # α: dynamic modulation, init 1
 
     def _rms_norm(self, x: torch.Tensor) -> torch.Tensor:
         """Compute RMS normalization: x / RMS(x)"""

@@ -288,7 +287,7 @@ class SeeDNorm(nn.Module):
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
-        Apply Self-Rescaled Dynamic Normalization.
+        Apply Self-Rescaled Dynamic Normalization with dual dropout.
 
         Args:
             x: Input tensor of shape (..., dim)

@@ -296,24 +295,27 @@ class SeeDNorm(nn.Module):
         Returns:
             Normalized and dynamically scaled tensor of same shape
         """
-        # Compute self-rescaling factor
-        rescale_factor = torch.tanh(torch.sum(x * self.beta, dim=-1, keepdim=True))
+        # Dropout on the copy of the input used for the dynamic mechanism only;
+        # training=self.training keeps dropout inactive at inference
+        x_for_dynamic = F.dropout(x, p=self.dropout_input, training=self.training)
+        rescale_factor = torch.tanh(torch.sum(x_for_dynamic * self.beta,
+                                              dim=-1, keepdim=True))
 
-        # Compute dynamic scaling coefficient
+        # Compute dynamic scaling coefficient: σ(x·β^T)·α + γ
         dynamic_scale = rescale_factor * self.alpha + self.gamma
 
-        # Apply RMS normalization
+        # Apply RMS normalization on the ORIGINAL input (not the dropped version)
         x_normalized = self._rms_norm(x.float())
 
+        x_normalized = F.dropout(x_normalized, p=self.dropout_hidden, training=self.training)
+
         # Apply dynamic scaling
         output = x_normalized * dynamic_scale.float()
 
         return output.type_as(x)
 
     def extra_repr(self) -> str:
-        return f"dim={self.dim}, eps={self.eps}"
-
+        return (f"dim={self.dim}, eps={self.eps}, "
+                f"dropout_input={self.dropout_input}, dropout_hidden={self.dropout_hidden}")
 
 class NeoLLMRotaryEmbedding(nn.Module):
     inv_freq: torch.Tensor  # fix linting for `register_buffer`
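Because β is initialized to zero, σ(x·β^T) vanishes at initialization, so dynamic_scale reduces to γ and SeeDNorm starts out as plain RMSNorm. A minimal standalone sketch (illustrative only, not repo code) of the shape bookkeeping behind the dynamic scale:

import torch

B, T, D = 2, 4, 8                      # batch, sequence, hidden dim
x = torch.randn(B, T, D)
beta = torch.zeros(D)                  # β init 0
alpha = torch.ones(D)                  # α init 1
gamma = torch.ones(D)                  # γ init 1

# σ(x·β^T): one scalar per position, shape (B, T, 1)
rescale_factor = torch.tanh(torch.sum(x * beta, dim=-1, keepdim=True))
# σ(x·β^T)·α + γ broadcasts back to (B, T, D)
dynamic_scale = rescale_factor * alpha + gamma

# With β = 0 the rescale factor is zero, so dynamic_scale == γ everywhere:
assert torch.allclose(dynamic_scale, torch.ones(B, T, D))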
@@ -1049,4 +1051,4 @@ __all__ = [
 # Register the configuration and model for AutoClass support
 AutoConfig.register("neollm", NeoLLMConfig)
 AutoModel.register(NeoLLMConfig, NeoLLMModel)
-AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)
+AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)
After the change, the full SeeDNorm class reads as follows (the `_rms_norm` body is not shown in the diff; a standard RMS implementation is sketched in its place):

class SeeDNorm(nn.Module):
    """
    Self-Rescaled Dynamic Normalization (SeeDNorm) with dual dropout regularization.

    SeeDNorm(x) = [σ(x·β^T)·α + γ] ⊙ x/RMS(x)

    Args:
        dim: Hidden dimension size
        eps: Small constant for numerical stability
        dropout_input: Dropout on input features for the dynamic mechanism (default: 0.01)
        dropout_hidden: Dropout on normalized hidden states (default: 0.01)
    """

    def __init__(
        self,
        dim: int,
        eps: float = 1e-6,
        dropout_input: float = 0.01,
        dropout_hidden: float = 0.01,
    ):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.dropout_input = dropout_input
        self.dropout_hidden = dropout_hidden

        # Learnable parameters
        self.gamma = nn.Parameter(torch.ones(dim))   # γ: static scaling, init 1
        self.beta = nn.Parameter(torch.zeros(dim))   # β: self-rescaling, init 0
        self.alpha = nn.Parameter(torch.ones(dim))   # α: dynamic modulation, init 1

    def _rms_norm(self, x: torch.Tensor) -> torch.Tensor:
        """Compute RMS normalization: x / RMS(x)"""
        # Body not shown in this diff; a standard RMS implementation:
        return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply Self-Rescaled Dynamic Normalization with dual dropout.

        Args:
            x: Input tensor of shape (..., dim)

        Returns:
            Normalized and dynamically scaled tensor of same shape
        """
        # Dropout on the copy of the input used for the dynamic mechanism only
        x_for_dynamic = F.dropout(x, p=self.dropout_input, training=self.training)
        rescale_factor = torch.tanh(torch.sum(x_for_dynamic * self.beta,
                                              dim=-1, keepdim=True))

        # Compute dynamic scaling coefficient: σ(x·β^T)·α + γ
        dynamic_scale = rescale_factor * self.alpha + self.gamma

        # Apply RMS normalization on the ORIGINAL input (not the dropped version)
        x_normalized = self._rms_norm(x.float())

        x_normalized = F.dropout(x_normalized, p=self.dropout_hidden, training=self.training)

        # Apply dynamic scaling
        output = x_normalized * dynamic_scale.float()

        return output.type_as(x)

    def extra_repr(self) -> str:
        return (f"dim={self.dim}, eps={self.eps}, "
                f"dropout_input={self.dropout_input}, dropout_hidden={self.dropout_hidden}")
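Since both dropout sites pass training=self.training, the module is deterministic in eval mode. A quick sanity-check sketch (hypothetical, assuming the class above plus torch are importable):

import torch

norm = SeeDNorm(dim=64, dropout_input=0.1, dropout_hidden=0.1)
x = torch.randn(2, 16, 64)

norm.eval()                             # dropout disabled
assert torch.equal(norm(x), norm(x))    # deterministic in eval mode

norm.train()                            # dropout active; repeated calls generally differ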
At the bottom of the file, the AutoClass registration block now ends with:

# Register the configuration and model for AutoClass support
AutoConfig.register("neollm", NeoLLMConfig)
AutoModel.register(NeoLLMConfig, NeoLLMModel)
AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)
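With these registrations, NeoLLM checkpoints resolve through the standard transformers Auto classes. A hedged usage sketch; the checkpoint path is a placeholder, and trust_remote_code=True is needed when the modeling code ships inside a Hub repo:

from transformers import AutoConfig, AutoModelForCausalLM

# Placeholder path; substitute a real checkpoint directory or Hub repo id.
config = AutoConfig.from_pretrained("path/to/neollm-checkpoint", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("path/to/neollm-checkpoint", trust_remote_code=True)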