KitsuVp committed on
Commit
b8e000c
·
verified ·
1 Parent(s): ea5ba09

Update modeling_neollm.py

Browse files
Files changed (1) hide show
  1. modeling_neollm.py +28 -26
modeling_neollm.py CHANGED
@@ -3,7 +3,6 @@
3
  NeoLLM Model with FANformer Integration in both Attention and FFN, Dropout Regularization,
4
  SeeDNorm (Self-Rescaled Dynamic Normalization), ResFormer Value Residual Learning,
5
  and Learnable Multipliers for enhanced scale adaptation and information flow through deep layers.
6
-
7
  Updated to include:
8
  - Fourier Analysis Network (FAN) layer for effective periodicity modeling in attention (relational space)
9
  - FAN layer in FFN for featural periodicity modeling (complementary coverage)
@@ -250,37 +249,37 @@ class GPAS(nn.Module):
250
 
251
  return x_scaled
252
 
253
-
254
  class SeeDNorm(nn.Module):
255
  """
256
- Self-Rescaled Dynamic Normalization (SeeDNorm)
257
 
258
- From "SeeDNorm: Self-Rescaled Dynamic Normalization":
259
  SeeDNorm(x) = [σ(x·β^T)·α + γ] ⊙ x/RMS(x)
260
-
261
- Dynamically adjusts the scaling coefficient based on the current input,
262
- preserving input norm information and enabling data-dependent normalization.
263
-
264
- Key features:
265
- - γ: Static scaling factor (like RMSNorm), initialized to 1
266
- - β: Self-rescaling parameter, initialized to 0
267
- - α: Dynamic modulation parameter, initialized to 1
268
- - σ: tanh activation to constrain dynamic scaling range [-1, 1]
269
 
270
  Args:
271
  dim: Hidden dimension size
272
  eps: Small constant for numerical stability
 
 
273
  """
274
 
275
- def __init__(self, dim: int, eps: float = 1e-6):
 
 
 
 
 
 
276
  super().__init__()
277
  self.dim = dim
278
  self.eps = eps
 
 
279
 
280
  # Learnable parameters
281
- self.gamma = nn.Parameter(torch.ones(dim)) # γ: static scaling (RMSNorm-like)
282
- self.beta = nn.Parameter(torch.zeros(dim)) # β: self-rescaling parameter
283
- self.alpha = nn.Parameter(torch.ones(dim)) # α: dynamic modulation parameter
284
 
285
  def _rms_norm(self, x: torch.Tensor) -> torch.Tensor:
286
  """Compute RMS normalization: x / RMS(x)"""
@@ -288,7 +287,7 @@ class SeeDNorm(nn.Module):
288
 
289
  def forward(self, x: torch.Tensor) -> torch.Tensor:
290
  """
291
- Apply Self-Rescaled Dynamic Normalization.
292
 
293
  Args:
294
  x: Input tensor of shape (..., dim)
@@ -296,24 +295,27 @@ class SeeDNorm(nn.Module):
296
  Returns:
297
  Normalized and dynamically scaled tensor of same shape
298
  """
299
- # Compute input-dependent rescaling: σ(x·β^T)
300
- # x·β^T produces scalar per token via dot product
301
- rescale_factor = torch.tanh(torch.sum(x * self.beta, dim=-1, keepdim=True))
 
302
 
303
- # Dynamic scaling coefficient: σ(x·β^T)·α + γ
304
  dynamic_scale = rescale_factor * self.alpha + self.gamma
305
 
306
- # Apply RMS normalization
307
  x_normalized = self._rms_norm(x.float())
308
 
 
 
309
  # Apply dynamic scaling
310
  output = x_normalized * dynamic_scale.float()
311
 
312
  return output.type_as(x)
313
 
314
  def extra_repr(self) -> str:
315
- return f"dim={self.dim}, eps={self.eps}"
316
-
317
 
318
  class NeoLLMRotaryEmbedding(nn.Module):
319
  inv_freq: torch.Tensor # fix linting for `register_buffer`
@@ -1049,4 +1051,4 @@ __all__ = [
1049
  # Register the configuration and model for AutoClass support
1050
  AutoConfig.register("neollm", NeoLLMConfig)
1051
  AutoModel.register(NeoLLMConfig, NeoLLMModel)
1052
- AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)
 
3
  NeoLLM Model with FANformer Integration in both Attention and FFN, Dropout Regularization,
4
  SeeDNorm (Self-Rescaled Dynamic Normalization), ResFormer Value Residual Learning,
5
  and Learnable Multipliers for enhanced scale adaptation and information flow through deep layers.
 
6
  Updated to include:
7
  - Fourier Analysis Network (FAN) layer for effective periodicity modeling in attention (relational space)
8
  - FAN layer in FFN for featural periodicity modeling (complementary coverage)
 
249
 
250
  return x_scaled
251
 
 
252
class SeeDNorm(nn.Module):
    """
    Self-Rescaled Dynamic Normalization (SeeDNorm) with dual dropout regularization.

    SeeDNorm(x) = [σ(x·β^T)·α + γ] ⊙ x/RMS(x)

    Args:
        dim: Hidden dimension size
        eps: Small constant for numerical stability
        dropout_input: Dropout probability on the input copy fed to the
            dynamic rescaling mechanism (default: 0.01)
        dropout_hidden: Dropout probability on the normalized hidden states
            (default: 0.01)
    """

    def __init__(
        self,
        dim: int,
        eps: float = 1e-6,
        dropout_input: float = 0.01,
        dropout_hidden: float = 0.01,
    ):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.dropout_input = dropout_input
        self.dropout_hidden = dropout_hidden

        # Learnable parameters
        self.gamma = nn.Parameter(torch.ones(dim))   # γ: static scaling (RMSNorm-like)
        self.beta = nn.Parameter(torch.zeros(dim))   # β: self-rescaling parameter
        self.alpha = nn.Parameter(torch.ones(dim))   # α: dynamic modulation parameter

    def _rms_norm(self, x: torch.Tensor) -> torch.Tensor:
        """Compute RMS normalization: x / RMS(x)."""
        # NOTE(review): implementation reconstructed from the docstring formula
        # x / sqrt(mean(x^2) + eps) — confirm against the original source line.
        return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply Self-Rescaled Dynamic Normalization with dual dropout.

        Args:
            x: Input tensor of shape (..., dim)

        Returns:
            Normalized and dynamically scaled tensor of same shape
        """
        # BUG FIX: F.dropout defaults to training=True, which silently applies
        # dropout at inference time; gate both dropouts on self.training so
        # eval-mode outputs are deterministic.
        x_for_dynamic = F.dropout(x, p=self.dropout_input, training=self.training)

        # Input-dependent rescaling σ(x·β^T): one scalar per token via dot product.
        rescale_factor = torch.tanh(
            torch.sum(x_for_dynamic * self.beta, dim=-1, keepdim=True)
        )

        # Dynamic scaling coefficient: σ(x·β^T)·α + γ
        dynamic_scale = rescale_factor * self.alpha + self.gamma

        # RMS-normalize the ORIGINAL input (not the dropped copy), in fp32
        # for numerical stability.
        x_normalized = self._rms_norm(x.float())

        # Second dropout, on the normalized hidden states (also training-gated).
        x_normalized = F.dropout(x_normalized, p=self.dropout_hidden,
                                 training=self.training)

        # Apply dynamic scaling, then cast back to the input dtype.
        output = x_normalized * dynamic_scale.float()
        return output.type_as(x)

    def extra_repr(self) -> str:
        return (f"dim={self.dim}, eps={self.eps}, "
                f"dropout_input={self.dropout_input}, dropout_hidden={self.dropout_hidden}")
319
 
320
  class NeoLLMRotaryEmbedding(nn.Module):
321
  inv_freq: torch.Tensor # fix linting for `register_buffer`
 
1051
  # Register the configuration and model for AutoClass support
1052
  AutoConfig.register("neollm", NeoLLMConfig)
1053
  AutoModel.register(NeoLLMConfig, NeoLLMModel)
1054
+ AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)