Update modeling_neollm.py

modeling_neollm.py  CHANGED  (+28 −26)
@@ -3,7 +3,6 @@
 NeoLLM Model with FANformer Integration in both Attention and FFN, Dropout Regularization,
 SeeDNorm (Self-Rescaled Dynamic Normalization), ResFormer Value Residual Learning,
 and Learnable Multipliers for enhanced scale adaptation and information flow through deep layers.
-
 Updated to include:
 - Fourier Analysis Network (FAN) layer for effective periodicity modeling in attention (relational space)
 - FAN layer in FFN for featural periodicity modeling (complementary coverage)
@@ -250,37 +249,37 @@ class GPAS(nn.Module):
 
         return x_scaled
 
-
 class SeeDNorm(nn.Module):
     """
-    Self-Rescaled Dynamic Normalization (SeeDNorm)
+    Self-Rescaled Dynamic Normalization (SeeDNorm) with dual dropout regularization.
 
-    From "SeeDNorm: Self-Rescaled Dynamic Normalization":
     SeeDNorm(x) = [σ(x·β^T)·α + γ] ⊙ x/RMS(x)
 
-    Dynamically adjusts the scaling coefficient based on the current input,
-    preserving input norm information and enabling data-dependent normalization.
-
-    Key features:
-    - γ: Static scaling factor (like RMSNorm), initialized to 1
-    - β: Self-rescaling parameter, initialized to 0
-    - α: Dynamic modulation parameter, initialized to 1
-    - σ: tanh activation to constrain dynamic scaling range [-1, 1]
 
     Args:
         dim: Hidden dimension size
         eps: Small constant for numerical stability
+        dropout_input: Dropout on input features for the dynamic mechanism (default: 0.01)
+        dropout_hidden: Dropout on normalized hidden states (default: 0.01)
     """
 
-    def __init__(self, dim: int, eps: float = 1e-6):
+    def __init__(
+        self,
+        dim: int,
+        eps: float = 1e-6,
+        dropout_input: float = 0.01,
+        dropout_hidden: float = 0.01,
+    ):
         super().__init__()
         self.dim = dim
         self.eps = eps
+        self.dropout_input = dropout_input
+        self.dropout_hidden = dropout_hidden
 
         # Learnable parameters
-        self.gamma = nn.Parameter(torch.ones(dim))
-        self.beta = nn.Parameter(torch.zeros(dim))
-        self.alpha = nn.Parameter(torch.ones(dim))
+        self.gamma = nn.Parameter(torch.ones(dim))   # γ: static scaling, init 1
+        self.beta = nn.Parameter(torch.zeros(dim))   # β: self-rescaling, init 0
+        self.alpha = nn.Parameter(torch.ones(dim))   # α: dynamic modulation, init 1
 
     def _rms_norm(self, x: torch.Tensor) -> torch.Tensor:
         """Compute RMS normalization: x / RMS(x)"""

@@ -288,7 +287,7 @@ class SeeDNorm(nn.Module):
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
-        Apply Self-Rescaled Dynamic Normalization.
+        Apply Self-Rescaled Dynamic Normalization with dual dropout.
 
         Args:
             x: Input tensor of shape (..., dim)

@@ -296,24 +295,27 @@ class SeeDNorm(nn.Module):
         Returns:
             Normalized and dynamically scaled tensor of same shape
         """
-        # Compute self-rescaling factor
-        rescale_factor = torch.tanh(torch.sum(x * self.beta, dim=-1, keepdim=True))
+        # Dropout on the copy of the input used for the dynamic mechanism only;
+        # training=self.training keeps dropout inactive at inference
+        x_for_dynamic = F.dropout(x, p=self.dropout_input, training=self.training)
+        rescale_factor = torch.tanh(torch.sum(x_for_dynamic * self.beta,
+                                              dim=-1, keepdim=True))
 
-        # Compute dynamic scaling coefficient
+        # Compute dynamic scaling coefficient: σ(x·β^T)·α + γ
         dynamic_scale = rescale_factor * self.alpha + self.gamma
 
-        # Apply RMS normalization
+        # Apply RMS normalization on the ORIGINAL input (not the dropped version)
         x_normalized = self._rms_norm(x.float())
 
+        x_normalized = F.dropout(x_normalized, p=self.dropout_hidden, training=self.training)
+
         # Apply dynamic scaling
         output = x_normalized * dynamic_scale.float()
 
         return output.type_as(x)
 
     def extra_repr(self) -> str:
-        return f"dim={self.dim}, eps={self.eps}"
-
+        return (f"dim={self.dim}, eps={self.eps}, "
+                f"dropout_input={self.dropout_input}, dropout_hidden={self.dropout_hidden}")
 
 class NeoLLMRotaryEmbedding(nn.Module):
     inv_freq: torch.Tensor  # fix linting for `register_buffer`
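Because β is initialized to zero, σ(x·β^T) vanishes at initialization, so dynamic_scale reduces to γ and SeeDNorm starts out as plain RMSNorm. A minimal standalone sketch (illustrative only, not repo code) of the shape bookkeeping behind the dynamic scale:

import torch

B, T, D = 2, 4, 8                      # batch, sequence, hidden dim
x = torch.randn(B, T, D)
beta = torch.zeros(D)                  # β init 0
alpha = torch.ones(D)                  # α init 1
gamma = torch.ones(D)                  # γ init 1

# σ(x·β^T): one scalar per position, shape (B, T, 1)
rescale_factor = torch.tanh(torch.sum(x * beta, dim=-1, keepdim=True))
# σ(x·β^T)·α + γ broadcasts back to (B, T, D)
dynamic_scale = rescale_factor * alpha + gamma

# With β = 0 the rescale factor is zero, so dynamic_scale == γ everywhere:
assert torch.allclose(dynamic_scale, torch.ones(B, T, D))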
@@ -1049,4 +1051,4 @@ __all__ = [
 # Register the configuration and model for AutoClass support
 AutoConfig.register("neollm", NeoLLMConfig)
 AutoModel.register(NeoLLMConfig, NeoLLMModel)
-AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)
+AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)
After the change, the full SeeDNorm class reads as follows (the `_rms_norm` body is not shown in the diff; a standard RMS implementation is sketched in its place):

class SeeDNorm(nn.Module):
    """
    Self-Rescaled Dynamic Normalization (SeeDNorm) with dual dropout regularization.

    SeeDNorm(x) = [σ(x·β^T)·α + γ] ⊙ x/RMS(x)

    Args:
        dim: Hidden dimension size
        eps: Small constant for numerical stability
        dropout_input: Dropout on input features for the dynamic mechanism (default: 0.01)
        dropout_hidden: Dropout on normalized hidden states (default: 0.01)
    """

    def __init__(
        self,
        dim: int,
        eps: float = 1e-6,
        dropout_input: float = 0.01,
        dropout_hidden: float = 0.01,
    ):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.dropout_input = dropout_input
        self.dropout_hidden = dropout_hidden

        # Learnable parameters
        self.gamma = nn.Parameter(torch.ones(dim))   # γ: static scaling, init 1
        self.beta = nn.Parameter(torch.zeros(dim))   # β: self-rescaling, init 0
        self.alpha = nn.Parameter(torch.ones(dim))   # α: dynamic modulation, init 1

    def _rms_norm(self, x: torch.Tensor) -> torch.Tensor:
        """Compute RMS normalization: x / RMS(x)"""
        # Body not shown in this diff; a standard RMS implementation:
        return x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Apply Self-Rescaled Dynamic Normalization with dual dropout.

        Args:
            x: Input tensor of shape (..., dim)

        Returns:
            Normalized and dynamically scaled tensor of same shape
        """
        # Dropout on the copy of the input used for the dynamic mechanism only
        x_for_dynamic = F.dropout(x, p=self.dropout_input, training=self.training)
        rescale_factor = torch.tanh(torch.sum(x_for_dynamic * self.beta,
                                              dim=-1, keepdim=True))

        # Compute dynamic scaling coefficient: σ(x·β^T)·α + γ
        dynamic_scale = rescale_factor * self.alpha + self.gamma

        # Apply RMS normalization on the ORIGINAL input (not the dropped version)
        x_normalized = self._rms_norm(x.float())

        x_normalized = F.dropout(x_normalized, p=self.dropout_hidden, training=self.training)

        # Apply dynamic scaling
        output = x_normalized * dynamic_scale.float()

        return output.type_as(x)

    def extra_repr(self) -> str:
        return (f"dim={self.dim}, eps={self.eps}, "
                f"dropout_input={self.dropout_input}, dropout_hidden={self.dropout_hidden}")
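Since both dropout sites pass training=self.training, the module is deterministic in eval mode. A quick sanity-check sketch (hypothetical, assuming the class above plus torch are importable):

import torch

norm = SeeDNorm(dim=64, dropout_input=0.1, dropout_hidden=0.1)
x = torch.randn(2, 16, 64)

norm.eval()                             # dropout disabled
assert torch.equal(norm(x), norm(x))    # deterministic in eval mode

norm.train()                            # dropout active; repeated calls generally differ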
At the bottom of the file, the AutoClass registration block now ends with:

# Register the configuration and model for AutoClass support
AutoConfig.register("neollm", NeoLLMConfig)
AutoModel.register(NeoLLMConfig, NeoLLMModel)
AutoModelForCausalLM.register(NeoLLMConfig, NeoLLMForCausalLM)
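With these registrations, NeoLLM checkpoints resolve through the standard transformers Auto classes. A hedged usage sketch; the checkpoint path is a placeholder, and trust_remote_code=True is needed when the modeling code ships inside a Hub repo:

from transformers import AutoConfig, AutoModelForCausalLM

# Placeholder path; substitute a real checkpoint directory or Hub repo id.
config = AutoConfig.from_pretrained("path/to/neollm-checkpoint", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("path/to/neollm-checkpoint", trust_remote_code=True)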