add attn_rms_norm_eps (#8)
Browse files
- use specific rms_norm_eps for attn layer (d417ea779138edf602457609bc43ecb399d188ff)
- config.json +2 -1
- configuration_motif.py +6 -2
- modeling_motif.py +1 -1
config.json
CHANGED
|
@@ -22,6 +22,7 @@
|
|
| 22 |
"num_hidden_layers": 32,
|
| 23 |
"num_key_value_heads": 16,
|
| 24 |
"rms_norm_eps": 1e-06,
|
|
|
|
| 25 |
"rope_scaling": null,
|
| 26 |
"rope_theta": 500000.0,
|
| 27 |
"sliding_window": null,
|
|
@@ -32,4 +33,4 @@
|
|
| 32 |
"use_cache": true,
|
| 33 |
"use_sliding_window": false,
|
| 34 |
"vocab_size": 219520
|
| 35 |
-
}
|
|
|
|
| 22 |
"num_hidden_layers": 32,
|
| 23 |
"num_key_value_heads": 16,
|
| 24 |
"rms_norm_eps": 1e-06,
|
| 25 |
+
"attn_rms_norm_eps": 1e-05,
|
| 26 |
"rope_scaling": null,
|
| 27 |
"rope_theta": 500000.0,
|
| 28 |
"sliding_window": null,
|
|
|
|
| 33 |
"use_cache": true,
|
| 34 |
"use_sliding_window": false,
|
| 35 |
"vocab_size": 219520
|
| 36 |
+
}
|
configuration_motif.py
CHANGED
|
@@ -42,7 +42,9 @@ class MotifConfig(PretrainedConfig):
|
|
| 42 |
initializer_range (`float`, *optional*, defaults to 0.02):
|
| 43 |
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
| 44 |
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
|
| 45 |
-
The epsilon used by the rms normalization layers.
|
|
|
|
|
|
|
| 46 |
use_cache (`bool`, *optional*, defaults to `True`):
|
| 47 |
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
| 48 |
relevant if `config.is_decoder=True`.
|
|
@@ -120,6 +122,7 @@ class MotifConfig(PretrainedConfig):
|
|
| 120 |
max_position_embeddings=32768,
|
| 121 |
initializer_range=0.02,
|
| 122 |
rms_norm_eps=1e-6,
|
|
|
|
| 123 |
use_cache=True,
|
| 124 |
tie_word_embeddings=False,
|
| 125 |
rope_theta=10000.0,
|
|
@@ -149,6 +152,7 @@ class MotifConfig(PretrainedConfig):
|
|
| 149 |
self.hidden_act = hidden_act
|
| 150 |
self.initializer_range = initializer_range
|
| 151 |
self.rms_norm_eps = rms_norm_eps
|
|
|
|
| 152 |
self.use_cache = use_cache
|
| 153 |
self.rope_theta = rope_theta
|
| 154 |
self.rope_scaling = rope_scaling
|
|
@@ -164,4 +168,4 @@ class MotifConfig(PretrainedConfig):
|
|
| 164 |
tie_word_embeddings=tie_word_embeddings,
|
| 165 |
**kwargs,
|
| 166 |
)
|
| 167 |
-
logger.info(f' kwargs : {kwargs}')
|
|
|
|
| 42 |
initializer_range (`float`, *optional*, defaults to 0.02):
|
| 43 |
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
| 44 |
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
|
| 45 |
+
The epsilon used by the rms normalization layers, except for the rms normalization in the attention layer.
|
| 46 |
+
attn_rms_norm_eps (`float`, *optional*, defaults to 1e-05):
|
| 47 |
+
The epsilon used by the rms normalization in the attention layer.
|
| 48 |
use_cache (`bool`, *optional*, defaults to `True`):
|
| 49 |
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
| 50 |
relevant if `config.is_decoder=True`.
|
|
|
|
| 122 |
max_position_embeddings=32768,
|
| 123 |
initializer_range=0.02,
|
| 124 |
rms_norm_eps=1e-6,
|
| 125 |
+
attn_rms_norm_eps=1e-5,
|
| 126 |
use_cache=True,
|
| 127 |
tie_word_embeddings=False,
|
| 128 |
rope_theta=10000.0,
|
|
|
|
| 152 |
self.hidden_act = hidden_act
|
| 153 |
self.initializer_range = initializer_range
|
| 154 |
self.rms_norm_eps = rms_norm_eps
|
| 155 |
+
self.attn_rms_norm_eps = attn_rms_norm_eps
|
| 156 |
self.use_cache = use_cache
|
| 157 |
self.rope_theta = rope_theta
|
| 158 |
self.rope_scaling = rope_scaling
|
|
|
|
| 168 |
tie_word_embeddings=tie_word_embeddings,
|
| 169 |
**kwargs,
|
| 170 |
)
|
| 171 |
+
logger.info(f' kwargs : {kwargs}')
|
modeling_motif.py
CHANGED
|
@@ -362,7 +362,7 @@ class MotifAttention(nn.Module):
|
|
| 362 |
setattr(self, name, nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32)))
|
| 363 |
getattr(self, name).data.normal_(mean=0.0, std=0.1)
|
| 364 |
|
| 365 |
-
self.subln = MotifRMSNorm(2 * self.head_dim, eps=config.rms_norm_eps)
|
| 366 |
self.lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))
|
| 367 |
|
| 368 |
self.rotary_emb = MotifRotaryEmbeddingWithCache(self.head_dim,
|
|
|
|
| 362 |
setattr(self, name, nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32)))
|
| 363 |
getattr(self, name).data.normal_(mean=0.0, std=0.1)
|
| 364 |
|
| 365 |
+
self.subln = MotifRMSNorm(2 * self.head_dim, eps=config.attn_rms_norm_eps)
|
| 366 |
self.lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))
|
| 367 |
|
| 368 |
self.rotary_emb = MotifRotaryEmbeddingWithCache(self.head_dim,
|