add attn_rms_norm_eps (#8)
Browse files
- use specific rms_norm_eps for attn layer (d417ea779138edf602457609bc43ecb399d188ff)
- config.json +2 -1
- configuration_motif.py +6 -2
- modeling_motif.py +1 -1
config.json
CHANGED
|
@@ -22,6 +22,7 @@
|
|
| 22 |
"num_hidden_layers": 32,
|
| 23 |
"num_key_value_heads": 16,
|
| 24 |
"rms_norm_eps": 1e-06,
|
|
|
|
| 25 |
"rope_scaling": null,
|
| 26 |
"rope_theta": 500000.0,
|
| 27 |
"sliding_window": null,
|
|
@@ -32,4 +33,4 @@
|
|
| 32 |
"use_cache": true,
|
| 33 |
"use_sliding_window": false,
|
| 34 |
"vocab_size": 219520
|
| 35 |
-
}
|
|
|
|
| 22 |
"num_hidden_layers": 32,
|
| 23 |
"num_key_value_heads": 16,
|
| 24 |
"rms_norm_eps": 1e-06,
|
| 25 |
+
"attn_rms_norm_eps": 1e-05,
|
| 26 |
"rope_scaling": null,
|
| 27 |
"rope_theta": 500000.0,
|
| 28 |
"sliding_window": null,
|
|
|
|
| 33 |
"use_cache": true,
|
| 34 |
"use_sliding_window": false,
|
| 35 |
"vocab_size": 219520
|
| 36 |
+
}
|
configuration_motif.py
CHANGED
|
@@ -42,7 +42,9 @@ class MotifConfig(PretrainedConfig):
|
|
| 42 |
initializer_range (`float`, *optional*, defaults to 0.02):
|
| 43 |
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
| 44 |
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
|
| 45 |
-
The epsilon used by the rms normalization layers.
|
|
|
|
|
|
|
| 46 |
use_cache (`bool`, *optional*, defaults to `True`):
|
| 47 |
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
| 48 |
relevant if `config.is_decoder=True`.
|
|
@@ -120,6 +122,7 @@ class MotifConfig(PretrainedConfig):
|
|
| 120 |
max_position_embeddings=32768,
|
| 121 |
initializer_range=0.02,
|
| 122 |
rms_norm_eps=1e-6,
|
|
|
|
| 123 |
use_cache=True,
|
| 124 |
tie_word_embeddings=False,
|
| 125 |
rope_theta=10000.0,
|
|
@@ -149,6 +152,7 @@ class MotifConfig(PretrainedConfig):
|
|
| 149 |
self.hidden_act = hidden_act
|
| 150 |
self.initializer_range = initializer_range
|
| 151 |
self.rms_norm_eps = rms_norm_eps
|
|
|
|
| 152 |
self.use_cache = use_cache
|
| 153 |
self.rope_theta = rope_theta
|
| 154 |
self.rope_scaling = rope_scaling
|
|
@@ -164,4 +168,4 @@ class MotifConfig(PretrainedConfig):
|
|
| 164 |
tie_word_embeddings=tie_word_embeddings,
|
| 165 |
**kwargs,
|
| 166 |
)
|
| 167 |
-
logger.info(f' kwargs : {kwargs}')
|
|
|
|
| 42 |
initializer_range (`float`, *optional*, defaults to 0.02):
|
| 43 |
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
| 44 |
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
|
| 45 |
+
The epsilon used by the rms normalization layers, except for the rms normalization in the attention layer.
|
| 46 |
+
attn_rms_norm_eps (`float`, *optional*, defaults to 1e-05):
|
| 47 |
+
The epsilon used by the rms normalization in the attention layer.
|
| 48 |
use_cache (`bool`, *optional*, defaults to `True`):
|
| 49 |
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
| 50 |
relevant if `config.is_decoder=True`.
|
|
|
|
| 122 |
max_position_embeddings=32768,
|
| 123 |
initializer_range=0.02,
|
| 124 |
rms_norm_eps=1e-6,
|
| 125 |
+
attn_rms_norm_eps=1e-5,
|
| 126 |
use_cache=True,
|
| 127 |
tie_word_embeddings=False,
|
| 128 |
rope_theta=10000.0,
|
|
|
|
| 152 |
self.hidden_act = hidden_act
|
| 153 |
self.initializer_range = initializer_range
|
| 154 |
self.rms_norm_eps = rms_norm_eps
|
| 155 |
+
self.attn_rms_norm_eps = attn_rms_norm_eps
|
| 156 |
self.use_cache = use_cache
|
| 157 |
self.rope_theta = rope_theta
|
| 158 |
self.rope_scaling = rope_scaling
|
|
|
|
| 168 |
tie_word_embeddings=tie_word_embeddings,
|
| 169 |
**kwargs,
|
| 170 |
)
|
| 171 |
+
logger.info(f' kwargs : {kwargs}')
|
modeling_motif.py
CHANGED
|
@@ -362,7 +362,7 @@ class MotifAttention(nn.Module):
|
|
| 362 |
setattr(self, name, nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32)))
|
| 363 |
getattr(self, name).data.normal_(mean=0.0, std=0.1)
|
| 364 |
|
| 365 |
-
self.subln = MotifRMSNorm(2 * self.head_dim, eps=config.rms_norm_eps)
|
| 366 |
self.lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))
|
| 367 |
|
| 368 |
self.rotary_emb = MotifRotaryEmbeddingWithCache(self.head_dim,
|
|
|
|
| 362 |
setattr(self, name, nn.Parameter(torch.zeros(self.head_dim, dtype=torch.float32)))
|
| 363 |
getattr(self, name).data.normal_(mean=0.0, std=0.1)
|
| 364 |
|
| 365 |
+
self.subln = MotifRMSNorm(2 * self.head_dim, eps=config.attn_rms_norm_eps)
|
| 366 |
self.lambda_init = 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))
|
| 367 |
|
| 368 |
self.rotary_emb = MotifRotaryEmbeddingWithCache(self.head_dim,
|