tattabio
/

gLM2_650M_embed

Model card Files Files and versions

andrecornman committed on Feb 11

Commit

45bc549

·

verified ·

1 Parent(s): 1b5c960

fix init_weights

Files changed (1) hide show

modeling_glm2.py +19 -3

modeling_glm2.py CHANGED Viewed

@@ -353,7 +353,7 @@ class gLM2PreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = False
     # https://github.com/huggingface/transformers/blob/7032e0203262ebb2ebf55da8d2e01f873973e835/src/transformers/models/bert/modeling_bert.py#L748
-    def _init_weights(module, initializer_range=0.02):
         if isinstance(module, nn.Linear):
             nn.init.normal_(module.weight, std=initializer_range)
             if module.bias is not None:
@@ -362,7 +362,22 @@ class gLM2PreTrainedModel(PreTrainedModel):
             nn.init.normal_(module.weight, std=initializer_range)
             if module.padding_idx is not None:
                 nn.init.zeros_(module.weight[module.padding_idx])
 class gLM2Model(gLM2PreTrainedModel):
     """gLM2 Model."""
@@ -438,6 +453,7 @@ class gLM2ForEmbedding(gLM2PreTrainedModel):
         self.glm2 = gLM2Model(config)
         self.pool = MeanPooling()
         self.projection = nn.Linear(config.dim, config.projection_dim, bias=False)
     def forward(
         self,
@@ -466,7 +482,7 @@ class gLM2ForMaskedLM(gLM2PreTrainedModel):
         self.glm2 = gLM2Model(config)
         self.lm_head = gLM2LMHead(config)
-        self.init_weights()
     def forward(
         self,

     supports_gradient_checkpointing = False
     # https://github.com/huggingface/transformers/blob/7032e0203262ebb2ebf55da8d2e01f873973e835/src/transformers/models/bert/modeling_bert.py#L748
+    def _init_weights(self, module, initializer_range=0.02):
         if isinstance(module, nn.Linear):
             nn.init.normal_(module.weight, std=initializer_range)
             if module.bias is not None:
             nn.init.normal_(module.weight, std=initializer_range)
             if module.padding_idx is not None:
                 nn.init.zeros_(module.weight[module.padding_idx])
+        elif isinstance(module, RotaryEmbedding):
+            # Re-calculate the frequencies using the module's stored attributes
+            inv_freq = 1.0 / (
+                module.base
+                ** (
+                    torch.arange(0, module.dim, 2, device=module.inv_freq.device, dtype=torch.float32)
+                    / module.dim
+                )
+            )
+            # Force the buffer to update
+            with torch.no_grad():
+                module.inv_freq.copy_(inv_freq)
+        elif isinstance(module, RMSNorm):
+            if hasattr(module, "variance_epsilon"):
+                with torch.no_grad():
+                    module.variance_epsilon.fill_(self.config.norm_eps)
 class gLM2Model(gLM2PreTrainedModel):
     """gLM2 Model."""
         self.glm2 = gLM2Model(config)
         self.pool = MeanPooling()
         self.projection = nn.Linear(config.dim, config.projection_dim, bias=False)
+        self.post_init()
     def forward(
         self,
         self.glm2 = gLM2Model(config)
         self.lm_head = gLM2LMHead(config)
+        self.post_init()
     def forward(
         self,