Upload folder using huggingface_hub

Browse files

Files changed (4)

esm_nv.py +26 -2
model-00001-of-00013.safetensors +2 -2
model-00013-of-00013.safetensors +2 -2
model.safetensors.index.json +2 -2

esm_nv.py CHANGED Viewed

@@ -259,9 +259,13 @@ class NVEsmPreTrainedModel(PreTrainedModel):
         "EsmEmbeddings",
     )
-    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
     def _init_weights(self, module: nn.Module):
-        """Initialize the weights.
         Args:
             module (nn.Module): The module to initialize the weights for.
@@ -282,9 +286,29 @@ class NVEsmPreTrainedModel(PreTrainedModel):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
         if isinstance(module, transformer_engine.pytorch.LayerNormLinear):
             module.layer_norm_weight.data.fill_(1.0)
             if module.layer_norm_bias is not None:
                 module.layer_norm_bias.data.zero_()
     @classmethod
     def get_init_context(cls, is_quantized: bool, _is_ds_init_called: bool):

         "EsmEmbeddings",
     )
     def _init_weights(self, module: nn.Module):
+        """Initialize model weights.
+        This method ensures that models with randomly-initialized weights get the correct initial value distribution,
+        which can be critical for training stability. We also call this method directly when using meta-device init, as
+        the `to_empty` method does not initialize the weights. While the base Transformers model has a similar method,
+        we need to extend it to handle TE-specific modules.
         Args:
             module (nn.Module): The module to initialize the weights for.
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
         if isinstance(module, transformer_engine.pytorch.LayerNormLinear):
+            if module.layer_norm_bias is not None:
+                module.layer_norm_bias.data.zero_()
             module.layer_norm_weight.data.fill_(1.0)
             if module.layer_norm_bias is not None:
                 module.layer_norm_bias.data.zero_()
+        if isinstance(module, transformer_engine.pytorch.LayerNormMLP):
+            if module.layer_norm_bias is not None:
+                module.layer_norm_bias.data.zero_()
+            module.layer_norm_weight.data.fill_(1.0)
+            if hasattr(module, "fc1_weight") and module.fc1_weight is not None:
+                module.fc1_weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if hasattr(module, "fc2_weight") and module.fc2_weight is not None:
+                module.fc2_weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if hasattr(module, "fc1_bias") and module.fc1_bias is not None and module.fc1_bias.numel() > 0:
+                module.fc1_bias.data.zero_()
+            if hasattr(module, "fc2_bias") and module.fc2_bias is not None and module.fc2_bias.numel() > 0:
+                module.fc2_bias.data.zero_()
+        if isinstance(module, RotaryPositionEmbedding) and hasattr(module, "inv_freq"):
+            # When we initialize the model with `to_empty`, the `inv_freq` attribute is not initialized, so we need to
+            # re-initialize it here with the correct values.
+            module.inv_freq = RotaryPositionEmbedding(
+                self.config.hidden_size // self.config.num_attention_heads
+            ).inv_freq.to(module.inv_freq.device)
     @classmethod
     def get_init_context(cls, is_quantized: bool, _is_ds_init_called: bool):

model-00001-of-00013.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:02007bbf2c42108ab281ff3fea39f6a495181b2390b05e4e469d54e51eb161b2
-size 4616119920

 version https://git-lfs.github.com/spec/v1
+oid sha256:172497ffc270184c31badaf14e0132e234c16112ddd2eee373838faaead6d573
+size 4616120040

model-00013-of-00013.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4ada1f6ab3d781e8848fa39585d06bedaed2372b97c0e80f479648e2ac617d34
-size 3880597101

 version https://git-lfs.github.com/spec/v1
+oid sha256:0956689eebee89d381080f4605597b90e5e33a82ff7a581ac9ae8f6f2f067f0a
+size 3880596984

model.safetensors.index.json CHANGED Viewed

@@ -1,11 +1,11 @@
 {
   "metadata": {
     "total_parameters": 15129257024,
-    "total_size": 60517028357
   },
   "weight_map": {
     "esm.embeddings.word_embeddings.weight": "model-00001-of-00013.safetensors",
-    "esm.encoder.emb_layer_norm_after._extra_state": "model-00013-of-00013.safetensors",
     "esm.encoder.emb_layer_norm_after.bias": "model-00013-of-00013.safetensors",
     "esm.encoder.emb_layer_norm_after.weight": "model-00013-of-00013.safetensors",
     "esm.encoder.layers.0.layernorm_mlp._extra_state": "model-00001-of-00013.safetensors",

 {
   "metadata": {
     "total_parameters": 15129257024,
+    "total_size": 60517028352
   },
   "weight_map": {
     "esm.embeddings.word_embeddings.weight": "model-00001-of-00013.safetensors",
+    "esm.encoder.emb_layer_norm_after._extra_state": "model-00001-of-00013.safetensors",
     "esm.encoder.emb_layer_norm_after.bias": "model-00013-of-00013.safetensors",
     "esm.encoder.emb_layer_norm_after.weight": "model-00013-of-00013.safetensors",
     "esm.encoder.layers.0.layernorm_mlp._extra_state": "model-00001-of-00013.safetensors",