Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

config.json +4 -1
esm_nv.py +30 -8
model.safetensors +2 -2
tokenizer_config.json +4 -46

config.json CHANGED Viewed

@@ -1,4 +1,5 @@
 {
   "architectures": [
     "NVEsmForMaskedLM"
   ],
@@ -22,6 +23,7 @@
   "hidden_size": 1280,
   "initializer_range": 0.02,
   "intermediate_size": 5120,
   "is_folding_model": false,
   "layer_norm_eps": 1e-05,
   "mask_token_id": 32,
@@ -35,8 +37,9 @@
   "padded_vocab_size": 64,
   "position_embedding_type": "rotary",
   "qkv_weight_interleaved": true,
   "token_dropout": true,
-  "transformers_version": "4.57.6",
   "use_cache": true,
   "vocab_list": null,
   "vocab_size": 33

 {
+  "add_cross_attention": false,
   "architectures": [
     "NVEsmForMaskedLM"
   ],
   "hidden_size": 1280,
   "initializer_range": 0.02,
   "intermediate_size": 5120,
+  "is_decoder": false,
   "is_folding_model": false,
   "layer_norm_eps": 1e-05,
   "mask_token_id": 32,
   "padded_vocab_size": 64,
   "position_embedding_type": "rotary",
   "qkv_weight_interleaved": true,
+  "tie_word_embeddings": true,
   "token_dropout": true,
+  "transformers_version": "5.0.0",
   "use_cache": true,
   "vocab_list": null,
   "vocab_size": 33

esm_nv.py CHANGED Viewed

@@ -22,7 +22,7 @@
 Adapted from `modeling_esm.py` in huggingface/transformers.
 """
-from typing import Literal, Optional, Unpack
 # TODO: put import guard around transformer_engine here, with an informative error message around
 # installation and the nvidia docker container.
@@ -256,10 +256,34 @@ class NVEsmPreTrainedModel(EsmPreTrainedModel):
         # Meta-device init seems to break weight tying, so we re-tie the weights here.
         self.tie_weights()
-    @classmethod
-    def get_init_context(cls, is_quantized: bool, _is_ds_init_called: bool):
-        """Override the default get_init_context method to allow for fp8 model initialization."""
-        return []
 class NVEsmModel(NVEsmPreTrainedModel):
@@ -367,7 +391,7 @@ class NVEsmModel(NVEsmPreTrainedModel):
 class NVEsmForMaskedLM(NVEsmPreTrainedModel):
     """NVEsmForMaskedLM is a TransformerEngine-optimized ESM model for masked language modeling."""
-    _tied_weights_keys = ("lm_head.decoder.weight",)
     def __init__(self, config: NVEsmConfig):
         """Initialize a NVEsmForMaskedLM.
@@ -386,7 +410,6 @@ class NVEsmForMaskedLM(NVEsmPreTrainedModel):
         self.esm = NVEsmModel(config, add_pooling_layer=False)
         self.lm_head = NVEsmLMHead(config)
-        self.init_weights()
         self.post_init()
     def get_output_embeddings(self):
@@ -614,7 +637,6 @@ class NVEsmForTokenClassification(NVEsmPreTrainedModel):
             init_method=lambda x: torch.nn.init.normal_(x, mean=0.0, std=config.initializer_range),
         )
-        self.init_weights()
         self.post_init()
     def forward(

 Adapted from `modeling_esm.py` in huggingface/transformers.
 """
+from typing import ClassVar, Literal, Optional, Unpack
 # TODO: put import guard around transformer_engine here, with an informative error message around
 # installation and the nvidia docker container.
         # Meta-device init seems to break weight tying, so we re-tie the weights here.
         self.tie_weights()
+    def _init_weights(self, module):
+        """Initialize module weights.
+        We only use this method for standard PyTorch modules; TE modules handle their own weight initialization through
+        `init_method` parameters and the `reset_parameters` method.
+        """
+        if module.__module__.startswith("transformer_engine.pytorch"):
+            # Notably, we need to avoid calling the parent method for TE modules, since the default _init_weights will
+            # assume any class with `LayerNorm` in the name should have weights initialized to 1.0, which breaks
+            # `LayerNormLinear` and `LayerNormMLP` modules that use `weight` for the linear layer and
+            # `layer_norm_weight` for the layer norm. Instead, we call `reset_parameters` if the module has it and the
+            # weights are not in fp8. We still need to figure out why this raises an error if we're using
+            # `quantized_model_init`.
+            if hasattr(module, "reset_parameters") and not getattr(module, "primary_weights_in_fp8", False):
+                module.reset_parameters()
+            return
+        super()._init_weights(module)
+    def state_dict(self, *args, **kwargs):
+        """Override state_dict to filter out TransformerEngine's _extra_state keys.
+        TransformerEngine layers add _extra_state attributes that are not compatible with HuggingFace v5 model loading.
+        These are filtered out to ensure checkpoints can be loaded with from_pretrained().
+        """
+        state_dict = super().state_dict(*args, **kwargs)
+        # Filter out _extra_state keys which are TransformerEngine-specific and not loadable
+        return {k: v for k, v in state_dict.items() if not k.endswith("_extra_state")}
 class NVEsmModel(NVEsmPreTrainedModel):
 class NVEsmForMaskedLM(NVEsmPreTrainedModel):
     """NVEsmForMaskedLM is a TransformerEngine-optimized ESM model for masked language modeling."""
+    _tied_weights_keys: ClassVar[dict[str, str]] = {"lm_head.decoder.weight": "esm.embeddings.word_embeddings.weight"}
     def __init__(self, config: NVEsmConfig):
         """Initialize a NVEsmForMaskedLM.
         self.esm = NVEsmModel(config, add_pooling_layer=False)
         self.lm_head = NVEsmLMHead(config)
         self.post_init()
     def get_output_embeddings(self):
             init_method=lambda x: torch.nn.init.normal_(x, mean=0.0, std=config.initializer_range),
         )
         self.post_init()
     def forward(

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6a3cbb14dad1e96e8abd94b0cbc5bdebf600e4aab21d90af9954feea4d2da881
-size 2604396920

 version https://git-lfs.github.com/spec/v1
+oid sha256:504f43d8ef30f80d1c8603b9839602a9b298a0819b0d569cdb3e9f77f03c735a
+size 2604379992

tokenizer_config.json CHANGED Viewed

@@ -1,60 +1,18 @@
 {
-  "add_bos_token": true,
-  "add_eos_token": true,
-  "added_tokens_decoder": {
-    "0": {
-      "content": "<cls>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "<pad>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "2": {
-      "content": "<eos>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "3": {
-      "content": "<unk>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "32": {
-      "content": "<mask>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
   "bos_token": "<cls>",
   "clean_up_tokenization_spaces": false,
   "cls_token": "<cls>",
   "eos_token": "<eos>",
-  "extra_special_tokens": {},
   "mask_token": "<mask>",
   "model_input_names": [
     "input_ids",
     "attention_mask"
   ],
   "model_max_length": 1000000000000000019884624838656,
   "pad_token": "<pad>",
-  "tokenizer_class": "PreTrainedTokenizerFast",
   "unk_token": "<unk>"
 }

 {
+  "backend": "tokenizers",
   "bos_token": "<cls>",
   "clean_up_tokenization_spaces": false,
   "cls_token": "<cls>",
   "eos_token": "<eos>",
+  "is_local": true,
   "mask_token": "<mask>",
   "model_input_names": [
     "input_ids",
     "attention_mask"
   ],
   "model_max_length": 1000000000000000019884624838656,
+  "model_specific_special_tokens": {},
   "pad_token": "<pad>",
+  "tokenizer_class": "TokenizersBackend",
   "unk_token": "<unk>"
 }