Upload model.py with huggingface_hub
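The commit message matches the default one generated by the huggingface_hub upload helpers. For context, a minimal, hypothetical sketch of how such an upload is typically performed (the repo id and local path are placeholders, not taken from this commit):

# Hypothetical sketch of the upload step implied by the commit title;
# repo id and local path are placeholders, not taken from this commit.
from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="model.py",               # local file to push
    path_in_repo="model.py",                  # destination path inside the repo
    repo_id="your-username/custom-gemma",     # placeholder repo id
    repo_type="model",
    commit_message="Upload model.py with huggingface_hub",
)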
model.py CHANGED
@@ -310,9 +310,9 @@ class GemmaForCausalLM(PreTrainedModel):
     AutoModelForCausalLM and the transformers ecosystem.
     """
     config_class = GemmaConfig
-    base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["GemmaDecoderLayer"]
+    _supports_param_buffer_assignment = False  # Fix for accelerate weight loading
 
     def __init__(self, config: GemmaConfig):
         super().__init__(config)
@@ -326,6 +326,55 @@ class GemmaForCausalLM(PreTrainedModel):
 
         self.post_init()
 
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+        """
+        Custom from_pretrained that properly loads weights for this custom model.
+        This overrides the default behavior to ensure weights are loaded correctly.
+        """
+        import os
+        from huggingface_hub import hf_hub_download
+
+        # Get config
+        trust_remote_code = kwargs.pop("trust_remote_code", True)
+        torch_dtype = kwargs.pop("torch_dtype", None)
+        device_map = kwargs.pop("device_map", None)
+
+        # Load config
+        config = cls.config_class.from_pretrained(
+            pretrained_model_name_or_path,
+            trust_remote_code=trust_remote_code,
+            **kwargs
+        )
+
+        # Create model
+        model = cls(config)
+
+        # Find weight file
+        if os.path.isdir(pretrained_model_name_or_path):
+            weight_file = os.path.join(pretrained_model_name_or_path, "pytorch_model.bin")
+        else:
+            # Download from hub
+            weight_file = hf_hub_download(
+                repo_id=pretrained_model_name_or_path,
+                filename="pytorch_model.bin"
+            )
+
+        # Load weights
+        state_dict = torch.load(weight_file, map_location="cpu")
+        model.load_state_dict(state_dict, strict=False)
+
+        # Handle dtype and device
+        if torch_dtype is not None:
+            model = model.to(torch_dtype)
+        if device_map == "auto":
+            if torch.cuda.is_available():
+                model = model.to("cuda")
+        elif device_map is not None:
+            model = model.to(device_map)
+
+        return model
+
     def get_input_embeddings(self):
         return self.model.embed_tokens
 
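For context, a minimal usage sketch (not part of this commit): assuming the repository's config.json registers this class via auto_map, AutoModelForCausalLM.from_pretrained with trust_remote_code=True resolves to the remote GemmaForCausalLM and therefore to the custom from_pretrained added above. The repo id below is a placeholder.

# Minimal usage sketch; the repo id is a placeholder, not taken from this commit.
# trust_remote_code=True makes transformers import model.py from the Hub,
# assuming config.json maps the architecture to it via auto_map.
import torch
from transformers import AutoModelForCausalLM

repo_id = "your-username/custom-gemma"  # hypothetical repo id

model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,   # forwarded to the custom from_pretrained
    device_map="auto",
)

# Sanity check that weights were actually loaded and placed on a device.
print(sum(p.numel() for p in model.parameters()), "parameters on",
      next(model.parameters()).device)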