Upload modeling.py
modeling.py  (+17 -1)

@@ -180,6 +180,7 @@ class MoEGPTForCausalLM(PreTrainedModel):
         assert config.sequence_length is not None
         self.config = config
         self.tokenizer = tiktoken.get_encoding("gpt2")
+        self.base_model_prefix = "timoe"

         self.transformer = nn.ModuleDict(
             dict(
@@ -260,7 +261,7 @@ class MoEGPTForCausalLM(PreTrainedModel):
         elif isinstance(module, nn.Embedding):
             torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

-    def forward(self, idx, date=None, targets=None, get_logits=True, moe=False):
+    def forward(self, idx, date=None, targets=None, attention_mask=None, get_logits=True, moe=False):
         device = idx.device
         b, t = idx.size()
         assert (
@@ -463,3 +464,18 @@ class MoEGPTForCausalLM(PreTrainedModel):
             .numpy()
         )
         return self.tokenizer.decode(out_idx).split(in_str)[-1]
+
+
+    def get_input_embeddings(self):
+        return self.transformer.wte
+
+    def set_input_embeddings(self, new_embeddings):
+        self.transformer.wte = new_embeddings
+        # reset the lm_head to use the new embeddings
+        # this is necessary because the lm_head is tied to the input embeddings
+        self.lm_head = nn.Linear(
+            self.config.n_embd, new_embeddings.weight.shape[0], bias=False
+        )
+        # self.transformer.wte.weight = (
+        #     self.lm_head.weight
+        # )
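Note on the new embedding accessors: get_input_embeddings / set_input_embeddings are the hooks Hugging Face's PreTrainedModel machinery (e.g. resize_token_embeddings) uses to swap the token embedding table, and because this model ties lm_head to the embeddings, the head must be rebuilt to match the new vocabulary size. The diff leaves the explicit re-tying of transformer.wte.weight to lm_head.weight commented out, so the rebuilt head starts untied. Below is a minimal, self-contained PyTorch sketch of that tie/rebuild behaviour; it is not the repo's code, and the names (wte, lm_head) and the dimensions 8/10/12 are purely illustrative.

import torch
import torch.nn as nn

n_embd, old_vocab, new_vocab = 8, 10, 12

# original tied setup: embedding table and output head share one (vocab, n_embd) matrix
wte = nn.Embedding(old_vocab, n_embd)
lm_head = nn.Linear(n_embd, old_vocab, bias=False)
lm_head.weight = wte.weight  # weight tying, as in GPT-style models

# simulate set_input_embeddings(new_embeddings): swap the embedding table ...
new_wte = nn.Embedding(new_vocab, n_embd)
# ... and rebuild lm_head so its output dimension follows the new vocab size,
# mirroring the change in the diff above; re-tying would require assigning
# new_wte.weight to lm_head.weight afterwards.
lm_head = nn.Linear(n_embd, new_wte.weight.shape[0], bias=False)

idx = torch.randint(0, new_vocab, (1, 4))
logits = lm_head(new_wte(idx))
print(logits.shape)  # torch.Size([1, 4, 12])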