KitsuVp
/

NeoLLM

@@ -1200,6 +1200,7 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
     Note on LM head: Following "Learnable Multipliers" paper recommendations,
     the output projection (lm_head) does NOT include learnable multipliers.
     """
     def __init__(self, config):
         super().__init__(config)
@@ -1210,7 +1211,6 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.post_init()
-        _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
     def forward(
         self,

     Note on LM head: Following "Learnable Multipliers" paper recommendations,
     the output projection (lm_head) does NOT include learnable multipliers.
     """
+    _tied_weights_keys = {"lm_head.weight": "model.embed_tokens.weight"}
     def __init__(self, config):
         """Run the parent constructor, then create the output projection.

         Args:
             config: Model configuration object. ``hidden_size`` and
                 ``vocab_size`` are read from it to size ``lm_head``.
         """
         super().__init__(config)
         # Bias-free linear map from hidden_size features to vocab_size outputs.
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         # NOTE(review): post_init() is inherited; presumably it performs the
         # base class's final weight initialization/tying -- confirm against
         # NeoLLMPreTrainedModel.
         self.post_init()
     def forward(
         self,