dill-dev
/

Momo-336M-sft

@@ -10,10 +10,6 @@ from transformers.modeling_outputs import CausalLMOutputWithPast
 from .configuration_momo import MomoConfig
-# ════════════════════════════════════════════════════════════════
-#  COMPONENTS
-# ════════════════════════════════════════════════════════════════
 class RMSNorm(nn.Module):
     def __init__(self, dim, eps=1e-5):
         super().__init__()
@@ -57,10 +53,6 @@ def apply_rope(q, k, cos, sin):
     return (q * cos) + (rot_half(q) * sin), (k * cos) + (rot_half(k) * sin)
-# ════════════════════════════════════════════════════════════════
-#  ATTENTION — Grouped Query Attention (GQA)
-# ════════════════════════════════════════════════════════════════
 class MomoAttention(nn.Module):
     def __init__(self, cfg: MomoConfig):
         super().__init__()
@@ -112,10 +104,6 @@ class MomoAttention(nn.Module):
         return self.o(out), pres
-# ════════════════════════════════════════════════════════════════
-#  FEED-FORWARD — SwiGLU
-# ════════════════════════════════════════════════════════════════
 class MomoFFN(nn.Module):
     def __init__(self, cfg: MomoConfig):
         super().__init__()
@@ -127,10 +115,6 @@ class MomoFFN(nn.Module):
         return self.down(F.silu(self.gate(x)) * self.up(x))
-# ════════════════════════════════════════════════════════════════
-#  TRANSFORMER BLOCK
-# ════════════════════════════════════════════════════════════════
 class MomoBlock(nn.Module):
     def __init__(self, cfg: MomoConfig):
         super().__init__()
@@ -146,50 +130,37 @@ class MomoBlock(nn.Module):
         return x, p
-# ════════════════════════════════════════════════════════════════
-#  🌸 MOMO FOR CAUSAL LM
-# ════════════════════════════════════════════════════════════════
 class MomoForCausalLM(PreTrainedModel):
     config_class = MomoConfig
     _no_split_modules = ["MomoBlock"]
     _tied_weights_keys = ["lm_head.weight"]
     def __init__(self, cfg: MomoConfig):
         super().__init__(cfg)
-        self.embed    = nn.Embedding(cfg.vocab_size, cfg.hidden_size)
-        self.layers   = nn.ModuleList([MomoBlock(cfg) for _ in range(cfg.num_hidden_layers)])
-        self.norm     = RMSNorm(cfg.hidden_size, cfg.rms_norm_eps)
-        # lm_head weight is tied to embed — do NOT pre-tie here,
-        # HF will call tie_weights() after loading the state dict
-        self.lm_head  = nn.Linear(cfg.hidden_size, cfg.vocab_size, bias=False)
         self.grad_ckpt = cfg.use_gradient_checkpointing
         self.apply(self._init_weights)
-    # ── Required by HF 4.40+ ────────────────────────────────────
-    @property
-    def all_tied_weights_keys(self):
-        # Must return a dict: {weight_to_tie: source_weight}
-        return {"lm_head.weight": "embed.weight"}
-    def tie_weights(self, missing_keys=None, recompute_mapping=False, **kwargs):
-        self.lm_head.weight = self.embed.weight
-    # ── Embedding accessors (needed by HF tie_weights logic) ─────
     def get_input_embeddings(self):
         return self.embed
     def set_input_embeddings(self, value):
         self.embed = value
-        self.tie_weights()
     def get_output_embeddings(self):
         return self.lm_head
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
-    # ── Weight init ──────────────────────────────────────────────
     def _init_weights(self, m):
         if isinstance(m, nn.Linear):
             nn.init.normal_(m.weight, std=0.02)
@@ -198,7 +169,6 @@ class MomoForCausalLM(PreTrainedModel):
         elif isinstance(m, nn.Embedding):
             nn.init.normal_(m.weight, std=0.02)
-    # ── Forward ──────────────────────────────────────────────────
     def forward(
         self,
         input_ids=None,
@@ -216,7 +186,7 @@ class MomoForCausalLM(PreTrainedModel):
             if self.grad_ckpt and self.training:
                 def _fn(layer):
                     def fn(x):
-                        out, _ = layer(x, mask=attention_mask, use_cache=False)
                         return out
                     return fn
                 x = torch.utils.checkpoint.checkpoint(
@@ -244,7 +214,6 @@ class MomoForCausalLM(PreTrainedModel):
             past_key_values=cache if use_cache else None,
         )
-    # ── Generate ─────────────────────────────────────────────────
     @torch.no_grad()
     def generate(
         self,
@@ -268,7 +237,6 @@ class MomoForCausalLM(PreTrainedModel):
             past   = out.past_key_values
             logits = out.logits[:, -1, :].float()
-            # Repetition penalty
             if rep_penalty != 1.0:
                 for tok in set(gen[0].tolist()):
                     if logits[0, tok] > 0:

 from .configuration_momo import MomoConfig
 class RMSNorm(nn.Module):
     def __init__(self, dim, eps=1e-5):
         super().__init__()
     return (q * cos) + (rot_half(q) * sin), (k * cos) + (rot_half(k) * sin)
 class MomoAttention(nn.Module):
     def __init__(self, cfg: MomoConfig):
         super().__init__()
         return self.o(out), pres
 class MomoFFN(nn.Module):
     def __init__(self, cfg: MomoConfig):
         super().__init__()
         return self.down(F.silu(self.gate(x)) * self.up(x))
 class MomoBlock(nn.Module):
     def __init__(self, cfg: MomoConfig):
         super().__init__()
         return x, p
 class MomoForCausalLM(PreTrainedModel):
     config_class = MomoConfig
     _no_split_modules = ["MomoBlock"]
     _tied_weights_keys = ["lm_head.weight"]
+    # HF 4.40+ calls model.all_tied_weights_keys.keys() — must be a dict on the instance
+    all_tied_weights_keys = {"lm_head.weight": "embed.weight"}
     def __init__(self, cfg: MomoConfig):
         super().__init__(cfg)
+        self.embed   = nn.Embedding(cfg.vocab_size, cfg.hidden_size)
+        self.layers  = nn.ModuleList([MomoBlock(cfg) for _ in range(cfg.num_hidden_layers)])
+        self.norm    = RMSNorm(cfg.hidden_size, cfg.rms_norm_eps)
+        self.lm_head = nn.Linear(cfg.hidden_size, cfg.vocab_size, bias=False)
+        # Tie weights now — HF post-load also calls get_output_embeddings to re-tie
+        self.lm_head.weight = self.embed.weight
         self.grad_ckpt = cfg.use_gradient_checkpointing
         self.apply(self._init_weights)
+    # HF calls these to re-tie after loading — must be defined
     def get_input_embeddings(self):
         return self.embed
     def set_input_embeddings(self, value):
         self.embed = value
     def get_output_embeddings(self):
         return self.lm_head
+    def set_output_embeddings(self, value):
+        self.lm_head = value
     def _init_weights(self, m):
         if isinstance(m, nn.Linear):
             nn.init.normal_(m.weight, std=0.02)
         elif isinstance(m, nn.Embedding):
             nn.init.normal_(m.weight, std=0.02)
     def forward(
         self,
         input_ids=None,
             if self.grad_ckpt and self.training:
                 def _fn(layer):
                     def fn(x):
+                        out, _ = layer(x, mask=None, use_cache=False)
                         return out
                     return fn
                 x = torch.utils.checkpoint.checkpoint(
             past_key_values=cache if use_cache else None,
         )
     @torch.no_grad()
     def generate(
         self,
             past   = out.past_key_values
             logits = out.logits[:, -1, :].float()
             if rep_penalty != 1.0:
                 for tok in set(gen[0].tolist()):
                     if logits[0, tok] > 0: