KitsuVp
/

NeoLLM

@@ -4701,38 +4701,11 @@ class SpellingBeeEmbedding(nn.Module):
             persistent=True,
         )
-        # ── Non-persistent buffers (recomputed from fixed formula on load) ─
-        # RoPE cos/sin for intra-token positions 0..MAX_BYTES-1.
-        # Shape [MAX_BYTES, d//2] — applied over the 256-type axis in
-        # _build_rope_bytes, not over the batch/sequence axis.
-        half  = d // 2
-        theta = 1.0 / (base ** (torch.arange(0, half, dtype=torch.float) * 2.0 / d))
-        pos   = torch.arange(self.MAX_BYTES, dtype=torch.float)
-        freqs = torch.outer(pos, theta)                  # [MAX_BYTES, half]
-        self.register_buffer("intra_cos", freqs.cos(), persistent=False)
-        self.register_buffer("intra_sin", freqs.sin(), persistent=False)
-        # Static position index [MAX_BYTES] used as the column index in the
-        # vocab-level gather.  Registered as buffer to avoid dynamic tensor
-        # creation inside forward (which would trigger torch.compile retracing).
-        self.register_buffer(
-            "pos_idx",
-            torch.arange(self.MAX_BYTES, dtype=torch.long),
-            persistent=False,
-        )
         # LayerNorm over character embeddings — mirrors the reference impl
         # (character_norm=True by default in littletrainingloop).  Runs in
         # float32 for stability, applied at vocab level before the batch gather.
         self.char_norm = nn.LayerNorm(d)
-        # ── Hook post-carga ────────────────────────────────────────────────
-        # Los buffers non-persistent (intra_cos, intra_sin, pos_idx) se
-        # calculan desde una fórmula fija y NUNCA deben venir del safetensors.
-        # Si el checkpoint fue guardado con persistent=True (versión anterior),
-        # from_pretrained los sobreescribiría con valores corruptos.
-        # _load_from_state_dict los elimina antes de aplicar el state_dict.
     def _load_from_state_dict(
         self, state_dict, prefix, local_metadata, strict, missing_keys,
         unexpected_keys, error_msgs,
@@ -4800,8 +4773,11 @@ class SpellingBeeEmbedding(nn.Module):
         byte_emb.weight.  All shapes are fully static, so torch.compile can
         fuse this into a single kernel.
-        Called once per forward pass; the result is discarded afterward.
-        The cost is two broadcast elementwise ops + one cat over fixed dims.
         Returns:
             rope_bytes [256, MAX_BYTES, d]
@@ -4810,8 +4786,14 @@ class SpellingBeeEmbedding(nn.Module):
         half = w.shape[-1] // 2
         w1   = w[:, :half].unsqueeze(1)          # [256, 1, half]
         w2   = w[:, half:].unsqueeze(1)          # [256, 1, half]
-        cos  = self.intra_cos.unsqueeze(0)       # [1, MAX_BYTES, half]
-        sin  = self.intra_sin.unsqueeze(0)       # [1, MAX_BYTES, half]
         return torch.cat(
             [w1 * cos - w2 * sin,
              w1 * sin + w2 * cos],
@@ -4843,8 +4825,8 @@ class SpellingBeeEmbedding(nn.Module):
         # vocab token and each position, the RoPE-rotated embedding of that
         # byte at that position.  Result [V, MAX_BYTES, d], then sum → [V, d].
         e_chars_vocab = rope_bytes[
-            self.token_bytes,             # [V, MAX_BYTES] — row index
-            self.pos_idx.unsqueeze(0),    # [1, MAX_BYTES] → broadcast [V, MAX_BYTES]
         ].sum(1)                          # [V, d]
         # ── Step 3: apply precomputed 1/√byte_len per vocab type ────────────
@@ -4894,8 +4876,9 @@ class SpellingBeeEmbedding(nn.Module):
         rope_bytes = self._build_rope_bytes()            # [256, MAX_BYTES, d]
         e_chars_vocab = rope_bytes[
             self.token_bytes,
-            self.pos_idx.unsqueeze(0),
         ].sum(1) * self.inv_sqrt_lens.unsqueeze(-1)      # [V, d]
         return (token_emb_weight + e_chars_vocab) * 0.5

             persistent=True,
         )
         # LayerNorm over character embeddings — mirrors the reference impl
         # (character_norm=True by default in littletrainingloop).  Runs in
         # float32 for stability, applied at vocab level before the batch gather.
         self.char_norm = nn.LayerNorm(d)
     def _load_from_state_dict(
         self, state_dict, prefix, local_metadata, strict, missing_keys,
         unexpected_keys, error_msgs,
         byte_emb.weight.  All shapes are fully static, so torch.compile can
         fuse this into a single kernel.
+        cos/sin are computed inline here instead of using registered buffers.
+        With device_map + accelerate, from_pretrained materializes tensors
+        directly from the safetensors file — non-persistent buffers that are
+        not in the checkpoint are left as uninitialized memory. Computing them
+        inline removes that dependency with no appreciable overhead
+        ([16, d//2] is tiny).
         Returns:
             rope_bytes [256, MAX_BYTES, d]
         half = w.shape[-1] // 2
         w1   = w[:, :half].unsqueeze(1)          # [256, 1, half]
         w2   = w[:, half:].unsqueeze(1)          # [256, 1, half]
+        # Compute RoPE inline — shapes are static, so torch.compile fuses it.
+        theta = 1.0 / (self._rope_base ** (
+            torch.arange(0, half, dtype=torch.float32, device=w.device) * 2.0 / (half * 2)
+        ))
+        pos   = torch.arange(self.MAX_BYTES, dtype=torch.float32, device=w.device)
+        freqs = torch.outer(pos, theta)                   # [MAX_BYTES, half]
+        cos   = freqs.cos().to(w.dtype).unsqueeze(0)     # [1, MAX_BYTES, half]
+        sin   = freqs.sin().to(w.dtype).unsqueeze(0)     # [1, MAX_BYTES, half]
         return torch.cat(
             [w1 * cos - w2 * sin,
              w1 * sin + w2 * cos],
         # vocab token and each position, the RoPE-rotated embedding of that
         # byte at that position.  Result [V, MAX_BYTES, d], then sum → [V, d].
         e_chars_vocab = rope_bytes[
+            self.token_bytes,                                          # [V, MAX_BYTES] — row index
+            torch.arange(self.MAX_BYTES, device=rope_bytes.device),   # [MAX_BYTES] — col index
         ].sum(1)                          # [V, d]
         # ── Step 3: apply precomputed 1/√byte_len per vocab type ────────────
         rope_bytes = self._build_rope_bytes()            # [256, MAX_BYTES, d]
         e_chars_vocab = rope_bytes[
             self.token_bytes,
+            torch.arange(self.MAX_BYTES, device=rope_bytes.device),
         ].sum(1) * self.inv_sqrt_lens.unsqueeze(-1)      # [V, d]
+        e_chars_vocab = self.char_norm(e_chars_vocab.float()).to(token_emb_weight.dtype)
         return (token_emb_weight + e_chars_vocab) * 0.5