KitsuVp
/

NeoLLM

@@ -4836,7 +4836,7 @@ class SpellingBeeEmbedding(nn.Module):
         # ── Step 3b: LayerNorm over character embeddings (float32 for stability)
         # Mirrors reference impl (character_norm=True).  Ensures E[‖e_chars‖²]
         # matches E[‖e_tok‖²] regardless of token byte-length distribution.
-        e_chars_vocab = self.char_norm(e_chars_vocab.float()).to(token_embeds.dtype)
         # ── Step 4: gather only the tokens present in this batch ────────────
         # This is the only B×S operation — a single embedding lookup.
@@ -4878,7 +4878,7 @@ class SpellingBeeEmbedding(nn.Module):
             self.token_bytes,
             torch.arange(self.MAX_BYTES, device=rope_bytes.device),
         ].sum(1) * self.inv_sqrt_lens.unsqueeze(-1)      # [V, d]
-        e_chars_vocab = self.char_norm(e_chars_vocab.float()).to(token_emb_weight.dtype)
         return (token_emb_weight + e_chars_vocab) * 0.5

         # ── Step 3b: LayerNorm over character embeddings (run in char_norm's parameter dtype)
         # Mirrors reference impl (character_norm=True).  Ensures E[‖e_chars‖²]
         # matches E[‖e_tok‖²] regardless of token byte-length distribution.
+        e_chars_vocab = self.char_norm(e_chars_vocab.to(self.char_norm.weight.dtype)).to(token_embeds.dtype)
         # ── Step 4: gather only the tokens present in this batch ────────────
         # This is the only B×S operation — a single embedding lookup.
             self.token_bytes,
             torch.arange(self.MAX_BYTES, device=rope_bytes.device),
         ].sum(1) * self.inv_sqrt_lens.unsqueeze(-1)      # [V, d]
+        e_chars_vocab = self.char_norm(e_chars_vocab.to(self.char_norm.weight.dtype)).to(token_emb_weight.dtype)
         return (token_emb_weight + e_chars_vocab) * 0.5