KitsuVp
/

NeoLLM

@@ -4727,22 +4727,6 @@ class SpellingBeeEmbedding(nn.Module):
             missing_keys, unexpected_keys, error_msgs,
         )
-    def get_char_embeddings_vocab(self) -> torch.Tensor:
-        """
-        Return e_chars for the whole vocabulary [V, d], without blending it with
-        the token embeddings.  Used by the output path when tie_word_embeddings=False
-        to apply spelling bee in the output projection, following the
-        reference implementation (littletrainingloop, spelling_bee_out).
-        """
-        rope_bytes    = self._build_rope_bytes()                      # [256, MAX_BYTES, d]
-        e_chars_vocab = rope_bytes[
-            self.token_bytes,
-            torch.arange(self.MAX_BYTES, device=rope_bytes.device),
-        ].sum(1) * self.inv_sqrt_lens.unsqueeze(-1)                   # [V, d]
-        return self.char_norm(
-            e_chars_vocab.to(self.char_norm.weight.dtype)
-        ).to(self.byte_emb.weight.dtype)                              # [V, d]
     def set_byte_table(self, tokenizer) -> None:
         """
         Precompute the UTF-8 byte table and inv_sqrt_lens from a tokenizer.
@@ -4852,7 +4836,7 @@ class SpellingBeeEmbedding(nn.Module):
         # ── Step 3b: LayerNorm over character embeddings (float32 for stability)
         # Mirrors reference impl (character_norm=True).  Ensures E[‖e_chars‖²]
         # matches E[‖e_tok‖²] regardless of token byte-length distribution.
-        e_chars_vocab = self.char_norm(e_chars_vocab.to(self.char_norm.weight.dtype)).to(token_embeds.dtype)
         # ── Step 4: gather only the tokens present in this batch ────────────
         # This is the only B×S operation — a single embedding lookup.
@@ -4894,7 +4878,7 @@ class SpellingBeeEmbedding(nn.Module):
             self.token_bytes,
             torch.arange(self.MAX_BYTES, device=rope_bytes.device),
         ].sum(1) * self.inv_sqrt_lens.unsqueeze(-1)      # [V, d]
-        e_chars_vocab = self.char_norm(e_chars_vocab.to(self.char_norm.weight.dtype)).to(token_emb_weight.dtype)
         return (token_emb_weight + e_chars_vocab) * 0.5
@@ -5452,7 +5436,7 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
         self.vocab_size = config.vocab_size
         self.lm_head    = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        if config.use_token_generator or not config.tie_word_embeddings:
             self._tied_weights_keys = {}
         # ── Analysis infrastructure ───────────────────────────────────────
@@ -5519,30 +5503,6 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
     # ── Standard model API ────────────────────────────────────────────────
-    def _get_lm_head_weight(self) -> torch.Tensor:
-        """
-        Return the effective lm_head weight.
-        - tie_word_embeddings=True:  lm_head.weight == embed_tokens.weight
-          (standard tying, spelling bee on the input side only).
-        - tie_word_embeddings=False + use_spelling_bee_embeddings=True:
-          blends lm_head.weight with e_chars for the full vocabulary,
-          following the paper's reference implementation
-          (littletrainingloop, spelling_bee_out, separate_token_embedding=False).
-          The shared byte_emb receives gradient from both paths.
-        - tie_word_embeddings=False without spelling bee:
-          returns lm_head.weight directly.
-        """
-        weights = self.lm_head.weight                                  # [V, d]
-        if (not self.config.tie_word_embeddings
-                and getattr(self.config, "use_spelling_bee_embeddings", False)
-                and self.model.spelling_bee is not None):
-            e_chars = self.model.spelling_bee.get_char_embeddings_vocab()  # [V, d]
-            weights = (weights + e_chars.to(weights.dtype)) * 0.5
-        return weights
     def get_input_embeddings(self):
         """Return whatever ``self.model.get_input_embeddings()`` returns."""
         # Pure delegation: the wrapped model's result is passed through unchanged.
         inner_model = self.model
         return inner_model.get_input_embeddings()
@@ -5622,7 +5582,7 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
         loss = None
         if labels is not None:
             loss = compute_cce_loss(
-                hidden_states, labels, self._get_lm_head_weight(),
                 getattr(self.lm_head, "bias", None), self.config.pad_token_id,
             )
             # Add JTok-M load-balancing auxiliary loss
@@ -5657,9 +5617,7 @@ class NeoLLMForCausalLM(NeoLLMPreTrainedModel, GenerationMixin):
                 slice(-logits_to_keep, None)
                 if isinstance(logits_to_keep, int) else logits_to_keep
             )
-            logits = torch.nn.functional.linear(
-                hidden_states[:, slice_indices, :], self._get_lm_head_weight()
-            )
         # ── Finalise and store analysis state ─────────────────────────────
         if analysis_state is not None:

             missing_keys, unexpected_keys, error_msgs,
         )
     def set_byte_table(self, tokenizer) -> None:
         """
         Precompute the UTF-8 byte table and inv_sqrt_lens from a tokenizer.
         # ── Step 3b: LayerNorm over character embeddings (float32 for stability)
         # Mirrors reference impl (character_norm=True).  Ensures E[‖e_chars‖²]
         # matches E[‖e_tok‖²] regardless of token byte-length distribution.
+        e_chars_vocab = self.char_norm(e_chars_vocab.float()).to(token_embeds.dtype)
         # ── Step 4: gather only the tokens present in this batch ────────────
         # This is the only B×S operation — a single embedding lookup.
             self.token_bytes,
             torch.arange(self.MAX_BYTES, device=rope_bytes.device),
         ].sum(1) * self.inv_sqrt_lens.unsqueeze(-1)      # [V, d]
+        e_chars_vocab = self.char_norm(e_chars_vocab.float()).to(token_emb_weight.dtype)
         return (token_emb_weight + e_chars_vocab) * 0.5
         self.vocab_size = config.vocab_size
         self.lm_head    = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        if config.use_token_generator:
             self._tied_weights_keys = {}
         # ── Analysis infrastructure ───────────────────────────────────────
     # ── Standard model API ────────────────────────────────────────────────
     def get_input_embeddings(self):
         """Return whatever ``self.model.get_input_embeddings()`` returns."""
         # Pure delegation: the wrapped model's result is passed through unchanged.
         inner_model = self.model
         return inner_model.get_input_embeddings()
         loss = None
         if labels is not None:
             loss = compute_cce_loss(
+                hidden_states, labels, self.lm_head.weight,
                 getattr(self.lm_head, "bias", None), self.config.pad_token_id,
             )
             # Add JTok-M load-balancing auxiliary loss
                 slice(-logits_to_keep, None)
                 if isinstance(logits_to_keep, int) else logits_to_keep
             )
+            logits = self.lm_head(hidden_states[:, slice_indices, :])
         # ── Finalise and store analysis state ─────────────────────────────
         if analysis_state is not None: