KitsuVp
/

NeoLLM

@@ -1954,8 +1954,8 @@ def _apply_repo_rope(
     n_groups   = H // H_kv
     rotary_dim = inv_freq.shape[0] * 2   # inv_freq covers half the rotary dim
-    # inv_freq is already float32 on the correct device (registered as buffer
-    # via set_repo_inv_freq) — no .to() needed, no DeviceCopy op.
     # No autocast barrier: explicit .float() casts on z_q/z_k are sufficient
     # to maintain float32 precision for the trig ops. Removing the context
     # manager lets Inductor plan all intermediate tensors as part of a single
@@ -2225,37 +2225,9 @@ class NeoLLMAttention(nn.Module):
                 d_p=_d_p,
                 num_heads=config.num_attention_heads,
             )
-            # _repo_inv_freq is registered as a non-persistent buffer by
-            # set_repo_inv_freq(), called from NeoLLMModel.__init__ after
-            # rotary_emb is built. Declaring it here would conflict.
-            self._repo_attn_scaling: float = 1.0
         else:
             self.repo_module = None
-    def set_repo_inv_freq(
-        self,
-        inv_freq: torch.Tensor,
-        attention_scaling: float,
-    ) -> None:
-        """
-        Inject the rotary frequency vector from NeoLLMRotaryEmbedding so that
-        REPO can build cos/sin inline from continuous positions.
-        Called once by NeoLLMModel.__init__ after rotary_emb is constructed.
-        Only has effect when use_repo=True for this layer.
-        Args:
-            inv_freq:          [rotary_dim/2] — frozen inv_freq buffer from
-                               NeoLLMRotaryEmbedding.
-            attention_scaling: float — attention_scaling from the same module.
-        """
-        if self.use_repo:
-            # Register as non-persistent buffer so .to(device) / .cuda() moves
-            # it automatically — eliminates the DeviceCopy op that splits the
-            # CUDAGraph into 2 partitions when _apply_repo_rope runs.
-            self.register_buffer("_repo_inv_freq", inv_freq.float(), persistent=False)
-            self._repo_attn_scaling = attention_scaling
     def _apply_momentum_attention(
         self,
         q: torch.Tensor,
@@ -2391,6 +2363,7 @@ class NeoLLMAttention(nn.Module):
         attention_mask: Optional[torch.Tensor] = None,
         first_layer_fan: Optional[torch.Tensor] = None,
         attn_analysis: Optional[AttentionAnalysis] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
@@ -2428,25 +2401,14 @@ class NeoLLMAttention(nn.Module):
             # REPO path: f_ϕ predicts continuous per-head positions from the
             # residual stream, then cos/sin are built inline from those positions
             # so the rotation is differentiable w.r.t. REPOModule parameters.
             # (Li et al., 2026, §3.2 — Eq. 6–7)
             repo_a = attn_analysis.repo if attn_analysis is not None else None
             z = self.repo_module(hidden_states, repo_analysis=repo_a)  # [B, H, S]
-            # Meta-device guard: _repo_inv_freq inherited the meta device from
-            # rotary_emb.inv_freq if the model was loaded with from_pretrained.
-            # It is materialized only once; subsequent forward passes take the
-            # normal path with no additional overhead.
-            if self._repo_inv_freq.device.type == "meta":
-                inv_freq_data, _ = NeoLLMRotaryEmbedding.compute_default_rope_parameters(
-                    self.config, device=hidden_states.device
-                )
-                self.register_buffer("_repo_inv_freq", inv_freq_data.float(), persistent=False)
-            q, k = _apply_repo_rope(
-                q, k, z,
-                self._repo_inv_freq,
-                self._repo_attn_scaling,
-            )
         else:
             # Standard path: integer positions pre-computed by NeoLLMModel.
             q, k = apply_rotary_pos_emb(q, k, cos, sin)
@@ -3213,6 +3175,7 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
         attn_res_partial: Optional[torch.Tensor] = None,
         layer_analysis: Optional[LayerAnalysis] = None,
         output_attentions: Optional[bool] = False,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> Tuple:
         # ── Snapshot input ────────────────────────────────────────────────
@@ -3250,6 +3213,7 @@ class NeoLLMDecoderLayer(GradientCheckpointingLayer):
             position_embeddings=position_embeddings,
             first_layer_fan=first_layer_fan,
             attn_analysis=layer_analysis.attention if layer_analysis is not None else None,
             **kwargs,
         )
@@ -3775,17 +3739,6 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         self.post_init()
-        # ── REPO: inject inv_freq into every attention layer that uses it ─────
-        # Done after post_init so rotary_emb.inv_freq is already initialized.
-        # Layers below repo_start_layer never call set_repo_inv_freq (their
-        # use_repo flag is False) so the call is harmless for those layers.
-        if getattr(config, "use_repo", False):
-            for layer in self.layers:
-                layer.self_attn.set_repo_inv_freq(
-                    self.rotary_emb.inv_freq,
-                    self.rotary_emb.attention_scaling,
-                )
     def get_input_embeddings(self):
         if self.config.use_token_generator:
             return self.token_generator
@@ -3919,6 +3872,17 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
         position_embeddings = self.rotary_emb(hidden_states, position_ids)
         self.first_layer_fan = None
         # ── Attention Residuals state ──────────────────────────────────────
         # Full AttnRes (attn_res_num_blocks=0): sources grows by one entry per
         # decoder layer — all previous outputs are kept, max N=num_layers+1.
@@ -3979,6 +3943,7 @@ class NeoLLMModel(NeoLLMPreTrainedModel):
                 attn_res_partial=attn_res_partial if use_attn_res else None,
                 layer_analysis=layer_analysis,
                 output_attentions=output_attentions,
                 **kwargs,
             )
             hidden_states = layer_outputs[0]

     n_groups   = H // H_kv
     rotary_dim = inv_freq.shape[0] * 2   # inv_freq covers half the rotary dim
+    # inv_freq arrives from rotary_emb at forward time via repo_rope_args —
+    # already float32 on the correct device, no .to() needed, no DeviceCopy op.
     # No autocast barrier: explicit .float() casts on z_q/z_k are sufficient
     # to maintain float32 precision for the trig ops. Removing the context
     # manager lets Inductor plan all intermediate tensors as part of a single
                 d_p=_d_p,
                 num_heads=config.num_attention_heads,
             )
         else:
             self.repo_module = None
     def _apply_momentum_attention(
         self,
         q: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         first_layer_fan: Optional[torch.Tensor] = None,
         attn_analysis: Optional[AttentionAnalysis] = None,
+        repo_rope_args: Optional[Tuple[torch.Tensor, float]] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
         input_shape = hidden_states.shape[:-1]
             # REPO path: f_ϕ predicts continuous per-head positions from the
             # residual stream, then cos/sin are built inline from those positions
             # so the rotation is differentiable w.r.t. REPOModule parameters.
+            # inv_freq and attention_scaling arrive via repo_rope_args, sourced
+            # directly from rotary_emb at forward time — no buffer on this module,
+            # no meta-tensor issue on lm_eval / to(device) paths.
             # (Li et al., 2026, §3.2 — Eq. 6–7)
             repo_a = attn_analysis.repo if attn_analysis is not None else None
             z = self.repo_module(hidden_states, repo_analysis=repo_a)  # [B, H, S]
+            inv_freq, attn_scaling = repo_rope_args
+            q, k = _apply_repo_rope(q, k, z, inv_freq, attn_scaling)
         else:
             # Standard path: integer positions pre-computed by NeoLLMModel.
             q, k = apply_rotary_pos_emb(q, k, cos, sin)
         attn_res_partial: Optional[torch.Tensor] = None,
         layer_analysis: Optional[LayerAnalysis] = None,
         output_attentions: Optional[bool] = False,
+        repo_rope_args: Optional[Tuple[torch.Tensor, float]] = None,
         **kwargs: Unpack[FlashAttentionKwargs],
     ) -> Tuple:
         # ── Snapshot input ────────────────────────────────────────────────
             position_embeddings=position_embeddings,
             first_layer_fan=first_layer_fan,
             attn_analysis=layer_analysis.attention if layer_analysis is not None else None,
+            repo_rope_args=repo_rope_args,
             **kwargs,
         )
         self.post_init()
     def get_input_embeddings(self):
         if self.config.use_token_generator:
             return self.token_generator
         position_embeddings = self.rotary_emb(hidden_states, position_ids)
         self.first_layer_fan = None
+        # ── REPO: pass inv_freq by reference at forward time ──────────────────
+        # rotary_emb.inv_freq is already on the correct device (managed by
+        # NeoLLMRotaryEmbedding as a buffer) — no .to(), no DeviceCopy op.
+        # Computed once here and passed through the decoder layer chain so
+        # NeoLLMAttention never needs to store it as a buffer itself, avoiding
+        # the meta-tensor issue that occurs when lm_eval calls .to(device).
+        repo_rope_args = (
+            (self.rotary_emb.inv_freq, self.rotary_emb.attention_scaling)
+            if getattr(self.config, "use_repo", False) else None
+        )
         # ── Attention Residuals state ──────────────────────────────────────
         # Full AttnRes (attn_res_num_blocks=0): sources grows by one entry per
         # decoder layer — all previous outputs are kept, max N=num_layers+1.
                 attn_res_partial=attn_res_partial if use_attn_res else None,
                 layer_analysis=layer_analysis,
                 output_attentions=output_attentions,
+                repo_rope_args=repo_rope_args,
                 **kwargs,
             )
             hidden_states = layer_outputs[0]