Update modeling_neollm.py
Browse files- modeling_neollm.py +22 -8
modeling_neollm.py
CHANGED
|
@@ -563,20 +563,34 @@ class NeoLLMRotaryEmbedding(nn.Module):
|
|
| 563 |
return inv_freq, attention_scaling
|
| 564 |
|
| 565 |
@torch.no_grad()
|
| 566 |
-
@dynamic_rope_update
|
| 567 |
def forward(self, x, position_ids):
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 571 |
device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 575 |
cos = emb.cos() * self.attention_scaling
|
| 576 |
sin = emb.sin() * self.attention_scaling
|
| 577 |
-
|
| 578 |
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
| 579 |
|
|
|
|
| 580 |
def rotate_half(x):
|
| 581 |
"""Rotates half the hidden dims of the input."""
|
| 582 |
x1 = x[..., : x.shape[-1] // 2]
|
|
|
|
| 563 |
return inv_freq, attention_scaling
|
| 564 |
|
| 565 |
@torch.no_grad()
@dynamic_rope_update
def forward(self, x, position_ids):
    """Compute rotary position embedding tables (cos, sin) for the given positions.

    Args:
        x: activation tensor; only its batch size, device and dtype are used here.
        position_ids: tensor of token positions, shape [S] or [B, S].

    Returns:
        Tuple ``(cos, sin)``, each of shape [B, S, d], cast to ``x.dtype`` and
        pre-scaled by ``self.attention_scaling``.
    """
    # Normalize position_ids to shape [B, S].
    if position_ids.dim() == 1:
        position_ids = position_ids.unsqueeze(0)  # [1, S]

    batch_size = x.shape[0]
    if position_ids.shape[0] != batch_size:
        # Replicate the same positions for every batch element.
        position_ids = position_ids.expand(batch_size, -1)  # [B, S]

    device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"

    # inv_freq in float32 on the right device (no stride-0 expand).
    inv_freq = self.inv_freq.to(device=x.device, dtype=torch.float32)  # [d/2]

    # Disable autocast so the angle math stays in float32.
    with torch.autocast(device_type=device_type, enabled=False):
        # theta[b, s, i] = position_ids[b, s] * inv_freq[i]  ->  [B, S, d/2]
        angles = position_ids.float().unsqueeze(-1) * inv_freq.view(1, 1, -1)
        emb = torch.cat((angles, angles), dim=-1)  # [B, S, d]
        cos = emb.cos() * self.attention_scaling
        sin = emb.sin() * self.attention_scaling

    return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
| 592 |
|
| 593 |
+
|
| 594 |
def rotate_half(x):
|
| 595 |
"""Rotates half the hidden dims of the input."""
|
| 596 |
x1 = x[..., : x.shape[-1] // 2]
|