KitsuVp
/

NeoLLM

@@ -1406,18 +1406,28 @@ class NeoLLMRotaryEmbedding(nn.Module):
         B = x.shape[0]
         if position_ids.shape[0] != B:
             position_ids = position_ids.expand(B, -1)
         device_type = (x.device.type
                        if isinstance(x.device.type, str) and x.device.type != "mps"
                        else "cpu")
         inv_freq = self.inv_freq.to(device=x.device, dtype=torch.float32)
         with torch.autocast(device_type=device_type, enabled=False):
             freqs = (position_ids.to(dtype=torch.float32).unsqueeze(-1)
                      * inv_freq.unsqueeze(0).unsqueeze(0))
             emb   = torch.cat((freqs, freqs), dim=-1)
             cos   = emb.cos() * self.attention_scaling
             sin   = emb.sin() * self.attention_scaling
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)

         B = x.shape[0]
         if position_ids.shape[0] != B:
             position_ids = position_ids.expand(B, -1)
         device_type = (x.device.type
                        if isinstance(x.device.type, str) and x.device.type != "mps"
                        else "cpu")
+        if self.inv_freq.device.type == "meta":
+            inv_freq_data, _ = self.compute_default_rope_parameters(
+                self.config, device=x.device
+            )
+            self.register_buffer("inv_freq",          inv_freq_data,        persistent=False)
+            self.register_buffer("original_inv_freq", inv_freq_data.clone(), persistent=False)
         inv_freq = self.inv_freq.to(device=x.device, dtype=torch.float32)
         with torch.autocast(device_type=device_type, enabled=False):
             freqs = (position_ids.to(dtype=torch.float32).unsqueeze(-1)
                      * inv_freq.unsqueeze(0).unsqueeze(0))
             emb   = torch.cat((freqs, freqs), dim=-1)
             cos   = emb.cos() * self.attention_scaling
             sin   = emb.sin() * self.attention_scaling
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
         return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)