Update modeling_neollm.py
Browse files- modeling_neollm.py +44 -18
modeling_neollm.py
CHANGED
|
@@ -510,30 +510,56 @@ class NeoLLMRotaryEmbedding(nn.Module):
|
|
def __init__(self, config: NeoLLMConfig, device=None):
    """Initialize rotary-embedding state and the inverse-frequency buffer.

    Args:
        config: Model configuration; must provide ``max_position_embeddings``,
            ``rope_theta``, attention-head geometry, and optionally a
            ``rope_scaling`` dict naming a RoPE variant.
        device: Optional device on which to build the inverse frequencies.
    """
    super().__init__()
    self.config = config
    self.max_seq_len_cached = config.max_position_embeddings
    self.original_max_seq_len = config.max_position_embeddings

    # Resolve the RoPE variant, degrading to "default" when rope_scaling is
    # absent, malformed, or names a type with no registered init function.
    # (The previous code indexed ROPE_INIT_FUNCTIONS unchecked and raised
    # KeyError for an unknown or None rope_type.)
    self.rope_type = "default"
    if isinstance(getattr(config, "rope_scaling", None), dict):
        rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        if rope_type and rope_type in ROPE_INIT_FUNCTIONS:
            self.rope_type = rope_type

    if self.rope_type != "default":
        rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
    else:
        rope_init_fn = self.compute_default_rope_parameters
    # Keep the attribute the original code exposed, in case other methods
    # (out of view here) still read self.rope_init_fn — TODO confirm.
    self.rope_init_fn = rope_init_fn

    inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

    # Register both tensors as non-persistent buffers so they follow
    # .to()/.cuda() moves without being saved in checkpoints; the clone
    # preserves a pristine copy for dynamic-scaling resets.
    self.register_buffer("inv_freq", inv_freq, persistent=False)
    self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
| 532 |
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 537 |
return inv_freq, attention_scaling
|
| 538 |
|
| 539 |
@torch.no_grad()
|
|
|
|
def __init__(self, config: NeoLLMConfig, device=None):
    """Set up RoPE state: sequence-length caches, rope type, and frequency buffers."""
    super().__init__()
    self.max_seq_len_cached = config.max_position_embeddings
    self.original_max_seq_len = config.max_position_embeddings
    self.config = config

    # Pick the RoPE variant from rope_scaling; anything missing or
    # unrecognised falls back to the plain ("default") formulation.
    self.rope_type = "default"
    scaling = config.rope_scaling if hasattr(config, "rope_scaling") else None
    if scaling is not None and isinstance(scaling, dict):
        candidate = scaling.get("rope_type", scaling.get("type"))
        if candidate and candidate in ROPE_INIT_FUNCTIONS:
            self.rope_type = candidate

    # Select the matching initializer and compute the frequencies once.
    if self.rope_type == "default":
        init_fn = self.compute_default_rope_parameters
    else:
        init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
    inv_freq, self.attention_scaling = init_fn(self.config, device)

    # Non-persistent buffers: they track .to()/.cuda() device moves but are
    # excluded from the state dict; the clone keeps an untouched original.
    self.register_buffer("inv_freq", inv_freq, persistent=False)
    self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
@staticmethod
def compute_default_rope_parameters(
    config: NeoLLMConfig = None,
    device: Optional["torch.device"] = None,
    seq_len: int = None,
) -> tuple["torch.Tensor", float]:
    """Compute inverse frequencies for the original (unscaled) RoPE.

    Args:
        config: The model configuration (theta, head geometry, optional
            ``partial_rotary_factor``).
        device: Device on which to materialize the inverse frequencies.
        seq_len: The current sequence length. Unused for this type of RoPE.

    Returns:
        Tuple of (torch.Tensor, float): the inverse frequencies for the RoPE
        embeddings and the post-processing scaling factor applied to the
        computed cos/sin (always 1.0 for the default variant).
    """
    theta = config.rope_theta
    head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
    # Only a fraction of the head dimension may be rotated.
    rotary_dim = int(head_dim * getattr(config, "partial_rotary_factor", 1.0))

    # Even indices 0, 2, ..., rotary_dim-2 define the frequency bands.
    exponents = torch.arange(0, rotary_dim, 2, dtype=torch.int64)
    exponents = exponents.to(device=device, dtype=torch.float)
    inv_freq = 1.0 / (theta ** (exponents / rotary_dim))

    # Default RoPE applies no cos/sin post-scaling.
    return inv_freq, 1.0
| 564 |
|
| 565 |
@torch.no_grad()
|