Update modeling_neollm.py
Browse files- modeling_neollm.py +9 -6
modeling_neollm.py
CHANGED
|
@@ -505,7 +505,6 @@ class StackMemory(nn.Module):
|
|
| 505 |
return output, new_stack[:, -1], new_mask[:, -1]
|
| 506 |
|
| 507 |
# ==================== ROTARY EMBEDDING ====================
|
| 508 |
-
|
| 509 |
class NeoLLMRotaryEmbedding(nn.Module):
|
| 510 |
inv_freq: torch.Tensor # fix linting for `register_buffer`
|
| 511 |
|
|
@@ -524,14 +523,19 @@ class NeoLLMRotaryEmbedding(nn.Module):
|
|
| 524 |
self.rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
|
| 525 |
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
|
| 526 |
else:
|
| 527 |
-
self.rope_type =
|
| 528 |
self.attention_scaling = 1.0
|
| 529 |
-
|
| 530 |
-
dim = int(config.head_dim * config.partial_rotary_factor)
|
| 531 |
-
inv_freq = 1.0 / (config.rope_theta ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim))
|
| 532 |
|
| 533 |
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
| 534 |
self.original_inv_freq = self.inv_freq
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 535 |
@torch.no_grad()
|
| 536 |
@dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
|
| 537 |
def forward(self, x, position_ids):
|
|
@@ -547,7 +551,6 @@ class NeoLLMRotaryEmbedding(nn.Module):
|
|
| 547 |
|
| 548 |
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
| 549 |
|
| 550 |
-
|
| 551 |
def rotate_half(x):
|
| 552 |
"""Rotates half the hidden dims of the input."""
|
| 553 |
x1 = x[..., : x.shape[-1] // 2]
|
|
|
|
| 505 |
return output, new_stack[:, -1], new_mask[:, -1]
|
| 506 |
|
| 507 |
# ==================== ROTARY EMBEDDING ====================
|
|
|
|
| 508 |
class NeoLLMRotaryEmbedding(nn.Module):
|
| 509 |
inv_freq: torch.Tensor # fix linting for `register_buffer`
|
| 510 |
|
|
|
|
| 523 |
self.rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
|
| 524 |
inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
|
| 525 |
else:
|
| 526 |
+
self.rope_type = "default"
|
| 527 |
self.attention_scaling = 1.0
|
| 528 |
+
inv_freq = self.compute_default_rope_parameters(config, device)[0]
|
|
|
|
|
|
|
| 529 |
|
| 530 |
self.register_buffer("inv_freq", inv_freq, persistent=False)
|
| 531 |
self.original_inv_freq = self.inv_freq
|
| 532 |
+
|
| 533 |
+
def compute_default_rope_parameters(self, config, device=None):
    """Compute inverse frequencies for the default (unscaled) rotary embedding.

    Mirrors the upstream transformers helper ``_compute_default_rope_parameters``:
    frequencies follow ``rope_theta ** (-2i / dim)`` for ``i`` in ``[0, dim / 2)``.

    Args:
        config: Model config providing ``rope_theta`` and either ``head_dim``
            or (``hidden_size``, ``num_attention_heads``).
            ``partial_rotary_factor`` is optional and defaults to 1.0.
        device: Optional device on which to create the frequency tensor.

    Returns:
        Tuple ``(inv_freq, attention_scaling)`` where ``inv_freq`` is a
        float32 tensor of shape ``(dim // 2,)`` and ``attention_scaling`` is
        always 1.0 (the default rope type applies no frequency scaling).
    """
    # Fall back gracefully when the config lacks the optional attributes,
    # matching the upstream transformers helper's behavior.
    partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
    head_dim = getattr(config, "head_dim", None)
    if head_dim is None:
        head_dim = config.hidden_size // config.num_attention_heads
    dim = int(head_dim * partial_rotary_factor)

    inv_freq = 1.0 / (
        config.rope_theta
        ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim)
    )
    attention_scaling = 1.0
    return inv_freq, attention_scaling
|
| 538 |
+
|
| 539 |
@torch.no_grad()
|
| 540 |
@dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
|
| 541 |
def forward(self, x, position_ids):
|
|
|
|
| 551 |
|
| 552 |
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
| 553 |
|
|
|
|
| 554 |
def rotate_half(x):
|
| 555 |
"""Rotates half the hidden dims of the input."""
|
| 556 |
x1 = x[..., : x.shape[-1] // 2]
|