jungsin3 commited on
Commit
5b4219a
·
verified ·
1 Parent(s): a8e6fcb

I propose modifying the KORMo modeling code to ensure compatibility with both Transformers 4.57.1 and 5.2.

Browse files

In RotaryEmbedding, the inv_freq value is computed once in __init__ and then reused.
In Transformers 5.2, the model is loaded on the meta device, so this computation does not take place at construction time. To compensate, 5.2 added logic to the _init_weights function that restores inv_freq via an else branch. Because KORMo uses a custom _init_weights function, this logic was never applied to it, so the RoPE values were not used during inference.
The following changes have been made to the code:

Added logic to restore inv_freq in KORMoPreTrainedModel._init_weights.
Added the copy_ helper used by _init_weights to the top of the file.
Fixed an issue where the original_inv_freq buffer was never registered: we now register it as a clone of self.inv_freq, whereas previously the attribute was None because inv_freq had not been computed. (RotaryEmbedding)
Added the compute_default_rope_parameters function, which is no longer available in version 5.2. (RotaryEmbedding)
Compatible with both version 4.57.1 and version 5.2
Thank you.

Files changed (1) hide show
  1. _modeling_kormo.py +31 -3
_modeling_kormo.py CHANGED
@@ -94,7 +94,13 @@ def rotate_half(x):
94
  x1 = x[..., : x.shape[-1] // 2]
95
  x2 = x[..., x.shape[-1] // 2 :]
96
  return torch.cat((-x2, x1), dim=-1)
97
-
 
 
 
 
 
 
98
  class Attention(nn.Module):
99
  """Multi-headed attention from 'Attention Is All You Need' paper"""
100
 
@@ -237,11 +243,24 @@ class RotaryEmbedding(nn.Module):
237
  self.original_max_seq_len = config.max_position_embeddings
238
 
239
  self.config = config
240
- self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
 
 
241
 
242
  inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
243
  self.register_buffer("inv_freq", inv_freq, persistent=False)
244
- self.original_inv_freq = self.inv_freq
 
 
 
 
 
 
 
 
 
 
 
245
 
246
  @torch.no_grad()
247
  @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
@@ -289,6 +308,15 @@ class KORMoPreTrainedModel(PreTrainedModel):
289
  module.weight.data[module.padding_idx].zero_()
290
  elif isinstance(module, RMSNorm):
291
  module.weight.data.fill_(1.0)
 
 
 
 
 
 
 
 
 
292
 
293
 
294
  class KORMoModel(KORMoPreTrainedModel):
 
94
  x1 = x[..., : x.shape[-1] // 2]
95
  x2 = x[..., x.shape[-1] // 2 :]
96
  return torch.cat((-x2, x1), dim=-1)
97
+
98
def copy_(tensor: torch.Tensor, other: torch.Tensor) -> torch.Tensor:
    """Copy ``other`` into ``tensor`` unless the tensor is flagged as initialized.

    Mirrors the helper Transformers uses during ``_init_weights``: a tensor
    carrying ``_is_hf_initialized = True`` (i.e. already filled from a
    checkpoint) is returned untouched so loaded weights are not clobbered.
    The copy runs under ``torch.no_grad()`` so it is not tracked by autograd.
    """
    if getattr(tensor, "_is_hf_initialized", False):
        return tensor
    with torch.no_grad():
        return tensor.copy_(other)
103
+
104
  class Attention(nn.Module):
105
  """Multi-headed attention from 'Attention Is All You Need' paper"""
106
 
 
243
  self.original_max_seq_len = config.max_position_embeddings
244
 
245
  self.config = config
246
+ rope_init_fn = self.compute_default_rope_parameters
247
+ rope_init_fn = ROPE_INIT_FUNCTIONS.get(self.rope_type, rope_init_fn)
248
+ self.rope_init_fn = rope_init_fn
249
 
250
  inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
251
  self.register_buffer("inv_freq", inv_freq, persistent=False)
252
+ self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
253
+
254
+ @staticmethod
255
+ def compute_default_rope_parameters(config: KORMoConfig, device=None, seq_len =None):
256
+ base = config.rope_theta
257
+ dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
258
+
259
+ attention_factor = 1.0
260
+ inv_freq = 1.0 / (
261
+ base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
262
+ )
263
+ return inv_freq, attention_factor
264
 
265
  @torch.no_grad()
266
  @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
 
308
  module.weight.data[module.padding_idx].zero_()
309
  elif isinstance(module, RMSNorm):
310
  module.weight.data.fill_(1.0)
311
+ elif "RotaryEmbedding" in module.__class__.__name__ and hasattr(module, "original_inv_freq"):
312
+ rope_fn = (
313
+ ROPE_INIT_FUNCTIONS[module.rope_type]
314
+ if module.rope_type != "default"
315
+ else module.compute_default_rope_parameters
316
+ )
317
+ buffer_value, _ = rope_fn(module.config)
318
+ copy_(module.inv_freq, buffer_value)
319
+ copy_(module.original_inv_freq, buffer_value)
320
 
321
 
322
  class KORMoModel(KORMoPreTrainedModel):