Sławomir Dadas committed
Commit c582324 · 1 Parent(s): c2b7542

Transformers 5 compatibility fixes

Files changed (3):
  1. config.json +4 -4
  2. configuration.py +2 -4
  3. modeling.py +86 -113
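
All three files converge on the same change: the legacy rope_theta / rope_scaling pair is folded into a single rope_parameters dict. As a quick orientation (not part of the commit), a minimal Python sketch of reading the RoPE settings from either layout of config.json, assuming the file sits in the working directory:

import json

with open("config.json") as f:
    cfg = json.load(f)

if "rope_parameters" in cfg:
    # post-commit layout: one dict holds theta, scaling factor and rope type
    rope = cfg["rope_parameters"]
    rope_theta, rope_type = rope["rope_theta"], rope["rope_type"]
    factor = rope.get("factor", 1.0)
else:
    # legacy layout: rope_theta sits next to an optional rope_scaling dict
    scaling = cfg.get("rope_scaling") or {}
    rope_theta = cfg.get("rope_theta", 10000.0)
    rope_type = scaling.get("type", "default")
    factor = scaling.get("factor", 1.0)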
config.json CHANGED
@@ -26,11 +26,11 @@
   "pack_qkv": true,
   "pad_token_id": 0,
   "position_embedding_type": "rope",
-  "rope_scaling": {
-    "factor": 2.0,
-    "type": "ntk"
+  "rope_parameters": {
+    "rope_theta": 160000,
+    "factor": 2.0,
+    "rope_type": "default"
   },
-  "rope_theta": 160000,
   "transformers_version": "4.56.1",
   "type_vocab_size": 2,
   "unpad_inputs": true,
configuration.py CHANGED
@@ -108,8 +108,7 @@ class NewConfig(PretrainedConfig):
         layer_norm_eps=1e-12,
         # pad_token_id=0,
         position_embedding_type="rope",
-        rope_theta=10000.0,
-        rope_scaling=None,
+        rope_parameters=None,
         classifier_dropout=None,
         pack_qkv=True,
         unpad_inputs=False,
@@ -134,9 +133,8 @@ class NewConfig(PretrainedConfig):
         self.layer_norm_type = layer_norm_type
         self.layer_norm_eps = layer_norm_eps
         self.position_embedding_type = position_embedding_type
-        self.rope_theta = rope_theta
-        self.rope_scaling = rope_scaling
         self.classifier_dropout = classifier_dropout
+        self.rope_parameters = rope_parameters

         self.pack_qkv = pack_qkv
         self.unpad_inputs = unpad_inputs
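
A short usage sketch for the updated constructor (not part of the commit; assumes the repository's configuration.py is importable as a plain module). The rope_theta / rope_scaling keyword arguments are replaced by one rope_parameters dict, mirroring the values written to config.json above:

from configuration import NewConfig

config = NewConfig(
    position_embedding_type="rope",
    rope_parameters={"rope_theta": 160000, "factor": 2.0, "rope_type": "default"},
)
assert config.rope_parameters["rope_type"] == "default"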
modeling.py CHANGED
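
The hunks below drop the old RotaryEmbedding / NTKScalingRotaryEmbedding pair in favour of a single config-driven RotaryEmbedding whose forward takes position_ids instead of seq_len, and add a maybe_autocast helper. A hedged smoke-test sketch of the resulting interface (not part of the commit; assumes configuration.py and modeling.py import cleanly on a transformers version that exports ROPE_INIT_FUNCTIONS, and that NewConfig's other defaults are usable):

import torch
from configuration import NewConfig
from modeling import RotaryEmbedding, apply_rotary_pos_emb

config = NewConfig(rope_parameters={"rope_theta": 160000, "factor": 2.0, "rope_type": "default"})
rotary = RotaryEmbedding(config)

bs, seq_len = 2, 8
head_dim = config.hidden_size // config.num_attention_heads
inputs_embeds = torch.randn(bs, seq_len, config.hidden_size)
position_ids = torch.arange(seq_len).expand(bs, -1)

# cos/sin are returned per position, [bs, seq_len, head_dim] ...
cos, sin = rotary(inputs_embeds, position_ids)
# ... and NewEmbeddings unsqueezes them to [bs, seq_len, 1, head_dim] before use
cos, sin = cos.unsqueeze(2), sin.unsqueeze(2)

q = torch.randn(bs, seq_len, config.num_attention_heads, head_dim)
k = torch.randn(bs, seq_len, config.num_attention_heads, head_dim)
q, k = apply_rotary_pos_emb(q, k, cos, sin)  # rotated, shapes unchanged

In practice, the end-to-end check is simply loading the repository with AutoModel.from_pretrained(..., trust_remote_code=True) in a Transformers 5 environment.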
@@ -16,11 +16,13 @@
 """PyTorch NEW model."""

 import math
-from typing import List, Optional, Tuple, Union
+from contextlib import nullcontext
+from typing import List, Optional, Tuple, Union, Callable

 import torch
 import torch.utils.checkpoint
 from torch import nn
+from transformers import ROPE_INIT_FUNCTIONS

 from transformers.activations import ACT2FN
 from transformers.modeling_outputs import (
@@ -139,6 +141,28 @@ class IndexPutFirstAxis(torch.autograd.Function):
 index_put_first_axis = IndexPutFirstAxis.apply


+def maybe_autocast(
+    device_type: str,
+    dtype: Optional["_dtype"] = None,
+    enabled: bool = True,
+    cache_enabled: bool | None = None,
+):
+    """
+    Context manager that only autocasts if:
+
+    - `autocast` is already enabled in this context
+    - Or this call to `maybe_autocast` has `enabled=True`
+
+    This prevents `autocast` being added to the graph when it is effectively a no-op.
+    Which makes graph splitting in `torch.compile` more flexible as it removes the
+    requirement that partition IDs be monotonically increasing.
+    """
+    if torch.is_autocast_enabled(device_type) or enabled:
+        return torch.autocast(device_type, dtype=dtype, enabled=enabled, cache_enabled=cache_enabled)
+    else:
+        return nullcontext()
+
+
 def pad_input(inputs: torch.Tensor, indices: torch.Tensor, batch: int, seqlen: int) -> torch.Tensor:
     """Add padding to sequences.

@@ -162,7 +186,7 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)


-def apply_rotary_pos_emb(q, k, cos, sin):
+def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.

     Args:
@@ -170,84 +194,75 @@ def apply_rotary_pos_emb(q, k, cos, sin):
         k (`torch.Tensor`): The key tensor.
         cos (`torch.Tensor`): The cosine part of the rotary embedding.
         sin (`torch.Tensor`): The sine part of the rotary embedding.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
     Returns:
         `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
     """
-    cos, sin = cos.to(q.dtype), sin.to(q.dtype)
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
     return q_embed, k_embed


-class RotaryEmbedding(torch.nn.Module):
-    def __init__(self, dim, max_position_embeddings=512, base=10000.0, device=None):
-        super().__init__()
-
-        self.dim = dim
-        self.max_position_embeddings = max_position_embeddings
-        self.base = base
-        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-        # Build here to make `torch.jit.trace` work.
-        self._set_cos_sin_cache(
-            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
-        )
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.float32)
-
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
-
-    def forward(self, x, seq_len=None):
-        # x: [bs, num_attention_heads, seq_len, head_size]
-        if seq_len > self.max_seq_len_cached:
-            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
-
-        return (
-            self.cos_cached[:seq_len, ...].to(dtype=x.dtype),
-            self.sin_cached[:seq_len, ...].to(dtype=x.dtype),
-        )
-
-
-class NTKScalingRotaryEmbedding(RotaryEmbedding):
-    """RotaryEmbedding extended with fixed and mixed NTK scaling. https://kexue.fm/archives/9706 """
-
-    def __init__(self, dim, max_position_embeddings=512, base=10000, device=None, scaling_factor=1.0, mixed_b=None):
-        self.scaling_factor = scaling_factor
-        self.mixed_b = mixed_b
-        super().__init__(dim, max_position_embeddings, base, device)
-        max_position_embeddings = max_position_embeddings * self.scaling_factor
-        self._set_cos_sin_cache(max_position_embeddings, self.inv_freq.device, torch.get_default_dtype())
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-
-        if seq_len > self.max_position_embeddings:
-            base = self.base * (self.scaling_factor if self.mixed_b is None else 1)
-            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
-
-            if self.mixed_b is None:
-                inv_freq = inv_freq / self.scaling_factor ** (2 / self.dim)  # (6)
-            else:
-                a = torch.tensor(self.scaling_factor).log() / (self.dim / 2) ** self.mixed_b  # (13)
-                lambda_1_m = (a * torch.arange(1, self.dim // 2 + 1).float().to(device) ** self.mixed_b).exp()  # (12)
-                inv_freq = inv_freq / lambda_1_m  # (10)
-
-            self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.float32)
-
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+class RotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: NewConfig, device=None):
+        super().__init__()
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+
+        self.rope_type = self.config.rope_parameters["rope_type"]
+        if self.rope_type == "default":
+            rope_init_fn: Callable = self.compute_default_rope_parameters
+        else:
+            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
+
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
+
+    @staticmethod
+    def compute_default_rope_parameters(
+        config: NewConfig | None = None,
+        device: Optional["torch.device"] = None,
+    ) -> tuple["torch.Tensor", float]:
+        """Computes rope parameters with NTK scaling"""
+        scaling_factor = config.rope_parameters.get("factor", 1.0)
+        dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+        base = config.rope_parameters["rope_theta"]
+        mixed_b = config.rope_parameters.get("mixed_b", None)
+
+        base = base * (scaling_factor if mixed_b is None else 1)
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
+        if mixed_b is None:
+            inv_freq = inv_freq / scaling_factor ** (2 / dim)
+        else:
+            a = torch.tensor(scaling_factor).log() / (dim / 2) ** mixed_b
+            lambda_1_m = (a * torch.arange(1, dim // 2 + 1).float().to(device) ** mixed_b).exp()
+            inv_freq = inv_freq / lambda_1_m
+        return inv_freq, 1.0
+
+    @torch.no_grad()
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


 class RMSNorm(nn.Module):
@@ -291,7 +306,7 @@ class NewEmbeddings(nn.Module):
                 config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
             )
         elif self.position_embedding_type == 'rope':
-            self._init_rope(config)
+            self.rotary_emb = RotaryEmbedding(config)
         else:
            raise ValueError

@@ -308,27 +323,6 @@ class NewEmbeddings(nn.Module):
             "position_ids", torch.arange(config.max_position_embeddings), persistent=False
         )

-    def _init_rope(self, config):
-        kwargs = dict(
-            dim=int(config.hidden_size / config.num_attention_heads),
-            max_position_embeddings=config.max_position_embeddings,
-            base=config.rope_theta
-        )
-        if config.rope_scaling is None:
-            self.rotary_emb = RotaryEmbedding(**kwargs)
-        else:
-            kwargs.update(scaling_factor=config.rope_scaling["factor"])
-            scaling_type = config.rope_scaling["type"]
-            if scaling_type == 'ntk':
-                kwargs.update(mixed_b=config.rope_scaling.get('mixed_b', None))
-                self.rotary_emb = NTKScalingRotaryEmbedding(**kwargs)
-            # elif scaling_type == "linear":
-            #     self.rotary_emb = LinearScalingRotaryEmbedding(**kwargs)
-            # elif scaling_type == "dynamic":
-            #     self.rotary_emb = DynamicNTKScalingRotaryEmbedding(**kwargs)
-            else:
-                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
-
     def forward(
         self,
         unpad_inputs: bool,
@@ -339,8 +333,6 @@ class NewEmbeddings(nn.Module):
         position_ids: Optional[torch.Tensor] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor, Optional[Tuple], Optional[List[int]]]:
-        """
-        """
         if inputs_embeds is None:
             device, input_shape = input_ids.device, input_ids.shape
         else:
@@ -372,24 +364,21 @@ class NewEmbeddings(nn.Module):

         # Set and unpad position_ids
         if position_ids is None:
-            if seq_length > self.position_ids.size(0):
-                self.register_buffer(
-                    "position_ids", torch.arange(seq_length), persistent=False
-                )
+            position_ids = torch.arange(seq_length, device=inputs_embeds.device)
             if unpad_inputs:
                 # [1, cumsum_seq_len]
-                position_ids = torch.cat([self.position_ids[:l] for l in length]).unsqueeze(0)
+                position_ids = torch.cat([position_ids[:l] for l in length]).unsqueeze(0)
             else:
                 # [bs, seq_len]
-                position_ids = self.position_ids[:seq_length].expand(batch_size, -1)
+                position_ids = position_ids[:seq_length].expand(batch_size, -1)
         elif unpad_inputs:
             position_ids = position_ids[attention_mask_bool].unsqueeze(0)  # [1, cumsum_seq_len]

         # Compute rotary embedding
         if self.position_embedding_type == 'rope':
-            rope_cos, rope_sin = self.rotary_emb(inputs_embeds, seq_len=seq_length)
-            rope_cos = rope_cos[position_ids].unsqueeze(2)  # [bs, seq_len, 1, dim]
-            rope_sin = rope_sin[position_ids].unsqueeze(2)  # [bs, seq_len, 1, dim]
+            rope_cos, rope_sin = self.rotary_emb(inputs_embeds, position_ids)
+            rope_cos = rope_cos.unsqueeze(2)  # [bs, seq_len, 1, dim]
+            rope_sin = rope_sin.unsqueeze(2)  # [bs, seq_len, 1, dim]
             rope_embeds = rope_cos, rope_sin
         else:
             rope_embeds = None
@@ -793,22 +782,6 @@ class NewPreTrainedModel(PreTrainedModel):
     base_model_prefix = "new"
     supports_gradient_checkpointing = True

-    def _init_weights(self, module):
-        """Initialize the weights"""
-        if isinstance(module, nn.Linear):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-

 class NewModel(NewPreTrainedModel):
     """