Fix bug for newer transformers versions

#1
by fishmingyu - opened
Files changed (2) hide show
  1. config.json +3 -3
  2. modeling_qwen.py +84 -19
config.json CHANGED
@@ -4,9 +4,9 @@
4
  ],
5
  "attention_dropout": 0.0,
6
  "auto_map": {
7
- "AutoModel": "Alibaba-NLP/gte-Qwen2-7B-instruct--modeling_qwen.Qwen2Model",
8
- "AutoModelForCausalLM": "Alibaba-NLP/gte-Qwen2-7B-instruct--modeling_qwen.Qwen2ForCausalLM",
9
- "AutoModelForSequenceClassification": "Alibaba-NLP/gte-Qwen2-7B-instruct--modeling_qwen.Qwen2ForSequenceClassification"
10
  },
11
  "bos_token_id": 151643,
12
  "eos_token_id": 151643,
 
4
  ],
5
  "attention_dropout": 0.0,
6
  "auto_map": {
7
+ "AutoModel": "modeling_qwen.Qwen2Model",
8
+ "AutoModelForCausalLM": "modeling_qwen.Qwen2ForCausalLM",
9
+ "AutoModelForSequenceClassification": "modeling_qwen.Qwen2ForSequenceClassification"
10
  },
11
  "bos_token_id": 151643,
12
  "eos_token_id": 151643,
modeling_qwen.py CHANGED
@@ -17,6 +17,9 @@
17
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
  # See the License for the specific language governing permissions and
19
  # limitations under the License.
 
 
 
20
  """ PyTorch Qwen2 model."""
21
  from transformers import Qwen2Config
22
  import inspect
@@ -46,11 +49,11 @@ from transformers.utils import (
46
  )
47
 
48
 
49
- if is_flash_attn_2_available():
50
- from flash_attn import flash_attn_func, flash_attn_varlen_func
51
- from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
52
 
53
- _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
54
 
55
 
56
  logger = logging.get_logger(__name__)
@@ -144,6 +147,7 @@ def rotate_half(x):
144
  # Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
145
  def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
146
  """Applies Rotary Position Embedding to the query and key tensors.
 
147
  Args:
148
  q (`torch.Tensor`): The query tensor.
149
  k (`torch.Tensor`): The key tensor.
@@ -273,7 +277,9 @@ class Qwen2Attention(nn.Module):
273
  "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
274
  "with a layer index."
275
  )
276
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
 
 
277
  cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
278
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
279
 
@@ -377,7 +383,9 @@ class Qwen2FlashAttention2(Qwen2Attention):
377
  "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
378
  "with a layer index."
379
  )
380
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
 
 
381
 
382
  # Because the input can be padded, the absolute sequence length depends on the max position id.
383
  rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
@@ -494,6 +502,7 @@ class Qwen2FlashAttention2(Qwen2Attention):
494
  """
495
  Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
496
  first unpad the input, then computes the attention scores and pad the final attention scores.
 
497
  Args:
498
  query_states (`torch.Tensor`):
499
  Input query states to be passed to Flash Attention API
@@ -674,7 +683,9 @@ class Qwen2SdpaAttention(Qwen2Attention):
674
 
675
  kv_seq_len = key_states.shape[-2]
676
  if past_key_value is not None:
677
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
 
 
678
  cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
679
 
680
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
@@ -807,9 +818,11 @@ QWEN2_START_DOCSTRING = r"""
807
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
808
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
809
  etc.)
 
810
  This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
811
  Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
812
  and behavior.
 
813
  Parameters:
814
  config ([`Qwen2Config`]):
815
  Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -849,38 +862,50 @@ QWEN2_INPUTS_DOCSTRING = r"""
849
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
850
  Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
851
  it.
 
852
  Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
853
  [`PreTrainedTokenizer.__call__`] for details.
 
854
  [What are input IDs?](../glossary#input-ids)
855
  attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
856
  Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
 
857
  - 1 for tokens that are **not masked**,
858
  - 0 for tokens that are **masked**.
 
859
  [What are attention masks?](../glossary#attention-mask)
 
860
  Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
861
  [`PreTrainedTokenizer.__call__`] for details.
 
862
  If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
863
  `past_key_values`).
 
864
  If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
865
  and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
866
  information on the default strategy.
 
867
  - 1 indicates the head is **not masked**,
868
  - 0 indicates the head is **masked**.
869
  position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
870
  Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
871
  config.n_positions - 1]`.
 
872
  [What are position IDs?](../glossary#position-ids)
873
  past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
874
  Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
875
  blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
876
  returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
 
877
  Two formats are allowed:
878
  - a [`~cache_utils.Cache`] instance;
879
  - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
880
  shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
881
  cache format.
 
882
  The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
883
  legacy cache format will be returned.
 
884
  If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
885
  have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
886
  of shape `(batch_size, sequence_length)`.
@@ -909,6 +934,7 @@ QWEN2_INPUTS_DOCSTRING = r"""
909
  class Qwen2Model(Qwen2PreTrainedModel):
910
  """
911
  Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`]
 
912
  Args:
913
  config: Qwen2Config
914
  """
@@ -955,7 +981,6 @@ class Qwen2Model(Qwen2PreTrainedModel):
955
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
956
  )
957
  use_cache = use_cache if use_cache is not None else self.config.use_cache
958
-
959
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
960
 
961
  # retrieve input_ids and inputs_embeds
@@ -976,12 +1001,28 @@ class Qwen2Model(Qwen2PreTrainedModel):
976
  use_cache = False
977
 
978
  past_key_values_length = 0
 
979
 
980
  if use_cache:
981
- use_legacy_cache = not isinstance(past_key_values, Cache)
982
- if use_legacy_cache:
 
 
 
 
 
 
 
 
 
983
  past_key_values = DynamicCache.from_legacy_cache(past_key_values)
984
- past_key_values_length = past_key_values.get_usable_length(seq_length)
 
 
 
 
 
 
985
 
986
  if position_ids is None:
987
  device = input_ids.device if input_ids is not None else inputs_embeds.device
@@ -1087,7 +1128,10 @@ class Qwen2Model(Qwen2PreTrainedModel):
1087
 
1088
  next_cache = None
1089
  if use_cache:
1090
- next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
 
 
 
1091
 
1092
  if not return_dict:
1093
  return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
@@ -1099,6 +1143,7 @@ class Qwen2Model(Qwen2PreTrainedModel):
1099
  )
1100
 
1101
 
 
1102
  class Qwen2ForCausalLM(Qwen2PreTrainedModel):
1103
  _tied_weights_keys = ["lm_head.weight"]
1104
 
@@ -1151,14 +1196,20 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
1151
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1152
  config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1153
  (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
 
1154
  Returns:
 
1155
  Example:
 
1156
  ```python
1157
  >>> from transformers import AutoTokenizer, Qwen2ForCausalLM
 
1158
  >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1159
  >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
 
1160
  >>> prompt = "Hey, are you conscious? Can you talk to me?"
1161
  >>> inputs = tokenizer(prompt, return_tensors="pt")
 
1162
  >>> # Generate
1163
  >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1164
  >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
@@ -1220,21 +1271,32 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
1220
  # Omit tokens covered by past_key_values
1221
  if past_key_values is not None:
1222
  if isinstance(past_key_values, Cache):
 
1223
  cache_length = past_key_values.get_seq_length()
1224
- past_length = past_key_values.seen_tokens
1225
- max_cache_length = past_key_values.get_max_length()
 
 
 
 
 
 
 
 
 
1226
  else:
 
 
1227
  cache_length = past_length = past_key_values[0][0].shape[2]
1228
  max_cache_length = None
1229
 
1230
  # Keep only the unprocessed tokens:
1231
  # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
1232
- # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
1233
- # input)
1234
  if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
1235
  input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
1236
- # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
1237
- # input_ids based on the past_length.
1238
  elif past_length < input_ids.shape[1]:
1239
  input_ids = input_ids[:, past_length:]
1240
  # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
@@ -1264,13 +1326,14 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
1264
  model_inputs.update(
1265
  {
1266
  "position_ids": position_ids,
1267
- "past_key_values": past_key_values,
1268
  "use_cache": kwargs.get("use_cache"),
1269
  "attention_mask": attention_mask,
1270
  }
1271
  )
1272
  return model_inputs
1273
 
 
1274
  @staticmethod
1275
  def _reorder_cache(past_key_values, beam_idx):
1276
  reordered_past = ()
@@ -1284,8 +1347,10 @@ class Qwen2ForCausalLM(Qwen2PreTrainedModel):
1284
  @add_start_docstrings(
1285
  """
1286
  The Qwen2 Model transformer with a sequence classification head on top (linear layer).
 
1287
  [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1288
  (e.g. GPT-2) do.
 
1289
  Since it does classification on the last token, it requires to know the position of the last token. If a
1290
  `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1291
  no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
 
17
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
  # See the License for the specific language governing permissions and
19
  # limitations under the License.
20
+
21
+ # includes edits by https://github.com/BBC-Esq to fix cache errors on transformers versions after the major cache refactor (post 4.53.3)
22
+
23
  """ PyTorch Qwen2 model."""
24
  from transformers import Qwen2Config
25
  import inspect
 
49
  )
50
 
51
 
52
+ # if is_flash_attn_2_available():
53
+ # from flash_attn import flash_attn_func, flash_attn_varlen_func
54
+ # from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
55
 
56
+ # _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
57
 
58
 
59
  logger = logging.get_logger(__name__)
 
147
  # Copied from transformers.models.mistral.modeling_mistral.apply_rotary_pos_emb
148
  def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
149
  """Applies Rotary Position Embedding to the query and key tensors.
150
+
151
  Args:
152
  q (`torch.Tensor`): The query tensor.
153
  k (`torch.Tensor`): The key tensor.
 
277
  "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
278
  "with a layer index."
279
  )
280
+ # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
281
+ past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
282
+ kv_seq_len += past_len
283
  cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
284
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
285
 
 
383
  "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
384
  "with a layer index."
385
  )
386
+ # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
387
+ past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
388
+ kv_seq_len += past_len
389
 
390
  # Because the input can be padded, the absolute sequence length depends on the max position id.
391
  rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
 
502
  """
503
  Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
504
  first unpad the input, then computes the attention scores and pad the final attention scores.
505
+
506
  Args:
507
  query_states (`torch.Tensor`):
508
  Input query states to be passed to Flash Attention API
 
683
 
684
  kv_seq_len = key_states.shape[-2]
685
  if past_key_value is not None:
686
+ # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
687
+ past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
688
+ kv_seq_len += past_len
689
  cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
690
 
691
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
 
818
  This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
819
  library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
820
  etc.)
821
+
822
  This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
823
  Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
824
  and behavior.
825
+
826
  Parameters:
827
  config ([`Qwen2Config`]):
828
  Model configuration class with all the parameters of the model. Initializing with a config file does not
 
862
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
863
  Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
864
  it.
865
+
866
  Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
867
  [`PreTrainedTokenizer.__call__`] for details.
868
+
869
  [What are input IDs?](../glossary#input-ids)
870
  attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
871
  Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
872
+
873
  - 1 for tokens that are **not masked**,
874
  - 0 for tokens that are **masked**.
875
+
876
  [What are attention masks?](../glossary#attention-mask)
877
+
878
  Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
879
  [`PreTrainedTokenizer.__call__`] for details.
880
+
881
  If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
882
  `past_key_values`).
883
+
884
  If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
885
  and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
886
  information on the default strategy.
887
+
888
  - 1 indicates the head is **not masked**,
889
  - 0 indicates the head is **masked**.
890
  position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
891
  Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
892
  config.n_positions - 1]`.
893
+
894
  [What are position IDs?](../glossary#position-ids)
895
  past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
896
  Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
897
  blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
898
  returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
899
+
900
  Two formats are allowed:
901
  - a [`~cache_utils.Cache`] instance;
902
  - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
903
  shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
904
  cache format.
905
+
906
  The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
907
  legacy cache format will be returned.
908
+
909
  If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
910
  have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
911
  of shape `(batch_size, sequence_length)`.
 
934
  class Qwen2Model(Qwen2PreTrainedModel):
935
  """
936
  Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`Qwen2DecoderLayer`]
937
+
938
  Args:
939
  config: Qwen2Config
940
  """
 
981
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
982
  )
983
  use_cache = use_cache if use_cache is not None else self.config.use_cache
 
984
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
985
 
986
  # retrieve input_ids and inputs_embeds
 
1001
  use_cache = False
1002
 
1003
  past_key_values_length = 0
1004
+ use_legacy_cache = False
1005
 
1006
  if use_cache:
1007
+ # OLD behavior (removed in HF >= 4.55): treated anything that was not a Cache as "legacy", but then
1008
+ # called legacy methods on it directly (this would crash on None or on the new Cache API).
1009
+ # use_legacy_cache = not isinstance(past_key_values, Cache)
1010
+ # if use_legacy_cache:
1011
+ # # past_key_values_length = past_key_values.get_seq_length()
1012
+ # past_key_values_length = past_key_values.get_usable_length(seq_length)
1013
+
1014
+ # NEW behavior: if a legacy tuple is passed, convert it to the new Cache API,
1015
+ # compute length via .get_seq_length(), and remember to return legacy if that's what came in.
1016
+ if past_key_values is not None and not isinstance(past_key_values, Cache):
1017
+ use_legacy_cache = True # remember input format for return
1018
  past_key_values = DynamicCache.from_legacy_cache(past_key_values)
1019
+
1020
+ if isinstance(past_key_values, Cache):
1021
+ # Layer-agnostic total length; cache_position is handled deeper if needed
1022
+ past_key_values_length = past_key_values.get_seq_length()
1023
+ else:
1024
+ # No cache given on first forward, keep length at 0
1025
+ past_key_values_length = 0
1026
 
1027
  if position_ids is None:
1028
  device = input_ids.device if input_ids is not None else inputs_embeds.device
 
1128
 
1129
  next_cache = None
1130
  if use_cache:
1131
+ # If the caller passed legacy, return legacy. Otherwise return the Cache object.
1132
+ next_cache = (
1133
+ next_decoder_cache.to_legacy_cache() if (use_legacy_cache and next_decoder_cache is not None) else next_decoder_cache
1134
+ )
1135
 
1136
  if not return_dict:
1137
  return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
 
1143
  )
1144
 
1145
 
1146
+
1147
  class Qwen2ForCausalLM(Qwen2PreTrainedModel):
1148
  _tied_weights_keys = ["lm_head.weight"]
1149
 
 
1196
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1197
  config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1198
  (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1199
+
1200
  Returns:
1201
+
1202
  Example:
1203
+
1204
  ```python
1205
  >>> from transformers import AutoTokenizer, Qwen2ForCausalLM
1206
+
1207
  >>> model = Qwen2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
1208
  >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
1209
+
1210
  >>> prompt = "Hey, are you conscious? Can you talk to me?"
1211
  >>> inputs = tokenizer(prompt, return_tensors="pt")
1212
+
1213
  >>> # Generate
1214
  >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1215
  >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
 
1271
  # Omit tokens covered by past_key_values
1272
  if past_key_values is not None:
1273
  if isinstance(past_key_values, Cache):
1274
+ # NEW API (HF >= 4.55): use Cache methods
1275
  cache_length = past_key_values.get_seq_length()
1276
+ past_length = cache_length # `seen_tokens` removed; use total seq length instead
1277
+ try:
1278
+ max_cache_length = past_key_values.get_max_cache_shape()
1279
+ except Exception:
1280
+ max_cache_length = None
1281
+
1282
+ # OLD API (deprecated/removed):
1283
+ # cache_length = past_key_values.get_seq_length()
1284
+ # past_length = past_key_values.seen_tokens
1285
+ # max_cache_length = past_key_values.get_max_length()
1286
+
1287
  else:
1288
+ # Legacy tuple format: keep computing lengths directly from tensors
1289
+ # (We keep it compatible without forcing a conversion here)
1290
  cache_length = past_length = past_key_values[0][0].shape[2]
1291
  max_cache_length = None
1292
 
1293
  # Keep only the unprocessed tokens:
1294
  # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
1295
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as input)
 
1296
  if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
1297
  input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
1298
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens.
1299
+ # We can discard input_ids based on the past_length.
1300
  elif past_length < input_ids.shape[1]:
1301
  input_ids = input_ids[:, past_length:]
1302
  # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
 
1326
  model_inputs.update(
1327
  {
1328
  "position_ids": position_ids,
1329
+ "past_key_values": past_key_values, # pass through unchanged (legacy or new Cache object)
1330
  "use_cache": kwargs.get("use_cache"),
1331
  "attention_mask": attention_mask,
1332
  }
1333
  )
1334
  return model_inputs
1335
 
1336
+
1337
  @staticmethod
1338
  def _reorder_cache(past_key_values, beam_idx):
1339
  reordered_past = ()
 
1347
  @add_start_docstrings(
1348
  """
1349
  The Qwen2 Model transformer with a sequence classification head on top (linear layer).
1350
+
1351
  [`Qwen2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1352
  (e.g. GPT-2) do.
1353
+
1354
  Since it does classification on the last token, it requires to know the position of the last token. If a
1355
  `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1356
  no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the