support transformers 4.57

#3
by xf2022 - opened
Files changed (2) hide show
  1. config.json +1 -1
  2. modeling_qwen2_flash.py +45 -7
config.json CHANGED
@@ -200,7 +200,7 @@
200
  "mm_vision_select_layer": -2,
201
  "mm_vision_tower": "internvideo2",
202
  "mm_vision_tower_lr": 2e-06,
203
- "model_type": "qwen2",
204
  "num_attention_heads": 28,
205
  "num_hidden_layers": 28,
206
  "num_key_value_heads": 4,
 
200
  "mm_vision_select_layer": -2,
201
  "mm_vision_tower": "internvideo2",
202
  "mm_vision_tower_lr": 2e-06,
203
+ "model_type": "videochat_flash_qwen",
204
  "num_attention_heads": 28,
205
  "num_hidden_layers": 28,
206
  "num_key_value_heads": 4,
modeling_qwen2_flash.py CHANGED
@@ -276,7 +276,11 @@ class Qwen2Attention(nn.Module):
276
  "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
277
  "with a layer index."
278
  )
279
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
 
 
 
 
280
  cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
281
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
282
 
@@ -379,7 +383,11 @@ class Qwen2FlashAttention2(Qwen2Attention):
379
  "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
380
  "with a layer index."
381
  )
382
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
 
 
 
 
383
 
384
  # Because the input can be padded, the absolute sequence length depends on the max position id.
385
  rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
@@ -673,7 +681,11 @@ class Qwen2SdpaAttention(Qwen2Attention):
673
 
674
  kv_seq_len = key_states.shape[-2]
675
  if past_key_value is not None:
676
- kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
 
 
 
 
677
  cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
678
 
679
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
@@ -993,7 +1005,11 @@ class Qwen2Model_Flash(Qwen2PreTrainedModel):
993
  use_legacy_cache = not isinstance(past_key_values, Cache)
994
  if use_legacy_cache:
995
  past_key_values = DynamicCache.from_legacy_cache(past_key_values)
996
- past_key_values_length = past_key_values.get_usable_length(seq_length)
 
 
 
 
997
 
998
  if position_ids is None:
999
  device = input_ids.device if input_ids is not None else inputs_embeds.device
@@ -1483,8 +1499,16 @@ class Qwen2ForCausalLM_Flash(Qwen2PreTrainedModel):
1483
  if past_key_values is not None:
1484
  if isinstance(past_key_values, Cache):
1485
  cache_length = past_key_values.get_seq_length()
1486
- past_length = past_key_values.seen_tokens
1487
- max_cache_length = past_key_values.get_max_length()
 
 
 
 
 
 
 
 
1488
  else:
1489
  cache_length = past_length = past_key_values[0][0].shape[2]
1490
  max_cache_length = None
@@ -1517,8 +1541,22 @@ class Qwen2ForCausalLM_Flash(Qwen2PreTrainedModel):
1517
  if past_key_values:
1518
  position_ids = position_ids[:, -input_ids.shape[1] :]
1519
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1520
  # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1521
- if inputs_embeds is not None and past_key_values is None:
 
1522
  model_inputs = {"inputs_embeds": inputs_embeds}
1523
  else:
1524
  model_inputs = {"input_ids": input_ids}
 
276
  "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
277
  "with a layer index."
278
  )
279
+ # get_usable_length has been removed in transformers 4.54.0
280
+ if hasattr(past_key_value, "get_usable_length"):
281
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
282
+ else:
283
+ kv_seq_len += past_key_value.get_seq_length(self.layer_idx)
284
  cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
285
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
286
 
 
383
  "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
384
  "with a layer index."
385
  )
386
+ # get_usable_length has been removed in transformers 4.54.0
387
+ if hasattr(past_key_value, "get_usable_length"):
388
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
389
+ else:
390
+ kv_seq_len += past_key_value.get_seq_length(self.layer_idx)
391
 
392
  # Because the input can be padded, the absolute sequence length depends on the max position id.
393
  rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
 
681
 
682
  kv_seq_len = key_states.shape[-2]
683
  if past_key_value is not None:
684
+ # get_usable_length has been removed in transformers 4.54.0
685
+ if hasattr(past_key_value, "get_usable_length"):
686
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
687
+ else:
688
+ kv_seq_len += past_key_value.get_seq_length(self.layer_idx)
689
  cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
690
 
691
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
 
1005
  use_legacy_cache = not isinstance(past_key_values, Cache)
1006
  if use_legacy_cache:
1007
  past_key_values = DynamicCache.from_legacy_cache(past_key_values)
1008
+ # get_usable_length has been removed in transformers 4.54.0
1009
+ if hasattr(past_key_values, "get_usable_length"):
1010
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
1011
+ else:
1012
+ past_key_values_length = past_key_values.get_seq_length()
1013
 
1014
  if position_ids is None:
1015
  device = input_ids.device if input_ids is not None else inputs_embeds.device
 
1499
  if past_key_values is not None:
1500
  if isinstance(past_key_values, Cache):
1501
  cache_length = past_key_values.get_seq_length()
1502
+ # seen_tokens property has been removed in transformers 4.54.0
1503
+ past_length = getattr(past_key_values, 'seen_tokens', cache_length)
1504
+ # get_max_length() has been replaced by get_max_cache_shape() in transformers 4.49.0
1505
+ # in transformers 4.54.0, DynamicCache returns -1 instead of None to indicate no limit
1506
+ if hasattr(past_key_values, 'get_max_cache_shape'):
1507
+ max_cache_length = past_key_values.get_max_cache_shape()
1508
+ # Convert -1 to None for consistency with old behavior
1509
+ max_cache_length = None if max_cache_length == -1 else max_cache_length
1510
+ else:
1511
+ max_cache_length = past_key_values.get_max_length()
1512
  else:
1513
  cache_length = past_length = past_key_values[0][0].shape[2]
1514
  max_cache_length = None
 
1541
  if past_key_values:
1542
  position_ids = position_ids[:, -input_ids.shape[1] :]
1543
 
1544
def is_cache_empty(past_key_values):
    """Return True when *past_key_values* holds no cached tokens yet.

    Handles the three cache representations seen across transformers
    versions:
      - ``None`` or an empty container (legacy tuple cache before the
        first generation step),
      - newer ``Cache`` objects exposing a boolean ``is_initialized``
        flag,
      - older ``Cache`` objects, checked layer by layer via
        ``get_seq_length``.
    """
    if past_key_values is None or len(past_key_values) == 0:
        return True
    # Newest transformers caches expose a boolean `is_initialized` flag.
    # NOTE(review): assumes the flag is a plain bool — `not flag` replaces
    # the original `flag == False` comparison (E712); equivalent for bools.
    if hasattr(past_key_values, "is_initialized"):
        return not past_key_values.is_initialized
    if isinstance(past_key_values, Cache):
        # Empty iff no layer has cached any tokens yet.
        return not any(
            past_key_values.get_seq_length(idx) > 0
            for idx in range(len(past_key_values.layers))
        )
    # A non-empty legacy (tuple) cache: tokens are present.
    return False
1556
+
1557
  # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1558
+ # in newer transformers versions, past_key_values can be an empty cache in the 1st generation step.
1559
+ if inputs_embeds is not None and is_cache_empty(past_key_values):
1560
  model_inputs = {"inputs_embeds": inputs_embeds}
1561
  else:
1562
  model_inputs = {"input_ids": input_ids}