Upload LLM/Florence-2-base-PromptGen-v2.0/modeling_florence2.py

Browse files

Files changed (1) hide show

LLM/Florence-2-base-PromptGen-v2.0/modeling_florence2.py +29 -8

LLM/Florence-2-base-PromptGen-v2.0/modeling_florence2.py CHANGED Viewed

@@ -29,6 +29,12 @@ from einops import rearrange
 from timm.models.layers import DropPath, trunc_normal_
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import (
     ModelOutput,
     add_start_docstrings,
@@ -812,6 +818,8 @@ class Florence2Attention(nn.Module):
         if (
             is_cross_attention
             and past_key_value is not None
             and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
             # reuse k,v, cross_attentions
@@ -821,7 +829,7 @@ class Florence2Attention(nn.Module):
             # cross_attentions
             key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
             value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
-        elif past_key_value is not None:
             # reuse k, v, self_attention
             key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
@@ -954,6 +962,8 @@ class Florence2FlashAttention2(Florence2Attention):
         if (
             is_cross_attention
             and past_key_value is not None
             and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
             # reuse k,v, cross_attentions
@@ -963,7 +973,7 @@ class Florence2FlashAttention2(Florence2Attention):
             # cross_attentions
             key_states = self._reshape(self.k_proj(key_value_states), -1, bsz)
             value_states = self._reshape(self.v_proj(key_value_states), -1, bsz)
-        elif past_key_value is not None:
             # reuse k, v, self_attention
             key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
@@ -985,7 +995,7 @@ class Florence2FlashAttention2(Florence2Attention):
             past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2))
         kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
             kv_seq_len += past_key_value[0].shape[-2]
         # In PEFT, usually we cast the layer norms in float32 for training stability reasons
@@ -1167,6 +1177,8 @@ class Florence2SdpaAttention(Florence2Attention):
         if (
             is_cross_attention
             and past_key_value is not None
             and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
             # reuse k,v, cross_attentions
@@ -1176,7 +1188,7 @@ class Florence2SdpaAttention(Florence2Attention):
             # cross_attentions
             key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
             value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
-        elif past_key_value is not None:
             # reuse k, v, self_attention
             key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
@@ -1795,7 +1807,7 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
             raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
         # past_key_values_length
-        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input)
@@ -2059,10 +2071,14 @@ class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
         )
-class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel):
     base_model_prefix = "model"
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
     _keys_to_ignore_on_load_missing = ["final_logits_bias"]
     def __init__(self, config: Florence2LanguageConfig):
         super().__init__(config)
@@ -2194,7 +2210,7 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
         **kwargs,
     ):
         # cut decoder_input_ids if past_key_values is used
-        if past_key_values is not None:
             past_length = past_key_values[0][0].shape[2]
             # Some generation methods already pass only the last input ID
@@ -2530,6 +2546,11 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
     FLORENCE2_START_DOCSTRING,
 )
 class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
     def __init__(self, config: Florence2Config):
         super().__init__(config)
         assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
@@ -2814,7 +2835,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
         **kwargs,
     ):
         # cut decoder_input_ids if past_key_values is used
-        if past_key_values is not None:
             past_length = past_key_values[0][0].shape[2]
             # Some generation methods already pass only the last input ID

 from timm.models.layers import DropPath, trunc_normal_
 from transformers.modeling_utils import PreTrainedModel
+try:
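+    # Preferred path: GenerationMixin as re-exported by the transformers.generation package (NOTE(review): the ">= 4.40.0" cutoff previously stated here is unverified)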
+    from transformers.generation import GenerationMixin
+except ImportError:
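+    # Fallback: import GenerationMixin directly from the generation.utils submodule when the package-level export is unavailable (NOTE(review): the "< 4.40.0" cutoff previously stated here is unverified)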
+    from transformers.generation.utils import GenerationMixin
 from transformers.utils import (
     ModelOutput,
     add_start_docstrings,
         if (
             is_cross_attention
             and past_key_value is not None
+            and past_key_value[0] is not None
+            and past_key_value[1] is not None
             and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
             # reuse k,v, cross_attentions
             # cross_attentions
             key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
             value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None and past_key_value[0] is not None and past_key_value[1] is not None:
             # reuse k, v, self_attention
             key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
         if (
             is_cross_attention
             and past_key_value is not None
+            and past_key_value[0] is not None
+            and past_key_value[1] is not None
             and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
             # reuse k,v, cross_attentions
             # cross_attentions
             key_states = self._reshape(self.k_proj(key_value_states), -1, bsz)
             value_states = self._reshape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None and past_key_value[0] is not None and past_key_value[1] is not None:
             # reuse k, v, self_attention
             key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
             past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2))
         kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None and past_key_value[0] is not None:
             kv_seq_len += past_key_value[0].shape[-2]
         # In PEFT, usually we cast the layer norms in float32 for training stability reasons
         if (
             is_cross_attention
             and past_key_value is not None
+            and past_key_value[0] is not None
+            and past_key_value[1] is not None
             and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
             # reuse k,v, cross_attentions
             # cross_attentions
             key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
             value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None and past_key_value[0] is not None and past_key_value[1] is not None:
             # reuse k, v, self_attention
             key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
             value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
             raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
         # past_key_values_length
+        past_key_values_length = past_key_values[0][0].shape[2] if (past_key_values is not None and past_key_values[0] is not None and past_key_values[0][0] is not None) else 0
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input)
         )
+class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel, GenerationMixin):
     base_model_prefix = "model"
     _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
     _keys_to_ignore_on_load_missing = ["final_logits_bias"]
+    # Attention-backend capability flags for newer transformers versions: SDPA (incl. 4D causal masks) enabled, FlashAttention-2 disabled
+    _supports_sdpa = True
+    _supports_flash_attn_2 = False
+    _supports_sdpa_4d_causal_mask = True
     def __init__(self, config: Florence2LanguageConfig):
         super().__init__(config)
         **kwargs,
     ):
         # cut decoder_input_ids if past_key_values is used
+        if past_key_values is not None and past_key_values[0] is not None and past_key_values[0][0] is not None:
             past_length = past_key_values[0][0].shape[2]
             # Some generation methods already pass only the last input ID
     FLORENCE2_START_DOCSTRING,
 )
 class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
+    # Attention-backend capability flags for newer transformers versions: SDPA (incl. 4D causal masks) enabled, FlashAttention-2 disabled
+    _supports_sdpa = True
+    _supports_flash_attn_2 = False
+    _supports_sdpa_4d_causal_mask = True
     def __init__(self, config: Florence2Config):
         super().__init__(config)
         assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
         **kwargs,
     ):
         # cut decoder_input_ids if past_key_values is used
+        if past_key_values is not None and past_key_values[0] is not None and past_key_values[0][0] is not None:
             past_length = past_key_values[0][0].shape[2]
             # Some generation methods already pass only the last input ID