Spaces:

Thouph
/

Furrence-2-Large-Demo

Running

App Files Community

Fix demo

by Koda36 - opened Dec 1, 2025

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

+307

-53748

Files changed (6) hide show

app.py +19 -30
florence2_implementation/configuration_florence2.py +4 -8
florence2_implementation/modeling_florence2.py +274 -278
requirements.txt +7 -6
tag_implications-2024-05-05.csv +0 -0
tags-2024-05-05.csv → tags-2025-11-25.csv.gz +2 -2

app.py CHANGED Viewed

@@ -1,24 +1,25 @@
 import json
-from collections import defaultdict
 import safetensors
 import timm
-from transformers import AutoProcessor
-import gradio as gr
 import torch
-import time
-from florence2_implementation.modeling_florence2 import Florence2ForConditionalGeneration
-from torchvision.transforms import InterpolationMode
-from PIL import Image
 import torchvision.transforms.functional as TF
 from torchvision.transforms import transforms
-import random
-import csv
-import os
 torch.set_grad_enabled(False)
 # HF now (Feb 20, 2025) imposes a storage limit of 1GB. Will have to pull JTP from other places.
-os.system("wget -nv https://huggingface.co/RedRocket/JointTaggerProject/resolve/main/JTP_PILOT2/JTP_PILOT2-e3-vit_so400m_patch14_siglip_384.safetensors")
 category_id_to_str = {
@@ -34,16 +35,12 @@ class Pruner:
     def __init__(self, path_to_tag_list_csv):
         species_tags = set()
         allowed_tags = set()
-        with open(path_to_tag_list_csv, "r") as f:
-            reader = csv.reader(f)
-            header = next(reader)
-            name_index = header.index("name")
-            category_index = header.index("category")
-            post_count_index = header.index("post_count")
             for row in reader:
-                if int(row[post_count_index]) > 20:
-                    category = row[category_index]
-                    name = row[name_index]
                     if category == "5":
                         species_tags.add(name)
                         allowed_tags.add(name)
@@ -198,13 +195,6 @@ model = Florence2ForConditionalGeneration.from_pretrained(model_id,).eval()
 processor = AutoProcessor.from_pretrained("./florence2_implementation/", trust_remote_code=True)
-tree = defaultdict(list)
-with open('tag_implications-2024-05-05.csv', 'rt') as csvfile:
-    reader = csv.DictReader(csvfile)
-    for row in reader:
-        if row["status"] == "active":
-            tree[row["consequent_name"]].append(row["antecedent_name"])
 title = """<h1 align="center">Furrence2 Captioner Demo</h1>"""
 description=(
@@ -237,10 +227,9 @@ allowed_tags = list(tags.keys())
 for idx, tag in enumerate(allowed_tags):
     allowed_tags[idx] = tag
-pruner = Pruner("tags-2024-05-05.csv")
 def generate_prompt(image, expected_caption_length):
-    global THRESHOLD, tree, tokenizer, model, tagger_model, tagger_transform
     tagger_input = tagger_transform(image.convert('RGBA')).unsqueeze(0)
     probabilities = tagger_model(tagger_input)
     for prob in probabilities:
@@ -319,7 +308,7 @@ def main():
                         value="Caption it!", interactive=True, variant="primary",
                     )
-                    caption_output = gr.Textbox(lines=1, label="Caption Output")
                     caption_button.click(
                         inference_caption,
                         [

+import csv
+import gzip
 import json
+import random
+import time
+import gradio as gr
 import safetensors
 import timm
 import torch
 import torchvision.transforms.functional as TF
+from PIL import Image
+from torchvision.transforms import InterpolationMode
 from torchvision.transforms import transforms
+from transformers import AutoProcessor
+from florence2_implementation.modeling_florence2 import Florence2ForConditionalGeneration
 torch.set_grad_enabled(False)
 # HF now (Feb 20, 2025) imposes a storage limit of 1GB. Will have to pull JTP from other places.
+# os.system("wget -nv https://huggingface.co/RedRocket/JointTaggerProject/resolve/main/JTP_PILOT2/JTP_PILOT2-e3-vit_so400m_patch14_siglip_384.safetensors")
 category_id_to_str = {
     def __init__(self, path_to_tag_list_csv):
         species_tags = set()
         allowed_tags = set()
+        with gzip.open(path_to_tag_list_csv, mode="rt", encoding="utf8") as csv_file:
+            reader = csv.DictReader(csv_file)
             for row in reader:
+                if int(row["post_count"]) > 20:
+                    category = row["category"]
+                    name = row["name"]
                     if category == "5":
                         species_tags.add(name)
                         allowed_tags.add(name)
 processor = AutoProcessor.from_pretrained("./florence2_implementation/", trust_remote_code=True)
 title = """<h1 align="center">Furrence2 Captioner Demo</h1>"""
 description=(
 for idx, tag in enumerate(allowed_tags):
     allowed_tags[idx] = tag
+pruner = Pruner("tags-2025-11-25.csv.gz")
 def generate_prompt(image, expected_caption_length):
     tagger_input = tagger_transform(image.convert('RGBA')).unsqueeze(0)
     probabilities = tagger_model(tagger_input)
     for prob in probabilities:
                         value="Caption it!", interactive=True, variant="primary",
                     )
+                    caption_output = gr.Textbox(lines=3, label="Caption Output")
                     caption_button.click(
                         inference_caption,
                         [

florence2_implementation/configuration_florence2.py CHANGED Viewed

@@ -14,9 +14,6 @@
 import warnings
 """ Florence-2 configuration"""
-from typing import Optional
-from transformers import AutoConfig
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
@@ -77,7 +74,7 @@ class Florence2VisionConfig(PretrainedConfig):
     >>> configuration = model.config
     ```"""
-    model_type = "florence2_vision"
     keys_to_ignore_at_inference = ["past_key_values"]
     def __init__(
@@ -118,7 +115,6 @@ class Florence2VisionConfig(PretrainedConfig):
         super().__init__(**kwargs)
 class Florence2LanguageConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
@@ -272,7 +268,7 @@ class Florence2LanguageConfig(PretrainedConfig):
 class Florence2Config(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate a
-    Florence-2 model according to the specified arguments, defining the model architecture.
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
@@ -281,7 +277,7 @@ class Florence2Config(PretrainedConfig):
         vision_config (`Florence2VisionConfig`,  *optional*):
             Custom vision config or dict
         text_config (`Union[AutoConfig, dict]`, *optional*):
-            The config object of the text backbone.
         ignore_index (`int`, *optional*, defaults to -100):
             The ignore index for the loss function.
         vocab_size (`int`, *optional*, defaults to 51289):
@@ -327,7 +323,7 @@ class Florence2Config(PretrainedConfig):
         self.vocab_size = vocab_size
         self.projection_dim = projection_dim
         if vision_config is not None:
-            vision_config = PretrainedConfig(**vision_config)
         self.vision_config = vision_config
         self.vocab_size = self.vocab_size

 import warnings
 """ Florence-2 configuration"""
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
     >>> configuration = model.config
     ```"""
+    model_type = "davit"
     keys_to_ignore_at_inference = ["past_key_values"]
     def __init__(
         super().__init__(**kwargs)
 class Florence2LanguageConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
 class Florence2Config(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate a
+    Florence-2 model according to the specified arguments, defining the model architecture.
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
     documentation from [`PretrainedConfig`] for more information.
         vision_config (`Florence2VisionConfig`,  *optional*):
             Custom vision config or dict
         text_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object of the text backbone.
         ignore_index (`int`, *optional*, defaults to -100):
             The ignore index for the loss function.
         vocab_size (`int`, *optional*, defaults to 51289):
         self.vocab_size = vocab_size
         self.projection_dim = projection_dim
         if vision_config is not None:
+            vision_config = Florence2VisionConfig(**vision_config)
         self.vision_config = vision_config
         self.vocab_size = self.vocab_size

florence2_implementation/modeling_florence2.py CHANGED Viewed

@@ -23,10 +23,10 @@ import torch.utils.checkpoint
 from torch import nn
 import torch.nn.functional as F
 import torch.utils.checkpoint as checkpoint
-from torch.nn import CrossEntropyLoss
 from collections import OrderedDict
 from einops import rearrange
-from timm.models.layers import DropPath, trunc_normal_
 from transformers.modeling_utils import PreTrainedModel
 from transformers.generation.utils import GenerationMixin
@@ -34,17 +34,15 @@ from transformers.utils import (
     ModelOutput,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
-    is_flash_attn_2_available,
     logging,
     replace_return_docstrings,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
 )
-from .configuration_florence2 import Florence2Config
 from .configuration_florence2 import Florence2LanguageConfig
 from .configuration_florence2 import Florence2VisionConfig
 from transformers.activations import ACT2FN
 from transformers.modeling_attn_mask_utils import (
     _prepare_4d_attention_mask,
@@ -59,7 +57,6 @@ from transformers.modeling_outputs import (
     Seq2SeqModelOutput,
 )
 if is_flash_attn_2_available():
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
@@ -67,6 +64,7 @@ logger = logging.get_logger(__name__)
 _CONFIG_FOR_DOC = "Florence2Config"
 class LearnedAbsolutePositionEmbedding2D(nn.Module):
     """
     This module learns positional embeddings up to a fixed maximum size.
@@ -79,7 +77,7 @@ class LearnedAbsolutePositionEmbedding2D(nn.Module):
     def forward(self, pixel_values):
         """
-        pixel_values: (batch_size, height, width, num_channels)
         returns: (batch_size, height, width, embedding_dim * 2)
         """
         if len(pixel_values.shape) != 4:
@@ -100,6 +98,7 @@ class LearnedAbsolutePositionEmbedding2D(nn.Module):
         pos = pos.permute(0, 2, 3, 1)
         return pos
 class PositionalEmbeddingCosine1D(nn.Module):
     """
     This class implements a very simple positional encoding. It follows closely
@@ -111,6 +110,7 @@ class PositionalEmbeddingCosine1D(nn.Module):
         dropout_prob: The dropout probability.
         max_seq_len: The maximum length to precompute the positional encodings.
     """
     def __init__(
             self,
             embed_dim: int = 512,
@@ -126,7 +126,7 @@ class PositionalEmbeddingCosine1D(nn.Module):
         # of the position index (i.e., the row index).
         frequencies = \
             torch.arange(0, self.max_seq_len) \
-            .reshape(self.max_seq_len, 1) * denominator
         pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim))
         # Populate uneven entries.
         pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
@@ -166,6 +166,7 @@ class LearnedAbsolutePositionEmbedding1D(nn.Module):
         embed_dim: The dimension of the embeddings.
         max_seq_len: The maximum length to precompute the positional encodings.
     """
     def __init__(
             self,
             embedding_dim: int = 512,
@@ -199,7 +200,6 @@ class LearnedAbsolutePositionEmbedding1D(nn.Module):
         return pos_embeds
 class MySequential(nn.Sequential):
     def forward(self, *inputs):
         for module in self._modules.values():
@@ -234,11 +234,11 @@ class PreNorm(nn.Module):
 class Mlp(nn.Module):
     def __init__(
-        self,
-        in_features,
-        hidden_features=None,
-        out_features=None,
-        act_layer=nn.GELU,
     ):
         super().__init__()
         out_features = out_features or in_features
@@ -255,12 +255,12 @@ class Mlp(nn.Module):
 class DepthWiseConv2d(nn.Module):
     def __init__(
-        self,
-        dim_in,
-        kernel_size,
-        padding,
-        stride,
-        bias=True,
     ):
         super().__init__()
         self.dw = nn.Conv2d(
@@ -288,14 +288,14 @@ class ConvEmbed(nn.Module):
     """
     def __init__(
-        self,
-        patch_size=7,
-        in_chans=3,
-        embed_dim=64,
-        stride=4,
-        padding=2,
-        norm_layer=None,
-        pre_norm=True
     ):
         super().__init__()
         self.patch_size = patch_size
@@ -374,7 +374,7 @@ class ChannelBlock(nn.Module):
         self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
         self.ffn = PreNorm(
             norm_layer(dim),
-            Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
             drop_path
         )
@@ -398,9 +398,9 @@ def window_partition(x, window_size: int):
 def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
-    B = batch_size
     # this will cause onnx conversion failed for dynamic axis, because treated as constant
-    # int(windows.shape[0] / (H * W / window_size / window_size))
     x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
     x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
     return x
@@ -408,7 +408,6 @@ def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
 class WindowAttention(nn.Module):
     def __init__(self, dim, num_heads, window_size, qkv_bias=True):
         super().__init__()
         self.dim = dim
         self.window_size = window_size
@@ -422,7 +421,6 @@ class WindowAttention(nn.Module):
         self.softmax = nn.Softmax(dim=-1)
     def forward(self, x, size):
         H, W = size
         B, L, C = x.shape
         assert L == H * W, "input feature has wrong size"
@@ -484,7 +482,7 @@ class SpatialBlock(nn.Module):
         self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
         self.ffn = PreNorm(
             norm_layer(dim),
-            Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
             drop_path
         )
@@ -523,26 +521,26 @@ class DaViT(nn.Module):
     """
     def __init__(
-        self,
-        in_chans=3,
-        num_classes=1000,
-        depths=(1, 1, 3, 1),
-        patch_size=(7, 2, 2, 2),
-        patch_stride=(4, 2, 2, 2),
-        patch_padding=(3, 0, 0, 0),
-        patch_prenorm=(False, False, False, False),
-        embed_dims=(64, 128, 192, 256),
-        num_heads=(3, 6, 12, 24),
-        num_groups=(3, 6, 12, 24),
-        window_size=7,
-        mlp_ratio=4.,
-        qkv_bias=True,
-        drop_path_rate=0.1,
-        norm_layer=nn.LayerNorm,
-        enable_checkpoint=False,
-        conv_at_attn=True,
-        conv_at_ffn=True,
-     ):
         super().__init__()
         self.num_classes = num_classes
@@ -554,7 +552,7 @@ class DaViT(nn.Module):
         assert self.num_stages == len(self.num_heads) == len(self.num_groups)
         num_stages = len(embed_dims)
-        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)*2)]
         depth_offset = 0
         convs = []
@@ -576,32 +574,32 @@ class DaViT(nn.Module):
                     MySequential(OrderedDict([
                         (
                             'spatial_block', SpatialBlock(
-                                embed_dims[i],
-                                num_heads[i],
-                                window_size,
-                                drop_path_rate=dpr[depth_offset+j*2],
-                                qkv_bias=qkv_bias,
-                                mlp_ratio=mlp_ratio,
-                                conv_at_attn=conv_at_attn,
-                                conv_at_ffn=conv_at_ffn,
-                            )
                         ),
                         (
                             'channel_block', ChannelBlock(
-                                embed_dims[i],
-                                num_groups[i],
-                                drop_path_rate=dpr[depth_offset+j*2+1],
-                                qkv_bias=qkv_bias,
-                                mlp_ratio=mlp_ratio,
-                                conv_at_attn=conv_at_attn,
-                                conv_at_ffn=conv_at_ffn,
-                            )
                         )
                     ])) for j in range(depths[i])
                 ]
             )
             blocks.append(block)
-            depth_offset += depths[i]*2
         self.convs = nn.ModuleList(convs)
         self.blocks = nn.ModuleList(blocks)
@@ -610,32 +608,13 @@ class DaViT(nn.Module):
         self.avgpool = nn.AdaptiveAvgPool1d(1)
         self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
-        self.apply(self._init_weights)
     @property
     def dim_out(self):
         return self.embed_dims[-1]
-    def _init_weights(self, m):
-        if isinstance(m, nn.Linear):
-            trunc_normal_(m.weight, std=0.02)
-            if m.bias is not None:
-                nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.Conv2d):
-            nn.init.normal_(m.weight, std=0.02)
-            for name, _ in m.named_parameters():
-                if name in ['bias']:
-                    nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.LayerNorm):
-            nn.init.constant_(m.weight, 1.0)
-            nn.init.constant_(m.bias, 0)
-        elif isinstance(m, nn.BatchNorm2d):
-            nn.init.constant_(m.weight, 1.0)
-            nn.init.constant_(m.bias, 0)
     def forward_features_unpool(self, x):
         """
-        forward until avg pooling
         Args:
             x (_type_): input image tensor
         """
@@ -663,7 +642,7 @@ class DaViT(nn.Module):
         x = self.forward_features(x)
         x = self.head(x)
         return x
     @classmethod
     def from_config(cls, config):
         return cls(
@@ -680,12 +659,11 @@ class DaViT(nn.Module):
         )
 if is_flash_attn_2_available():
     from flash_attn import flash_attn_func, flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 def _get_unpad_data(attention_mask):
     seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
@@ -754,14 +732,14 @@ class Florence2Attention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
     def __init__(
-        self,
-        embed_dim: int,
-        num_heads: int,
-        dropout: float = 0.0,
-        is_decoder: bool = False,
-        bias: bool = True,
-        is_causal: bool = False,
-        config: Optional[Florence2LanguageConfig] = None,
     ):
         super().__init__()
         self.embed_dim = embed_dim
@@ -775,7 +753,7 @@ class Florence2Attention(nn.Module):
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
-        self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
         self.is_causal = is_causal
@@ -788,13 +766,13 @@ class Florence2Attention(nn.Module):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
     def forward(
-        self,
-        hidden_states: torch.Tensor,
-        key_value_states: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        output_attentions: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         """Input shape: Batch x Time x Channel"""
@@ -811,9 +789,9 @@ class Florence2Attention(nn.Module):
         # is checking that the `sequence_length` of the `past_key_value` is the same as
         # the provided `key_value_states` to support prefix tuning
         if (
-            is_cross_attention
-            and past_key_value is not None
-            and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
             # reuse k,v, cross_attentions
             key_states = past_key_value[0]
@@ -928,13 +906,13 @@ class Florence2FlashAttention2(Florence2Attention):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
     def forward(
-        self,
-        hidden_states: torch.Tensor,
-        key_value_states: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        output_attentions: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         # Florence2FlashAttention2 attention does not support output_attentions
         if output_attentions:
@@ -953,9 +931,9 @@ class Florence2FlashAttention2(Florence2Attention):
         # is checking that the `sequence_length` of the `past_key_value` is the same as
         # the provided `key_value_states` to support prefix tuning
         if (
-            is_cross_attention
-            and past_key_value is not None
-            and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
             # reuse k,v, cross_attentions
             key_states = past_key_value[0].transpose(1, 2)
@@ -1029,7 +1007,7 @@ class Florence2FlashAttention2(Florence2Attention):
     # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
     def _flash_attention_forward(
-        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
     ):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
@@ -1129,13 +1107,13 @@ class Florence2FlashAttention2(Florence2Attention):
 class Florence2SdpaAttention(Florence2Attention):
     def forward(
-        self,
-        hidden_states: torch.Tensor,
-        key_value_states: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        output_attentions: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         """Input shape: Batch x Time x Channel"""
         if output_attentions or layer_head_mask is not None:
@@ -1166,9 +1144,9 @@ class Florence2SdpaAttention(Florence2Attention):
         # is checking that the `sequence_length` of the `past_key_value` is the same as
         # the provided `key_value_states` to support prefix tuning
         if (
-            is_cross_attention
-            and past_key_value is not None
-            and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
             # reuse k,v, cross_attentions
             key_states = past_key_value[0]
@@ -1260,11 +1238,11 @@ class Florence2EncoderLayer(nn.Module):
         self.final_layer_norm = nn.LayerNorm(self.embed_dim)
     def forward(
-        self,
-        hidden_states: torch.FloatTensor,
-        attention_mask: torch.FloatTensor,
-        layer_head_mask: torch.FloatTensor,
-        output_attentions: Optional[bool] = False,
     ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
         """
         Args:
@@ -1297,7 +1275,7 @@ class Florence2EncoderLayer(nn.Module):
         hidden_states = self.final_layer_norm(hidden_states)
         if hidden_states.dtype == torch.float16 and (
-            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
         ):
             clamp_value = torch.finfo(hidden_states.dtype).max - 1000
             hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
@@ -1341,16 +1319,16 @@ class Florence2DecoderLayer(nn.Module):
         self.final_layer_norm = nn.LayerNorm(self.embed_dim)
     def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = True,
     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
         """
         Args:
@@ -1430,7 +1408,6 @@ class Florence2DecoderLayer(nn.Module):
         return outputs
 class Florence2LanguagePreTrainedModel(PreTrainedModel):
     config_class = Florence2LanguageConfig
     base_model_prefix = "model"
@@ -1451,6 +1428,17 @@ class Florence2LanguagePreTrainedModel(PreTrainedModel):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
     @property
     def dummy_inputs(self):
@@ -1511,14 +1499,14 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
         self.embed_tokens = value
     def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutput]:
         r"""
         Args:
@@ -1696,19 +1684,19 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
         self.embed_tokens = value
     def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        encoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
         r"""
         Args:
@@ -1973,22 +1961,22 @@ class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
         return self.decoder
     def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        decoder_input_ids: Optional[torch.LongTensor] = None,
-        decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
     ) -> Union[Tuple, Seq2SeqModelOutput]:
         # different to other models, Florence2 automatically creates decoder_input_ids from
         # input_ids if no decoder_input_ids are provided
@@ -2074,14 +2062,21 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
         # Initialize weights and apply final processing
         self.post_init()
     def get_encoder(self):
         return self.model.get_encoder()
     def get_decoder(self):
         return self.model.get_decoder()
-    def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
-        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
         self._resize_final_logits_bias(new_embeddings.weight.shape[0])
         return new_embeddings
@@ -2101,23 +2096,23 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
         self.lm_head = new_embeddings
     def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        decoder_input_ids: Optional[torch.LongTensor] = None,
-        decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
     ) -> Union[Tuple, Seq2SeqLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -2182,17 +2177,17 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
         )
     def prepare_inputs_for_generation(
-        self,
-        decoder_input_ids,
-        past_key_values=None,
-        attention_mask=None,
-        decoder_attention_mask=None,
-        head_mask=None,
-        decoder_head_mask=None,
-        cross_attn_head_mask=None,
-        use_cache=None,
-        encoder_outputs=None,
-        **kwargs,
     ):
         # cut decoder_input_ids if past_key_values is used
         if past_key_values is not None:
@@ -2234,6 +2229,7 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
             )
         return reordered_past
 @dataclass
 class Florence2Seq2SeqLMOutput(ModelOutput):
     """
@@ -2415,6 +2411,7 @@ FLORENCE2_INPUTS_DOCSTRING = r"""
             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
 """
 @add_start_docstrings(
     """The FLORENCE2 vision model without any head""",
     FLORENCE2_START_DOCSTRING,
@@ -2426,7 +2423,7 @@ class Florence2VisionModel(Florence2PreTrainedModel):
         self.vision_tower = DaViT.from_config(config=config)
         self.post_init()
     def forward(self, pixel_values):
         if len(pixel_values.shape) == 4:
             x = self.vision_tower.forward_features_unpool(pixel_values)
@@ -2448,7 +2445,7 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
         self._build_image_projection_layers(config)
         self.post_init()
     def _build_image_projection_layers(self, config):
         image_dim_out = config.dim_embed[-1]
         dim_projection = config.projection_dim
@@ -2484,7 +2481,7 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
             x = self.vision_tower.forward_features_unpool(pixel_values)
         else:
             raise ValueError(f'invalid image shape {pixel_values.shape}')
         if self.image_pos_embed is not None:
             x = x.view(batch_size * T, -1, x.shape[-1])
             num_tokens = x.shape[-2]
@@ -2493,7 +2490,7 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
             x = x.view(batch_size * T, h, w, x.shape[-1])
             pos_embed = self.image_pos_embed(x)
             x = x + pos_embed
-            x = x.view(batch_size, T * h*w, x.shape[-1])
         if self.visual_temporal_embed is not None:
             visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
@@ -2521,21 +2518,22 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
         x = x @ self.image_projection
         x = self.image_proj_norm(x)
         return x
 @add_start_docstrings(
     """The FLORENCE2 model which consists of a vision backbone and a language model.""",
     FLORENCE2_START_DOCSTRING,
 )
-class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixin):
     def __init__(self, config: Florence2Config):
         super().__init__(config)
         assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
         self.vision_tower = DaViT.from_config(config=config.vision_config)
-        # remove unused layers
         del self.vision_tower.head
         del self.vision_tower.norms
@@ -2545,13 +2543,11 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
         language_model = Florence2LanguageForConditionalGeneration(config=config.text_config)
-        if language_model._tied_weights_keys is not None:
-            self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
         self.language_model = language_model
         self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
         self.post_init()
     def _build_image_projection_layers(self, config):
         image_dim_out = config.vision_config.dim_embed[-1]
         dim_projection = config.vision_config.projection_dim
@@ -2589,14 +2585,15 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
     def get_input_embeddings(self):
         """Return the input embeddings reported by the wrapped ``self.language_model``."""
         return self.language_model.get_input_embeddings()
-    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
-        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
         # update vocab size
         self.config.text_config.vocab_size = model_embeds.num_embeddings
         self.config.vocab_size = model_embeds.num_embeddings
         self.vocab_size = model_embeds.num_embeddings
         return model_embeds
     def _encode_image(self, pixel_values):
         if len(pixel_values.shape) == 4:
             batch_size, C, H, W = pixel_values.shape
@@ -2604,7 +2601,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
             x = self.vision_tower.forward_features_unpool(pixel_values)
         else:
             raise ValueError(f'invalid image shape {pixel_values.shape}')
         if self.image_pos_embed is not None:
             x = x.view(batch_size * T, -1, x.shape[-1])
             num_tokens = x.shape[-2]
@@ -2613,7 +2610,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
             x = x.view(batch_size * T, h, w, x.shape[-1])
             pos_embed = self.image_pos_embed(x)
             x = x + pos_embed
-            x = x.view(batch_size, T * h*w, x.shape[-1])
         if self.visual_temporal_embed is not None:
             visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
@@ -2641,10 +2638,10 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
         x = x @ self.image_projection
         x = self.image_proj_norm(x)
-        return x
     def _merge_input_ids_with_image_features(
-        self, image_features, inputs_embeds
     ):
         batch_size, image_token_length = image_features.size()[:-1]
         device = image_features.device
@@ -2667,28 +2664,27 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
         return inputs_embeds, attention_mask
     @add_start_docstrings_to_model_forward(FLORENCE2_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Florence2Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        pixel_values: torch.FloatTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        decoder_input_ids: Optional[torch.LongTensor] = None,
-        decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
     ) -> Union[Tuple, Florence2Seq2SeqLMOutput]:
         r"""
         Args:
@@ -2778,12 +2774,12 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
         )
     def generate(
-        self,
-        input_ids,
-        inputs_embeds=None,
-        pixel_values=None,
-        **kwargs
-        ):
         if inputs_embeds is None:
             # 1. Extract the input embeddings
@@ -2793,7 +2789,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
             if pixel_values is not None:
                 image_features = self._encode_image(pixel_values)
                 inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
         return self.language_model.generate(
             input_ids=None,
             inputs_embeds=inputs_embeds,
@@ -2801,18 +2797,18 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
         )
     def prepare_inputs_for_generation(
-        self,
-        decoder_input_ids,
-        past_key_values=None,
-        attention_mask=None,
-        pixel_values=None,
-        decoder_attention_mask=None,
-        head_mask=None,
-        decoder_head_mask=None,
-        cross_attn_head_mask=None,
-        use_cache=None,
-        encoder_outputs=None,
-        **kwargs,
     ):
         # cut decoder_input_ids if past_key_values is used
         if past_key_values is not None:
@@ -2826,7 +2822,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
                 remove_prefix_length = decoder_input_ids.shape[1] - 1
             decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
         return {
             "input_ids": None,  # encoder_outputs is defined. input_ids not needed
             "encoder_outputs": encoder_outputs,
@@ -2840,7 +2836,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
             "cross_attn_head_mask": cross_attn_head_mask,
             "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
         }
     def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
         """Build decoder input ids from ``labels``.

         Delegates to ``self.language_model.shift_tokens_right(labels)`` and returns
         its result unchanged. The exact shifting rule is defined by that helper,
         which is not visible here.
         """
         return self.language_model.shift_tokens_right(labels)

 from torch import nn
 import torch.nn.functional as F
 import torch.utils.checkpoint as checkpoint
+from torch.nn import CrossEntropyLoss
 from collections import OrderedDict
 from einops import rearrange
+from timm.layers import DropPath
 from transformers.modeling_utils import PreTrainedModel
 from transformers.generation.utils import GenerationMixin
     ModelOutput,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     logging,
     replace_return_docstrings,
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
 )
+from .configuration_florence2 import Florence2Config
 from .configuration_florence2 import Florence2LanguageConfig
 from .configuration_florence2 import Florence2VisionConfig
 from transformers.activations import ACT2FN
 from transformers.modeling_attn_mask_utils import (
     _prepare_4d_attention_mask,
     Seq2SeqModelOutput,
 )
 if is_flash_attn_2_available():
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
 _CONFIG_FOR_DOC = "Florence2Config"
 class LearnedAbsolutePositionEmbedding2D(nn.Module):
     """
     This module learns positional embeddings up to a fixed maximum size.
     def forward(self, pixel_values):
         """
+        pixel_values: (batch_size, height, width, num_channels)
         returns: (batch_size, height, width, embedding_dim * 2)
         """
         if len(pixel_values.shape) != 4:
         pos = pos.permute(0, 2, 3, 1)
         return pos
 class PositionalEmbeddingCosine1D(nn.Module):
     """
     This class implements a very simple positional encoding. It follows closely
         dropout_prob: The dropout probability.
         max_seq_len: The maximum length to precompute the positional encodings.
     """
     def __init__(
             self,
             embed_dim: int = 512,
         # of the position index (i.e., the row index).
         frequencies = \
             torch.arange(0, self.max_seq_len) \
+                .reshape(self.max_seq_len, 1) * denominator
         pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim))
         # Populate the even-indexed columns (0, 2, 4, ...) with sine values.
         pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
         embed_dim: The dimension of the embeddings.
         max_seq_len: The maximum length to precompute the positional encodings.
     """
     def __init__(
             self,
             embedding_dim: int = 512,
         return pos_embeds
 class MySequential(nn.Sequential):
     def forward(self, *inputs):
         for module in self._modules.values():
 class Mlp(nn.Module):
     def __init__(
+            self,
+            in_features,
+            hidden_features=None,
+            out_features=None,
+            act_layer=nn.GELU,
     ):
         super().__init__()
         out_features = out_features or in_features
 class DepthWiseConv2d(nn.Module):
     def __init__(
+            self,
+            dim_in,
+            kernel_size,
+            padding,
+            stride,
+            bias=True,
     ):
         super().__init__()
         self.dw = nn.Conv2d(
     """
     def __init__(
+            self,
+            patch_size=7,
+            in_chans=3,
+            embed_dim=64,
+            stride=4,
+            padding=2,
+            norm_layer=None,
+            pre_norm=True
     ):
         super().__init__()
         self.patch_size = patch_size
         self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
         self.ffn = PreNorm(
             norm_layer(dim),
+            Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer),
             drop_path
         )
 def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
     # Reassemble window-partitioned features into one (batch_size, H, W, C) tensor.
     #
     # windows:     tensor that can be viewed as (batch_size, H // window_size,
     #              W // window_size, window_size, window_size, C); presumably the
     #              output of the matching window-partition step -- TODO confirm
     #              against the caller.
     # batch_size:  size of the leading dimension of the result.
     # window_size: side length of each square window.
     # H, W:        height and width of the result; assumed to be multiples of
     #              window_size since they are floor-divided below -- TODO confirm.
+    B = batch_size
     # batch_size is passed in instead of being derived from the tensor shape: the
     # expression below is treated as a constant during ONNX conversion, which makes
     # the conversion fail for dynamic axes.
+    # int(windows.shape[0] / (H * W / window_size / window_size))
     # Expose the grid of windows: (B, window rows, window cols, ws, ws, C).
     x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
     # Put each window-row axis next to its in-window row axis (same for columns),
     # then flatten each pair into the full H and W axes.
     x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
     return x
 class WindowAttention(nn.Module):
     def __init__(self, dim, num_heads, window_size, qkv_bias=True):
         super().__init__()
         self.dim = dim
         self.window_size = window_size
         self.softmax = nn.Softmax(dim=-1)
     def forward(self, x, size):
         H, W = size
         B, L, C = x.shape
         assert L == H * W, "input feature has wrong size"
         self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
         self.ffn = PreNorm(
             norm_layer(dim),
+            Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer),
             drop_path
         )
     """
     def __init__(
+            self,
+            in_chans=3,
+            num_classes=1000,
+            depths=(1, 1, 3, 1),
+            patch_size=(7, 2, 2, 2),
+            patch_stride=(4, 2, 2, 2),
+            patch_padding=(3, 0, 0, 0),
+            patch_prenorm=(False, False, False, False),
+            embed_dims=(64, 128, 192, 256),
+            num_heads=(3, 6, 12, 24),
+            num_groups=(3, 6, 12, 24),
+            window_size=7,
+            mlp_ratio=4.,
+            qkv_bias=True,
+            drop_path_rate=0.1,
+            norm_layer=nn.LayerNorm,
+            enable_checkpoint=False,
+            conv_at_attn=True,
+            conv_at_ffn=True,
+    ):
         super().__init__()
         self.num_classes = num_classes
         assert self.num_stages == len(self.num_heads) == len(self.num_groups)
         num_stages = len(embed_dims)
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths) * 2)]
         depth_offset = 0
         convs = []
                     MySequential(OrderedDict([
                         (
                             'spatial_block', SpatialBlock(
+                            embed_dims[i],
+                            num_heads[i],
+                            window_size,
+                            drop_path_rate=dpr[depth_offset + j * 2],
+                            qkv_bias=qkv_bias,
+                            mlp_ratio=mlp_ratio,
+                            conv_at_attn=conv_at_attn,
+                            conv_at_ffn=conv_at_ffn,
+                        )
                         ),
                         (
                             'channel_block', ChannelBlock(
+                            embed_dims[i],
+                            num_groups[i],
+                            drop_path_rate=dpr[depth_offset + j * 2 + 1],
+                            qkv_bias=qkv_bias,
+                            mlp_ratio=mlp_ratio,
+                            conv_at_attn=conv_at_attn,
+                            conv_at_ffn=conv_at_ffn,
+                        )
                         )
                     ])) for j in range(depths[i])
                 ]
             )
             blocks.append(block)
+            depth_offset += depths[i] * 2
         self.convs = nn.ModuleList(convs)
         self.blocks = nn.ModuleList(blocks)
         self.avgpool = nn.AdaptiveAvgPool1d(1)
         self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
     @property
     def dim_out(self):
         """Return the last entry of ``self.embed_dims``."""
         return self.embed_dims[-1]
     def forward_features_unpool(self, x):
         """
+        forward until avg pooling
         Args:
             x (_type_): input image tensor
         """
         x = self.forward_features(x)
         x = self.head(x)
         return x
     @classmethod
     def from_config(cls, config):
         return cls(
         )
 if is_flash_attn_2_available():
     from flash_attn import flash_attn_func, flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 def _get_unpad_data(attention_mask):
     seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
     """Multi-headed attention from 'Attention Is All You Need' paper"""
     def __init__(
+            self,
+            embed_dim: int,
+            num_heads: int,
+            dropout: float = 0.0,
+            is_decoder: bool = False,
+            bias: bool = True,
+            is_causal: bool = False,
+            config: Optional[Florence2LanguageConfig] = None,
     ):
         super().__init__()
         self.embed_dim = embed_dim
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                 f" and `num_heads`: {num_heads})."
             )
+        self.scaling = self.head_dim ** -0.5
         self.is_decoder = is_decoder
         self.is_causal = is_causal
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
     def forward(
+            self,
+            hidden_states: torch.Tensor,
+            key_value_states: Optional[torch.Tensor] = None,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            layer_head_mask: Optional[torch.Tensor] = None,
+            output_attentions: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         """Input shape: Batch x Time x Channel"""
         # is checking that the `sequence_length` of the `past_key_value` is the same as
         # the provided `key_value_states` to support prefix tuning
         if (
+                is_cross_attention
+                and past_key_value is not None
+                and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
             # reuse k,v, cross_attentions
             key_states = past_key_value[0]
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
     def forward(
+            self,
+            hidden_states: torch.Tensor,
+            key_value_states: Optional[torch.Tensor] = None,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            layer_head_mask: Optional[torch.Tensor] = None,
+            output_attentions: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         # Florence2FlashAttention2 attention does not support output_attentions
         if output_attentions:
         # is checking that the `sequence_length` of the `past_key_value` is the same as
         # the provided `key_value_states` to support prefix tuning
         if (
+                is_cross_attention
+                and past_key_value is not None
+                and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
             # reuse k,v, cross_attentions
             key_states = past_key_value[0].transpose(1, 2)
     # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
     def _flash_attention_forward(
+            self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
     ):
         """
         Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
 class Florence2SdpaAttention(Florence2Attention):
     def forward(
+            self,
+            hidden_states: torch.Tensor,
+            key_value_states: Optional[torch.Tensor] = None,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            layer_head_mask: Optional[torch.Tensor] = None,
+            output_attentions: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         """Input shape: Batch x Time x Channel"""
         if output_attentions or layer_head_mask is not None:
         # is checking that the `sequence_length` of the `past_key_value` is the same as
         # the provided `key_value_states` to support prefix tuning
         if (
+                is_cross_attention
+                and past_key_value is not None
+                and past_key_value[0].shape[2] == key_value_states.shape[1]
         ):
             # reuse k,v, cross_attentions
             key_states = past_key_value[0]
         self.final_layer_norm = nn.LayerNorm(self.embed_dim)
     def forward(
+            self,
+            hidden_states: torch.FloatTensor,
+            attention_mask: torch.FloatTensor,
+            layer_head_mask: torch.FloatTensor,
+            output_attentions: Optional[bool] = False,
     ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
         """
         Args:
         hidden_states = self.final_layer_norm(hidden_states)
         if hidden_states.dtype == torch.float16 and (
+                torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
         ):
             clamp_value = torch.finfo(hidden_states.dtype).max - 1000
             hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
         self.final_layer_norm = nn.LayerNorm(self.embed_dim)
     def forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: Optional[torch.Tensor] = None,
+            encoder_hidden_states: Optional[torch.Tensor] = None,
+            encoder_attention_mask: Optional[torch.Tensor] = None,
+            layer_head_mask: Optional[torch.Tensor] = None,
+            cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            output_attentions: Optional[bool] = False,
+            use_cache: Optional[bool] = True,
     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
         """
         Args:
         return outputs
 class Florence2LanguagePreTrainedModel(PreTrainedModel):
     config_class = Florence2LanguageConfig
     base_model_prefix = "model"
             module.weight.data.normal_(mean=0.0, std=std)
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.Conv2d):
+            nn.init.normal_(module.weight, std=0.02)
+            for name, _ in module.named_parameters():
+                if name == "bias":
+                    nn.init.constant_(module.bias, 0)
+        elif isinstance(module, nn.LayerNorm):
+            nn.init.constant_(module.weight, 1.0)
+            nn.init.constant_(module.bias, 0)
+        elif isinstance(module, nn.BatchNorm2d):
+            nn.init.constant_(module.weight, 1.0)
+            nn.init.constant_(module.bias, 0)
     @property
     def dummy_inputs(self):
         self.embed_tokens = value
     def forward(
+            self,
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            head_mask: Optional[torch.Tensor] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutput]:
         r"""
         Args:
         self.embed_tokens = value
     def forward(
+            self,
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            encoder_hidden_states: Optional[torch.FloatTensor] = None,
+            encoder_attention_mask: Optional[torch.LongTensor] = None,
+            head_mask: Optional[torch.Tensor] = None,
+            cross_attn_head_mask: Optional[torch.Tensor] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
         r"""
         Args:
         return self.decoder
     def forward(
+            self,
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            decoder_input_ids: Optional[torch.LongTensor] = None,
+            decoder_attention_mask: Optional[torch.LongTensor] = None,
+            head_mask: Optional[torch.Tensor] = None,
+            decoder_head_mask: Optional[torch.Tensor] = None,
+            cross_attn_head_mask: Optional[torch.Tensor] = None,
+            encoder_outputs: Optional[List[torch.FloatTensor]] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
     ) -> Union[Tuple, Seq2SeqModelOutput]:
         # different to other models, Florence2 automatically creates decoder_input_ids from
         # input_ids if no decoder_input_ids are provided
         # Initialize weights and apply final processing
         self.post_init()
+    def _tie_weights(self):
         # When config.tie_word_embeddings is set, tie (or clone) the encoder token
         # embeddings, the decoder token embeddings and the LM head to the shared
         # embedding self.model.shared. Otherwise this is a no-op.
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.model.encoder.embed_tokens, self.model.shared)
+            self._tie_or_clone_weights(self.model.decoder.embed_tokens, self.model.shared)
+            self._tie_or_clone_weights(self.lm_head, self.model.shared)
     def get_encoder(self):
         """Return the encoder provided by the wrapped ``self.model``."""
         return self.model.get_encoder()
     def get_decoder(self):
         """Return the decoder provided by the wrapped ``self.model``."""
         return self.model.get_decoder()
+    def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None,
+                                **kwargs) -> nn.Embedding:
         # Resize the token embeddings through the parent implementation, forwarding
         # any extra keyword arguments unchanged. Then resize the final logits bias
         # to the new number of embedding rows and return the new embedding module.
+        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, **kwargs)
         self._resize_final_logits_bias(new_embeddings.weight.shape[0])
         return new_embeddings
         self.lm_head = new_embeddings
     def forward(
+            self,
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            decoder_input_ids: Optional[torch.LongTensor] = None,
+            decoder_attention_mask: Optional[torch.LongTensor] = None,
+            head_mask: Optional[torch.Tensor] = None,
+            decoder_head_mask: Optional[torch.Tensor] = None,
+            cross_attn_head_mask: Optional[torch.Tensor] = None,
+            encoder_outputs: Optional[List[torch.FloatTensor]] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+            labels: Optional[torch.LongTensor] = None,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
     ) -> Union[Tuple, Seq2SeqLMOutput]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
         )
     def prepare_inputs_for_generation(
+            self,
+            decoder_input_ids,
+            past_key_values=None,
+            attention_mask=None,
+            decoder_attention_mask=None,
+            head_mask=None,
+            decoder_head_mask=None,
+            cross_attn_head_mask=None,
+            use_cache=None,
+            encoder_outputs=None,
+            **kwargs,
     ):
         # cut decoder_input_ids if past_key_values is used
         if past_key_values is not None:
             )
         return reordered_past
 @dataclass
 class Florence2Seq2SeqLMOutput(ModelOutput):
     """
             Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
 """
 @add_start_docstrings(
     """The FLORENCE2 vision model without any head""",
     FLORENCE2_START_DOCSTRING,
         self.vision_tower = DaViT.from_config(config=config)
         self.post_init()
     def forward(self, pixel_values):
         if len(pixel_values.shape) == 4:
             x = self.vision_tower.forward_features_unpool(pixel_values)
         self._build_image_projection_layers(config)
         self.post_init()
     def _build_image_projection_layers(self, config):
         image_dim_out = config.dim_embed[-1]
         dim_projection = config.projection_dim
             x = self.vision_tower.forward_features_unpool(pixel_values)
         else:
             raise ValueError(f'invalid image shape {pixel_values.shape}')
         if self.image_pos_embed is not None:
             x = x.view(batch_size * T, -1, x.shape[-1])
             num_tokens = x.shape[-2]
             x = x.view(batch_size * T, h, w, x.shape[-1])
             pos_embed = self.image_pos_embed(x)
             x = x + pos_embed
+            x = x.view(batch_size, T * h * w, x.shape[-1])
         if self.visual_temporal_embed is not None:
             visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
         x = x @ self.image_projection
         x = self.image_proj_norm(x)
         return x
 @add_start_docstrings(
     """The FLORENCE2 model which consists of a vision backbone and a language model.""",
     FLORENCE2_START_DOCSTRING,
 )
+class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
+    _tied_weights_keys = ["language_model.encoder.embed_tokens.weight", "language_model.decoder.embed_tokens.weight",
+                          "language_model.lm_head.weight"]
     def __init__(self, config: Florence2Config):
         super().__init__(config)
         assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
         self.vision_tower = DaViT.from_config(config=config.vision_config)
+        # remove unused layers
         del self.vision_tower.head
         del self.vision_tower.norms
         language_model = Florence2LanguageForConditionalGeneration(config=config.text_config)
         self.language_model = language_model
         self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
         self.post_init()
     def _build_image_projection_layers(self, config):
         image_dim_out = config.vision_config.dim_embed[-1]
         dim_projection = config.vision_config.projection_dim
     def get_input_embeddings(self):
         """Return the input embeddings reported by the wrapped ``self.language_model``."""
         return self.language_model.get_input_embeddings()
+    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None,
+                                **kwargs) -> nn.Embedding:
         # Delegate the resize to the wrapped language model, forwarding any extra
         # keyword arguments unchanged, and return the embedding module it produces.
+        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, **kwargs)
         # Record the new vocabulary size in the text config, the top-level config
         # and on this model.
         self.config.text_config.vocab_size = model_embeds.num_embeddings
         self.config.vocab_size = model_embeds.num_embeddings
         self.vocab_size = model_embeds.num_embeddings
         return model_embeds
     def _encode_image(self, pixel_values):
         # Runs pixel_values through the vision tower, optionally adds positional
         # embeddings, then projects and normalizes the features.
         # Only 4-D input is accepted; any other rank raises ValueError.
         if len(pixel_values.shape) == 4:
             batch_size, C, H, W = pixel_values.shape
             x = self.vision_tower.forward_features_unpool(pixel_values)
         else:
             raise ValueError(f'invalid image shape {pixel_values.shape}')
         # NOTE(review): T, h and w are used below but are not assigned in the lines
         # shown here; presumably they are set in lines this diff view omits -- confirm.
         if self.image_pos_embed is not None:
             x = x.view(batch_size * T, -1, x.shape[-1])
             num_tokens = x.shape[-2]
             x = x.view(batch_size * T, h, w, x.shape[-1])
             pos_embed = self.image_pos_embed(x)
             x = x + pos_embed
+            x = x.view(batch_size, T * h * w, x.shape[-1])
         if self.visual_temporal_embed is not None:
             # NOTE(review): the result is not used in the lines shown here -- confirm
             # against the full file.
             visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
         # Project with image_projection, then apply image_proj_norm.
         x = x @ self.image_projection
         x = self.image_proj_norm(x)
+        return x
     def _merge_input_ids_with_image_features(
+            self, image_features, inputs_embeds
     ):
         # Returns an (inputs_embeds, attention_mask) pair.
         # NOTE(review): attention_mask is not assigned in the lines shown here, and
         # batch_size / image_token_length / device are not used; the merge logic is
         # presumably in lines this diff view omits -- confirm against the full file.
         batch_size, image_token_length = image_features.size()[:-1]
         device = image_features.device
         return inputs_embeds, attention_mask
     @add_start_docstrings_to_model_forward(FLORENCE2_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=Florence2Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
     def forward(
+            self,
+            input_ids: torch.LongTensor = None,
+            pixel_values: torch.FloatTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            decoder_input_ids: Optional[torch.LongTensor] = None,
+            decoder_attention_mask: Optional[torch.LongTensor] = None,
+            head_mask: Optional[torch.Tensor] = None,
+            decoder_head_mask: Optional[torch.Tensor] = None,
+            cross_attn_head_mask: Optional[torch.Tensor] = None,
+            encoder_outputs: Optional[List[torch.FloatTensor]] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+            labels: Optional[torch.LongTensor] = None,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
     ) -> Union[Tuple, Florence2Seq2SeqLMOutput]:
         # Every argument other than self defaults to None; per the annotation the
         # result is either a tuple or a Florence2Seq2SeqLMOutput.
         # NOTE(review): the docstring and body are truncated in this diff view.
         r"""
         Args:
         )
     def generate(
+            self,
+            input_ids,
+            inputs_embeds=None,
+            pixel_values=None,
+            **kwargs
+    ):
         # Encodes pixel_values (when given and no inputs_embeds were supplied) and hands
         # generation over to the language model using embeddings instead of token ids.
         if inputs_embeds is None:
             # 1. Extract the input embeddings
             # NOTE(review): in the lines shown here input_ids is never embedded, so
             # inputs_embeds is still None when it reaches the merge helper; the
             # embedding step is presumably in lines this diff view omits -- confirm.
             if pixel_values is not None:
                 image_features = self._encode_image(pixel_values)
                 inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
         # NOTE(review): only inputs_embeds is forwarded in the lines shown here;
         # attention_mask and **kwargs do not appear in this call -- confirm against
         # the full file.
         return self.language_model.generate(
             input_ids=None,
             inputs_embeds=inputs_embeds,
         )
     def prepare_inputs_for_generation(
+            self,
+            decoder_input_ids,
+            past_key_values=None,
+            attention_mask=None,
+            pixel_values=None,
+            decoder_attention_mask=None,
+            head_mask=None,
+            decoder_head_mask=None,
+            cross_attn_head_mask=None,
+            use_cache=None,
+            encoder_outputs=None,
+            **kwargs,
     ):
         # Returns a dict of model inputs in which input_ids is always None and
         # encoder_outputs is passed through.
         # cut decoder_input_ids if past_key_values is used
         if past_key_values is not None:
                 # NOTE(review): this line is indented one level deeper than its
                 # neighbours; presumably its enclosing branch is omitted in this
                 # diff view -- confirm. As shown, it keeps only the last position.
                 remove_prefix_length = decoder_input_ids.shape[1] - 1
             decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
         # NOTE(review): only some dict entries are visible here; the trimmed
         # decoder_input_ids is presumably returned in omitted lines -- confirm.
         return {
             "input_ids": None,  # encoder_outputs is defined. input_ids not needed
             "encoder_outputs": encoder_outputs,
             "cross_attn_head_mask": cross_attn_head_mask,
             "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
         }
     def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
         # Delegates to the language model's shift_tokens_right, passing labels unchanged.
         return self.language_model.shift_tokens_right(labels)

requirements.txt CHANGED Viewed

@@ -1,8 +1,9 @@
-torch
-torchvision
-timm
 pillow
 safetensors
-transformers
-einops
-pydantic==2.10.6

+einops
+gradio
 pillow
+pydantic
 safetensors
+timm
+torch
+torchvision
+transformers==4.51.3

tag_implications-2024-05-05.csv DELETED Viewed

The diff for this file is too large to render. See raw diff

tags-2024-05-05.csv → tags-2025-11-25.csv.gz RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6d2dee2343cb402468867655a5df065a03bbd50fc68fe5f1b82f1685a5665370
-size 31973430

 version https://git-lfs.github.com/spec/v1
+oid sha256:f309ca05034df465bbb930a9cc29be067ef80e04ccb113d6294bd17861bf7f84
+size 16154767