Mateusz Mróz committed on
Commit
bfabb11
·
1 Parent(s): 848f81c

Refactor Florence2Processor and utils.py for improved readability and maintainability; add colab setup script with necessary patches and dependencies for model execution.

Browse files
Files changed (5) hide show
  1. colab copy.py +187 -0
  2. colab.py +52 -16
  3. modeling_florence2.py +382 -209
  4. processing_florence2.py +68 -45
  5. utils.py +87 -42
colab copy.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # @title Ostateczne Uruchomienie `magiv3` z Wymaganymi Poprawkami
2
+ # To nie działa, ale pobiera jakieś zależności, by działało colab.py
3
+ import os
4
+ import sys
5
+ import re
6
+ import requests
7
+ import torch
8
+ from PIL import Image, ImageDraw, ImageFont
9
+ import numpy as np
10
+ import json
11
+ from IPython.display import display
12
+ import warnings
13
+
14
+ # --- KROK 1: PRZYGOTOWANIE ŚRODOWISKA Z POPRAWKAMI ---
15
+
16
+ print("--- KROK 1: PRZYGOTOWANIE ŚRODOWISKA Z POPRAWKAMI ---")
17
+
18
+ # --- Instalacja zależności ---
19
+ print("⏳ Instaluję `uv` i wszystkie potrzebne pakiety...")
20
+ !curl -LsSf https://astral.sh/uv/install.sh | sh
21
+ os.environ['PATH'] = f"/root/.local/bin:{os.environ['PATH']}"
22
+ !uv pip install --quiet transformers accelerate einops timm scipy tokenizers pulp torch pytorch-metric-learning Pillow requests shapely
23
+ print("✅ Zależności zainstalowane.")
24
+
25
+ # --- Klonowanie repozytorium ---
26
+ repo_path = "/content/magiv3"
27
+ print(f"\n⏳ Klonuję repozytorium do folderu `{repo_path}`...")
28
+ if os.path.exists(repo_path):
29
+ !rm -rf {repo_path}
30
+ !git clone https://huggingface.co/ragavsachdeva/magiv3 {repo_path}
31
+ print("✅ Repozytorium sklonowane.")
32
+
33
+ # --- OSTATECZNA, KOMPLEKSOWA POPRAWKA KODU ---
34
+ file_to_patch = os.path.join(repo_path, "modeling_florence2.py")
35
+ print(f"\n⏳ Nanoszę wszystkie wymagane poprawki na plik `{file_to_patch}`...")
36
+
37
+ try:
38
+ with open(file_to_patch, 'r', encoding='utf-8') as f:
39
+ content = f.read()
40
+
41
+ # Poprawka 1: Dodanie importu GenerationMixin
42
+ if "from transformers.generation.utils import GenerationMixin" not in content:
43
+ content = content.replace(
44
+ "from transformers.modeling_utils import PreTrainedModel",
45
+ "from transformers.generation.utils import GenerationMixin\nfrom transformers.modeling_utils import PreTrainedModel"
46
+ )
47
+ print("PATCH 1: Dodano import `GenerationMixin`.")
48
+
49
+ # Poprawka 2: Naprawa klasy bazowej modelu językowego (dla metody .generate)
50
+ original_lang_class = "class Florence2LanguagePreTrainedModel(PreTrainedModel):"
51
+ patched_lang_class = "class Florence2LanguagePreTrainedModel(GenerationMixin, PreTrainedModel):"
52
+ if original_lang_class in content:
53
+ content = content.replace(original_lang_class, patched_lang_class)
54
+ print("PATCH 2: Poprawiono dziedziczenie `Florence2LanguagePreTrainedModel`.")
55
+
56
+ # Poprawka 3: Usunięcie wadliwych właściwości, które powodują błąd inicjalizacji
57
+ faulty_property_block = r"""
58
+ @property
59
+ def _supports_flash_attn_2\(self\):.*?return self.language_model._supports_flash_attn_2
60
+
61
+ @property
62
+ def _supports_sdpa\(self\):.*?return self.language_model._supports_sdpa"""
63
+
64
+ if re.search(faulty_property_block, content, flags=re.DOTALL):
65
+ content = re.sub(faulty_property_block, "", content, flags=re.DOTALL)
66
+ print("PATCH 3: Usunięto wadliwe właściwości `@property`.")
67
+
68
+ # Poprawka 4: Naprawa błędu nadpisywania modelu w __init__
69
+ faulty_init_block = r""" language_model = Florence2LanguageForConditionalGeneration\(config=config.text_config\)
70
+
71
+ if language_model._tied_weights_keys is not None:
72
+ self._tied_weights_keys = \[f"language_model.{k}" for k in language_model._tied_weights_keys\]
73
+ self.language_model = language_model"""
74
+
75
+ correct_init_block = r""" # This line is intentionally left blank.
76
+ # The language_model is already initialized by the parent class.
77
+ # The original code had a bug here that overwrote the pretrained language model.
78
+ if self.language_model._tied_weights_keys is not None:
79
+ self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys]"""
80
+
81
+ if re.search(faulty_init_block, content, flags=re.DOTALL):
82
+ content = re.sub(faulty_init_block, correct_init_block, content, flags=re.DOTALL)
83
+ print("PATCH 4: Naprawiono błąd nadpisywania modelu w `__init__`.")
84
+
85
+ with open(file_to_patch, 'w', encoding='utf-8') as f:
86
+ f.write(content)
87
+ print("\n✅ Wszystkie poprawki zostały pomyślnie naniesione!")
88
+
89
+ except Exception as e:
90
+ print(f"❌ Wystąpił krytyczny błąd podczas patchowania pliku: {e}")
91
+ sys.exit()
92
+
93
+
94
+ # --- KROK 2: POBRANIE OBRAZKA TESTOWEGO ---
95
+
96
+ print("\n--- KROK 2: POBRANIE OBRAZKA TESTOWEGO ---")
97
+ IMAGE_URL = "https://raw.githubusercontent.com/MattyMroz/Manga_Whisperer/refs/heads/main/input/raw/04.jpg"
98
+ IMAGE_PATH = "/content/test_image.jpg"
99
+
100
+ try:
101
+ response = requests.get(IMAGE_URL)
102
+ response.raise_for_status()
103
+ with open(IMAGE_PATH, 'wb') as f:
104
+ f.write(response.content)
105
+ print(f"✅ Obrazek testowy został pomyślnie pobrany i zapisany jako `{IMAGE_PATH}`.")
106
+ display(Image.open(IMAGE_PATH).resize((300, 400)))
107
+ except Exception as e:
108
+ print(f"❌ Nie udało się pobrać obrazka. Błąd: {e}")
109
+ sys.exit()
110
+
111
+
112
+ # --- KROK 3: URUCHOMIENIE POPRAWIONEGO MODELU ---
113
+
114
+ print("\n--- KROK 3: URUCHOMIENIE POPRAWIONEGO MODELU ---")
115
+ warnings.filterwarnings("ignore", category=FutureWarning)
116
+
117
+ # Dodajemy poprawiony kod do ścieżki Pythona
118
+ if repo_path not in sys.path:
119
+ sys.path.insert(0, repo_path)
120
+
121
+ # Importujemy klasy z naszego poprawionego kodu
122
+ from magiv3.modeling_florence2 import Florence2ForConditionalGeneration
123
+ from transformers import AutoProcessor
124
+
125
+ model = None
126
+ processor = None
127
+
128
+ try:
129
+ print(f"⏳ Ładowanie modelu i procesora (z użyciem poprawionego kodu)...")
130
+ model_id = "ragavsachdeva/magiv3"
131
+ device = "cuda" if torch.cuda.is_available() else "cpu"
132
+
133
+ # Używamy AutoProcessor, ale dla modelu musimy wskazać naszą poprawioną klasę
134
+ processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
135
+ model = Florence2ForConditionalGeneration.from_pretrained(
136
+ model_id,
137
+ torch_dtype=torch.float16,
138
+ trust_remote_code=True
139
+ ).to(device).eval()
140
+
141
+ print("✅ Model i procesor załadowane pomyślnie.")
142
+
143
+ except Exception as e:
144
+ print(f"\n❌ Wystąpił błąd podczas ładowania modelu, nawet po poprawkach: {e}")
145
+ sys.exit()
146
+
147
+ # Uruchamiamy predykcję
148
+ if model and processor:
149
+ try:
150
+ print("\n⏳ Przygotowuję dane wejściowe...")
151
+ images = [Image.open(IMAGE_PATH).convert("RGB")]
152
+ np_images = [np.array(img) for img in images]
153
+ print("✅ Dane wejściowe gotowe.")
154
+
155
+ print("\n⏳ Uruchamiam `predict_detections_and_associations`...")
156
+ with torch.no_grad():
157
+ results = model.predict_detections_and_associations(np_images, processor)
158
+
159
+ print("✅ Przetwarzanie zakończone pomyślnie!")
160
+ print("\n--- WYNIKI ---")
161
+
162
+ # Funkcja do wizualizacji
163
+ def visualize_results(image, results):
164
+ colors = {"panels": "red", "texts": "blue", "characters": "green", "tails": "yellow"}
165
+ draw_image = image.copy()
166
+ draw = ImageDraw.Draw(draw_image)
167
+ try:
168
+ font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", 15)
169
+ except IOError:
170
+ font = ImageFont.load_default()
171
+
172
+ for category, bboxes in results.items():
173
+ if category not in colors: continue
174
+ for i, box in enumerate(bboxes):
175
+ draw.rectangle(box, outline=colors[category], width=3)
176
+ draw.text((box[0], box[1]), f"{category}_{i}", fill=colors[category], font=font)
177
+ return draw_image
178
+
179
+ visualized_image = visualize_results(images[0], results[0])
180
+ display(visualized_image)
181
+
182
+ serializable_results = {k: (v.tolist() if isinstance(v, torch.Tensor) else v) for k, v in results[0].items()}
183
+ print(json.dumps(serializable_results, indent=2))
184
+
185
+ except Exception as e:
186
+ print(f"\n❌ WYSTĄPIŁ KRYTYCZNY BŁĄD PODCZAS PRZETWARZANIA:")
187
+ print(f"Błąd: {e}")
colab.py CHANGED
@@ -1,20 +1,20 @@
1
  # ==============================================================================
2
  # 1) INSTALACJA PAKIETÓW
3
  # ==============================================================================
 
 
 
 
 
 
 
 
 
4
  !pip -q install -U "transformers" "huggingface_hub" "accelerate" "timm" "sentencepiece" "safensors" "pillow" "einops" "pytorch_metric_learning"
5
 
6
  # ==============================================================================
7
  # 2) IMPORTY
8
  # ==============================================================================
9
- import re
10
- import requests
11
- import torch
12
- import json
13
- import math
14
- from PIL import Image, ImageDraw
15
- from io import BytesIO
16
- from IPython.display import display
17
- from transformers import AutoProcessor, AutoModelForCausalLM, AutoConfig
18
 
19
  # ==============================================================================
20
  # 3) POBRANIE OBRAZU
@@ -62,9 +62,15 @@ print("Model i procesor załadowane pomyślnie.")
62
  # ==============================================================================
63
 
64
 
65
- def create_visualization(image, data):
66
  """
67
  Rysuje zaawansowaną wizualizację detekcji i asocjacji na obrazie.
 
 
 
 
 
 
68
  """
69
  img_draw = image.copy()
70
  draw = ImageDraw.Draw(img_draw)
@@ -77,8 +83,10 @@ def create_visualization(image, data):
77
  "tails": "purple",
78
  "cluster_colors": ["#f50a8f", "#4b13b6", "#ddaa34", "#b7ff51", "#bea2a2"],
79
  "speaker_line": "magenta",
 
 
80
  }
81
- line_widths = {"panels": 2, "texts": 1, "characters": 2, "tails": 1}
82
 
83
  def get_box_center(box):
84
  x1, y1, x2, y2 = box
@@ -131,10 +139,35 @@ def create_visualization(image, data):
131
  draw_dashed_line(
132
  draw, p1, p2, fill=colors["speaker_line"], width=1)
133
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  return img_draw
135
 
136
 
137
- def process_image(image, caption_for_grounding="elf girl"):
 
 
 
 
 
 
 
 
 
138
  print("\n--- Rozpoczynanie przetwarzania obrazu ---")
139
  images = [image]
140
  captions = [caption_for_grounding]
@@ -157,8 +190,9 @@ def process_image(image, caption_for_grounding="elf girl"):
157
  "grounding": [{"phrase": grounding_results.get("grounded_caption", "")[start:end], "boxes": boxes} for boxes, (start, end) in zip(grounding_results.get("bboxes", []), grounding_results.get("indices_of_bboxes_in_caption", []))]
158
  }
159
 
160
- print("Tworzenie zaawansowanej wizualizacji...")
161
- visualization_image = create_visualization(image, final_json)
 
162
 
163
  print("--- Zakończono przetwarzanie ---")
164
  return final_json, visualization_image
@@ -167,9 +201,11 @@ def process_image(image, caption_for_grounding="elf girl"):
167
  # 6) URUCHOMIENIE I WYŚWIETLENIE WYNIKÓW
168
  # ==============================================================================
169
 
170
-
 
 
171
  json_output, image_output = process_image(
172
- pil_img, caption_for_grounding="elf girl")
173
 
174
  print("\n\n===== WYNIKI W FORMACIE JSON (przed filtrowaniem) =====")
175
  print(json.dumps(json_output, indent=2))
 
1
  # ==============================================================================
2
  # 1) INSTALACJA PAKIETÓW
3
  # ==============================================================================
4
+ from transformers import AutoProcessor, AutoModelForCausalLM, AutoConfig
5
+ from IPython.display import display
6
+ from io import BytesIO
7
+ from PIL import Image, ImageDraw
8
+ import math
9
+ import json
10
+ import torch
11
+ import requests
12
+ import re
13
  !pip -q install -U "transformers" "huggingface_hub" "accelerate" "timm" "sentencepiece" "safensors" "pillow" "einops" "pytorch_metric_learning"
14
 
15
  # ==============================================================================
16
  # 2) IMPORTY
17
  # ==============================================================================
 
 
 
 
 
 
 
 
 
18
 
19
  # ==============================================================================
20
  # 3) POBRANIE OBRAZU
 
62
  # ==============================================================================
63
 
64
 
65
+ def create_visualization(image, data, detailed_mode=False):
66
  """
67
  Rysuje zaawansowaną wizualizację detekcji i asocjacji na obrazie.
68
+
69
+ Args:
70
+ image: Obraz wejściowy
71
+ data: Dane JSON z wynikami
72
+ detailed_mode: Jeśli True, rysuje wszystko z JSON (OCR, grounding).
73
+ Jeśli False (domyślnie), rysuje tylko detekcje i asocjacje.
74
  """
75
  img_draw = image.copy()
76
  draw = ImageDraw.Draw(img_draw)
 
83
  "tails": "purple",
84
  "cluster_colors": ["#f50a8f", "#4b13b6", "#ddaa34", "#b7ff51", "#bea2a2"],
85
  "speaker_line": "magenta",
86
+ "ocr": "orange",
87
+ "grounding": "cyan",
88
  }
89
+ line_widths = {"panels": 2, "texts": 1, "characters": 2, "tails": 1, "ocr": 2, "grounding": 2}
90
 
91
  def get_box_center(box):
92
  x1, y1, x2, y2 = box
 
139
  draw_dashed_line(
140
  draw, p1, p2, fill=colors["speaker_line"], width=1)
141
 
142
+ # Tryb wybredny - rysowanie dodatkowych elementów z JSON
143
+ if detailed_mode:
144
+ # Rysowanie OCR boxes
145
+ ocr_data = data.get("ocr", [])
146
+ for ocr_item in ocr_data:
147
+ box = ocr_item.get("box")
148
+ if box:
149
+ draw.rectangle(box, outline=colors["ocr"], width=line_widths["ocr"])
150
+
151
+ # Rysowanie Grounding boxes
152
+ grounding_data = data.get("grounding", [])
153
+ for grounding_item in grounding_data:
154
+ boxes = grounding_item.get("boxes", [])
155
+ for box in boxes:
156
+ draw.rectangle(box, outline=colors["grounding"], width=line_widths["grounding"])
157
+
158
  return img_draw
159
 
160
 
161
+ def process_image(image, caption_for_grounding="elf girl", detailed_mode=False):
162
+ """
163
+ Przetwarza obraz i tworzy wizualizację.
164
+
165
+ Args:
166
+ image: Obraz wejściowy
167
+ caption_for_grounding: Caption dla character grounding
168
+ detailed_mode: Jeśli True, wizualizacja zawiera wszystko z JSON (OCR, grounding).
169
+ Jeśli False (domyślnie), tylko detekcje i asocjacje.
170
+ """
171
  print("\n--- Rozpoczynanie przetwarzania obrazu ---")
172
  images = [image]
173
  captions = [caption_for_grounding]
 
190
  "grounding": [{"phrase": grounding_results.get("grounded_caption", "")[start:end], "boxes": boxes} for boxes, (start, end) in zip(grounding_results.get("bboxes", []), grounding_results.get("indices_of_bboxes_in_caption", []))]
191
  }
192
 
193
+ mode_text = "wybredny (wszystkie elementy)" if detailed_mode else "domyślny (detekcje i asocjacje)"
194
+ print(f"Tworzenie wizualizacji w trybie: {mode_text}")
195
+ visualization_image = create_visualization(image, final_json, detailed_mode=detailed_mode)
196
 
197
  print("--- Zakończono przetwarzanie ---")
198
  return final_json, visualization_image
 
201
  # 6) URUCHOMIENIE I WYŚWIETLENIE WYNIKÓW
202
  # ==============================================================================
203
 
204
+ # Tryb wizualizacji:
205
+ # detailed_mode=False (domyślny) - rysuje tylko detekcje i asocjacje (obecne kolory)
206
+ # detailed_mode=True (wybredny) - rysuje wszystko z JSON: OCR (pomarańczowy), grounding (cyjan)
207
  json_output, image_output = process_image(
208
+ pil_img, caption_for_grounding="elf girl", detailed_mode=True)
209
 
210
  print("\n\n===== WYNIKI W FORMACIE JSON (przed filtrowaniem) =====")
211
  print(json.dumps(json_output, indent=2))
modeling_florence2.py CHANGED
@@ -23,7 +23,7 @@ import torch.utils.checkpoint
23
  from torch import nn
24
  import torch.nn.functional as F
25
  import torch.utils.checkpoint as checkpoint
26
- from torch.nn import CrossEntropyLoss
27
  from collections import OrderedDict
28
  from einops import rearrange, repeat
29
  from timm.models.layers import DropPath, trunc_normal_
@@ -41,7 +41,7 @@ from transformers.utils import (
41
  is_flash_attn_2_available,
42
  is_flash_attn_greater_or_equal_2_10,
43
  )
44
- from .configuration_florence2 import Florence2Config
45
  from .configuration_florence2 import Florence2LanguageConfig
46
  from .configuration_florence2 import Florence2VisionConfig
47
  from pytorch_metric_learning.utils.loss_and_miner_utils import get_all_pairs_indices
@@ -72,6 +72,7 @@ logger = logging.get_logger(__name__)
72
 
73
  _CONFIG_FOR_DOC = "Florence2Config"
74
 
 
75
  class LearnedAbsolutePositionEmbedding2D(nn.Module):
76
  """
77
  This module learns positional embeddings up to a fixed maximum size.
@@ -80,7 +81,8 @@ class LearnedAbsolutePositionEmbedding2D(nn.Module):
80
  def __init__(self, embedding_dim=256, num_pos=50):
81
  super().__init__()
82
  self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
83
- self.column_embeddings = nn.Embedding(num_pos, embedding_dim - (embedding_dim // 2))
 
84
 
85
  def forward(self, pixel_values):
86
  """
@@ -95,7 +97,8 @@ class LearnedAbsolutePositionEmbedding2D(nn.Module):
95
  x_emb = self.column_embeddings(width_values)
96
  y_emb = self.row_embeddings(height_values)
97
  # (height, width, embedding_dim * 2)
98
- pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
 
99
  # (embedding_dim * 2, height, width)
100
  pos = pos.permute(2, 0, 1)
101
  pos = pos.unsqueeze(0)
@@ -105,6 +108,7 @@ class LearnedAbsolutePositionEmbedding2D(nn.Module):
105
  pos = pos.permute(0, 2, 3, 1)
106
  return pos
107
 
 
108
  class PositionalEmbeddingCosine1D(nn.Module):
109
  """
110
  This class implements a very simple positional encoding. It follows closely
@@ -116,6 +120,7 @@ class PositionalEmbeddingCosine1D(nn.Module):
116
  dropout_prob: The dropout probability.
117
  max_seq_len: The maximum length to precompute the positional encodings.
118
  """
 
119
  def __init__(
120
  self,
121
  embed_dim: int = 512,
@@ -171,6 +176,7 @@ class LearnedAbsolutePositionEmbedding1D(nn.Module):
171
  embed_dim: The dimension of the embeddings.
172
  max_seq_len: The maximum length to precompute the positional encodings.
173
  """
 
174
  def __init__(
175
  self,
176
  embedding_dim: int = 512,
@@ -196,7 +202,8 @@ class LearnedAbsolutePositionEmbedding1D(nn.Module):
196
  len_seq = seq_embeds.size(-2)
197
  assert len_seq <= self.num_pos
198
  # [T, D]
199
- pos_embeds = self.embeddings(torch.arange(len_seq).to(seq_embeds.device))
 
200
  # Adapt pre-computed positional embeddings to the input.
201
  if shape_len == 3:
202
  pos_embeds = pos_embeds.view(
@@ -204,7 +211,6 @@ class LearnedAbsolutePositionEmbedding1D(nn.Module):
204
  return pos_embeds
205
 
206
 
207
-
208
  class MySequential(nn.Sequential):
209
  def forward(self, *inputs):
210
  for module in self._modules.values():
@@ -349,7 +355,8 @@ class ChannelAttention(nn.Module):
349
  def forward(self, x, size):
350
  B, N, C = x.shape
351
 
352
- qkv = self.qkv(x).reshape(B, N, 3, self.groups, C // self.groups).permute(2, 0, 3, 1, 4)
 
353
  q, k, v = qkv[0], qkv[1], qkv[2]
354
 
355
  q = q * (float(N) ** -0.5)
@@ -368,18 +375,22 @@ class ChannelBlock(nn.Module):
368
  conv_at_attn=True, conv_at_ffn=True):
369
  super().__init__()
370
 
371
- drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
 
372
 
373
- self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
 
374
  self.channel_attn = PreNorm(
375
  norm_layer(dim),
376
  ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
377
  drop_path
378
  )
379
- self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
 
380
  self.ffn = PreNorm(
381
  norm_layer(dim),
382
- Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
 
383
  drop_path
384
  )
385
 
@@ -397,16 +408,19 @@ class ChannelBlock(nn.Module):
397
 
398
  def window_partition(x, window_size: int):
399
  B, H, W, C = x.shape
400
- x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
401
- windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
 
 
402
  return windows
403
 
404
 
405
  def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
406
- B = batch_size
407
  # this will cause onnx conversion failed for dynamic axis, because treated as constant
408
- # int(windows.shape[0] / (H * W / window_size / window_size))
409
- x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
 
410
  x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
411
  return x
412
 
@@ -447,7 +461,8 @@ class WindowAttention(nn.Module):
447
  # attn_windows = self.attn(x_windows)
448
 
449
  B_, N, C = x.shape
450
- qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
 
451
  q, k, v = qkv[0], qkv[1], qkv[2]
452
 
453
  q = q * self.scale
@@ -478,18 +493,22 @@ class SpatialBlock(nn.Module):
478
  norm_layer=nn.LayerNorm, conv_at_attn=True, conv_at_ffn=True):
479
  super().__init__()
480
 
481
- drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
 
482
 
483
- self.conv1 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_attn else None
 
484
  self.window_attn = PreNorm(
485
  norm_layer(dim),
486
  WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias),
487
  drop_path
488
  )
489
- self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
 
490
  self.ffn = PreNorm(
491
  norm_layer(dim),
492
- Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
 
493
  drop_path
494
  )
495
 
@@ -547,7 +566,7 @@ class DaViT(nn.Module):
547
  enable_checkpoint=True,
548
  conv_at_attn=True,
549
  conv_at_ffn=True,
550
- ):
551
  super().__init__()
552
 
553
  self.num_classes = num_classes
@@ -559,7 +578,8 @@ class DaViT(nn.Module):
559
  assert self.num_stages == len(self.num_heads) == len(self.num_groups)
560
 
561
  num_stages = len(embed_dims)
562
- dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)*2)]
 
563
 
564
  depth_offset = 0
565
  convs = []
@@ -613,7 +633,8 @@ class DaViT(nn.Module):
613
 
614
  self.norms = norm_layer(self.embed_dims[-1])
615
  self.avgpool = nn.AdaptiveAvgPool1d(1)
616
- self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
 
617
 
618
  self.apply(self._init_weights)
619
 
@@ -648,7 +669,8 @@ class DaViT(nn.Module):
648
  for conv, block in zip(self.convs, self.blocks):
649
  x, input_size = conv(x, input_size)
650
  if self.enable_checkpoint:
651
- x, input_size = checkpoint.checkpoint(block, x, input_size, use_reentrant=True)
 
652
  else:
653
  x, input_size = block(x, input_size)
654
  return x
@@ -668,7 +690,7 @@ class DaViT(nn.Module):
668
  x = self.forward_features(x)
669
  x = self.head(x)
670
  return x
671
-
672
  @classmethod
673
  def from_config(cls, config):
674
  return cls(
@@ -685,18 +707,19 @@ class DaViT(nn.Module):
685
  )
686
 
687
 
688
-
689
-
690
  if is_flash_attn_2_available():
691
  from flash_attn import flash_attn_func, flash_attn_varlen_func
692
  from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
693
 
694
  # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 
 
695
  def _get_unpad_data(attention_mask):
696
  seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
697
  indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
698
  max_seqlen_in_batch = seqlens_in_batch.max().item()
699
- cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
 
700
  return (
701
  indices,
702
  cu_seqlens,
@@ -834,7 +857,8 @@ class Florence2Attention(nn.Module):
834
  if past_key_value[0] is not None:
835
  key_states = torch.cat([past_key_value[0], key_states], dim=2)
836
  if past_key_value[1] is not None:
837
- value_states = torch.cat([past_key_value[1], value_states], dim=2)
 
838
  else:
839
  # self_attention
840
  key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
@@ -851,7 +875,8 @@ class Florence2Attention(nn.Module):
851
  past_key_value = (key_states, value_states)
852
 
853
  proj_shape = (bsz * self.num_heads, -1, self.head_dim)
854
- query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
 
855
  key_states = key_states.reshape(*proj_shape)
856
  value_states = value_states.reshape(*proj_shape)
857
 
@@ -869,8 +894,10 @@ class Florence2Attention(nn.Module):
869
  raise ValueError(
870
  f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
871
  )
872
- attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
873
- attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
 
874
 
875
  attn_weights = nn.functional.softmax(attn_weights, dim=-1)
876
 
@@ -880,20 +907,25 @@ class Florence2Attention(nn.Module):
880
  f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
881
  f" {layer_head_mask.size()}"
882
  )
883
- attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
884
- attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
 
 
885
 
886
  if output_attentions:
887
  # this operation is a bit awkward, but it's required to
888
  # make sure that attn_weights keeps its gradient.
889
  # In order to do so, attn_weights have to be reshaped
890
  # twice and have to be reused in the following
891
- attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
892
- attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
 
 
893
  else:
894
  attn_weights_reshaped = None
895
 
896
- attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
 
897
 
898
  attn_output = torch.bmm(attn_probs, value_states)
899
 
@@ -903,7 +935,8 @@ class Florence2Attention(nn.Module):
903
  f" {attn_output.size()}"
904
  )
905
 
906
- attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
 
907
  attn_output = attn_output.transpose(1, 2)
908
 
909
  # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
@@ -945,7 +978,8 @@ class Florence2FlashAttention2(Florence2Attention):
945
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
946
  # Florence2FlashAttention2 attention does not support output_attentions
947
  if output_attentions:
948
- raise ValueError("Florence2FlashAttention2 attention does not support output_attentions")
 
949
 
950
  # if key_value_states are provided this layer is used as a cross-attention layer
951
  # for the decoder
@@ -970,13 +1004,16 @@ class Florence2FlashAttention2(Florence2Attention):
970
  elif is_cross_attention:
971
  # cross_attentions
972
  key_states = self._reshape(self.k_proj(key_value_states), -1, bsz)
973
- value_states = self._reshape(self.v_proj(key_value_states), -1, bsz)
 
974
  elif past_key_value is not None:
975
  # reuse k, v, self_attention
976
  key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
977
  value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
978
- key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1)
979
- value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1)
 
 
980
  else:
981
  # self_attention
982
  key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
@@ -990,7 +1027,8 @@ class Florence2FlashAttention2(Florence2Attention):
990
  # all previous decoder key/value_states. Further calls to uni-directional self-attention
991
  # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
992
  # if encoder bi-directional self-attention `past_key_value` is always `None`
993
- past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2))
 
994
 
995
  kv_seq_len = key_states.shape[-2]
996
  if past_key_value is not None:
@@ -1086,7 +1124,8 @@ class Florence2FlashAttention2(Florence2Attention):
1086
  causal=causal,
1087
  )
1088
 
1089
- attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
 
1090
  else:
1091
  attn_output = flash_attn_func(
1092
  query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
@@ -1096,18 +1135,22 @@ class Florence2FlashAttention2(Florence2Attention):
1096
 
1097
  # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
1098
  def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
1099
- indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
 
1100
  batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
1101
 
1102
  key_layer = index_first_axis(
1103
- key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
 
1104
  )
1105
  value_layer = index_first_axis(
1106
- value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
 
1107
  )
1108
  if query_length == kv_seq_len:
1109
  query_layer = index_first_axis(
1110
- query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
 
1111
  )
1112
  cu_seqlens_q = cu_seqlens_k
1113
  max_seqlen_in_batch_q = max_seqlen_in_batch_k
@@ -1122,7 +1165,8 @@ class Florence2FlashAttention2(Florence2Attention):
1122
  else:
1123
  # The -q_len: slice assumes left padding.
1124
  attention_mask = attention_mask[:, -query_length:]
1125
- query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
 
1126
 
1127
  return (
1128
  query_layer,
@@ -1192,7 +1236,8 @@ class Florence2SdpaAttention(Florence2Attention):
1192
  if past_key_value[0] is not None:
1193
  key_states = torch.cat([past_key_value[0], key_states], dim=2)
1194
  if past_key_value[1] is not None:
1195
- value_states = torch.cat([past_key_value[1], value_states], dim=2)
 
1196
  else:
1197
  # self_attention
1198
  key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
@@ -1294,23 +1339,28 @@ class Florence2EncoderLayer(nn.Module):
1294
  layer_head_mask=layer_head_mask,
1295
  output_attentions=output_attentions,
1296
  )
1297
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
1298
  hidden_states = residual + hidden_states
1299
  hidden_states = self.self_attn_layer_norm(hidden_states)
1300
 
1301
  residual = hidden_states
1302
  hidden_states = self.activation_fn(self.fc1(hidden_states))
1303
- hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
 
1304
  hidden_states = self.fc2(hidden_states)
1305
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
1306
  hidden_states = residual + hidden_states
1307
  hidden_states = self.final_layer_norm(hidden_states)
1308
 
1309
  if hidden_states.dtype == torch.float16 and (
1310
- torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
 
1311
  ):
1312
  clamp_value = torch.finfo(hidden_states.dtype).max - 1000
1313
- hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
 
1314
 
1315
  outputs = (hidden_states,)
1316
 
@@ -1384,7 +1434,8 @@ class Florence2DecoderLayer(nn.Module):
1384
 
1385
  # Self Attention
1386
  # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
1387
- self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
 
1388
  # add present self-attn cache to positions 1,2 of present_key_value tuple
1389
  hidden_states, self_attn_weights, present_key_value = self.self_attn(
1390
  hidden_states=hidden_states,
@@ -1393,7 +1444,8 @@ class Florence2DecoderLayer(nn.Module):
1393
  layer_head_mask=layer_head_mask,
1394
  output_attentions=output_attentions,
1395
  )
1396
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
1397
  hidden_states = residual + hidden_states
1398
  hidden_states = self.self_attn_layer_norm(hidden_states)
1399
 
@@ -1404,7 +1456,8 @@ class Florence2DecoderLayer(nn.Module):
1404
  residual = hidden_states
1405
 
1406
  # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
1407
- cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
 
1408
  hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
1409
  hidden_states=hidden_states,
1410
  key_value_states=encoder_hidden_states,
@@ -1413,7 +1466,8 @@ class Florence2DecoderLayer(nn.Module):
1413
  past_key_value=cross_attn_past_key_value,
1414
  output_attentions=output_attentions,
1415
  )
1416
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
1417
  hidden_states = residual + hidden_states
1418
  hidden_states = self.encoder_attn_layer_norm(hidden_states)
1419
 
@@ -1423,9 +1477,11 @@ class Florence2DecoderLayer(nn.Module):
1423
  # Fully Connected
1424
  residual = hidden_states
1425
  hidden_states = self.activation_fn(self.fc1(hidden_states))
1426
- hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
 
1427
  hidden_states = self.fc2(hidden_states)
1428
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
1429
  hidden_states = residual + hidden_states
1430
  hidden_states = self.final_layer_norm(hidden_states)
1431
 
@@ -1440,7 +1496,6 @@ class Florence2DecoderLayer(nn.Module):
1440
  return outputs
1441
 
1442
 
1443
-
1444
  class Florence2LanguagePreTrainedModel(PreTrainedModel):
1445
  config_class = Florence2LanguageConfig
1446
  base_model_prefix = "model"
@@ -1465,7 +1520,8 @@ class Florence2LanguagePreTrainedModel(PreTrainedModel):
1465
  @property
1466
  def dummy_inputs(self):
1467
  pad_token = self.config.pad_token_id
1468
- input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
 
1469
  dummy_inputs = {
1470
  "attention_mask": input_ids.ne(pad_token),
1471
  "input_ids": input_ids,
@@ -1505,7 +1561,8 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
1505
  config.max_position_embeddings,
1506
  embed_dim,
1507
  )
1508
- self.layers = nn.ModuleList([Florence2EncoderLayer(config) for _ in range(config.encoder_layers)])
 
1509
  self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1510
  self._use_sdpa = config._attn_implementation == "sdpa"
1511
  self.layernorm_embedding = nn.LayerNorm(embed_dim)
@@ -1574,14 +1631,16 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
1574
 
1575
  # retrieve input_ids and inputs_embeds
1576
  if input_ids is not None and inputs_embeds is not None:
1577
- raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
 
1578
  elif input_ids is not None:
1579
  input = input_ids
1580
  input_ids = input_ids.view(-1, input_ids.shape[-1])
1581
  elif inputs_embeds is not None:
1582
  input = inputs_embeds[:, :, -1]
1583
  else:
1584
- raise ValueError("You have to specify either input_ids or inputs_embeds")
 
1585
 
1586
  if inputs_embeds is None:
1587
  inputs_embeds = self.embed_tokens(input_ids)
@@ -1591,7 +1650,8 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
1591
 
1592
  hidden_states = inputs_embeds + embed_pos
1593
  hidden_states = self.layernorm_embedding(hidden_states)
1594
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
1595
 
1596
  # expand attention_mask
1597
  if attention_mask is not None:
@@ -1601,10 +1661,12 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
1601
  # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
1602
  # the manual implementation that requires a 4D causal mask in all cases.
1603
  # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1604
- attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
 
1605
  else:
1606
  # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1607
- attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
 
1608
 
1609
  encoder_states = () if output_hidden_states else None
1610
  all_attentions = () if output_attentions else None
@@ -1642,7 +1704,8 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
1642
  layer_outputs = encoder_layer(
1643
  hidden_states,
1644
  attention_mask,
1645
- layer_head_mask=(head_mask[idx] if head_mask is not None else None),
 
1646
  output_attentions=output_attentions,
1647
  )
1648
 
@@ -1676,7 +1739,8 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1676
  self.layerdrop = config.decoder_layerdrop
1677
  self.padding_idx = config.pad_token_id
1678
  self.max_target_positions = config.max_position_embeddings
1679
- embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
 
1680
 
1681
  self.embed_tokens = Florence2ScaledWordEmbedding(
1682
  config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
@@ -1689,7 +1753,8 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1689
  config.max_position_embeddings,
1690
  config.d_model,
1691
  )
1692
- self.layers = nn.ModuleList([Florence2DecoderLayer(config) for _ in range(config.decoder_layers)])
 
1693
  self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1694
  self._use_sdpa = config._attn_implementation == "sdpa"
1695
 
@@ -1794,7 +1859,8 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1794
 
1795
  # retrieve input_ids and inputs_embeds
1796
  if input_ids is not None and inputs_embeds is not None:
1797
- raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
 
1798
  elif input_ids is not None:
1799
  input = input_ids
1800
  input_shape = input.shape
@@ -1803,17 +1869,20 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1803
  input_shape = inputs_embeds.size()[:-1]
1804
  input = inputs_embeds[:, :, -1]
1805
  else:
1806
- raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
 
1807
 
1808
  # past_key_values_length
1809
- past_key_values_length = past_key_values[0][0].shape[2] if past_key_values and past_key_values[0] and past_key_values[0][0] is not None else 0
 
1810
 
1811
  if inputs_embeds is None:
1812
  inputs_embeds = self.embed_tokens(input)
1813
 
1814
  if self._use_flash_attention_2:
1815
  # 2d mask is passed through the layers
1816
- attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
 
1817
  elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None:
1818
  # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
1819
  # the manual implementation that requires a 4D causal mask in all cases.
@@ -1855,7 +1924,8 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1855
  hidden_states = inputs_embeds + positions
1856
  hidden_states = self.layernorm_embedding(hidden_states)
1857
 
1858
- hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
 
1859
 
1860
  if self.gradient_checkpointing and self.training:
1861
  if use_cache:
@@ -1867,7 +1937,8 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1867
  # decoder layers
1868
  all_hidden_states = () if output_hidden_states else None
1869
  all_self_attns = () if output_attentions else None
1870
- all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
 
1871
  next_decoder_cache = () if use_cache else None
1872
 
1873
  # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
@@ -1909,7 +1980,8 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1909
  attention_mask=attention_mask,
1910
  encoder_hidden_states=encoder_hidden_states,
1911
  encoder_attention_mask=encoder_attention_mask,
1912
- layer_head_mask=(head_mask[idx] if head_mask is not None else None),
 
1913
  cross_attn_layer_head_mask=(
1914
  cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
1915
  ),
@@ -1920,7 +1992,8 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1920
  hidden_states = layer_outputs[0]
1921
 
1922
  if use_cache:
1923
- next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
 
1924
 
1925
  if output_attentions:
1926
  all_self_attns += (layer_outputs[1],)
@@ -1949,7 +2022,8 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1949
 
1950
 
1951
  class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
1952
- _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
 
1953
 
1954
  def __init__(self, config: Florence2LanguageConfig):
1955
  super().__init__(config)
@@ -2035,8 +2109,10 @@ class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
2035
  elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
2036
  encoder_outputs = BaseModelOutput(
2037
  last_hidden_state=encoder_outputs[0],
2038
- hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
2039
- attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
 
 
2040
  )
2041
 
2042
  # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
@@ -2072,14 +2148,17 @@ class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
2072
 
2073
  class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel, GenerationMixin):
2074
  base_model_prefix = "model"
2075
- _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
 
2076
  _keys_to_ignore_on_load_missing = ["final_logits_bias"]
2077
 
2078
  def __init__(self, config: Florence2LanguageConfig):
2079
  super().__init__(config)
2080
  self.model = Florence2LanguageModel(config)
2081
- self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
2082
- self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
 
 
2083
 
2084
  # Initialize weights and apply final processing
2085
  self.post_init()
@@ -2091,7 +2170,8 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2091
  return self.model.get_decoder()
2092
 
2093
  def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
2094
- new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
 
2095
  self._resize_final_logits_bias(new_embeddings.weight.shape[0])
2096
  return new_embeddings
2097
 
@@ -2100,7 +2180,8 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2100
  if new_num_tokens <= old_num_tokens:
2101
  new_bias = self.final_logits_bias[:, :new_num_tokens]
2102
  else:
2103
- extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
 
2104
  new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
2105
  self.register_buffer("final_logits_bias", new_bias)
2106
 
@@ -2141,7 +2222,8 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2141
 
2142
  if labels is not None:
2143
  if use_cache:
2144
- logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
 
2145
  use_cache = False
2146
  if decoder_input_ids is None and decoder_inputs_embeds is None:
2147
  decoder_input_ids = shift_tokens_right(
@@ -2173,7 +2255,8 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2173
  if labels is not None:
2174
  labels = labels.to(lm_logits.device)
2175
  loss_fct = CrossEntropyLoss()
2176
- masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
 
2177
 
2178
  if not return_dict:
2179
  output = (lm_logits,) + outputs[1:]
@@ -2227,7 +2310,8 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2227
  "head_mask": head_mask,
2228
  "decoder_head_mask": decoder_head_mask,
2229
  "cross_attn_head_mask": cross_attn_head_mask,
2230
- "use_cache": use_cache, # change this to avoid caching (presumably for debugging)
 
2231
  }
2232
 
2233
  def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
@@ -2239,11 +2323,13 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2239
  for layer_past in past_key_values:
2240
  # cached cross_attention states don't have to be reordered -> they are always the same
2241
  reordered_past += (
2242
- tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
 
2243
  + layer_past[2:],
2244
  )
2245
  return reordered_past
2246
 
 
2247
  @dataclass
2248
  class Florence2Seq2SeqLMOutput(ModelOutput):
2249
  """
@@ -2429,6 +2515,7 @@ FLORENCE2_INPUTS_DOCSTRING = r"""
2429
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
2430
  """
2431
 
 
2432
  @add_start_docstrings(
2433
  """The FLORENCE2 vision model without any head""",
2434
  FLORENCE2_START_DOCSTRING,
@@ -2436,11 +2523,12 @@ FLORENCE2_INPUTS_DOCSTRING = r"""
2436
  class Florence2VisionModel(Florence2PreTrainedModel):
2437
  def __init__(self, config: Florence2VisionConfig):
2438
  super().__init__(config)
2439
- assert config.model_type in ['davit', ""], 'only DaViT is supported for now'
 
2440
  self.vision_tower = DaViT.from_config(config=config)
2441
 
2442
  self.post_init()
2443
-
2444
  def forward(self, pixel_values):
2445
  if len(pixel_values.shape) == 4:
2446
  x = self.vision_tower.forward_features_unpool(pixel_values)
@@ -2456,13 +2544,14 @@ class Florence2VisionModel(Florence2PreTrainedModel):
2456
  class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2457
  def __init__(self, config: Florence2VisionConfig):
2458
  super().__init__(config)
2459
- assert config.model_type in ['davit', ''], 'only DaViT is supported for now'
 
2460
  self.vision_tower = DaViT.from_config(config=config)
2461
 
2462
  self._build_image_projection_layers(config)
2463
 
2464
  self.post_init()
2465
-
2466
  def _build_image_projection_layers(self, config):
2467
  image_dim_out = config.dim_embed[-1]
2468
  dim_projection = config.projection_dim
@@ -2498,7 +2587,7 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2498
  x = self.vision_tower.forward_features_unpool(pixel_values)
2499
  else:
2500
  raise ValueError(f'invalid image shape {pixel_values.shape}')
2501
-
2502
  if self.image_pos_embed is not None:
2503
  x = x.view(batch_size * T, -1, x.shape[-1])
2504
  num_tokens = x.shape[-2]
@@ -2510,15 +2599,18 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2510
  x = x.view(batch_size, T * h*w, x.shape[-1])
2511
 
2512
  if self.visual_temporal_embed is not None:
2513
- visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
2514
- x = x.view(batch_size, T, -1, x.shape[-1]) + visual_temporal_embed.view(1, T, 1, x.shape[-1])
 
 
2515
 
2516
  x_feat_dict = {}
2517
 
2518
  spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
2519
  x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x
2520
 
2521
- temporal_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=1)
 
2522
  x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x
2523
 
2524
  x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
@@ -2527,7 +2619,8 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2527
  new_x = []
2528
  for _image_feature_source in self.image_feature_source:
2529
  if _image_feature_source not in x_feat_dict:
2530
- raise ValueError('invalid image feature source: {}'.format(_image_feature_source))
 
2531
  new_x.append(x_feat_dict[_image_feature_source])
2532
 
2533
  x = torch.cat(new_x, dim=1)
@@ -2535,11 +2628,9 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2535
  x = x @ self.image_projection
2536
  x = self.image_proj_norm(x)
2537
 
2538
-
2539
  return x
2540
 
2541
 
2542
-
2543
  @add_start_docstrings(
2544
  """The FLORENCE2 model which consists of a vision backbone and a language model.""",
2545
  FLORENCE2_START_DOCSTRING,
@@ -2547,9 +2638,10 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2547
  class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2548
  def __init__(self, config: Florence2Config):
2549
  super().__init__(config)
2550
- assert config.vision_config.model_type in ['davit', ''], 'only DaViT is supported for now'
 
2551
  self.vision_tower = DaViT.from_config(config=config.vision_config)
2552
- # remove unused layers
2553
  del self.vision_tower.head
2554
  del self.vision_tower.norms
2555
 
@@ -2557,10 +2649,12 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2557
  self._attn_implementation = config._attn_implementation
2558
  self._build_image_projection_layers(config)
2559
 
2560
- language_model = Florence2LanguageForConditionalGeneration(config=config.text_config)
 
2561
 
2562
  if language_model._tied_weights_keys is not None:
2563
- self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
 
2564
  self.language_model = language_model
2565
  self.character_character_matching_head = nn.Sequential(
2566
  nn.Linear(2 * 768, config.projection_dim),
@@ -2584,16 +2678,17 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2584
  nn.Linear(config.projection_dim, 1)
2585
  )
2586
  self.text_classification_head = nn.Linear(config.projection_dim, 1)
2587
- self.character_embedding_projection = nn.Linear(config.projection_dim, 768)
 
2588
 
2589
  self._init_weights(self.character_character_matching_head)
2590
  self._init_weights(self.text_character_matching_head)
2591
  self._init_weights(self.text_tail_matching_head)
2592
  self._init_weights(self.text_classification_head)
2593
-
2594
  self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
2595
  self.post_init()
2596
-
2597
  def _init_weights(self, m):
2598
  if isinstance(m, nn.Linear):
2599
  trunc_normal_(m.weight, std=0.02)
@@ -2613,7 +2708,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2613
  elif isinstance(m, nn.Sequential):
2614
  for layer in m:
2615
  self._init_weights(layer)
2616
-
2617
  def _build_image_projection_layers(self, config):
2618
  image_dim_out = config.vision_config.dim_embed[-1]
2619
  dim_projection = config.vision_config.projection_dim
@@ -2652,13 +2747,14 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2652
  return self.language_model.get_input_embeddings()
2653
 
2654
  def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
2655
- model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
 
2656
  # update vocab size
2657
  self.config.text_config.vocab_size = model_embeds.num_embeddings
2658
  self.config.vocab_size = model_embeds.num_embeddings
2659
  self.vocab_size = model_embeds.num_embeddings
2660
  return model_embeds
2661
-
2662
  def _encode_image(self, pixel_values):
2663
  if len(pixel_values.shape) == 4:
2664
  batch_size, C, H, W = pixel_values.shape
@@ -2666,7 +2762,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2666
  x = self.vision_tower.forward_features_unpool(pixel_values)
2667
  else:
2668
  raise ValueError(f'invalid image shape {pixel_values.shape}')
2669
-
2670
  if self.image_pos_embed is not None:
2671
  x = x.view(batch_size * T, -1, x.shape[-1])
2672
  num_tokens = x.shape[-2]
@@ -2678,15 +2774,18 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2678
  x = x.view(batch_size, T * h*w, x.shape[-1])
2679
 
2680
  if self.visual_temporal_embed is not None:
2681
- visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
2682
- x = x.view(batch_size, T, -1, x.shape[-1]) + visual_temporal_embed.view(1, T, 1, x.shape[-1])
 
 
2683
 
2684
  x_feat_dict = {}
2685
 
2686
  spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
2687
  x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x
2688
 
2689
- temporal_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=1)
 
2690
  x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x
2691
 
2692
  x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
@@ -2695,7 +2794,8 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2695
  new_x = []
2696
  for _image_feature_source in self.image_feature_source:
2697
  if _image_feature_source not in x_feat_dict:
2698
- raise ValueError('invalid image feature source: {}'.format(_image_feature_source))
 
2699
  new_x.append(x_feat_dict[_image_feature_source])
2700
 
2701
  x = torch.cat(new_x, dim=1)
@@ -2703,14 +2803,15 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2703
  x = x @ self.image_projection
2704
  x = self.image_proj_norm(x)
2705
 
2706
- return x
2707
 
2708
  def _merge_input_ids_with_image_features(
2709
- self, image_features, inputs_embeds
2710
  ):
2711
  batch_size, image_token_length = image_features.size()[:-1]
2712
  device = image_features.device
2713
- image_attention_mask = torch.ones(batch_size, image_token_length, device=device)
 
2714
 
2715
  # task_prefix_embeds: [batch_size, padded_context_length, hidden_size]
2716
  # task_prefix_attention_mask: [batch_size, context_length]
@@ -2718,17 +2819,19 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2718
  return image_features, image_attention_mask
2719
 
2720
  task_prefix_embeds = inputs_embeds
2721
- task_prefix_attention_mask = torch.ones(batch_size, task_prefix_embeds.size(1), device=device)
 
2722
 
2723
  if len(task_prefix_attention_mask.shape) == 3:
2724
  task_prefix_attention_mask = task_prefix_attention_mask[:, 0]
2725
 
2726
  # concat [image embeds, task prefix embeds]
2727
  inputs_embeds = torch.cat([image_features, task_prefix_embeds], dim=1)
2728
- attention_mask = torch.cat([image_attention_mask, task_prefix_attention_mask], dim=1)
 
2729
 
2730
  return inputs_embeds, attention_mask
2731
-
2732
  @torch.no_grad()
2733
  def predict_detections_and_associations(
2734
  self,
@@ -2740,7 +2843,8 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2740
  essential_text_threshold=0.8,
2741
  ):
2742
  batch_inputs = processor(
2743
- batch_input_text=["Find all panels, texts, characters, and speech-bubble tails in the image."] * len(images),
 
2744
  batch_input_list_of_list_of_bboxes=[[]] * len(images),
2745
  batch_images=images,
2746
  padding=True,
@@ -2758,13 +2862,16 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2758
  do_sample=False,
2759
  num_beams=3,
2760
  )
2761
- generated_texts, list_of_list_of_list_of_bboxes, batch_indices_of_bboxes_in_generated_text = processor.postprocess_output(generated_ids, images)
2762
- map_to_category = {"<pa": "panels", "<te": "texts", "<ch": "characters", "<ta": "tails"}
 
 
2763
 
2764
  results = []
2765
 
2766
  for generated_text, batch_indices_of_bboxes_in_generated_text, list_of_list_of_bboxes in zip(generated_texts, batch_indices_of_bboxes_in_generated_text, list_of_list_of_list_of_bboxes):
2767
- categories = [map_to_category.get(generated_text[j:j+3], None) for i, j in batch_indices_of_bboxes_in_generated_text]
 
2768
  result_for_this_image = {
2769
  "panels": [],
2770
  "texts": [],
@@ -2779,9 +2886,11 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2779
 
2780
  cleaned_generated_ids = []
2781
  for generated_id in generated_ids:
2782
- index_of_last_bos = torch.where(generated_id == processor.tokenizer.bos_token_id)[0][-1].item()
 
2783
  cleaned_generated_ids.append(generated_id[index_of_last_bos:])
2784
- cleaned_generated_ids = pad_sequence(cleaned_generated_ids, batch_first=True, padding_value=processor.tokenizer.pad_token_id)
 
2785
  association_outputs = self(
2786
  input_ids=batch_inputs["input_ids"],
2787
  pixel_values=batch_inputs["pixel_values"],
@@ -2790,17 +2899,21 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2790
  )
2791
 
2792
  for img_idx in range(len(results)):
2793
- character_cluster_labels = UnionFind.from_adj_matrix(association_outputs.character_character_affinity_matrices[img_idx] > character_character_association_threshold).get_labels_for_connected_components()
2794
- text_character_association = torch.nonzero(association_outputs.text_character_association_matrices[img_idx] > text_character_association_threshold).tolist()
2795
- text_tail_association = torch.nonzero(association_outputs.text_tail_association_matrices[img_idx] > text_tail_association_threshold).tolist()
2796
- essential_text_logits = (association_outputs.essential_text_logits[img_idx] > essential_text_threshold).tolist()
 
 
 
 
2797
  results[img_idx]["character_cluster_labels"] = character_cluster_labels
2798
  results[img_idx]["text_character_associations"] = text_character_association
2799
  results[img_idx]["text_tail_associations"] = text_tail_association
2800
  results[img_idx]["is_essential_text"] = essential_text_logits
2801
 
2802
  return results
2803
-
2804
  @torch.no_grad()
2805
  def predict_ocr(
2806
  self,
@@ -2808,7 +2921,8 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2808
  processor,
2809
  ):
2810
  batch_inputs = processor(
2811
- batch_input_text=["What is the text in the image, with regions?"] * len(images),
 
2812
  batch_input_list_of_list_of_bboxes=[[]] * len(images),
2813
  batch_images=images,
2814
  padding=True,
@@ -2826,7 +2940,8 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2826
  do_sample=False,
2827
  num_beams=3,
2828
  )
2829
- generated_texts, list_of_list_of_list_of_bboxes, batch_indices_of_bboxes_in_generated_text = processor.postprocess_output(generated_ids, images)
 
2830
  results = []
2831
  for generated_text, batch_indices_of_bboxes_in_generated_text, list_of_list_of_bboxes in zip(generated_texts, batch_indices_of_bboxes_in_generated_text, list_of_list_of_list_of_bboxes):
2832
  ocr_texts = []
@@ -2850,9 +2965,10 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2850
  ):
2851
  def convert_caption_to_instruction(caption):
2852
  return "Locate the phrases in the caption: " + caption
2853
-
2854
  batch_inputs = processor(
2855
- batch_input_text=[convert_caption_to_instruction(caption) for caption in captions],
 
2856
  batch_input_list_of_list_of_bboxes=[[]] * len(images),
2857
  batch_images=images,
2858
  padding=True,
@@ -2880,7 +2996,8 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2880
  do_sample=False,
2881
  num_beams=3,
2882
  )
2883
- generated_texts, list_of_list_of_list_of_bboxes, batch_indices_of_bboxes_in_generated_text = processor.postprocess_output(generated_ids, images)
 
2884
  return [
2885
  {
2886
  "grounded_caption": generated_text,
@@ -2909,7 +3026,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2909
  output_attentions: Optional[bool] = None,
2910
  output_hidden_states: Optional[bool] = True,
2911
  return_dict: Optional[bool] = None,
2912
- tokenizer = None,
2913
  ) -> Union[Tuple, Florence2Seq2SeqLMOutput]:
2914
  assert output_hidden_states, "output_hidden_states must be True"
2915
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -2927,7 +3044,8 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2927
  if pixel_values is not None:
2928
  # (batch_size, num_image_tokens, hidden_size)
2929
  image_features = self._encode_image(pixel_values)
2930
- inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
 
2931
 
2932
  if inputs_embeds is not None:
2933
  attention_mask = attention_mask.to(inputs_embeds.dtype)
@@ -2949,10 +3067,14 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2949
  return_dict=return_dict,
2950
  )
2951
 
2952
- character_character_affinity_matrices = self.get_character_character_affinity_matrices(outputs.decoder_hidden_states[-1], decoder_input_ids, tokenizer, apply_sigmoid=True)
2953
- text_character_association_matrices = self.get_text_character_association_matrices(outputs.decoder_hidden_states[-1], decoder_input_ids, tokenizer, apply_sigmoid=True)
2954
- text_tail_association_matrices = self.get_text_tail_association_matrices(outputs.decoder_hidden_states[-1], decoder_input_ids, tokenizer, apply_sigmoid=True)
2955
- essential_text_logits = self.get_essential_text_logits(outputs.decoder_hidden_states[-1], decoder_input_ids, tokenizer, apply_sigmoid=True)
 
 
 
 
2956
 
2957
  return Florence2Seq2SeqLMOutput(
2958
  character_character_affinity_matrices=character_character_affinity_matrices,
@@ -2963,11 +3085,11 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2963
 
2964
  def generate(
2965
  self,
2966
- input_ids,
2967
  inputs_embeds=None,
2968
  pixel_values=None,
2969
  **kwargs
2970
- ):
2971
 
2972
  if inputs_embeds is None:
2973
  # 1. Extra the input embeddings
@@ -2976,14 +3098,15 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2976
  # 2. Merge text and images
2977
  if pixel_values is not None:
2978
  image_features = self._encode_image(pixel_values)
2979
- inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
2980
-
 
2981
  return self.language_model.generate(
2982
  input_ids=None,
2983
  inputs_embeds=inputs_embeds,
2984
  **kwargs
2985
  )
2986
-
2987
  def slowly_generate_grounded_caption(
2988
  self,
2989
  input_ids,
@@ -2999,9 +3122,12 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2999
  """
3000
  input_embeds = self.get_input_embeddings()(input_ids)
3001
  image_features = self._encode_image(pixel_values)
3002
- inputs_embeds, _ = self._merge_input_ids_with_image_features(image_features, input_embeds)
3003
- decoder_input_ids = processor.tokenizer(captions, return_tensors="pt", truncation=False, padding=True)["input_ids"].to(self.device)
3004
- running_indices = torch.zeros(decoder_input_ids.shape[0], dtype=torch.long, device=self.device)
 
 
 
3005
  running_decoder_input_ids = decoder_input_ids[:, :1]
3006
  num_tokens_generated = 1
3007
  CHUNK_SIZE = 8
@@ -3019,19 +3145,22 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
3019
  )[:, -(CHUNK_SIZE+1):-1]
3020
 
3021
  what_should_be_the_next_tokens = torch.stack([
3022
- decoder_input_ids[i, running_indices[i]+1:running_indices[i]+CHUNK_SIZE+1]
 
3023
  for i in range(decoder_input_ids.shape[0])
3024
  ])
3025
  # if the entire predicted chunk matches the next chunk, then we can saved some time and "jump" to the next chunk
3026
  if predicted_next_tokens.shape[1] == what_should_be_the_next_tokens.shape[1] and torch.all(predicted_next_tokens == what_should_be_the_next_tokens):
3027
  running_indices += CHUNK_SIZE
3028
- running_decoder_input_ids = torch.cat([running_decoder_input_ids, what_should_be_the_next_tokens], dim=-1)
 
3029
  continue
3030
-
3031
  # if, however, there is a deviation find the maximum prefix that matches in the batch
3032
 
3033
  predicted_next_tokens = predicted_next_tokens[:, 0]
3034
- predicted_next_token_strings = processor.batch_decode(predicted_next_tokens)
 
3035
  next_tokens_to_concat = []
3036
  for i, (pnts, pnt) in enumerate(zip(predicted_next_token_strings, predicted_next_tokens)):
3037
  if (pnts.startswith("<loc_") or pnts in ["<s>", "<pad>", "</s>"]) and running_indices[i] < decoder_input_ids.shape[1] - 1:
@@ -3039,15 +3168,18 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
3039
  else:
3040
  running_indices[i] += 1
3041
  if running_indices[i] >= decoder_input_ids.shape[1]:
3042
- next_tokens_to_concat.append(torch.tensor(processor.tokenizer.eos_token_id, device=self.device))
 
3043
  # elif "’" in pnts: # this is an annoying character which looks like ' (apostrophe) but isn't.
3044
  # import pdb; pdb.set_trace()
3045
  else:
3046
- next_tokens_to_concat.append(decoder_input_ids[i, running_indices[i]])
 
3047
  next_tokens_to_concat = torch.stack(next_tokens_to_concat)[:, None]
3048
  if (next_tokens_to_concat == processor.tokenizer.eos_token_id).all():
3049
  break
3050
- running_decoder_input_ids = torch.cat([running_decoder_input_ids, next_tokens_to_concat], dim=-1)
 
3051
  if num_tokens_generated >= 1024:
3052
  break
3053
  return running_decoder_input_ids
@@ -3078,7 +3210,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
3078
  remove_prefix_length = decoder_input_ids.shape[1] - 1
3079
 
3080
  decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
3081
-
3082
  return {
3083
  "input_ids": None, # encoder_outputs is defined. input_ids not needed
3084
  "encoder_outputs": encoder_outputs,
@@ -3090,105 +3222,146 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
3090
  "head_mask": head_mask,
3091
  "decoder_head_mask": decoder_head_mask,
3092
  "cross_attn_head_mask": cross_attn_head_mask,
3093
- "use_cache": use_cache, # change this to avoid caching (presumably for debugging)
 
3094
  }
3095
-
3096
  def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
3097
  return self.language_model.shift_tokens_right(labels)
3098
 
3099
  def _reorder_cache(self, *args, **kwargs):
3100
  return self.language_model._reorder_cache(*args, **kwargs)
3101
-
3102
  def get_character_character_affinity_matrices(self, decoder_hidden_states, decoder_input_ids, tokenizer, apply_sigmoid=False):
3103
  character_character_affinity_matrices = []
3104
  for index in range(len(decoder_hidden_states)):
3105
- character_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids('<character>')).nonzero().squeeze(-1)
 
3106
  character_embeddings = decoder_hidden_states[index][character_embedding_indices]
3107
  if character_embeddings.shape[0] == 0:
3108
- character_character_affinity_matrices.append(torch.zeros(0, 0).type_as(character_embeddings))
 
3109
  continue
3110
- character_embeddings = self.character_embedding_projection(character_embeddings)
3111
- char_i = repeat(character_embeddings, "i d -> i repeat d", repeat=character_embeddings.shape[0])
3112
- char_j = repeat(character_embeddings, "j d -> repeat j d", repeat=character_embeddings.shape[0])
 
 
 
3113
  char_ij = rearrange([char_i, char_j], "two i j d -> (i j) (two d)")
3114
- character_character_affinities = self.character_character_matching_head(char_ij)
3115
- character_character_affinities = rearrange(character_character_affinities, "(i j) 1 -> i j", i=char_i.shape[0])
3116
- character_character_affinities = (character_character_affinities + character_character_affinities.T) / 2
 
 
 
3117
  if apply_sigmoid:
3118
- character_character_affinities = torch.sigmoid(character_character_affinities)
3119
- character_character_affinity_matrices.append(character_character_affinities)
 
 
3120
  return character_character_affinity_matrices
3121
 
3122
  def get_text_character_association_matrices(self, decoder_hidden_states, decoder_input_ids, tokenizer, apply_sigmoid=False):
3123
  text_character_association_matrices = []
3124
  for index in range(len(decoder_hidden_states)):
3125
- text_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids('<text>')).nonzero().squeeze(-1)
 
3126
  text_embeddings = decoder_hidden_states[index][text_embedding_indices]
3127
- character_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids('<character>')).nonzero().squeeze(-1)
 
3128
  character_embeddings = decoder_hidden_states[index][character_embedding_indices]
3129
  if character_embeddings.shape[0] == 0 or text_embeddings.shape[0] == 0:
3130
- text_character_association_matrices.append(torch.zeros(text_embeddings.shape[0], character_embeddings.shape[0]).type_as(text_embeddings))
 
3131
  continue
3132
- text_i = repeat(text_embeddings, "i d -> i repeat d", repeat=character_embeddings.shape[0])
3133
- char_j = repeat(character_embeddings, "j d -> repeat j d", repeat=text_embeddings.shape[0])
3134
- text_char_ij = rearrange([text_i, char_j], "two i j d -> (i j) (two d)")
3135
- text_character_affinities = self.text_character_matching_head(text_char_ij)
3136
- text_character_affinities = rearrange(text_character_affinities, "(i j) 1 -> i j", i=text_i.shape[0])
 
 
 
 
 
3137
  if apply_sigmoid:
3138
- text_character_affinities = torch.sigmoid(text_character_affinities)
3139
- text_character_association_matrices.append(text_character_affinities)
 
 
3140
  return text_character_association_matrices
3141
 
3142
  def get_text_tail_association_matrices(self, decoder_hidden_states, decoder_input_ids, tokenizer, apply_sigmoid=False):
3143
  text_tail_association_matrices = []
3144
  for index in range(len(decoder_hidden_states)):
3145
- text_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids('<text>')).nonzero().squeeze(-1)
 
3146
  text_embeddings = decoder_hidden_states[index][text_embedding_indices]
3147
- tail_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids('<tail>')).nonzero().squeeze(-1)
 
3148
  tail_embeddings = decoder_hidden_states[index][tail_embedding_indices]
3149
  if tail_embeddings.shape[0] == 0 or text_embeddings.shape[0] == 0:
3150
- text_tail_association_matrices.append(torch.zeros(text_embeddings.shape[0], tail_embeddings.shape[0]).type_as(text_embeddings))
 
3151
  continue
3152
- text_i = repeat(text_embeddings, "i d -> i repeat d", repeat=tail_embeddings.shape[0])
3153
- tail_j = repeat(tail_embeddings, "j d -> repeat j d", repeat=text_embeddings.shape[0])
3154
- text_tail_ij = rearrange([text_i, tail_j], "two i j d -> (i j) (two d)")
 
 
 
3155
  text_tail_affinities = self.text_tail_matching_head(text_tail_ij)
3156
- text_tail_affinities = rearrange(text_tail_affinities, "(i j) 1 -> i j", i=text_i.shape[0])
 
3157
  if apply_sigmoid:
3158
  text_tail_affinities = torch.sigmoid(text_tail_affinities)
3159
  text_tail_association_matrices.append(text_tail_affinities)
3160
  return text_tail_association_matrices
3161
-
3162
  def get_tail_character_association_matrices(self, decoder_hidden_states, decoder_input_ids, tokenizer, apply_sigmoid=False):
3163
  tail_character_association_matrices = []
3164
  for index in range(len(decoder_hidden_states)):
3165
- tail_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids('<tail>')).nonzero().squeeze(-1)
 
3166
  tail_embeddings = decoder_hidden_states[index][tail_embedding_indices]
3167
- character_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids('<character>')).nonzero().squeeze(-1)
 
3168
  character_embeddings = decoder_hidden_states[index][character_embedding_indices]
3169
  if character_embeddings.shape[0] == 0 or tail_embeddings.shape[0] == 0:
3170
- tail_character_association_matrices.append(torch.zeros(tail_embeddings.shape[0], character_embeddings.shape[0]).type_as(tail_embeddings))
 
3171
  continue
3172
- tail_i = repeat(tail_embeddings, "i d -> i repeat d", repeat=character_embeddings.shape[0])
3173
- char_j = repeat(character_embeddings, "j d -> repeat j d", repeat=tail_embeddings.shape[0])
3174
- tail_char_ij = rearrange([tail_i, char_j], "two i j d -> (i j) (two d)")
3175
- tail_character_affinities = self.tail_character_matching_head(tail_char_ij)
3176
- tail_character_affinities = rearrange(tail_character_affinities, "(i j) 1 -> i j", i=tail_i.shape[0])
 
 
 
 
 
3177
  if apply_sigmoid:
3178
- tail_character_affinities = torch.sigmoid(tail_character_affinities)
3179
- tail_character_association_matrices.append(tail_character_affinities)
 
 
3180
  return tail_character_association_matrices
3181
-
3182
  def get_essential_text_logits(self, decoder_hidden_states, decoder_input_ids, tokenizer, apply_sigmoid=False):
3183
  essential_text_logits = []
3184
  for index in range(len(decoder_hidden_states)):
3185
- text_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids('<text>')).nonzero().squeeze(-1)
 
3186
  text_embeddings = decoder_hidden_states[index][text_embedding_indices]
3187
  if text_embeddings.shape[0] == 0:
3188
- essential_text_logits.append(torch.zeros(0).type_as(text_embeddings))
 
3189
  continue
3190
- text_logits = rearrange(self.text_classification_head(text_embeddings), "i 1 -> i")
 
3191
  if apply_sigmoid:
3192
  text_logits = torch.sigmoid(text_logits)
3193
  essential_text_logits.append(text_logits)
3194
- return essential_text_logits
 
23
  from torch import nn
24
  import torch.nn.functional as F
25
  import torch.utils.checkpoint as checkpoint
26
+ from torch.nn import CrossEntropyLoss
27
  from collections import OrderedDict
28
  from einops import rearrange, repeat
29
  from timm.models.layers import DropPath, trunc_normal_
 
41
  is_flash_attn_2_available,
42
  is_flash_attn_greater_or_equal_2_10,
43
  )
44
+ from .configuration_florence2 import Florence2Config
45
  from .configuration_florence2 import Florence2LanguageConfig
46
  from .configuration_florence2 import Florence2VisionConfig
47
  from pytorch_metric_learning.utils.loss_and_miner_utils import get_all_pairs_indices
 
72
 
73
  _CONFIG_FOR_DOC = "Florence2Config"
74
 
75
+
76
  class LearnedAbsolutePositionEmbedding2D(nn.Module):
77
  """
78
  This module learns positional embeddings up to a fixed maximum size.
 
81
  def __init__(self, embedding_dim=256, num_pos=50):
82
  super().__init__()
83
  self.row_embeddings = nn.Embedding(num_pos, embedding_dim // 2)
84
+ self.column_embeddings = nn.Embedding(
85
+ num_pos, embedding_dim - (embedding_dim // 2))
86
 
87
  def forward(self, pixel_values):
88
  """
 
97
  x_emb = self.column_embeddings(width_values)
98
  y_emb = self.row_embeddings(height_values)
99
  # (height, width, embedding_dim * 2)
100
+ pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1),
101
+ y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
102
  # (embedding_dim * 2, height, width)
103
  pos = pos.permute(2, 0, 1)
104
  pos = pos.unsqueeze(0)
 
108
  pos = pos.permute(0, 2, 3, 1)
109
  return pos
110
 
111
+
112
  class PositionalEmbeddingCosine1D(nn.Module):
113
  """
114
  This class implements a very simple positional encoding. It follows closely
 
120
  dropout_prob: The dropout probability.
121
  max_seq_len: The maximum length to precompute the positional encodings.
122
  """
123
+
124
  def __init__(
125
  self,
126
  embed_dim: int = 512,
 
176
  embed_dim: The dimension of the embeddings.
177
  max_seq_len: The maximum length to precompute the positional encodings.
178
  """
179
+
180
  def __init__(
181
  self,
182
  embedding_dim: int = 512,
 
202
  len_seq = seq_embeds.size(-2)
203
  assert len_seq <= self.num_pos
204
  # [T, D]
205
+ pos_embeds = self.embeddings(
206
+ torch.arange(len_seq).to(seq_embeds.device))
207
  # Adapt pre-computed positional embeddings to the input.
208
  if shape_len == 3:
209
  pos_embeds = pos_embeds.view(
 
211
  return pos_embeds
212
 
213
 
 
214
  class MySequential(nn.Sequential):
215
  def forward(self, *inputs):
216
  for module in self._modules.values():
 
355
  def forward(self, x, size):
356
  B, N, C = x.shape
357
 
358
+ qkv = self.qkv(x).reshape(B, N, 3, self.groups, C //
359
+ self.groups).permute(2, 0, 3, 1, 4)
360
  q, k, v = qkv[0], qkv[1], qkv[2]
361
 
362
  q = q * (float(N) ** -0.5)
 
375
  conv_at_attn=True, conv_at_ffn=True):
376
  super().__init__()
377
 
378
+ drop_path = DropPath(
379
+ drop_path_rate) if drop_path_rate > 0. else nn.Identity()
380
 
381
+ self.conv1 = PreNorm(None, DepthWiseConv2d(
382
+ dim, 3, 1, 1)) if conv_at_attn else None
383
  self.channel_attn = PreNorm(
384
  norm_layer(dim),
385
  ChannelAttention(dim, groups=groups, qkv_bias=qkv_bias),
386
  drop_path
387
  )
388
+ self.conv2 = PreNorm(None, DepthWiseConv2d(
389
+ dim, 3, 1, 1)) if conv_at_ffn else None
390
  self.ffn = PreNorm(
391
  norm_layer(dim),
392
+ Mlp(in_features=dim, hidden_features=int(
393
+ dim*mlp_ratio), act_layer=act_layer),
394
  drop_path
395
  )
396
 
 
408
 
409
  def window_partition(x, window_size: int):
410
  B, H, W, C = x.shape
411
+ x = x.view(B, H // window_size, window_size,
412
+ W // window_size, window_size, C)
413
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous(
414
+ ).view(-1, window_size, window_size, C)
415
  return windows
416
 
417
 
418
  def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
419
+ B = batch_size
420
  # this will cause onnx conversion failed for dynamic axis, because treated as constant
421
+ # int(windows.shape[0] / (H * W / window_size / window_size))
422
+ x = windows.view(B, H // window_size, W // window_size,
423
+ window_size, window_size, -1)
424
  x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
425
  return x
426
 
 
461
  # attn_windows = self.attn(x_windows)
462
 
463
  B_, N, C = x.shape
464
+ qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C //
465
+ self.num_heads).permute(2, 0, 3, 1, 4)
466
  q, k, v = qkv[0], qkv[1], qkv[2]
467
 
468
  q = q * self.scale
 
493
  norm_layer=nn.LayerNorm, conv_at_attn=True, conv_at_ffn=True):
494
  super().__init__()
495
 
496
+ drop_path = DropPath(
497
+ drop_path_rate) if drop_path_rate > 0. else nn.Identity()
498
 
499
+ self.conv1 = PreNorm(None, DepthWiseConv2d(
500
+ dim, 3, 1, 1)) if conv_at_attn else None
501
  self.window_attn = PreNorm(
502
  norm_layer(dim),
503
  WindowAttention(dim, num_heads, window_size, qkv_bias=qkv_bias),
504
  drop_path
505
  )
506
+ self.conv2 = PreNorm(None, DepthWiseConv2d(
507
+ dim, 3, 1, 1)) if conv_at_ffn else None
508
  self.ffn = PreNorm(
509
  norm_layer(dim),
510
+ Mlp(in_features=dim, hidden_features=int(
511
+ dim*mlp_ratio), act_layer=act_layer),
512
  drop_path
513
  )
514
 
 
566
  enable_checkpoint=True,
567
  conv_at_attn=True,
568
  conv_at_ffn=True,
569
+ ):
570
  super().__init__()
571
 
572
  self.num_classes = num_classes
 
578
  assert self.num_stages == len(self.num_heads) == len(self.num_groups)
579
 
580
  num_stages = len(embed_dims)
581
+ dpr = [x.item() for x in torch.linspace(
582
+ 0, drop_path_rate, sum(depths)*2)]
583
 
584
  depth_offset = 0
585
  convs = []
 
633
 
634
  self.norms = norm_layer(self.embed_dims[-1])
635
  self.avgpool = nn.AdaptiveAvgPool1d(1)
636
+ self.head = nn.Linear(
637
+ self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
638
 
639
  self.apply(self._init_weights)
640
 
 
669
  for conv, block in zip(self.convs, self.blocks):
670
  x, input_size = conv(x, input_size)
671
  if self.enable_checkpoint:
672
+ x, input_size = checkpoint.checkpoint(
673
+ block, x, input_size, use_reentrant=True)
674
  else:
675
  x, input_size = block(x, input_size)
676
  return x
 
690
  x = self.forward_features(x)
691
  x = self.head(x)
692
  return x
693
+
694
  @classmethod
695
  def from_config(cls, config):
696
  return cls(
 
707
  )
708
 
709
 
 
 
710
  if is_flash_attn_2_available():
711
  from flash_attn import flash_attn_func, flash_attn_varlen_func
712
  from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
713
 
714
  # Copied from transformers.models.llama.modeling_llama._get_unpad_data
715
+
716
+
717
  def _get_unpad_data(attention_mask):
718
  seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
719
  indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
720
  max_seqlen_in_batch = seqlens_in_batch.max().item()
721
+ cu_seqlens = F.pad(torch.cumsum(
722
+ seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
723
  return (
724
  indices,
725
  cu_seqlens,
 
857
  if past_key_value[0] is not None:
858
  key_states = torch.cat([past_key_value[0], key_states], dim=2)
859
  if past_key_value[1] is not None:
860
+ value_states = torch.cat(
861
+ [past_key_value[1], value_states], dim=2)
862
  else:
863
  # self_attention
864
  key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
 
875
  past_key_value = (key_states, value_states)
876
 
877
  proj_shape = (bsz * self.num_heads, -1, self.head_dim)
878
+ query_states = self._shape(
879
+ query_states, tgt_len, bsz).view(*proj_shape)
880
  key_states = key_states.reshape(*proj_shape)
881
  value_states = value_states.reshape(*proj_shape)
882
 
 
894
  raise ValueError(
895
  f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
896
  )
897
+ attn_weights = attn_weights.view(
898
+ bsz, self.num_heads, tgt_len, src_len) + attention_mask
899
+ attn_weights = attn_weights.view(
900
+ bsz * self.num_heads, tgt_len, src_len)
901
 
902
  attn_weights = nn.functional.softmax(attn_weights, dim=-1)
903
 
 
907
  f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
908
  f" {layer_head_mask.size()}"
909
  )
910
+ attn_weights = layer_head_mask.view(
911
+ 1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
912
+ attn_weights = attn_weights.view(
913
+ bsz * self.num_heads, tgt_len, src_len)
914
 
915
  if output_attentions:
916
  # this operation is a bit awkward, but it's required to
917
  # make sure that attn_weights keeps its gradient.
918
  # In order to do so, attn_weights have to be reshaped
919
  # twice and have to be reused in the following
920
+ attn_weights_reshaped = attn_weights.view(
921
+ bsz, self.num_heads, tgt_len, src_len)
922
+ attn_weights = attn_weights_reshaped.view(
923
+ bsz * self.num_heads, tgt_len, src_len)
924
  else:
925
  attn_weights_reshaped = None
926
 
927
+ attn_probs = nn.functional.dropout(
928
+ attn_weights, p=self.dropout, training=self.training)
929
 
930
  attn_output = torch.bmm(attn_probs, value_states)
931
 
 
935
  f" {attn_output.size()}"
936
  )
937
 
938
+ attn_output = attn_output.view(
939
+ bsz, self.num_heads, tgt_len, self.head_dim)
940
  attn_output = attn_output.transpose(1, 2)
941
 
942
  # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
 
978
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
979
  # Florence2FlashAttention2 attention does not support output_attentions
980
  if output_attentions:
981
+ raise ValueError(
982
+ "Florence2FlashAttention2 attention does not support output_attentions")
983
 
984
  # if key_value_states are provided this layer is used as a cross-attention layer
985
  # for the decoder
 
1004
  elif is_cross_attention:
1005
  # cross_attentions
1006
  key_states = self._reshape(self.k_proj(key_value_states), -1, bsz)
1007
+ value_states = self._reshape(
1008
+ self.v_proj(key_value_states), -1, bsz)
1009
  elif past_key_value is not None:
1010
  # reuse k, v, self_attention
1011
  key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
1012
  value_states = self._reshape(self.v_proj(hidden_states), -1, bsz)
1013
+ key_states = torch.cat(
1014
+ [past_key_value[0].transpose(1, 2), key_states], dim=1)
1015
+ value_states = torch.cat(
1016
+ [past_key_value[1].transpose(1, 2), value_states], dim=1)
1017
  else:
1018
  # self_attention
1019
  key_states = self._reshape(self.k_proj(hidden_states), -1, bsz)
 
1027
  # all previous decoder key/value_states. Further calls to uni-directional self-attention
1028
  # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
1029
  # if encoder bi-directional self-attention `past_key_value` is always `None`
1030
+ past_key_value = (key_states.transpose(
1031
+ 1, 2), value_states.transpose(1, 2))
1032
 
1033
  kv_seq_len = key_states.shape[-2]
1034
  if past_key_value is not None:
 
1124
  causal=causal,
1125
  )
1126
 
1127
+ attn_output = pad_input(
1128
+ attn_output_unpad, indices_q, batch_size, query_length)
1129
  else:
1130
  attn_output = flash_attn_func(
1131
  query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
 
1135
 
1136
  # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
1137
  def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
1138
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(
1139
+ attention_mask)
1140
  batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
1141
 
1142
  key_layer = index_first_axis(
1143
+ key_layer.reshape(batch_size * kv_seq_len,
1144
+ num_key_value_heads, head_dim), indices_k
1145
  )
1146
  value_layer = index_first_axis(
1147
+ value_layer.reshape(batch_size * kv_seq_len,
1148
+ num_key_value_heads, head_dim), indices_k
1149
  )
1150
  if query_length == kv_seq_len:
1151
  query_layer = index_first_axis(
1152
+ query_layer.reshape(batch_size * kv_seq_len,
1153
+ self.num_heads, head_dim), indices_k
1154
  )
1155
  cu_seqlens_q = cu_seqlens_k
1156
  max_seqlen_in_batch_q = max_seqlen_in_batch_k
 
1165
  else:
1166
  # The -q_len: slice assumes left padding.
1167
  attention_mask = attention_mask[:, -query_length:]
1168
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(
1169
+ query_layer, attention_mask)
1170
 
1171
  return (
1172
  query_layer,
 
1236
  if past_key_value[0] is not None:
1237
  key_states = torch.cat([past_key_value[0], key_states], dim=2)
1238
  if past_key_value[1] is not None:
1239
+ value_states = torch.cat(
1240
+ [past_key_value[1], value_states], dim=2)
1241
  else:
1242
  # self_attention
1243
  key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
 
1339
  layer_head_mask=layer_head_mask,
1340
  output_attentions=output_attentions,
1341
  )
1342
+ hidden_states = nn.functional.dropout(
1343
+ hidden_states, p=self.dropout, training=self.training)
1344
  hidden_states = residual + hidden_states
1345
  hidden_states = self.self_attn_layer_norm(hidden_states)
1346
 
1347
  residual = hidden_states
1348
  hidden_states = self.activation_fn(self.fc1(hidden_states))
1349
+ hidden_states = nn.functional.dropout(
1350
+ hidden_states, p=self.activation_dropout, training=self.training)
1351
  hidden_states = self.fc2(hidden_states)
1352
+ hidden_states = nn.functional.dropout(
1353
+ hidden_states, p=self.dropout, training=self.training)
1354
  hidden_states = residual + hidden_states
1355
  hidden_states = self.final_layer_norm(hidden_states)
1356
 
1357
  if hidden_states.dtype == torch.float16 and (
1358
+ torch.isinf(hidden_states).any() or torch.isnan(
1359
+ hidden_states).any()
1360
  ):
1361
  clamp_value = torch.finfo(hidden_states.dtype).max - 1000
1362
+ hidden_states = torch.clamp(
1363
+ hidden_states, min=-clamp_value, max=clamp_value)
1364
 
1365
  outputs = (hidden_states,)
1366
 
 
1434
 
1435
  # Self Attention
1436
  # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
1437
+ self_attn_past_key_value = past_key_value[:
1438
+ 2] if past_key_value is not None else None
1439
  # add present self-attn cache to positions 1,2 of present_key_value tuple
1440
  hidden_states, self_attn_weights, present_key_value = self.self_attn(
1441
  hidden_states=hidden_states,
 
1444
  layer_head_mask=layer_head_mask,
1445
  output_attentions=output_attentions,
1446
  )
1447
+ hidden_states = nn.functional.dropout(
1448
+ hidden_states, p=self.dropout, training=self.training)
1449
  hidden_states = residual + hidden_states
1450
  hidden_states = self.self_attn_layer_norm(hidden_states)
1451
 
 
1456
  residual = hidden_states
1457
 
1458
  # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
1459
+ cross_attn_past_key_value = past_key_value[-2:
1460
+ ] if past_key_value is not None else None
1461
  hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
1462
  hidden_states=hidden_states,
1463
  key_value_states=encoder_hidden_states,
 
1466
  past_key_value=cross_attn_past_key_value,
1467
  output_attentions=output_attentions,
1468
  )
1469
+ hidden_states = nn.functional.dropout(
1470
+ hidden_states, p=self.dropout, training=self.training)
1471
  hidden_states = residual + hidden_states
1472
  hidden_states = self.encoder_attn_layer_norm(hidden_states)
1473
 
 
1477
  # Fully Connected
1478
  residual = hidden_states
1479
  hidden_states = self.activation_fn(self.fc1(hidden_states))
1480
+ hidden_states = nn.functional.dropout(
1481
+ hidden_states, p=self.activation_dropout, training=self.training)
1482
  hidden_states = self.fc2(hidden_states)
1483
+ hidden_states = nn.functional.dropout(
1484
+ hidden_states, p=self.dropout, training=self.training)
1485
  hidden_states = residual + hidden_states
1486
  hidden_states = self.final_layer_norm(hidden_states)
1487
 
 
1496
  return outputs
1497
 
1498
 
 
1499
  class Florence2LanguagePreTrainedModel(PreTrainedModel):
1500
  config_class = Florence2LanguageConfig
1501
  base_model_prefix = "model"
 
1520
  @property
1521
  def dummy_inputs(self):
1522
  pad_token = self.config.pad_token_id
1523
+ input_ids = torch.tensor(
1524
+ [[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
1525
  dummy_inputs = {
1526
  "attention_mask": input_ids.ne(pad_token),
1527
  "input_ids": input_ids,
 
1561
  config.max_position_embeddings,
1562
  embed_dim,
1563
  )
1564
+ self.layers = nn.ModuleList([Florence2EncoderLayer(
1565
+ config) for _ in range(config.encoder_layers)])
1566
  self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1567
  self._use_sdpa = config._attn_implementation == "sdpa"
1568
  self.layernorm_embedding = nn.LayerNorm(embed_dim)
 
1631
 
1632
  # retrieve input_ids and inputs_embeds
1633
  if input_ids is not None and inputs_embeds is not None:
1634
+ raise ValueError(
1635
+ "You cannot specify both input_ids and inputs_embeds at the same time")
1636
  elif input_ids is not None:
1637
  input = input_ids
1638
  input_ids = input_ids.view(-1, input_ids.shape[-1])
1639
  elif inputs_embeds is not None:
1640
  input = inputs_embeds[:, :, -1]
1641
  else:
1642
+ raise ValueError(
1643
+ "You have to specify either input_ids or inputs_embeds")
1644
 
1645
  if inputs_embeds is None:
1646
  inputs_embeds = self.embed_tokens(input_ids)
 
1650
 
1651
  hidden_states = inputs_embeds + embed_pos
1652
  hidden_states = self.layernorm_embedding(hidden_states)
1653
+ hidden_states = nn.functional.dropout(
1654
+ hidden_states, p=self.dropout, training=self.training)
1655
 
1656
  # expand attention_mask
1657
  if attention_mask is not None:
 
1661
  # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
1662
  # the manual implementation that requires a 4D causal mask in all cases.
1663
  # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1664
+ attention_mask = _prepare_4d_attention_mask_for_sdpa(
1665
+ attention_mask, inputs_embeds.dtype)
1666
  else:
1667
  # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1668
+ attention_mask = _prepare_4d_attention_mask(
1669
+ attention_mask, inputs_embeds.dtype)
1670
 
1671
  encoder_states = () if output_hidden_states else None
1672
  all_attentions = () if output_attentions else None
 
1704
  layer_outputs = encoder_layer(
1705
  hidden_states,
1706
  attention_mask,
1707
+ layer_head_mask=(
1708
+ head_mask[idx] if head_mask is not None else None),
1709
  output_attentions=output_attentions,
1710
  )
1711
 
 
1739
  self.layerdrop = config.decoder_layerdrop
1740
  self.padding_idx = config.pad_token_id
1741
  self.max_target_positions = config.max_position_embeddings
1742
+ embed_scale = math.sqrt(
1743
+ config.d_model) if config.scale_embedding else 1.0
1744
 
1745
  self.embed_tokens = Florence2ScaledWordEmbedding(
1746
  config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
 
1753
  config.max_position_embeddings,
1754
  config.d_model,
1755
  )
1756
+ self.layers = nn.ModuleList([Florence2DecoderLayer(
1757
+ config) for _ in range(config.decoder_layers)])
1758
  self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1759
  self._use_sdpa = config._attn_implementation == "sdpa"
1760
 
 
1859
 
1860
  # retrieve input_ids and inputs_embeds
1861
  if input_ids is not None and inputs_embeds is not None:
1862
+ raise ValueError(
1863
+ "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
1864
  elif input_ids is not None:
1865
  input = input_ids
1866
  input_shape = input.shape
 
1869
  input_shape = inputs_embeds.size()[:-1]
1870
  input = inputs_embeds[:, :, -1]
1871
  else:
1872
+ raise ValueError(
1873
+ "You have to specify either decoder_input_ids or decoder_inputs_embeds")
1874
 
1875
  # past_key_values_length
1876
+ past_key_values_length = past_key_values[0][0].shape[
1877
+ 2] if past_key_values and past_key_values[0] and past_key_values[0][0] is not None else 0
1878
 
1879
  if inputs_embeds is None:
1880
  inputs_embeds = self.embed_tokens(input)
1881
 
1882
  if self._use_flash_attention_2:
1883
  # 2d mask is passed through the layers
1884
+ attention_mask = attention_mask if (
1885
+ attention_mask is not None and 0 in attention_mask) else None
1886
  elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None:
1887
  # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
1888
  # the manual implementation that requires a 4D causal mask in all cases.
 
1924
  hidden_states = inputs_embeds + positions
1925
  hidden_states = self.layernorm_embedding(hidden_states)
1926
 
1927
+ hidden_states = nn.functional.dropout(
1928
+ hidden_states, p=self.dropout, training=self.training)
1929
 
1930
  if self.gradient_checkpointing and self.training:
1931
  if use_cache:
 
1937
  # decoder layers
1938
  all_hidden_states = () if output_hidden_states else None
1939
  all_self_attns = () if output_attentions else None
1940
+ all_cross_attentions = () if (
1941
+ output_attentions and encoder_hidden_states is not None) else None
1942
  next_decoder_cache = () if use_cache else None
1943
 
1944
  # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
 
1980
  attention_mask=attention_mask,
1981
  encoder_hidden_states=encoder_hidden_states,
1982
  encoder_attention_mask=encoder_attention_mask,
1983
+ layer_head_mask=(
1984
+ head_mask[idx] if head_mask is not None else None),
1985
  cross_attn_layer_head_mask=(
1986
  cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
1987
  ),
 
1992
  hidden_states = layer_outputs[0]
1993
 
1994
  if use_cache:
1995
+ next_decoder_cache += (
1996
+ layer_outputs[3 if output_attentions else 1],)
1997
 
1998
  if output_attentions:
1999
  all_self_attns += (layer_outputs[1],)
 
2022
 
2023
 
2024
  class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
2025
+ _tied_weights_keys = ["encoder.embed_tokens.weight",
2026
+ "decoder.embed_tokens.weight"]
2027
 
2028
  def __init__(self, config: Florence2LanguageConfig):
2029
  super().__init__(config)
 
2109
  elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
2110
  encoder_outputs = BaseModelOutput(
2111
  last_hidden_state=encoder_outputs[0],
2112
+ hidden_states=encoder_outputs[1] if len(
2113
+ encoder_outputs) > 1 else None,
2114
+ attentions=encoder_outputs[2] if len(
2115
+ encoder_outputs) > 2 else None,
2116
  )
2117
 
2118
  # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
 
2148
 
2149
  class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel, GenerationMixin):
2150
  base_model_prefix = "model"
2151
+ _tied_weights_keys = ["encoder.embed_tokens.weight",
2152
+ "decoder.embed_tokens.weight", "lm_head.weight"]
2153
  _keys_to_ignore_on_load_missing = ["final_logits_bias"]
2154
 
2155
  def __init__(self, config: Florence2LanguageConfig):
2156
  super().__init__(config)
2157
  self.model = Florence2LanguageModel(config)
2158
+ self.register_buffer("final_logits_bias", torch.zeros(
2159
+ (1, self.model.shared.num_embeddings)))
2160
+ self.lm_head = nn.Linear(
2161
+ config.d_model, self.model.shared.num_embeddings, bias=False)
2162
 
2163
  # Initialize weights and apply final processing
2164
  self.post_init()
 
2170
  return self.model.get_decoder()
2171
 
2172
  def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
2173
+ new_embeddings = super().resize_token_embeddings(
2174
+ new_num_tokens, pad_to_multiple_of)
2175
  self._resize_final_logits_bias(new_embeddings.weight.shape[0])
2176
  return new_embeddings
2177
 
 
2180
  if new_num_tokens <= old_num_tokens:
2181
  new_bias = self.final_logits_bias[:, :new_num_tokens]
2182
  else:
2183
+ extra_bias = torch.zeros(
2184
+ (1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
2185
  new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
2186
  self.register_buffer("final_logits_bias", new_bias)
2187
 
 
2222
 
2223
  if labels is not None:
2224
  if use_cache:
2225
+ logger.warning(
2226
+ "The `use_cache` argument is changed to `False` since `labels` is provided.")
2227
  use_cache = False
2228
  if decoder_input_ids is None and decoder_inputs_embeds is None:
2229
  decoder_input_ids = shift_tokens_right(
 
2255
  if labels is not None:
2256
  labels = labels.to(lm_logits.device)
2257
  loss_fct = CrossEntropyLoss()
2258
+ masked_lm_loss = loss_fct(
2259
+ lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
2260
 
2261
  if not return_dict:
2262
  output = (lm_logits,) + outputs[1:]
 
2310
  "head_mask": head_mask,
2311
  "decoder_head_mask": decoder_head_mask,
2312
  "cross_attn_head_mask": cross_attn_head_mask,
2313
+ # change this to avoid caching (presumably for debugging)
2314
+ "use_cache": use_cache,
2315
  }
2316
 
2317
  def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
 
2323
  for layer_past in past_key_values:
2324
  # cached cross_attention states don't have to be reordered -> they are always the same
2325
  reordered_past += (
2326
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device))
2327
+ for past_state in layer_past[:2])
2328
  + layer_past[2:],
2329
  )
2330
  return reordered_past
2331
 
2332
+
2333
  @dataclass
2334
  class Florence2Seq2SeqLMOutput(ModelOutput):
2335
  """
 
2515
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
2516
  """
2517
 
2518
+
2519
  @add_start_docstrings(
2520
  """The FLORENCE2 vision model without any head""",
2521
  FLORENCE2_START_DOCSTRING,
 
2523
  class Florence2VisionModel(Florence2PreTrainedModel):
2524
  def __init__(self, config: Florence2VisionConfig):
2525
  super().__init__(config)
2526
+ assert config.model_type in [
2527
+ 'davit', ""], 'only DaViT is supported for now'
2528
  self.vision_tower = DaViT.from_config(config=config)
2529
 
2530
  self.post_init()
2531
+
2532
  def forward(self, pixel_values):
2533
  if len(pixel_values.shape) == 4:
2534
  x = self.vision_tower.forward_features_unpool(pixel_values)
 
2544
  class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2545
  def __init__(self, config: Florence2VisionConfig):
2546
  super().__init__(config)
2547
+ assert config.model_type in [
2548
+ 'davit', ''], 'only DaViT is supported for now'
2549
  self.vision_tower = DaViT.from_config(config=config)
2550
 
2551
  self._build_image_projection_layers(config)
2552
 
2553
  self.post_init()
2554
+
2555
  def _build_image_projection_layers(self, config):
2556
  image_dim_out = config.dim_embed[-1]
2557
  dim_projection = config.projection_dim
 
2587
  x = self.vision_tower.forward_features_unpool(pixel_values)
2588
  else:
2589
  raise ValueError(f'invalid image shape {pixel_values.shape}')
2590
+
2591
  if self.image_pos_embed is not None:
2592
  x = x.view(batch_size * T, -1, x.shape[-1])
2593
  num_tokens = x.shape[-2]
 
2599
  x = x.view(batch_size, T * h*w, x.shape[-1])
2600
 
2601
  if self.visual_temporal_embed is not None:
2602
+ visual_temporal_embed = self.visual_temporal_embed(
2603
+ x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
2604
+ x = x.view(
2605
+ batch_size, T, -1, x.shape[-1]) + visual_temporal_embed.view(1, T, 1, x.shape[-1])
2606
 
2607
  x_feat_dict = {}
2608
 
2609
  spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
2610
  x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x
2611
 
2612
+ temporal_avg_pool_x = x.view(
2613
+ batch_size, T, -1, x.shape[-1]).mean(dim=1)
2614
  x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x
2615
 
2616
  x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
 
2619
  new_x = []
2620
  for _image_feature_source in self.image_feature_source:
2621
  if _image_feature_source not in x_feat_dict:
2622
+ raise ValueError(
2623
+ 'invalid image feature source: {}'.format(_image_feature_source))
2624
  new_x.append(x_feat_dict[_image_feature_source])
2625
 
2626
  x = torch.cat(new_x, dim=1)
 
2628
  x = x @ self.image_projection
2629
  x = self.image_proj_norm(x)
2630
 
 
2631
  return x
2632
 
2633
 
 
2634
  @add_start_docstrings(
2635
  """The FLORENCE2 model which consists of a vision backbone and a language model.""",
2636
  FLORENCE2_START_DOCSTRING,
 
2638
  class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2639
  def __init__(self, config: Florence2Config):
2640
  super().__init__(config)
2641
+ assert config.vision_config.model_type in [
2642
+ 'davit', ''], 'only DaViT is supported for now'
2643
  self.vision_tower = DaViT.from_config(config=config.vision_config)
2644
+ # remove unused layers
2645
  del self.vision_tower.head
2646
  del self.vision_tower.norms
2647
 
 
2649
  self._attn_implementation = config._attn_implementation
2650
  self._build_image_projection_layers(config)
2651
 
2652
+ language_model = Florence2LanguageForConditionalGeneration(
2653
+ config=config.text_config)
2654
 
2655
  if language_model._tied_weights_keys is not None:
2656
+ self._tied_weights_keys = [
2657
+ f"language_model.{k}" for k in language_model._tied_weights_keys]
2658
  self.language_model = language_model
2659
  self.character_character_matching_head = nn.Sequential(
2660
  nn.Linear(2 * 768, config.projection_dim),
 
2678
  nn.Linear(config.projection_dim, 1)
2679
  )
2680
  self.text_classification_head = nn.Linear(config.projection_dim, 1)
2681
+ self.character_embedding_projection = nn.Linear(
2682
+ config.projection_dim, 768)
2683
 
2684
  self._init_weights(self.character_character_matching_head)
2685
  self._init_weights(self.text_character_matching_head)
2686
  self._init_weights(self.text_tail_matching_head)
2687
  self._init_weights(self.text_classification_head)
2688
+
2689
  self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
2690
  self.post_init()
2691
+
2692
  def _init_weights(self, m):
2693
  if isinstance(m, nn.Linear):
2694
  trunc_normal_(m.weight, std=0.02)
 
2708
  elif isinstance(m, nn.Sequential):
2709
  for layer in m:
2710
  self._init_weights(layer)
2711
+
2712
  def _build_image_projection_layers(self, config):
2713
  image_dim_out = config.vision_config.dim_embed[-1]
2714
  dim_projection = config.vision_config.projection_dim
 
2747
  return self.language_model.get_input_embeddings()
2748
 
2749
  def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
2750
+ model_embeds = self.language_model.resize_token_embeddings(
2751
+ new_num_tokens, pad_to_multiple_of)
2752
  # update vocab size
2753
  self.config.text_config.vocab_size = model_embeds.num_embeddings
2754
  self.config.vocab_size = model_embeds.num_embeddings
2755
  self.vocab_size = model_embeds.num_embeddings
2756
  return model_embeds
2757
+
2758
  def _encode_image(self, pixel_values):
2759
  if len(pixel_values.shape) == 4:
2760
  batch_size, C, H, W = pixel_values.shape
 
2762
  x = self.vision_tower.forward_features_unpool(pixel_values)
2763
  else:
2764
  raise ValueError(f'invalid image shape {pixel_values.shape}')
2765
+
2766
  if self.image_pos_embed is not None:
2767
  x = x.view(batch_size * T, -1, x.shape[-1])
2768
  num_tokens = x.shape[-2]
 
2774
  x = x.view(batch_size, T * h*w, x.shape[-1])
2775
 
2776
  if self.visual_temporal_embed is not None:
2777
+ visual_temporal_embed = self.visual_temporal_embed(
2778
+ x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
2779
+ x = x.view(
2780
+ batch_size, T, -1, x.shape[-1]) + visual_temporal_embed.view(1, T, 1, x.shape[-1])
2781
 
2782
  x_feat_dict = {}
2783
 
2784
  spatial_avg_pool_x = x.view(batch_size, T, -1, x.shape[-1]).mean(dim=2)
2785
  x_feat_dict['spatial_avg_pool'] = spatial_avg_pool_x
2786
 
2787
+ temporal_avg_pool_x = x.view(
2788
+ batch_size, T, -1, x.shape[-1]).mean(dim=1)
2789
  x_feat_dict['temporal_avg_pool'] = temporal_avg_pool_x
2790
 
2791
  x = x.view(batch_size, T, -1, x.shape[-1])[:, -1]
 
2794
  new_x = []
2795
  for _image_feature_source in self.image_feature_source:
2796
  if _image_feature_source not in x_feat_dict:
2797
+ raise ValueError(
2798
+ 'invalid image feature source: {}'.format(_image_feature_source))
2799
  new_x.append(x_feat_dict[_image_feature_source])
2800
 
2801
  x = torch.cat(new_x, dim=1)
 
2803
  x = x @ self.image_projection
2804
  x = self.image_proj_norm(x)
2805
 
2806
+ return x
2807
 
2808
  def _merge_input_ids_with_image_features(
2809
+ self, image_features, inputs_embeds
2810
  ):
2811
  batch_size, image_token_length = image_features.size()[:-1]
2812
  device = image_features.device
2813
+ image_attention_mask = torch.ones(
2814
+ batch_size, image_token_length, device=device)
2815
 
2816
  # task_prefix_embeds: [batch_size, padded_context_length, hidden_size]
2817
  # task_prefix_attention_mask: [batch_size, context_length]
 
2819
  return image_features, image_attention_mask
2820
 
2821
  task_prefix_embeds = inputs_embeds
2822
+ task_prefix_attention_mask = torch.ones(
2823
+ batch_size, task_prefix_embeds.size(1), device=device)
2824
 
2825
  if len(task_prefix_attention_mask.shape) == 3:
2826
  task_prefix_attention_mask = task_prefix_attention_mask[:, 0]
2827
 
2828
  # concat [image embeds, task prefix embeds]
2829
  inputs_embeds = torch.cat([image_features, task_prefix_embeds], dim=1)
2830
+ attention_mask = torch.cat(
2831
+ [image_attention_mask, task_prefix_attention_mask], dim=1)
2832
 
2833
  return inputs_embeds, attention_mask
2834
+
2835
  @torch.no_grad()
2836
  def predict_detections_and_associations(
2837
  self,
 
2843
  essential_text_threshold=0.8,
2844
  ):
2845
  batch_inputs = processor(
2846
+ batch_input_text=[
2847
+ "Find all panels, texts, characters, and speech-bubble tails in the image."] * len(images),
2848
  batch_input_list_of_list_of_bboxes=[[]] * len(images),
2849
  batch_images=images,
2850
  padding=True,
 
2862
  do_sample=False,
2863
  num_beams=3,
2864
  )
2865
+ generated_texts, list_of_list_of_list_of_bboxes, batch_indices_of_bboxes_in_generated_text = processor.postprocess_output(
2866
+ generated_ids, images)
2867
+ map_to_category = {"<pa": "panels", "<te": "texts",
2868
+ "<ch": "characters", "<ta": "tails"}
2869
 
2870
  results = []
2871
 
2872
  for generated_text, batch_indices_of_bboxes_in_generated_text, list_of_list_of_bboxes in zip(generated_texts, batch_indices_of_bboxes_in_generated_text, list_of_list_of_list_of_bboxes):
2873
+ categories = [map_to_category.get(
2874
+ generated_text[j:j+3], None) for i, j in batch_indices_of_bboxes_in_generated_text]
2875
  result_for_this_image = {
2876
  "panels": [],
2877
  "texts": [],
 
2886
 
2887
  cleaned_generated_ids = []
2888
  for generated_id in generated_ids:
2889
+ index_of_last_bos = torch.where(
2890
+ generated_id == processor.tokenizer.bos_token_id)[0][-1].item()
2891
  cleaned_generated_ids.append(generated_id[index_of_last_bos:])
2892
+ cleaned_generated_ids = pad_sequence(
2893
+ cleaned_generated_ids, batch_first=True, padding_value=processor.tokenizer.pad_token_id)
2894
  association_outputs = self(
2895
  input_ids=batch_inputs["input_ids"],
2896
  pixel_values=batch_inputs["pixel_values"],
 
2899
  )
2900
 
2901
  for img_idx in range(len(results)):
2902
+ character_cluster_labels = UnionFind.from_adj_matrix(
2903
+ association_outputs.character_character_affinity_matrices[img_idx] > character_character_association_threshold).get_labels_for_connected_components()
2904
+ text_character_association = torch.nonzero(
2905
+ association_outputs.text_character_association_matrices[img_idx] > text_character_association_threshold).tolist()
2906
+ text_tail_association = torch.nonzero(
2907
+ association_outputs.text_tail_association_matrices[img_idx] > text_tail_association_threshold).tolist()
2908
+ essential_text_logits = (
2909
+ association_outputs.essential_text_logits[img_idx] > essential_text_threshold).tolist()
2910
  results[img_idx]["character_cluster_labels"] = character_cluster_labels
2911
  results[img_idx]["text_character_associations"] = text_character_association
2912
  results[img_idx]["text_tail_associations"] = text_tail_association
2913
  results[img_idx]["is_essential_text"] = essential_text_logits
2914
 
2915
  return results
2916
+
2917
  @torch.no_grad()
2918
  def predict_ocr(
2919
  self,
 
2921
  processor,
2922
  ):
2923
  batch_inputs = processor(
2924
+ batch_input_text=[
2925
+ "What is the text in the image, with regions?"] * len(images),
2926
  batch_input_list_of_list_of_bboxes=[[]] * len(images),
2927
  batch_images=images,
2928
  padding=True,
 
2940
  do_sample=False,
2941
  num_beams=3,
2942
  )
2943
+ generated_texts, list_of_list_of_list_of_bboxes, batch_indices_of_bboxes_in_generated_text = processor.postprocess_output(
2944
+ generated_ids, images)
2945
  results = []
2946
  for generated_text, batch_indices_of_bboxes_in_generated_text, list_of_list_of_bboxes in zip(generated_texts, batch_indices_of_bboxes_in_generated_text, list_of_list_of_list_of_bboxes):
2947
  ocr_texts = []
 
2965
  ):
2966
  def convert_caption_to_instruction(caption):
2967
  return "Locate the phrases in the caption: " + caption
2968
+
2969
  batch_inputs = processor(
2970
+ batch_input_text=[convert_caption_to_instruction(
2971
+ caption) for caption in captions],
2972
  batch_input_list_of_list_of_bboxes=[[]] * len(images),
2973
  batch_images=images,
2974
  padding=True,
 
2996
  do_sample=False,
2997
  num_beams=3,
2998
  )
2999
+ generated_texts, list_of_list_of_list_of_bboxes, batch_indices_of_bboxes_in_generated_text = processor.postprocess_output(
3000
+ generated_ids, images)
3001
  return [
3002
  {
3003
  "grounded_caption": generated_text,
 
3026
  output_attentions: Optional[bool] = None,
3027
  output_hidden_states: Optional[bool] = True,
3028
  return_dict: Optional[bool] = None,
3029
+ tokenizer=None,
3030
  ) -> Union[Tuple, Florence2Seq2SeqLMOutput]:
3031
  assert output_hidden_states, "output_hidden_states must be True"
3032
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
 
3044
  if pixel_values is not None:
3045
  # (batch_size, num_image_tokens, hidden_size)
3046
  image_features = self._encode_image(pixel_values)
3047
+ inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(
3048
+ image_features, inputs_embeds)
3049
 
3050
  if inputs_embeds is not None:
3051
  attention_mask = attention_mask.to(inputs_embeds.dtype)
 
3067
  return_dict=return_dict,
3068
  )
3069
 
3070
+ character_character_affinity_matrices = self.get_character_character_affinity_matrices(
3071
+ outputs.decoder_hidden_states[-1], decoder_input_ids, tokenizer, apply_sigmoid=True)
3072
+ text_character_association_matrices = self.get_text_character_association_matrices(
3073
+ outputs.decoder_hidden_states[-1], decoder_input_ids, tokenizer, apply_sigmoid=True)
3074
+ text_tail_association_matrices = self.get_text_tail_association_matrices(
3075
+ outputs.decoder_hidden_states[-1], decoder_input_ids, tokenizer, apply_sigmoid=True)
3076
+ essential_text_logits = self.get_essential_text_logits(
3077
+ outputs.decoder_hidden_states[-1], decoder_input_ids, tokenizer, apply_sigmoid=True)
3078
 
3079
  return Florence2Seq2SeqLMOutput(
3080
  character_character_affinity_matrices=character_character_affinity_matrices,
 
3085
 
3086
  def generate(
3087
  self,
3088
+ input_ids,
3089
  inputs_embeds=None,
3090
  pixel_values=None,
3091
  **kwargs
3092
+ ):
3093
 
3094
  if inputs_embeds is None:
3095
  # 1. Extra the input embeddings
 
3098
  # 2. Merge text and images
3099
  if pixel_values is not None:
3100
  image_features = self._encode_image(pixel_values)
3101
+ inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(
3102
+ image_features, inputs_embeds)
3103
+
3104
  return self.language_model.generate(
3105
  input_ids=None,
3106
  inputs_embeds=inputs_embeds,
3107
  **kwargs
3108
  )
3109
+
3110
  def slowly_generate_grounded_caption(
3111
  self,
3112
  input_ids,
 
3122
  """
3123
  input_embeds = self.get_input_embeddings()(input_ids)
3124
  image_features = self._encode_image(pixel_values)
3125
+ inputs_embeds, _ = self._merge_input_ids_with_image_features(
3126
+ image_features, input_embeds)
3127
+ decoder_input_ids = processor.tokenizer(
3128
+ captions, return_tensors="pt", truncation=False, padding=True)["input_ids"].to(self.device)
3129
+ running_indices = torch.zeros(
3130
+ decoder_input_ids.shape[0], dtype=torch.long, device=self.device)
3131
  running_decoder_input_ids = decoder_input_ids[:, :1]
3132
  num_tokens_generated = 1
3133
  CHUNK_SIZE = 8
 
3145
  )[:, -(CHUNK_SIZE+1):-1]
3146
 
3147
  what_should_be_the_next_tokens = torch.stack([
3148
+ decoder_input_ids[i, running_indices[i] +
3149
+ 1:running_indices[i]+CHUNK_SIZE+1]
3150
  for i in range(decoder_input_ids.shape[0])
3151
  ])
3152
  # if the entire predicted chunk matches the next chunk, then we can saved some time and "jump" to the next chunk
3153
  if predicted_next_tokens.shape[1] == what_should_be_the_next_tokens.shape[1] and torch.all(predicted_next_tokens == what_should_be_the_next_tokens):
3154
  running_indices += CHUNK_SIZE
3155
+ running_decoder_input_ids = torch.cat(
3156
+ [running_decoder_input_ids, what_should_be_the_next_tokens], dim=-1)
3157
  continue
3158
+
3159
  # if, however, there is a deviation find the maximum prefix that matches in the batch
3160
 
3161
  predicted_next_tokens = predicted_next_tokens[:, 0]
3162
+ predicted_next_token_strings = processor.batch_decode(
3163
+ predicted_next_tokens)
3164
  next_tokens_to_concat = []
3165
  for i, (pnts, pnt) in enumerate(zip(predicted_next_token_strings, predicted_next_tokens)):
3166
  if (pnts.startswith("<loc_") or pnts in ["<s>", "<pad>", "</s>"]) and running_indices[i] < decoder_input_ids.shape[1] - 1:
 
3168
  else:
3169
  running_indices[i] += 1
3170
  if running_indices[i] >= decoder_input_ids.shape[1]:
3171
+ next_tokens_to_concat.append(torch.tensor(
3172
+ processor.tokenizer.eos_token_id, device=self.device))
3173
  # elif "’" in pnts: # this is an annoying character which looks like ' (apostrophe) but isn't.
3174
  # import pdb; pdb.set_trace()
3175
  else:
3176
+ next_tokens_to_concat.append(
3177
+ decoder_input_ids[i, running_indices[i]])
3178
  next_tokens_to_concat = torch.stack(next_tokens_to_concat)[:, None]
3179
  if (next_tokens_to_concat == processor.tokenizer.eos_token_id).all():
3180
  break
3181
+ running_decoder_input_ids = torch.cat(
3182
+ [running_decoder_input_ids, next_tokens_to_concat], dim=-1)
3183
  if num_tokens_generated >= 1024:
3184
  break
3185
  return running_decoder_input_ids
 
3210
  remove_prefix_length = decoder_input_ids.shape[1] - 1
3211
 
3212
  decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
3213
+
3214
  return {
3215
  "input_ids": None, # encoder_outputs is defined. input_ids not needed
3216
  "encoder_outputs": encoder_outputs,
 
3222
  "head_mask": head_mask,
3223
  "decoder_head_mask": decoder_head_mask,
3224
  "cross_attn_head_mask": cross_attn_head_mask,
3225
+ # change this to avoid caching (presumably for debugging)
3226
+ "use_cache": use_cache,
3227
  }
3228
+
3229
  def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
3230
  return self.language_model.shift_tokens_right(labels)
3231
 
3232
  def _reorder_cache(self, *args, **kwargs):
3233
  return self.language_model._reorder_cache(*args, **kwargs)
3234
+
3235
  def get_character_character_affinity_matrices(self, decoder_hidden_states, decoder_input_ids, tokenizer, apply_sigmoid=False):
3236
  character_character_affinity_matrices = []
3237
  for index in range(len(decoder_hidden_states)):
3238
+ character_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids(
3239
+ '<character>')).nonzero().squeeze(-1)
3240
  character_embeddings = decoder_hidden_states[index][character_embedding_indices]
3241
  if character_embeddings.shape[0] == 0:
3242
+ character_character_affinity_matrices.append(
3243
+ torch.zeros(0, 0).type_as(character_embeddings))
3244
  continue
3245
+ character_embeddings = self.character_embedding_projection(
3246
+ character_embeddings)
3247
+ char_i = repeat(character_embeddings, "i d -> i repeat d",
3248
+ repeat=character_embeddings.shape[0])
3249
+ char_j = repeat(character_embeddings, "j d -> repeat j d",
3250
+ repeat=character_embeddings.shape[0])
3251
  char_ij = rearrange([char_i, char_j], "two i j d -> (i j) (two d)")
3252
+ character_character_affinities = self.character_character_matching_head(
3253
+ char_ij)
3254
+ character_character_affinities = rearrange(
3255
+ character_character_affinities, "(i j) 1 -> i j", i=char_i.shape[0])
3256
+ character_character_affinities = (
3257
+ character_character_affinities + character_character_affinities.T) / 2
3258
  if apply_sigmoid:
3259
+ character_character_affinities = torch.sigmoid(
3260
+ character_character_affinities)
3261
+ character_character_affinity_matrices.append(
3262
+ character_character_affinities)
3263
  return character_character_affinity_matrices
3264
 
3265
  def get_text_character_association_matrices(self, decoder_hidden_states, decoder_input_ids, tokenizer, apply_sigmoid=False):
3266
  text_character_association_matrices = []
3267
  for index in range(len(decoder_hidden_states)):
3268
+ text_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids(
3269
+ '<text>')).nonzero().squeeze(-1)
3270
  text_embeddings = decoder_hidden_states[index][text_embedding_indices]
3271
+ character_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids(
3272
+ '<character>')).nonzero().squeeze(-1)
3273
  character_embeddings = decoder_hidden_states[index][character_embedding_indices]
3274
  if character_embeddings.shape[0] == 0 or text_embeddings.shape[0] == 0:
3275
+ text_character_association_matrices.append(torch.zeros(
3276
+ text_embeddings.shape[0], character_embeddings.shape[0]).type_as(text_embeddings))
3277
  continue
3278
+ text_i = repeat(text_embeddings, "i d -> i repeat d",
3279
+ repeat=character_embeddings.shape[0])
3280
+ char_j = repeat(character_embeddings, "j d -> repeat j d",
3281
+ repeat=text_embeddings.shape[0])
3282
+ text_char_ij = rearrange(
3283
+ [text_i, char_j], "two i j d -> (i j) (two d)")
3284
+ text_character_affinities = self.text_character_matching_head(
3285
+ text_char_ij)
3286
+ text_character_affinities = rearrange(
3287
+ text_character_affinities, "(i j) 1 -> i j", i=text_i.shape[0])
3288
  if apply_sigmoid:
3289
+ text_character_affinities = torch.sigmoid(
3290
+ text_character_affinities)
3291
+ text_character_association_matrices.append(
3292
+ text_character_affinities)
3293
  return text_character_association_matrices
3294
 
3295
  def get_text_tail_association_matrices(self, decoder_hidden_states, decoder_input_ids, tokenizer, apply_sigmoid=False):
3296
  text_tail_association_matrices = []
3297
  for index in range(len(decoder_hidden_states)):
3298
+ text_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids(
3299
+ '<text>')).nonzero().squeeze(-1)
3300
  text_embeddings = decoder_hidden_states[index][text_embedding_indices]
3301
+ tail_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids(
3302
+ '<tail>')).nonzero().squeeze(-1)
3303
  tail_embeddings = decoder_hidden_states[index][tail_embedding_indices]
3304
  if tail_embeddings.shape[0] == 0 or text_embeddings.shape[0] == 0:
3305
+ text_tail_association_matrices.append(torch.zeros(
3306
+ text_embeddings.shape[0], tail_embeddings.shape[0]).type_as(text_embeddings))
3307
  continue
3308
+ text_i = repeat(text_embeddings, "i d -> i repeat d",
3309
+ repeat=tail_embeddings.shape[0])
3310
+ tail_j = repeat(tail_embeddings, "j d -> repeat j d",
3311
+ repeat=text_embeddings.shape[0])
3312
+ text_tail_ij = rearrange(
3313
+ [text_i, tail_j], "two i j d -> (i j) (two d)")
3314
  text_tail_affinities = self.text_tail_matching_head(text_tail_ij)
3315
+ text_tail_affinities = rearrange(
3316
+ text_tail_affinities, "(i j) 1 -> i j", i=text_i.shape[0])
3317
  if apply_sigmoid:
3318
  text_tail_affinities = torch.sigmoid(text_tail_affinities)
3319
  text_tail_association_matrices.append(text_tail_affinities)
3320
  return text_tail_association_matrices
3321
+
3322
  def get_tail_character_association_matrices(self, decoder_hidden_states, decoder_input_ids, tokenizer, apply_sigmoid=False):
3323
  tail_character_association_matrices = []
3324
  for index in range(len(decoder_hidden_states)):
3325
+ tail_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids(
3326
+ '<tail>')).nonzero().squeeze(-1)
3327
  tail_embeddings = decoder_hidden_states[index][tail_embedding_indices]
3328
+ character_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids(
3329
+ '<character>')).nonzero().squeeze(-1)
3330
  character_embeddings = decoder_hidden_states[index][character_embedding_indices]
3331
  if character_embeddings.shape[0] == 0 or tail_embeddings.shape[0] == 0:
3332
+ tail_character_association_matrices.append(torch.zeros(
3333
+ tail_embeddings.shape[0], character_embeddings.shape[0]).type_as(tail_embeddings))
3334
  continue
3335
+ tail_i = repeat(tail_embeddings, "i d -> i repeat d",
3336
+ repeat=character_embeddings.shape[0])
3337
+ char_j = repeat(character_embeddings, "j d -> repeat j d",
3338
+ repeat=tail_embeddings.shape[0])
3339
+ tail_char_ij = rearrange(
3340
+ [tail_i, char_j], "two i j d -> (i j) (two d)")
3341
+ tail_character_affinities = self.tail_character_matching_head(
3342
+ tail_char_ij)
3343
+ tail_character_affinities = rearrange(
3344
+ tail_character_affinities, "(i j) 1 -> i j", i=tail_i.shape[0])
3345
  if apply_sigmoid:
3346
+ tail_character_affinities = torch.sigmoid(
3347
+ tail_character_affinities)
3348
+ tail_character_association_matrices.append(
3349
+ tail_character_affinities)
3350
  return tail_character_association_matrices
3351
+
3352
  def get_essential_text_logits(self, decoder_hidden_states, decoder_input_ids, tokenizer, apply_sigmoid=False):
3353
  essential_text_logits = []
3354
  for index in range(len(decoder_hidden_states)):
3355
+ text_embedding_indices = (decoder_input_ids[index] == tokenizer.convert_tokens_to_ids(
3356
+ '<text>')).nonzero().squeeze(-1)
3357
  text_embeddings = decoder_hidden_states[index][text_embedding_indices]
3358
  if text_embeddings.shape[0] == 0:
3359
+ essential_text_logits.append(
3360
+ torch.zeros(0).type_as(text_embeddings))
3361
  continue
3362
+ text_logits = rearrange(
3363
+ self.text_classification_head(text_embeddings), "i 1 -> i")
3364
  if apply_sigmoid:
3365
  text_logits = torch.sigmoid(text_logits)
3366
  essential_text_logits.append(text_logits)
3367
+ return essential_text_logits
processing_florence2.py CHANGED
@@ -53,18 +53,19 @@ class Florence2Processor(ProcessorMixin):
53
  if tokenizer is None:
54
  raise ValueError("You need to specify a `tokenizer`.")
55
  if not hasattr(image_processor, "image_seq_length"):
56
- raise ValueError("Image processor is missing an `image_seq_length` attribute.")
 
57
 
58
  self.image_seq_length = image_processor.image_seq_length
59
 
60
  tokens_to_add = {
61
- 'additional_special_tokens': \
62
- tokenizer.additional_special_tokens + \
63
- ['<od>', '</od>', '<ocr>', '</ocr>'] + \
64
- [f'<loc_{x}>' for x in range(1000)] + \
65
- ['<cap>', '</cap>', '<ncap>', '</ncap>','<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>'] + \
66
- ['<panel>', '<text>', '<character>', '<tail>']
67
- }
68
  tokenizer.add_special_tokens(tokens_to_add)
69
  self.decoder_start_token_id = 2
70
 
@@ -74,7 +75,7 @@ class Florence2Processor(ProcessorMixin):
74
  )
75
 
76
  super().__init__(image_processor, tokenizer)
77
-
78
  def __call__(
79
  self,
80
  batch_input_text: List[TextInput] = None,
@@ -82,11 +83,11 @@ class Florence2Processor(ProcessorMixin):
82
  batch_output_text: List[TextInput] = None,
83
  batch_output_list_of_list_of_bboxes: List[List[List[List[float]]]] = None,
84
  batch_images: ImageInput = None,
85
- batch_character_cluster_labels = None,
86
- batch_text_character_association_labels = None,
87
- batch_text_tail_association_labels = None,
88
- batch_is_essential_text_labels = None,
89
- batch_tail_character_association_labels = None,
90
  padding: Union[bool, str, PaddingStrategy] = None,
91
  truncation: Union[bool, str, TruncationStrategy] = None,
92
  max_input_length_including_image_tokens=None,
@@ -109,17 +110,23 @@ class Florence2Processor(ProcessorMixin):
109
  assert batch_images is not None, "`batch_images` are expected as arguments to a `Florence2Processor` instance."
110
  assert batch_input_text is not None, "`batch_input_text` are expected as arguments to a `Florence2Processor` instance."
111
  if batch_input_list_of_list_of_bboxes is None:
112
- batch_input_list_of_list_of_bboxes = [[] for _ in range(len(batch_input_text))]
113
- assert len(batch_input_text) == len(batch_input_list_of_list_of_bboxes) == len(batch_images), "`batch_input_text`, `batch_input_list_of_list_of_bboxes` and `batch_images` have different lengths."
 
 
114
  if batch_output_text is None:
115
  assert batch_output_list_of_list_of_bboxes is None, "`batch_output_text` and `batch_output_list_of_list_of_bboxes` should be provided together."
116
  else:
117
  if batch_output_list_of_list_of_bboxes is None:
118
- batch_output_list_of_list_of_bboxes = [[] for _ in range(len(batch_output_text))]
119
- assert len(batch_output_text) == len(batch_output_list_of_list_of_bboxes) == len(batch_images), "`batch_output_text`, `batch_output_list_of_list_of_bboxes` and `batch_images` have different lengths."
120
-
121
- max_input_length = max_input_length_including_image_tokens - self.image_seq_length if max_input_length_including_image_tokens is not None else None
122
- batch_input_texts = [self._format_text_with_bboxes(text, list_of_list_of_bboxes, image) for text, list_of_list_of_bboxes, image in zip(batch_input_text, batch_input_list_of_list_of_bboxes, batch_images)]
 
 
 
 
123
  inputs = self.tokenizer(
124
  batch_input_texts,
125
  return_tensors=return_tensors,
@@ -130,9 +137,10 @@ class Florence2Processor(ProcessorMixin):
130
  if inputs["input_ids"].shape[1] > max_input_length:
131
  inputs["input_ids"] = inputs["input_ids"][:, :max_input_length]
132
  inputs["attention_mask"] = inputs["attention_mask"][:, :max_input_length]
133
-
134
  if batch_output_text is not None:
135
- batch_output_texts = [self._format_text_with_bboxes(text, list_of_list_of_bboxes, image) for text, list_of_list_of_bboxes, image in zip(batch_output_text, batch_output_list_of_list_of_bboxes, batch_images)]
 
136
  decoder_inputs = self.tokenizer(
137
  batch_output_texts,
138
  return_tensors=return_tensors,
@@ -141,9 +149,10 @@ class Florence2Processor(ProcessorMixin):
141
  )
142
  # Truncating manually because I don't want </s> token at the end of truncated sequences, which is the default behavior
143
  if decoder_inputs["input_ids"].shape[1] > max_output_length:
144
- decoder_inputs["input_ids"] = decoder_inputs["input_ids"][:, :max_output_length]
145
- decoder_inputs["attention_mask"] = decoder_inputs["attention_mask"][:, :max_output_length]
146
-
 
147
 
148
  pixel_values = self.image_processor(
149
  batch_images,
@@ -160,7 +169,7 @@ class Florence2Processor(ProcessorMixin):
160
 
161
  if dtype is not None:
162
  pixel_values = pixel_values.to(dtype)
163
-
164
  return_data = {**inputs, "pixel_values": pixel_values}
165
 
166
  if batch_output_text is not None:
@@ -168,8 +177,10 @@ class Florence2Processor(ProcessorMixin):
168
  decoder_input_ids = labels.new_zeros(labels.shape)
169
  decoder_input_ids[:, 1:] = labels[:, :-1].clone()
170
  decoder_input_ids[:, 0] = self.decoder_start_token_id
171
- decoder_attention_mask = decoder_inputs["attention_mask"].new_ones(decoder_input_ids.shape)
172
- decoder_attention_mask[:, 1:] = decoder_inputs["attention_mask"][:, :-1].clone()
 
 
173
  # Mask fill labels to replace pad token ID with -100
174
  labels.masked_fill_(labels == self.tokenizer.pad_token_id, -100)
175
  return_data.update({
@@ -177,7 +188,7 @@ class Florence2Processor(ProcessorMixin):
177
  "decoder_input_ids": decoder_input_ids,
178
  "decoder_attention_mask": decoder_attention_mask,
179
  })
180
-
181
  if device is not None:
182
  for key, value in return_data.items():
183
  if isinstance(value, torch.Tensor):
@@ -201,25 +212,32 @@ class Florence2Processor(ProcessorMixin):
201
  return generated_text.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
202
 
203
  def postprocess_output(self, generated_ids, images):
204
- generated_ids.masked_fill_(generated_ids == -100, self.tokenizer.pad_token_id) # only for some testing purposes
205
- batch_decoded_texts = self.batch_decode(generated_ids, skip_special_tokens=False)
206
- batch_decoded_texts = [self.cleanup_generated_text(text) for text in batch_decoded_texts]
 
 
 
 
207
  batch_list_of_list_of_bboxes = []
208
  batch_indices_of_bboxes_in_new_string = []
209
  batch_new_texts = []
210
  for text, image in zip(batch_decoded_texts, images):
211
  size_wh = self._get_image_size_wh(image)
212
- parsed_text, list_of_stringified_bboxes, start_end_in_new_string = self._parse_text_with_bboxes(text)
213
- list_of_list_of_bboxes = [self.box_quantizer.dequantize_from_stringified_bboxes(stringified_bbox, size_wh) for stringified_bbox in list_of_stringified_bboxes]
 
 
214
  batch_list_of_list_of_bboxes.append(list_of_list_of_bboxes)
215
- batch_indices_of_bboxes_in_new_string.append(start_end_in_new_string)
 
216
  batch_new_texts.append(parsed_text)
217
  return batch_new_texts, batch_list_of_list_of_bboxes, batch_indices_of_bboxes_in_new_string
218
 
219
  def _parse_text_with_bboxes(self, text):
220
  loc_pattern = r'((?:<loc_\d+>){4}(?:,(?:<loc_\d+>){4})*)'
221
  grounding_pattern = r'<grounding>(.*?)</grounding>' + loc_pattern
222
-
223
  list_of_stringified_bboxes = []
224
  start_end_in_new_string = []
225
  new_text = ""
@@ -237,7 +255,8 @@ class Florence2Processor(ProcessorMixin):
237
  locs = match.group(2)
238
  new_text += grounding_text
239
  list_of_stringified_bboxes.append(locs)
240
- start_end_in_new_string.append((new_pos, new_pos + len(grounding_text)))
 
241
  new_pos += len(grounding_text)
242
  else:
243
  # Handle loc pattern
@@ -245,7 +264,8 @@ class Florence2Processor(ProcessorMixin):
245
  replacement = ""
246
  new_text += replacement
247
  list_of_stringified_bboxes.append(locs)
248
- start_end_in_new_string.append((new_pos, new_pos + len(replacement)))
 
249
  new_pos += len(replacement)
250
 
251
  original_pos = match.end()
@@ -254,19 +274,21 @@ class Florence2Processor(ProcessorMixin):
254
  new_text += text[original_pos:]
255
 
256
  return new_text, list_of_stringified_bboxes, start_end_in_new_string
257
-
258
  def _format_text_with_bboxes(self, text, list_of_list_of_bboxes, image):
259
  size_wh = self._get_image_size_wh(image)
260
  quantized_bbox_lists = []
261
- for list_of_bboxes in list_of_list_of_bboxes:
262
- quantized_bboxes = self.box_quantizer.quantize(list_of_bboxes, size_wh=size_wh)
263
- stringified_bboxes = [f"<loc_{x1}><loc_{y1}><loc_{x2}><loc_{y2}>" for x1, y1, x2, y2 in quantized_bboxes]
 
 
264
  stringified_bboxes = ",".join(stringified_bboxes)
265
  quantized_bbox_lists.append(stringified_bboxes)
266
  return text.format(*quantized_bbox_lists)
267
 
268
  def _get_image_size_wh(self, image):
269
- # Get size_wh from image based on its type
270
  if isinstance(image, torch.Tensor):
271
  # For PyTorch tensor
272
  if image.dim() == 3:
@@ -313,6 +335,7 @@ class Florence2Processor(ProcessorMixin):
313
  image_processor_input_names = self.image_processor.model_input_names
314
  return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
315
 
 
316
  class BoxQuantizer(object):
317
  def __init__(self, mode, bins):
318
  self.mode = mode
@@ -390,4 +413,4 @@ class BoxQuantizer(object):
390
  dequantized_xmax, dequantized_ymax), dim=-1
391
  )
392
 
393
- return dequantized_boxes
 
53
  if tokenizer is None:
54
  raise ValueError("You need to specify a `tokenizer`.")
55
  if not hasattr(image_processor, "image_seq_length"):
56
+ raise ValueError(
57
+ "Image processor is missing an `image_seq_length` attribute.")
58
 
59
  self.image_seq_length = image_processor.image_seq_length
60
 
61
  tokens_to_add = {
62
+ 'additional_special_tokens':
63
+ tokenizer.additional_special_tokens +
64
+ ['<od>', '</od>', '<ocr>', '</ocr>'] +
65
+ [f'<loc_{x}>' for x in range(1000)] +
66
+ ['<cap>', '</cap>', '<ncap>', '</ncap>', '<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>'] +
67
+ ['<panel>', '<text>', '<character>', '<tail>']
68
+ }
69
  tokenizer.add_special_tokens(tokens_to_add)
70
  self.decoder_start_token_id = 2
71
 
 
75
  )
76
 
77
  super().__init__(image_processor, tokenizer)
78
+
79
  def __call__(
80
  self,
81
  batch_input_text: List[TextInput] = None,
 
83
  batch_output_text: List[TextInput] = None,
84
  batch_output_list_of_list_of_bboxes: List[List[List[List[float]]]] = None,
85
  batch_images: ImageInput = None,
86
+ batch_character_cluster_labels=None,
87
+ batch_text_character_association_labels=None,
88
+ batch_text_tail_association_labels=None,
89
+ batch_is_essential_text_labels=None,
90
+ batch_tail_character_association_labels=None,
91
  padding: Union[bool, str, PaddingStrategy] = None,
92
  truncation: Union[bool, str, TruncationStrategy] = None,
93
  max_input_length_including_image_tokens=None,
 
110
  assert batch_images is not None, "`batch_images` are expected as arguments to a `Florence2Processor` instance."
111
  assert batch_input_text is not None, "`batch_input_text` are expected as arguments to a `Florence2Processor` instance."
112
  if batch_input_list_of_list_of_bboxes is None:
113
+ batch_input_list_of_list_of_bboxes = [
114
+ [] for _ in range(len(batch_input_text))]
115
+ assert len(batch_input_text) == len(batch_input_list_of_list_of_bboxes) == len(
116
+ batch_images), "`batch_input_text`, `batch_input_list_of_list_of_bboxes` and `batch_images` have different lengths."
117
  if batch_output_text is None:
118
  assert batch_output_list_of_list_of_bboxes is None, "`batch_output_text` and `batch_output_list_of_list_of_bboxes` should be provided together."
119
  else:
120
  if batch_output_list_of_list_of_bboxes is None:
121
+ batch_output_list_of_list_of_bboxes = [
122
+ [] for _ in range(len(batch_output_text))]
123
+ assert len(batch_output_text) == len(batch_output_list_of_list_of_bboxes) == len(
124
+ batch_images), "`batch_output_text`, `batch_output_list_of_list_of_bboxes` and `batch_images` have different lengths."
125
+
126
+ max_input_length = max_input_length_including_image_tokens - \
127
+ self.image_seq_length if max_input_length_including_image_tokens is not None else None
128
+ batch_input_texts = [self._format_text_with_bboxes(text, list_of_list_of_bboxes, image) for text, list_of_list_of_bboxes, image in zip(
129
+ batch_input_text, batch_input_list_of_list_of_bboxes, batch_images)]
130
  inputs = self.tokenizer(
131
  batch_input_texts,
132
  return_tensors=return_tensors,
 
137
  if inputs["input_ids"].shape[1] > max_input_length:
138
  inputs["input_ids"] = inputs["input_ids"][:, :max_input_length]
139
  inputs["attention_mask"] = inputs["attention_mask"][:, :max_input_length]
140
+
141
  if batch_output_text is not None:
142
+ batch_output_texts = [self._format_text_with_bboxes(text, list_of_list_of_bboxes, image) for text, list_of_list_of_bboxes, image in zip(
143
+ batch_output_text, batch_output_list_of_list_of_bboxes, batch_images)]
144
  decoder_inputs = self.tokenizer(
145
  batch_output_texts,
146
  return_tensors=return_tensors,
 
149
  )
150
  # Truncating manually because I don't want </s> token at the end of truncated sequences, which is the default behavior
151
  if decoder_inputs["input_ids"].shape[1] > max_output_length:
152
+ decoder_inputs["input_ids"] = decoder_inputs["input_ids"][:,
153
+ :max_output_length]
154
+ decoder_inputs["attention_mask"] = decoder_inputs["attention_mask"][:,
155
+ :max_output_length]
156
 
157
  pixel_values = self.image_processor(
158
  batch_images,
 
169
 
170
  if dtype is not None:
171
  pixel_values = pixel_values.to(dtype)
172
+
173
  return_data = {**inputs, "pixel_values": pixel_values}
174
 
175
  if batch_output_text is not None:
 
177
  decoder_input_ids = labels.new_zeros(labels.shape)
178
  decoder_input_ids[:, 1:] = labels[:, :-1].clone()
179
  decoder_input_ids[:, 0] = self.decoder_start_token_id
180
+ decoder_attention_mask = decoder_inputs["attention_mask"].new_ones(
181
+ decoder_input_ids.shape)
182
+ decoder_attention_mask[:,
183
+ 1:] = decoder_inputs["attention_mask"][:, :-1].clone()
184
  # Mask fill labels to replace pad token ID with -100
185
  labels.masked_fill_(labels == self.tokenizer.pad_token_id, -100)
186
  return_data.update({
 
188
  "decoder_input_ids": decoder_input_ids,
189
  "decoder_attention_mask": decoder_attention_mask,
190
  })
191
+
192
  if device is not None:
193
  for key, value in return_data.items():
194
  if isinstance(value, torch.Tensor):
 
212
  return generated_text.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
213
 
214
  def postprocess_output(self, generated_ids, images):
215
+ # only for some testing purposes
216
+ generated_ids.masked_fill_(
217
+ generated_ids == -100, self.tokenizer.pad_token_id)
218
+ batch_decoded_texts = self.batch_decode(
219
+ generated_ids, skip_special_tokens=False)
220
+ batch_decoded_texts = [self.cleanup_generated_text(
221
+ text) for text in batch_decoded_texts]
222
  batch_list_of_list_of_bboxes = []
223
  batch_indices_of_bboxes_in_new_string = []
224
  batch_new_texts = []
225
  for text, image in zip(batch_decoded_texts, images):
226
  size_wh = self._get_image_size_wh(image)
227
+ parsed_text, list_of_stringified_bboxes, start_end_in_new_string = self._parse_text_with_bboxes(
228
+ text)
229
+ list_of_list_of_bboxes = [self.box_quantizer.dequantize_from_stringified_bboxes(
230
+ stringified_bbox, size_wh) for stringified_bbox in list_of_stringified_bboxes]
231
  batch_list_of_list_of_bboxes.append(list_of_list_of_bboxes)
232
+ batch_indices_of_bboxes_in_new_string.append(
233
+ start_end_in_new_string)
234
  batch_new_texts.append(parsed_text)
235
  return batch_new_texts, batch_list_of_list_of_bboxes, batch_indices_of_bboxes_in_new_string
236
 
237
  def _parse_text_with_bboxes(self, text):
238
  loc_pattern = r'((?:<loc_\d+>){4}(?:,(?:<loc_\d+>){4})*)'
239
  grounding_pattern = r'<grounding>(.*?)</grounding>' + loc_pattern
240
+
241
  list_of_stringified_bboxes = []
242
  start_end_in_new_string = []
243
  new_text = ""
 
255
  locs = match.group(2)
256
  new_text += grounding_text
257
  list_of_stringified_bboxes.append(locs)
258
+ start_end_in_new_string.append(
259
+ (new_pos, new_pos + len(grounding_text)))
260
  new_pos += len(grounding_text)
261
  else:
262
  # Handle loc pattern
 
264
  replacement = ""
265
  new_text += replacement
266
  list_of_stringified_bboxes.append(locs)
267
+ start_end_in_new_string.append(
268
+ (new_pos, new_pos + len(replacement)))
269
  new_pos += len(replacement)
270
 
271
  original_pos = match.end()
 
274
  new_text += text[original_pos:]
275
 
276
  return new_text, list_of_stringified_bboxes, start_end_in_new_string
277
+
278
  def _format_text_with_bboxes(self, text, list_of_list_of_bboxes, image):
279
  size_wh = self._get_image_size_wh(image)
280
  quantized_bbox_lists = []
281
+ for list_of_bboxes in list_of_list_of_bboxes:
282
+ quantized_bboxes = self.box_quantizer.quantize(
283
+ list_of_bboxes, size_wh=size_wh)
284
+ stringified_bboxes = [
285
+ f"<loc_{x1}><loc_{y1}><loc_{x2}><loc_{y2}>" for x1, y1, x2, y2 in quantized_bboxes]
286
  stringified_bboxes = ",".join(stringified_bboxes)
287
  quantized_bbox_lists.append(stringified_bboxes)
288
  return text.format(*quantized_bbox_lists)
289
 
290
  def _get_image_size_wh(self, image):
291
+ # Get size_wh from image based on its type
292
  if isinstance(image, torch.Tensor):
293
  # For PyTorch tensor
294
  if image.dim() == 3:
 
335
  image_processor_input_names = self.image_processor.model_input_names
336
  return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
337
 
338
+
339
  class BoxQuantizer(object):
340
  def __init__(self, mode, bins):
341
  self.mode = mode
 
413
  dequantized_xmax, dequantized_ymax), dim=-1
414
  )
415
 
416
+ return dequantized_boxes
utils.py CHANGED
@@ -9,6 +9,7 @@ from copy import deepcopy
9
  from itertools import groupby
10
  from concurrent.futures import ThreadPoolExecutor, TimeoutError
11
 
 
12
  def move_to_device(inputs, device):
13
  if hasattr(inputs, "keys"):
14
  return {k: move_to_device(v, device) for k, v in inputs.items()}
@@ -21,6 +22,7 @@ def move_to_device(inputs, device):
21
  else:
22
  return inputs.to(device)
23
 
 
24
  class UnionFind:
25
  def __init__(self, n):
26
  self.parent = list(range(n))
@@ -35,7 +37,7 @@ class UnionFind:
35
  if adj_matrix[i, j] > 0:
36
  ufds.unite(i, j)
37
  return ufds
38
-
39
  @classmethod
40
  def from_adj_list(cls, adj_list):
41
  ufds = cls(len(adj_list))
@@ -43,7 +45,7 @@ class UnionFind:
43
  for j in adj_list[i]:
44
  ufds.unite(i, j)
45
  return ufds
46
-
47
  @classmethod
48
  def from_edge_list(cls, edge_list, num_nodes):
49
  ufds = cls(num_nodes)
@@ -66,11 +68,11 @@ class UnionFind:
66
  self.parent[y] = x
67
  self.size[x] += self.size[y]
68
  self.num_components -= 1
69
-
70
  def get_components_of(self, x):
71
  x = self.find(x)
72
  return [i for i in range(len(self.parent)) if self.find(i) == x]
73
-
74
  def are_connected(self, x, y):
75
  return self.find(x) == self.find(y)
76
 
@@ -79,7 +81,7 @@ class UnionFind:
79
 
80
  def get_num_components(self):
81
  return self.num_components
82
-
83
  def get_labels_for_connected_components(self):
84
  map_parent_to_label = {}
85
  labels = []
@@ -90,6 +92,7 @@ class UnionFind:
90
  labels.append(map_parent_to_label[parent])
91
  return labels
92
 
 
93
  def visualise_single_image_prediction(image_as_np_array, predictions, filename):
94
  h, w = image_as_np_array.shape[:2]
95
  if h > w:
@@ -102,15 +105,16 @@ def visualise_single_image_prediction(image_as_np_array, predictions, filename):
102
  plot_bboxes(subplot, predictions["characters"], color="blue")
103
 
104
  COLOURS = [
105
- "#b7ff51", # green
106
- "#f50a8f", # pink
107
- "#4b13b6", # purple
108
- "#ddaa34", # orange
109
- "#bea2a2", # brown
110
  ]
111
  colour_index = 0
112
  character_cluster_labels = predictions["character_cluster_labels"]
113
- unique_label_sorted_by_frequency = sorted(list(set(character_cluster_labels)), key=lambda x: character_cluster_labels.count(x), reverse=True)
 
114
  for label in unique_label_sorted_by_frequency:
115
  root = None
116
  others = []
@@ -123,7 +127,9 @@ def visualise_single_image_prediction(image_as_np_array, predictions, filename):
123
  if colour_index >= len(COLOURS):
124
  random_colour = COLOURS[0]
125
  while random_colour in COLOURS:
126
- random_colour = "#" + "".join([random.choice("0123456789ABCDEF") for j in range(6)])
 
 
127
  else:
128
  random_colour = COLOURS[colour_index]
129
  colour_index += 1
@@ -139,8 +145,9 @@ def visualise_single_image_prediction(image_as_np_array, predictions, filename):
139
  x2 = bbox_j[0] + (bbox_j[2] - bbox_j[0]) / 2
140
  y2 = bbox_j[1] + (bbox_j[3] - bbox_j[1]) / 2
141
  subplot.plot([x1, x2], [y1, y2], color=random_colour, linewidth=2)
142
- subplot.plot([x2], [y2], color=random_colour, marker="o", markersize=5)
143
-
 
144
  for (i, j) in predictions["text_character_associations"]:
145
  score = predictions["dialog_confidences"][i]
146
  bbox_i = predictions["texts"][i]
@@ -149,7 +156,8 @@ def visualise_single_image_prediction(image_as_np_array, predictions, filename):
149
  y1 = bbox_i[1] + (bbox_i[3] - bbox_i[1]) / 2
150
  x2 = bbox_j[0] + (bbox_j[2] - bbox_j[0]) / 2
151
  y2 = bbox_j[1] + (bbox_j[3] - bbox_j[1]) / 2
152
- subplot.plot([x1, x2], [y1, y2], color="red", linewidth=2, linestyle="dashed", alpha=score)
 
153
 
154
  subplot.axis("off")
155
  if filename is not None:
@@ -160,6 +168,7 @@ def visualise_single_image_prediction(image_as_np_array, predictions, filename):
160
  plt.close()
161
  return image
162
 
 
163
  def plot_bboxes(subplot, bboxes, color="red", add_index=False):
164
  for id, bbox in enumerate(bboxes):
165
  w = bbox[2] - bbox[0]
@@ -170,7 +179,9 @@ def plot_bboxes(subplot, bboxes, color="red", add_index=False):
170
  subplot.add_patch(rect)
171
  if add_index:
172
  cx, cy = bbox[0] + w / 2, bbox[1] + h / 2
173
- subplot.text(cx, cy, str(id), color=color, fontsize=10, ha="center", va="center")
 
 
174
 
175
  def sort_panels(rects):
176
  before_rects = convert_to_list_of_lists(rects)
@@ -203,34 +214,42 @@ def sort_panels(rects):
203
  G.remove_edge(*max_cyclic_edge)
204
  return list(nx.topological_sort(G))
205
 
 
206
  def is_strictly_above(rectA, rectB):
207
  x1A, y1A, x2A, y2A = rectA
208
  x1B, y1B, x2B, y2B = rectB
209
  return y2A < y1B
210
 
 
211
  def is_strictly_below(rectA, rectB):
212
  x1A, y1A, x2A, y2A = rectA
213
  x1B, y1B, x2B, y2B = rectB
214
  return y2B < y1A
215
 
 
216
  def is_strictly_left_of(rectA, rectB):
217
  x1A, y1A, x2A, y2A = rectA
218
  x1B, y1B, x2B, y2B = rectB
219
  return x2A < x1B
220
 
 
221
  def is_strictly_right_of(rectA, rectB):
222
  x1A, y1A, x2A, y2A = rectA
223
  x1B, y1B, x2B, y2B = rectB
224
  return x2B < x1A
225
 
 
226
  def intersects(rectA, rectB):
227
  return box(*rectA).intersects(box(*rectB))
228
 
 
229
  def is_there_a_directed_edge(a, b, rects):
230
  rectA = rects[a]
231
  rectB = rects[b]
232
- centre_of_A = [rectA[0] + (rectA[2] - rectA[0]) / 2, rectA[1] + (rectA[3] - rectA[1]) / 2]
233
- centre_of_B = [rectB[0] + (rectB[2] - rectB[0]) / 2, rectB[1] + (rectB[3] - rectB[1]) / 2]
 
 
234
  if np.allclose(np.array(centre_of_A), np.array(centre_of_B)):
235
  return box(*rectA).area > (box(*rectB)).area
236
  copy_A = [rectA[0], rectA[1], rectA[2], rectA[3]]
@@ -247,34 +266,41 @@ def is_there_a_directed_edge(a, b, rects):
247
  if is_strictly_below(copy_A, copy_B) and is_strictly_right_of(copy_A, copy_B):
248
  return use_cuts_to_determine_edge_from_a_to_b(a, b, rects)
249
  if is_strictly_below(copy_B, copy_A) and is_strictly_right_of(copy_B, copy_A):
250
- return use_cuts_to_determine_edge_from_a_to_b(a, b, rects)
251
  # otherwise they intersect
252
  copy_A = erode_rectangle(copy_A, 0.05)
253
  copy_B = erode_rectangle(copy_B, 0.05)
254
-
 
255
  def get_distance(rectA, rectB):
256
  return box(rectA[0], rectA[1], rectA[2], rectA[3]).distance(box(rectB[0], rectB[1], rectB[2], rectB[3]))
257
 
 
258
  def use_cuts_to_determine_edge_from_a_to_b(a, b, rects):
259
  rects = deepcopy(rects)
260
  while True:
261
- xmin, ymin, xmax, ymax = min(rects[a][0], rects[b][0]), min(rects[a][1], rects[b][1]), max(rects[a][2], rects[b][2]), max(rects[a][3], rects[b][3])
262
- rect_index = [i for i in range(len(rects)) if intersects(rects[i], [xmin, ymin, xmax, ymax])]
263
- rects_copy = [rect for rect in rects if intersects(rect, [xmin, ymin, xmax, ymax])]
264
-
 
 
 
265
  # try to split the panels using a "horizontal" lines
266
- overlapping_y_ranges = merge_overlapping_ranges([(y1, y2) for x1, y1, x2, y2 in rects_copy])
 
267
  panel_index_to_split = {}
268
  for split_index, (y1, y2) in enumerate(overlapping_y_ranges):
269
  for i, index in enumerate(rect_index):
270
  if y1 <= rects_copy[i][1] <= rects_copy[i][3] <= y2:
271
  panel_index_to_split[index] = split_index
272
-
273
  if panel_index_to_split[a] != panel_index_to_split[b]:
274
  return panel_index_to_split[a] < panel_index_to_split[b]
275
-
276
  # try to split the panels using a "vertical" lines
277
- overlapping_x_ranges = merge_overlapping_ranges([(x1, x2) for x1, y1, x2, y2 in rects_copy])
 
278
  panel_index_to_split = {}
279
  for split_index, (x1, x2) in enumerate(overlapping_x_ranges[::-1]):
280
  for i, index in enumerate(rect_index):
@@ -282,10 +308,11 @@ def use_cuts_to_determine_edge_from_a_to_b(a, b, rects):
282
  panel_index_to_split[index] = split_index
283
  if panel_index_to_split[a] != panel_index_to_split[b]:
284
  return panel_index_to_split[a] < panel_index_to_split[b]
285
-
286
  # otherwise, erode the rectangles and try again
287
  rects = [erode_rectangle(rect, 0.05) for rect in rects]
288
 
 
289
  def erode_rectangle(bbox, erosion_factor):
290
  x1, y1, x2, y2 = bbox
291
  w, h = x2 - x1, y2 - y1
@@ -303,6 +330,7 @@ def erode_rectangle(bbox, erosion_factor):
303
  x1, y1, x2, y2 = cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2
304
  return [x1, y1, x2, y2]
305
 
 
306
  def merge_overlapping_ranges(ranges):
307
  """
308
  ranges: list of tuples (x1, x2)
@@ -324,6 +352,7 @@ def merge_overlapping_ranges(ranges):
324
  merged_ranges.append((prev_x1, prev_x2))
325
  return merged_ranges
326
 
 
327
  def sort_text_boxes_in_reading_order(text_bboxes, sorted_panel_bboxes):
328
  text_bboxes = convert_to_list_of_lists(text_bboxes)
329
  sorted_panel_bboxes = convert_to_list_of_lists(sorted_panel_bboxes)
@@ -335,18 +364,23 @@ def sort_text_boxes_in_reading_order(text_bboxes, sorted_panel_bboxes):
335
  groups = groupby(range(len(nums)), key=lambda i: nums[i])
336
  return [list(indices) for _, indices in groups]
337
 
338
- panel_id_for_text = get_text_to_panel_mapping(text_bboxes, sorted_panel_bboxes)
 
339
  indices_of_texts = list(range(len(text_bboxes)))
340
- indices_of_texts, panel_id_for_text = zip(*sorted(zip(indices_of_texts, panel_id_for_text), key=lambda x: x[1]))
 
341
  indices_of_texts = list(indices_of_texts)
342
  grouped_indices = indices_of_same_elements(panel_id_for_text)
343
  for group in grouped_indices:
344
  subset_of_text_indices = [indices_of_texts[i] for i in group]
345
- text_bboxes_of_subset = [text_bboxes[i] for i in subset_of_text_indices]
 
346
  sorted_subset_indices = sort_texts_within_panel(text_bboxes_of_subset)
347
- indices_of_texts[group[0] : group[-1] + 1] = [subset_of_text_indices[i] for i in sorted_subset_indices]
 
348
  return indices_of_texts
349
 
 
350
  def get_text_to_panel_mapping(text_bboxes, sorted_panel_bboxes):
351
  text_to_panel_mapping = []
352
  for text_bbox in text_bboxes:
@@ -359,14 +393,19 @@ def get_text_to_panel_mapping(text_bboxes, sorted_panel_bboxes):
359
  for j, annotation in enumerate(sorted_panel_bboxes):
360
  shapely_annotation_polygon = box(*annotation)
361
  if shapely_text_polygon.intersects(shapely_annotation_polygon):
362
- all_intersections.append((shapely_text_polygon.intersection(shapely_annotation_polygon).area, j))
363
- all_distances.append((shapely_text_polygon.distance(shapely_annotation_polygon), j))
 
 
364
  if len(all_intersections) == 0:
365
- text_to_panel_mapping.append(min(all_distances, key=lambda x: x[0])[1])
 
366
  else:
367
- text_to_panel_mapping.append(max(all_intersections, key=lambda x: x[0])[1])
 
368
  return text_to_panel_mapping
369
 
 
370
  def sort_texts_within_panel(rects):
371
  smallest_y = float("inf")
372
  greatest_x = float("-inf")
@@ -374,18 +413,20 @@ def sort_texts_within_panel(rects):
374
  x1, y1, x2, y2 = rect
375
  smallest_y = min(smallest_y, y1)
376
  greatest_x = max(greatest_x, x2)
377
-
378
  reference_point = Point(greatest_x, smallest_y)
379
 
380
  polygons_and_index = []
381
  for i, rect in enumerate(rects):
382
  x1, y1, x2, y2 = rect
383
- polygons_and_index.append((box(x1,y1,x2,y2), i))
384
  # sort points by closest to reference point
385
- polygons_and_index = sorted(polygons_and_index, key=lambda x: reference_point.distance(x[0]))
 
386
  indices = [x[1] for x in polygons_and_index]
387
  return indices
388
 
 
389
  def force_to_be_valid_bboxes(bboxes):
390
  if len(bboxes) == 0:
391
  return bboxes
@@ -394,20 +435,24 @@ def force_to_be_valid_bboxes(bboxes):
394
  bboxes_as_xywh[:, 2] = torch.clamp(bboxes_as_xywh[:, 2], min=1)
395
  bboxes_as_xywh[:, 3] = torch.clamp(bboxes_as_xywh[:, 3], min=1)
396
  bboxes_as_xywh = bboxes_as_xywh.tolist()
397
- bboxes_as_xyxy = [[x1, y1, x1 + w, y1 + h] for x1, y1, w, h in bboxes_as_xywh]
 
398
  return bboxes_as_xyxy
399
 
 
400
  def x1y1wh_to_x1y1x2y2(bbox):
401
  x1, y1, w, h = bbox
402
  return [x1, y1, x1 + w, y1 + h]
403
 
 
404
  def x1y1x2y2_to_xywh(bbox):
405
  x1, y1, x2, y2 = bbox
406
  return [x1, y1, x2 - x1, y2 - y1]
407
 
 
408
  def convert_to_list_of_lists(rects):
409
  if isinstance(rects, torch.Tensor):
410
  return rects.tolist()
411
  if isinstance(rects, np.ndarray):
412
  return rects.tolist()
413
- return [[a, b, c, d] for a, b, c, d in rects]
 
9
  from itertools import groupby
10
  from concurrent.futures import ThreadPoolExecutor, TimeoutError
11
 
12
+
13
  def move_to_device(inputs, device):
14
  if hasattr(inputs, "keys"):
15
  return {k: move_to_device(v, device) for k, v in inputs.items()}
 
22
  else:
23
  return inputs.to(device)
24
 
25
+
26
  class UnionFind:
27
  def __init__(self, n):
28
  self.parent = list(range(n))
 
37
  if adj_matrix[i, j] > 0:
38
  ufds.unite(i, j)
39
  return ufds
40
+
41
  @classmethod
42
  def from_adj_list(cls, adj_list):
43
  ufds = cls(len(adj_list))
 
45
  for j in adj_list[i]:
46
  ufds.unite(i, j)
47
  return ufds
48
+
49
  @classmethod
50
  def from_edge_list(cls, edge_list, num_nodes):
51
  ufds = cls(num_nodes)
 
68
  self.parent[y] = x
69
  self.size[x] += self.size[y]
70
  self.num_components -= 1
71
+
72
  def get_components_of(self, x):
73
  x = self.find(x)
74
  return [i for i in range(len(self.parent)) if self.find(i) == x]
75
+
76
  def are_connected(self, x, y):
77
  return self.find(x) == self.find(y)
78
 
 
81
 
82
  def get_num_components(self):
83
  return self.num_components
84
+
85
  def get_labels_for_connected_components(self):
86
  map_parent_to_label = {}
87
  labels = []
 
92
  labels.append(map_parent_to_label[parent])
93
  return labels
94
 
95
+
96
  def visualise_single_image_prediction(image_as_np_array, predictions, filename):
97
  h, w = image_as_np_array.shape[:2]
98
  if h > w:
 
105
  plot_bboxes(subplot, predictions["characters"], color="blue")
106
 
107
  COLOURS = [
108
+ "#b7ff51", # green
109
+ "#f50a8f", # pink
110
+ "#4b13b6", # purple
111
+ "#ddaa34", # orange
112
+ "#bea2a2", # brown
113
  ]
114
  colour_index = 0
115
  character_cluster_labels = predictions["character_cluster_labels"]
116
+ unique_label_sorted_by_frequency = sorted(list(set(
117
+ character_cluster_labels)), key=lambda x: character_cluster_labels.count(x), reverse=True)
118
  for label in unique_label_sorted_by_frequency:
119
  root = None
120
  others = []
 
127
  if colour_index >= len(COLOURS):
128
  random_colour = COLOURS[0]
129
  while random_colour in COLOURS:
130
+ random_colour = "#" + \
131
+ "".join([random.choice("0123456789ABCDEF")
132
+ for j in range(6)])
133
  else:
134
  random_colour = COLOURS[colour_index]
135
  colour_index += 1
 
145
  x2 = bbox_j[0] + (bbox_j[2] - bbox_j[0]) / 2
146
  y2 = bbox_j[1] + (bbox_j[3] - bbox_j[1]) / 2
147
  subplot.plot([x1, x2], [y1, y2], color=random_colour, linewidth=2)
148
+ subplot.plot([x2], [y2], color=random_colour,
149
+ marker="o", markersize=5)
150
+
151
  for (i, j) in predictions["text_character_associations"]:
152
  score = predictions["dialog_confidences"][i]
153
  bbox_i = predictions["texts"][i]
 
156
  y1 = bbox_i[1] + (bbox_i[3] - bbox_i[1]) / 2
157
  x2 = bbox_j[0] + (bbox_j[2] - bbox_j[0]) / 2
158
  y2 = bbox_j[1] + (bbox_j[3] - bbox_j[1]) / 2
159
+ subplot.plot([x1, x2], [y1, y2], color="red",
160
+ linewidth=2, linestyle="dashed", alpha=score)
161
 
162
  subplot.axis("off")
163
  if filename is not None:
 
168
  plt.close()
169
  return image
170
 
171
+
172
  def plot_bboxes(subplot, bboxes, color="red", add_index=False):
173
  for id, bbox in enumerate(bboxes):
174
  w = bbox[2] - bbox[0]
 
179
  subplot.add_patch(rect)
180
  if add_index:
181
  cx, cy = bbox[0] + w / 2, bbox[1] + h / 2
182
+ subplot.text(cx, cy, str(id), color=color,
183
+ fontsize=10, ha="center", va="center")
184
+
185
 
186
  def sort_panels(rects):
187
  before_rects = convert_to_list_of_lists(rects)
 
214
  G.remove_edge(*max_cyclic_edge)
215
  return list(nx.topological_sort(G))
216
 
217
+
218
  def is_strictly_above(rectA, rectB):
219
  x1A, y1A, x2A, y2A = rectA
220
  x1B, y1B, x2B, y2B = rectB
221
  return y2A < y1B
222
 
223
+
224
  def is_strictly_below(rectA, rectB):
225
  x1A, y1A, x2A, y2A = rectA
226
  x1B, y1B, x2B, y2B = rectB
227
  return y2B < y1A
228
 
229
+
230
  def is_strictly_left_of(rectA, rectB):
231
  x1A, y1A, x2A, y2A = rectA
232
  x1B, y1B, x2B, y2B = rectB
233
  return x2A < x1B
234
 
235
+
236
  def is_strictly_right_of(rectA, rectB):
237
  x1A, y1A, x2A, y2A = rectA
238
  x1B, y1B, x2B, y2B = rectB
239
  return x2B < x1A
240
 
241
+
242
  def intersects(rectA, rectB):
243
  return box(*rectA).intersects(box(*rectB))
244
 
245
+
246
  def is_there_a_directed_edge(a, b, rects):
247
  rectA = rects[a]
248
  rectB = rects[b]
249
+ centre_of_A = [rectA[0] + (rectA[2] - rectA[0]) / 2,
250
+ rectA[1] + (rectA[3] - rectA[1]) / 2]
251
+ centre_of_B = [rectB[0] + (rectB[2] - rectB[0]) / 2,
252
+ rectB[1] + (rectB[3] - rectB[1]) / 2]
253
  if np.allclose(np.array(centre_of_A), np.array(centre_of_B)):
254
  return box(*rectA).area > (box(*rectB)).area
255
  copy_A = [rectA[0], rectA[1], rectA[2], rectA[3]]
 
266
  if is_strictly_below(copy_A, copy_B) and is_strictly_right_of(copy_A, copy_B):
267
  return use_cuts_to_determine_edge_from_a_to_b(a, b, rects)
268
  if is_strictly_below(copy_B, copy_A) and is_strictly_right_of(copy_B, copy_A):
269
+ return use_cuts_to_determine_edge_from_a_to_b(a, b, rects)
270
  # otherwise they intersect
271
  copy_A = erode_rectangle(copy_A, 0.05)
272
  copy_B = erode_rectangle(copy_B, 0.05)
273
+
274
+
275
  def get_distance(rectA, rectB):
276
  return box(rectA[0], rectA[1], rectA[2], rectA[3]).distance(box(rectB[0], rectB[1], rectB[2], rectB[3]))
277
 
278
+
279
  def use_cuts_to_determine_edge_from_a_to_b(a, b, rects):
280
  rects = deepcopy(rects)
281
  while True:
282
+ xmin, ymin, xmax, ymax = min(rects[a][0], rects[b][0]), min(
283
+ rects[a][1], rects[b][1]), max(rects[a][2], rects[b][2]), max(rects[a][3], rects[b][3])
284
+ rect_index = [i for i in range(len(rects)) if intersects(
285
+ rects[i], [xmin, ymin, xmax, ymax])]
286
+ rects_copy = [rect for rect in rects if intersects(
287
+ rect, [xmin, ymin, xmax, ymax])]
288
+
289
  # try to split the panels using a "horizontal" lines
290
+ overlapping_y_ranges = merge_overlapping_ranges(
291
+ [(y1, y2) for x1, y1, x2, y2 in rects_copy])
292
  panel_index_to_split = {}
293
  for split_index, (y1, y2) in enumerate(overlapping_y_ranges):
294
  for i, index in enumerate(rect_index):
295
  if y1 <= rects_copy[i][1] <= rects_copy[i][3] <= y2:
296
  panel_index_to_split[index] = split_index
297
+
298
  if panel_index_to_split[a] != panel_index_to_split[b]:
299
  return panel_index_to_split[a] < panel_index_to_split[b]
300
+
301
  # try to split the panels using a "vertical" lines
302
+ overlapping_x_ranges = merge_overlapping_ranges(
303
+ [(x1, x2) for x1, y1, x2, y2 in rects_copy])
304
  panel_index_to_split = {}
305
  for split_index, (x1, x2) in enumerate(overlapping_x_ranges[::-1]):
306
  for i, index in enumerate(rect_index):
 
308
  panel_index_to_split[index] = split_index
309
  if panel_index_to_split[a] != panel_index_to_split[b]:
310
  return panel_index_to_split[a] < panel_index_to_split[b]
311
+
312
  # otherwise, erode the rectangles and try again
313
  rects = [erode_rectangle(rect, 0.05) for rect in rects]
314
 
315
+
316
  def erode_rectangle(bbox, erosion_factor):
317
  x1, y1, x2, y2 = bbox
318
  w, h = x2 - x1, y2 - y1
 
330
  x1, y1, x2, y2 = cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2
331
  return [x1, y1, x2, y2]
332
 
333
+
334
  def merge_overlapping_ranges(ranges):
335
  """
336
  ranges: list of tuples (x1, x2)
 
352
  merged_ranges.append((prev_x1, prev_x2))
353
  return merged_ranges
354
 
355
+
356
  def sort_text_boxes_in_reading_order(text_bboxes, sorted_panel_bboxes):
357
  text_bboxes = convert_to_list_of_lists(text_bboxes)
358
  sorted_panel_bboxes = convert_to_list_of_lists(sorted_panel_bboxes)
 
364
  groups = groupby(range(len(nums)), key=lambda i: nums[i])
365
  return [list(indices) for _, indices in groups]
366
 
367
+ panel_id_for_text = get_text_to_panel_mapping(
368
+ text_bboxes, sorted_panel_bboxes)
369
  indices_of_texts = list(range(len(text_bboxes)))
370
+ indices_of_texts, panel_id_for_text = zip(
371
+ *sorted(zip(indices_of_texts, panel_id_for_text), key=lambda x: x[1]))
372
  indices_of_texts = list(indices_of_texts)
373
  grouped_indices = indices_of_same_elements(panel_id_for_text)
374
  for group in grouped_indices:
375
  subset_of_text_indices = [indices_of_texts[i] for i in group]
376
+ text_bboxes_of_subset = [text_bboxes[i]
377
+ for i in subset_of_text_indices]
378
  sorted_subset_indices = sort_texts_within_panel(text_bboxes_of_subset)
379
+ indices_of_texts[group[0]: group[-1] + 1] = [subset_of_text_indices[i]
380
+ for i in sorted_subset_indices]
381
  return indices_of_texts
382
 
383
+
384
  def get_text_to_panel_mapping(text_bboxes, sorted_panel_bboxes):
385
  text_to_panel_mapping = []
386
  for text_bbox in text_bboxes:
 
393
  for j, annotation in enumerate(sorted_panel_bboxes):
394
  shapely_annotation_polygon = box(*annotation)
395
  if shapely_text_polygon.intersects(shapely_annotation_polygon):
396
+ all_intersections.append(
397
+ (shapely_text_polygon.intersection(shapely_annotation_polygon).area, j))
398
+ all_distances.append(
399
+ (shapely_text_polygon.distance(shapely_annotation_polygon), j))
400
  if len(all_intersections) == 0:
401
+ text_to_panel_mapping.append(
402
+ min(all_distances, key=lambda x: x[0])[1])
403
  else:
404
+ text_to_panel_mapping.append(
405
+ max(all_intersections, key=lambda x: x[0])[1])
406
  return text_to_panel_mapping
407
 
408
+
409
  def sort_texts_within_panel(rects):
410
  smallest_y = float("inf")
411
  greatest_x = float("-inf")
 
413
  x1, y1, x2, y2 = rect
414
  smallest_y = min(smallest_y, y1)
415
  greatest_x = max(greatest_x, x2)
416
+
417
  reference_point = Point(greatest_x, smallest_y)
418
 
419
  polygons_and_index = []
420
  for i, rect in enumerate(rects):
421
  x1, y1, x2, y2 = rect
422
+ polygons_and_index.append((box(x1, y1, x2, y2), i))
423
  # sort points by closest to reference point
424
+ polygons_and_index = sorted(
425
+ polygons_and_index, key=lambda x: reference_point.distance(x[0]))
426
  indices = [x[1] for x in polygons_and_index]
427
  return indices
428
 
429
+
430
  def force_to_be_valid_bboxes(bboxes):
431
  if len(bboxes) == 0:
432
  return bboxes
 
435
  bboxes_as_xywh[:, 2] = torch.clamp(bboxes_as_xywh[:, 2], min=1)
436
  bboxes_as_xywh[:, 3] = torch.clamp(bboxes_as_xywh[:, 3], min=1)
437
  bboxes_as_xywh = bboxes_as_xywh.tolist()
438
+ bboxes_as_xyxy = [[x1, y1, x1 + w, y1 + h]
439
+ for x1, y1, w, h in bboxes_as_xywh]
440
  return bboxes_as_xyxy
441
 
442
+
443
  def x1y1wh_to_x1y1x2y2(bbox):
444
  x1, y1, w, h = bbox
445
  return [x1, y1, x1 + w, y1 + h]
446
 
447
+
448
  def x1y1x2y2_to_xywh(bbox):
449
  x1, y1, x2, y2 = bbox
450
  return [x1, y1, x2 - x1, y2 - y1]
451
 
452
+
453
  def convert_to_list_of_lists(rects):
454
  if isinstance(rects, torch.Tensor):
455
  return rects.tolist()
456
  if isinstance(rects, np.ndarray):
457
  return rects.tolist()
458
+ return [[a, b, c, d] for a, b, c, d in rects]