app.py CHANGED
@@ -1,24 +1,25 @@
 
 
1
  import json
2
- from collections import defaultdict
 
 
 
3
  import safetensors
4
  import timm
5
- from transformers import AutoProcessor
6
- import gradio as gr
7
  import torch
8
- import time
9
- from florence2_implementation.modeling_florence2 import Florence2ForConditionalGeneration
10
- from torchvision.transforms import InterpolationMode
11
- from PIL import Image
12
  import torchvision.transforms.functional as TF
 
 
13
  from torchvision.transforms import transforms
14
- import random
15
- import csv
16
- import os
17
 
18
  torch.set_grad_enabled(False)
19
 
20
  # HF now (Feb 20, 2025) imposes a storage limit of 1GB. Will have to pull JTP from other places.
21
- os.system("wget -nv https://huggingface.co/RedRocket/JointTaggerProject/resolve/main/JTP_PILOT2/JTP_PILOT2-e3-vit_so400m_patch14_siglip_384.safetensors")
22
 
23
 
24
  category_id_to_str = {
@@ -34,16 +35,12 @@ class Pruner:
34
  def __init__(self, path_to_tag_list_csv):
35
  species_tags = set()
36
  allowed_tags = set()
37
- with open(path_to_tag_list_csv, "r") as f:
38
- reader = csv.reader(f)
39
- header = next(reader)
40
- name_index = header.index("name")
41
- category_index = header.index("category")
42
- post_count_index = header.index("post_count")
43
  for row in reader:
44
- if int(row[post_count_index]) > 20:
45
- category = row[category_index]
46
- name = row[name_index]
47
  if category == "5":
48
  species_tags.add(name)
49
  allowed_tags.add(name)
@@ -198,13 +195,6 @@ model = Florence2ForConditionalGeneration.from_pretrained(model_id,).eval()
198
  processor = AutoProcessor.from_pretrained("./florence2_implementation/", trust_remote_code=True)
199
 
200
 
201
- tree = defaultdict(list)
202
- with open('tag_implications-2024-05-05.csv', 'rt') as csvfile:
203
- reader = csv.DictReader(csvfile)
204
- for row in reader:
205
- if row["status"] == "active":
206
- tree[row["consequent_name"]].append(row["antecedent_name"])
207
-
208
 
209
  title = """<h1 align="center">Furrence2 Captioner Demo</h1>"""
210
  description=(
@@ -237,10 +227,9 @@ allowed_tags = list(tags.keys())
237
  for idx, tag in enumerate(allowed_tags):
238
  allowed_tags[idx] = tag
239
 
240
- pruner = Pruner("tags-2024-05-05.csv")
241
 
242
  def generate_prompt(image, expected_caption_length):
243
- global THRESHOLD, tree, tokenizer, model, tagger_model, tagger_transform
244
  tagger_input = tagger_transform(image.convert('RGBA')).unsqueeze(0)
245
  probabilities = tagger_model(tagger_input)
246
  for prob in probabilities:
@@ -319,7 +308,7 @@ def main():
319
  value="Caption it!", interactive=True, variant="primary",
320
  )
321
 
322
- caption_output = gr.Textbox(lines=1, label="Caption Output")
323
  caption_button.click(
324
  inference_caption,
325
  [
 
1
+ import csv
2
+ import gzip
3
  import json
4
+ import random
5
+ import time
6
+
7
+ import gradio as gr
8
  import safetensors
9
  import timm
 
 
10
  import torch
 
 
 
 
11
  import torchvision.transforms.functional as TF
12
+ from PIL import Image
13
+ from torchvision.transforms import InterpolationMode
14
  from torchvision.transforms import transforms
15
+ from transformers import AutoProcessor
16
+
17
+ from florence2_implementation.modeling_florence2 import Florence2ForConditionalGeneration
18
 
19
  torch.set_grad_enabled(False)
20
 
21
  # HF now (Feb 20, 2025) imposes a storage limit of 1GB. Will have to pull JTP from other places.
22
+ # os.system("wget -nv https://huggingface.co/RedRocket/JointTaggerProject/resolve/main/JTP_PILOT2/JTP_PILOT2-e3-vit_so400m_patch14_siglip_384.safetensors")
23
 
24
 
25
  category_id_to_str = {
 
35
  def __init__(self, path_to_tag_list_csv):
36
  species_tags = set()
37
  allowed_tags = set()
38
+ with gzip.open(path_to_tag_list_csv, mode="rt", encoding="utf8") as csv_file:
39
+ reader = csv.DictReader(csv_file)
 
 
 
 
40
  for row in reader:
41
+ if int(row["post_count"]) > 20:
42
+ category = row["category"]
43
+ name = row["name"]
44
  if category == "5":
45
  species_tags.add(name)
46
  allowed_tags.add(name)
 
195
  processor = AutoProcessor.from_pretrained("./florence2_implementation/", trust_remote_code=True)
196
 
197
 
 
 
 
 
 
 
 
198
 
199
  title = """<h1 align="center">Furrence2 Captioner Demo</h1>"""
200
  description=(
 
227
  for idx, tag in enumerate(allowed_tags):
228
  allowed_tags[idx] = tag
229
 
230
+ pruner = Pruner("tags-2025-11-25.csv.gz")
231
 
232
  def generate_prompt(image, expected_caption_length):
 
233
  tagger_input = tagger_transform(image.convert('RGBA')).unsqueeze(0)
234
  probabilities = tagger_model(tagger_input)
235
  for prob in probabilities:
 
308
  value="Caption it!", interactive=True, variant="primary",
309
  )
310
 
311
+ caption_output = gr.Textbox(lines=3, label="Caption Output")
312
  caption_button.click(
313
  inference_caption,
314
  [
florence2_implementation/configuration_florence2.py CHANGED
@@ -14,9 +14,6 @@
14
  import warnings
15
  """ Florence-2 configuration"""
16
 
17
- from typing import Optional
18
-
19
- from transformers import AutoConfig
20
  from transformers.configuration_utils import PretrainedConfig
21
  from transformers.utils import logging
22
 
@@ -77,7 +74,7 @@ class Florence2VisionConfig(PretrainedConfig):
77
  >>> configuration = model.config
78
  ```"""
79
 
80
- model_type = "florence2_vision"
81
  keys_to_ignore_at_inference = ["past_key_values"]
82
 
83
  def __init__(
@@ -118,7 +115,6 @@ class Florence2VisionConfig(PretrainedConfig):
118
  super().__init__(**kwargs)
119
 
120
 
121
-
122
  class Florence2LanguageConfig(PretrainedConfig):
123
  r"""
124
  This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
@@ -272,7 +268,7 @@ class Florence2LanguageConfig(PretrainedConfig):
272
  class Florence2Config(PretrainedConfig):
273
  r"""
274
  This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate an
275
- Florence-2 model according to the specified arguments, defining the model architecture.
276
 
277
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
278
  documentation from [`PretrainedConfig`] for more information.
@@ -281,7 +277,7 @@ class Florence2Config(PretrainedConfig):
281
  vision_config (`Florence2VisionConfig`, *optional*):
282
  Custom vision config or dict
283
  text_config (`Union[AutoConfig, dict]`, *optional*):
284
- The config object of the text backbone.
285
  ignore_index (`int`, *optional*, defaults to -100):
286
  The ignore index for the loss function.
287
  vocab_size (`int`, *optional*, defaults to 51289):
@@ -327,7 +323,7 @@ class Florence2Config(PretrainedConfig):
327
  self.vocab_size = vocab_size
328
  self.projection_dim = projection_dim
329
  if vision_config is not None:
330
- vision_config = PretrainedConfig(**vision_config)
331
  self.vision_config = vision_config
332
  self.vocab_size = self.vocab_size
333
 
 
14
  import warnings
15
  """ Florence-2 configuration"""
16
 
 
 
 
17
  from transformers.configuration_utils import PretrainedConfig
18
  from transformers.utils import logging
19
 
 
74
  >>> configuration = model.config
75
  ```"""
76
 
77
+ model_type = "davit"
78
  keys_to_ignore_at_inference = ["past_key_values"]
79
 
80
  def __init__(
 
115
  super().__init__(**kwargs)
116
 
117
 
 
118
  class Florence2LanguageConfig(PretrainedConfig):
119
  r"""
120
  This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
 
268
  class Florence2Config(PretrainedConfig):
269
  r"""
270
  This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate an
271
+ Florence-2 model according to the specified arguments, defining the model architecture.
272
 
273
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
274
  documentation from [`PretrainedConfig`] for more information.
 
277
  vision_config (`Florence2VisionConfig`, *optional*):
278
  Custom vision config or dict
279
  text_config (`Union[AutoConfig, dict]`, *optional*):
280
+ The config object of the text backbone.
281
  ignore_index (`int`, *optional*, defaults to -100):
282
  The ignore index for the loss function.
283
  vocab_size (`int`, *optional*, defaults to 51289):
 
323
  self.vocab_size = vocab_size
324
  self.projection_dim = projection_dim
325
  if vision_config is not None:
326
+ vision_config = Florence2VisionConfig(**vision_config)
327
  self.vision_config = vision_config
328
  self.vocab_size = self.vocab_size
329
 
florence2_implementation/modeling_florence2.py CHANGED
@@ -23,10 +23,10 @@ import torch.utils.checkpoint
23
  from torch import nn
24
  import torch.nn.functional as F
25
  import torch.utils.checkpoint as checkpoint
26
- from torch.nn import CrossEntropyLoss
27
  from collections import OrderedDict
28
  from einops import rearrange
29
- from timm.models.layers import DropPath, trunc_normal_
30
 
31
  from transformers.modeling_utils import PreTrainedModel
32
  from transformers.generation.utils import GenerationMixin
@@ -34,17 +34,15 @@ from transformers.utils import (
34
  ModelOutput,
35
  add_start_docstrings,
36
  add_start_docstrings_to_model_forward,
37
- is_flash_attn_2_available,
38
  logging,
39
  replace_return_docstrings,
40
  is_flash_attn_2_available,
41
  is_flash_attn_greater_or_equal_2_10,
42
  )
43
- from .configuration_florence2 import Florence2Config
44
  from .configuration_florence2 import Florence2LanguageConfig
45
  from .configuration_florence2 import Florence2VisionConfig
46
 
47
-
48
  from transformers.activations import ACT2FN
49
  from transformers.modeling_attn_mask_utils import (
50
  _prepare_4d_attention_mask,
@@ -59,7 +57,6 @@ from transformers.modeling_outputs import (
59
  Seq2SeqModelOutput,
60
  )
61
 
62
-
63
  if is_flash_attn_2_available():
64
  from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
65
 
@@ -67,6 +64,7 @@ logger = logging.get_logger(__name__)
67
 
68
  _CONFIG_FOR_DOC = "Florence2Config"
69
 
 
70
  class LearnedAbsolutePositionEmbedding2D(nn.Module):
71
  """
72
  This module learns positional embeddings up to a fixed maximum size.
@@ -79,7 +77,7 @@ class LearnedAbsolutePositionEmbedding2D(nn.Module):
79
 
80
  def forward(self, pixel_values):
81
  """
82
- pixel_values: (batch_size, height, width, num_channels)
83
  returns: (batch_size, height, width, embedding_dim * 2)
84
  """
85
  if len(pixel_values.shape) != 4:
@@ -100,6 +98,7 @@ class LearnedAbsolutePositionEmbedding2D(nn.Module):
100
  pos = pos.permute(0, 2, 3, 1)
101
  return pos
102
 
 
103
  class PositionalEmbeddingCosine1D(nn.Module):
104
  """
105
  This class implements a very simple positional encoding. It follows closely
@@ -111,6 +110,7 @@ class PositionalEmbeddingCosine1D(nn.Module):
111
  dropout_prob: The dropout probability.
112
  max_seq_len: The maximum sequence length for which the positional encodings are precomputed.
113
  """
 
114
  def __init__(
115
  self,
116
  embed_dim: int = 512,
@@ -126,7 +126,7 @@ class PositionalEmbeddingCosine1D(nn.Module):
126
  # of the position index (i.e., the row index).
127
  frequencies = \
128
  torch.arange(0, self.max_seq_len) \
129
- .reshape(self.max_seq_len, 1) * denominator
130
  pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim))
131
  # Populate uneven entries.
132
  pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
@@ -166,6 +166,7 @@ class LearnedAbsolutePositionEmbedding1D(nn.Module):
166
  embed_dim: The dimension of the embeddings.
167
  max_seq_len: The maximum length to precompute the positional encodings.
168
  """
 
169
  def __init__(
170
  self,
171
  embedding_dim: int = 512,
@@ -199,7 +200,6 @@ class LearnedAbsolutePositionEmbedding1D(nn.Module):
199
  return pos_embeds
200
 
201
 
202
-
203
  class MySequential(nn.Sequential):
204
  def forward(self, *inputs):
205
  for module in self._modules.values():
@@ -234,11 +234,11 @@ class PreNorm(nn.Module):
234
 
235
  class Mlp(nn.Module):
236
  def __init__(
237
- self,
238
- in_features,
239
- hidden_features=None,
240
- out_features=None,
241
- act_layer=nn.GELU,
242
  ):
243
  super().__init__()
244
  out_features = out_features or in_features
@@ -255,12 +255,12 @@ class Mlp(nn.Module):
255
 
256
  class DepthWiseConv2d(nn.Module):
257
  def __init__(
258
- self,
259
- dim_in,
260
- kernel_size,
261
- padding,
262
- stride,
263
- bias=True,
264
  ):
265
  super().__init__()
266
  self.dw = nn.Conv2d(
@@ -288,14 +288,14 @@ class ConvEmbed(nn.Module):
288
  """
289
 
290
  def __init__(
291
- self,
292
- patch_size=7,
293
- in_chans=3,
294
- embed_dim=64,
295
- stride=4,
296
- padding=2,
297
- norm_layer=None,
298
- pre_norm=True
299
  ):
300
  super().__init__()
301
  self.patch_size = patch_size
@@ -374,7 +374,7 @@ class ChannelBlock(nn.Module):
374
  self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
375
  self.ffn = PreNorm(
376
  norm_layer(dim),
377
- Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
378
  drop_path
379
  )
380
 
@@ -398,9 +398,9 @@ def window_partition(x, window_size: int):
398
 
399
 
400
  def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
401
- B = batch_size
402
  # this would cause ONNX conversion to fail for dynamic axes, because the value is treated as a constant
403
- # int(windows.shape[0] / (H * W / window_size / window_size))
404
  x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
405
  x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
406
  return x
@@ -408,7 +408,6 @@ def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
408
 
409
  class WindowAttention(nn.Module):
410
  def __init__(self, dim, num_heads, window_size, qkv_bias=True):
411
-
412
  super().__init__()
413
  self.dim = dim
414
  self.window_size = window_size
@@ -422,7 +421,6 @@ class WindowAttention(nn.Module):
422
  self.softmax = nn.Softmax(dim=-1)
423
 
424
  def forward(self, x, size):
425
-
426
  H, W = size
427
  B, L, C = x.shape
428
  assert L == H * W, "input feature has wrong size"
@@ -484,7 +482,7 @@ class SpatialBlock(nn.Module):
484
  self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
485
  self.ffn = PreNorm(
486
  norm_layer(dim),
487
- Mlp(in_features=dim, hidden_features=int(dim*mlp_ratio), act_layer=act_layer),
488
  drop_path
489
  )
490
 
@@ -523,26 +521,26 @@ class DaViT(nn.Module):
523
  """
524
 
525
  def __init__(
526
- self,
527
- in_chans=3,
528
- num_classes=1000,
529
- depths=(1, 1, 3, 1),
530
- patch_size=(7, 2, 2, 2),
531
- patch_stride=(4, 2, 2, 2),
532
- patch_padding=(3, 0, 0, 0),
533
- patch_prenorm=(False, False, False, False),
534
- embed_dims=(64, 128, 192, 256),
535
- num_heads=(3, 6, 12, 24),
536
- num_groups=(3, 6, 12, 24),
537
- window_size=7,
538
- mlp_ratio=4.,
539
- qkv_bias=True,
540
- drop_path_rate=0.1,
541
- norm_layer=nn.LayerNorm,
542
- enable_checkpoint=False,
543
- conv_at_attn=True,
544
- conv_at_ffn=True,
545
- ):
546
  super().__init__()
547
 
548
  self.num_classes = num_classes
@@ -554,7 +552,7 @@ class DaViT(nn.Module):
554
  assert self.num_stages == len(self.num_heads) == len(self.num_groups)
555
 
556
  num_stages = len(embed_dims)
557
- dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)*2)]
558
 
559
  depth_offset = 0
560
  convs = []
@@ -576,32 +574,32 @@ class DaViT(nn.Module):
576
  MySequential(OrderedDict([
577
  (
578
  'spatial_block', SpatialBlock(
579
- embed_dims[i],
580
- num_heads[i],
581
- window_size,
582
- drop_path_rate=dpr[depth_offset+j*2],
583
- qkv_bias=qkv_bias,
584
- mlp_ratio=mlp_ratio,
585
- conv_at_attn=conv_at_attn,
586
- conv_at_ffn=conv_at_ffn,
587
- )
588
  ),
589
  (
590
  'channel_block', ChannelBlock(
591
- embed_dims[i],
592
- num_groups[i],
593
- drop_path_rate=dpr[depth_offset+j*2+1],
594
- qkv_bias=qkv_bias,
595
- mlp_ratio=mlp_ratio,
596
- conv_at_attn=conv_at_attn,
597
- conv_at_ffn=conv_at_ffn,
598
- )
599
  )
600
  ])) for j in range(depths[i])
601
  ]
602
  )
603
  blocks.append(block)
604
- depth_offset += depths[i]*2
605
 
606
  self.convs = nn.ModuleList(convs)
607
  self.blocks = nn.ModuleList(blocks)
@@ -610,32 +608,13 @@ class DaViT(nn.Module):
610
  self.avgpool = nn.AdaptiveAvgPool1d(1)
611
  self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
612
 
613
- self.apply(self._init_weights)
614
-
615
  @property
616
  def dim_out(self):
617
  return self.embed_dims[-1]
618
 
619
- def _init_weights(self, m):
620
- if isinstance(m, nn.Linear):
621
- trunc_normal_(m.weight, std=0.02)
622
- if m.bias is not None:
623
- nn.init.constant_(m.bias, 0)
624
- elif isinstance(m, nn.Conv2d):
625
- nn.init.normal_(m.weight, std=0.02)
626
- for name, _ in m.named_parameters():
627
- if name in ['bias']:
628
- nn.init.constant_(m.bias, 0)
629
- elif isinstance(m, nn.LayerNorm):
630
- nn.init.constant_(m.weight, 1.0)
631
- nn.init.constant_(m.bias, 0)
632
- elif isinstance(m, nn.BatchNorm2d):
633
- nn.init.constant_(m.weight, 1.0)
634
- nn.init.constant_(m.bias, 0)
635
-
636
  def forward_features_unpool(self, x):
637
  """
638
- forward until avg pooling
639
  Args:
640
  x (_type_): input image tensor
641
  """
@@ -663,7 +642,7 @@ class DaViT(nn.Module):
663
  x = self.forward_features(x)
664
  x = self.head(x)
665
  return x
666
-
667
  @classmethod
668
  def from_config(cls, config):
669
  return cls(
@@ -680,12 +659,11 @@ class DaViT(nn.Module):
680
  )
681
 
682
 
683
-
684
-
685
  if is_flash_attn_2_available():
686
  from flash_attn import flash_attn_func, flash_attn_varlen_func
687
  from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
688
 
 
689
  # Copied from transformers.models.llama.modeling_llama._get_unpad_data
690
  def _get_unpad_data(attention_mask):
691
  seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
@@ -754,14 +732,14 @@ class Florence2Attention(nn.Module):
754
  """Multi-headed attention from 'Attention Is All You Need' paper"""
755
 
756
  def __init__(
757
- self,
758
- embed_dim: int,
759
- num_heads: int,
760
- dropout: float = 0.0,
761
- is_decoder: bool = False,
762
- bias: bool = True,
763
- is_causal: bool = False,
764
- config: Optional[Florence2LanguageConfig] = None,
765
  ):
766
  super().__init__()
767
  self.embed_dim = embed_dim
@@ -775,7 +753,7 @@ class Florence2Attention(nn.Module):
775
  f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
776
  f" and `num_heads`: {num_heads})."
777
  )
778
- self.scaling = self.head_dim**-0.5
779
  self.is_decoder = is_decoder
780
  self.is_causal = is_causal
781
 
@@ -788,13 +766,13 @@ class Florence2Attention(nn.Module):
788
  return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
789
 
790
  def forward(
791
- self,
792
- hidden_states: torch.Tensor,
793
- key_value_states: Optional[torch.Tensor] = None,
794
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
795
- attention_mask: Optional[torch.Tensor] = None,
796
- layer_head_mask: Optional[torch.Tensor] = None,
797
- output_attentions: bool = False,
798
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
799
  """Input shape: Batch x Time x Channel"""
800
 
@@ -811,9 +789,9 @@ class Florence2Attention(nn.Module):
811
  # is checking that the `sequence_length` of the `past_key_value` is the same as
812
  # the provided `key_value_states` to support prefix tuning
813
  if (
814
- is_cross_attention
815
- and past_key_value is not None
816
- and past_key_value[0].shape[2] == key_value_states.shape[1]
817
  ):
818
  # reuse k,v, cross_attentions
819
  key_states = past_key_value[0]
@@ -928,13 +906,13 @@ class Florence2FlashAttention2(Florence2Attention):
928
  return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
929
 
930
  def forward(
931
- self,
932
- hidden_states: torch.Tensor,
933
- key_value_states: Optional[torch.Tensor] = None,
934
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
935
- attention_mask: Optional[torch.Tensor] = None,
936
- layer_head_mask: Optional[torch.Tensor] = None,
937
- output_attentions: bool = False,
938
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
939
  # Florence2FlashAttention2 does not support output_attentions
940
  if output_attentions:
@@ -953,9 +931,9 @@ class Florence2FlashAttention2(Florence2Attention):
953
  # is checking that the `sequence_length` of the `past_key_value` is the same as
954
  # the provided `key_value_states` to support prefix tuning
955
  if (
956
- is_cross_attention
957
- and past_key_value is not None
958
- and past_key_value[0].shape[2] == key_value_states.shape[1]
959
  ):
960
  # reuse k,v, cross_attentions
961
  key_states = past_key_value[0].transpose(1, 2)
@@ -1029,7 +1007,7 @@ class Florence2FlashAttention2(Florence2Attention):
1029
 
1030
  # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
1031
  def _flash_attention_forward(
1032
- self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
1033
  ):
1034
  """
1035
  Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
@@ -1129,13 +1107,13 @@ class Florence2FlashAttention2(Florence2Attention):
1129
 
1130
  class Florence2SdpaAttention(Florence2Attention):
1131
  def forward(
1132
- self,
1133
- hidden_states: torch.Tensor,
1134
- key_value_states: Optional[torch.Tensor] = None,
1135
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
1136
- attention_mask: Optional[torch.Tensor] = None,
1137
- layer_head_mask: Optional[torch.Tensor] = None,
1138
- output_attentions: bool = False,
1139
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1140
  """Input shape: Batch x Time x Channel"""
1141
  if output_attentions or layer_head_mask is not None:
@@ -1166,9 +1144,9 @@ class Florence2SdpaAttention(Florence2Attention):
1166
  # is checking that the `sequence_length` of the `past_key_value` is the same as
1167
  # the provided `key_value_states` to support prefix tuning
1168
  if (
1169
- is_cross_attention
1170
- and past_key_value is not None
1171
- and past_key_value[0].shape[2] == key_value_states.shape[1]
1172
  ):
1173
  # reuse k,v, cross_attentions
1174
  key_states = past_key_value[0]
@@ -1260,11 +1238,11 @@ class Florence2EncoderLayer(nn.Module):
1260
  self.final_layer_norm = nn.LayerNorm(self.embed_dim)
1261
 
1262
  def forward(
1263
- self,
1264
- hidden_states: torch.FloatTensor,
1265
- attention_mask: torch.FloatTensor,
1266
- layer_head_mask: torch.FloatTensor,
1267
- output_attentions: Optional[bool] = False,
1268
  ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
1269
  """
1270
  Args:
@@ -1297,7 +1275,7 @@ class Florence2EncoderLayer(nn.Module):
1297
  hidden_states = self.final_layer_norm(hidden_states)
1298
 
1299
  if hidden_states.dtype == torch.float16 and (
1300
- torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
1301
  ):
1302
  clamp_value = torch.finfo(hidden_states.dtype).max - 1000
1303
  hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
@@ -1341,16 +1319,16 @@ class Florence2DecoderLayer(nn.Module):
1341
  self.final_layer_norm = nn.LayerNorm(self.embed_dim)
1342
 
1343
  def forward(
1344
- self,
1345
- hidden_states: torch.Tensor,
1346
- attention_mask: Optional[torch.Tensor] = None,
1347
- encoder_hidden_states: Optional[torch.Tensor] = None,
1348
- encoder_attention_mask: Optional[torch.Tensor] = None,
1349
- layer_head_mask: Optional[torch.Tensor] = None,
1350
- cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
1351
- past_key_value: Optional[Tuple[torch.Tensor]] = None,
1352
- output_attentions: Optional[bool] = False,
1353
- use_cache: Optional[bool] = True,
1354
  ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
1355
  """
1356
  Args:
@@ -1430,7 +1408,6 @@ class Florence2DecoderLayer(nn.Module):
1430
  return outputs
1431
 
1432
 
1433
-
1434
  class Florence2LanguagePreTrainedModel(PreTrainedModel):
1435
  config_class = Florence2LanguageConfig
1436
  base_model_prefix = "model"
@@ -1451,6 +1428,17 @@ class Florence2LanguagePreTrainedModel(PreTrainedModel):
1451
  module.weight.data.normal_(mean=0.0, std=std)
1452
  if module.padding_idx is not None:
1453
  module.weight.data[module.padding_idx].zero_()
 
 
 
 
 
 
 
 
 
 
 
1454
 
1455
  @property
1456
  def dummy_inputs(self):
@@ -1511,14 +1499,14 @@ class Florence2Encoder(Florence2LanguagePreTrainedModel):
1511
  self.embed_tokens = value
1512
 
1513
  def forward(
1514
- self,
1515
- input_ids: torch.LongTensor = None,
1516
- attention_mask: Optional[torch.Tensor] = None,
1517
- head_mask: Optional[torch.Tensor] = None,
1518
- inputs_embeds: Optional[torch.FloatTensor] = None,
1519
- output_attentions: Optional[bool] = None,
1520
- output_hidden_states: Optional[bool] = None,
1521
- return_dict: Optional[bool] = None,
1522
  ) -> Union[Tuple, BaseModelOutput]:
1523
  r"""
1524
  Args:
@@ -1696,19 +1684,19 @@ class Florence2Decoder(Florence2LanguagePreTrainedModel):
1696
  self.embed_tokens = value
1697
 
1698
  def forward(
1699
- self,
1700
- input_ids: torch.LongTensor = None,
1701
- attention_mask: Optional[torch.Tensor] = None,
1702
- encoder_hidden_states: Optional[torch.FloatTensor] = None,
1703
- encoder_attention_mask: Optional[torch.LongTensor] = None,
1704
- head_mask: Optional[torch.Tensor] = None,
1705
- cross_attn_head_mask: Optional[torch.Tensor] = None,
1706
- past_key_values: Optional[List[torch.FloatTensor]] = None,
1707
- inputs_embeds: Optional[torch.FloatTensor] = None,
1708
- use_cache: Optional[bool] = None,
1709
- output_attentions: Optional[bool] = None,
1710
- output_hidden_states: Optional[bool] = None,
1711
- return_dict: Optional[bool] = None,
1712
  ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
1713
  r"""
1714
  Args:
@@ -1973,22 +1961,22 @@ class Florence2LanguageModel(Florence2LanguagePreTrainedModel):
1973
  return self.decoder
1974
 
1975
  def forward(
1976
- self,
1977
- input_ids: torch.LongTensor = None,
1978
- attention_mask: Optional[torch.Tensor] = None,
1979
- decoder_input_ids: Optional[torch.LongTensor] = None,
1980
- decoder_attention_mask: Optional[torch.LongTensor] = None,
1981
- head_mask: Optional[torch.Tensor] = None,
1982
- decoder_head_mask: Optional[torch.Tensor] = None,
1983
- cross_attn_head_mask: Optional[torch.Tensor] = None,
1984
- encoder_outputs: Optional[List[torch.FloatTensor]] = None,
1985
- past_key_values: Optional[List[torch.FloatTensor]] = None,
1986
- inputs_embeds: Optional[torch.FloatTensor] = None,
1987
- decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
1988
- use_cache: Optional[bool] = None,
1989
- output_attentions: Optional[bool] = None,
1990
- output_hidden_states: Optional[bool] = None,
1991
- return_dict: Optional[bool] = None,
1992
  ) -> Union[Tuple, Seq2SeqModelOutput]:
1993
  # different to other models, Florence2 automatically creates decoder_input_ids from
1994
  # input_ids if no decoder_input_ids are provided
@@ -2074,14 +2062,21 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2074
  # Initialize weights and apply final processing
2075
  self.post_init()
2076
 
 
 
 
 
 
 
2077
  def get_encoder(self):
2078
  return self.model.get_encoder()
2079
 
2080
  def get_decoder(self):
2081
  return self.model.get_decoder()
2082
 
2083
- def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
2084
- new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
 
2085
  self._resize_final_logits_bias(new_embeddings.weight.shape[0])
2086
  return new_embeddings
2087
 
@@ -2101,23 +2096,23 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2101
  self.lm_head = new_embeddings
2102
 
2103
  def forward(
2104
- self,
2105
- input_ids: torch.LongTensor = None,
2106
- attention_mask: Optional[torch.Tensor] = None,
2107
- decoder_input_ids: Optional[torch.LongTensor] = None,
2108
- decoder_attention_mask: Optional[torch.LongTensor] = None,
2109
- head_mask: Optional[torch.Tensor] = None,
2110
- decoder_head_mask: Optional[torch.Tensor] = None,
2111
- cross_attn_head_mask: Optional[torch.Tensor] = None,
2112
- encoder_outputs: Optional[List[torch.FloatTensor]] = None,
2113
- past_key_values: Optional[List[torch.FloatTensor]] = None,
2114
- inputs_embeds: Optional[torch.FloatTensor] = None,
2115
- decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
2116
- labels: Optional[torch.LongTensor] = None,
2117
- use_cache: Optional[bool] = None,
2118
- output_attentions: Optional[bool] = None,
2119
- output_hidden_states: Optional[bool] = None,
2120
- return_dict: Optional[bool] = None,
2121
  ) -> Union[Tuple, Seq2SeqLMOutput]:
2122
  r"""
2123
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -2182,17 +2177,17 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2182
  )
2183
 
2184
  def prepare_inputs_for_generation(
2185
- self,
2186
- decoder_input_ids,
2187
- past_key_values=None,
2188
- attention_mask=None,
2189
- decoder_attention_mask=None,
2190
- head_mask=None,
2191
- decoder_head_mask=None,
2192
- cross_attn_head_mask=None,
2193
- use_cache=None,
2194
- encoder_outputs=None,
2195
- **kwargs,
2196
  ):
2197
  # cut decoder_input_ids if past_key_values is used
2198
  if past_key_values is not None:
@@ -2234,6 +2229,7 @@ class Florence2LanguageForConditionalGeneration(Florence2LanguagePreTrainedModel
2234
  )
2235
  return reordered_past
2236
 
 
2237
  @dataclass
2238
  class Florence2Seq2SeqLMOutput(ModelOutput):
2239
  """
@@ -2415,6 +2411,7 @@ FLORENCE2_INPUTS_DOCSTRING = r"""
2415
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
2416
  """
2417
 
 
2418
  @add_start_docstrings(
2419
  """The FLORENCE2 vision model without any head""",
2420
  FLORENCE2_START_DOCSTRING,
@@ -2426,7 +2423,7 @@ class Florence2VisionModel(Florence2PreTrainedModel):
2426
  self.vision_tower = DaViT.from_config(config=config)
2427
 
2428
  self.post_init()
2429
-
2430
  def forward(self, pixel_values):
2431
  if len(pixel_values.shape) == 4:
2432
  x = self.vision_tower.forward_features_unpool(pixel_values)
@@ -2448,7 +2445,7 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2448
  self._build_image_projection_layers(config)
2449
 
2450
  self.post_init()
2451
-
2452
  def _build_image_projection_layers(self, config):
2453
  image_dim_out = config.dim_embed[-1]
2454
  dim_projection = config.projection_dim
@@ -2484,7 +2481,7 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2484
  x = self.vision_tower.forward_features_unpool(pixel_values)
2485
  else:
2486
  raise ValueError(f'invalid image shape {pixel_values.shape}')
2487
-
2488
  if self.image_pos_embed is not None:
2489
  x = x.view(batch_size * T, -1, x.shape[-1])
2490
  num_tokens = x.shape[-2]
@@ -2493,7 +2490,7 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2493
  x = x.view(batch_size * T, h, w, x.shape[-1])
2494
  pos_embed = self.image_pos_embed(x)
2495
  x = x + pos_embed
2496
- x = x.view(batch_size, T * h*w, x.shape[-1])
2497
 
2498
  if self.visual_temporal_embed is not None:
2499
  visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
@@ -2521,21 +2518,22 @@ class Florence2VisionModelWithProjection(Florence2PreTrainedModel):
2521
  x = x @ self.image_projection
2522
  x = self.image_proj_norm(x)
2523
 
2524
-
2525
  return x
2526
 
2527
 
2528
-
2529
  @add_start_docstrings(
2530
  """The FLORENCE2 model which consists of a vision backbone and a language model.""",
2531
  FLORENCE2_START_DOCSTRING,
2532
  )
2533
- class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixin):
 
 
 
2534
  def __init__(self, config: Florence2Config):
2535
  super().__init__(config)
2536
  assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
2537
  self.vision_tower = DaViT.from_config(config=config.vision_config)
2538
- # remove unused layers
2539
  del self.vision_tower.head
2540
  del self.vision_tower.norms
2541
 
@@ -2545,13 +2543,11 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
2545
 
2546
  language_model = Florence2LanguageForConditionalGeneration(config=config.text_config)
2547
 
2548
- if language_model._tied_weights_keys is not None:
2549
- self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
2550
  self.language_model = language_model
2551
 
2552
  self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
2553
  self.post_init()
2554
-
2555
  def _build_image_projection_layers(self, config):
2556
  image_dim_out = config.vision_config.dim_embed[-1]
2557
  dim_projection = config.vision_config.projection_dim
@@ -2589,14 +2585,15 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
2589
  def get_input_embeddings(self):
2590
  return self.language_model.get_input_embeddings()
2591
 
2592
- def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
2593
- model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
 
2594
  # update vocab size
2595
  self.config.text_config.vocab_size = model_embeds.num_embeddings
2596
  self.config.vocab_size = model_embeds.num_embeddings
2597
  self.vocab_size = model_embeds.num_embeddings
2598
  return model_embeds
2599
-
2600
  def _encode_image(self, pixel_values):
2601
  if len(pixel_values.shape) == 4:
2602
  batch_size, C, H, W = pixel_values.shape
@@ -2604,7 +2601,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
2604
  x = self.vision_tower.forward_features_unpool(pixel_values)
2605
  else:
2606
  raise ValueError(f'invalid image shape {pixel_values.shape}')
2607
-
2608
  if self.image_pos_embed is not None:
2609
  x = x.view(batch_size * T, -1, x.shape[-1])
2610
  num_tokens = x.shape[-2]
@@ -2613,7 +2610,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
2613
  x = x.view(batch_size * T, h, w, x.shape[-1])
2614
  pos_embed = self.image_pos_embed(x)
2615
  x = x + pos_embed
2616
- x = x.view(batch_size, T * h*w, x.shape[-1])
2617
 
2618
  if self.visual_temporal_embed is not None:
2619
  visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
@@ -2641,10 +2638,10 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
2641
  x = x @ self.image_projection
2642
  x = self.image_proj_norm(x)
2643
 
2644
- return x
2645
 
2646
  def _merge_input_ids_with_image_features(
2647
- self, image_features, inputs_embeds
2648
  ):
2649
  batch_size, image_token_length = image_features.size()[:-1]
2650
  device = image_features.device
@@ -2667,28 +2664,27 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
2667
 
2668
  return inputs_embeds, attention_mask
2669
 
2670
-
2671
  @add_start_docstrings_to_model_forward(FLORENCE2_INPUTS_DOCSTRING)
2672
  @replace_return_docstrings(output_type=Florence2Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
2673
  def forward(
2674
- self,
2675
- input_ids: torch.LongTensor = None,
2676
- pixel_values: torch.FloatTensor = None,
2677
- attention_mask: Optional[torch.Tensor] = None,
2678
- decoder_input_ids: Optional[torch.LongTensor] = None,
2679
- decoder_attention_mask: Optional[torch.LongTensor] = None,
2680
- head_mask: Optional[torch.Tensor] = None,
2681
- decoder_head_mask: Optional[torch.Tensor] = None,
2682
- cross_attn_head_mask: Optional[torch.Tensor] = None,
2683
- encoder_outputs: Optional[List[torch.FloatTensor]] = None,
2684
- past_key_values: Optional[List[torch.FloatTensor]] = None,
2685
- inputs_embeds: Optional[torch.FloatTensor] = None,
2686
- decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
2687
- labels: Optional[torch.LongTensor] = None,
2688
- use_cache: Optional[bool] = None,
2689
- output_attentions: Optional[bool] = None,
2690
- output_hidden_states: Optional[bool] = None,
2691
- return_dict: Optional[bool] = None,
2692
  ) -> Union[Tuple, Florence2Seq2SeqLMOutput]:
2693
  r"""
2694
  Args:
@@ -2778,12 +2774,12 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
2778
  )
2779
 
2780
  def generate(
2781
- self,
2782
- input_ids,
2783
- inputs_embeds=None,
2784
- pixel_values=None,
2785
- **kwargs
2786
- ):
2787
 
2788
  if inputs_embeds is None:
2789
  # 1. Extra the input embeddings
@@ -2793,7 +2789,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
2793
  if pixel_values is not None:
2794
  image_features = self._encode_image(pixel_values)
2795
  inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
2796
-
2797
  return self.language_model.generate(
2798
  input_ids=None,
2799
  inputs_embeds=inputs_embeds,
@@ -2801,18 +2797,18 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
2801
  )
2802
 
2803
  def prepare_inputs_for_generation(
2804
- self,
2805
- decoder_input_ids,
2806
- past_key_values=None,
2807
- attention_mask=None,
2808
- pixel_values=None,
2809
- decoder_attention_mask=None,
2810
- head_mask=None,
2811
- decoder_head_mask=None,
2812
- cross_attn_head_mask=None,
2813
- use_cache=None,
2814
- encoder_outputs=None,
2815
- **kwargs,
2816
  ):
2817
  # cut decoder_input_ids if past_key_values is used
2818
  if past_key_values is not None:
@@ -2826,7 +2822,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
2826
  remove_prefix_length = decoder_input_ids.shape[1] - 1
2827
 
2828
  decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
2829
-
2830
  return {
2831
  "input_ids": None, # encoder_outputs is defined. input_ids not needed
2832
  "encoder_outputs": encoder_outputs,
@@ -2840,7 +2836,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixi
2840
  "cross_attn_head_mask": cross_attn_head_mask,
2841
  "use_cache": use_cache, # change this to avoid caching (presumably for debugging)
2842
  }
2843
-
2844
  def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
2845
  return self.language_model.shift_tokens_right(labels)
2846
 
 
23
  from torch import nn
24
  import torch.nn.functional as F
25
  import torch.utils.checkpoint as checkpoint
26
+ from torch.nn import CrossEntropyLoss
27
  from collections import OrderedDict
28
  from einops import rearrange
29
+ from timm.layers import DropPath
30
 
31
  from transformers.modeling_utils import PreTrainedModel
32
  from transformers.generation.utils import GenerationMixin
 
34
  ModelOutput,
35
  add_start_docstrings,
36
  add_start_docstrings_to_model_forward,
 
37
  logging,
38
  replace_return_docstrings,
39
  is_flash_attn_2_available,
40
  is_flash_attn_greater_or_equal_2_10,
41
  )
42
+ from .configuration_florence2 import Florence2Config
43
  from .configuration_florence2 import Florence2LanguageConfig
44
  from .configuration_florence2 import Florence2VisionConfig
45
 
 
46
  from transformers.activations import ACT2FN
47
  from transformers.modeling_attn_mask_utils import (
48
  _prepare_4d_attention_mask,
 
57
  Seq2SeqModelOutput,
58
  )
59
 
 
60
  if is_flash_attn_2_available():
61
  from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
62
 
 
64
 
65
  _CONFIG_FOR_DOC = "Florence2Config"
66
 
67
+
68
  class LearnedAbsolutePositionEmbedding2D(nn.Module):
69
  """
70
  This module learns positional embeddings up to a fixed maximum size.
 
77
 
78
  def forward(self, pixel_values):
79
  """
80
+ pixel_values: (batch_size, height, width, num_channels)
81
  returns: (batch_size, height, width, embedding_dim * 2)
82
  """
83
  if len(pixel_values.shape) != 4:
 
98
  pos = pos.permute(0, 2, 3, 1)
99
  return pos
100
 
101
+
102
  class PositionalEmbeddingCosine1D(nn.Module):
103
  """
104
  This class implements a very simple positional encoding. It follows closely
 
110
  dropout_prob: The dropout probability.
111
  max_seq_len: The maximum length to precompute the positional encodings.
112
  """
113
+
114
  def __init__(
115
  self,
116
  embed_dim: int = 512,
 
126
  # of the position index (i.e., the row index).
127
  frequencies = \
128
  torch.arange(0, self.max_seq_len) \
129
+ .reshape(self.max_seq_len, 1) * denominator
130
  pos_idx_to_embed = torch.zeros((self.max_seq_len, self.embed_dim))
131
  # Populate uneven entries.
132
  pos_idx_to_embed[:, 0::2] = torch.sin(frequencies)
 
166
  embed_dim: The dimension of the embeddings.
167
  max_seq_len: The maximum length to precompute the positional encodings.
168
  """
169
+
170
  def __init__(
171
  self,
172
  embedding_dim: int = 512,
 
200
  return pos_embeds
201
 
202
 
 
203
  class MySequential(nn.Sequential):
204
  def forward(self, *inputs):
205
  for module in self._modules.values():
 
234
 
235
  class Mlp(nn.Module):
236
  def __init__(
237
+ self,
238
+ in_features,
239
+ hidden_features=None,
240
+ out_features=None,
241
+ act_layer=nn.GELU,
242
  ):
243
  super().__init__()
244
  out_features = out_features or in_features
 
255
 
256
  class DepthWiseConv2d(nn.Module):
257
  def __init__(
258
+ self,
259
+ dim_in,
260
+ kernel_size,
261
+ padding,
262
+ stride,
263
+ bias=True,
264
  ):
265
  super().__init__()
266
  self.dw = nn.Conv2d(
 
288
  """
289
 
290
  def __init__(
291
+ self,
292
+ patch_size=7,
293
+ in_chans=3,
294
+ embed_dim=64,
295
+ stride=4,
296
+ padding=2,
297
+ norm_layer=None,
298
+ pre_norm=True
299
  ):
300
  super().__init__()
301
  self.patch_size = patch_size
 
374
  self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
375
  self.ffn = PreNorm(
376
  norm_layer(dim),
377
+ Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer),
378
  drop_path
379
  )
380
 
 
398
 
399
 
400
  def window_reverse(windows, batch_size: int, window_size: int, H: int, W: int):
401
+ B = batch_size
402
  # this will cause onnx conversion failed for dynamic axis, because treated as constant
403
+ # int(windows.shape[0] / (H * W / window_size / window_size))
404
  x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
405
  x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
406
  return x
 
408
 
409
  class WindowAttention(nn.Module):
410
  def __init__(self, dim, num_heads, window_size, qkv_bias=True):
 
411
  super().__init__()
412
  self.dim = dim
413
  self.window_size = window_size
 
421
  self.softmax = nn.Softmax(dim=-1)
422
 
423
  def forward(self, x, size):
 
424
  H, W = size
425
  B, L, C = x.shape
426
  assert L == H * W, "input feature has wrong size"
 
482
  self.conv2 = PreNorm(None, DepthWiseConv2d(dim, 3, 1, 1)) if conv_at_ffn else None
483
  self.ffn = PreNorm(
484
  norm_layer(dim),
485
+ Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer),
486
  drop_path
487
  )
488
 
 
521
  """
522
 
523
  def __init__(
524
+ self,
525
+ in_chans=3,
526
+ num_classes=1000,
527
+ depths=(1, 1, 3, 1),
528
+ patch_size=(7, 2, 2, 2),
529
+ patch_stride=(4, 2, 2, 2),
530
+ patch_padding=(3, 0, 0, 0),
531
+ patch_prenorm=(False, False, False, False),
532
+ embed_dims=(64, 128, 192, 256),
533
+ num_heads=(3, 6, 12, 24),
534
+ num_groups=(3, 6, 12, 24),
535
+ window_size=7,
536
+ mlp_ratio=4.,
537
+ qkv_bias=True,
538
+ drop_path_rate=0.1,
539
+ norm_layer=nn.LayerNorm,
540
+ enable_checkpoint=False,
541
+ conv_at_attn=True,
542
+ conv_at_ffn=True,
543
+ ):
544
  super().__init__()
545
 
546
  self.num_classes = num_classes
 
552
  assert self.num_stages == len(self.num_heads) == len(self.num_groups)
553
 
554
  num_stages = len(embed_dims)
555
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths) * 2)]
556
 
557
  depth_offset = 0
558
  convs = []
 
574
  MySequential(OrderedDict([
575
  (
576
  'spatial_block', SpatialBlock(
577
+ embed_dims[i],
578
+ num_heads[i],
579
+ window_size,
580
+ drop_path_rate=dpr[depth_offset + j * 2],
581
+ qkv_bias=qkv_bias,
582
+ mlp_ratio=mlp_ratio,
583
+ conv_at_attn=conv_at_attn,
584
+ conv_at_ffn=conv_at_ffn,
585
+ )
586
  ),
587
  (
588
  'channel_block', ChannelBlock(
589
+ embed_dims[i],
590
+ num_groups[i],
591
+ drop_path_rate=dpr[depth_offset + j * 2 + 1],
592
+ qkv_bias=qkv_bias,
593
+ mlp_ratio=mlp_ratio,
594
+ conv_at_attn=conv_at_attn,
595
+ conv_at_ffn=conv_at_ffn,
596
+ )
597
  )
598
  ])) for j in range(depths[i])
599
  ]
600
  )
601
  blocks.append(block)
602
+ depth_offset += depths[i] * 2
603
 
604
  self.convs = nn.ModuleList(convs)
605
  self.blocks = nn.ModuleList(blocks)
 
608
  self.avgpool = nn.AdaptiveAvgPool1d(1)
609
  self.head = nn.Linear(self.embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity()
610
 
 
 
611
  @property
612
  def dim_out(self):
613
  return self.embed_dims[-1]
614
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
615
  def forward_features_unpool(self, x):
616
  """
617
+ forward until avg pooling
618
  Args:
619
  x (_type_): input image tensor
620
  """
 
642
  x = self.forward_features(x)
643
  x = self.head(x)
644
  return x
645
+
646
  @classmethod
647
  def from_config(cls, config):
648
  return cls(
 
659
  )
660
 
661
 
 
 
662
  if is_flash_attn_2_available():
663
  from flash_attn import flash_attn_func, flash_attn_varlen_func
664
  from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
665
 
666
+
667
  # Copied from transformers.models.llama.modeling_llama._get_unpad_data
668
  def _get_unpad_data(attention_mask):
669
  seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
 
732
  """Multi-headed attention from 'Attention Is All You Need' paper"""
733
 
734
  def __init__(
735
+ self,
736
+ embed_dim: int,
737
+ num_heads: int,
738
+ dropout: float = 0.0,
739
+ is_decoder: bool = False,
740
+ bias: bool = True,
741
+ is_causal: bool = False,
742
+ config: Optional[Florence2LanguageConfig] = None,
743
  ):
744
  super().__init__()
745
  self.embed_dim = embed_dim
 
753
  f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
754
  f" and `num_heads`: {num_heads})."
755
  )
756
+ self.scaling = self.head_dim ** -0.5
757
  self.is_decoder = is_decoder
758
  self.is_causal = is_causal
759
 
 
766
  return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
767
 
768
  def forward(
769
+ self,
770
+ hidden_states: torch.Tensor,
771
+ key_value_states: Optional[torch.Tensor] = None,
772
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
773
+ attention_mask: Optional[torch.Tensor] = None,
774
+ layer_head_mask: Optional[torch.Tensor] = None,
775
+ output_attentions: bool = False,
776
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
777
  """Input shape: Batch x Time x Channel"""
778
 
 
789
  # is checking that the `sequence_length` of the `past_key_value` is the same as
790
  # the provided `key_value_states` to support prefix tuning
791
  if (
792
+ is_cross_attention
793
+ and past_key_value is not None
794
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
795
  ):
796
  # reuse k,v, cross_attentions
797
  key_states = past_key_value[0]
 
906
  return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
907
 
908
  def forward(
909
+ self,
910
+ hidden_states: torch.Tensor,
911
+ key_value_states: Optional[torch.Tensor] = None,
912
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
913
+ attention_mask: Optional[torch.Tensor] = None,
914
+ layer_head_mask: Optional[torch.Tensor] = None,
915
+ output_attentions: bool = False,
916
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
917
  # Florence2FlashAttention2 attention does not support output_attentions
918
  if output_attentions:
 
931
  # is checking that the `sequence_length` of the `past_key_value` is the same as
932
  # the provided `key_value_states` to support prefix tuning
933
  if (
934
+ is_cross_attention
935
+ and past_key_value is not None
936
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
937
  ):
938
  # reuse k,v, cross_attentions
939
  key_states = past_key_value[0].transpose(1, 2)
 
1007
 
1008
  # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
1009
  def _flash_attention_forward(
1010
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
1011
  ):
1012
  """
1013
  Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
 
1107
 
1108
  class Florence2SdpaAttention(Florence2Attention):
1109
  def forward(
1110
+ self,
1111
+ hidden_states: torch.Tensor,
1112
+ key_value_states: Optional[torch.Tensor] = None,
1113
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
1114
+ attention_mask: Optional[torch.Tensor] = None,
1115
+ layer_head_mask: Optional[torch.Tensor] = None,
1116
+ output_attentions: bool = False,
1117
  ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
1118
  """Input shape: Batch x Time x Channel"""
1119
  if output_attentions or layer_head_mask is not None:
 
1144
  # is checking that the `sequence_length` of the `past_key_value` is the same as
1145
  # the provided `key_value_states` to support prefix tuning
1146
  if (
1147
+ is_cross_attention
1148
+ and past_key_value is not None
1149
+ and past_key_value[0].shape[2] == key_value_states.shape[1]
1150
  ):
1151
  # reuse k,v, cross_attentions
1152
  key_states = past_key_value[0]
 
1238
  self.final_layer_norm = nn.LayerNorm(self.embed_dim)
1239
 
1240
  def forward(
1241
+ self,
1242
+ hidden_states: torch.FloatTensor,
1243
+ attention_mask: torch.FloatTensor,
1244
+ layer_head_mask: torch.FloatTensor,
1245
+ output_attentions: Optional[bool] = False,
1246
  ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]:
1247
  """
1248
  Args:
 
1275
  hidden_states = self.final_layer_norm(hidden_states)
1276
 
1277
  if hidden_states.dtype == torch.float16 and (
1278
+ torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
1279
  ):
1280
  clamp_value = torch.finfo(hidden_states.dtype).max - 1000
1281
  hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
 
1319
  self.final_layer_norm = nn.LayerNorm(self.embed_dim)
1320
 
1321
  def forward(
1322
+ self,
1323
+ hidden_states: torch.Tensor,
1324
+ attention_mask: Optional[torch.Tensor] = None,
1325
+ encoder_hidden_states: Optional[torch.Tensor] = None,
1326
+ encoder_attention_mask: Optional[torch.Tensor] = None,
1327
+ layer_head_mask: Optional[torch.Tensor] = None,
1328
+ cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
1329
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
1330
+ output_attentions: Optional[bool] = False,
1331
+ use_cache: Optional[bool] = True,
1332
  ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
1333
  """
1334
  Args:
 
1408
  return outputs
1409
 
1410
 
 
1411
  class Florence2LanguagePreTrainedModel(PreTrainedModel):
1412
  config_class = Florence2LanguageConfig
1413
  base_model_prefix = "model"
 
1428
  module.weight.data.normal_(mean=0.0, std=std)
1429
  if module.padding_idx is not None:
1430
  module.weight.data[module.padding_idx].zero_()
1431
+ elif isinstance(module, nn.Conv2d):
1432
+ nn.init.normal_(module.weight, std=0.02)
1433
+ for name, _ in module.named_parameters():
1434
+ if name == "bias":
1435
+ nn.init.constant_(module.bias, 0)
1436
+ elif isinstance(module, nn.LayerNorm):
1437
+ nn.init.constant_(module.weight, 1.0)
1438
+ nn.init.constant_(module.bias, 0)
1439
+ elif isinstance(module, nn.BatchNorm2d):
1440
+ nn.init.constant_(module.weight, 1.0)
1441
+ nn.init.constant_(module.bias, 0)
1442
 
1443
  @property
1444
  def dummy_inputs(self):
 
1499
  self.embed_tokens = value
1500
 
1501
  def forward(
1502
+ self,
1503
+ input_ids: torch.LongTensor = None,
1504
+ attention_mask: Optional[torch.Tensor] = None,
1505
+ head_mask: Optional[torch.Tensor] = None,
1506
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1507
+ output_attentions: Optional[bool] = None,
1508
+ output_hidden_states: Optional[bool] = None,
1509
+ return_dict: Optional[bool] = None,
1510
  ) -> Union[Tuple, BaseModelOutput]:
1511
  r"""
1512
  Args:
 
1684
  self.embed_tokens = value
1685
 
1686
  def forward(
1687
+ self,
1688
+ input_ids: torch.LongTensor = None,
1689
+ attention_mask: Optional[torch.Tensor] = None,
1690
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
1691
+ encoder_attention_mask: Optional[torch.LongTensor] = None,
1692
+ head_mask: Optional[torch.Tensor] = None,
1693
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1694
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1695
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1696
+ use_cache: Optional[bool] = None,
1697
+ output_attentions: Optional[bool] = None,
1698
+ output_hidden_states: Optional[bool] = None,
1699
+ return_dict: Optional[bool] = None,
1700
  ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
1701
  r"""
1702
  Args:
 
1961
  return self.decoder
1962
 
1963
  def forward(
1964
+ self,
1965
+ input_ids: torch.LongTensor = None,
1966
+ attention_mask: Optional[torch.Tensor] = None,
1967
+ decoder_input_ids: Optional[torch.LongTensor] = None,
1968
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
1969
+ head_mask: Optional[torch.Tensor] = None,
1970
+ decoder_head_mask: Optional[torch.Tensor] = None,
1971
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
1972
+ encoder_outputs: Optional[List[torch.FloatTensor]] = None,
1973
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1974
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1975
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
1976
+ use_cache: Optional[bool] = None,
1977
+ output_attentions: Optional[bool] = None,
1978
+ output_hidden_states: Optional[bool] = None,
1979
+ return_dict: Optional[bool] = None,
1980
  ) -> Union[Tuple, Seq2SeqModelOutput]:
1981
  # different to other models, Florence2 automatically creates decoder_input_ids from
1982
  # input_ids if no decoder_input_ids are provided
 
2062
  # Initialize weights and apply final processing
2063
  self.post_init()
2064
 
2065
+ def _tie_weights(self):
2066
+ if self.config.tie_word_embeddings:
2067
+ self._tie_or_clone_weights(self.model.encoder.embed_tokens, self.model.shared)
2068
+ self._tie_or_clone_weights(self.model.decoder.embed_tokens, self.model.shared)
2069
+ self._tie_or_clone_weights(self.lm_head, self.model.shared)
2070
+
2071
  def get_encoder(self):
2072
  return self.model.get_encoder()
2073
 
2074
  def get_decoder(self):
2075
  return self.model.get_decoder()
2076
 
2077
+ def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None,
2078
+ **kwargs) -> nn.Embedding:
2079
+ new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, **kwargs)
2080
  self._resize_final_logits_bias(new_embeddings.weight.shape[0])
2081
  return new_embeddings
2082
 
 
2096
  self.lm_head = new_embeddings
2097
 
2098
  def forward(
2099
+ self,
2100
+ input_ids: torch.LongTensor = None,
2101
+ attention_mask: Optional[torch.Tensor] = None,
2102
+ decoder_input_ids: Optional[torch.LongTensor] = None,
2103
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
2104
+ head_mask: Optional[torch.Tensor] = None,
2105
+ decoder_head_mask: Optional[torch.Tensor] = None,
2106
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
2107
+ encoder_outputs: Optional[List[torch.FloatTensor]] = None,
2108
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
2109
+ inputs_embeds: Optional[torch.FloatTensor] = None,
2110
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
2111
+ labels: Optional[torch.LongTensor] = None,
2112
+ use_cache: Optional[bool] = None,
2113
+ output_attentions: Optional[bool] = None,
2114
+ output_hidden_states: Optional[bool] = None,
2115
+ return_dict: Optional[bool] = None,
2116
  ) -> Union[Tuple, Seq2SeqLMOutput]:
2117
  r"""
2118
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
 
2177
  )
2178
 
2179
  def prepare_inputs_for_generation(
2180
+ self,
2181
+ decoder_input_ids,
2182
+ past_key_values=None,
2183
+ attention_mask=None,
2184
+ decoder_attention_mask=None,
2185
+ head_mask=None,
2186
+ decoder_head_mask=None,
2187
+ cross_attn_head_mask=None,
2188
+ use_cache=None,
2189
+ encoder_outputs=None,
2190
+ **kwargs,
2191
  ):
2192
  # cut decoder_input_ids if past_key_values is used
2193
  if past_key_values is not None:
 
2229
  )
2230
  return reordered_past
2231
 
2232
+
2233
  @dataclass
2234
  class Florence2Seq2SeqLMOutput(ModelOutput):
2235
  """
 
2411
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
2412
  """
2413
 
2414
+
2415
  @add_start_docstrings(
2416
  """The FLORENCE2 vision model without any head""",
2417
  FLORENCE2_START_DOCSTRING,
 
2423
  self.vision_tower = DaViT.from_config(config=config)
2424
 
2425
  self.post_init()
2426
+
2427
  def forward(self, pixel_values):
2428
  if len(pixel_values.shape) == 4:
2429
  x = self.vision_tower.forward_features_unpool(pixel_values)
 
2445
  self._build_image_projection_layers(config)
2446
 
2447
  self.post_init()
2448
+
2449
  def _build_image_projection_layers(self, config):
2450
  image_dim_out = config.dim_embed[-1]
2451
  dim_projection = config.projection_dim
 
2481
  x = self.vision_tower.forward_features_unpool(pixel_values)
2482
  else:
2483
  raise ValueError(f'invalid image shape {pixel_values.shape}')
2484
+
2485
  if self.image_pos_embed is not None:
2486
  x = x.view(batch_size * T, -1, x.shape[-1])
2487
  num_tokens = x.shape[-2]
 
2490
  x = x.view(batch_size * T, h, w, x.shape[-1])
2491
  pos_embed = self.image_pos_embed(x)
2492
  x = x + pos_embed
2493
+ x = x.view(batch_size, T * h * w, x.shape[-1])
2494
 
2495
  if self.visual_temporal_embed is not None:
2496
  visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
 
2518
  x = x @ self.image_projection
2519
  x = self.image_proj_norm(x)
2520
 
 
2521
  return x
2522
 
2523
 
 
2524
  @add_start_docstrings(
2525
  """The FLORENCE2 model which consists of a vision backbone and a language model.""",
2526
  FLORENCE2_START_DOCSTRING,
2527
  )
2528
+ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2529
+ _tied_weights_keys = ["language_model.encoder.embed_tokens.weight", "language_model.decoder.embed_tokens.weight",
2530
+ "language_model.lm_head.weight"]
2531
+
2532
  def __init__(self, config: Florence2Config):
2533
  super().__init__(config)
2534
  assert config.vision_config.model_type == 'davit', 'only DaViT is supported for now'
2535
  self.vision_tower = DaViT.from_config(config=config.vision_config)
2536
+ # remove unused layers
2537
  del self.vision_tower.head
2538
  del self.vision_tower.norms
2539
 
 
2543
 
2544
  language_model = Florence2LanguageForConditionalGeneration(config=config.text_config)
2545
 
 
 
2546
  self.language_model = language_model
2547
 
2548
  self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
2549
  self.post_init()
2550
+
2551
  def _build_image_projection_layers(self, config):
2552
  image_dim_out = config.vision_config.dim_embed[-1]
2553
  dim_projection = config.vision_config.projection_dim
 
2585
  def get_input_embeddings(self):
2586
  return self.language_model.get_input_embeddings()
2587
 
2588
+ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None,
2589
+ **kwargs) -> nn.Embedding:
2590
+ model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, **kwargs)
2591
  # update vocab size
2592
  self.config.text_config.vocab_size = model_embeds.num_embeddings
2593
  self.config.vocab_size = model_embeds.num_embeddings
2594
  self.vocab_size = model_embeds.num_embeddings
2595
  return model_embeds
2596
+
2597
  def _encode_image(self, pixel_values):
2598
  if len(pixel_values.shape) == 4:
2599
  batch_size, C, H, W = pixel_values.shape
 
2601
  x = self.vision_tower.forward_features_unpool(pixel_values)
2602
  else:
2603
  raise ValueError(f'invalid image shape {pixel_values.shape}')
2604
+
2605
  if self.image_pos_embed is not None:
2606
  x = x.view(batch_size * T, -1, x.shape[-1])
2607
  num_tokens = x.shape[-2]
 
2610
  x = x.view(batch_size * T, h, w, x.shape[-1])
2611
  pos_embed = self.image_pos_embed(x)
2612
  x = x + pos_embed
2613
+ x = x.view(batch_size, T * h * w, x.shape[-1])
2614
 
2615
  if self.visual_temporal_embed is not None:
2616
  visual_temporal_embed = self.visual_temporal_embed(x.view(batch_size, T, -1, x.shape[-1])[:, :, 0])
 
2638
  x = x @ self.image_projection
2639
  x = self.image_proj_norm(x)
2640
 
2641
+ return x
2642
 
2643
  def _merge_input_ids_with_image_features(
2644
+ self, image_features, inputs_embeds
2645
  ):
2646
  batch_size, image_token_length = image_features.size()[:-1]
2647
  device = image_features.device
 
2664
 
2665
  return inputs_embeds, attention_mask
2666
 
 
2667
  @add_start_docstrings_to_model_forward(FLORENCE2_INPUTS_DOCSTRING)
2668
  @replace_return_docstrings(output_type=Florence2Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
2669
  def forward(
2670
+ self,
2671
+ input_ids: torch.LongTensor = None,
2672
+ pixel_values: torch.FloatTensor = None,
2673
+ attention_mask: Optional[torch.Tensor] = None,
2674
+ decoder_input_ids: Optional[torch.LongTensor] = None,
2675
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
2676
+ head_mask: Optional[torch.Tensor] = None,
2677
+ decoder_head_mask: Optional[torch.Tensor] = None,
2678
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
2679
+ encoder_outputs: Optional[List[torch.FloatTensor]] = None,
2680
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
2681
+ inputs_embeds: Optional[torch.FloatTensor] = None,
2682
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
2683
+ labels: Optional[torch.LongTensor] = None,
2684
+ use_cache: Optional[bool] = None,
2685
+ output_attentions: Optional[bool] = None,
2686
+ output_hidden_states: Optional[bool] = None,
2687
+ return_dict: Optional[bool] = None,
2688
  ) -> Union[Tuple, Florence2Seq2SeqLMOutput]:
2689
  r"""
2690
  Args:
 
2774
  )
2775
 
2776
  def generate(
2777
+ self,
2778
+ input_ids,
2779
+ inputs_embeds=None,
2780
+ pixel_values=None,
2781
+ **kwargs
2782
+ ):
2783
 
2784
  if inputs_embeds is None:
2785
  # 1. Extra the input embeddings
 
2789
  if pixel_values is not None:
2790
  image_features = self._encode_image(pixel_values)
2791
  inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
2792
+
2793
  return self.language_model.generate(
2794
  input_ids=None,
2795
  inputs_embeds=inputs_embeds,
 
2797
  )
2798
 
2799
  def prepare_inputs_for_generation(
2800
+ self,
2801
+ decoder_input_ids,
2802
+ past_key_values=None,
2803
+ attention_mask=None,
2804
+ pixel_values=None,
2805
+ decoder_attention_mask=None,
2806
+ head_mask=None,
2807
+ decoder_head_mask=None,
2808
+ cross_attn_head_mask=None,
2809
+ use_cache=None,
2810
+ encoder_outputs=None,
2811
+ **kwargs,
2812
  ):
2813
  # cut decoder_input_ids if past_key_values is used
2814
  if past_key_values is not None:
 
2822
  remove_prefix_length = decoder_input_ids.shape[1] - 1
2823
 
2824
  decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
2825
+
2826
  return {
2827
  "input_ids": None, # encoder_outputs is defined. input_ids not needed
2828
  "encoder_outputs": encoder_outputs,
 
2836
  "cross_attn_head_mask": cross_attn_head_mask,
2837
  "use_cache": use_cache, # change this to avoid caching (presumably for debugging)
2838
  }
2839
+
2840
  def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
2841
  return self.language_model.shift_tokens_right(labels)
2842
 
requirements.txt CHANGED
@@ -1,8 +1,9 @@
1
- torch
2
- torchvision
3
- timm
4
  pillow
 
5
  safetensors
6
- transformers
7
- einops
8
- pydantic==2.10.6
 
 
1
+ einops
2
+ gradio
 
3
  pillow
4
+ pydantic
5
  safetensors
6
+ timm
7
+ torch
8
+ torchvision
9
+ transformers==4.51.3
tag_implications-2024-05-05.csv DELETED
The diff for this file is too large to render. See raw diff
 
tags-2024-05-05.csv → tags-2025-11-25.csv.gz RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6d2dee2343cb402468867655a5df065a03bbd50fc68fe5f1b82f1685a5665370
3
- size 31973430
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f309ca05034df465bbb930a9cc29be067ef80e04ccb113d6294bd17861bf7f84
3
+ size 16154767