cheenmaya committed on
Commit
a216602
·
verified ·
1 Parent(s): ca8ff01
Files changed (1) hide show
  1. configuration_florence2.py +316 -0
configuration_florence2.py CHANGED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# coding=utf-8
# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Florence-2 configuration"""
# FIX: the docstring must be the first statement in the module; previously
# `import warnings` preceded it, turning it into a dead string expression.
import warnings
from typing import Optional

from transformers import AutoConfig
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)
25
class Florence2VisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Florence2VisionModel architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        drop_path_rate (`float`, *optional*, defaults to 0.1):
            The dropout rate of the drop path layer.
        patch_size (`List[int]`, *optional*, defaults to `[7, 3, 3, 3]`):
            The patch size of the image.
        patch_stride (`List[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
            The patch stride of the image.
        patch_padding (`List[int]`, *optional*, defaults to `[3, 1, 1, 1]`):
            The patch padding of the image.
        patch_prenorm (`List[bool]`, *optional*, defaults to `[False, True, True, True]`):
            Whether to apply layer normalization before the patch embedding layer.
        enable_checkpoint (`bool`, *optional*, defaults to `False`):
            Whether to enable checkpointing.
        dim_embed (`List[int]`, *optional*, defaults to `[256, 512, 1024, 2048]`):
            The dimension of the embedding layer.
        num_heads (`List[int]`, *optional*, defaults to `[8, 16, 32, 64]`):
            The number of attention heads.
        num_groups (`List[int]`, *optional*, defaults to `[8, 16, 32, 64]`):
            The number of groups.
        depths (`List[int]`, *optional*, defaults to `[1, 1, 9, 1]`):
            The depth of the model.
        window_size (`int`, *optional*, defaults to 12):
            The window size of the model.
        projection_dim (`int`, *optional*, defaults to 1024):
            The dimension of the projection layer.
        visual_temporal_embedding (`dict`, *optional*):
            The configuration of the visual temporal embedding.
        image_pos_embed (`dict`, *optional*):
            The configuration of the image position embedding.
        image_feature_source (`List[str]`, *optional*, defaults to `["spatial_avg_pool", "temporal_avg_pool"]`):
            The source of the image feature.

    Example:
    ```python
    >>> from transformers import Florence2VisionConfig, Florence2VisionModel

    >>> # Initializing a Florence2 Vision style configuration
    >>> configuration = Florence2VisionConfig()

    >>> # Initializing a model (with random weights)
    >>> model = Florence2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2_vision"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        drop_path_rate=0.1,
        # FIX: list/bool-list defaults were mutable default arguments shared
        # across every call; use None sentinels and resolve the documented
        # defaults inside the body. Effective defaults are unchanged.
        patch_size=None,
        patch_stride=None,
        patch_padding=None,
        patch_prenorm=None,
        enable_checkpoint=False,
        dim_embed=None,
        num_heads=None,
        num_groups=None,
        depths=None,
        window_size=12,
        projection_dim=1024,
        visual_temporal_embedding=None,
        image_pos_embed=None,
        image_feature_source=None,
        **kwargs,
    ):
        self.drop_path_rate = drop_path_rate
        self.patch_size = patch_size if patch_size is not None else [7, 3, 3, 3]
        self.patch_stride = patch_stride if patch_stride is not None else [4, 2, 2, 2]
        self.patch_padding = patch_padding if patch_padding is not None else [3, 1, 1, 1]
        self.patch_prenorm = patch_prenorm if patch_prenorm is not None else [False, True, True, True]
        self.enable_checkpoint = enable_checkpoint
        self.dim_embed = dim_embed if dim_embed is not None else [256, 512, 1024, 2048]
        self.num_heads = num_heads if num_heads is not None else [8, 16, 32, 64]
        self.num_groups = num_groups if num_groups is not None else [8, 16, 32, 64]
        self.depths = depths if depths is not None else [1, 1, 9, 1]
        self.window_size = window_size
        self.projection_dim = projection_dim
        # Sub-config dicts stay None when not provided, as before.
        self.visual_temporal_embedding = visual_temporal_embedding
        self.image_pos_embed = image_pos_embed
        self.image_feature_source = (
            image_feature_source if image_feature_source is not None else ["spatial_avg_pool", "temporal_avg_pool"]
        )

        super().__init__(**kwargs)
113
+
114
+
115
+
116
class Florence2LanguageConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the BART
    [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 51289):
            Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Florence2LanguageModel`].
        d_model (`int`, *optional*, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
        encoder_layers (`int`, *optional*, defaults to 12):
            Number of encoder layers.
        decoder_layers (`int`, *optional*, defaults to 12):
            Number of decoder layers.
        encoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier.
        max_position_embeddings (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
            for more details.
        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
            for more details.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            Scale embeddings by dividing by sqrt(d_model).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        num_labels (`int`, *optional*, defaults to 3):
            The number of labels to use in [`Florence2LanguageForSequenceClassification`].
        forced_eos_token_id (`int`, *optional*, defaults to 2):
            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
            `eos_token_id`.

    Example:
    ```python
    >>> from transformers import Florence2LanguageConfig, Florence2LanguageModel

    >>> # Initializing a Florence2 Language style configuration
    >>> configuration = Florence2LanguageConfig()

    >>> # Initializing a model (with random weights)
    >>> model = Florence2LanguageModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2_language"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Map the generic PretrainedConfig attribute names onto the BART-style names used here.
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=51289,
        max_position_embeddings=1024,
        encoder_layers=12,
        encoder_ffn_dim=4096,
        encoder_attention_heads=16,
        decoder_layers=12,
        decoder_ffn_dim=4096,
        decoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        activation_function="gelu",
        d_model=1024,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        use_cache=True,
        num_labels=3,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        is_encoder_decoder=True,
        decoder_start_token_id=2,
        forced_eos_token_id=2,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.classifier_dropout = classifier_dropout
        self.use_cache = use_cache
        # Generic alias for the encoder depth expected by some PretrainedConfig consumers.
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

        # Token-id and label settings are handled by the PretrainedConfig base class.
        super().__init__(
            num_labels=num_labels,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            forced_eos_token_id=forced_eos_token_id,
            **kwargs,
        )

        # ensure backward compatibility for BART CNN models
        if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False):
            self.forced_bos_token_id = self.bos_token_id
            warnings.warn(
                f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
                "The config can simply be saved and uploaded again to be fixed."
            )
257
+
258
class Florence2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate an
    Florence-2 model according to the specified arguments, defining the model architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`Florence2VisionConfig`, *optional*):
            Custom vision config or dict
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object of the text backbone.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        vocab_size (`int`, *optional*, defaults to 51289):
            Vocabulary size of the Florence2model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`~Florence2ForConditionalGeneration`]
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimension of the multimodal projection space.

    Example:
    ```python
    >>> from transformers import Florence2ForConditionalGeneration, Florence2Config, CLIPVisionConfig, BartConfig

    >>> # Initializing a clip-like vision config
    >>> vision_config = CLIPVisionConfig()

    >>> # Initializing a Bart config
    >>> text_config = BartConfig()

    >>> # Initializing a Florence-2 configuration
    >>> configuration = Florence2Config(vision_config, text_config)

    >>> # Initializing a model from the florence-2 configuration
    >>> model = Florence2ForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2"
    is_composition = False

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index=-100,
        vocab_size=51289,
        projection_dim=1024,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.vocab_size = vocab_size
        self.projection_dim = projection_dim

        # BUG FIX: the vision dict was previously materialized as a generic
        # `PretrainedConfig(**vision_config)`, which dropped the Florence-2
        # vision defaults and its `model_type`. Build the typed sub-config,
        # matching how `text_config` is handled below.
        if vision_config is not None:
            vision_config = Florence2VisionConfig(**vision_config)
        self.vision_config = vision_config
        # (Removed the no-op `self.vocab_size = self.vocab_size` self-assignment.)

        self.text_config = text_config
        if text_config is not None:
            self.text_config = Florence2LanguageConfig(**text_config)

        super().__init__(**kwargs)