DannyJun committed on
Commit
c3acb98
·
verified ·
1 Parent(s): 2a747c0

Upload configuration_sprvla.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. configuration_sprvla.py +355 -0
configuration_sprvla.py ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ SPRVLA configuration
3
+ """
4
+
5
+ from typing import Tuple, Optional, Dict, Any
6
+
7
+ from transformers import PretrainedConfig
8
+ from transformers.modeling_rope_utils import rope_config_validation
9
+ from transformers.utils import logging
10
+
11
+ logger = logging.get_logger(__name__)
12
+
13
+
14
class SPRVLAVitConfig(PretrainedConfig):
    r"""
    Configuration holder for a [`SPRVLAVisionTransformer`].

    Each named constructor argument is stored unchanged on the instance under the same attribute name;
    any remaining keyword arguments are forwarded to [`PretrainedConfig`].

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:
    ```python
    >>> from transformers import SPRVLAVitConfig, SPRVLAVisionTransformer

    >>> # Initializing a SPRVLAVitConfig
    >>> configuration = SPRVLAVitConfig()

    >>> # Initializing a SPRVLAVisionTransformer (with random weights)
    >>> model = SPRVLAVisionTransformer(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "sprvla_vit"

    def __init__(
        self,
        hidden_size: int = 1152,
        intermediate_size: int = 4304,
        num_hidden_layers: int = 27,
        num_attention_heads: int = 16,
        num_key_value_heads: int = 16,
        head_dim: int = 72,
        hidden_act: str = "gelu_pytorch_tanh",
        layer_norm_eps: float = 1e-6,
        image_default_input_size: Tuple[int, int] = (378, 378),
        image_patch_size: int = 14,
        image_num_pos: int = 577,
        attention_dropout: float = 0.0,
        residual_dropout: float = 0.0,
        initializer_range: float = 0.02,
        float32_attention: bool = True,
        use_cls_token: bool = False,  # True for OpenCLIP
        patch_bias: bool = True,  # False for OpenCLIP
        pre_layernorm: bool = False,  # True for OpenCLIP
        **kwargs,
    ):
        # Let the base class consume the generic keyword arguments first.
        super().__init__(**kwargs)

        # Transformer dimensions.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps

        # Image / patch geometry.
        self.image_default_input_size = image_default_input_size
        self.image_patch_size = image_patch_size
        self.image_num_pos = image_num_pos

        # Regularisation, initialisation and numeric settings.
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.initializer_range = initializer_range
        self.float32_attention = float32_attention

        # Architecture switches (see the OpenCLIP notes in the signature).
        self.use_cls_token = use_cls_token
        self.patch_bias = patch_bias
        self.pre_layernorm = pre_layernorm

    @property
    def image_num_patch(self):
        """Return a 2-tuple: each side of `image_default_input_size` floor-divided by `image_patch_size`."""
        height, width = self.image_default_input_size
        patch = self.image_patch_size
        return height // patch, width // patch
85
+
86
+
87
class SPRVLAAdapterConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of SPRVLAAdapter. Together with
    [`SPRVLAVitConfig`], it is used to instantiate an SPRVLAVisionBackbone according to the specified
    arguments, defining the model architecture.

    Each named constructor argument is stored unchanged on the instance under the same attribute name;
    any remaining keyword arguments are forwarded to [`PretrainedConfig`].

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:

    ```python
    >>> from transformers import SPRVLAVitConfig, SPRVLAAdapterConfig, SPRVLAVisionBackbone

    >>> # Initializing a SPRVLAVitConfig and a SPRVLAAdapterConfig
    >>> vit_config = SPRVLAVitConfig()
    >>> adapter_config = SPRVLAAdapterConfig()

    >>> # Initializing a SPRVLAVisionBackbone (with random weights)
    >>> model = SPRVLAVisionBackbone(vit_config, adapter_config)

    >>> # Accessing the model configuration
    >>> vit_configuration = model.vit_config
    >>> adapter_configuration = model.adapter_config
    ```"""

    # NOTE(review): unlike the other config classes in this file, no `model_type` is declared
    # here. Adding one would change what gets serialized, so confirm intent before doing so.

    def __init__(
        self,
        vit_layers: Tuple[int, ...] = (-3, -9),
        hidden_size: int = 1152,
        num_attention_heads: int = 16,
        num_key_value_heads: int = 16,
        head_dim: int = 72,
        float32_attention: bool = True,
        attention_dropout: float = 0.0,
        residual_dropout: float = 0.0,
        hidden_act: str = "silu",
        intermediate_size: int = 18944,
        text_hidden_size: int = 3584,
        image_feature_dropout: float = 0.0,
        initializer_range: float = 0.02,
        # pooling_mode: str = "indices",  # "indices" (SigLIP) or "2x2_attention" (OpenCLIP)
        image_padding_embed: Optional[str] = None,  # e.g. "pad_and_partial_pad"
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vit_layers = vit_layers
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.float32_attention = float32_attention
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.text_hidden_size = text_hidden_size
        self.image_feature_dropout = image_feature_dropout
        self.initializer_range = initializer_range
        # self.pooling_mode = pooling_mode  (disabled together with the parameter above)
        self.image_padding_embed = image_padding_embed
148
+
149
+
150
class SPRVLALlmConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SPRVLALlm`]. It is used to instantiate a
    `SPRVLALlm` according to the specified arguments, defining the model architecture.

    Each named constructor argument is stored on the instance under the same attribute name, with two
    exceptions: `tie_word_embeddings` is forwarded to [`PretrainedConfig`], and `num_key_value_heads`
    falls back to `num_attention_heads` when it is `None`. After all attributes are set, the rotary
    position embedding settings are checked with `rope_config_validation`.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Example:
    ```python
    >>> from transformers import SPRVLALlmConfig, SPRVLALlm

    >>> # Initializing a SPRVLALlmConfig
    >>> configuration = SPRVLALlmConfig()

    >>> # Initializing a SPRVLALlm (with random weights)
    >>> model = SPRVLALlm(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "sprvla_llm"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "blocks.*.self_attn.att_proj": "colwise",
        "blocks.*.self_attn.attn_out": "rowwise",
        "blocks.*.mlp.ff_proj": "colwise",
        "blocks.*.mlp.ff_out": "rowwise",
    }
    base_model_pp_plan = {
        "wte": (["input_ids"], ["inputs_embeds"]),
        "blocks": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "ln_f": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        hidden_size: int = 3584,
        num_attention_heads: int = 28,
        num_key_value_heads: Optional[int] = 4,
        head_dim: int = 128,
        vocab_size: int = 152064,
        additional_vocab_size: int = 128,
        qkv_bias: bool = True,
        num_hidden_layers: int = 48,
        intermediate_size: int = 18944,
        hidden_act: str = "silu",
        embedding_dropout: float = 0.0,
        attention_dropout: float = 0.0,
        residual_dropout: float = 0.0,
        max_position_embeddings: int = 4096,
        rope_theta: float = 1000000.0,
        rope_scaling: Optional[Dict[str, Any]] = None,
        use_qk_norm: bool = False,
        qk_norm_type: str = "olmo",
        layer_norm_eps: float = 1e-6,
        norm_after: bool = False,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        tie_word_embeddings: bool = False,
        **kwargs,
    ):
        # `tie_word_embeddings` is handled by the base class rather than stored here directly.
        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        # An unspecified key/value head count means one key/value head per attention head.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.vocab_size = vocab_size
        self.additional_vocab_size = additional_vocab_size
        self.qkv_bias = qkv_bias
        self.num_hidden_layers = num_hidden_layers
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.embedding_dropout = embedding_dropout
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.max_position_embeddings = max_position_embeddings
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.use_qk_norm = use_qk_norm
        self.qk_norm_type = qk_norm_type
        self.layer_norm_eps = layer_norm_eps
        self.norm_after = norm_after
        self.initializer_range = initializer_range
        self.use_cache = use_cache

        # Validate the correctness of rotary position embeddings parameters.
        # Must run last: the validator reads the attributes assigned above.
        rope_config_validation(self)
244
+
245
+
246
class SPRVLAConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SPRVLAForActionReasoning`].
    It is used to instantiate an SPRVLA model according to the specified arguments, defining the model architecture.

    It composes three sub-configurations (`vit_config`, `adapter_config`, `llm_config`). Each may be given
    as `None` (a default-constructed sub-config is used), as a `dict` (expanded as keyword arguments of
    the matching sub-config class), or as an already built config object (stored as-is).

    Args:
        vit_config: `SPRVLAVitConfig`, `dict` or `None`.
        adapter_config: `SPRVLAAdapterConfig`, `dict` or `None`.
        llm_config: `SPRVLALlmConfig`, `dict` or `None`.
        image_patch_id: Stored unchanged as `self.image_patch_id`.
        initializer_range: Stored unchanged as `self.initializer_range`.
        n_action_bins: Stored unchanged as `self.n_action_bins`.
        norm_stats: Stored as `self.norm_stats`. When omitted (or `None`), a fresh empty `dict` is
            created for this instance, so instances never share one default dictionary.
        **kwargs: Forwarded to [`PretrainedConfig`].

    Example:

    ```python
    >>> from transformers import SPRVLAConfig, SPRVLAVitConfig, SPRVLAAdapterConfig, SPRVLALlmConfig
    >>> from transformers import SPRVLAForActionReasoning

    >>> # Initializing a SPRVLAVitConfig
    >>> vit_config = SPRVLAVitConfig()

    >>> # Initializing a SPRVLAAdapterConfig
    >>> adapter_config = SPRVLAAdapterConfig()

    >>> # Initializing a SPRVLALlmConfig
    >>> llm_config = SPRVLALlmConfig()

    >>> # Initializing a SPRVLAConfig
    >>> configuration = SPRVLAConfig(vit_config, adapter_config, llm_config, image_patch_id=152069)

    >>> # Initializing a model
    >>> model = SPRVLAForActionReasoning(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "sprvla"
    sub_configs = {
        "llm_config": SPRVLALlmConfig,
        "vit_config": SPRVLAVitConfig,
        "adapter_config": SPRVLAAdapterConfig,
    }

    def __init__(
        self,
        vit_config: Optional[SPRVLAVitConfig] = None,
        adapter_config: Optional[SPRVLAAdapterConfig] = None,
        llm_config: Optional[SPRVLALlmConfig] = None,
        image_patch_id: Optional[int] = None,
        initializer_range: float = 0.02,
        n_action_bins: int = 256,
        norm_stats: Optional[dict] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vit_config = self._as_sub_config(vit_config, SPRVLAVitConfig)
        self.adapter_config = self._as_sub_config(adapter_config, SPRVLAAdapterConfig)
        self.llm_config = self._as_sub_config(llm_config, SPRVLALlmConfig)
        self.image_patch_id = image_patch_id
        self.initializer_range = initializer_range

        self.n_action_bins = n_action_bins
        # A `{}` default in the signature would be one dict shared by every instance;
        # build a new one per instance instead. A caller-supplied dict is stored as-is.
        self.norm_stats = {} if norm_stats is None else norm_stats

    @staticmethod
    def _as_sub_config(value, config_cls):
        """Return `value` as a `config_cls` instance: default-built for `None`, keyword-expanded for a `dict`, otherwise unchanged."""
        if value is None:
            return config_cls()
        if isinstance(value, dict):
            return config_cls(**value)
        return value

    @property
    def image_num_patch(self):
        """Delegate to `vit_config.image_num_patch`."""
        assert self.vit_config is not None
        return self.vit_config.image_num_patch

    # The properties below expose the corresponding `llm_config` attributes on the composite config.

    @property
    def num_attention_heads(self):
        return self.llm_config.num_attention_heads

    @property
    def num_key_value_heads(self):
        return self.llm_config.num_key_value_heads

    @property
    def head_dim(self):
        return self.llm_config.head_dim

    @property
    def num_hidden_layers(self):
        return self.llm_config.num_hidden_layers

    @property
    def hidden_size(self):
        return self.llm_config.hidden_size

    @property
    def vocab_size(self):
        return self.llm_config.vocab_size

    @property
    def max_position_embeddings(self):
        return self.llm_config.max_position_embeddings
350
+
351
+
352
# Register every configuration class defined in this module for auto-class loading,
# in the same order as the individual calls would run.
for _config_cls in (SPRVLAVitConfig, SPRVLAAdapterConfig, SPRVLALlmConfig, SPRVLAConfig):
    _config_cls.register_for_auto_class()
del _config_cls  # keep the module namespace free of the loop variable