ankanmbz committed on
Commit
0b96c50
·
verified ·
1 Parent(s): da124db

Upload CoME-VL checkpoint

Browse files
Files changed (3) hide show
  1. .DS_Store +0 -0
  2. config.yaml +330 -0
  3. model.pt +3 -0
.DS_Store ADDED
Binary file (6.15 kB). View file
 
config.yaml ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: multitask_train
2
+ seed: 6198
3
+ epoch: null
4
+ dry_run: false
5
+ model:
6
+ d_model: 3584
7
+ n_heads: 28
8
+ n_kv_heads: 4
9
+ qkv_bias: true
10
+ clip_qkv: null
11
+ n_layers: 28
12
+ mlp_ratio: 4
13
+ mlp_hidden_size: 37888
14
+ activation_type: swiglu
15
+ block_type: sequential
16
+ block_group_size: 1
17
+ rope: true
18
+ rope_full_precision: true
19
+ rope_theta: 1000000.0
20
+ vision_backbone:
21
+ image_model_type: siglip
22
+ image_default_input_size:
23
+ - 384
24
+ - 384
25
+ image_patch_size: 16
26
+ image_pos_patch_size: 16
27
+ image_emb_dim: 1152
28
+ image_num_heads: 16
29
+ image_num_key_value_heads: 16
30
+ image_num_layers: 27
31
+ image_head_dim: 72
32
+ image_mlp_dim: 4304
33
+ image_mlp_activations: gelu_pytorch_tanh
34
+ image_dropout_rate: 0.0
35
+ image_num_pos: 576
36
+ image_norm_eps: 1.0e-06
37
+ attention_dropout: 0.0
38
+ residual_dropout: 0.0
39
+ initializer_range: 0.02
40
+ fsdp_wrap: false
41
+ resize_mode: siglip
42
+ vision_backbone2:
43
+ image_model_type: dino
44
+ image_default_input_size:
45
+ - 224
46
+ - 224
47
+ image_patch_size: 16
48
+ image_pos_patch_size: 16
49
+ image_emb_dim: 1024
50
+ image_num_heads: 16
51
+ image_num_key_value_heads: 16
52
+ image_num_layers: 24
53
+ image_head_dim: 64
54
+ image_mlp_dim: 4096
55
+ image_mlp_activations: gelu
56
+ image_dropout_rate: 0.0
57
+ image_num_pos: 785
58
+ image_norm_eps: 1.0e-05
59
+ attention_dropout: 0.0
60
+ residual_dropout: 0.0
61
+ initializer_range: 0.02
62
+ fsdp_wrap: false
63
+ resize_mode: dino
64
+ vit_load_path: /molmo_code/data/pretrained_image_encoders/siglip2-so400m-16-384.pt
65
+ vit_load_path2: /molmo_code/data/molmo/pretrained_image_encoders/dinov3-large-224.pt
66
+ llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
67
+ low_cpu_fsdp: true
68
+ attention_type: sdpa
69
+ float32_attention: true
70
+ attention_dropout: 0.0
71
+ attention_layer_norm: false
72
+ residual_dropout: 0.1
73
+ response_residual_dropout: 0.0
74
+ embedding_dropout: 0.0
75
+ layer_norm_type: rms
76
+ layer_norm_with_affine: true
77
+ layer_norm_eps: 1.0e-06
78
+ attention_layer_norm_with_affine: true
79
+ max_sequence_length: 4096
80
+ max_position_embeddings: null
81
+ include_bias: false
82
+ bias_for_layer_norm: null
83
+ scale_logits: false
84
+ vocab_size: 152064
85
+ embedding_size: 152064
86
+ additional_vocab_size: 128
87
+ new_embedding_init_range: 0.02
88
+ weight_tying: false
89
+ init_device: null
90
+ init_fn: normal
91
+ init_std: 0.02
92
+ init_cutoff_factor: null
93
+ norm_after: false
94
+ precision: amp_bf16
95
+ max_crops: 12
96
+ crop_mode: overlap-and-resize-c2
97
+ use_col_tokens: true
98
+ prompt_type: uber_model
99
+ system_prompt_kind: demo_or_style
100
+ message_formatting: role
101
+ always_start_with_space: true
102
+ multi_annotation_weighting: root_subsegments
103
+ default_inference_len: 65
104
+ overlap_margins:
105
+ - 4
106
+ - 4
107
+ pad_value: 0.0
108
+ image_padding_embed: pad_and_partial_pad
109
+ fix_image_padding: true
110
+ vit_layers:
111
+ - -1
112
+ vit_layers2:
113
+ - -1
114
+ image_pooling_h: 2
115
+ image_pooling_w: 2
116
+ image_pooling_2d: attention_meanq
117
+ image_projector: mlp
118
+ image_projector2: mlp
119
+ image_feature_dropout: 0.0
120
+ initializer_range: 0.02
121
+ normalize_input_embeds: false
122
+ use_position_ids: true
123
+ head_dim: null
124
+ tokenizer:
125
+ identifier: Qwen/Qwen2-7B
126
+ tokenizer_dir: null
127
+ pad_tokenizer: true
128
+ moe_num_experts: 8
129
+ moe_top_k: 2
130
+ moe_mlp_impl: sparse
131
+ moe_log_expert_assignment: false
132
+ moe_shared_expert: false
133
+ moe_lbl_in_fp32: false
134
+ moe_interleave: false
135
+ moe_loss_weight: 0.1
136
+ moe_zloss_weight: null
137
+ moe_dropless: true
138
+ moe_capacity_factor: 1.25
139
+ allow_resume: true
140
+ ft_llm: true
141
+ ft_vit: true
142
+ ft_vit2: false
143
+ ft_connector: true
144
+ ft_embedding: lm_head
145
+ optimizer:
146
+ name: adamw
147
+ learning_rate: 0.0001
148
+ weight_decay: 0.01
149
+ betas:
150
+ - 0.9
151
+ - 0.95
152
+ eps: 1.0e-05
153
+ connector_learning_rate: 1.0e-05
154
+ vit_learning_rate: 1.0e-05
155
+ llm_learning_rate: 1.0e-05
156
+ connector_weight_decay: 0.0
157
+ vit_weight_decay: 0.0
158
+ llm_weight_decay: 0.0
159
+ connector_betas:
160
+ - 0.9
161
+ - 0.95
162
+ vit_betas:
163
+ - 0.9
164
+ - 0.95
165
+ llm_betas:
166
+ - 0.9
167
+ - 0.95
168
+ connector_eps: 1.0e-06
169
+ vit_eps: 1.0e-06
170
+ llm_eps: 1.0e-06
171
+ metrics_log_interval: 20
172
+ scheduler:
173
+ name: multimodal
174
+ units: steps
175
+ t_warmup: 100
176
+ t_max: null
177
+ alpha_f: 0.1
178
+ connector_t_warmup: 200
179
+ vit_t_warmup: 200
180
+ llm_t_warmup: 200
181
+ grad_clip_warmup_steps: null
182
+ grad_clip_warmup_factor: null
183
+ warmup_min_lr: 0.0
184
+ data:
185
+ dataset: null
186
+ mixture: null
187
+ root_size_mixture:
188
+ - rate: 0.6
189
+ mixture:
190
+ refcoco: null
191
+ adv_refcoco: null
192
+ pixmo_docs_charts: null
193
+ pixmo_docs_tables: null
194
+ pixmo_docs_other: null
195
+ pixmo_docs_diagrams: null
196
+ - rate: 0.4
197
+ mixture:
198
+ pointing_eval: null
199
+ pixmo_count_counting: null
200
+ pixmo_points: null
201
+ pixmo_count: null
202
+ pixmo_points_counting: null
203
+ split: train
204
+ seed: 50189
205
+ shuffle_messages: true
206
+ pad: to_max
207
+ sequence_length: 2304
208
+ shuffle: true
209
+ for_inference: false
210
+ multi_modal: torch
211
+ num_workers: 2
212
+ drop_last: true
213
+ pin_memory: true
214
+ prefetch_factor: null
215
+ persistent_workers: false
216
+ timeout: 0
217
+ restore_dataloader: true
218
+ fast_forward_batches: null
219
+ evaluators: []
220
+ eval_interval: 12000
221
+ inf_eval_interval: 12000
222
+ inf_evaluators:
223
+ - label: pixmo_docs_charts:validation
224
+ data:
225
+ dataset: pixmo_docs_charts
226
+ mixture: null
227
+ root_size_mixture: null
228
+ split: validation
229
+ seed: null
230
+ shuffle_messages: true
231
+ pad: to_max
232
+ sequence_length: 1792
233
+ shuffle: true
234
+ for_inference: true
235
+ multi_modal: torch
236
+ num_workers: 2
237
+ drop_last: true
238
+ pin_memory: true
239
+ prefetch_factor: null
240
+ persistent_workers: true
241
+ timeout: 0
242
+ device_eval_batch_size: null
243
+ subset_num_batches: null
244
+ max_examples: 2048
245
+ max_new_tokens: 256
246
+ mm_evaluator:
247
+ n_to_log: 0
248
+ num_wandb_examples: 32
249
+ save_predictions: null
250
+ save_tokens: false
251
+ save_full_predictions: false
252
+ vqa_eval: ansl,em
253
+ pointing_eval: false
254
+ count_eval: false
255
+ point_count_eval: false
256
+ android_eval: false
257
+ clock_eval: false
258
+ clock_bench_eval: false
259
+ math_vista_eval: false
260
+ save_dir: null
261
+ save_to_checkpoint_dir: false
262
+ eval_name: null
263
+ skip_if_metrics_cached: true
264
+ save_folder: /molmo_ckpt/final
265
+ remote_save_folder: null
266
+ canceled_check_interval: 50
267
+ save_interval: 30000
268
+ save_interval_unsharded: 1000
269
+ save_interval_ephemeral: null
270
+ save_num_checkpoints_to_keep: 0
271
+ save_num_unsharded_checkpoints_to_keep: 1
272
+ save_overwrite: true
273
+ force_save_unsharded: false
274
+ no_pre_train_checkpoint: true
275
+ initial_model_checkpoint: /molmo_ckpt/step24000-unsharded
276
+ load_model_config: null
277
+ load_path: null
278
+ load_path_sharded_checkpointer: null
279
+ reset_optimizer_state: false
280
+ reset_trainer_state: false
281
+ save_dataloader_state: false
282
+ reset_dataloader_state: false
283
+ sharded_checkpointer: torch_legacy
284
+ max_duration: 30000
285
+ global_train_batch_size: 24
286
+ device_train_batch_size: 3
287
+ device_train_microbatch_size: 3
288
+ device_eval_batch_size: 3
289
+ eval_subset_num_batches: 1
290
+ eval_on_load: false
291
+ device_inf_eval_batch_size: 3
292
+ inf_eval_subset_num_batches: -1
293
+ device_train_grad_accum: 1
294
+ max_grad_norm: 1.0
295
+ multi_component_grad_norm: true
296
+ batch_divisor: global_batch
297
+ max_grad_norm_ratio: null
298
+ precision: amp_bf16
299
+ wandb:
300
+ project: molmo-1
301
+ entity: ankanderia2-mbzuai
302
+ group: null
303
+ name: multitask_train
304
+ tags:
305
+ - watching
306
+ log_artifacts: false
307
+ rank_zero_only: true
308
+ log_interval: 20
309
+ speed_monitor:
310
+ window_size: 20
311
+ gpu_flops_available: null
312
+ console_log_interval: 20
313
+ gen1_gc_interval: 1
314
+ compile: null
315
+ fsdp:
316
+ use_orig_params: true
317
+ sharding_strategy: FULL_SHARD
318
+ wrapping_strategy: by_block_and_size
319
+ precision: float
320
+ hybrid_sharding_num_model_replicas: null
321
+ softmax_auxiliary_loss: true
322
+ softmax_auxiliary_loss_scale: 0.0001
323
+ time_limit: null
324
+ extra_steps_after_cancel: 10
325
+ python_profiling: false
326
+ torch_profiling: false
327
+ stop_at: 30000
328
+ stop_after: null
329
+ activation_checkpointing: whole_layer
330
+ fused_loss: null
model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c51fde84e22a15345368253f3261417116126c5979889c9960dee6df3f3f2e4c
3
+ size 34617396982