---
# Run identity and global toggles.
run_name: multitask_train
seed: 6198
# epoch is unset; run length is governed by max_duration/stop_at below.
epoch: null
dry_run: false
# Model architecture: a decoder-only LLM with two vision encoders.
# NOTE(review): d_model/n_heads/n_layers/vocab_size appear to match the
# Qwen2-7B checkpoint referenced by llm_load_path below — confirm before
# changing any of them independently.
model:
  d_model: 3584
  n_heads: 28
  # Grouped-query attention: 4 KV heads shared across the 28 query heads.
  n_kv_heads: 4
  qkv_bias: true
  clip_qkv: null
  n_layers: 28
  mlp_ratio: 4
  mlp_hidden_size: 37888
  activation_type: swiglu
  block_type: sequential
  block_group_size: 1
  # Rotary position embeddings, applied in full precision.
  rope: true
  rope_full_precision: true
  rope_theta: 1000000.0
  # Primary vision encoder: SigLIP, 384x384 input, 16px patches.
  vision_backbone:
    image_model_type: siglip
    image_default_input_size:
    - 384
    - 384
    image_patch_size: 16
    image_pos_patch_size: 16
    image_emb_dim: 1152
    image_num_heads: 16
    image_num_key_value_heads: 16
    image_num_layers: 27
    image_head_dim: 72
    image_mlp_dim: 4304
    image_mlp_activations: gelu_pytorch_tanh
    image_dropout_rate: 0.0
    image_num_pos: 576
    image_norm_eps: 1.0e-06
    attention_dropout: 0.0
    residual_dropout: 0.0
    initializer_range: 0.02
    fsdp_wrap: false
    resize_mode: siglip
  # Secondary vision encoder: DINO, 224x224 input, 16px patches.
  vision_backbone2:
    image_model_type: dino
    image_default_input_size:
    - 224
    - 224
    image_patch_size: 16
    image_pos_patch_size: 16
    image_emb_dim: 1024
    image_num_heads: 16
    image_num_key_value_heads: 16
    image_num_layers: 24
    image_head_dim: 64
    image_mlp_dim: 4096
    image_mlp_activations: gelu
    image_dropout_rate: 0.0
    image_num_pos: 785
    image_norm_eps: 1.0e-05
    attention_dropout: 0.0
    residual_dropout: 0.0
    initializer_range: 0.02
    fsdp_wrap: false
    resize_mode: dino
  # Pretrained weights for each component (local/cluster paths).
  vit_load_path: /molmo_code/data/pretrained_image_encoders/siglip2-so400m-16-384.pt
  vit_load_path2: /molmo_code/data/molmo/pretrained_image_encoders/dinov3-large-224.pt
  llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
  low_cpu_fsdp: true
  attention_type: sdpa
  float32_attention: true
  attention_dropout: 0.0
  attention_layer_norm: false
  # Residual dropout applies only to non-response tokens (response variant is 0).
  residual_dropout: 0.1
  response_residual_dropout: 0.0
  embedding_dropout: 0.0
  layer_norm_type: rms
  layer_norm_with_affine: true
  layer_norm_eps: 1.0e-06
  attention_layer_norm_with_affine: true
  max_sequence_length: 4096
  max_position_embeddings: null
  include_bias: false
  bias_for_layer_norm: null
  scale_logits: false
  vocab_size: 152064
  embedding_size: 152064
  # Extra tokens added on top of the base vocab (e.g. special/image tokens).
  additional_vocab_size: 128
  new_embedding_init_range: 0.02
  weight_tying: false
  init_device: null
  init_fn: normal
  init_std: 0.02
  init_cutoff_factor: null
  norm_after: false
  precision: amp_bf16
  # Image cropping/tiling: up to 12 crops with overlapping margins.
  max_crops: 12
  crop_mode: overlap-and-resize-c2
  use_col_tokens: true
  prompt_type: uber_model
  system_prompt_kind: demo_or_style
  message_formatting: role
  always_start_with_space: true
  multi_annotation_weighting: root_subsegments
  default_inference_len: 65
  # Overlap (in patches) trimmed from crop edges when stitching features.
  overlap_margins:
  - 4
  - 4
  pad_value: 0.0
  image_padding_embed: pad_and_partial_pad
  fix_image_padding: true
  # Which encoder layers feed the connector (-1 = last layer) for each backbone.
  vit_layers:
  - -1
  vit_layers2:
  - -1
  # Connector: 2x2 attention pooling over patch features, then MLP projector(s).
  image_pooling_h: 2
  image_pooling_w: 2
  image_pooling_2d: attention_meanq
  image_projector: mlp
  image_projector2: mlp
  image_feature_dropout: 0.0
  initializer_range: 0.02
  normalize_input_embeds: false
  use_position_ids: true
  head_dim: null
  tokenizer:
    identifier: Qwen/Qwen2-7B
    tokenizer_dir: null
  pad_tokenizer: true
  # Mixture-of-experts settings. NOTE(review): presumably inert unless the
  # block type actually uses MoE layers — verify against the model code.
  moe_num_experts: 8
  moe_top_k: 2
  moe_mlp_impl: sparse
  moe_log_expert_assignment: false
  moe_shared_expert: false
  moe_lbl_in_fp32: false
  moe_interleave: false
  moe_loss_weight: 0.1
  moe_zloss_weight: null
  moe_dropless: true
  moe_capacity_factor: 1.25
allow_resume: true
# Which components are fine-tuned: LLM, primary ViT, and connector are trained;
# the secondary (DINO) ViT is kept frozen.
ft_llm: true
ft_vit: true
ft_vit2: false
ft_connector: true
# Embedding fine-tuning restricted to the LM head.
ft_embedding: lm_head
# AdamW with per-component overrides: the connector/vit/llm groups use their
# own learning rate, weight decay, betas, and eps instead of the base values.
optimizer:
  name: adamw
  # Base hyperparameters (NOTE(review): likely only used for parameter groups
  # not covered by the per-component settings below — confirm in trainer code).
  learning_rate: 0.0001
  weight_decay: 0.01
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-05
  # All three components share lr=1e-5, no weight decay, betas (0.9, 0.95).
  connector_learning_rate: 1.0e-05
  vit_learning_rate: 1.0e-05
  llm_learning_rate: 1.0e-05
  connector_weight_decay: 0.0
  vit_weight_decay: 0.0
  llm_weight_decay: 0.0
  connector_betas:
  - 0.9
  - 0.95
  vit_betas:
  - 0.9
  - 0.95
  llm_betas:
  - 0.9
  - 0.95
  connector_eps: 1.0e-06
  vit_eps: 1.0e-06
  llm_eps: 1.0e-06
  metrics_log_interval: 20
# LR schedule with separate warmups per component (200 steps each),
# decaying to alpha_f (10%) of the peak rate.
scheduler:
  name: multimodal
  units: steps
  t_warmup: 100
  # t_max unset: NOTE(review): presumably falls back to max_duration — confirm.
  t_max: null
  alpha_f: 0.1
  connector_t_warmup: 200
  vit_t_warmup: 200
  llm_t_warmup: 200
  grad_clip_warmup_steps: null
  grad_clip_warmup_factor: null
  warmup_min_lr: 0.0
# Training data: a two-part root-size mixture — 60% grounding/doc-understanding
# datasets, 40% pointing/counting datasets. null rates mean per-dataset weights
# are left to the loader's defaults.
data:
  dataset: null
  mixture: null
  root_size_mixture:
  - rate: 0.6
    mixture:
      refcoco: null
      adv_refcoco: null
      pixmo_docs_charts: null
      pixmo_docs_tables: null
      pixmo_docs_other: null
      pixmo_docs_diagrams: null
  - rate: 0.4
    mixture:
      pointing_eval: null
      pixmo_count_counting: null
      pixmo_points: null
      pixmo_count: null
      pixmo_points_counting: null
  split: train
  # Data-order seed, independent of the top-level model/run seed.
  seed: 50189
  shuffle_messages: true
  pad: to_max
  # Per-example token budget; shorter than the model max_sequence_length (4096).
  sequence_length: 2304
  shuffle: true
  for_inference: false
  multi_modal: torch
  num_workers: 2
  drop_last: true
  pin_memory: true
  prefetch_factor: null
  persistent_workers: false
  timeout: 0
restore_dataloader: true
fast_forward_batches: null
# No loss-based evaluators; only generative (inference) eval every 12k steps.
evaluators: []
eval_interval: 12000
inf_eval_interval: 12000
inf_evaluators:
- label: pixmo_docs_charts:validation
  data:
    dataset: pixmo_docs_charts
    mixture: null
    root_size_mixture: null
    split: validation
    seed: null
    shuffle_messages: true
    pad: to_max
    sequence_length: 1792
    shuffle: true
    for_inference: true
    multi_modal: torch
    num_workers: 2
    drop_last: true
    pin_memory: true
    prefetch_factor: null
    persistent_workers: true
    timeout: 0
  device_eval_batch_size: null
  subset_num_batches: null
  # Cap eval cost: at most 2048 examples, 256 generated tokens each.
  max_examples: 2048
  max_new_tokens: 256
  mm_evaluator:
    n_to_log: 0
    num_wandb_examples: 32
    save_predictions: null
    save_tokens: false
    save_full_predictions: false
    # VQA scoring metrics: ANLS and exact match; all other task evals disabled.
    vqa_eval: ansl,em
    pointing_eval: false
    count_eval: false
    point_count_eval: false
    android_eval: false
    clock_eval: false
    clock_bench_eval: false
    math_vista_eval: false
  save_dir: null
  save_to_checkpoint_dir: false
  eval_name: null
  skip_if_metrics_cached: true
# Checkpointing: sharded saves effectively disabled (interval 30000 = full run,
# keep 0); a single unsharded checkpoint is kept, saved every 1000 steps.
save_folder: /molmo_ckpt/final
remote_save_folder: null
canceled_check_interval: 50
save_interval: 30000
save_interval_unsharded: 1000
save_interval_ephemeral: null
save_num_checkpoints_to_keep: 0
save_num_unsharded_checkpoints_to_keep: 1
save_overwrite: true
force_save_unsharded: false
no_pre_train_checkpoint: true
# Weights-only warm start from a prior run's step-24000 checkpoint
# (optimizer/trainer state are NOT reset below, but load_path is null).
initial_model_checkpoint: /molmo_ckpt/step24000-unsharded
load_model_config: null
load_path: null
load_path_sharded_checkpointer: null
reset_optimizer_state: false
reset_trainer_state: false
save_dataloader_state: false
reset_dataloader_state: false
sharded_checkpointer: torch_legacy
# Train for 30000 steps total.
max_duration: 30000
# global 24 = 3 per device x 8 ranks (implied); no gradient accumulation.
global_train_batch_size: 24
device_train_batch_size: 3
device_train_microbatch_size: 3
device_eval_batch_size: 3
eval_subset_num_batches: 1
eval_on_load: false
device_inf_eval_batch_size: 3
# -1: run inference eval over the full (capped) eval set.
inf_eval_subset_num_batches: -1
device_train_grad_accum: 1
max_grad_norm: 1.0
# Clip each component's gradients separately rather than one global norm.
multi_component_grad_norm: true
batch_divisor: global_batch
max_grad_norm_ratio: null
precision: amp_bf16
# Weights & Biases logging (rank 0 only).
wandb:
  project: molmo-1
  entity: ankanderia2-mbzuai
  group: null
  name: multitask_train
  tags:
  - watching
  log_artifacts: false
  rank_zero_only: true
  log_interval: 20
speed_monitor:
  window_size: 20
  gpu_flops_available: null
console_log_interval: 20
gen1_gc_interval: 1
compile: null
# FSDP: full sharding, fp32 reduce/buffer precision, block+size-based wrapping.
fsdp:
  use_orig_params: true
  sharding_strategy: FULL_SHARD
  wrapping_strategy: by_block_and_size
  precision: float
  hybrid_sharding_num_model_replicas: null
# Z-loss style auxiliary softmax regularizer.
softmax_auxiliary_loss: true
softmax_auxiliary_loss_scale: 0.0001
time_limit: null
extra_steps_after_cancel: 10
python_profiling: false
torch_profiling: false
stop_at: 30000
stop_after: null
# Recompute full transformer layers in backward to save activation memory.
activation_checkpointing: whole_layer
fused_loss: null