Robotics
English
manipulation
abhaybd committed on
Commit
827b160
·
verified ·
1 Parent(s): 1b0cb55

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. README.md +34 -3
  2. config.yaml +612 -0
  3. model.pt +3 -0
README.md CHANGED
@@ -1,3 +1,34 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - allenai/molmobot-data
5
+ language:
6
+ - en
7
+ base_model:
8
+ - allenai/Molmo2-4B
9
+ pipeline_tag: robotics
10
+ tags:
11
+ - robotics
12
+ - manipulation
13
+ ---
14
+
15
+ # MolmoBot-DROID
16
+
17
+ [[Paper](https://arxiv.org/pdf/2603.16861)] [[Project Website](https://allenai.github.io/MolmoBot)] [[Code](https://github.com/allenai/MolmoBot/tree/main/MolmoBot)] [[Data](https://huggingface.co/datasets/allenai/molmobot-data)]
18
+
19
+ MolmoBot-DROID is the MolmoBot VLA trained entirely on simulation data for the DROID platform, **without any real robot data**. See the [MolmoBot code README](https://github.com/allenai/MolmoBot/tree/main/MolmoBot) for usage instructions. In the paper, it is also referred to as `MolmoBot (F=2)`.
20
+
21
+ ## BibTeX
22
+
23
+ ```bibtex
24
+ @misc{deshpande2026molmobot,
25
+ title={MolmoB0T: Large-Scale Simulation Enables Zero-Shot Manipulation},
26
+ author={Abhay Deshpande and Maya Guru and Rose Hendrix and Snehal Jauhri and Ainaz Eftekhar and Rohun Tripathi and Max Argus and Jordi Salvador and Haoquan Fang and Matthew Wallingford and Wilbert Pumacay and Yejin Kim and Quinn Pfeifer and Ying-Chun Lee and Piper Wolters and Omar Rayyan and Mingtong Zhang and Jiafei Duan and Karen Farley and Winson Han and Eli Vanderbilt and Dieter Fox and Ali Farhadi and Georgia Chalvatzaki and Dhruv Shah and Ranjay Krishna},
27
+ year={2026},
28
+ eprint={2603.16861},
29
+ archivePrefix={arXiv},
30
+ primaryClass={cs.RO},
31
+ url={https://arxiv.org/abs/2603.16861},
32
+ }
33
+ ```
34
+
config.yaml ADDED
@@ -0,0 +1,612 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ run_name: Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
2
+ model:
3
+ model_name: molmoact
4
+ data_formatter:
5
+ prompt_templates: uber_model_v2
6
+ message_format: qwen3
7
+ system_prompt: demo_or_style_v2
8
+ always_start_with_space: false
9
+ default_inference_len: 65
10
+ select_answer: best
11
+ debug: false
12
+ image_last: false
13
+ format_message_list: null
14
+ p_one_message: 0.0
15
+ eval_system_prompt_mapping: null
16
+ p_choice_content_in_mc: 1.0
17
+ template_video_mc_questions: true
18
+ pointing_format: html-v2
19
+ points_decimal_places: 1
20
+ use_seperate_non_pointing_qa_style: false
21
+ timestamp_mode: 50-percent-seconds
22
+ output_timestamp_mode: seconds
23
+ seconds_decimal_places: 1
24
+ p_multi_point_all_image: 0.5
25
+ use_seperate_count_without_pointing_style: false
26
+ sample_random_initial_point: true
27
+ llm:
28
+ d_model: 2560
29
+ n_heads: 32
30
+ n_kv_heads: 8
31
+ head_dim: 128
32
+ qkv_bias: false
33
+ clip_qkv: null
34
+ n_layers: 36
35
+ mlp_ratio: 4
36
+ mlp_hidden_size: 19456
37
+ activation_type: swiglu
38
+ block_type: sequential
39
+ rope: true
40
+ rope_full_precision: true
41
+ rope_theta: 5000000.0
42
+ rope_type: default
43
+ rope_factor: null
44
+ rope_high_freq_factor: null
45
+ rope_low_freq_factor: null
46
+ rope_original_max_position_embeddings: null
47
+ rope_attention_factor: null
48
+ rope_beta_fast: null
49
+ rope_beta_slow: null
50
+ rope_mscale: null
51
+ rope_mscale_all_dim: null
52
+ rope_truncate: null
53
+ attention_type: sdpa
54
+ full_attention_layers: null
55
+ sliding_attention_rope_scaling: false
56
+ float32_attention: true
57
+ attention_dropout: 0.0
58
+ attention_layer_norm: true
59
+ attention_layer_norm_type: qwen3
60
+ residual_dropout: 0.1
61
+ response_residual_dropout: 0.0
62
+ layer_norm_type: rms
63
+ layer_norm_with_affine: true
64
+ layer_norm_eps: 1.0e-06
65
+ attention_layer_norm_with_affine: true
66
+ max_sequence_length: 8192
67
+ max_position_embeddings: null
68
+ include_bias: false
69
+ bias_for_layer_norm: null
70
+ norm_after: false
71
+ moe_num_experts: 8
72
+ moe_top_k: 2
73
+ moe_mlp_impl: sparse
74
+ moe_log_expert_assignment: false
75
+ moe_shared_expert: false
76
+ moe_lbl_in_fp32: false
77
+ moe_interleave: false
78
+ moe_loss_weight: 0.1
79
+ moe_zloss_weight: null
80
+ moe_dropless: true
81
+ moe_capacity_factor: 1.25
82
+ embedding_dropout: 0.0
83
+ scale_logits: false
84
+ vocab_size: 151936
85
+ additional_vocab_size: 128
86
+ weight_tying: true
87
+ embedding_size: 151936
88
+ use_position_ids: true
89
+ tokenizer:
90
+ identifier: Qwen/Qwen3-4B-Instruct-2507
91
+ tokenizer_dir: null
92
+ init_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen3-4b-instruct.pt
93
+ init_incremental: null
94
+ new_embedding_init_range: 0.02
95
+ initializer_range: 0.02
96
+ normalize_input_embeds: false
97
+ activation_checkpoint: whole_layer
98
+ compile: blocks
99
+ fix_pad_tokenizer: false
100
+ init_std: 0.02
101
+ init_fn: normal
102
+ init_cutoff_factor: null
103
+ vision_backbone:
104
+ vit:
105
+ image_model_type: siglip
106
+ image_default_input_size:
107
+ - 378
108
+ - 378
109
+ image_patch_size: 14
110
+ image_pos_patch_size: 14
111
+ image_emb_dim: 1152
112
+ image_num_heads: 16
113
+ image_num_key_value_heads: 16
114
+ image_num_layers: 27
115
+ image_head_dim: 72
116
+ image_mlp_dim: 4304
117
+ image_mlp_activations: gelu_pytorch_tanh
118
+ image_dropout_rate: 0.0
119
+ image_num_pos: 729
120
+ image_norm_eps: 1.0e-06
121
+ attention_dropout: 0.0
122
+ residual_dropout: 0.0
123
+ initializer_range: 0.02
124
+ float32_attention: true
125
+ attention_type: sdpa
126
+ sdpa_backend: all
127
+ activation_checkpointing: true
128
+ init_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt
129
+ resize_mode: siglip
130
+ pad_value: 0.0
131
+ normalize: siglip
132
+ image_pooling_2d: attention_meanq
133
+ pooling_attention_mask: true
134
+ image_projector: mlp
135
+ image_padding_embed: null
136
+ vit_layers:
137
+ - -3
138
+ - -9
139
+ skip_unused_layers: true
140
+ use_deepstack: false
141
+ share_connector: false
142
+ image_feature_dropout: 0.0
143
+ connector_activation_checkpointing: true
144
+ compile_vit: blocks
145
+ pool_size_embeds: null
146
+ compile_connector: null
147
+ normalize_on_gpu: true
148
+ use_image_augmentation: true
149
+ use_resize_bottleneck: false
150
+ mm_preprocessor:
151
+ max_answer_len: null
152
+ last_message_loss_only: false
153
+ max_text_tokens: null
154
+ loss_token_weighting: root_subsegments_root_tokens
155
+ max_frames: 1
156
+ frame_sample_mode: uniform_last_frame
157
+ candidate_sampling_fps:
158
+ - 0.25
159
+ - 0.5
160
+ - 1.0
161
+ - 2.0
162
+ - 4.0
163
+ - 6.0
164
+ - 8.0
165
+ - 16.0
166
+ cache_videos: true
167
+ loading_method: torchcodec_exact
168
+ max_fps:
169
+ - 2.0
170
+ time_sampling: true
171
+ time_mode: per-frame-compact
172
+ subtitle_mode: frame_1
173
+ max_crops: 1
174
+ overlap_margins:
175
+ - 4.0
176
+ - 4.0
177
+ use_col_tokens: false
178
+ periodic_high_res_frame: null
179
+ high_low_train_mode: local_rnd
180
+ high_res_frame_sample_options: null
181
+ periodic_sample_rate_training:
182
+ 4:
183
+ - 0.9
184
+ - 0.03
185
+ - 0.03
186
+ - 0.04
187
+ 3:
188
+ - 0.6
189
+ - 0.2
190
+ - 0.2
191
+ skip_low_res_in_high_low: false
192
+ pooling_w: 3
193
+ pooling_h: 3
194
+ high_res_pooling_w: null
195
+ high_res_pooling_h: null
196
+ query_based_resolution_selection: false
197
+ max_queries_for_resolution_selection: 8
198
+ use_frame_special_tokens: true
199
+ frame_sel_clip_identifier: google/siglip2-so400m-patch14-384
200
+ image_padding_mask: false
201
+ max_subtitle_tokens: null
202
+ image:
203
+ crop_mode: resize
204
+ use_col_tokens: true
205
+ max_crops: 8
206
+ high_res_max_crops: 24
207
+ p_high_res: 0.0
208
+ pooling_w: 2
209
+ pooling_h: 2
210
+ overlap_margins:
211
+ - 4
212
+ - 4
213
+ max_images: 4
214
+ max_multi_image_crops: 8
215
+ multi_image_pooling_w: 2
216
+ multi_image_pooling_h: 2
217
+ use_single_crop_col_tokens: false
218
+ use_single_crop_start_token: true
219
+ single_frame: false
220
+ topk: null
221
+ prune_from_frame: 0
222
+ bi_directional_attn: image_tokens
223
+ shared_low_high_embedding: true
224
+ debug: null
225
+ cp_enabled: false
226
+ apply_cp_to_vision_backbone: false
227
+ action_dim: 8
228
+ action_horizon: 16
229
+ n_action_steps: 8
230
+ n_obs_steps: 2
231
+ obs_step_delta: 8
232
+ action_expert:
233
+ max_horizon: 32
234
+ action_dim: 8
235
+ hidden_size: 768
236
+ num_layers: 36
237
+ num_heads: 8
238
+ mlp_ratio: 4.0
239
+ timestep_embed_dim: 256
240
+ dropout: 0.0
241
+ attn_dropout: 0.0
242
+ context_layer_norm: true
243
+ action_expert_layer_mode: per_layer
244
+ flow_matching_num_steps: 10
245
+ flow_matching_cutoff: 0.999
246
+ flow_matching_beta_alpha: 1.0
247
+ flow_matching_beta_beta: 1.5
248
+ num_flow_timestamps: 8
249
+ same_noise_per_time: false
250
+ states_mode: cross_attn
251
+ robot_preprocessor:
252
+ stats_by_repo:
253
+ synthmanip:
254
+ observation.state:
255
+ q01:
256
+ - -0.8200882077217102
257
+ - -1.0460078716278076
258
+ - -1.2745805978775024
259
+ - -2.864607334136963
260
+ - -1.0115491151809692
261
+ - 1.2138986587524414
262
+ - -2.057372808456421
263
+ - -0.027562683448195457
264
+ q99:
265
+ - 0.7587710618972778
266
+ - 0.9406100511550903
267
+ - 0.9344996809959412
268
+ - -0.9798629283905029
269
+ - 0.8359407782554626
270
+ - 3.0869405269622803
271
+ - 1.9223058223724365
272
+ - 0.8661524057388306
273
+ action:
274
+ q01:
275
+ - -0.8200882077217102
276
+ - -1.0460078716278076
277
+ - -1.2745805978775024
278
+ - -2.864607334136963
279
+ - -1.0115491151809692
280
+ - 1.2138986587524414
281
+ - -2.057372808456421
282
+ - 0.0
283
+ q99:
284
+ - 0.7587710618972778
285
+ - 0.9406100511550903
286
+ - 0.9344996809959412
287
+ - -0.9798629283905029
288
+ - 0.8359407782554626
289
+ - 3.0869405269622803
290
+ - 1.9223058223724365
291
+ - 255.0
292
+ default_repo_id: synthmanip
293
+ action_key: action
294
+ state_keys:
295
+ - observation.state
296
+ action_norm_mode: quantiles
297
+ state_norm_mode: quantiles
298
+ robot_postprocessor:
299
+ stats_by_repo:
300
+ synthmanip:
301
+ observation.state:
302
+ q01:
303
+ - -0.8200882077217102
304
+ - -1.0460078716278076
305
+ - -1.2745805978775024
306
+ - -2.864607334136963
307
+ - -1.0115491151809692
308
+ - 1.2138986587524414
309
+ - -2.057372808456421
310
+ - -0.027562683448195457
311
+ q99:
312
+ - 0.7587710618972778
313
+ - 0.9406100511550903
314
+ - 0.9344996809959412
315
+ - -0.9798629283905029
316
+ - 0.8359407782554626
317
+ - 3.0869405269622803
318
+ - 1.9223058223724365
319
+ - 0.8661524057388306
320
+ action:
321
+ q01:
322
+ - -0.8200882077217102
323
+ - -1.0460078716278076
324
+ - -1.2745805978775024
325
+ - -2.864607334136963
326
+ - -1.0115491151809692
327
+ - 1.2138986587524414
328
+ - -2.057372808456421
329
+ - 0.0
330
+ q99:
331
+ - 0.7587710618972778
332
+ - 0.9406100511550903
333
+ - 0.9344996809959412
334
+ - -0.9798629283905029
335
+ - 0.8359407782554626
336
+ - 3.0869405269622803
337
+ - 1.9223058223724365
338
+ - 255.0
339
+ default_repo_id: synthmanip
340
+ action_key: action
341
+ state_keys:
342
+ - observation.state
343
+ action_norm_mode: quantiles
344
+ state_norm_mode: quantiles
345
+ parallelism:
346
+ data_parallel_replicate_degree: 1
347
+ enable_compiled_autograd: false
348
+ data_parallel_shard_degree: -1
349
+ fsdp_reshard_after_forward: default
350
+ context_parallel_config:
351
+ degree: 1
352
+ attention_type: ulysses
353
+ load_balancer: ulysses
354
+ head_stride: 1
355
+ tensor_parallel_config:
356
+ degree: 1
357
+ enable_async: false
358
+ data_parallel_config:
359
+ name: fsdp
360
+ param_dtype: null
361
+ reduce_dtype: float32
362
+ num_replicas: null
363
+ shard_degree: null
364
+ wrapping_strategy: full
365
+ prefetch_factor: 0
366
+ context_parallel_rotate_method: allgather
367
+ seed: 6198
368
+ epoch: null
369
+ dry_run: false
370
+ ft_llm: true
371
+ ft_vit: false
372
+ ft_connector: false
373
+ ft_embedding: ae
374
+ optimizer:
375
+ name: adamw
376
+ learning_rate: 0.0001
377
+ weight_decay: 0.01
378
+ betas:
379
+ - 0.9
380
+ - 0.95
381
+ eps: 1.0e-05
382
+ connector_learning_rate: 5.0e-06
383
+ vit_learning_rate: 5.0e-06
384
+ llm_learning_rate: 1.0e-05
385
+ frame_selector_learning_rate: 0.0001
386
+ temporal_token_scorer_learning_rate: 0.0001
387
+ action_expert_learning_rate: 0.0001
388
+ connector_weight_decay: 0.0
389
+ vit_weight_decay: 0.0
390
+ llm_weight_decay: 0.0
391
+ frame_selector_weight_decay: 0.01
392
+ temporal_token_scorer_weight_decay: 0.01
393
+ action_expert_weight_decay: 0.0
394
+ connector_betas:
395
+ - 0.9
396
+ - 0.95
397
+ vit_betas:
398
+ - 0.9
399
+ - 0.95
400
+ llm_betas:
401
+ - 0.9
402
+ - 0.95
403
+ frame_selector_betas:
404
+ - 0.9
405
+ - 0.95
406
+ temporal_token_scorer_betas:
407
+ - 0.9
408
+ - 0.95
409
+ action_expert_betas:
410
+ - 0.9
411
+ - 0.95
412
+ connector_eps: 1.0e-06
413
+ vit_eps: 1.0e-06
414
+ llm_eps: 1.0e-06
415
+ frame_selector_eps: 1.0e-06
416
+ temporal_token_scorer_eps: 1.0e-06
417
+ action_expert_eps: 1.0e-06
418
+ metrics_log_interval: -1
419
+ scheduler:
420
+ name: multimodal
421
+ units: steps
422
+ t_warmup: 100
423
+ t_max: null
424
+ alpha_f: 0.1
425
+ connector_t_warmup: 200
426
+ vit_t_warmup: 200
427
+ llm_t_warmup: 2000
428
+ frame_selector_t_warmup: 200
429
+ temporal_token_scorer_t_warmup: 200
430
+ action_expert_t_warmup: 200
431
+ grad_clip_warmup_steps: null
432
+ grad_clip_warmup_factor: null
433
+ warmup_min_lr: 0.0
434
+ data:
435
+ dataset: null
436
+ mixture:
437
+ synthmanip/task_0: 0.35
438
+ synthmanip/task_1: 0.2
439
+ synthmanip/task_2: 0.2
440
+ synthmanip/task_3: 0.15
441
+ synthmanip/task_4: 0.1
442
+ root_size_mixture: null
443
+ kwargs_mixture: null
444
+ split: train
445
+ seed: 50189
446
+ pad: to_max
447
+ sequence_length: 928
448
+ max_text_seq_len: null
449
+ shuffle: true
450
+ start_index: 0
451
+ packing: null
452
+ enable_variable_sized_token_pooling: true
453
+ num_workers: 4
454
+ drop_last: true
455
+ pin_memory: true
456
+ prefetch_factor: 4
457
+ persistent_workers: false
458
+ timeout: 300
459
+ action_data: null
460
+ action_loader_rate: null
461
+ action_batch_interval: 1
462
+ restore_dataloader: true
463
+ fast_forward_batches: null
464
+ evaluators:
465
+ - label: synthmanip_val
466
+ data:
467
+ dataset: synthmanip/task_0
468
+ mixture: null
469
+ root_size_mixture: null
470
+ kwargs_mixture: null
471
+ split: val
472
+ seed: 691203
473
+ pad: to_max
474
+ sequence_length: 928
475
+ max_text_seq_len: null
476
+ shuffle: false
477
+ start_index: 0
478
+ packing: null
479
+ enable_variable_sized_token_pooling: true
480
+ num_workers: 3
481
+ drop_last: false
482
+ pin_memory: true
483
+ prefetch_factor: 4
484
+ persistent_workers: false
485
+ timeout: 300
486
+ device_batch_size: 16
487
+ subset_num_batches: null
488
+ max_examples: 2000
489
+ console_log_interval: 10
490
+ response_logits_only: true
491
+ reduce_loss_metrics_manually: false
492
+ eval_interval: 1000
493
+ inf_evaluators: []
494
+ inf_eval_interval: 1000
495
+ eval_on_last_step: true
496
+ eval_on_load: false
497
+ eval_on: []
498
+ save_folder: /weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
499
+ checkpointer_config:
500
+ save_thread_count: null
501
+ load_thread_count: null
502
+ pre_download: false
503
+ work_dir: null
504
+ throttle_uploads: false
505
+ canceled_check_interval: 50
506
+ save_interval: 2000
507
+ save_at: null
508
+ save_final_optim: false
509
+ save_num_checkpoints_to_keep: 1
510
+ checkpoint_retention_frequency: 10000
511
+ save_final_unsharded_checkpoint: false
512
+ save_interval_ephemeral: null
513
+ save_overwrite: true
514
+ load_path: null
515
+ reset_optimizer_state: true
516
+ reset_trainer_state: true
517
+ initial_model_checkpoint: /weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_-03-06-17-32-00_bs1024_dbs16_stp200000-mix_5_feb20_copy/step200000
518
+ allow_resume: true
519
+ max_duration: 50000
520
+ global_train_batch_size: 1024
521
+ device_train_microbatch_size: 16
522
+ max_grad_norm: 1.0
523
+ multi_component_grad_norm: true
524
+ batch_divisor: global_batch
525
+ max_grad_norm_ratio: null
526
+ precision: amp_bf16
527
+ wandb:
528
+ project: molmo_ae_synth
529
+ entity: prior-ai2
530
+ group: null
531
+ name: Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
532
+ tags:
533
+ - watching
534
+ log_artifacts: false
535
+ rank_zero_only: true
536
+ log_interval: 20
537
+ allow_resume: true
538
+ finish_on_sigterm: true
539
+ beaker_log_interval: 50
540
+ speed_monitor:
541
+ window_size: 20
542
+ gpu_flops_available: null
543
+ console_log_interval: 20
544
+ enable_timing_logs: false
545
+ gen1_gc_interval: 1
546
+ compile:
547
+ mode: default
548
+ fullgraph: false
549
+ dynamic: false
550
+ backend: inductor
551
+ activation_checkpointing: true
552
+ fsdp:
553
+ fsdp2: true
554
+ precision: pure
555
+ use_orig_params: true
556
+ wrapping_strategy: null
557
+ sharding_strategy: FULL_SHARD
558
+ hybrid_sharding_num_model_replicas: null
559
+ softmax_auxiliary_loss: false
560
+ softmax_auxiliary_loss_scale: 0.0001
561
+ response_logits_only: true
562
+ saliency_score_loss_wt: null
563
+ frame_score_loss_wt: null
564
+ frame_score_loss_type: mse
565
+ frame_score_loss_target: 0.7
566
+ time_limit: null
567
+ extra_steps_after_cancel: 0
568
+ python_profiling: false
569
+ torch_profiling: false
570
+ stop_at: 50000
571
+ stop_after: null
572
+ fused_loss: false
573
+ compile_loss: true
574
+ runtime_data:
575
+ args: launch_scripts/train_synthmanip.py /weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_-03-06-17-32-00_bs1024_dbs16_stp200000-mix_5_feb20_copy/step200000
576
+ --data_paths mix --stats_path=/weka/oe-training-default/rohunt/robo/stats/franka_mltask_abs_pos.yaml
577
+ --action_preset franka_joint --camera_preset franka_one_random_then_wrist --wandb.name=Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
578
+ --wandb.entity=prior-ai2 --wandb.project=molmo_ae_synth --seq_len=928 --max_duration=50000
579
+ --device_batch_size=16 --global_batch_size=1024 --log_interval=20 --model.mm_preprocessor.use_frame_special_tokens=True
580
+ --model.mm_preprocessor.max_subtitle_tokens=null --prefetch_factor=4 --data.num_workers=4
581
+ --save_interval=2000 --save_num_checkpoints_to_keep=1 --checkpoint_retention_frequency=10000
582
+ --save_folder=/weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
583
+ --exp_name=Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
584
+ --data.packing=null --model.mm_preprocessor.image.crop_mode=resize --model.mm_preprocessor.max_frames=1
585
+ --model.same_noise_per_time=False --weighted_sampling --randomize_prompts --ft_embedding=ae
586
+ --model.mm_preprocessor.image.max_images=4 --model.num_flow_timestamps=8 --ft_llm=True
587
+ --scheduler.llm_t_warmup=2000 --optimizer.llm_learning_rate=1e-05 --img_aug --model.mm_preprocessor.image.multi_image_pooling_w=2
588
+ --model.mm_preprocessor.image.multi_image_pooling_h=2 --n_obs_steps=2 --obs_step_delta=8
589
+ --model.mm_preprocessor.image.single_frame=False --reset_optimizer_state --reset_trainer_state
590
+ --furthest_camera_prob=0.5
591
+ hostname: jupiter-cs-aus-148.reviz.ai2.in
592
+ date: 03/09/2026, 01:55
593
+ world_size: 64
594
+ resuming_from: null
595
+ beaker_experiment_id: 01KK84PM8EQZW1SC6YRT12PYRR
596
+ beaker_experiment_url: null
597
+ wandb_id: 1umcfp2f
598
+ wandb_url: https://wandb.ai/prior-ai2/molmo_ae_synth/runs/1umcfp2f
599
+ distributed_eval_enabled: false
600
+ distributed_eval_benchmark_path: /weka/oe/rohunt/robo-bench/FrankaPickandPlaceDroidBench_5ep_json_benchmark
601
+ distributed_eval_config_cls: launch_scripts.synthvla.configure_mujoco_thor:FrankaState8ClampConfig
602
+ distributed_eval_task_horizon: 300
603
+ distributed_eval_num_worker_jobs: 1
604
+ distributed_eval_wandb_project: mjthor-online-eval
605
+ distributed_eval_workspace: ai2/robo-molmo
606
+ distributed_eval_clusters:
607
+ - ai2/saturn
608
+ - ai2/neptune
609
+ - ai2/rhea
610
+ - ai2/ceres
611
+ distributed_eval_priority: high
612
+ distributed_eval_preemptible: true
model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db2c62ccdd6773fb4fffad87ed52d299f5f0fc636290133b16150e942f36576d
3
+ size 19992166548