Upload folder using huggingface_hub

Browse files

Files changed (3) hide show

README.md +34 -3
config.yaml +612 -0
model.pt +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,34 @@
----
-license: apache-2.0
----

+---
+license: apache-2.0
+datasets:
+- allenai/molmobot-data
+language:
+- en
+base_model:
+- allenai/Molmo2-4B
+pipeline_tag: robotics
+tags:
+- robotics
+- manipulation
+---
+# MolmoBot-DROID
+[[Paper](https://arxiv.org/pdf/2603.16861)] [[Project Website](https://allenai.github.io/MolmoBot)] [[Code](https://github.com/allenai/MolmoBot/tree/main/MolmoBot)] [[Data](https://huggingface.co/datasets/allenai/molmobot-data)]
+MolmoBot-DROID is the MolmoBot vision-language-action (VLA) model for the DROID platform, trained entirely on simulation data, **without any real robot data**. See the [usage instructions](https://github.com/allenai/MolmoBot/tree/main/MolmoBot) to get started. In the paper, it is also referred to as `MolmoBot (F=2)`.
+## BibTeX
+```
+@misc{deshpande2026molmobot,
+      title={MolmoB0T: Large-Scale Simulation Enables Zero-Shot Manipulation},
+      author={Abhay Deshpande and Maya Guru and Rose Hendrix and Snehal Jauhri and Ainaz Eftekhar and Rohun Tripathi and Max Argus and Jordi Salvador and Haoquan Fang and Matthew Wallingford and Wilbert Pumacay and Yejin Kim and Quinn Pfeifer and Ying-Chun Lee and Piper Wolters and Omar Rayyan and Mingtong Zhang and Jiafei Duan and Karen Farley and Winson Han and Eli Vanderbilt and Dieter Fox and Ali Farhadi and Georgia Chalvatzaki and Dhruv Shah and Ranjay Krishna},
+      year={2026},
+      eprint={2603.16861},
+      archivePrefix={arXiv},
+      primaryClass={cs.RO},
+      url={https://arxiv.org/abs/2603.16861},
+}
+```

config.yaml ADDED Viewed

	@@ -0,0 +1,612 @@

+run_name: Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
+model:
+  model_name: molmoact
+  data_formatter:
+    prompt_templates: uber_model_v2
+    message_format: qwen3
+    system_prompt: demo_or_style_v2
+    always_start_with_space: false
+    default_inference_len: 65
+    select_answer: best
+    debug: false
+    image_last: false
+    format_message_list: null
+    p_one_message: 0.0
+    eval_system_prompt_mapping: null
+    p_choice_content_in_mc: 1.0
+    template_video_mc_questions: true
+    pointing_format: html-v2
+    points_decimal_places: 1
+    use_seperate_non_pointing_qa_style: false
+    timestamp_mode: 50-percent-seconds
+    output_timestamp_mode: seconds
+    seconds_decimal_places: 1
+    p_multi_point_all_image: 0.5
+    use_seperate_count_without_pointing_style: false
+    sample_random_initial_point: true
+  llm:
+    d_model: 2560
+    n_heads: 32
+    n_kv_heads: 8
+    head_dim: 128
+    qkv_bias: false
+    clip_qkv: null
+    n_layers: 36
+    mlp_ratio: 4
+    mlp_hidden_size: 19456
+    activation_type: swiglu
+    block_type: sequential
+    rope: true
+    rope_full_precision: true
+    rope_theta: 5000000.0
+    rope_type: default
+    rope_factor: null
+    rope_high_freq_factor: null
+    rope_low_freq_factor: null
+    rope_original_max_position_embeddings: null
+    rope_attention_factor: null
+    rope_beta_fast: null
+    rope_beta_slow: null
+    rope_mscale: null
+    rope_mscale_all_dim: null
+    rope_truncate: null
+    attention_type: sdpa
+    full_attention_layers: null
+    sliding_attention_rope_scaling: false
+    float32_attention: true
+    attention_dropout: 0.0
+    attention_layer_norm: true
+    attention_layer_norm_type: qwen3
+    residual_dropout: 0.1
+    response_residual_dropout: 0.0
+    layer_norm_type: rms
+    layer_norm_with_affine: true
+    layer_norm_eps: 1.0e-06
+    attention_layer_norm_with_affine: true
+    max_sequence_length: 8192
+    max_position_embeddings: null
+    include_bias: false
+    bias_for_layer_norm: null
+    norm_after: false
+    moe_num_experts: 8
+    moe_top_k: 2
+    moe_mlp_impl: sparse
+    moe_log_expert_assignment: false
+    moe_shared_expert: false
+    moe_lbl_in_fp32: false
+    moe_interleave: false
+    moe_loss_weight: 0.1
+    moe_zloss_weight: null
+    moe_dropless: true
+    moe_capacity_factor: 1.25
+    embedding_dropout: 0.0
+    scale_logits: false
+    vocab_size: 151936
+    additional_vocab_size: 128
+    weight_tying: true
+    embedding_size: 151936
+    use_position_ids: true
+    tokenizer:
+      identifier: Qwen/Qwen3-4B-Instruct-2507
+      tokenizer_dir: null
+    init_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen3-4b-instruct.pt
+    init_incremental: null
+    new_embedding_init_range: 0.02
+    initializer_range: 0.02
+    normalize_input_embeds: false
+    activation_checkpoint: whole_layer
+    compile: blocks
+    fix_pad_tokenizer: false
+    init_std: 0.02
+    init_fn: normal
+    init_cutoff_factor: null
+  vision_backbone:
+    vit:
+      image_model_type: siglip
+      image_default_input_size:
+      - 378
+      - 378
+      image_patch_size: 14
+      image_pos_patch_size: 14
+      image_emb_dim: 1152
+      image_num_heads: 16
+      image_num_key_value_heads: 16
+      image_num_layers: 27
+      image_head_dim: 72
+      image_mlp_dim: 4304
+      image_mlp_activations: gelu_pytorch_tanh
+      image_dropout_rate: 0.0
+      image_num_pos: 729
+      image_norm_eps: 1.0e-06
+      attention_dropout: 0.0
+      residual_dropout: 0.0
+      initializer_range: 0.02
+      float32_attention: true
+      attention_type: sdpa
+      sdpa_backend: all
+      activation_checkpointing: true
+      init_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt
+      resize_mode: siglip
+      pad_value: 0.0
+      normalize: siglip
+    image_pooling_2d: attention_meanq
+    pooling_attention_mask: true
+    image_projector: mlp
+    image_padding_embed: null
+    vit_layers:
+    - -3
+    - -9
+    skip_unused_layers: true
+    use_deepstack: false
+    share_connector: false
+    image_feature_dropout: 0.0
+    connector_activation_checkpointing: true
+    compile_vit: blocks
+    pool_size_embeds: null
+    compile_connector: null
+    normalize_on_gpu: true
+    use_image_augmentation: true
+    use_resize_bottleneck: false
+  mm_preprocessor:
+    max_answer_len: null
+    last_message_loss_only: false
+    max_text_tokens: null
+    loss_token_weighting: root_subsegments_root_tokens
+    max_frames: 1
+    frame_sample_mode: uniform_last_frame
+    candidate_sampling_fps:
+    - 0.25
+    - 0.5
+    - 1.0
+    - 2.0
+    - 4.0
+    - 6.0
+    - 8.0
+    - 16.0
+    cache_videos: true
+    loading_method: torchcodec_exact
+    max_fps:
+    - 2.0
+    time_sampling: true
+    time_mode: per-frame-compact
+    subtitle_mode: frame_1
+    max_crops: 1
+    overlap_margins:
+    - 4.0
+    - 4.0
+    use_col_tokens: false
+    periodic_high_res_frame: null
+    high_low_train_mode: local_rnd
+    high_res_frame_sample_options: null
+    periodic_sample_rate_training:
+      4:
+      - 0.9
+      - 0.03
+      - 0.03
+      - 0.04
+      3:
+      - 0.6
+      - 0.2
+      - 0.2
+    skip_low_res_in_high_low: false
+    pooling_w: 3
+    pooling_h: 3
+    high_res_pooling_w: null
+    high_res_pooling_h: null
+    query_based_resolution_selection: false
+    max_queries_for_resolution_selection: 8
+    use_frame_special_tokens: true
+    frame_sel_clip_identifier: google/siglip2-so400m-patch14-384
+    image_padding_mask: false
+    max_subtitle_tokens: null
+    image:
+      crop_mode: resize
+      use_col_tokens: true
+      max_crops: 8
+      high_res_max_crops: 24
+      p_high_res: 0.0
+      pooling_w: 2
+      pooling_h: 2
+      overlap_margins:
+      - 4
+      - 4
+      max_images: 4
+      max_multi_image_crops: 8
+      multi_image_pooling_w: 2
+      multi_image_pooling_h: 2
+      use_single_crop_col_tokens: false
+      use_single_crop_start_token: true
+      single_frame: false
+    topk: null
+    prune_from_frame: 0
+  bi_directional_attn: image_tokens
+  shared_low_high_embedding: true
+  debug: null
+  cp_enabled: false
+  apply_cp_to_vision_backbone: false
+  action_dim: 8
+  action_horizon: 16
+  n_action_steps: 8
+  n_obs_steps: 2
+  obs_step_delta: 8
+  action_expert:
+    max_horizon: 32
+    action_dim: 8
+    hidden_size: 768
+    num_layers: 36
+    num_heads: 8
+    mlp_ratio: 4.0
+    timestep_embed_dim: 256
+    dropout: 0.0
+    attn_dropout: 0.0
+    context_layer_norm: true
+  action_expert_layer_mode: per_layer
+  flow_matching_num_steps: 10
+  flow_matching_cutoff: 0.999
+  flow_matching_beta_alpha: 1.0
+  flow_matching_beta_beta: 1.5
+  num_flow_timestamps: 8
+  same_noise_per_time: false
+  states_mode: cross_attn
+  robot_preprocessor:
+    stats_by_repo:
+      synthmanip:
+        observation.state:
+          q01:
+          - -0.8200882077217102
+          - -1.0460078716278076
+          - -1.2745805978775024
+          - -2.864607334136963
+          - -1.0115491151809692
+          - 1.2138986587524414
+          - -2.057372808456421
+          - -0.027562683448195457
+          q99:
+          - 0.7587710618972778
+          - 0.9406100511550903
+          - 0.9344996809959412
+          - -0.9798629283905029
+          - 0.8359407782554626
+          - 3.0869405269622803
+          - 1.9223058223724365
+          - 0.8661524057388306
+        action:
+          q01:
+          - -0.8200882077217102
+          - -1.0460078716278076
+          - -1.2745805978775024
+          - -2.864607334136963
+          - -1.0115491151809692
+          - 1.2138986587524414
+          - -2.057372808456421
+          - 0.0
+          q99:
+          - 0.7587710618972778
+          - 0.9406100511550903
+          - 0.9344996809959412
+          - -0.9798629283905029
+          - 0.8359407782554626
+          - 3.0869405269622803
+          - 1.9223058223724365
+          - 255.0
+    default_repo_id: synthmanip
+    action_key: action
+    state_keys:
+    - observation.state
+    action_norm_mode: quantiles
+    state_norm_mode: quantiles
+  robot_postprocessor:
+    stats_by_repo:
+      synthmanip:
+        observation.state:
+          q01:
+          - -0.8200882077217102
+          - -1.0460078716278076
+          - -1.2745805978775024
+          - -2.864607334136963
+          - -1.0115491151809692
+          - 1.2138986587524414
+          - -2.057372808456421
+          - -0.027562683448195457
+          q99:
+          - 0.7587710618972778
+          - 0.9406100511550903
+          - 0.9344996809959412
+          - -0.9798629283905029
+          - 0.8359407782554626
+          - 3.0869405269622803
+          - 1.9223058223724365
+          - 0.8661524057388306
+        action:
+          q01:
+          - -0.8200882077217102
+          - -1.0460078716278076
+          - -1.2745805978775024
+          - -2.864607334136963
+          - -1.0115491151809692
+          - 1.2138986587524414
+          - -2.057372808456421
+          - 0.0
+          q99:
+          - 0.7587710618972778
+          - 0.9406100511550903
+          - 0.9344996809959412
+          - -0.9798629283905029
+          - 0.8359407782554626
+          - 3.0869405269622803
+          - 1.9223058223724365
+          - 255.0
+    default_repo_id: synthmanip
+    action_key: action
+    state_keys:
+    - observation.state
+    action_norm_mode: quantiles
+    state_norm_mode: quantiles
+parallelism:
+  data_parallel_replicate_degree: 1
+  enable_compiled_autograd: false
+  data_parallel_shard_degree: -1
+  fsdp_reshard_after_forward: default
+  context_parallel_config:
+    degree: 1
+    attention_type: ulysses
+    load_balancer: ulysses
+    head_stride: 1
+  tensor_parallel_config:
+    degree: 1
+    enable_async: false
+  data_parallel_config:
+    name: fsdp
+    param_dtype: null
+    reduce_dtype: float32
+    num_replicas: null
+    shard_degree: null
+    wrapping_strategy: full
+    prefetch_factor: 0
+  context_parallel_rotate_method: allgather
+seed: 6198
+epoch: null
+dry_run: false
+ft_llm: true
+ft_vit: false
+ft_connector: false
+ft_embedding: ae
+optimizer:
+  name: adamw
+  learning_rate: 0.0001
+  weight_decay: 0.01
+  betas:
+  - 0.9
+  - 0.95
+  eps: 1.0e-05
+  connector_learning_rate: 5.0e-06
+  vit_learning_rate: 5.0e-06
+  llm_learning_rate: 1.0e-05
+  frame_selector_learning_rate: 0.0001
+  temporal_token_scorer_learning_rate: 0.0001
+  action_expert_learning_rate: 0.0001
+  connector_weight_decay: 0.0
+  vit_weight_decay: 0.0
+  llm_weight_decay: 0.0
+  frame_selector_weight_decay: 0.01
+  temporal_token_scorer_weight_decay: 0.01
+  action_expert_weight_decay: 0.0
+  connector_betas:
+  - 0.9
+  - 0.95
+  vit_betas:
+  - 0.9
+  - 0.95
+  llm_betas:
+  - 0.9
+  - 0.95
+  frame_selector_betas:
+  - 0.9
+  - 0.95
+  temporal_token_scorer_betas:
+  - 0.9
+  - 0.95
+  action_expert_betas:
+  - 0.9
+  - 0.95
+  connector_eps: 1.0e-06
+  vit_eps: 1.0e-06
+  llm_eps: 1.0e-06
+  frame_selector_eps: 1.0e-06
+  temporal_token_scorer_eps: 1.0e-06
+  action_expert_eps: 1.0e-06
+  metrics_log_interval: -1
+scheduler:
+  name: multimodal
+  units: steps
+  t_warmup: 100
+  t_max: null
+  alpha_f: 0.1
+  connector_t_warmup: 200
+  vit_t_warmup: 200
+  llm_t_warmup: 2000
+  frame_selector_t_warmup: 200
+  temporal_token_scorer_t_warmup: 200
+  action_expert_t_warmup: 200
+  grad_clip_warmup_steps: null
+  grad_clip_warmup_factor: null
+  warmup_min_lr: 0.0
+data:
+  dataset: null
+  mixture:
+    synthmanip/task_0: 0.35
+    synthmanip/task_1: 0.2
+    synthmanip/task_2: 0.2
+    synthmanip/task_3: 0.15
+    synthmanip/task_4: 0.1
+  root_size_mixture: null
+  kwargs_mixture: null
+  split: train
+  seed: 50189
+  pad: to_max
+  sequence_length: 928
+  max_text_seq_len: null
+  shuffle: true
+  start_index: 0
+  packing: null
+  enable_variable_sized_token_pooling: true
+  num_workers: 4
+  drop_last: true
+  pin_memory: true
+  prefetch_factor: 4
+  persistent_workers: false
+  timeout: 300
+action_data: null
+action_loader_rate: null
+action_batch_interval: 1
+restore_dataloader: true
+fast_forward_batches: null
+evaluators:
+- label: synthmanip_val
+  data:
+    dataset: synthmanip/task_0
+    mixture: null
+    root_size_mixture: null
+    kwargs_mixture: null
+    split: val
+    seed: 691203
+    pad: to_max
+    sequence_length: 928
+    max_text_seq_len: null
+    shuffle: false
+    start_index: 0
+    packing: null
+    enable_variable_sized_token_pooling: true
+    num_workers: 3
+    drop_last: false
+    pin_memory: true
+    prefetch_factor: 4
+    persistent_workers: false
+    timeout: 300
+  device_batch_size: 16
+  subset_num_batches: null
+  max_examples: 2000
+  console_log_interval: 10
+  response_logits_only: true
+  reduce_loss_metrics_manually: false
+eval_interval: 1000
+inf_evaluators: []
+inf_eval_interval: 1000
+eval_on_last_step: true
+eval_on_load: false
+eval_on: []
+save_folder: /weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
+checkpointer_config:
+  save_thread_count: null
+  load_thread_count: null
+  pre_download: false
+  work_dir: null
+  throttle_uploads: false
+canceled_check_interval: 50
+save_interval: 2000
+save_at: null
+save_final_optim: false
+save_num_checkpoints_to_keep: 1
+checkpoint_retention_frequency: 10000
+save_final_unsharded_checkpoint: false
+save_interval_ephemeral: null
+save_overwrite: true
+load_path: null
+reset_optimizer_state: true
+reset_trainer_state: true
+initial_model_checkpoint: /weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_-03-06-17-32-00_bs1024_dbs16_stp200000-mix_5_feb20_copy/step200000
+allow_resume: true
+max_duration: 50000
+global_train_batch_size: 1024
+device_train_microbatch_size: 16
+max_grad_norm: 1.0
+multi_component_grad_norm: true
+batch_divisor: global_batch
+max_grad_norm_ratio: null
+precision: amp_bf16
+wandb:
+  project: molmo_ae_synth
+  entity: prior-ai2
+  group: null
+  name: Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
+  tags:
+  - watching
+  log_artifacts: false
+  rank_zero_only: true
+  log_interval: 20
+  allow_resume: true
+  finish_on_sigterm: true
+beaker_log_interval: 50
+speed_monitor:
+  window_size: 20
+  gpu_flops_available: null
+console_log_interval: 20
+enable_timing_logs: false
+gen1_gc_interval: 1
+compile:
+  mode: default
+  fullgraph: false
+  dynamic: false
+  backend: inductor
+activation_checkpointing: true
+fsdp:
+  fsdp2: true
+  precision: pure
+  use_orig_params: true
+  wrapping_strategy: null
+  sharding_strategy: FULL_SHARD
+  hybrid_sharding_num_model_replicas: null
+softmax_auxiliary_loss: false
+softmax_auxiliary_loss_scale: 0.0001
+response_logits_only: true
+saliency_score_loss_wt: null
+frame_score_loss_wt: null
+frame_score_loss_type: mse
+frame_score_loss_target: 0.7
+time_limit: null
+extra_steps_after_cancel: 0
+python_profiling: false
+torch_profiling: false
+stop_at: 50000
+stop_after: null
+fused_loss: false
+compile_loss: true
+runtime_data:
+  args: launch_scripts/train_synthmanip.py /weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_-03-06-17-32-00_bs1024_dbs16_stp200000-mix_5_feb20_copy/step200000
+    --data_paths mix --stats_path=/weka/oe-training-default/rohunt/robo/stats/franka_mltask_abs_pos.yaml
+    --action_preset franka_joint --camera_preset franka_one_random_then_wrist --wandb.name=Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
+    --wandb.entity=prior-ai2 --wandb.project=molmo_ae_synth --seq_len=928 --max_duration=50000
+    --device_batch_size=16 --global_batch_size=1024 --log_interval=20 --model.mm_preprocessor.use_frame_special_tokens=True
+    --model.mm_preprocessor.max_subtitle_tokens=null --prefetch_factor=4 --data.num_workers=4
+    --save_interval=2000 --save_num_checkpoints_to_keep=1 --checkpoint_retention_frequency=10000
+    --save_folder=/weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
+    --exp_name=Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
+    --data.packing=null --model.mm_preprocessor.image.crop_mode=resize --model.mm_preprocessor.max_frames=1
+    --model.same_noise_per_time=False --weighted_sampling --randomize_prompts --ft_embedding=ae
+    --model.mm_preprocessor.image.max_images=4 --model.num_flow_timestamps=8 --ft_llm=True
+    --scheduler.llm_t_warmup=2000 --optimizer.llm_learning_rate=1e-05 --img_aug --model.mm_preprocessor.image.multi_image_pooling_w=2
+    --model.mm_preprocessor.image.multi_image_pooling_h=2 --n_obs_steps=2 --obs_step_delta=8
+    --model.mm_preprocessor.image.single_frame=False --reset_optimizer_state --reset_trainer_state
+    --furthest_camera_prob=0.5
+  hostname: jupiter-cs-aus-148.reviz.ai2.in
+  date: 03/09/2026, 01:55
+  world_size: 64
+  resuming_from: null
+  beaker_experiment_id: 01KK84PM8EQZW1SC6YRT12PYRR
+  beaker_experiment_url: null
+  wandb_id: 1umcfp2f
+  wandb_url: https://wandb.ai/prior-ai2/molmo_ae_synth/runs/1umcfp2f
+distributed_eval_enabled: false
+distributed_eval_benchmark_path: /weka/oe/rohunt/robo-bench/FrankaPickandPlaceDroidBench_5ep_json_benchmark
+distributed_eval_config_cls: launch_scripts.synthvla.configure_mujoco_thor:FrankaState8ClampConfig
+distributed_eval_task_horizon: 300
+distributed_eval_num_worker_jobs: 1
+distributed_eval_wandb_project: mjthor-online-eval
+distributed_eval_workspace: ai2/robo-molmo
+distributed_eval_clusters:
+- ai2/saturn
+- ai2/neptune
+- ai2/rhea
+- ai2/ceres
+distributed_eval_priority: high
+distributed_eval_preemptible: true

model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:db2c62ccdd6773fb4fffad87ed52d299f5f0fc636290133b16150e942f36576d
+size 19992166548