| run_name: Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20 |
| model: |
| model_name: molmobot |
| data_formatter: |
| prompt_templates: uber_model_v2 |
| message_format: qwen3 |
| system_prompt: demo_or_style_v2 |
| always_start_with_space: false |
| default_inference_len: 65 |
| select_answer: best |
| debug: false |
| image_last: false |
| format_message_list: null |
| p_one_message: 0.0 |
| eval_system_prompt_mapping: null |
| p_choice_content_in_mc: 1.0 |
| template_video_mc_questions: true |
| pointing_format: html-v2 |
| points_decimal_places: 1 |
| use_seperate_non_pointing_qa_style: false |
| timestamp_mode: 50-percent-seconds |
| output_timestamp_mode: seconds |
| seconds_decimal_places: 1 |
| p_multi_point_all_image: 0.5 |
| use_seperate_count_without_pointing_style: false |
| sample_random_initial_point: true |
| llm: |
| d_model: 2560 |
| n_heads: 32 |
| n_kv_heads: 8 |
| head_dim: 128 |
| qkv_bias: false |
| clip_qkv: null |
| n_layers: 36 |
| mlp_ratio: 4 |
| mlp_hidden_size: 19456 |
| activation_type: swiglu |
| block_type: sequential |
| rope: true |
| rope_full_precision: true |
| rope_theta: 5000000.0 |
| rope_type: default |
| rope_factor: null |
| rope_high_freq_factor: null |
| rope_low_freq_factor: null |
| rope_original_max_position_embeddings: null |
| rope_attention_factor: null |
| rope_beta_fast: null |
| rope_beta_slow: null |
| rope_mscale: null |
| rope_mscale_all_dim: null |
| rope_truncate: null |
| attention_type: sdpa |
| full_attention_layers: null |
| sliding_attention_rope_scaling: false |
| float32_attention: true |
| attention_dropout: 0.0 |
| attention_layer_norm: true |
| attention_layer_norm_type: qwen3 |
| residual_dropout: 0.1 |
| response_residual_dropout: 0.0 |
| layer_norm_type: rms |
| layer_norm_with_affine: true |
| layer_norm_eps: 1.0e-06 |
| attention_layer_norm_with_affine: true |
| max_sequence_length: 8192 |
| max_position_embeddings: null |
| include_bias: false |
| bias_for_layer_norm: null |
| norm_after: false |
| moe_num_experts: 8 |
| moe_top_k: 2 |
| moe_mlp_impl: sparse |
| moe_log_expert_assignment: false |
| moe_shared_expert: false |
| moe_lbl_in_fp32: false |
| moe_interleave: false |
| moe_loss_weight: 0.1 |
| moe_zloss_weight: null |
| moe_dropless: true |
| moe_capacity_factor: 1.25 |
| embedding_dropout: 0.0 |
| scale_logits: false |
| vocab_size: 151936 |
| additional_vocab_size: 128 |
| weight_tying: true |
| embedding_size: 151936 |
| use_position_ids: true |
| tokenizer: |
| identifier: Qwen/Qwen3-4B-Instruct-2507 |
| tokenizer_dir: null |
| init_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen3-4b-instruct.pt |
| init_incremental: null |
| new_embedding_init_range: 0.02 |
| initializer_range: 0.02 |
| normalize_input_embeds: false |
| activation_checkpoint: whole_layer |
| compile: blocks |
| fix_pad_tokenizer: false |
| init_std: 0.02 |
| init_fn: normal |
| init_cutoff_factor: null |
| vision_backbone: |
| vit: |
| image_model_type: siglip |
| image_default_input_size: |
| - 378 |
| - 378 |
| image_patch_size: 14 |
| image_pos_patch_size: 14 |
| image_emb_dim: 1152 |
| image_num_heads: 16 |
| image_num_key_value_heads: 16 |
| image_num_layers: 27 |
| image_head_dim: 72 |
| image_mlp_dim: 4304 |
| image_mlp_activations: gelu_pytorch_tanh |
| image_dropout_rate: 0.0 |
| image_num_pos: 729 |
| image_norm_eps: 1.0e-06 |
| attention_dropout: 0.0 |
| residual_dropout: 0.0 |
| initializer_range: 0.02 |
| float32_attention: true |
| attention_type: sdpa |
| sdpa_backend: all |
| activation_checkpointing: true |
| init_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt |
| resize_mode: siglip |
| pad_value: 0.0 |
| normalize: siglip |
| image_pooling_2d: attention_meanq |
| pooling_attention_mask: true |
| image_projector: mlp |
| image_padding_embed: null |
| vit_layers: |
| - -3 |
| - -9 |
| skip_unused_layers: true |
| use_deepstack: false |
| share_connector: false |
| image_feature_dropout: 0.0 |
| connector_activation_checkpointing: true |
| compile_vit: blocks |
| pool_size_embeds: null |
| compile_connector: null |
| normalize_on_gpu: true |
| use_image_augmentation: true |
| use_resize_bottleneck: false |
| mm_preprocessor: |
| max_answer_len: null |
| last_message_loss_only: false |
| max_text_tokens: null |
| loss_token_weighting: root_subsegments_root_tokens |
| max_frames: 1 |
| frame_sample_mode: uniform_last_frame |
| candidate_sampling_fps: |
| - 0.25 |
| - 0.5 |
| - 1.0 |
| - 2.0 |
| - 4.0 |
| - 6.0 |
| - 8.0 |
| - 16.0 |
| cache_videos: true |
| loading_method: torchcodec_exact |
| max_fps: |
| - 2.0 |
| time_sampling: true |
| time_mode: per-frame-compact |
| subtitle_mode: frame_1 |
| max_crops: 1 |
| overlap_margins: |
| - 4.0 |
| - 4.0 |
| use_col_tokens: false |
| periodic_high_res_frame: null |
| high_low_train_mode: local_rnd |
| high_res_frame_sample_options: null |
| periodic_sample_rate_training: |
| 4: |
| - 0.9 |
| - 0.03 |
| - 0.03 |
| - 0.04 |
| 3: |
| - 0.6 |
| - 0.2 |
| - 0.2 |
| skip_low_res_in_high_low: false |
| pooling_w: 3 |
| pooling_h: 3 |
| high_res_pooling_w: null |
| high_res_pooling_h: null |
| query_based_resolution_selection: false |
| max_queries_for_resolution_selection: 8 |
| use_frame_special_tokens: true |
| frame_sel_clip_identifier: google/siglip2-so400m-patch14-384 |
| image_padding_mask: false |
| max_subtitle_tokens: null |
| image: |
| crop_mode: resize |
| use_col_tokens: true |
| max_crops: 8 |
| high_res_max_crops: 24 |
| p_high_res: 0.0 |
| pooling_w: 2 |
| pooling_h: 2 |
| overlap_margins: |
| - 4 |
| - 4 |
| max_images: 4 |
| max_multi_image_crops: 8 |
| multi_image_pooling_w: 2 |
| multi_image_pooling_h: 2 |
| use_single_crop_col_tokens: false |
| use_single_crop_start_token: true |
| single_frame: false |
| topk: null |
| prune_from_frame: 0 |
| bi_directional_attn: image_tokens |
| shared_low_high_embedding: true |
| debug: null |
| cp_enabled: false |
| apply_cp_to_vision_backbone: false |
| action_dim: 8 |
| action_horizon: 16 |
| n_action_steps: 8 |
| n_obs_steps: 2 |
| obs_step_delta: 8 |
| action_expert: |
| max_horizon: 32 |
| action_dim: 8 |
| hidden_size: 768 |
| num_layers: 36 |
| num_heads: 8 |
| mlp_ratio: 4.0 |
| timestep_embed_dim: 256 |
| dropout: 0.0 |
| attn_dropout: 0.0 |
| context_layer_norm: true |
| action_expert_layer_mode: per_layer |
| flow_matching_num_steps: 10 |
| flow_matching_cutoff: 0.999 |
| flow_matching_beta_alpha: 1.0 |
| flow_matching_beta_beta: 1.5 |
| num_flow_timestamps: 8 |
| same_noise_per_time: false |
| states_mode: cross_attn |
| robot_preprocessor: |
| stats_by_repo: |
| synthmanip: |
| observation.state: |
| q01: |
| - -0.8200882077217102 |
| - -1.0460078716278076 |
| - -1.2745805978775024 |
| - -2.864607334136963 |
| - -1.0115491151809692 |
| - 1.2138986587524414 |
| - -2.057372808456421 |
| - -0.027562683448195457 |
| q99: |
| - 0.7587710618972778 |
| - 0.9406100511550903 |
| - 0.9344996809959412 |
| - -0.9798629283905029 |
| - 0.8359407782554626 |
| - 3.0869405269622803 |
| - 1.9223058223724365 |
| - 0.8661524057388306 |
| action: |
| q01: |
| - -0.8200882077217102 |
| - -1.0460078716278076 |
| - -1.2745805978775024 |
| - -2.864607334136963 |
| - -1.0115491151809692 |
| - 1.2138986587524414 |
| - -2.057372808456421 |
| - 0.0 |
| q99: |
| - 0.7587710618972778 |
| - 0.9406100511550903 |
| - 0.9344996809959412 |
| - -0.9798629283905029 |
| - 0.8359407782554626 |
| - 3.0869405269622803 |
| - 1.9223058223724365 |
| - 255.0 |
| default_repo_id: synthmanip |
| action_key: action |
| state_keys: |
| - observation.state |
| action_norm_mode: quantiles |
| state_norm_mode: quantiles |
| robot_postprocessor: |
| stats_by_repo: |
| synthmanip: |
| observation.state: |
| q01: |
| - -0.8200882077217102 |
| - -1.0460078716278076 |
| - -1.2745805978775024 |
| - -2.864607334136963 |
| - -1.0115491151809692 |
| - 1.2138986587524414 |
| - -2.057372808456421 |
| - -0.027562683448195457 |
| q99: |
| - 0.7587710618972778 |
| - 0.9406100511550903 |
| - 0.9344996809959412 |
| - -0.9798629283905029 |
| - 0.8359407782554626 |
| - 3.0869405269622803 |
| - 1.9223058223724365 |
| - 0.8661524057388306 |
| action: |
| q01: |
| - -0.8200882077217102 |
| - -1.0460078716278076 |
| - -1.2745805978775024 |
| - -2.864607334136963 |
| - -1.0115491151809692 |
| - 1.2138986587524414 |
| - -2.057372808456421 |
| - 0.0 |
| q99: |
| - 0.7587710618972778 |
| - 0.9406100511550903 |
| - 0.9344996809959412 |
| - -0.9798629283905029 |
| - 0.8359407782554626 |
| - 3.0869405269622803 |
| - 1.9223058223724365 |
| - 255.0 |
| default_repo_id: synthmanip |
| action_key: action |
| state_keys: |
| - observation.state |
| action_norm_mode: quantiles |
| state_norm_mode: quantiles |
| parallelism: |
| data_parallel_replicate_degree: 1 |
| enable_compiled_autograd: false |
| data_parallel_shard_degree: -1 |
| fsdp_reshard_after_forward: default |
| context_parallel_config: |
| degree: 1 |
| attention_type: ulysses |
| load_balancer: ulysses |
| head_stride: 1 |
| tensor_parallel_config: |
| degree: 1 |
| enable_async: false |
| data_parallel_config: |
| name: fsdp |
| param_dtype: null |
| reduce_dtype: float32 |
| num_replicas: null |
| shard_degree: null |
| wrapping_strategy: full |
| prefetch_factor: 0 |
| context_parallel_rotate_method: allgather |
| seed: 6198 |
| epoch: null |
| dry_run: false |
| ft_llm: true |
| ft_vit: false |
| ft_connector: false |
| ft_embedding: ae |
| optimizer: |
| name: adamw |
| learning_rate: 0.0001 |
| weight_decay: 0.01 |
| betas: |
| - 0.9 |
| - 0.95 |
| eps: 1.0e-05 |
| connector_learning_rate: 5.0e-06 |
| vit_learning_rate: 5.0e-06 |
| llm_learning_rate: 1.0e-05 |
| frame_selector_learning_rate: 0.0001 |
| temporal_token_scorer_learning_rate: 0.0001 |
| action_expert_learning_rate: 0.0001 |
| connector_weight_decay: 0.0 |
| vit_weight_decay: 0.0 |
| llm_weight_decay: 0.0 |
| frame_selector_weight_decay: 0.01 |
| temporal_token_scorer_weight_decay: 0.01 |
| action_expert_weight_decay: 0.0 |
| connector_betas: |
| - 0.9 |
| - 0.95 |
| vit_betas: |
| - 0.9 |
| - 0.95 |
| llm_betas: |
| - 0.9 |
| - 0.95 |
| frame_selector_betas: |
| - 0.9 |
| - 0.95 |
| temporal_token_scorer_betas: |
| - 0.9 |
| - 0.95 |
| action_expert_betas: |
| - 0.9 |
| - 0.95 |
| connector_eps: 1.0e-06 |
| vit_eps: 1.0e-06 |
| llm_eps: 1.0e-06 |
| frame_selector_eps: 1.0e-06 |
| temporal_token_scorer_eps: 1.0e-06 |
| action_expert_eps: 1.0e-06 |
| metrics_log_interval: -1 |
| scheduler: |
| name: multimodal |
| units: steps |
| t_warmup: 100 |
| t_max: null |
| alpha_f: 0.1 |
| connector_t_warmup: 200 |
| vit_t_warmup: 200 |
| llm_t_warmup: 2000 |
| frame_selector_t_warmup: 200 |
| temporal_token_scorer_t_warmup: 200 |
| action_expert_t_warmup: 200 |
| grad_clip_warmup_steps: null |
| grad_clip_warmup_factor: null |
| warmup_min_lr: 0.0 |
| data: |
| dataset: null |
| mixture: |
| synthmanip/task_0: 0.35 |
| synthmanip/task_1: 0.2 |
| synthmanip/task_2: 0.2 |
| synthmanip/task_3: 0.15 |
| synthmanip/task_4: 0.1 |
| root_size_mixture: null |
| kwargs_mixture: null |
| split: train |
| seed: 50189 |
| pad: to_max |
| sequence_length: 928 |
| max_text_seq_len: null |
| shuffle: true |
| start_index: 0 |
| packing: null |
| enable_variable_sized_token_pooling: true |
| num_workers: 4 |
| drop_last: true |
| pin_memory: true |
| prefetch_factor: 4 |
| persistent_workers: false |
| timeout: 300 |
| action_data: null |
| action_loader_rate: null |
| action_batch_interval: 1 |
| restore_dataloader: true |
| fast_forward_batches: null |
| evaluators: |
| - label: synthmanip_val |
| data: |
| dataset: synthmanip/task_0 |
| mixture: null |
| root_size_mixture: null |
| kwargs_mixture: null |
| split: val |
| seed: 691203 |
| pad: to_max |
| sequence_length: 928 |
| max_text_seq_len: null |
| shuffle: false |
| start_index: 0 |
| packing: null |
| enable_variable_sized_token_pooling: true |
| num_workers: 3 |
| drop_last: false |
| pin_memory: true |
| prefetch_factor: 4 |
| persistent_workers: false |
| timeout: 300 |
| device_batch_size: 16 |
| subset_num_batches: null |
| max_examples: 2000 |
| console_log_interval: 10 |
| response_logits_only: true |
| reduce_loss_metrics_manually: false |
| eval_interval: 1000 |
| inf_evaluators: [] |
| inf_eval_interval: 1000 |
| eval_on_last_step: true |
| eval_on_load: false |
| eval_on: [] |
| save_folder: /weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20 |
| checkpointer_config: |
| save_thread_count: null |
| load_thread_count: null |
| pre_download: false |
| work_dir: null |
| throttle_uploads: false |
| canceled_check_interval: 50 |
| save_interval: 2000 |
| save_at: null |
| save_final_optim: false |
| save_num_checkpoints_to_keep: 1 |
| checkpoint_retention_frequency: 10000 |
| save_final_unsharded_checkpoint: false |
| save_interval_ephemeral: null |
| save_overwrite: true |
| load_path: null |
| reset_optimizer_state: true |
| reset_trainer_state: true |
| initial_model_checkpoint: /weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_-03-06-17-32-00_bs1024_dbs16_stp200000-mix_5_feb20_copy/step200000 |
| allow_resume: true |
| max_duration: 50000 |
| global_train_batch_size: 1024 |
| device_train_microbatch_size: 16 |
| max_grad_norm: 1.0 |
| multi_component_grad_norm: true |
| batch_divisor: global_batch |
| max_grad_norm_ratio: null |
| precision: amp_bf16 |
| wandb: |
| project: molmo_ae_synth |
| entity: prior-ai2 |
| group: null |
| name: Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20 |
| tags: |
| - watching |
| log_artifacts: false |
| rank_zero_only: true |
| log_interval: 20 |
| allow_resume: true |
| finish_on_sigterm: true |
| beaker_log_interval: 50 |
| speed_monitor: |
| window_size: 20 |
| gpu_flops_available: null |
| console_log_interval: 20 |
| enable_timing_logs: false |
| gen1_gc_interval: 1 |
| compile: |
| mode: default |
| fullgraph: false |
| dynamic: false |
| backend: inductor |
| activation_checkpointing: true |
| fsdp: |
| fsdp2: true |
| precision: pure |
| use_orig_params: true |
| wrapping_strategy: null |
| sharding_strategy: FULL_SHARD |
| hybrid_sharding_num_model_replicas: null |
| softmax_auxiliary_loss: false |
| softmax_auxiliary_loss_scale: 0.0001 |
| response_logits_only: true |
| saliency_score_loss_wt: null |
| frame_score_loss_wt: null |
| frame_score_loss_type: mse |
| frame_score_loss_target: 0.7 |
| time_limit: null |
| extra_steps_after_cancel: 0 |
| python_profiling: false |
| torch_profiling: false |
| stop_at: 50000 |
| stop_after: null |
| fused_loss: false |
| compile_loss: true |
| runtime_data: |
| args: launch_scripts/train_synthmanip.py /weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_-03-06-17-32-00_bs1024_dbs16_stp200000-mix_5_feb20_copy/step200000 |
| --data_paths mix --stats_path=/weka/oe-training-default/rohunt/robo/stats/franka_mltask_abs_pos.yaml |
| --action_preset franka_joint --camera_preset franka_one_random_then_wrist --wandb.name=Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20 |
| --wandb.entity=prior-ai2 --wandb.project=molmo_ae_synth --seq_len=928 --max_duration=50000 |
| --device_batch_size=16 --global_batch_size=1024 --log_interval=20 --model.mm_preprocessor.use_frame_special_tokens=True |
| --model.mm_preprocessor.max_subtitle_tokens=null --prefetch_factor=4 --data.num_workers=4 |
| --save_interval=2000 --save_num_checkpoints_to_keep=1 --checkpoint_retention_frequency=10000 |
| --save_folder=/weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20 |
| --exp_name=Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20 |
| --data.packing=null --model.mm_preprocessor.image.crop_mode=resize --model.mm_preprocessor.max_frames=1 |
| --model.same_noise_per_time=False --weighted_sampling --randomize_prompts --ft_embedding=ae |
| --model.mm_preprocessor.image.max_images=4 --model.num_flow_timestamps=8 --ft_llm=True |
| --scheduler.llm_t_warmup=2000 --optimizer.llm_learning_rate=1e-05 --img_aug --model.mm_preprocessor.image.multi_image_pooling_w=2 |
| --model.mm_preprocessor.image.multi_image_pooling_h=2 --n_obs_steps=2 --obs_step_delta=8 |
| --model.mm_preprocessor.image.single_frame=False --reset_optimizer_state --reset_trainer_state |
| --furthest_camera_prob=0.5 |
| hostname: jupiter-cs-aus-148.reviz.ai2.in |
| date: 03/09/2026, 01:55 |
| world_size: 64 |
| resuming_from: null |
| beaker_experiment_id: 01KK84PM8EQZW1SC6YRT12PYRR |
| beaker_experiment_url: null |
| wandb_id: 1umcfp2f |
| wandb_url: https://wandb.ai/prior-ai2/molmo_ae_synth/runs/1umcfp2f |
| distributed_eval_enabled: false |
| distributed_eval_benchmark_path: /weka/oe/rohunt/robo-bench/FrankaPickandPlaceDroidBench_5ep_json_benchmark |
| distributed_eval_config_cls: launch_scripts.synthvla.configure_mujoco_thor:FrankaState8ClampConfig |
| distributed_eval_task_horizon: 300 |
| distributed_eval_num_worker_jobs: 1 |
| distributed_eval_wandb_project: mjthor-online-eval |
| distributed_eval_workspace: ai2/robo-molmo |
| distributed_eval_clusters: |
| - ai2/saturn |
| - ai2/neptune |
| - ai2/rhea |
| - ai2/ceres |
| distributed_eval_priority: high |
| distributed_eval_preemptible: true |
|
|