run_name: glue_20251002_155411
seed: 6198
epoch: null
dry_run: false
model:
  d_model: 3584
  n_heads: 28
  n_kv_heads: 4
  qkv_bias: true
  clip_qkv: null
  n_layers: 28
  mlp_ratio: 4
  mlp_hidden_size: 37888
  activation_type: swiglu
  block_type: sequential
  block_group_size: 1
  rope: true
  rope_full_precision: true
  rope_theta: 1000000.0
  vision_backbone:
    image_model_type: openai
    image_default_input_size:
    - 336
    - 336
    image_patch_size: 14
    image_pos_patch_size: 14
    image_emb_dim: 1024
    image_num_heads: 16
    image_num_key_value_heads: 16
    image_num_layers: 23
    image_head_dim: 64
    image_mlp_dim: 4096
    image_mlp_activations: quick_gelu
    image_dropout_rate: 0.0
    image_num_pos: 577
    image_norm_eps: 1.0e-05
    attention_dropout: 0.0
    residual_dropout: 0.0
    initializer_range: 0.02
    fsdp_wrap: false
    resize_mode: default
  vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
  llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
  low_cpu_fsdp: true
  attention_type: sdpa
  float32_attention: true
  attention_dropout: 0.0
  attention_layer_norm: false
  residual_dropout: 0.1
  response_residual_dropout: 0.0
  embedding_dropout: 0.0
  layer_norm_type: rms
  layer_norm_with_affine: true
  layer_norm_eps: 1.0e-06
  attention_layer_norm_with_affine: true
  max_sequence_length: 4096
  max_position_embeddings: null
  include_bias: false
  bias_for_layer_norm: null
  scale_logits: false
  vocab_size: 152064
  embedding_size: 152064
  ff_out_size: null
  additional_vocab_size: 128
  new_embedding_init_range: 0.02
  weight_tying: false
  init_device: null
  init_fn: normal
  init_std: 0.02
  init_cutoff_factor: null
  norm_after: false
  precision: amp_bf16
  max_crops: 12
  crop_mode: overlap-and-resize-c2
  use_col_tokens: true
  prompt_type: uber_model
  system_prompt_kind: demo_or_style
  message_formatting: role
  always_start_with_space: true
  multi_annotation_weighting: root_subsegments
  default_inference_len: 65
  overlap_margins:
  - 4
  - 4
  pad_value: 0.0
  image_padding_embed: pad_and_partial_pad
  fix_image_padding: true
  vit_layers:
  - -2
  - -9
  image_pooling_h: 2
  image_pooling_w: 2
  image_pooling_2d: attention_meanq
  image_projector: mlp
  image_feature_dropout: 0.0
  initializer_range: 0.02
  normalize_input_embeds: false
  use_position_ids: true
  head_dim: null
  action_tokenizer:
    identifier: physical-intelligence/fast
    tokenizer_dir: null
  action_dim: 7
  horizon: 8
  tokenizer:
    identifier: Qwen/Qwen2-7B
    tokenizer_dir: null
  pad_tokenizer: true
  moe_num_experts: 8
  moe_top_k: 2
  moe_mlp_impl: sparse
  moe_log_expert_assignment: false
  moe_shared_expert: false
  moe_lbl_in_fp32: false
  moe_interleave: false
  moe_loss_weight: 0.1
  moe_zloss_weight: null
  moe_dropless: true
  moe_capacity_factor: 1.25
  action_head: l1_regression
  num_diffusion_steps: 1000
  num_diffusion_inference_steps: 30
  use_proprio: true
  action_head_dit_hidden_size: 1152
  action_head_dit_depth: 28
  action_head_dit_num_heads: 16
  llm_causal_attention: false
  action_use_left_eef: true
  action_use_mobile_base: false
allow_resume: false
ft_llm: true
ft_vit: false
ft_connector: false
ft_embedding: lm_head
lora: false
use_lora: true
lora_rank: 8
lora_llm: false
lora_vit: false
lora_connector: false
early_exit: false
train_exit_random_layer: false
optimizer:
  name: adamw
  learning_rate: 0.0001
  weight_decay: 0.01
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-05
  connector_learning_rate: 0.0002
  vit_learning_rate: 6.0e-06
  llm_learning_rate: 5.0e-05
  connector_weight_decay: 0.0
  vit_weight_decay: 0.0
  llm_weight_decay: 0.0
  connector_betas:
  - 0.9
  - 0.95
  vit_betas:
  - 0.9
  - 0.95
  llm_betas:
  - 0.9
  - 0.95
  connector_eps: 1.0e-06
  vit_eps: 1.0e-06
  llm_eps: 1.0e-06
  metrics_log_interval: 20
scheduler:
  name: multimodal
  units: steps
  t_warmup: 100
  t_max: null
  alpha_f: 0.1
  connector_t_warmup: 200
  vit_t_warmup: 2000
  llm_t_warmup: 2000
  grad_clip_warmup_steps: null
  grad_clip_warmup_factor: null
  warmup_min_lr: 0.0
data:
  dataset: vla_dataset_realworld
  mixture: null
  root_size_mixture: null
  split: train
  seed: 95818
  shuffle_messages: false
  pad: to_max
  sequence_length: 1600
  shuffle: true
  for_inference: false
  multi_modal: torch
  num_workers: 0
  drop_last: true
  pin_memory: true
  prefetch_factor: null
  persistent_workers: false
  timeout: 0
  rlds_dataset_name: libero_4_task_suites_no_noops
  rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
  use_wrist_image: true
  use_proprio: true
  rlds_shuffle_buffer_size: 100000
  rlds_traj_threads: 8
  rlds_read_threads: 8
  lerobot_episode_index_start: null
  lerobot_episode_index_end: null
restore_dataloader: true
fast_forward_batches: null
evaluators:
- label: val
  data:
    dataset: vla_dataset_realworld
    mixture: null
    root_size_mixture: null
    split: validation
    seed: null
    shuffle_messages: false
    pad: to_max
    sequence_length: 1600
    shuffle: false
    for_inference: false
    multi_modal: torch
    num_workers: 0
    drop_last: true
    pin_memory: true
    prefetch_factor: null
    persistent_workers: true
    timeout: 0
    rlds_dataset_name: libero_4_task_suites_no_noops
    rlds_data_root_dir: /mnt/data/zhangjian/dataset/Simulation/datasets--openvla--modified_libero_rlds
    use_wrist_image: true
    use_proprio: true
    rlds_shuffle_buffer_size: 256000
    rlds_traj_threads: 8
    rlds_read_threads: 8
    lerobot_episode_index_start: 353
    lerobot_episode_index_end: 765
  device_eval_batch_size: null
  subset_num_batches: 64
  max_examples: null
  max_new_tokens: 448
  mm_evaluator: null
  save_dir: null
  save_to_checkpoint_dir: false
  eval_name: null
  skip_if_metrics_cached: true
eval_interval: 0
inf_eval_interval: -1
inf_evaluators: []
save_folder: /vast/users/xiaodan/workspace/minghao.guo/warehouse_a1/ckpt/
remote_save_folder: null
canceled_check_interval: 50
save_interval: 500
save_interval_unsharded: 500
save_interval_ephemeral: null
save_interval_action_head: 500
save_num_checkpoints_to_keep: 1
save_num_unsharded_checkpoints_to_keep: 1
save_num_action_head_checkpoints_to_keep: 2
save_overwrite: true
force_save_unsharded: false
no_pre_train_checkpoint: true
initial_model_checkpoint: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
load_model_config: null
checkpoint_dir: /vast/users/xiaodan/zhangjian/molmo_data/Molmo-7B-D-0924
load_path: null
load_path_sharded_checkpointer: null
reset_optimizer_state: false
reset_trainer_state: false
save_dataloader_state: false
reset_dataloader_state: false
keep_lr_on_load: true
sharded_checkpointer: torch_legacy
max_duration: 500000
global_train_batch_size: 126
device_train_batch_size: 15
device_train_microbatch_size: 16
device_eval_batch_size: 4
eval_subset_num_batches: -1
eval_on_load: false
device_inf_eval_batch_size: 16
inf_eval_subset_num_batches: -1
device_train_grad_accum: 0
max_grad_norm: 1.0
multi_component_grad_norm: true
batch_divisor: global_batch
max_grad_norm_ratio: null
precision: amp_bf16
wandb:
  project: a1-realworld
  entity: henryeap
  group: null
  name: glue_20251002_155411
  tags:
  - watching
  log_artifacts: false
  rank_zero_only: true
  log_interval: 1
speed_monitor:
  window_size: 20
  gpu_flops_available: null
console_log_interval: 1
gen1_gc_interval: 1
compile: null
fsdp:
  use_orig_params: true
  sharding_strategy: FULL_SHARD
  wrapping_strategy: by_block_and_size
  precision: float
  hybrid_sharding_num_model_replicas: null
softmax_auxiliary_loss: true
softmax_auxiliary_loss_scale: 0.0001
time_limit: null
extra_steps_after_cancel: 10
python_profiling: false
torch_profiling: false
stop_at: 500000
stop_after: null
activation_checkpointing: whole_layer
fused_loss: null