---
# Training configuration for a QwenGR00T vision-language-action model
# (Qwen3-VL backbone + DiT action head), reconstructed into block YAML.
# Nesting inferred from the alphabetical sibling ordering of the original
# flattened dump — NOTE(review): confirm against the consuming loader.

datasets:
  vla_data:
    # Chain-of-thought grounding prompt; {instruction} is substituted at
    # runtime by the data pipeline. Folded scalar keeps it a single line.
    CoT_prompt: >-
      Your task is {instruction}. To identify the key objects for your
      task. Locate their bounding boxes in [x1,y1,x2,y2] format.
    data_mix: bridge_rt_1
    data_root_dir: ./playground/Datasets/OXE_LEROBOT
    dataset_py: lerobot_datasets
    # Input image resolution (height, width) — presumably pixels; verify
    # ordering against the image transform.
    image_size:
      - 224
      - 224
    per_device_batch_size: 8
    video_backend: torchvision_av

framework:
  action_model:
    action_dim: 7
    action_horizon: 16
    action_model_type: DiT-B
    add_pos_embed: true
    # Diffusion transformer hyperparameters for the action head.
    diffusion_model_cfg:
      cross_attention_dim: 4096
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      positional_embeddings: null
    future_action_window_size: 15
    hidden_size: 1024
    max_seq_len: 1024
    # Beta-distribution noise-schedule parameters for diffusion training.
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    num_timestep_buckets: 1000
    past_action_window_size: 0
    state_dim: 7
  name: QwenGR00T
  qwenvl:
    # Path to the pretrained Qwen3-VL checkpoint used as the base VLM.
    base_vlm: /mnt/18T/starVLAproject/Qwen3-VL-8B-Instruct

output_dir: /starvla/Checkpoints/qwen3vl_bridge_rt1_QwenGR00T_2node_0203_1256
run_id: qwen3vl_bridge_rt1_QwenGR00T_2node_0203_1256
run_root_dir: /starvla/Checkpoints
seed: 42

trainer:
  eval_interval: 500
  freeze_modules: true
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  is_resume: false
  # Per-module-group learning rates; "base" presumably applies to any
  # parameters not matched by the named groups — confirm in the optimizer
  # setup code.
  learning_rate:
    action_model: 0.0001
    base: 1.0e-05
    qwen_vl_interface: 1.0e-05
  logging_frequency: 50
  lr_scheduler_type: cosine_with_min_lr
  max_train_steps: 100000
  num_warmup_steps: 10000
  optimizer:
    # AdamW-style moment coefficients (beta1, beta2).
    betas:
      - 0.9
      - 0.95
    eps: 1.0e-08
    weight_decay: 1.0e-08
  repeated_diffusion_steps: 4
  save_interval: 10000
  scheduler_specific_kwargs:
    # Floor LR for the cosine_with_min_lr schedule.
    min_lr: 5.0e-07

# Weights & Biases experiment tracking.
wandb_entity: xiguapi
wandb_project: Qwen3VL_Bridge_RT1_QwenGR00T