---
# Training configuration for a QwenGR00T vision-language-action model
# (Qwen3-VL backbone + DiT action head), reconstructed into block YAML.
# Nesting inferred from the alphabetical sibling ordering of the original
# flattened dump — NOTE(review): confirm against the consuming loader.

datasets:
  vla_data:
    # Chain-of-thought grounding prompt; {instruction} is substituted at
    # runtime by the data pipeline. Folded scalar keeps it a single line.
    CoT_prompt: >-
      Your task is {instruction}. To identify the key objects for your
      task. Locate their bounding boxes in [x1,y1,x2,y2] format.
    data_mix: bridge_rt_1
    data_root_dir: ./playground/Datasets/OXE_LEROBOT
    dataset_py: lerobot_datasets
    # Input image resolution (height, width) — presumably pixels; verify
    # ordering against the image transform.
    image_size:
      - 224
      - 224
    per_device_batch_size: 8
    video_backend: torchvision_av

framework:
  action_model:
    action_dim: 7
    action_horizon: 16
    action_model_type: DiT-B
    add_pos_embed: true
    # Diffusion transformer hyperparameters for the action head.
    diffusion_model_cfg:
      cross_attention_dim: 4096
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      positional_embeddings: null
    future_action_window_size: 15
    hidden_size: 1024
    max_seq_len: 1024
    # Beta-distribution noise-schedule parameters for diffusion training.
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    num_timestep_buckets: 1000
    past_action_window_size: 0
    state_dim: 7
  name: QwenGR00T
  qwenvl:
    # Path to the pretrained Qwen3-VL checkpoint used as the base VLM.
    base_vlm: /mnt/18T/starVLAproject/Qwen3-VL-8B-Instruct

output_dir: /starvla/Checkpoints/qwen3vl_bridge_rt1_QwenGR00T_2node_0203_1256
run_id: qwen3vl_bridge_rt1_QwenGR00T_2node_0203_1256
run_root_dir: /starvla/Checkpoints
seed: 42

trainer:
  eval_interval: 500
  freeze_modules: true
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  is_resume: false
  # Per-module-group learning rates; "base" presumably applies to any
  # parameters not matched by the named groups — confirm in the optimizer
  # setup code.
  learning_rate:
    action_model: 0.0001
    base: 1.0e-05
    qwen_vl_interface: 1.0e-05
  logging_frequency: 50
  lr_scheduler_type: cosine_with_min_lr
  max_train_steps: 100000
  num_warmup_steps: 10000
  optimizer:
    # AdamW-style moment coefficients (beta1, beta2).
    betas:
      - 0.9
      - 0.95
    eps: 1.0e-08
    weight_decay: 1.0e-08
  repeated_diffusion_steps: 4
  save_interval: 10000
  scheduler_specific_kwargs:
    # Floor LR for the cosine_with_min_lr schedule.
    min_lr: 5.0e-07

# Weights & Biases experiment tracking.
wandb_entity: xiguapi
wandb_project: Qwen3VL_Bridge_RT1_QwenGR00T