---
# Training configuration for the PUMA Dynamic-VLA run
# (RoboTwin 35-task dynamic benchmark, stage-2 history-flow training).
#
# NOTE(review): this file was reconstructed from a whitespace-mangled dump.
# Every key/value pair is preserved verbatim, but the nesting of some
# sections (e.g. `grounding` under `framework`, the trailing `output_dir`
# under `trainer`) was inferred from key semantics — confirm against the
# consuming config schema.

# --- run bookkeeping / experiment tracking ---
run_id: puma-domino-dynamic-35task
run_root_dir: ./result/output/Dynamic_VLA
seed: 42
trackers:
  - jsonl
  - wandb
wandb_entity: heng_
wandb_project: Dynamic_VLA
is_debug: false

# --- model / framework ---
framework:
  name: PUMA
  history_flow_stage: stage2

  # Vision-language backbone.
  qwenvl:
    base_vlm: ./playground/Pretrained_models/Qwen3-VL-4B-Instruct-Action
    attn_implementation: sdpa
    vl_hidden_dim: 2560

  # Action head on top of the VLM.
  action_model:
    action_model_type: MLP
    action_hidden_dim: 2560
    action_dim: 14
    state_dim: 14
    future_action_window_size: 15
    past_action_window_size: 0

  # Auxiliary world-model supervision.
  world_model:
    enabled: true
    world_query_num: 4
    loss_weight: 0.05
    supervision: per_frame
    feature_loss: cosine
    grounding_mode: image
    future_view_index: 0
    dino_backbone: dinov2_vitb14
    # Quoted defensively: the value starts with non-alphanumeric characters.
    world_token: '<|world|>'

  # Grounded-SAM2 pipeline used to produce grounding masks.
  grounding:
    sam2_model_config: configs/sam2.1/sam2.1_hiera_l.yaml
    sam2_checkpoint: ./playground/Pretrained_models/grounded_sam2/sam2.1_hiera_large.pt
    grounding_dino_config: ./playground/Pretrained_models/grounded_sam2/GroundingDINO_SwinT_OGC.py
    grounding_dino_checkpoint: ./playground/Pretrained_models/grounded_sam2/groundingdino_swint_ogc.pth
    box_threshold: 0.35
    text_threshold: 0.25
    multimask_output: false
    max_boxes: 1
    video_prompt: mask
    cache:
      enabled: true
      read: true
      write: true
      dirname: grounding_cache
      version: v1
    debug:
      enabled: false
      output_dir: ./grounding_output
      include_box: true
      include_mask: true

# --- datasets ---
datasets:
  vla_data:
    dataset_py: lerobot_datasets
    num_workers: 8
    data_root_dir: ./data/robotwin/dynamic-35tasks-clean-level1
    data_mix: robotwin_dynamic_task
    action_type: abs_qpos
    # Presumably (channels, height, width) — TODO confirm with the dataset loader.
    default_image_resolution:
      - 3
      - 224
      - 224
    per_device_batch_size: 8
    load_all_data_for_training: true
    obs:
      - image_0
    image_size:
      - 224
      - 224
    video_backend: torchvision_av
    include_state: false
    future_k: 4
    future_stride: 4
    history_k: 4
    history_stride: 4
    history_mode: flow
    history_image_size:
      - 64
      - 64
    history_flow:
      compute_size:
        - 64
        - 64
      cpu_worker_num: 12
      cache:
        enabled: true
        read: true
        write: true
        dirname: history_flow_cache
        version: v1

# --- training loop / optimization ---
trainer:
  epochs: 100
  max_train_steps: 100000
  num_warmup_steps: 5000
  save_interval: 10000
  eval_interval: 1000
  # Per-module learning rates.
  learning_rate:
    base: 1.0e-05
    qwen_vl_interface: 1.0e-05
    action_model: 0.0001
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    min_lr: 5.0e-07
  freeze_modules: null
  loss_scale:
    vla: 1.0
    vlm: 0.0
  repeated_diffusion_steps: 4
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  weight_decay: 0.0
  logging_frequency: 100
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1
  optimizer:
    name: AdamW
    betas:
      - 0.9
      - 0.95
    eps: 1.0e-08
    weight_decay: 1.0e-08
  is_resume: false
  resume_epoch: null
  resume_step: null
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true
  output_dir: ./result/output/Dynamic_VLA/20260301-qwenoft-robotwin_dynamic_task-qwenaction-world-query-flow-stage2-h4s4f4s4-h64w64-dynamic-35task