File size: 3,244 Bytes
45bc8f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Experiment/run identification and metric tracking.
# NOTE(review): no `---` document-start marker is visible in this chunk — confirm
# the on-disk file begins with one if yamllint's `document-start` rule is enforced.
run_id: puma-domino-dynamic-35task
run_root_dir: ./result/output/Dynamic_VLA
seed: 42
# Metric trackers to enable: a local JSONL log and Weights & Biases.
trackers:
- jsonl
- wandb
wandb_entity: heng_
wandb_project: Dynamic_VLA
is_debug: false
# Model/framework configuration for the PUMA stack.
framework:
  name: PUMA
  # presumably selects training stage 2 of the history-flow pipeline — TODO confirm with the trainer code
  history_flow_stage: stage2
  # Vision-language backbone settings (local Qwen3-VL-4B checkpoint).
  qwenvl:
    base_vlm: ./playground/Pretrained_models/Qwen3-VL-4B-Instruct-Action
    attn_implementation: sdpa  # scaled-dot-product-attention backend
    vl_hidden_dim: 2560
  # Action head; action and state vectors are both 14-dimensional here.
  action_model:
    action_model_type: MLP
    action_hidden_dim: 2560
    action_dim: 14
    state_dim: 14
    future_action_window_size: 15
    past_action_window_size: 0
  # Auxiliary world-model branch (DINOv2 features, per-frame cosine feature loss).
  world_model:
    enabled: true
    world_query_num: 4
    loss_weight: 0.05  # weight of the world-model loss relative to the main objective
    supervision: per_frame
    feature_loss: cosine
    grounding_mode: image
    future_view_index: 0
    dino_backbone: dinov2_vitb14
    # NOTE(review): unquoted plain scalar — `<|world|>` parses as the string
    # "<|world|>" under YAML 1.1/1.2, but quoting it ('<|world|>') would be
    # safer against stricter tooling. Left unchanged to avoid any token edit.
    world_token: <|world|>
    # Grounded-SAM2 / GroundingDINO paths and detection thresholds.
    grounding:
      sam2_model_config: configs/sam2.1/sam2.1_hiera_l.yaml
      sam2_checkpoint: ./playground/Pretrained_models/grounded_sam2/sam2.1_hiera_large.pt
      grounding_dino_config: ./playground/Pretrained_models/grounded_sam2/GroundingDINO_SwinT_OGC.py
      grounding_dino_checkpoint: ./playground/Pretrained_models/grounded_sam2/groundingdino_swint_ogc.pth
      box_threshold: 0.35
      text_threshold: 0.25
      multimask_output: false
      max_boxes: 1
      video_prompt: mask
      # On-disk cache for grounding results (read and write both enabled).
      cache:
        enabled: true
        read: true
        write: true
        dirname: grounding_cache
        version: v1
    # Debug visualization dumps (disabled for this run).
    debug:
      enabled: false
      output_dir: ./grounding_output
      include_box: true
      include_mask: true
# Dataset / dataloader configuration.
datasets:
  vla_data:
    dataset_py: lerobot_datasets
    num_workers: 8
    data_root_dir: ./data/robotwin/dynamic-35tasks-clean-level1
    data_mix: robotwin_dynamic_task
    action_type: abs_qpos
    # Default observation resolution as [channels, height, width].
    default_image_resolution:
    - 3
    - 224
    - 224
    per_device_batch_size: 8
    load_all_data_for_training: true
    # Observation streams to load; only the first camera here.
    obs:
    - image_0
    image_size:
    - 224
    - 224
    video_backend: torchvision_av
    include_state: false
    # Temporal sampling: 4 future and 4 history frames, both at stride 4.
    future_k: 4
    future_stride: 4
    history_k: 4
    history_stride: 4
    history_mode: flow
    # History frames are downscaled to 64x64 (flow is also computed at 64x64).
    history_image_size:
    - 64
    - 64
    history_flow:
      compute_size:
      - 64
      - 64
      cpu_worker_num: 12
      # On-disk cache for precomputed history flow.
      cache:
        enabled: true
        read: true
        write: true
        dirname: history_flow_cache
        version: v1
# Optimization / training-loop configuration.
trainer:
  epochs: 100
  max_train_steps: 100000
  num_warmup_steps: 5000
  save_interval: 10000
  eval_interval: 1000
  # Per-module learning rates; the action head trains 10x faster than the VLM.
  learning_rate:
    base: 1.0e-05
    qwen_vl_interface: 1.0e-05
    action_model: 0.0001
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    min_lr: 5.0e-07
  freeze_modules: null
  # Loss mixing: only the VLA loss contributes (VLM loss weight is 0).
  loss_scale:
    vla: 1.0
    vlm: 0.0
  repeated_diffusion_steps: 4
  # NOTE(review): both `max_grad_norm` and `gradient_clipping` are set to 1.0 —
  # verify which one the trainer actually reads; one may be dead config.
  max_grad_norm: 1.0
  # NOTE(review): both `num_warmup_steps` (above) and `warmup_ratio` are set —
  # confirm which takes precedence in the scheduler setup.
  warmup_ratio: 0.1
  # NOTE(review): `weight_decay` appears here (0.0) and again under `optimizer`
  # (1.0e-08) — these are distinct keys at different nesting levels, not YAML
  # duplicates, but confirm the intended one is consumed.
  weight_decay: 0.0
  logging_frequency: 100
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1
  optimizer:
    name: AdamW
    betas:
    - 0.9
    - 0.95
    eps: 1.0e-08
    weight_decay: 1.0e-08
  # Resume-from-checkpoint controls (fresh run: resume disabled).
  is_resume: false
  resume_epoch: null
  resume_step: null
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true
# Concrete output directory for this run's artifacts.
output_dir: ./result/output/Dynamic_VLA/20260301-qwenoft-robotwin_dynamic_task-qwenaction-world-query-flow-stage2-h4s4f4s4-h64w64-dynamic-35task