File size: 2,191 Bytes
45bc8f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Training configuration for the PUMA vision-language-action (VLA) framework on
# the RoboTwin dynamic 35-task dataset (history-flow stage 2 run).
# NOTE(review): key semantics below are inferred from names/paths — confirm
# against the consuming trainer's config schema.
datasets:
  vla_data:
    data_mix: robotwin_dynamic_task
    data_root_dir: ./data/robotwin/dynamic-35tasks-clean-level1
    # Dataset implementation module — presumably LeRobot-format loaders; verify.
    dataset_py: lerobot_datasets
    history_flow:
      # CPU worker count for history-flow preprocessing.
      cpu_worker_num: 12
    # (height, width) used for history frames — much smaller than the main
    # observation resolution below.
    history_image_size:
    - 64
    - 64
    history_mode: flow
    # (height, width) of the main camera observations.
    image_size:
    - 224
    - 224
    num_workers: 8
    per_device_batch_size: 8
    video_backend: torchvision_av
framework:
  action_model:
    # 14-dim action space — looks like a dual-arm setup; TODO confirm.
    action_dim: 14
    action_hidden_dim: 2560
    action_model_type: MLP
    # Predict 15 future action steps; no past-action conditioning window.
    future_action_window_size: 15
    past_action_window_size: 0
  history_flow_stage: stage2
  name: PUMA
  qwenvl:
    # Scaled-dot-product attention backend (vs. flash-attention variants).
    attn_implementation: sdpa
    base_vlm: ./playground/Pretrained_models/Qwen3-VL-4B-Instruct-Action
  world_model:
    dino_backbone: dinov2_vitb14
    enabled: true
    # Feature-matching loss for world-model supervision.
    feature_loss: cosine
    future_view_index: 0
    # Grounded-SAM2 (GroundingDINO + SAM2) settings — presumably used to
    # produce object masks for world-model targets; verify against the model code.
    grounding:
      box_threshold: 0.35
      grounding_dino_checkpoint: ./playground/Pretrained_models/grounded_sam2/groundingdino_swint_ogc.pth
      grounding_dino_config: ./playground/Pretrained_models/grounded_sam2/GroundingDINO_SwinT_OGC.py
      # Keep only the single highest-scoring box per query.
      max_boxes: 1
      multimask_output: false
      sam2_checkpoint: ./playground/Pretrained_models/grounded_sam2/sam2.1_hiera_large.pt
      sam2_model_config: configs/sam2.1/sam2.1_hiera_l.yaml
      text_threshold: 0.25
      video_prompt: mask
    grounding_mode: image
    # Weight of the world-model loss relative to the main objective.
    loss_weight: 0.05
    supervision: per_frame
    world_query_num: 4
# NOTE(review): output_dir sits under run_root_dir but does not embed run_id —
# confirm the trainer does not also append run_id to run_root_dir.
output_dir: ./result/output/Dynamic_VLA/20260301-qwenoft-robotwin_dynamic_task-qwenaction-world-query-flow-stage2-h4s4f4s4-h64w64-dynamic-35task
run_id: puma-domino-dynamic-35task
run_root_dir: ./result/output/Dynamic_VLA
seed: 42
trainer:
  # Step intervals for evaluation / logging / checkpointing.
  eval_interval: 1000
  freeze_modules: null
  gradient_accumulation_steps: 1
  gradient_clipping: 1.0
  is_resume: false
  # Per-module learning rates: action head trains 10x faster than the VLM backbone.
  learning_rate:
    action_model: 0.0001
    base: 1.0e-05
    qwen_vl_interface: 1.0e-05
  logging_frequency: 100
  lr_scheduler_type: cosine_with_min_lr
  max_train_steps: 100000
  num_warmup_steps: 5000
  # AdamW-style optimizer hyperparameters (beta2=0.95 is common for large-model
  # training); optimizer class itself is not specified here — TODO confirm.
  optimizer:
    betas:
    - 0.9
    - 0.95
    eps: 1.0e-08
    weight_decay: 1.0e-08
  save_interval: 10000
  scheduler_specific_kwargs:
    # Floor LR for the cosine_with_min_lr schedule.
    min_lr: 5.0e-07
# Weights & Biases experiment tracking destination.
wandb_entity: heng_
wandb_project: Dynamic_VLA