File size: 3,273 Bytes
152ab68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Loadable config for SemanticVLA-SimplerEnv (SimplerEnv WidowX policy).
#
# Load via:
#   from semanticvla.model.framework.base_framework import baseframework
#   policy = baseframework.from_pretrained("pytorch_model.pt")
#
# The loader walks two directory levels up from the checkpoint file to locate
# this `config.yaml` and the sibling `dataset_statistics.json`.

# Seed value. Presumably applied to the RNGs by the training/eval entry
# point; the consumer is not visible in this file -- TODO confirm.
seed: 42

# Model definition. Sub-sections configure the vision-language backbone
# (`qwenvl`), an image backbone (`dino`) and the action head (`action_model`).
# NOTE(review): the roles above are read off the key names; confirm against
# the framework loader, which is not part of this file.
framework:
  name: SemanticVLA
  qwenvl:
    # Looks like a Hugging Face Hub model id -- TODO confirm how it is resolved.
    base_vlm: Qwen/Qwen3-VL-4B-Instruct
    # Presumably forwarded to the backbone loader; this value normally needs
    # the flash-attn package to be installed -- verify for the target machine.
    attn_implementation: flash_attention_2
    # Same value as `action_model.diffusion_model_cfg.cross_attention_dim`.
    # NOTE(review): confirm 2048 matches the hidden size of `base_vlm`, or that
    # the loader overrides it at runtime.
    vl_hidden_dim: 2048
  dino:
    dino_backbone: dinov2_vits14
  action_model:
    action_model_type: DiT-B
    # `action_hidden_dim`, `hidden_size` and `diffusion_model_cfg.output_dim`
    # all carry the same value (1024) in this file; keep them in sync.
    action_hidden_dim: 1024
    hidden_size: 1024
    add_pos_embed: true
    max_seq_len: 1024
    # Action and state vectors are both 7-dimensional in this config.
    action_dim: 7
    state_dim: 7
    # Note 15 + 1 = 16: `action_horizon` equals `future_action_window_size`
    # plus one (presumably the current step -- TODO confirm). The same horizon
    # value is repeated in `datasets.vla_data.action_horizon`.
    future_action_window_size: 15
    action_horizon: 16
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    # Noise/timestep sampling parameters. Presumably a Beta(alpha, beta)
    # distribution scaled by `noise_s` -- inferred from names, verify in code.
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    # Settings for the diffusion transformer itself.
    diffusion_model_cfg:
      cross_attention_dim: 2048
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      # Explicit YAML null (not the string "null").
      positional_embeddings: null
      # Both auxiliary dims are 0 here; presumably that disables the
      # corresponding inputs -- TODO confirm.
      progress_dim: 0
      trace_dim: 0
    # Trace branch settings. `window_size` and `num_anchor_points` carry the
    # same values as `datasets.vla_data.trace` (12 and 4).
    trace:
      # Plain `none` is parsed as the string "none", not as YAML null.
      # NOTE(review): with this value the remaining trace keys may be unused
      # by the model -- confirm against the consumer.
      injection_mode: none
      hidden_dim: 256
      num_layers: 3
      num_heads: 8
      window_size: 12
      num_tokens: 4
      dropout: 0.1
      num_anchor_points: 4
      lm_aux_loss: false
      # Presumably only relevant when `lm_aux_loss` is true -- TODO confirm.
      aux_loss_weight: 0.1
      coord_range: 1000
      prompt_style: plain
    # Semantic output settings; `mode` and `order` hold the same value here.
    semantic_output:
      enabled: true
      mode: trace_latent
      order: trace_latent
      # Same value as `trainer.loss_scale.vlm` (0.1).
      # NOTE(review): check whether both weights are applied to the same loss.
      lm_loss_weight: 0.1
      latent_vocab_size: 32
      # Same value as `trace.num_tokens` above (4).
      latent_num_tokens: 4
      latent_token_prefix: LAM
      prompt_style: plain
      # Same value as `trace.num_anchor_points` above (4).
      trace_anchor_points: 4
      parse_trace_for_decoder: false
      trainable_token_rows: false
  # Consumer not visible here; presumably selects full-precision reductions
  # under mixed-precision training -- TODO confirm.
  reduce_in_full_precision: true

# Dataset settings. The `/path/to/...` values below are placeholders and must
# be replaced with real locations before this config is used to load data.
datasets:
  vla_data:
    dataset_py: lerobot_datasets
    # Placeholder path.
    data_root_dir: /path/to/bridge_lerobot
    data_mix: bridge
    # Presumably the entry looked up in the sibling `dataset_statistics.json`
    # mentioned in the file header -- TODO confirm.
    statistics_key: oxe_bridge
    # Same value as `framework.action_model.action_horizon` (16).
    action_horizon: 16
    # `default_image_resolution` is 3 followed by the same 224 x 224 as
    # `image_size`; presumably channels-first (C, H, W) -- TODO confirm.
    image_size: [224, 224]
    default_image_resolution: [3, 224, 224]
    per_device_batch_size: 16
    num_workers: 4
    # Trace annotations. `window_size` and `num_anchor_points` carry the same
    # values as `framework.action_model.trace` (12 and 4).
    trace:
      enabled: true
      # Placeholder path.
      root: /path/to/trace_annotations/bridge
      window_size: 12
      normalize: true
      num_anchor_points: 4
    # Latent-action labels.
    latent_action_labels:
      enabled: true
      # Placeholder path.
      root: /path/to/lam_labels
      variant: semanticvla_lam
      # NOTE(review): `strict: true` is set together with
      # `missing_policy: clip`; how the two interact is not visible here.
      strict: true
      missing_policy: clip
      # Presumably the sample key under which the label index is stored --
      # verify against the dataset code.
      out_key: latent_action_idx

# Training-loop settings. Several keys below overlap; the review notes mark
# the pairs whose precedence cannot be determined from this file alone.
trainer:
  # NOTE(review): both an epoch count and a step cap are given; which bound
  # ends training is decided by the trainer, not by this file.
  epochs: 100
  max_train_steps: 100000
  # NOTE(review): `warmup_ratio` further down is 0.1, and 0.1 * 100000 steps is
  # 10000, which disagrees with the 5000 here. Confirm which key is read.
  num_warmup_steps: 5000
  # Intervals are presumably counted in optimizer steps -- TODO confirm.
  save_interval: 5000
  eval_interval: 2000
  # Learning rates. `qwen_vl_interface` and `action_model` look like
  # per-module overrides of `base` -- inferred from the key names, verify.
  learning_rate:
    base: 4.0e-05
    qwen_vl_interface: 1.0e-05
    action_model: 1.0e-04
  # The scheduler name and the `min_lr` kwarg match the Hugging Face
  # scheduler naming; presumably passed through to it -- TODO confirm.
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    min_lr: 5.0e-07
  # Empty string; presumably means no modules are frozen -- TODO confirm.
  freeze_modules: ''
  # Per-loss weights. `vlm` has the same value as
  # `framework.action_model.semantic_output.lm_loss_weight` (0.1).
  loss_scale:
    vla: 1.0
    vlm: 0.1
  # NOTE(review): `max_grad_norm` and `gradient_clipping` (below) are both
  # 1.0 and look redundant; confirm which one the trainer reads.
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  # NOTE(review): 0.0 here, but `optimizer.weight_decay` below is 1.0e-08;
  # confirm which value reaches the optimizer.
  weight_decay: 0.0
  logging_frequency: 100
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1
  # Optimizer hyperparameters; the keys match the argument names of AdamW.
  optimizer:
    name: AdamW
    betas: [0.9, 0.95]
    eps: 1.0e-08
    weight_decay: 1.0e-08
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true