# Loadable config for SemanticVLA-SimplerEnv (SimplerEnv WidowX policy).
#
# Load via:
#     from semanticvla.model.framework.base_framework import baseframework
#     policy = baseframework.from_pretrained("pytorch_model.pt")
#
# The loader walks two directory levels up from the checkpoint file to locate
# this `config.yaml` and the sibling `dataset_statistics.json`.
# NOTE(review): presumably the RNG seed for the run — confirm where the
# loader/trainer consumes it.
seed: 42
# NOTE(review): the nesting in this section was rebuilt from a copy whose
# indentation had been stripped. Parent/child placement follows key grouping
# and should be checked against the loader before relying on it.
framework:
  name: SemanticVLA
  qwenvl:
    base_vlm: Qwen/Qwen3-VL-4B-Instruct
    attn_implementation: flash_attention_2
    vl_hidden_dim: 2048
  dino:
    dino_backbone: dinov2_vits14
  action_model:
    action_model_type: DiT-B
    action_hidden_dim: 1024
    hidden_size: 1024
    add_pos_embed: true
    max_seq_len: 1024
    action_dim: 7
    state_dim: 7
    # future_action_window_size + 1 equals action_horizon (15 + 1 = 16).
    future_action_window_size: 15
    action_horizon: 16
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    diffusion_model_cfg:
      # Same value as qwenvl.vl_hidden_dim (2048); presumably these must stay
      # in sync — confirm.
      cross_attention_dim: 2048
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      positional_embeddings: null
      # NOTE(review): progress_dim and trace_dim are kept inside
      # diffusion_model_cfg because they continue its alphabetical key order.
      # They may instead belong directly under action_model — confirm.
      progress_dim: 0
      trace_dim: 0
  trace:
    # Plain string "none", not YAML null.
    injection_mode: none
    hidden_dim: 256
    num_layers: 3
    num_heads: 8
    window_size: 12
    num_tokens: 4
    dropout: 0.1
    num_anchor_points: 4
    lm_aux_loss: false
    aux_loss_weight: 0.1
    coord_range: 1000
    prompt_style: plain
  semantic_output:
    enabled: true
    mode: trace_latent
    order: trace_latent
    lm_loss_weight: 0.1
    latent_vocab_size: 32
    latent_num_tokens: 4
    latent_token_prefix: LAM
    prompt_style: plain
    trace_anchor_points: 4
    parse_trace_for_decoder: false
    trainable_token_rows: false
  reduce_in_full_precision: true
# NOTE(review): the nesting in this section was rebuilt from a copy whose
# indentation had been stripped; verify placement against the data loader.
datasets:
  vla_data:
    dataset_py: lerobot_datasets
    # Placeholder path; point this at the local dataset directory.
    data_root_dir: /path/to/bridge_lerobot
    data_mix: bridge
    statistics_key: oxe_bridge
    action_horizon: 16
    image_size: [224, 224]
    default_image_resolution: [3, 224, 224]
    per_device_batch_size: 16
    num_workers: 4
    # NOTE(review): trace and latent_action_labels are placed under vla_data;
    # they could instead be siblings of vla_data — confirm.
    trace:
      enabled: true
      # Placeholder path.
      root: /path/to/trace_annotations/bridge
      window_size: 12
      normalize: true
      num_anchor_points: 4
    latent_action_labels:
      enabled: true
      # Placeholder path.
      root: /path/to/lam_labels
      variant: semanticvla_lam
      strict: true
      missing_policy: clip
      out_key: latent_action_idx
# NOTE(review): the nesting in this section was rebuilt from a copy whose
# indentation had been stripped; verify placement against the trainer.
trainer:
  epochs: 100
  max_train_steps: 100000
  # NOTE(review): warmup_ratio below (0.1 of max_train_steps = 10000 steps)
  # does not agree with this value — confirm which one the scheduler reads.
  num_warmup_steps: 5000
  save_interval: 5000
  eval_interval: 2000
  learning_rate:
    base: 4.0e-05
    qwen_vl_interface: 1.0e-05
    action_model: 1.0e-04
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    min_lr: 5.0e-07
  # Empty string, i.e. no module names listed.
  freeze_modules: ''
  loss_scale:
    vla: 1.0
    vlm: 0.1
  # NOTE(review): max_grad_norm and gradient_clipping carry the same value;
  # confirm whether both are consumed.
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  # NOTE(review): differs from optimizer.weight_decay (1.0e-08) — confirm
  # which one takes effect.
  weight_decay: 0.0
  logging_frequency: 100
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1
  optimizer:
    name: AdamW
    betas: [0.9, 0.95]
    eps: 1.0e-08
    weight_decay: 1.0e-08
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true