# SemanticVLA-SimplerEnv / config.yaml
# Hosting-page header (not part of the config): uploaded by spikefly,
# commit 152ab68 (verified), message "Add files using upload-large-folder tool".
# Loadable config for SemanticVLA-SimplerEnv (SimplerEnv WidowX policy).
#
# Load via:
#
#     from semanticvla.model.framework.base_framework import baseframework
#     policy = baseframework.from_pretrained("pytorch_model.pt")
#
# The loader walks two directory levels up from the checkpoint file to locate
# this `config.yaml` and the sibling `dataset_statistics.json`.
# Top-level integer seed. NOTE(review): presumably the global RNG seed for the
# run; which components consume it is not visible in this file.
seed: 42
# Model definition for the SemanticVLA policy.
#
# NOTE(review): this listing had lost all indentation, which left every section
# header with a null value and made repeated keys (`dropout`, `num_layers`,
# `prompt_style`) collide in one flat mapping. The nesting below is
# reconstructed from how the keys group together; confirm it against the code
# that reads this config before relying on it.
framework:
  name: SemanticVLA

  qwenvl:
    base_vlm: Qwen/Qwen3-VL-4B-Instruct
    attn_implementation: flash_attention_2
    vl_hidden_dim: 2048

  dino:
    dino_backbone: dinov2_vits14

  action_model:
    action_model_type: DiT-B
    action_hidden_dim: 1024
    hidden_size: 1024
    add_pos_embed: true
    max_seq_len: 1024
    action_dim: 7
    state_dim: 7
    future_action_window_size: 15
    # Same value as the `action_horizon: 16` entry in the `datasets` section.
    action_horizon: 16
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    diffusion_model_cfg:
      # Same value as `qwenvl.vl_hidden_dim` above; presumably the two must
      # stay in sync -- confirm.
      cross_attention_dim: 2048
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      positional_embeddings: null  # YAML null, not a string
      # NOTE(review): kept in this mapping because they continue its
      # alphabetical key order; confirm they are not `action_model`-level keys.
      progress_dim: 0
      trace_dim: 0

  # NOTE(review): placed at framework level (sibling of `action_model`);
  # confirm the parent mapping.
  trace:
    injection_mode: none  # the plain string "none", not YAML null
    hidden_dim: 256
    num_layers: 3
    num_heads: 8
    window_size: 12
    num_tokens: 4
    dropout: 0.1
    num_anchor_points: 4
    lm_aux_loss: false
    aux_loss_weight: 0.1
    coord_range: 1000
    prompt_style: plain

  # NOTE(review): placed at framework level; confirm the parent mapping.
  semantic_output:
    enabled: true
    mode: trace_latent
    order: trace_latent
    lm_loss_weight: 0.1
    latent_vocab_size: 32
    latent_num_tokens: 4
    latent_token_prefix: LAM
    prompt_style: plain
    trace_anchor_points: 4
    parse_trace_for_decoder: false
    trainable_token_rows: false

  # NOTE(review): assumed to be a framework-level flag rather than part of
  # `semantic_output`; confirm.
  reduce_in_full_precision: true
# Dataset configuration.
#
# NOTE(review): this listing had lost all indentation, so `enabled` and `root`
# each appeared twice in one flat mapping and the section headers parsed as
# null. The nesting below is reconstructed from key grouping; confirm it
# against the dataset-loading code.
datasets:
  vla_data:
    dataset_py: lerobot_datasets
    # Placeholder path -- replace with the real dataset location.
    data_root_dir: /path/to/bridge_lerobot
    data_mix: bridge
    statistics_key: oxe_bridge
    action_horizon: 16
    image_size: [224, 224]
    default_image_resolution: [3, 224, 224]
    per_device_batch_size: 16
    num_workers: 4

    # NOTE(review): `trace` and `latent_action_labels` are placed under
    # `vla_data`; they may instead be direct children of `datasets` -- confirm.
    trace:
      enabled: true
      # Placeholder path -- replace with the real annotation location.
      root: /path/to/trace_annotations/bridge
      window_size: 12
      normalize: true
      num_anchor_points: 4

    latent_action_labels:
      enabled: true
      # Placeholder path -- replace with the real label location.
      root: /path/to/lam_labels
      variant: semanticvla_lam
      strict: true
      missing_policy: clip
      out_key: latent_action_idx
# Training-loop settings.
#
# NOTE(review): this listing had lost all indentation, so `weight_decay`
# appeared twice in one flat mapping and `learning_rate`, `loss_scale`,
# `scheduler_specific_kwargs` and `optimizer` parsed as null. The nesting below
# is reconstructed from key grouping; confirm it against the trainer code.
trainer:
  epochs: 100
  max_train_steps: 100000
  # NOTE(review): 5000 steps is 5% of `max_train_steps`, whereas
  # `warmup_ratio` below is 0.1; confirm which of the two the scheduler uses.
  num_warmup_steps: 5000
  save_interval: 5000
  eval_interval: 2000

  # Three separate learning-rate entries: `base` plus two named overrides.
  learning_rate:
    base: 4.0e-05
    qwen_vl_interface: 1.0e-05
    action_model: 1.0e-04
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    min_lr: 5.0e-07

  freeze_modules: ''  # empty string
  loss_scale:
    vla: 1.0
    vlm: 0.1

  # NOTE(review): `max_grad_norm` and `gradient_clipping` are both 1.0; it is
  # not visible from this file which one the trainer reads.
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  # Trainer-level value; `optimizer.weight_decay` below is set separately.
  weight_decay: 0.0
  logging_frequency: 100
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1

  optimizer:
    name: AdamW
    betas: [0.9, 0.95]
    eps: 1.0e-08
    weight_decay: 1.0e-08

  # NOTE(review): assumed to be trainer-level flags rather than optimizer
  # arguments; confirm.
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true