| |
| |
| |
| |
| |
| |
| |
| |
|
|
| seed: 42 |
|
|
| framework: |
| name: SemanticVLA |
| qwenvl: |
| base_vlm: Qwen/Qwen3-VL-4B-Instruct |
| attn_implementation: flash_attention_2 |
| vl_hidden_dim: 2048 |
| dino: |
| dino_backbone: dinov2_vits14 |
| action_model: |
| action_model_type: DiT-B |
| action_hidden_dim: 1024 |
| hidden_size: 1024 |
| add_pos_embed: true |
| max_seq_len: 1024 |
| action_dim: 7 |
| state_dim: 7 |
| future_action_window_size: 15 |
| action_horizon: 16 |
| past_action_window_size: 0 |
| repeated_diffusion_steps: 8 |
| noise_beta_alpha: 1.5 |
| noise_beta_beta: 1.0 |
| noise_s: 0.999 |
| num_timestep_buckets: 1000 |
| num_inference_timesteps: 4 |
| num_target_vision_tokens: 32 |
| diffusion_model_cfg: |
| cross_attention_dim: 2048 |
| dropout: 0.2 |
| final_dropout: true |
| interleave_self_attention: true |
| norm_type: ada_norm |
| num_layers: 16 |
| output_dim: 1024 |
| positional_embeddings: null |
| progress_dim: 0 |
| trace_dim: 0 |
| trace: |
| injection_mode: none |
| hidden_dim: 256 |
| num_layers: 3 |
| num_heads: 8 |
| window_size: 12 |
| num_tokens: 4 |
| dropout: 0.1 |
| num_anchor_points: 4 |
| lm_aux_loss: false |
| aux_loss_weight: 0.1 |
| coord_range: 1000 |
| prompt_style: plain |
| semantic_output: |
| enabled: true |
| mode: trace_latent |
| order: trace_latent |
| lm_loss_weight: 0.1 |
| latent_vocab_size: 32 |
| latent_num_tokens: 4 |
| latent_token_prefix: LAM |
| prompt_style: plain |
| trace_anchor_points: 4 |
| parse_trace_for_decoder: false |
| trainable_token_rows: false |
| reduce_in_full_precision: true |
|
|
| datasets: |
| vla_data: |
| dataset_py: lerobot_datasets |
| data_root_dir: /path/to/bridge_lerobot |
| data_mix: bridge |
| statistics_key: oxe_bridge |
| action_horizon: 16 |
| image_size: [224, 224] |
| default_image_resolution: [3, 224, 224] |
| per_device_batch_size: 16 |
| num_workers: 4 |
| trace: |
| enabled: true |
| root: /path/to/trace_annotations/bridge |
| window_size: 12 |
| normalize: true |
| num_anchor_points: 4 |
| latent_action_labels: |
| enabled: true |
| root: /path/to/lam_labels |
| variant: semanticvla_lam |
| strict: true |
| missing_policy: clip |
| out_key: latent_action_idx |
|
|
| trainer: |
| epochs: 100 |
| max_train_steps: 100000 |
| num_warmup_steps: 5000 |
| save_interval: 5000 |
| eval_interval: 2000 |
| learning_rate: |
| base: 4.0e-05 |
| qwen_vl_interface: 1.0e-05 |
| action_model: 1.0e-04 |
| lr_scheduler_type: cosine_with_min_lr |
| scheduler_specific_kwargs: |
| min_lr: 5.0e-07 |
| freeze_modules: '' |
| loss_scale: |
| vla: 1.0 |
| vlm: 0.1 |
| max_grad_norm: 1.0 |
| warmup_ratio: 0.1 |
| weight_decay: 0.0 |
| logging_frequency: 100 |
| gradient_clipping: 1.0 |
| gradient_accumulation_steps: 1 |
| optimizer: |
| name: AdamW |
| betas: [0.9, 0.95] |
| eps: 1.0e-08 |
| weight_decay: 1.0e-08 |
| enable_gradient_checkpointing: true |
| enable_mixed_precision_training: true |
|
|