---
# Loadable config for SemanticVLA-SimplerEnv (SimplerEnv WidowX policy).
#
# Load via:
#   from semanticvla.model.framework.base_framework import baseframework
#   policy = baseframework.from_pretrained("pytorch_model.pt")
#
# The loader walks two directory levels up from the checkpoint file to locate
# this `config.yaml` and the sibling `dataset_statistics.json`.
#
# NOTE(review): this file had lost its line breaks and indentation, so the
# nesting below was rebuilt from key order. Keys and values are unchanged;
# placements marked NOTE(review) are best guesses -- confirm them against
# the loader before relying on this file.

seed: 42

framework:
  name: SemanticVLA

  qwenvl:
    base_vlm: Qwen/Qwen3-VL-4B-Instruct
    attn_implementation: flash_attention_2
    vl_hidden_dim: 2048

  dino:
    dino_backbone: dinov2_vits14

  action_model:
    action_model_type: DiT-B
    action_hidden_dim: 1024
    hidden_size: 1024
    add_pos_embed: true
    max_seq_len: 1024
    action_dim: 7
    state_dim: 7
    # NOTE(review): action_horizon equals future_action_window_size + 1 here;
    # presumably the two must be kept in sync -- confirm.
    future_action_window_size: 15
    action_horizon: 16
    past_action_window_size: 0
    repeated_diffusion_steps: 8
    noise_beta_alpha: 1.5
    noise_beta_beta: 1.0
    noise_s: 0.999
    num_timestep_buckets: 1000
    num_inference_timesteps: 4
    num_target_vision_tokens: 32
    diffusion_model_cfg:
      # Same value as qwenvl.vl_hidden_dim above; presumably they must match
      # -- TODO confirm.
      cross_attention_dim: 2048
      dropout: 0.2
      final_dropout: true
      interleave_self_attention: true
      norm_type: ada_norm
      num_layers: 16
      output_dim: 1024
      positional_embeddings: null
    # NOTE(review): these two keys directly followed diffusion_model_cfg's
    # entries in the flattened text; they may belong inside
    # diffusion_model_cfg instead of here -- confirm.
    progress_dim: 0
    trace_dim: 0

  # NOTE(review): `trace` and `semantic_output` are placed as children of
  # `framework`; they could instead be children of `action_model` -- confirm.
  trace:
    # Plain `none` is the string "none", not a YAML null.
    injection_mode: none
    hidden_dim: 256
    num_layers: 3
    num_heads: 8
    window_size: 12
    num_tokens: 4
    dropout: 0.1
    num_anchor_points: 4
    lm_aux_loss: false
    aux_loss_weight: 0.1
    coord_range: 1000
    prompt_style: plain

  semantic_output:
    enabled: true
    mode: trace_latent
    order: trace_latent
    lm_loss_weight: 0.1
    latent_vocab_size: 32
    latent_num_tokens: 4
    latent_token_prefix: LAM
    prompt_style: plain
    trace_anchor_points: 4
    parse_trace_for_decoder: false
    trainable_token_rows: false

  reduce_in_full_precision: true

datasets:
  vla_data:
    dataset_py: lerobot_datasets
    # Placeholder path: point this at the local dataset before use.
    data_root_dir: /path/to/bridge_lerobot
    data_mix: bridge
    statistics_key: oxe_bridge
    action_horizon: 16
    image_size: [224, 224]
    default_image_resolution: [3, 224, 224]
    per_device_batch_size: 16
    num_workers: 4
    # NOTE(review): `trace` and `latent_action_labels` are placed under
    # `vla_data`; they could instead sit directly under `datasets` -- confirm.
    trace:
      enabled: true
      # Placeholder path: point this at the local trace annotations.
      root: /path/to/trace_annotations/bridge
      # window_size and num_anchor_points repeat the framework.trace values.
      window_size: 12
      normalize: true
      num_anchor_points: 4
    latent_action_labels:
      enabled: true
      # Placeholder path: point this at the local latent-action labels.
      root: /path/to/lam_labels
      variant: semanticvla_lam
      strict: true
      missing_policy: clip
      out_key: latent_action_idx

trainer:
  epochs: 100
  max_train_steps: 100000
  # NOTE(review): num_warmup_steps (5000) and warmup_ratio (0.1, further
  # down) are both set; 0.1 of max_train_steps would be 10000. Confirm which
  # one the trainer reads.
  num_warmup_steps: 5000
  save_interval: 5000
  eval_interval: 2000
  learning_rate:
    base: 4.0e-05
    qwen_vl_interface: 1.0e-05
    action_model: 1.0e-04
  lr_scheduler_type: cosine_with_min_lr
  scheduler_specific_kwargs:
    min_lr: 5.0e-07
  # Explicit empty string (not null).
  freeze_modules: ''
  loss_scale:
    vla: 1.0
    vlm: 0.1
  # NOTE(review): max_grad_norm and gradient_clipping (below) carry the same
  # value; confirm whether both are consumed.
  max_grad_norm: 1.0
  warmup_ratio: 0.1
  # NOTE(review): differs from optimizer.weight_decay (1.0e-08) below;
  # confirm which one takes effect.
  weight_decay: 0.0
  logging_frequency: 100
  gradient_clipping: 1.0
  gradient_accumulation_steps: 1
  optimizer:
    name: AdamW
    betas: [0.9, 0.95]
    eps: 1.0e-08
    weight_decay: 1.0e-08
  enable_gradient_checkpointing: true
  enable_mixed_precision_training: true