spikefly
/

SemanticVLA-SimplerEnv

vision-language-action

Model card Files Files and versions

SemanticVLA-SimplerEnv / config.yaml

spikefly's picture

Add files using upload-large-folder tool

152ab68 verified 4 days ago

history blame contribute delete

3.27 kB

	# Loadable config for SemanticVLA-SimplerEnv (SimplerEnv WidowX policy).
	#
	# Load via:
	# from semanticvla.model.framework.base_framework import baseframework
	# policy = baseframework.from_pretrained("pytorch_model.pt")
	#
	# The loader walks two directory levels up from the checkpoint file to locate
	# this `config.yaml` and the sibling `dataset_statistics.json`.

	# Top-level seed value. Presumably the global RNG seed for the run; confirm
	# which component reads it.
	seed: 42

	# Model definition: VLM backbone, DINO encoder, action head, trace module and
	# semantic-output head.
	#
	# NOTE(review): the hierarchy below is reconstructed. With every key at one
	# level, `dropout`, `num_layers` and `prompt_style` would each appear twice in
	# the same mapping, so sub-mappings are required. Placements that could not
	# be determined with certainty are marked individually; confirm them against
	# the loader before relying on this file.
	framework:
	  name: SemanticVLA

	  # Vision-language backbone.
	  qwenvl:
	    base_vlm: Qwen/Qwen3-VL-4B-Instruct
	    attn_implementation: flash_attention_2
	    vl_hidden_dim: 2048

	  dino:
	    dino_backbone: dinov2_vits14

	  # Action head settings.
	  action_model:
	    action_model_type: DiT-B
	    action_hidden_dim: 1024
	    hidden_size: 1024
	    add_pos_embed: true
	    max_seq_len: 1024
	    action_dim: 7
	    state_dim: 7
	    # action_horizon (16) equals future_action_window_size (15) + 1;
	    # presumably current step plus future window — confirm.
	    future_action_window_size: 15
	    action_horizon: 16
	    past_action_window_size: 0
	    repeated_diffusion_steps: 8
	    noise_beta_alpha: 1.5
	    noise_beta_beta: 1.0
	    noise_s: 0.999
	    num_timestep_buckets: 1000
	    num_inference_timesteps: 4
	    num_target_vision_tokens: 32
	    diffusion_model_cfg:
	      # cross_attention_dim matches qwenvl.vl_hidden_dim (2048).
	      cross_attention_dim: 2048
	      dropout: 0.2
	      final_dropout: true
	      interleave_self_attention: true
	      norm_type: ada_norm
	      num_layers: 16
	      output_dim: 1024
	      positional_embeddings: null
	      # NOTE(review): progress_dim / trace_dim are kept here because they
	      # continue this mapping's alphabetical key order; they may instead
	      # belong directly under `action_model` — confirm.
	      progress_dim: 0
	      trace_dim: 0

	  # NOTE(review): `trace` is placed as a sibling of `action_model`; it may
	  # instead be nested inside `action_model` — confirm.
	  trace:
	    injection_mode: none
	    hidden_dim: 256
	    num_layers: 3
	    num_heads: 8
	    window_size: 12
	    num_tokens: 4
	    dropout: 0.1
	    num_anchor_points: 4
	    lm_aux_loss: false
	    aux_loss_weight: 0.1
	    coord_range: 1000
	    prompt_style: plain

	  semantic_output:
	    enabled: true
	    mode: trace_latent
	    order: trace_latent
	    lm_loss_weight: 0.1
	    latent_vocab_size: 32
	    latent_num_tokens: 4
	    latent_token_prefix: LAM
	    prompt_style: plain
	    trace_anchor_points: 4
	    parse_trace_for_decoder: false
	    trainable_token_rows: false

	  # NOTE(review): assumed to be a framework-level flag rather than a
	  # `semantic_output` option — confirm.
	  reduce_in_full_precision: true

	# Dataset configuration. The `/path/to/...` values are placeholders and must
	# be replaced with real locations before use.
	#
	# NOTE(review): the hierarchy below is reconstructed. `enabled` and `root`
	# each appear twice, so `trace` and `latent_action_labels` must be separate
	# sub-mappings. They are nested under `vla_data` here; they could instead be
	# direct children of `datasets` — confirm against the loader.
	datasets:
	  vla_data:
	    dataset_py: lerobot_datasets
	    data_root_dir: /path/to/bridge_lerobot
	    data_mix: bridge
	    statistics_key: oxe_bridge
	    action_horizon: 16
	    image_size: [224, 224]
	    default_image_resolution: [3, 224, 224]
	    per_device_batch_size: 16
	    num_workers: 4

	    # Trace annotation source. window_size (12) and num_anchor_points (4)
	    # repeat the values in `framework.trace`; presumably they must stay in
	    # sync — confirm.
	    trace:
	      enabled: true
	      root: /path/to/trace_annotations/bridge
	      window_size: 12
	      normalize: true
	      num_anchor_points: 4

	    latent_action_labels:
	      enabled: true
	      root: /path/to/lam_labels
	      variant: semanticvla_lam
	      strict: true
	      missing_policy: clip
	      out_key: latent_action_idx

	# Training-loop settings: schedule, per-module learning rates, loss weights
	# and optimizer.
	#
	# NOTE(review): the hierarchy below is reconstructed. `weight_decay` appears
	# twice, so the second occurrence must belong to the `optimizer` sub-mapping.
	trainer:
	  epochs: 100
	  max_train_steps: 100000
	  # NOTE(review): num_warmup_steps (5000) and warmup_ratio (0.1, i.e. 10000
	  # of max_train_steps) are both set and disagree; which one the trainer
	  # honors is not visible here — confirm.
	  num_warmup_steps: 5000
	  save_interval: 5000
	  eval_interval: 2000

	  # Learning rates: `base` plus per-module entries.
	  learning_rate:
	    base: 4.0e-05
	    qwen_vl_interface: 1.0e-05
	    action_model: 1.0e-04
	  lr_scheduler_type: cosine_with_min_lr
	  scheduler_specific_kwargs:
	    min_lr: 5.0e-07

	  # Empty string: no modules are listed for freezing.
	  freeze_modules: ''

	  loss_scale:
	    vla: 1.0
	    vlm: 0.1

	  # NOTE(review): max_grad_norm and gradient_clipping carry the same value
	  # (1.0); they look like two names for one setting — confirm which is read.
	  max_grad_norm: 1.0
	  warmup_ratio: 0.1
	  # NOTE(review): differs from optimizer.weight_decay (1.0e-08) — confirm
	  # which one takes effect.
	  weight_decay: 0.0
	  logging_frequency: 100
	  gradient_clipping: 1.0
	  gradient_accumulation_steps: 1

	  optimizer:
	    name: AdamW
	    betas: [0.9, 0.95]
	    eps: 1.0e-08
	    weight_decay: 1.0e-08

	  enable_gradient_checkpointing: true
	  enable_mixed_precision_training: true