lerobot
/

VLA-JEPA-SimplerEnv

Model card Files Files and versions

VLA-JEPA-SimplerEnv / config.json

maximellerbach's picture

Add gripper_dim and gripper_threshold to config

aa1546c verified 6 days ago

history blame contribute delete

2.37 kB

	{
	"type": "vla_jepa",
	"n_obs_steps": 1,
	"input_features": {
	"observation.images.image": {
	"type": "VISUAL",
	"shape": [
	3,
	224,
	224
	]
	}
	},
	"output_features": {
	"action": {
	"type": "ACTION",
	"shape": [
	7
	]
	}
	},
	"device": null,
	"use_amp": false,
	"use_peft": false,
	"push_to_hub": true,
	"repo_id": null,
	"private": null,
	"tags": null,
	"license": null,
	"pretrained_path": null,
	"chunk_size": 7,
	"n_action_steps": 7,
	"normalization_mapping": {
	"VISUAL": "IDENTITY",
	"STATE": "MEAN_STD",
	"ACTION": "MIN_MAX"
	},
	"qwen_model_name": "Qwen/Qwen3-VL-2B-Instruct",
	"jepa_encoder_name": "facebook/vjepa2-vitl-fpc64-256",
	"freeze_qwen": false,
	"enable_world_model": true,
	"reinit_modules": null,
	"tokenizer_padding_side": "left",
	"prompt_template": "Your task is {instruction}. Infer the temporal dynamics from frames {actions} and produce the corresponding policy actions {e_actions}.",
	"special_action_token": "<\|action_{}\|>",
	"embodied_action_token": "<\|embodied_action\|>",
	"action_dim": 7,
	"state_dim": 8,
	"num_action_tokens_per_timestep": 8,
	"num_embodied_action_tokens_per_instruction": 32,
	"num_inference_timesteps": 4,
	"action_hidden_size": 1024,
	"action_model_type": "DiT-B",
	"action_num_layers": 16,
	"action_num_heads": 12,
	"action_attention_head_dim": 64,
	"action_dropout": 0.2,
	"action_num_timestep_buckets": 1000,
	"action_noise_beta_alpha": 1.5,
	"action_noise_beta_beta": 1.0,
	"action_noise_s": 0.999,
	"num_target_vision_tokens": 32,
	"action_max_seq_len": 1024,
	"num_video_frames": 8,
	"predictor_depth": 12,
	"predictor_num_heads": 8,
	"predictor_mlp_ratio": 4.0,
	"predictor_dropout": 0.0,
	"world_model_loss_weight": 0.1,
	"jepa_tubelet_size": 2,
	"repeated_diffusion_steps": 8,
	"resize_images_to": [
	224,
	224
	],
	"binarize_gripper_action": true,
	"pre_snap_gripper_action": true,
	"clip_normalized_actions": true,
	"torch_dtype": "bfloat16",
	"optimizer_lr": 0.0001,
	"optimizer_betas": [
	0.9,
	0.95
	],
	"optimizer_eps": 1e-08,
	"optimizer_weight_decay": 1e-08,
	"optimizer_grad_clip_norm": 1.0,
	"scheduler_warmup_steps": 5000,
	"scheduler_decay_steps": 30000,
	"scheduler_decay_lr": 1e-05,
	"gripper_dim": 6,
	"gripper_threshold": 0.5
	}