VLA-JEPA-SimplerEnv / config.json
maximellerbach's picture
Add gripper_dim and gripper_threshold to config
aa1546c verified
{
"type": "vla_jepa",
"n_obs_steps": 1,
"input_features": {
"observation.images.image": {
"type": "VISUAL",
"shape": [
3,
224,
224
]
}
},
"output_features": {
"action": {
"type": "ACTION",
"shape": [
7
]
}
},
"device": null,
"use_amp": false,
"use_peft": false,
"push_to_hub": true,
"repo_id": null,
"private": null,
"tags": null,
"license": null,
"pretrained_path": null,
"chunk_size": 7,
"n_action_steps": 7,
"normalization_mapping": {
"VISUAL": "IDENTITY",
"STATE": "MEAN_STD",
"ACTION": "MIN_MAX"
},
"qwen_model_name": "Qwen/Qwen3-VL-2B-Instruct",
"jepa_encoder_name": "facebook/vjepa2-vitl-fpc64-256",
"freeze_qwen": false,
"enable_world_model": true,
"reinit_modules": null,
"tokenizer_padding_side": "left",
"prompt_template": "Your task is {instruction}. Infer the temporal dynamics from frames {actions} and produce the corresponding policy actions {e_actions}.",
"special_action_token": "<|action_{}|>",
"embodied_action_token": "<|embodied_action|>",
"action_dim": 7,
"state_dim": 8,
"num_action_tokens_per_timestep": 8,
"num_embodied_action_tokens_per_instruction": 32,
"num_inference_timesteps": 4,
"action_hidden_size": 1024,
"action_model_type": "DiT-B",
"action_num_layers": 16,
"action_num_heads": 12,
"action_attention_head_dim": 64,
"action_dropout": 0.2,
"action_num_timestep_buckets": 1000,
"action_noise_beta_alpha": 1.5,
"action_noise_beta_beta": 1.0,
"action_noise_s": 0.999,
"num_target_vision_tokens": 32,
"action_max_seq_len": 1024,
"num_video_frames": 8,
"predictor_depth": 12,
"predictor_num_heads": 8,
"predictor_mlp_ratio": 4.0,
"predictor_dropout": 0.0,
"world_model_loss_weight": 0.1,
"jepa_tubelet_size": 2,
"repeated_diffusion_steps": 8,
"resize_images_to": [
224,
224
],
"binarize_gripper_action": true,
"pre_snap_gripper_action": true,
"clip_normalized_actions": true,
"torch_dtype": "bfloat16",
"optimizer_lr": 0.0001,
"optimizer_betas": [
0.9,
0.95
],
"optimizer_eps": 1e-08,
"optimizer_weight_decay": 1e-08,
"optimizer_grad_clip_norm": 1.0,
"scheduler_warmup_steps": 5000,
"scheduler_decay_steps": 30000,
"scheduler_decay_lr": 1e-05,
"gripper_dim": 6,
"gripper_threshold": 0.5
}