{
    "type": "vla_jepa",
    "n_obs_steps": 1,
    "input_features": {
        "observation.images.image": {
            "type": "VISUAL",
            "shape": [
                3,
                224,
                224
            ]
        }
    },
    "output_features": {
        "action": {
            "type": "ACTION",
            "shape": [
                7
            ]
        }
    },
    "device": null,
    "use_amp": false,
    "use_peft": false,
    "push_to_hub": true,
    "repo_id": null,
    "private": null,
    "tags": null,
    "license": null,
    "pretrained_path": null,
    "chunk_size": 7,
    "n_action_steps": 7,
    "normalization_mapping": {
        "VISUAL": "IDENTITY",
        "STATE": "MEAN_STD",
        "ACTION": "MIN_MAX"
    },
    "qwen_model_name": "Qwen/Qwen3-VL-2B-Instruct",
    "jepa_encoder_name": "facebook/vjepa2-vitl-fpc64-256",
    "freeze_qwen": false,
    "enable_world_model": true,
    "reinit_modules": null,
    "tokenizer_padding_side": "left",
    "prompt_template": "Your task is {instruction}. Infer the temporal dynamics from frames {actions} and produce the corresponding policy actions {e_actions}.",
    "special_action_token": "<|action_{}|>",
    "embodied_action_token": "<|embodied_action|>",
    "action_dim": 7,
    "state_dim": 8,
    "num_action_tokens_per_timestep": 8,
    "num_embodied_action_tokens_per_instruction": 32,
    "num_inference_timesteps": 4,
    "action_hidden_size": 1024,
    "action_model_type": "DiT-B",
    "action_num_layers": 16,
    "action_num_heads": 12,
    "action_attention_head_dim": 64,
    "action_dropout": 0.2,
    "action_num_timestep_buckets": 1000,
    "action_noise_beta_alpha": 1.5,
    "action_noise_beta_beta": 1.0,
    "action_noise_s": 0.999,
    "num_target_vision_tokens": 32,
    "action_max_seq_len": 1024,
    "num_video_frames": 8,
    "predictor_depth": 12,
    "predictor_num_heads": 8,
    "predictor_mlp_ratio": 4.0,
    "predictor_dropout": 0.0,
    "world_model_loss_weight": 0.1,
    "jepa_tubelet_size": 2,
    "repeated_diffusion_steps": 8,
    "resize_images_to": [
        224,
        224
    ],
    "binarize_gripper_action": true,
    "pre_snap_gripper_action": true,
    "clip_normalized_actions": true,
    "torch_dtype": "bfloat16",
    "optimizer_lr": 0.0001,
    "optimizer_betas": [
        0.9,
        0.95
    ],
    "optimizer_eps": 1e-08,
    "optimizer_weight_decay": 1e-08,
    "optimizer_grad_clip_norm": 1.0,
    "scheduler_warmup_steps": 5000,
    "scheduler_decay_steps": 30000,
    "scheduler_decay_lr": 1e-05,
    "gripper_dim": 6,
    "gripper_threshold": 0.5
}