{ "type": "vla_jepa", "n_obs_steps": 1, "input_features": { "observation.images.image": { "type": "VISUAL", "shape": [ 3, 224, 224 ] } }, "output_features": { "action": { "type": "ACTION", "shape": [ 7 ] } }, "device": null, "use_amp": false, "use_peft": false, "push_to_hub": true, "repo_id": null, "private": null, "tags": null, "license": null, "pretrained_path": null, "chunk_size": 7, "n_action_steps": 7, "normalization_mapping": { "VISUAL": "IDENTITY", "STATE": "MEAN_STD", "ACTION": "MIN_MAX" }, "qwen_model_name": "Qwen/Qwen3-VL-2B-Instruct", "jepa_encoder_name": "facebook/vjepa2-vitl-fpc64-256", "freeze_qwen": false, "enable_world_model": true, "reinit_modules": null, "tokenizer_padding_side": "left", "prompt_template": "Your task is {instruction}. Infer the temporal dynamics from frames {actions} and produce the corresponding policy actions {e_actions}.", "special_action_token": "<|action_{}|>", "embodied_action_token": "<|embodied_action|>", "action_dim": 7, "state_dim": 8, "num_action_tokens_per_timestep": 8, "num_embodied_action_tokens_per_instruction": 32, "num_inference_timesteps": 4, "action_hidden_size": 1024, "action_model_type": "DiT-B", "action_num_layers": 16, "action_num_heads": 12, "action_attention_head_dim": 64, "action_dropout": 0.2, "action_num_timestep_buckets": 1000, "action_noise_beta_alpha": 1.5, "action_noise_beta_beta": 1.0, "action_noise_s": 0.999, "num_target_vision_tokens": 32, "action_max_seq_len": 1024, "num_video_frames": 8, "predictor_depth": 12, "predictor_num_heads": 8, "predictor_mlp_ratio": 4.0, "predictor_dropout": 0.0, "world_model_loss_weight": 0.1, "jepa_tubelet_size": 2, "repeated_diffusion_steps": 8, "resize_images_to": [ 224, 224 ], "binarize_gripper_action": true, "pre_snap_gripper_action": true, "clip_normalized_actions": true, "torch_dtype": "bfloat16", "optimizer_lr": 0.0001, "optimizer_betas": [ 0.9, 0.95 ], "optimizer_eps": 1e-08, "optimizer_weight_decay": 1e-08, "optimizer_grad_clip_norm": 1.0, "scheduler_warmup_steps": 5000, "scheduler_decay_steps": 30000, "scheduler_decay_lr": 1e-05, "gripper_dim": 6, "gripper_threshold": 0.5 }