| { |
| "run_id": "LIBERO", |
| "run_root_dir": "checkpoints", |
| "seed": 42, |
| "trackers": [ |
| "json" |
| ], |
| "is_debug": false, |
| "framework": { |
| "name": "VLA_JEPA", |
| "qwenvl": { |
| "base_vlm": "/home/dataset-local/models/Qwen3-VL-2B-Instruct", |
| "attn_implementation": "flash_attention_2", |
| "vl_hidden_dim": 2048 |
| }, |
| "action_model": { |
| "action_model_type": "DiT-B", |
| "action_hidden_dim": 1024, |
| "hidden_size": 1024, |
| "add_pos_embed": true, |
| "max_seq_len": 1024, |
| "action_dim": 7, |
| "state_dim": 8, |
| "future_action_window_size": 6, |
| "action_horizon": 7, |
| "past_action_window_size": 0, |
| "repeated_diffusion_steps": 8, |
| "noise_beta_alpha": 1.5, |
| "noise_beta_beta": 1.0, |
| "noise_s": 0.999, |
| "num_timestep_buckets": 1000, |
| "num_inference_timesteps": 4, |
| "num_target_vision_tokens": 32, |
| "diffusion_model_cfg": { |
| "cross_attention_dim": 2048, |
| "dropout": 0.2, |
| "final_dropout": true, |
| "interleave_self_attention": true, |
| "norm_type": "ada_norm", |
| "num_layers": 16, |
| "output_dim": 1024, |
| "positional_embeddings": null |
| } |
| }, |
| "vj2_model": { |
| "base_encoder": "/home/dataset-local/models/vjepa2-vitl-fpc64-256", |
| "depth": 12, |
| "num_heads": 8, |
| "special_action_token": "<|action_{}|>", |
| "num_action_tokens_per_timestep": 8, |
| "embodied_action_token": "<|embodied_action|>", |
| "num_embodied_action_tokens_per_instruction": 32, |
| "num_frames": 8 |
| }, |
| "reduce_in_full_precision": true |
| }, |
| "datasets": { |
| "vla_data": { |
| "dataset_py": "lerobot_datasets", |
| "data_root_dir": "/home/dataset-local/datasets/LeRobot/LEROBOT_LIBERO_DATA", |
| "data_mix": "libero_all", |
| "action_type": "delta_qpos", |
| "CoT_prompt": "Your task is {instruction}. Infer the temporal dynamics from frames {actions} and produce the corresponding policy actions {e_actions}.", |
| "resolution_size": 224, |
| "per_device_batch_size": 32, |
| "video_resolution_size": 256, |
| "load_all_data_for_training": true, |
| "with_state": true |
| } |
| }, |
| "trainer": { |
| "epochs": 100, |
| "max_train_steps": 30000, |
| "num_warmup_steps": 5000, |
| "save_interval": 10000, |
| "eval_interval": 100, |
| "learning_rate": { |
| "base": 3e-05, |
| "qwen_vl_interface": 1e-05, |
| "action_model": 0.0001 |
| }, |
| "lr_scheduler_type": "cosine_with_min_lr", |
| "scheduler_specific_kwargs": { |
| "min_lr": 1e-06 |
| }, |
| "freeze_modules": "", |
| "loss_scale": { |
| "vla": 1.0, |
| "vlm": 0.1 |
| }, |
| "max_grad_norm": 1.0, |
| "warmup_ratio": 0.1, |
| "weight_decay": 0.0, |
| "logging_frequency": 10, |
| "gradient_clipping": 1.0, |
| "gradient_accumulation_steps": 1, |
| "pretrained_checkpoint": "/home/dataset-local/VLA_JEPA/checkpoints/pretrain/VLA-JEPA-pretrain.pt", |
| "optimizer": { |
| "name": "AdamW", |
| "betas": [ |
| 0.9, |
| 0.95 |
| ], |
| "eps": 1e-08, |
| "weight_decay": 1e-08 |
| }, |
| "is_resume": false, |
| "resume_epoch": null, |
| "resume_step": null, |
| "enable_gradient_checkpointing": true, |
| "enable_mixed_precision_training": true |
| }, |
| "output_dir": "checkpoints/LIBERO" |
| } |