{
  "dataset": {
    "pose_dir": "data/kinetics_processed",
    "desc_dir": "data/kinetics_full_output/descriptions",
    "video_dir": "data/kinetics-dataset/k700-2020",
    "val_split": 0.02,
    "max_samples_per_class": null,
    "num_workers": 8,
    "image_size": 224,
    "normalize_pose": true,
    "use_joint_angles": true,
    "sample_stride": 16,
    "include_temporal_context": false,
    "action_focus_prompt": false,
    "video_fps": 10.0,
    "augment_flip": true,
    "seed": 42
  },
  "model_config": {
    "qwen_model_name": "/root/.cache/huggingface/hub/models--Qwen--Qwen3-VL-4B-Instruct/snapshots/ebb281ec70b05090aa6165b016eac8ec08e71b17",
    "qwen_hidden_size": 2560,
    "use_intermediate_hidden": true,
    "hidden_layer_fraction": 0.5,
    "hidden_layer_index": 18,
    "use_early_exit": true,
    "use_deepstack_features": true,
    "use_flash_attention": true,
    "projection_dim": 1024,
    "action_dim": 44,
    "diffusion_hidden_dim": 1536,
    "num_diffusion_layers": 24,
    "num_diffusion_heads": 24,
    "num_future_tokens": 4,
    "action_horizon": 16,
    "num_frames": 4,
    "use_lora": true,
    "lora_rank": 128,
    "lora_alpha": 128,
    "lora_dropout": 0.05,
    "freeze_vision_encoder": true,
    "freeze_qwen_layers": 0,
    "use_thinking_mode": false,
    "diffusion_steps": 2,
    "init_from_current_pose": false
  },
  "learning_rate": 1e-05,
  "llm_learning_rate": 1e-06,
  "weight_decay": 0.01,
  "batch_size": 8,
  "num_epochs": 4,
  "unfreeze_pct": 0.5,
  "gradient_accumulation_steps": 16,
  "gradient_clip": 1.0,
  "use_amp": true,
  "gradient_checkpointing": false,
  "log_every_n_steps": 1,
  "save_every_n_steps": 750,
  "val_max_batches": 100,
  "perf_log_every": 100,
  "max_checkpoints": 5,
  "log_dir": "logs/kinetics_vla",
  "checkpoint_dir": "checkpoints/kinetics_vla"
}