{ "dataset": { "pose_dir": "data/kinetics_processed", "desc_dir": "data/kinetics_full_output/descriptions", "video_dir": "data/kinetics-dataset/k700-2020", "val_split": 0.02, "max_samples_per_class": null, "num_workers": 8, "image_size": 224, "normalize_pose": true, "use_joint_angles": true, "sample_stride": 16, "include_temporal_context": false, "action_focus_prompt": false, "video_fps": 10.0, "augment_flip": true, "seed": 42 }, "model_config": { "qwen_model_name": "/root/.cache/huggingface/hub/models--Qwen--Qwen3-VL-4B-Instruct/snapshots/ebb281ec70b05090aa6165b016eac8ec08e71b17", "qwen_hidden_size": 2560, "use_intermediate_hidden": true, "hidden_layer_fraction": 0.5, "hidden_layer_index": 18, "use_early_exit": true, "use_deepstack_features": true, "use_flash_attention": true, "projection_dim": 1024, "action_dim": 44, "diffusion_hidden_dim": 1536, "num_diffusion_layers": 24, "num_diffusion_heads": 24, "num_future_tokens": 4, "action_horizon": 16, "num_frames": 4, "use_lora": true, "lora_rank": 128, "lora_alpha": 128, "lora_dropout": 0.05, "freeze_vision_encoder": true, "freeze_qwen_layers": 0, "use_thinking_mode": false, "diffusion_steps": 2, "init_from_current_pose": false }, "learning_rate": 1e-05, "llm_learning_rate": 1e-06, "weight_decay": 0.01, "batch_size": 8, "num_epochs": 4, "unfreeze_pct": 0.5, "gradient_accumulation_steps": 16, "gradient_clip": 1.0, "use_amp": true, "gradient_checkpointing": false, "log_every_n_steps": 1, "save_every_n_steps": 750, "val_max_batches": 100, "perf_log_every": 100, "max_checkpoints": 5, "log_dir": "logs/kinetics_vla", "checkpoint_dir": "checkpoints/kinetics_vla" }