{
"dataset": {
"pose_dir": "data/kinetics_processed",
"desc_dir": "data/kinetics_full_output/descriptions",
"video_dir": "data/kinetics-dataset/k700-2020",
"val_split": 0.02,
"max_samples_per_class": null,
"num_workers": 8,
"image_size": 224,
"normalize_pose": true,
"use_joint_angles": true,
"sample_stride": 16,
"include_temporal_context": false,
"action_focus_prompt": false,
"video_fps": 10.0,
"augment_flip": true,
"seed": 42
},
"model_config": {
"qwen_model_name": "/root/.cache/huggingface/hub/models--Qwen--Qwen3-VL-4B-Instruct/snapshots/ebb281ec70b05090aa6165b016eac8ec08e71b17",
"qwen_hidden_size": 2560,
"use_intermediate_hidden": true,
"hidden_layer_fraction": 0.5,
"hidden_layer_index": 18,
"use_early_exit": true,
"use_deepstack_features": true,
"use_flash_attention": true,
"projection_dim": 1024,
"action_dim": 44,
"diffusion_hidden_dim": 1536,
"num_diffusion_layers": 24,
"num_diffusion_heads": 24,
"num_future_tokens": 4,
"action_horizon": 16,
"num_frames": 4,
"use_lora": true,
"lora_rank": 128,
"lora_alpha": 128,
"lora_dropout": 0.05,
"freeze_vision_encoder": true,
"freeze_qwen_layers": 0,
"use_thinking_mode": false,
"diffusion_steps": 2,
"init_from_current_pose": false
},
"learning_rate": 1e-05,
"llm_learning_rate": 1e-06,
"weight_decay": 0.01,
"batch_size": 8,
"num_epochs": 4,
"unfreeze_pct": 0.5,
"gradient_accumulation_steps": 16,
"gradient_clip": 1.0,
"use_amp": true,
"gradient_checkpointing": false,
"log_every_n_steps": 1,
"save_every_n_steps": 750,
"val_max_batches": 100,
"perf_log_every": 100,
"max_checkpoints": 5,
"log_dir": "logs/kinetics_vla",
"checkpoint_dir": "checkpoints/kinetics_vla"
}