{ "type": "vila_chunk", "n_obs_steps": 2, "input_features": { "observation.state": { "type": "STATE", "shape": [ 7 ] }, "observation.images.top": { "type": "VISUAL", "shape": [ 3, 384, 384 ] }, "observation.images.up": { "type": "VISUAL", "shape": [ 3, 384, 384 ] }, "observation.images.side": { "type": "VISUAL", "shape": [ 3, 384, 384 ] }, "observation.images.wrist": { "type": "VISUAL", "shape": [ 3, 384, 384 ] }, "observation.images.image": { "type": "VISUAL", "shape": [ 3, 384, 384 ] }, "observation.image": { "type": "VISUAL", "shape": [ 3, 384, 384 ] }, "observation.images.exterior_image_1_left": { "type": "VISUAL", "shape": [ 3, 384, 384 ] }, "observation.images.exterior_1_left": { "type": "VISUAL", "shape": [ 3, 384, 384 ] }, "observation.images.wrist_left": { "type": "VISUAL", "shape": [ 3, 384, 384 ] } }, "output_features": { "action": { "type": "ACTION", "shape": [ 8, 7 ] } }, "device": "cuda", "use_amp": false, "push_to_hub": true, "repo_id": "FT-LLM-2026-RAMEN/v4-8b-decay2m-ipt_v3_1-instruct4-add-tokens-nvila-stage3-vla-flow-taskfix-20260302-1717", "private": null, "tags": null, "license": null, "pretrained_path": null, "chunk_size": 8, "n_action_steps": 4, "normalization_mapping": { "VISUAL": "IDENTITY", "STATE": "MEAN_STD", "ACTION": "MEAN_STD" }, "temporal_ensemble_mode": "exponential", "temporal_ensemble_decay": 0.1, "temporal_ensemble_every_step": true, "max_state_dim": 14, "max_action_dim": 14, "state_dim": 7, "action_dim": 7, "resize_imgs_with_padding": [ 448, 448 ], "preprocess_images_in_policy": true, "image_key": null, "image_keys": [ "observation.images.top", "observation.images.up", "observation.images.side", "observation.images.wrist", "observation.images.image", "observation.image", "observation.images.exterior_image_1_left", "observation.images.exterior_1_left", "observation.images.wrist_left" ], "auto_detect_image_key": true, "vila_model_name": "FT-LLM-2026-RAMEN/v4-8b-decay2m-ipt_v3_1-instruct4-add-tokens-nvila-stage3", "llm_jp_vila_path": "/home/suzuki/ft_llm_2026_ramen/vla/lerobot_policy_vila/../eval/simpler_env/a100/vla_inference/llm-jp-VILA", "llm_hidden_size": 4096, "hidden_state_layer": -1, "load_in_4bit": false, "bnb_4bit_quant_type": "nf4", "bnb_4bit_use_double_quant": true, "use_flash_attention": true, "action_head_type": "flow", "action_head_hidden_dim": 256, "action_head_num_layers": 2, "action_head_dropout": 0.1, "input_scale": 0.015625, "policy_d_model": 1024, "vision_target_grid": 12, "resampler_num_latents": 32, "resampler_layers": 2, "history_mixer": "gru", "text_encoder_layers": 2, "flow_steps": 10, "flow_depth": 8, "flow_nhead": 8, "flow_dropout": 0.1, "flow_noise_scale": 1.0, "use_state_concat": true, "optimizer_lr": 0.0001, "optimizer_betas": [ 0.9, 0.95 ], "optimizer_eps": 1e-08, "optimizer_weight_decay": 1e-10, "optimizer_grad_clip_norm": 10.0, "use_lr_scheduler": true, "num_warmup_steps": 500, "num_decay_steps": 29500, "decay_lr": 1e-06, "freeze_vision_encoder": true, "freeze_llm_backbone": true, "train_action_head": true, "train_state_proj": true, "vision_unfreeze_last_n_blocks": 0, "vision_lora_r": 0, "vision_lora_alpha": 16, "vision_lora_dropout": 0.0, "vision_lora_target_modules": [ "q_proj", "k_proj", "v_proj", "out_proj", "fc1", "fc2" ], "action_clip_min": null, "action_clip_max": null, "simplerenv_gripper_conversion": true, "simplerenv_gripper_index": -1, "simplerenv_gripper_invert": false, "simplerenv_gripper_binarize": true, "simplerenv_euler_to_axis_angle": true, "simplerenv_rotation_start_index": 3, "prompt_template": "\n\u30ed\u30dc\u30c3\u30c8\u306e\u52d5\u4f5c\u3092\u4e88\u6e2c\u3057\u3066\u304f\u3060\u3055\u3044\u3002", "conversation_template": "llmjp_v3" }