{
  "action_dim": 32,
  "action_head_cfg": {
    "action_dim": 32,
    "action_horizon": 16,
    "add_pos_embed": true,
    "attn_type": "joint_attn_v2",
    "backbone_embedding_dim": 4096,
    "base_freq": 50.0,
    "dct_loss_weight": 0.0,
    "diffusion_model_cfg": {
      "attention_head_dim": 64,
      "cross_attention_dim": 4096,
      "depth": 4,
      "depth_single_blocks": 8,
      "direct_visual_conditioning": null,
      "disable_time_token_pos_emb": false,
      "dropout": 0.2,
      "final_dropout": true,
      "interleave_self_attention": true,
      "meta_queries_as_modality": false,
      "norm_type": "ada_norm",
      "num_attention_heads": 24,
      "num_layers": 16,
      "num_temb_tokens": 1,
      "output_dim": 1024,
      "positional_embeddings": "rope_sa_only",
      "rope_theta": 10000.0,
      "state_as_modality": false,
      "temb_type": "additional_token",
      "use_alternate_vl_conditioning": false,
      "use_swiglu": true
    },
    "discretize_timesteps": false,
    "ff_loss_weight": 0.0,
    "flow_matching_loss_weight": 1.0,
    "hidden_size": 1024,
    "input_embedding_dim": 1536,
    "max_action_dim": 32,
    "max_num_embodiments": 32,
    "max_state_dim": 64,
    "model_dct": false,
    "model_dtype": "float32",
    "noise_beta_alpha": 1.5,
    "noise_beta_beta": 1.0,
    "noise_s": 0.999,
    "num_inference_timesteps": 4,
    "num_target_vision_tokens": 32,
    "num_timestep_buckets": 1000,
    "post_norm": "none",
    "pre_norm": "layer_norm",
    "qk_rmsnorm": false,
    "remove_bias": false,
    "tune_diffusion_model": true,
    "tune_projector": true,
    "use_future_tokens": false,
    "use_qknorm": true,
    "use_rmsnorm": true,
    "use_vlln": false,
    "vl_self_attention_cfg": {
      "attention_head_dim": 64,
      "dropout": 0.2,
      "final_dropout": true,
      "num_attention_heads": 64,
      "num_layers": 4,
      "positional_embeddings": null
    },
    "x_prediction": false
  },
  "action_horizon": 16,
  "architectures": [
    "GR00T_N1_5"
  ],
  "attn_implementation": null,
  "backbone_cfg": {
    "load_bf16": false,
    "meta_queries_mode": "full",
    "n_meta_queries": 4,
    "project_to_dim": null,
    "qwen_path": "/fsx/alinvla/AlinVLA-VLM/checkpoints/robot_vqa_v2/checkpoint-7647",
    "reproject_vision": false,
    "select_layer": 18,
    "tune_llm": false,
    "tune_visual": false,
    "use_causal_mask": true,
    "use_flash_attention": true,
    "use_meta_queries": false
  },
  "backbone_model_type": "qwen3_vl_8b",
  "compute_dtype": "bfloat16",
  "dtype": "bfloat16",
  "hidden_size": 2048,
  "lap_cfg": {},
  "max_action_dim": 32,
  "model_dtype": "float32",
  "model_type": "gr00t_n1_5",
  "transformers_version": "5.0.0.dev0",
  "tune_diffusion_model": true,
  "tune_llm": false,
  "tune_projector": true,
  "tune_visual": false,
  "use_cache": false
}
|
|