{
  "framework": {
    "name": "QwenGR00T_with_Language",
    "qwenvl": {
      "base_vlm": "Qwen/Qwen2.5-VL-3B-Instruct",
      "attn_implementation": "flash_attention_2",
      "vl_hidden_dim": 2048
    },
    "dino": {
      "dino_backbone": "dinov2_vits14"
    },
    "action_model": {
      "action_model_type": "DiT-B",
      "action_hidden_dim": 2560,
      "hidden_size": 1024,
      "add_pos_embed": true,
      "max_seq_len": 1024,
      "action_dim": 11,
      "state_dim": 11,
      "future_action_window_size": 4,
      "action_horizon": 5,
      "past_action_window_size": 0,
      "num_inference_timesteps": 4,
      "num_target_vision_tokens": 32,
      "noise_beta_alpha": 1.5,
      "noise_beta_beta": 1.0,
      "noise_s": 0.999,
      "num_timestep_buckets": 1000,
      "diffusion_model_cfg": {
        "cross_attention_dim": 2048,
        "dropout": 0.2,
        "final_dropout": true,
        "interleave_self_attention": true,
        "norm_type": "ada_norm",
        "num_layers": 16,
        "output_dim": 1024,
        "positional_embeddings": null
      }
    }
  },
  "datasets": {
    "vla_data": {
      "history_window_size": 7,
      "action_chunk_size": 5
    },
    "vla_data2": {
      "default_image_resolution": [3, 224, 224],
      "image_size": [224, 224],
      "action_chunk_size": 5
    }
  }
}