{
  "framework": {
    "name": "QwenGR00T_with_Language",
    "qwenvl": {
      "base_vlm": "Qwen/Qwen2.5-VL-3B-Instruct",
      "attn_implementation": "flash_attention_2",
      "vl_hidden_dim": 2048
    },
    "dino": {
      "dino_backbone": "dinov2_vits14"
    },
    "action_model": {
      "action_model_type": "DiT-B",
      "action_hidden_dim": 2560,
      "hidden_size": 1024,
      "add_pos_embed": true,
      "max_seq_len": 1024,
      "action_dim": 11,
      "state_dim": 11,
      "future_action_window_size": 4,
      "action_horizon": 5,
      "past_action_window_size": 0,
      "num_inference_timesteps": 4,
      "num_target_vision_tokens": 32,
      "noise_beta_alpha": 1.5,
      "noise_beta_beta": 1.0,
      "noise_s": 0.999,
      "num_timestep_buckets": 1000,
      "diffusion_model_cfg": {
        "cross_attention_dim": 2048,
        "dropout": 0.2,
        "final_dropout": true,
        "interleave_self_attention": true,
        "norm_type": "ada_norm",
        "num_layers": 16,
        "output_dim": 1024,
        "positional_embeddings": null
      }
    }
  },
  "datasets": {
    "vla_data": {
      "history_window_size": 7,
      "action_chunk_size": 5
    },
    "vla_data2": {
      "default_image_resolution": [3, 224, 224],
      "image_size": [224, 224],
      "action_chunk_size": 5
    }
  }
}