{
  "framework": {
    "name": "QwenGR00T_with_Language",
    "qwenvl": {
      "base_vlm": "Qwen/Qwen2.5-VL-3B-Instruct",
      "attn_implementation": "flash_attention_2",
      "vl_hidden_dim": 2048
    },
    "dino": {
      "dino_backbone": "dinov2_vits14"
    },
    "action_model": {
      "action_model_type": "DiT-B",
      "action_hidden_dim": 2560,
      "hidden_size": 1024,
      "add_pos_embed": true,
      "max_seq_len": 1024,
      "action_dim": 11,
      "state_dim": 11,
      "future_action_window_size": 4,
      "action_horizon": 5,
      "past_action_window_size": 0,
      "num_inference_timesteps": 4,
      "num_target_vision_tokens": 32,
      "noise_beta_alpha": 1.5,
      "noise_beta_beta": 1.0,
      "noise_s": 0.999,
      "num_timestep_buckets": 1000,
      "diffusion_model_cfg": {
        "cross_attention_dim": 2048,
        "dropout": 0.2,
        "final_dropout": true,
        "interleave_self_attention": true,
        "norm_type": "ada_norm",
        "num_layers": 16,
        "output_dim": 1024,
        "positional_embeddings": null
      }
    }
  },
  "datasets": {
    "vla_data": {
      "history_window_size": 7,
      "action_chunk_size": 5
    },
    "vla_data2": {
      "default_image_resolution": [3, 224, 224],
      "image_size": [224, 224],
      "action_chunk_size": 5
    }
  }
}