{
  "_class_name": "VisionLanguageActionModel",
  "architectures": [
    "VisionLanguageActionModel"
  ],
  "model_type": "vla-model",
  "hidden_size": 768,
  "num_tasks": 6,
  "vision_config": {
    "model_type": "vit",
    "image_size": 224,
    "patch_size": 14,
    "hidden_size": 1024,
    "num_hidden_layers": 24,
    "num_attention_heads": 16,
    "intermediate_size": 4096,
    "projection_dim": 768
  },
  "caption_config": {
    "model_type": "bert",
    "vocab_size": 30522,
    "hidden_size": 1024,
    "num_hidden_layers": 24,
    "num_attention_heads": 16,
    "intermediate_size": 4096,
    "projection_dim": 768
  },
  "context_config": {
    "model_type": "gpt2",
    "vocab_size": 50257,
    "n_positions": 1024,
    "n_embd": 1024,
    "n_layer": 24,
    "n_head": 16,
    "projection_dim": 768
  },
  "spatial_config": {
    "input_dim": 10,
    "output_dim": 768
  },
  "temporal_config": {
    "input_dim": 1280,
    "output_dim": 768
  },
  "fusion_config": {
    "input_dim": 3840,
    "output_dim": 768
  },
  "reasoning_config": {
    "d_model": 768,
    "nhead": 12,
    "num_layers": 8
  },
  "action_head_config": {
    "num_actions": 8
  },
  "explanation_head_config": {
    "vocab_size": 30522
  }
}