File size: 1,232 Bytes
90ae3fe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
{
"_class_name": "VisionLanguageActionModel",
"architectures": [
"VisionLanguageActionModel"
],
"model_type": "vla-model",
"hidden_size": 768,
"num_tasks": 6,
"vision_config": {
"model_type": "vit",
"image_size": 224,
"patch_size": 14,
"hidden_size": 1024,
"num_hidden_layers": 24,
"num_attention_heads": 16,
"intermediate_size": 4096,
"projection_dim": 768
},
"caption_config": {
"model_type": "bert",
"vocab_size": 30522,
"hidden_size": 1024,
"num_hidden_layers": 24,
"num_attention_heads": 16,
"intermediate_size": 4096,
"projection_dim": 768
},
"context_config": {
"model_type": "gpt2",
"vocab_size": 50257,
"n_positions": 1024,
"n_embd": 1024,
"n_layer": 24,
"n_head": 16,
"projection_dim": 768
},
"spatial_config": {
"input_dim": 10,
"output_dim": 768
},
"temporal_config": {
"input_dim": 1280,
"output_dim": 768
},
"fusion_config": {
"input_dim": 3840,
"output_dim": 768
},
"reasoning_config": {
"d_model": 768,
"nhead": 12,
"num_layers": 8
},
"action_head_config": {
"num_actions": 8
},
"explanation_head_config": {
"vocab_size": 30522
}
}
|