Fine-tuned ViLT on COCO-QA dataset
Browse files
- config.json +2 -41
- model.safetensors +2 -2
config.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"architectures": [
|
| 3 |
-
"
|
| 4 |
],
|
| 5 |
"attention_probs_dropout_prob": 0.0,
|
| 6 |
"dtype": "float32",
|
|
@@ -440,8 +440,6 @@
|
|
| 440 |
"429": "canoe"
|
| 441 |
},
|
| 442 |
"image_size": 384,
|
| 443 |
-
"image_text_hidden_size": 256,
|
| 444 |
-
"initializer_factor": 1.0,
|
| 445 |
"initializer_range": 0.02,
|
| 446 |
"intermediate_size": 3072,
|
| 447 |
"label2id": {
|
|
@@ -876,56 +874,19 @@
|
|
| 876 |
"zebras": 38,
|
| 877 |
"zoo": 180
|
| 878 |
},
|
| 879 |
-
"label_smoothing": 0.0,
|
| 880 |
"layer_norm_eps": 1e-12,
|
| 881 |
-
"logit_scale_init_value": 2.6592,
|
| 882 |
"max_image_length": -1,
|
| 883 |
"max_position_embeddings": 40,
|
| 884 |
"modality_type_vocab_size": 2,
|
| 885 |
-
"model_type": "blip",
|
| 886 |
"num_attention_heads": 12,
|
| 887 |
"num_channels": 3,
|
| 888 |
"num_hidden_layers": 12,
|
| 889 |
"num_images": -1,
|
| 890 |
"patch_size": 32,
|
| 891 |
-
"projection_dim": 512,
|
| 892 |
"qkv_bias": true,
|
| 893 |
-
"text_config": {
|
| 894 |
-
"attention_probs_dropout_prob": 0.0,
|
| 895 |
-
"dtype": "float32",
|
| 896 |
-
"encoder_hidden_size": 768,
|
| 897 |
-
"hidden_act": "gelu",
|
| 898 |
-
"hidden_dropout_prob": 0.0,
|
| 899 |
-
"hidden_size": 768,
|
| 900 |
-
"initializer_range": 0.02,
|
| 901 |
-
"intermediate_size": 3072,
|
| 902 |
-
"label_smoothing": 0.0,
|
| 903 |
-
"layer_norm_eps": 1e-12,
|
| 904 |
-
"max_position_embeddings": 512,
|
| 905 |
-
"model_type": "blip_text_model",
|
| 906 |
-
"num_attention_heads": 8,
|
| 907 |
-
"num_hidden_layers": 12,
|
| 908 |
-
"projection_dim": 768,
|
| 909 |
-
"use_cache": true,
|
| 910 |
-
"vocab_size": 30524
|
| 911 |
-
},
|
| 912 |
"tie_word_embeddings": false,
|
| 913 |
"transformers_version": "4.57.1",
|
| 914 |
"type_vocab_size": 2,
|
| 915 |
-
"vision_config": {
|
| 916 |
-
"attention_dropout": 0.0,
|
| 917 |
-
"dtype": "float32",
|
| 918 |
-
"hidden_act": "gelu",
|
| 919 |
-
"hidden_size": 768,
|
| 920 |
-
"image_size": 384,
|
| 921 |
-
"initializer_range": 1e-10,
|
| 922 |
-
"intermediate_size": 3072,
|
| 923 |
-
"layer_norm_eps": 1e-05,
|
| 924 |
-
"model_type": "blip_vision_model",
|
| 925 |
-
"num_attention_heads": 12,
|
| 926 |
-
"num_hidden_layers": 12,
|
| 927 |
-
"patch_size": 16,
|
| 928 |
-
"projection_dim": 512
|
| 929 |
-
},
|
| 930 |
"vocab_size": 30522
|
| 931 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"architectures": [
|
| 3 |
+
"ViltForQuestionAnswering"
|
| 4 |
],
|
| 5 |
"attention_probs_dropout_prob": 0.0,
|
| 6 |
"dtype": "float32",
|
|
|
|
| 440 |
"429": "canoe"
|
| 441 |
},
|
| 442 |
"image_size": 384,
|
|
|
|
|
|
|
| 443 |
"initializer_range": 0.02,
|
| 444 |
"intermediate_size": 3072,
|
| 445 |
"label2id": {
|
|
|
|
| 874 |
"zebras": 38,
|
| 875 |
"zoo": 180
|
| 876 |
},
|
|
|
|
| 877 |
"layer_norm_eps": 1e-12,
|
|
|
|
| 878 |
"max_image_length": -1,
|
| 879 |
"max_position_embeddings": 40,
|
| 880 |
"modality_type_vocab_size": 2,
|
| 881 |
+
"model_type": "vilt",
|
| 882 |
"num_attention_heads": 12,
|
| 883 |
"num_channels": 3,
|
| 884 |
"num_hidden_layers": 12,
|
| 885 |
"num_images": -1,
|
| 886 |
"patch_size": 32,
|
|
|
|
| 887 |
"qkv_bias": true,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 888 |
"tie_word_embeddings": false,
|
| 889 |
"transformers_version": "4.57.1",
|
| 890 |
"type_vocab_size": 2,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 891 |
"vocab_size": 30522
|
| 892 |
}
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1b0bbf2791f8e4a97d0553e514e957b018859635fc8f4363fd6f6a47fa88dd15
|
| 3 |
+
size 453785512
|