phonghoccode committed on
Commit
c76f76b
·
verified ·
1 Parent(s): bf5d202

Fine-tuned ViLT on COCO-QA dataset

Browse files
Files changed (2) hide show
  1. config.json +2 -41
  2. model.safetensors +2 -2
config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "architectures": [
3
- "BlipForQuestionAnswering"
4
  ],
5
  "attention_probs_dropout_prob": 0.0,
6
  "dtype": "float32",
@@ -440,8 +440,6 @@
440
  "429": "canoe"
441
  },
442
  "image_size": 384,
443
- "image_text_hidden_size": 256,
444
- "initializer_factor": 1.0,
445
  "initializer_range": 0.02,
446
  "intermediate_size": 3072,
447
  "label2id": {
@@ -876,56 +874,19 @@
876
  "zebras": 38,
877
  "zoo": 180
878
  },
879
- "label_smoothing": 0.0,
880
  "layer_norm_eps": 1e-12,
881
- "logit_scale_init_value": 2.6592,
882
  "max_image_length": -1,
883
  "max_position_embeddings": 40,
884
  "modality_type_vocab_size": 2,
885
- "model_type": "blip",
886
  "num_attention_heads": 12,
887
  "num_channels": 3,
888
  "num_hidden_layers": 12,
889
  "num_images": -1,
890
  "patch_size": 32,
891
- "projection_dim": 512,
892
  "qkv_bias": true,
893
- "text_config": {
894
- "attention_probs_dropout_prob": 0.0,
895
- "dtype": "float32",
896
- "encoder_hidden_size": 768,
897
- "hidden_act": "gelu",
898
- "hidden_dropout_prob": 0.0,
899
- "hidden_size": 768,
900
- "initializer_range": 0.02,
901
- "intermediate_size": 3072,
902
- "label_smoothing": 0.0,
903
- "layer_norm_eps": 1e-12,
904
- "max_position_embeddings": 512,
905
- "model_type": "blip_text_model",
906
- "num_attention_heads": 8,
907
- "num_hidden_layers": 12,
908
- "projection_dim": 768,
909
- "use_cache": true,
910
- "vocab_size": 30524
911
- },
912
  "tie_word_embeddings": false,
913
  "transformers_version": "4.57.1",
914
  "type_vocab_size": 2,
915
- "vision_config": {
916
- "attention_dropout": 0.0,
917
- "dtype": "float32",
918
- "hidden_act": "gelu",
919
- "hidden_size": 768,
920
- "image_size": 384,
921
- "initializer_range": 1e-10,
922
- "intermediate_size": 3072,
923
- "layer_norm_eps": 1e-05,
924
- "model_type": "blip_vision_model",
925
- "num_attention_heads": 12,
926
- "num_hidden_layers": 12,
927
- "patch_size": 16,
928
- "projection_dim": 512
929
- },
930
  "vocab_size": 30522
931
  }
 
1
  {
2
  "architectures": [
3
+ "ViltForQuestionAnswering"
4
  ],
5
  "attention_probs_dropout_prob": 0.0,
6
  "dtype": "float32",
 
440
  "429": "canoe"
441
  },
442
  "image_size": 384,
 
 
443
  "initializer_range": 0.02,
444
  "intermediate_size": 3072,
445
  "label2id": {
 
874
  "zebras": 38,
875
  "zoo": 180
876
  },
 
877
  "layer_norm_eps": 1e-12,
 
878
  "max_image_length": -1,
879
  "max_position_embeddings": 40,
880
  "modality_type_vocab_size": 2,
881
+ "model_type": "vilt",
882
  "num_attention_heads": 12,
883
  "num_channels": 3,
884
  "num_hidden_layers": 12,
885
  "num_images": -1,
886
  "patch_size": 32,
 
887
  "qkv_bias": true,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
888
  "tie_word_embeddings": false,
889
  "transformers_version": "4.57.1",
890
  "type_vocab_size": 2,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
891
  "vocab_size": 30522
892
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:90e8aad5da725191db28ac93afc48f58ad37af604ccdd78457567825b65daf06
3
- size 1445022200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b0bbf2791f8e4a97d0553e514e957b018859635fc8f4363fd6f6a47fa88dd15
3
+ size 453785512