vilt-vqa-finetune / config.json
phonghoccode's picture
Upload CustomViltForVQA
aa14ac6 verified
{
"_name_or_path": "phonghoccode/vilt-vqa-finetune-pytorch",
"architectures": [
"CustomViltForVQA"
],
"attention_probs_dropout_prob": 0.0,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.0,
"hidden_size": 768,
"id2label": {
"0": "donut",
"1": "Orange",
"2": "bottle",
"3": "laptop",
"4": "toilet",
"5": "car",
"6": "fork",
"7": "bus",
"8": "keyboard",
"9": "0",
"10": "Green",
"11": "vase",
"12": "bear",
"13": "4",
"14": "3",
"15": "bird",
"16": "Yellow",
"17": "Grey",
"18": "suitcase",
"19": "skateboard",
"20": "tv",
"21": "Red",
"22": "Behind",
"23": "spoon",
"24": "bicycle",
"25": "7",
"26": "remote",
"27": "kite",
"28": "orange",
"29": "cow",
"30": "Below",
"31": "parking meter",
"32": "right",
"33": "bowl",
"34": "sheep",
"35": "handbag",
"36": "potted plant",
"37": "left",
"38": "airplane",
"39": "2",
"40": "6",
"41": "elephant",
"42": "skis",
"43": "bench",
"44": "dog",
"45": "truck",
"46": "Left",
"47": "umbrella",
"48": "motorcycle",
"49": "5",
"50": "tennis racket",
"51": "cake",
"52": "Front",
"53": "clock",
"54": "teddy bear",
"55": "hot dog",
"56": "oven",
"57": "toothbrush",
"58": "Black",
"59": "book",
"60": "1",
"61": "tie",
"62": "couch",
"63": "mouse",
"64": "Brown",
"65": "dining table",
"66": "Pink",
"67": "carrot",
"68": "surfboard",
"69": "pizza",
"70": "bed",
"71": "cell phone",
"72": "broccoli",
"73": "scissors",
"74": "Purple",
"75": "boat",
"76": "Yes",
"77": "apple",
"78": "Blue",
"79": "stop sign",
"80": "8",
"81": "frisbee",
"82": "sports ball",
"83": "9",
"84": "fire hydrant",
"85": "wine glass",
"86": "sink",
"87": "baseball glove",
"88": "cat",
"89": "train",
"90": "banana",
"91": "horse",
"92": "above",
"93": "White",
"94": "traffic light",
"95": "snowboard",
"96": "No",
"97": "baseball bat",
"98": "person",
"99": "refrigerator",
"100": "zebra",
"101": "chair",
"102": "cup",
"103": "giraffe",
"104": "knife",
"105": "Right",
"106": "Above",
"107": "sandwich"
},
"image_size": 384,
"initializer_range": 0.02,
"intermediate_size": 3072,
"label2id": {
"0": 9,
"1": 60,
"2": 39,
"3": 14,
"4": 13,
"5": 49,
"6": 40,
"7": 25,
"8": 80,
"9": 83,
"Above": 106,
"Behind": 22,
"Below": 30,
"Black": 58,
"Blue": 78,
"Brown": 64,
"Front": 52,
"Green": 10,
"Grey": 17,
"Left": 46,
"No": 96,
"Orange": 1,
"Pink": 66,
"Purple": 74,
"Red": 21,
"Right": 105,
"White": 93,
"Yellow": 16,
"Yes": 76,
"above": 92,
"airplane": 38,
"apple": 77,
"banana": 90,
"baseball bat": 97,
"baseball glove": 87,
"bear": 12,
"bed": 70,
"bench": 43,
"bicycle": 24,
"bird": 15,
"boat": 75,
"book": 59,
"bottle": 2,
"bowl": 33,
"broccoli": 72,
"bus": 7,
"cake": 51,
"car": 5,
"carrot": 67,
"cat": 88,
"cell phone": 71,
"chair": 101,
"clock": 53,
"couch": 62,
"cow": 29,
"cup": 102,
"dining table": 65,
"dog": 44,
"donut": 0,
"elephant": 41,
"fire hydrant": 84,
"fork": 6,
"frisbee": 81,
"giraffe": 103,
"handbag": 35,
"horse": 91,
"hot dog": 55,
"keyboard": 8,
"kite": 27,
"knife": 104,
"laptop": 3,
"left": 37,
"motorcycle": 48,
"mouse": 63,
"orange": 28,
"oven": 56,
"parking meter": 31,
"person": 98,
"pizza": 69,
"potted plant": 36,
"refrigerator": 99,
"remote": 26,
"right": 32,
"sandwich": 107,
"scissors": 73,
"sheep": 34,
"sink": 86,
"skateboard": 19,
"skis": 42,
"snowboard": 95,
"spoon": 23,
"sports ball": 82,
"stop sign": 79,
"suitcase": 18,
"surfboard": 68,
"teddy bear": 54,
"tennis racket": 50,
"tie": 61,
"toilet": 4,
"toothbrush": 57,
"traffic light": 94,
"train": 89,
"truck": 45,
"tv": 20,
"umbrella": 47,
"vase": 11,
"wine glass": 85,
"zebra": 100
},
"layer_norm_eps": 1e-12,
"max_image_length": -1,
"max_position_embeddings": 40,
"modality_type_vocab_size": 2,
"model_type": "vilt",
"num_attention_heads": 12,
"num_channels": 3,
"num_hidden_layers": 12,
"num_images": -1,
"patch_size": 32,
"qkv_bias": true,
"tie_word_embeddings": false,
"torch_dtype": "float32",
"transformers_version": "4.35.2",
"type_vocab_size": 2,
"vocab_size": 30522
}