{ "_name_or_path": "dandelin/vilt-b32-finetuned-vqa", "architectures": [ "CustomViltForVQA" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "boat", "1": "spoon", "2": "bench", "3": "banana", "4": "cat", "5": "oven", "6": "bottle", "7": "No", "8": "Yes", "9": "knife", "10": "clock", "11": "5", "12": "mouse", "13": "cup", "14": "sheep", "15": "dining table", "16": "fork", "17": "refrigerator", "18": "zebra", "19": "broccoli", "20": "dog", "21": "skateboard", "22": "Black", "23": "bed", "24": "motorcycle", "25": "pizza", "26": "donut", "27": "skis", "28": "chair", "29": "tennis racket", "30": "bird", "31": "potted plant", "32": "sports ball", "33": "Brown", "34": "laptop", "35": "elephant", "36": "horse", "37": "Blue", "38": "suitcase", "39": "hot dog", "40": "4", "41": "Orange", "42": "Purple", "43": "handbag", "44": "cow", "45": "fire hydrant", "46": "snowboard", "47": "toothbrush", "48": "Below", "49": "parking meter", "50": "Front", "51": "Right", "52": "cake", "53": "tv", "54": "9", "55": "tie", "56": "orange", "57": "wine glass", "58": "cell phone", "59": "stop sign", "60": "right", "61": "Pink", "62": "giraffe", "63": "scissors", "64": "1", "65": "7", "66": "keyboard", "67": "Yellow", "68": "3", "69": "remote", "70": "bear", "71": "car", "72": "truck", "73": "surfboard", "74": "traffic light", "75": "left", "76": "bus", "77": "frisbee", "78": "couch", "79": "Red", "80": "Left", "81": "6", "82": "toilet", "83": "airplane", "84": "Grey", "85": "8", "86": "above", "87": "baseball glove", "88": "vase", "89": "kite", "90": "bowl", "91": "0", "92": "2", "93": "White", "94": "Behind", "95": "Above", "96": "baseball bat", "97": "Green", "98": "person", "99": "sandwich", "100": "sink", "101": "book", "102": "train", "103": "umbrella", "104": "carrot", "105": "bicycle", "106": "apple", "107": "teddy bear" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 91, "1": 64, "2": 92, "3": 68, "4": 40, "5": 11, "6": 81, "7": 65, "8": 85, "9": 54, "Above": 95, "Behind": 94, "Below": 48, "Black": 22, "Blue": 37, "Brown": 33, "Front": 50, "Green": 97, "Grey": 84, "Left": 80, "No": 7, "Orange": 41, "Pink": 61, "Purple": 42, "Red": 79, "Right": 51, "White": 93, "Yellow": 67, "Yes": 8, "above": 86, "airplane": 83, "apple": 106, "banana": 3, "baseball bat": 96, "baseball glove": 87, "bear": 70, "bed": 23, "bench": 2, "bicycle": 105, "bird": 30, "boat": 0, "book": 101, "bottle": 6, "bowl": 90, "broccoli": 19, "bus": 76, "cake": 52, "car": 71, "carrot": 104, "cat": 4, "cell phone": 58, "chair": 28, "clock": 10, "couch": 78, "cow": 44, "cup": 13, "dining table": 15, "dog": 20, "donut": 26, "elephant": 35, "fire hydrant": 45, "fork": 16, "frisbee": 77, "giraffe": 62, "handbag": 43, "horse": 36, "hot dog": 39, "keyboard": 66, "kite": 89, "knife": 9, "laptop": 34, "left": 75, "motorcycle": 24, "mouse": 12, "orange": 56, "oven": 5, "parking meter": 49, "person": 98, "pizza": 25, "potted plant": 31, "refrigerator": 17, "remote": 69, "right": 60, "sandwich": 99, "scissors": 63, "sheep": 14, "sink": 100, "skateboard": 21, "skis": 27, "snowboard": 46, "spoon": 1, "sports ball": 32, "stop sign": 59, "suitcase": 38, "surfboard": 73, "teddy bear": 107, "tennis racket": 29, "tie": 55, "toilet": 82, "toothbrush": 47, "traffic light": 74, "train": 102, "truck": 72, "tv": 53, "umbrella": 103, "vase": 88, "wine glass": 57, "zebra": 18 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.47.0", "type_vocab_size": 2, "vocab_size": 30522 }