{ "_name_or_path": "dandelin/vilt-b32-finetuned-vqa", "architectures": [ "CustomViltForVQA" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "5", "1": "zebra", "2": "couch", "3": "sink", "4": "apple", "5": "umbrella", "6": "bus", "7": "dog", "8": "boat", "9": "Grey", "10": "suitcase", "11": "2", "12": "bottle", "13": "spoon", "14": "3", "15": "toilet", "16": "8", "17": "7", "18": "knife", "19": "Pink", "20": "skis", "21": "4", "22": "toothbrush", "23": "surfboard", "24": "Behind", "25": "person", "26": "orange", "27": "chair", "28": "handbag", "29": "cow", "30": "1", "31": "fire hydrant", "32": "oven", "33": "tennis racket", "34": "Yellow", "35": "remote", "36": "9", "37": "No", "38": "0", "39": "carrot", "40": "Above", "41": "sports ball", "42": "Purple", "43": "snowboard", "44": "parking meter", "45": "mouse", "46": "White", "47": "clock", "48": "dining table", "49": "wine glass", "50": "car", "51": "teddy bear", "52": "bicycle", "53": "scissors", "54": "keyboard", "55": "Green", "56": "Below", "57": "Left", "58": "book", "59": "Front", "60": "fork", "61": "broccoli", "62": "giraffe", "63": "baseball glove", "64": "Red", "65": "Orange", "66": "Black", "67": "frisbee", "68": "potted plant", "69": "sandwich", "70": "cup", "71": "right", "72": "cake", "73": "Yes", "74": "Right", "75": "bed", "76": "cell phone", "77": "skateboard", "78": "bowl", "79": "truck", "80": "donut", "81": "above", "82": "Brown", "83": "kite", "84": "cat", "85": "traffic light", "86": "pizza", "87": "sheep", "88": "elephant", "89": "laptop", "90": "refrigerator", "91": "6", "92": "banana", "93": "Blue", "94": "hot dog", "95": "bear", "96": "bird", "97": "motorcycle", "98": "horse", "99": "tv", "100": "tie", "101": "left", "102": "vase", "103": "train", "104": "baseball bat", "105": "stop sign", "106": "airplane", "107": "bench" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 38, "1": 30, "2": 11, "3": 14, "4": 21, "5": 0, "6": 91, "7": 17, "8": 16, "9": 36, "Above": 40, "Behind": 24, "Below": 56, "Black": 66, "Blue": 93, "Brown": 82, "Front": 59, "Green": 55, "Grey": 9, "Left": 57, "No": 37, "Orange": 65, "Pink": 19, "Purple": 42, "Red": 64, "Right": 74, "White": 46, "Yellow": 34, "Yes": 73, "above": 81, "airplane": 106, "apple": 4, "banana": 92, "baseball bat": 104, "baseball glove": 63, "bear": 95, "bed": 75, "bench": 107, "bicycle": 52, "bird": 96, "boat": 8, "book": 58, "bottle": 12, "bowl": 78, "broccoli": 61, "bus": 6, "cake": 72, "car": 50, "carrot": 39, "cat": 84, "cell phone": 76, "chair": 27, "clock": 47, "couch": 2, "cow": 29, "cup": 70, "dining table": 48, "dog": 7, "donut": 80, "elephant": 88, "fire hydrant": 31, "fork": 60, "frisbee": 67, "giraffe": 62, "handbag": 28, "horse": 98, "hot dog": 94, "keyboard": 54, "kite": 83, "knife": 18, "laptop": 89, "left": 101, "motorcycle": 97, "mouse": 45, "orange": 26, "oven": 32, "parking meter": 44, "person": 25, "pizza": 86, "potted plant": 68, "refrigerator": 90, "remote": 35, "right": 71, "sandwich": 69, "scissors": 53, "sheep": 87, "sink": 3, "skateboard": 77, "skis": 20, "snowboard": 43, "spoon": 13, "sports ball": 41, "stop sign": 105, "suitcase": 10, "surfboard": 23, "teddy bear": 51, "tennis racket": 33, "tie": 100, "toilet": 15, "toothbrush": 22, "traffic light": 85, "train": 103, "truck": 79, "tv": 99, "umbrella": 5, "vase": 102, "wine glass": 49, "zebra": 1 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.49.0", "type_vocab_size": 2, "vocab_size": 30522 }