{ "_name_or_path": "phonghoccode/vilt-vqa-finetune-pytorch", "architectures": [ "CustomViltForVQA" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "donut", "1": "Orange", "2": "bottle", "3": "laptop", "4": "toilet", "5": "car", "6": "fork", "7": "bus", "8": "keyboard", "9": "0", "10": "Green", "11": "vase", "12": "bear", "13": "4", "14": "3", "15": "bird", "16": "Yellow", "17": "Grey", "18": "suitcase", "19": "skateboard", "20": "tv", "21": "Red", "22": "Behind", "23": "spoon", "24": "bicycle", "25": "7", "26": "remote", "27": "kite", "28": "orange", "29": "cow", "30": "Below", "31": "parking meter", "32": "right", "33": "bowl", "34": "sheep", "35": "handbag", "36": "potted plant", "37": "left", "38": "airplane", "39": "2", "40": "6", "41": "elephant", "42": "skis", "43": "bench", "44": "dog", "45": "truck", "46": "Left", "47": "umbrella", "48": "motorcycle", "49": "5", "50": "tennis racket", "51": "cake", "52": "Front", "53": "clock", "54": "teddy bear", "55": "hot dog", "56": "oven", "57": "toothbrush", "58": "Black", "59": "book", "60": "1", "61": "tie", "62": "couch", "63": "mouse", "64": "Brown", "65": "dining table", "66": "Pink", "67": "carrot", "68": "surfboard", "69": "pizza", "70": "bed", "71": "cell phone", "72": "broccoli", "73": "scissors", "74": "Purple", "75": "boat", "76": "Yes", "77": "apple", "78": "Blue", "79": "stop sign", "80": "8", "81": "frisbee", "82": "sports ball", "83": "9", "84": "fire hydrant", "85": "wine glass", "86": "sink", "87": "baseball glove", "88": "cat", "89": "train", "90": "banana", "91": "horse", "92": "above", "93": "White", "94": "traffic light", "95": "snowboard", "96": "No", "97": "baseball bat", "98": "person", "99": "refrigerator", "100": "zebra", "101": "chair", "102": "cup", "103": "giraffe", "104": "knife", "105": "Right", "106": "Above", "107": "sandwich" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 9, "1": 60, "2": 39, "3": 14, "4": 13, "5": 49, "6": 40, "7": 25, "8": 80, "9": 83, "Above": 106, "Behind": 22, "Below": 30, "Black": 58, "Blue": 78, "Brown": 64, "Front": 52, "Green": 10, "Grey": 17, "Left": 46, "No": 96, "Orange": 1, "Pink": 66, "Purple": 74, "Red": 21, "Right": 105, "White": 93, "Yellow": 16, "Yes": 76, "above": 92, "airplane": 38, "apple": 77, "banana": 90, "baseball bat": 97, "baseball glove": 87, "bear": 12, "bed": 70, "bench": 43, "bicycle": 24, "bird": 15, "boat": 75, "book": 59, "bottle": 2, "bowl": 33, "broccoli": 72, "bus": 7, "cake": 51, "car": 5, "carrot": 67, "cat": 88, "cell phone": 71, "chair": 101, "clock": 53, "couch": 62, "cow": 29, "cup": 102, "dining table": 65, "dog": 44, "donut": 0, "elephant": 41, "fire hydrant": 84, "fork": 6, "frisbee": 81, "giraffe": 103, "handbag": 35, "horse": 91, "hot dog": 55, "keyboard": 8, "kite": 27, "knife": 104, "laptop": 3, "left": 37, "motorcycle": 48, "mouse": 63, "orange": 28, "oven": 56, "parking meter": 31, "person": 98, "pizza": 69, "potted plant": 36, "refrigerator": 99, "remote": 26, "right": 32, "sandwich": 107, "scissors": 73, "sheep": 34, "sink": 86, "skateboard": 19, "skis": 42, "snowboard": 95, "spoon": 23, "sports ball": 82, "stop sign": 79, "suitcase": 18, "surfboard": 68, "teddy bear": 54, "tennis racket": 50, "tie": 61, "toilet": 4, "toothbrush": 57, "traffic light": 94, "train": 89, "truck": 45, "tv": 20, "umbrella": 47, "vase": 11, "wine glass": 85, "zebra": 100 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.35.2", "type_vocab_size": 2, "vocab_size": 30522 }