Commit ·
2ab3e8a
1
Parent(s): ca39bed
Update model
Browse files- config.json +356 -0
- model.safetensors +1 -1
- modeling_vivqa.py +13 -10
config.json
CHANGED
|
@@ -21,8 +21,364 @@
|
|
| 21 |
"encoder_layers": 4,
|
| 22 |
"encoder_normalize_before": true,
|
| 23 |
"fsdp": false,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
"img_size": 224,
|
| 25 |
"in_chans": 3,
|
|
|
|
| 26 |
"layernorm_embedding": false,
|
| 27 |
"layernorm_eps": 1e-05,
|
| 28 |
"max_rel_pos": 0,
|
|
|
|
| 21 |
"encoder_layers": 4,
|
| 22 |
"encoder_normalize_before": true,
|
| 23 |
"fsdp": false,
|
| 24 |
+
"id2label": {
|
| 25 |
+
"0": "hai",
|
| 26 |
+
"1": "ba",
|
| 27 |
+
"2": "b\u1ed1n",
|
| 28 |
+
"3": "m\u00e0u tr\u1eafng",
|
| 29 |
+
"4": "m\u00e0u \u0111\u1ecf",
|
| 30 |
+
"5": "m\u00e0u xanh d\u01b0\u01a1ng",
|
| 31 |
+
"6": "m\u00e0u \u0111en",
|
| 32 |
+
"7": "m\u00e0u xanh l\u00e1",
|
| 33 |
+
"8": "ph\u00f2ng",
|
| 34 |
+
"9": "m\u00e0u v\u00e0ng",
|
| 35 |
+
"10": "ph\u00f2ng b\u1ebfp",
|
| 36 |
+
"11": "m\u00e0u n\u00e2u",
|
| 37 |
+
"12": "ph\u00f2ng t\u1eafm",
|
| 38 |
+
"13": "m\u00e0u cam",
|
| 39 |
+
"14": "gi\u01b0\u1eddng",
|
| 40 |
+
"15": "con m\u00e8o",
|
| 41 |
+
"16": "h\u01b0\u01a1u cao c\u1ed5",
|
| 42 |
+
"17": "m\u00e1y bay",
|
| 43 |
+
"18": "g\u01b0\u01a1ng",
|
| 44 |
+
"19": "n\u0103m",
|
| 45 |
+
"20": "con chim",
|
| 46 |
+
"21": "m\u00e0u x\u00e1m",
|
| 47 |
+
"22": "m\u00e0u t\u00eda",
|
| 48 |
+
"23": "con ch\u00f3",
|
| 49 |
+
"24": "con thuy\u1ec1n",
|
| 50 |
+
"25": "g\u1ea5u",
|
| 51 |
+
"26": "xe \u00f4 t\u00f4",
|
| 52 |
+
"27": "l\u1ecd c\u1eafm hoa",
|
| 53 |
+
"28": "con voi",
|
| 54 |
+
"29": "m\u1ed9t",
|
| 55 |
+
"30": "con ng\u1ef1a",
|
| 56 |
+
"31": "c\u00e1i gh\u1ebf",
|
| 57 |
+
"32": "xe m\u00e1y",
|
| 58 |
+
"33": "xe t\u1ea3i",
|
| 59 |
+
"34": "t\u00e0u h\u1ecfa",
|
| 60 |
+
"35": "xe bu\u00fdt",
|
| 61 |
+
"36": "\u0111\u01b0\u1eddng ph\u1ed1",
|
| 62 |
+
"37": "ch\u1eadu",
|
| 63 |
+
"38": "h\u1ed9p",
|
| 64 |
+
"39": "b\u00e1t",
|
| 65 |
+
"40": "pizza",
|
| 66 |
+
"41": "xe \u0111\u1ea1p",
|
| 67 |
+
"42": "chu\u1ed3ng",
|
| 68 |
+
"43": "con b\u00f2",
|
| 69 |
+
"44": "vali",
|
| 70 |
+
"45": "b\u00e1nh",
|
| 71 |
+
"46": "\u0111\u1ed3ng h\u1ed3",
|
| 72 |
+
"47": "s\u00e1u",
|
| 73 |
+
"48": "di\u1ec1u",
|
| 74 |
+
"49": "b\u0103ng gh\u1ebf",
|
| 75 |
+
"50": "donut",
|
| 76 |
+
"51": "nh\u00e0 v\u1ec7 sinh",
|
| 77 |
+
"52": "l\u00f2 vi s\u00f3ng",
|
| 78 |
+
"53": "sandwich",
|
| 79 |
+
"54": "ng\u1ef1a v\u1eb1n",
|
| 80 |
+
"55": "tr\u1ea1m",
|
| 81 |
+
"56": "chi\u1ebfc \u00f4",
|
| 82 |
+
"57": "ph\u00f2ng ng\u1ee7",
|
| 83 |
+
"58": "ng\u1ef1a r\u1eb1n",
|
| 84 |
+
"59": "\u0111\u0129a \u0103n",
|
| 85 |
+
"60": "v\u00f2i",
|
| 86 |
+
"61": "\u0111i\u1ec7n tho\u1ea1i",
|
| 87 |
+
"62": "con c\u1eebu",
|
| 88 |
+
"63": "t\u00f2a nh\u00e0",
|
| 89 |
+
"64": "v\u00e1n tr\u01b0\u1ee3t",
|
| 90 |
+
"65": "c\u1eeda s\u1ed5",
|
| 91 |
+
"66": "c\u1eeda h\u00e0ng",
|
| 92 |
+
"67": "t\u00f2a th\u00e1p",
|
| 93 |
+
"68": "b\u1ed3n t\u1eafm",
|
| 94 |
+
"69": "c\u00e1i r\u1ed5",
|
| 95 |
+
"70": "c\u00e2y",
|
| 96 |
+
"71": "m\u00e1y vi t\u00ednh",
|
| 97 |
+
"72": "qu\u00e1n \u0103n",
|
| 98 |
+
"73": "ga ra",
|
| 99 |
+
"74": "ch\u1ea3o",
|
| 100 |
+
"75": "v\u01b0\u1eddn b\u00e1ch th\u00fa",
|
| 101 |
+
"76": "nh\u00e0 \u1edf",
|
| 102 |
+
"77": "xe \u0111\u1ea9y",
|
| 103 |
+
"78": "laptop",
|
| 104 |
+
"79": "xe l\u1eeda",
|
| 105 |
+
"80": "b\u00f4ng hoa",
|
| 106 |
+
"81": "v\u00e1n l\u01b0\u1edbt s\u00f3ng",
|
| 107 |
+
"82": "c\u00e1i t\u00fai",
|
| 108 |
+
"83": "t\u1ee7 \u0111\u00e1",
|
| 109 |
+
"84": "qu\u1ea3 b\u00f3ng",
|
| 110 |
+
"85": "chu\u1ed1i",
|
| 111 |
+
"86": "s\u00e2n bay",
|
| 112 |
+
"87": "v\u0103n ph\u00f2ng",
|
| 113 |
+
"88": "th\u00f9ng ch\u1ee9a",
|
| 114 |
+
"89": "n\u00fai",
|
| 115 |
+
"90": "c\u00e1i b\u00e0n",
|
| 116 |
+
"91": "tr\u01b0\u1ee3t tuy\u1ebft",
|
| 117 |
+
"92": "c\u00e0 v\u1ea1t",
|
| 118 |
+
"93": "h\u1ed3 b\u01a1i",
|
| 119 |
+
"94": "b\u00e3i c\u1ecf",
|
| 120 |
+
"95": "b\u1ea3y",
|
| 121 |
+
"96": "m\u00f3n \u0103n",
|
| 122 |
+
"97": "\u0111\u01b0\u1eddng b\u1ed9",
|
| 123 |
+
"98": "xe",
|
| 124 |
+
"99": "n\u00f3n",
|
| 125 |
+
"100": "\u0111\u1ed9ng c\u01a1",
|
| 126 |
+
"101": "c\u00e1i m\u00e2m",
|
| 127 |
+
"102": "g\u1eady",
|
| 128 |
+
"103": "g\u1ea5u tr\u00fac",
|
| 129 |
+
"104": "c\u1eeda ti\u1ec7m",
|
| 130 |
+
"105": "con v\u1ecbt",
|
| 131 |
+
"106": "l\u1ed3ng",
|
| 132 |
+
"107": "t\u01b0\u1eddng",
|
| 133 |
+
"108": "c\u00e1i n\u1ed3i",
|
| 134 |
+
"109": "t\u1ee7 l\u1ea1nh",
|
| 135 |
+
"110": "c\u1eeda",
|
| 136 |
+
"111": "t\u00e1ch",
|
| 137 |
+
"112": "b\u1ee9c \u1ea3nh",
|
| 138 |
+
"113": "s\u00e2n v\u01b0\u1eddn",
|
| 139 |
+
"114": "\u0111\u1ed3i",
|
| 140 |
+
"115": "b\u1eefa \u0103n",
|
| 141 |
+
"116": "s\u00e2n v\u1eadn \u0111\u1ed9ng",
|
| 142 |
+
"117": "d\u0129a nh\u1ef1a",
|
| 143 |
+
"118": "ph\u01b0\u01a1ng ti\u1ec7n giao th\u00f4ng",
|
| 144 |
+
"119": "m\u00e1y xay",
|
| 145 |
+
"120": "\u0111\u1ed3 ch\u01a1i",
|
| 146 |
+
"121": "m\u0169",
|
| 147 |
+
"122": "rau",
|
| 148 |
+
"123": "\u00e1o vest",
|
| 149 |
+
"124": "v\u00f2i hoa sen",
|
| 150 |
+
"125": "b\u00e0n ch\u1ea3i",
|
| 151 |
+
"126": "c\u00e1i k\u1ec7",
|
| 152 |
+
"127": "\u0111\u01b0\u1eddng",
|
| 153 |
+
"128": "xe l\u0103n",
|
| 154 |
+
"129": "c\u00e0 r\u1ed1t",
|
| 155 |
+
"130": "xe c\u1ed9",
|
| 156 |
+
"131": "th\u00e2n c\u00e2y",
|
| 157 |
+
"132": "m\u00e1y \u1ea3nh",
|
| 158 |
+
"133": "chai",
|
| 159 |
+
"134": "\u00f4 c\u1eeda",
|
| 160 |
+
"135": "s\u00e2n",
|
| 161 |
+
"136": "b\u1ebfn du thuy\u1ec1n",
|
| 162 |
+
"137": "dao",
|
| 163 |
+
"138": "xe tay ga",
|
| 164 |
+
"139": "qu\u00e1n bar",
|
| 165 |
+
"140": "th\u01b0 vi\u1ec7n",
|
| 166 |
+
"141": "h\u00e0nh l\u00fd",
|
| 167 |
+
"142": "b\u1edd bi\u1ec3n",
|
| 168 |
+
"143": "t\u00e1m",
|
| 169 |
+
"144": "c\u00e1i l\u1ecd",
|
| 170 |
+
"145": "m\u1eb7t tr\u1eddi",
|
| 171 |
+
"146": "\u00e1o s\u01a1 mi",
|
| 172 |
+
"147": "qu\u1ea7y t\u00ednh ti\u1ec1n",
|
| 173 |
+
"148": "\u0111\u01b0\u1eddng s\u1eaft",
|
| 174 |
+
"149": "b\u1ea7u tr\u1eddi",
|
| 175 |
+
"150": "chu\u1ed9t",
|
| 176 |
+
"151": "r\u00e0o ch\u1eafn",
|
| 177 |
+
"152": "\u1ea3nh ch\u1ee5p",
|
| 178 |
+
"153": "balo",
|
| 179 |
+
"154": "b\u1ea3o t\u00e0ng",
|
| 180 |
+
"155": "qu\u1ea3 t\u00e1o",
|
| 181 |
+
"156": "hoa qu\u1ea3",
|
| 182 |
+
"157": "b\u1ee9c t\u01b0\u1ee3ng",
|
| 183 |
+
"158": "m\u00e1y t\u00ednh",
|
| 184 |
+
"159": "c\u00e1c t\u00f2a nh\u00e0",
|
| 185 |
+
"160": "ch\u00e9n \u0111\u0129a",
|
| 186 |
+
"161": "m\u01b0\u1eddi",
|
| 187 |
+
"162": "ch\u00edn",
|
| 188 |
+
"163": "gi\u1ea5y b\u1ea1c",
|
| 189 |
+
"164": "s\u00e0n nh\u00e0",
|
| 190 |
+
"165": "chu\u1ed3ng tr\u1ea1i",
|
| 191 |
+
"166": "l\u1edbp h\u1ecdc",
|
| 192 |
+
"167": "kho",
|
| 193 |
+
"168": "b\u1ebfp",
|
| 194 |
+
"169": "b\u1ea3ng",
|
| 195 |
+
"170": "gia s\u00fac",
|
| 196 |
+
"171": "th\u1ecbt",
|
| 197 |
+
"172": "b\u1ed3n ti\u1ec3u",
|
| 198 |
+
"173": "t\u1ea1p d\u1ec1",
|
| 199 |
+
"174": "c\u00e1i l\u1ec1u",
|
| 200 |
+
"175": "g\u0103ng tay",
|
| 201 |
+
"176": "h\u00e0nh lang",
|
| 202 |
+
"177": "l\u00e1",
|
| 203 |
+
"178": "t\u00fai",
|
| 204 |
+
"179": "h\u1ea3i \u00e2u",
|
| 205 |
+
"180": "v\u1ee3t",
|
| 206 |
+
"181": "b\u00e0n ph\u00edm",
|
| 207 |
+
"182": "s\u00f4 c\u00f4 la",
|
| 208 |
+
"183": "r\u01b0\u1ee3u",
|
| 209 |
+
"184": "t\u00e1o",
|
| 210 |
+
"185": "gian h\u00e0ng",
|
| 211 |
+
"186": "xe \u0111i\u1ec7n ng\u1ea7m",
|
| 212 |
+
"187": "m\u00e1y s\u1ea5y kh\u00f4",
|
| 213 |
+
"188": "toa xe",
|
| 214 |
+
"189": "trang thi\u1ebft b\u1ecb",
|
| 215 |
+
"190": "c\u1ed7 m\u00e1y",
|
| 216 |
+
"191": "n\u01b0\u1edbc",
|
| 217 |
+
"192": "c\u00e2y k\u00e9o",
|
| 218 |
+
"193": "ng\u0103n k\u00e9o",
|
| 219 |
+
"194": "v\u1ea1ch k\u1ebb \u0111\u01b0\u1eddng",
|
| 220 |
+
"195": "b\u00e1nh ng\u1ecdt",
|
| 221 |
+
"196": "l\u1ed1i \u0111i",
|
| 222 |
+
"197": "t\u00e0u",
|
| 223 |
+
"198": "\u0111\u01b0\u1eddng \u0111i b\u1ed9",
|
| 224 |
+
"199": "d\u0129a",
|
| 225 |
+
"200": "con v\u1eb9t",
|
| 226 |
+
"201": "l\u00e1 c\u1edd",
|
| 227 |
+
"202": "kh\u0103n",
|
| 228 |
+
"203": "chung c\u01b0",
|
| 229 |
+
"204": "h\u1ed3",
|
| 230 |
+
"205": "ca n\u00f4",
|
| 231 |
+
"206": "gi\u00e1 \u0111\u1ee1",
|
| 232 |
+
"207": "nh\u1eefng qu\u1ea3 cam",
|
| 233 |
+
"208": "b\u1eefa tr\u01b0a",
|
| 234 |
+
"209": "k\u00ednh \u0111eo",
|
| 235 |
+
"210": "cupcake",
|
| 236 |
+
"211": "\u0111\u01b0\u1eddng ray",
|
| 237 |
+
"212": "b\u1ed9 \u0111\u1ed3",
|
| 238 |
+
"213": "h\u00e0ng ho\u00e1",
|
| 239 |
+
"214": "nh\u1eefng b\u1ee9c \u1ea3nh",
|
| 240 |
+
"215": "c\u00e1i v\u00ed",
|
| 241 |
+
"216": "c\u1eebu",
|
| 242 |
+
"217": "ng\u01b0\u1eddi gi\u1eef",
|
| 243 |
+
"218": "b\u1ee9c tranh",
|
| 244 |
+
"219": "c\u1ea7u",
|
| 245 |
+
"220": "nhi\u1ec1u c\u00e1i gh\u1ebf",
|
| 246 |
+
"221": "b\u00f4ng c\u1ea3i xanh",
|
| 247 |
+
"222": "b\u1eefa \u0103n t\u1ed1i",
|
| 248 |
+
"223": "v\u1ebd tranh l\u00ean t\u01b0\u1eddng",
|
| 249 |
+
"224": "thuy\u1ec1n bu\u1ed3m",
|
| 250 |
+
"225": "\u0111i v\u0103ng",
|
| 251 |
+
"226": "s\u00e2n kh\u1ea5u",
|
| 252 |
+
"227": "n\u1ebfn",
|
| 253 |
+
"228": "bu\u1ed3ng",
|
| 254 |
+
"229": "c\u00e1i th\u00eca",
|
| 255 |
+
"230": "c\u1ecf kh\u00f4",
|
| 256 |
+
"231": "con kh\u1ec9",
|
| 257 |
+
"232": "t\u01b0\u1ee3ng \u0111\u00e0i",
|
| 258 |
+
"233": "t\u1ee7 \u0111\u00f4ng",
|
| 259 |
+
"234": "hoa h\u1ed3ng",
|
| 260 |
+
"235": "chim b\u1ed3 c\u00e2u",
|
| 261 |
+
"236": "hay",
|
| 262 |
+
"237": "g\u1ea7u m\u00fac",
|
| 263 |
+
"238": "b\u00fai t\u00f3c",
|
| 264 |
+
"239": "m\u00f3ng vu\u1ed1t",
|
| 265 |
+
"240": "xe \u0111i\u1ec7n",
|
| 266 |
+
"241": "\u0111\u0129a",
|
| 267 |
+
"242": "m\u00e0n",
|
| 268 |
+
"243": "\u00e1o kho\u00e1c",
|
| 269 |
+
"244": "m\u1eb7t n\u1ea1",
|
| 270 |
+
"245": "\u0111\u1ed3 u\u1ed1ng",
|
| 271 |
+
"246": "b\u00f2 \u0111\u1ef1c",
|
| 272 |
+
"247": "c\u00e1i n\u0129a",
|
| 273 |
+
"248": "\u0111\u01b0\u1eddng \u1ed1ng",
|
| 274 |
+
"249": "n\u01b0\u1edbc ti\u1ec3u",
|
| 275 |
+
"250": "ly",
|
| 276 |
+
"251": "\u0111\u00e8n \u0111\u1ec3 b\u00e0n",
|
| 277 |
+
"252": "\u0111\u1ed3 n\u1ed9i th\u1ea5t",
|
| 278 |
+
"253": "m\u00e1i ch\u00e8o",
|
| 279 |
+
"254": "\u0111\u1ea7u m\u00e1y",
|
| 280 |
+
"255": "\u0111\u1ea7m",
|
| 281 |
+
"256": "m\u0169 l\u01b0\u1ee1i trai",
|
| 282 |
+
"257": "truy\u1ec1n h\u00ecnh",
|
| 283 |
+
"258": "ph\u00f4 mai",
|
| 284 |
+
"259": "c\u00e0 ph\u00ea",
|
| 285 |
+
"260": "b\u1ebfn t\u00e0u",
|
| 286 |
+
"261": "con d\u00ea",
|
| 287 |
+
"262": "c\u1eeda ra v\u00e0o",
|
| 288 |
+
"263": "k\u00fd t\u00ean",
|
| 289 |
+
"264": "thi\u1ebft b\u1ecb",
|
| 290 |
+
"265": "b\u00ecnh hoa",
|
| 291 |
+
"266": "bia",
|
| 292 |
+
"267": "con d\u1ed1c",
|
| 293 |
+
"268": "\u00e1o cho\u00e0ng",
|
| 294 |
+
"269": "m\u00f3n tr\u00e1ng mi\u1ec7ng",
|
| 295 |
+
"270": "c\u00e2y s\u00e0o",
|
| 296 |
+
"271": "thu\u1ed1c l\u00e1",
|
| 297 |
+
"272": "m\u1eb7t",
|
| 298 |
+
"273": "k\u00ednh r\u00e2m",
|
| 299 |
+
"274": "\u0111i\u00eau kh\u1eafc",
|
| 300 |
+
"275": "nh\u00e0",
|
| 301 |
+
"276": "rau qu\u1ea3",
|
| 302 |
+
"277": "tr\u00e1i c\u00e2y",
|
| 303 |
+
"278": "qu\u1ea3 cam",
|
| 304 |
+
"279": "\u0111\u0129a n\u00e9m",
|
| 305 |
+
"280": "ba lan",
|
| 306 |
+
"281": "c\u00e2y g\u1eady",
|
| 307 |
+
"282": "s\u1eefa",
|
| 308 |
+
"283": "h\u1ed9p \u0111\u1ef1ng",
|
| 309 |
+
"284": "khung",
|
| 310 |
+
"285": "ngo\u00e0i tr\u1eddi",
|
| 311 |
+
"286": "\u0111o\u1ea1n phim gi\u1edbi thi\u1ec7u",
|
| 312 |
+
"287": "c\u1edd",
|
| 313 |
+
"288": "th\u00f9ng",
|
| 314 |
+
"289": "l\u00f2 s\u01b0\u1edfi",
|
| 315 |
+
"290": "l\u00e1t c\u1eaft",
|
| 316 |
+
"291": "b\u1eafp ch\u00e2n",
|
| 317 |
+
"292": "c\u00fan y\u00eau",
|
| 318 |
+
"293": "ng\u00e2n h\u00e0ng",
|
| 319 |
+
"294": "rau x\u00e0 l\u00e1ch",
|
| 320 |
+
"295": "xa l\u1ed9",
|
| 321 |
+
"296": "g\u00e0",
|
| 322 |
+
"297": "qu\u1ea7n short",
|
| 323 |
+
"298": "v\u00f2i n\u01b0\u1edbc",
|
| 324 |
+
"299": "m\u0169 b\u1ea3o hi\u1ec3m",
|
| 325 |
+
"300": "c\u00f4ng c\u1ee5",
|
| 326 |
+
"301": "qu\u1ea3 cam ",
|
| 327 |
+
"302": "v\u00e1n tr\u01b0\u1ee3t tuy\u1ebft",
|
| 328 |
+
"303": "g\u1ea1ch",
|
| 329 |
+
"304": "ch\u00ecm xu\u1ed1ng",
|
| 330 |
+
"305": "kh\u0103n t\u1eafm",
|
| 331 |
+
"306": "l\u00e1t g\u1ea1ch",
|
| 332 |
+
"307": "ng\u0103n",
|
| 333 |
+
"308": "b\u1ea3ng hi\u1ec7u",
|
| 334 |
+
"309": "l\u0103n tr\u00f2n",
|
| 335 |
+
"310": "hotdog",
|
| 336 |
+
"311": "c\u1ecf",
|
| 337 |
+
"312": "b\u00ecnh",
|
| 338 |
+
"313": "b\u00ean",
|
| 339 |
+
"314": "t\u00e0u ho\u1ea3",
|
| 340 |
+
"315": "b\u00e1nh xe",
|
| 341 |
+
"316": "lon",
|
| 342 |
+
"317": "nh\u00e0 t\u1eafm",
|
| 343 |
+
"318": "\u0111\u01b0\u1eddng \u0111ua",
|
| 344 |
+
"319": "m\u00e0u s\u1eafc",
|
| 345 |
+
"320": "bao b\u00ec",
|
| 346 |
+
"321": "th\u00e0nh ph\u1ea7n",
|
| 347 |
+
"322": "chim \u01b0ng",
|
| 348 |
+
"323": "\u0111i\u1ec3m t\u00e2m",
|
| 349 |
+
"324": "d\u0129a ",
|
| 350 |
+
"325": "b\u00e0n ch\u1ea3i \u0111\u00e1nh r\u0103ng",
|
| 351 |
+
"326": "h\u00e0ng h\u00f3a",
|
| 352 |
+
"327": "pug",
|
| 353 |
+
"328": "h\u1ed9p s\u1ed1",
|
| 354 |
+
"329": "c\u00e1",
|
| 355 |
+
"330": "gi\u1ecf",
|
| 356 |
+
"331": "gh\u1ebf s\u00f4 pha",
|
| 357 |
+
"332": "qu\u1ea7n \u00e1o",
|
| 358 |
+
"333": "tr\u01b0\u1eddng h\u1ee3p",
|
| 359 |
+
"334": "b\u00f2",
|
| 360 |
+
"335": "v\u00f4 tuy\u1ebfn",
|
| 361 |
+
"336": "con thoi",
|
| 362 |
+
"337": "theo d\u00f5i",
|
| 363 |
+
"338": "\u00e1o ba l\u1ed7",
|
| 364 |
+
"339": "d\u00f2ng s\u00f4ng",
|
| 365 |
+
"340": "g\u00e0 t\u00e2y",
|
| 366 |
+
"341": "d\u1ea5u hi\u1ec7u",
|
| 367 |
+
"342": "m\u00e8o con",
|
| 368 |
+
"343": "m\u1eaft",
|
| 369 |
+
"344": "\u0111\u01b0a \u0111\u00f3n",
|
| 370 |
+
"345": "con heo",
|
| 371 |
+
"346": "ngo\u00e0i",
|
| 372 |
+
"347": "\u0111\u1ed3ng ph\u1ee5c",
|
| 373 |
+
"348": "m\u00e1y bay tr\u1ef1c th\u0103ng",
|
| 374 |
+
"349": "\u0111\u1ea1i d\u01b0\u01a1ng",
|
| 375 |
+
"350": "b\u1ee9c m\u00e0n",
|
| 376 |
+
"351": "cam",
|
| 377 |
+
"352": "b\u00e1nh hamburger"
|
| 378 |
+
},
|
| 379 |
"img_size": 224,
|
| 380 |
"in_chans": 3,
|
| 381 |
+
"label2id": null,
|
| 382 |
"layernorm_embedding": false,
|
| 383 |
"layernorm_eps": 1e-05,
|
| 384 |
"max_rel_pos": 0,
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4911305908
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:350969d1b1809558a103e887928ed65b68950e1d33edaff47e549c923b5b7691
|
| 3 |
size 4911305908
|
modeling_vivqa.py
CHANGED
|
@@ -38,10 +38,12 @@ class Blip2EfficientExtractor(nn.Module):
|
|
| 38 |
|
| 39 |
# Efficientnet
|
| 40 |
self.model_efficient = EfficientNet.from_pretrained('efficientnet-b7').to(self.device)
|
|
|
|
| 41 |
self.pooling1 = nn.AdaptiveAvgPool2d((1, 32))
|
| 42 |
self.pooling2 = nn.AdaptiveAvgPool2d((1, 768))
|
| 43 |
|
| 44 |
-
def forward(self, images):
|
|
|
|
| 45 |
global_features = self.model_blip2.extract_features(samples={"image": images}, mode="image").image_embeds
|
| 46 |
|
| 47 |
local_features = self.model_efficient.extract_features(images)
|
|
@@ -111,18 +113,19 @@ class ViVQABEiT3(PreTrainedModel):
|
|
| 111 |
x1 = self.vision_embed(visual_tokens)
|
| 112 |
multiway_split_position = x1.size(1)
|
| 113 |
|
| 114 |
-
x2 = self.text_embed(textual_tokens, text_padding_position)
|
| 115 |
x2 = self.linear(x2)
|
| 116 |
|
| 117 |
x = torch.cat([x1, x2], dim=1)
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
|
|
|
| 126 |
encoder_out = self.encoder(
|
| 127 |
src_tokens=None,
|
| 128 |
encoder_padding_mask=encoder_padding_mask,
|
|
|
|
| 38 |
|
| 39 |
# Efficientnet
|
| 40 |
self.model_efficient = EfficientNet.from_pretrained('efficientnet-b7').to(self.device)
|
| 41 |
+
self.model_efficient.eval()
|
| 42 |
self.pooling1 = nn.AdaptiveAvgPool2d((1, 32))
|
| 43 |
self.pooling2 = nn.AdaptiveAvgPool2d((1, 768))
|
| 44 |
|
| 45 |
+
def forward(self, images):
|
| 46 |
+
|
| 47 |
global_features = self.model_blip2.extract_features(samples={"image": images}, mode="image").image_embeds
|
| 48 |
|
| 49 |
local_features = self.model_efficient.extract_features(images)
|
|
|
|
| 113 |
x1 = self.vision_embed(visual_tokens)
|
| 114 |
multiway_split_position = x1.size(1)
|
| 115 |
|
| 116 |
+
x2 = self.text_embed(textual_tokens, 1-text_padding_position)
|
| 117 |
x2 = self.linear(x2)
|
| 118 |
|
| 119 |
x = torch.cat([x1, x2], dim=1)
|
| 120 |
+
|
| 121 |
+
encoder_padding_mask = torch.cat(
|
| 122 |
+
[
|
| 123 |
+
torch.zeros(x1.shape[:-1]).to(x1.device).bool(),
|
| 124 |
+
text_padding_position,
|
| 125 |
+
],
|
| 126 |
+
dim=1,
|
| 127 |
+
)
|
| 128 |
+
|
| 129 |
encoder_out = self.encoder(
|
| 130 |
src_tokens=None,
|
| 131 |
encoder_padding_mask=encoder_padding_mask,
|