huiwon committed
Commit 5724a75 · verified · 1 Parent(s): 59f3cb9

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ trainer_state.json filter=lfs diff=lfs merge=lfs -text
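
Note: the added_tokens.json diff below registers think/tool-call markers and a large block of `<|action_N|>` tokens with contiguous IDs above the base vocabulary. A minimal sketch of how such entries are typically produced with the Hugging Face `transformers` API is shown here; the base checkpoint name and the exact action count are assumptions for illustration, not taken from this commit.

```python
# Sketch: add action/tool special tokens to a tokenizer, then save it so the
# new entries land in added_tokens.json / tokenizer.json (what this commit uploads).
from transformers import AutoTokenizer

base_model = "Qwen/Qwen2.5-7B-Instruct"  # assumption: any base checkpoint with a compatible vocab
tokenizer = AutoTokenizer.from_pretrained(base_model)

special = ["<think>", "</think>", "<tool_call>", "</tool_call>",
           "<tool_response>", "</tool_response>"]
actions = [f"<|action_{i}|>" for i in range(2048)]  # count inferred from the diff below

# Tokens already present in the base vocab are skipped; new ones get contiguous IDs.
num_added = tokenizer.add_tokens(special + actions, special_tokens=True)
print(f"added {num_added} tokens, vocab size is now {len(tokenizer)}")

# If fine-tuning, the model's embedding matrix must be resized to len(tokenizer)
# via model.resize_token_embeddings(len(tokenizer)) before training.
tokenizer.save_pretrained("./tokenizer_with_actions")
```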
added_tokens.json ADDED
@@ -0,0 +1,2079 @@
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|action_0|>": 151672,
9
+ "<|action_1000|>": 152672,
10
+ "<|action_1001|>": 152673,
11
+ "<|action_1002|>": 152674,
12
+ "<|action_1003|>": 152675,
13
+ "<|action_1004|>": 152676,
14
+ "<|action_1005|>": 152677,
15
+ "<|action_1006|>": 152678,
16
+ "<|action_1007|>": 152679,
17
+ "<|action_1008|>": 152680,
18
+ "<|action_1009|>": 152681,
19
+ "<|action_100|>": 151772,
20
+ "<|action_1010|>": 152682,
21
+ "<|action_1011|>": 152683,
22
+ "<|action_1012|>": 152684,
23
+ "<|action_1013|>": 152685,
24
+ "<|action_1014|>": 152686,
25
+ "<|action_1015|>": 152687,
26
+ "<|action_1016|>": 152688,
27
+ "<|action_1017|>": 152689,
28
+ "<|action_1018|>": 152690,
29
+ "<|action_1019|>": 152691,
30
+ "<|action_101|>": 151773,
31
+ "<|action_1020|>": 152692,
32
+ "<|action_1021|>": 152693,
33
+ "<|action_1022|>": 152694,
34
+ "<|action_1023|>": 152695,
35
+ "<|action_1024|>": 152696,
36
+ "<|action_1025|>": 152697,
37
+ "<|action_1026|>": 152698,
38
+ "<|action_1027|>": 152699,
39
+ "<|action_1028|>": 152700,
40
+ "<|action_1029|>": 152701,
41
+ "<|action_102|>": 151774,
42
+ "<|action_1030|>": 152702,
43
+ "<|action_1031|>": 152703,
44
+ "<|action_1032|>": 152704,
45
+ "<|action_1033|>": 152705,
46
+ "<|action_1034|>": 152706,
47
+ "<|action_1035|>": 152707,
48
+ "<|action_1036|>": 152708,
49
+ "<|action_1037|>": 152709,
50
+ "<|action_1038|>": 152710,
51
+ "<|action_1039|>": 152711,
52
+ "<|action_103|>": 151775,
53
+ "<|action_1040|>": 152712,
54
+ "<|action_1041|>": 152713,
55
+ "<|action_1042|>": 152714,
56
+ "<|action_1043|>": 152715,
57
+ "<|action_1044|>": 152716,
58
+ "<|action_1045|>": 152717,
59
+ "<|action_1046|>": 152718,
60
+ "<|action_1047|>": 152719,
61
+ "<|action_1048|>": 152720,
62
+ "<|action_1049|>": 152721,
63
+ "<|action_104|>": 151776,
64
+ "<|action_1050|>": 152722,
65
+ "<|action_1051|>": 152723,
66
+ "<|action_1052|>": 152724,
67
+ "<|action_1053|>": 152725,
68
+ "<|action_1054|>": 152726,
69
+ "<|action_1055|>": 152727,
70
+ "<|action_1056|>": 152728,
71
+ "<|action_1057|>": 152729,
72
+ "<|action_1058|>": 152730,
73
+ "<|action_1059|>": 152731,
74
+ "<|action_105|>": 151777,
75
+ "<|action_1060|>": 152732,
76
+ "<|action_1061|>": 152733,
77
+ "<|action_1062|>": 152734,
78
+ "<|action_1063|>": 152735,
79
+ "<|action_1064|>": 152736,
80
+ "<|action_1065|>": 152737,
81
+ "<|action_1066|>": 152738,
82
+ "<|action_1067|>": 152739,
83
+ "<|action_1068|>": 152740,
84
+ "<|action_1069|>": 152741,
85
+ "<|action_106|>": 151778,
86
+ "<|action_1070|>": 152742,
87
+ "<|action_1071|>": 152743,
88
+ "<|action_1072|>": 152744,
89
+ "<|action_1073|>": 152745,
90
+ "<|action_1074|>": 152746,
91
+ "<|action_1075|>": 152747,
92
+ "<|action_1076|>": 152748,
93
+ "<|action_1077|>": 152749,
94
+ "<|action_1078|>": 152750,
95
+ "<|action_1079|>": 152751,
96
+ "<|action_107|>": 151779,
97
+ "<|action_1080|>": 152752,
98
+ "<|action_1081|>": 152753,
99
+ "<|action_1082|>": 152754,
100
+ "<|action_1083|>": 152755,
101
+ "<|action_1084|>": 152756,
102
+ "<|action_1085|>": 152757,
103
+ "<|action_1086|>": 152758,
104
+ "<|action_1087|>": 152759,
105
+ "<|action_1088|>": 152760,
106
+ "<|action_1089|>": 152761,
107
+ "<|action_108|>": 151780,
108
+ "<|action_1090|>": 152762,
109
+ "<|action_1091|>": 152763,
110
+ "<|action_1092|>": 152764,
111
+ "<|action_1093|>": 152765,
112
+ "<|action_1094|>": 152766,
113
+ "<|action_1095|>": 152767,
114
+ "<|action_1096|>": 152768,
115
+ "<|action_1097|>": 152769,
116
+ "<|action_1098|>": 152770,
117
+ "<|action_1099|>": 152771,
118
+ "<|action_109|>": 151781,
119
+ "<|action_10|>": 151682,
120
+ "<|action_1100|>": 152772,
121
+ "<|action_1101|>": 152773,
122
+ "<|action_1102|>": 152774,
123
+ "<|action_1103|>": 152775,
124
+ "<|action_1104|>": 152776,
125
+ "<|action_1105|>": 152777,
126
+ "<|action_1106|>": 152778,
127
+ "<|action_1107|>": 152779,
128
+ "<|action_1108|>": 152780,
129
+ "<|action_1109|>": 152781,
130
+ "<|action_110|>": 151782,
131
+ "<|action_1110|>": 152782,
132
+ "<|action_1111|>": 152783,
133
+ "<|action_1112|>": 152784,
134
+ "<|action_1113|>": 152785,
135
+ "<|action_1114|>": 152786,
136
+ "<|action_1115|>": 152787,
137
+ "<|action_1116|>": 152788,
138
+ "<|action_1117|>": 152789,
139
+ "<|action_1118|>": 152790,
140
+ "<|action_1119|>": 152791,
141
+ "<|action_111|>": 151783,
142
+ "<|action_1120|>": 152792,
143
+ "<|action_1121|>": 152793,
144
+ "<|action_1122|>": 152794,
145
+ "<|action_1123|>": 152795,
146
+ "<|action_1124|>": 152796,
147
+ "<|action_1125|>": 152797,
148
+ "<|action_1126|>": 152798,
149
+ "<|action_1127|>": 152799,
150
+ "<|action_1128|>": 152800,
151
+ "<|action_1129|>": 152801,
152
+ "<|action_112|>": 151784,
153
+ "<|action_1130|>": 152802,
154
+ "<|action_1131|>": 152803,
155
+ "<|action_1132|>": 152804,
156
+ "<|action_1133|>": 152805,
157
+ "<|action_1134|>": 152806,
158
+ "<|action_1135|>": 152807,
159
+ "<|action_1136|>": 152808,
160
+ "<|action_1137|>": 152809,
161
+ "<|action_1138|>": 152810,
162
+ "<|action_1139|>": 152811,
163
+ "<|action_113|>": 151785,
164
+ "<|action_1140|>": 152812,
165
+ "<|action_1141|>": 152813,
166
+ "<|action_1142|>": 152814,
167
+ "<|action_1143|>": 152815,
168
+ "<|action_1144|>": 152816,
169
+ "<|action_1145|>": 152817,
170
+ "<|action_1146|>": 152818,
171
+ "<|action_1147|>": 152819,
172
+ "<|action_1148|>": 152820,
173
+ "<|action_1149|>": 152821,
174
+ "<|action_114|>": 151786,
175
+ "<|action_1150|>": 152822,
176
+ "<|action_1151|>": 152823,
177
+ "<|action_1152|>": 152824,
178
+ "<|action_1153|>": 152825,
179
+ "<|action_1154|>": 152826,
180
+ "<|action_1155|>": 152827,
181
+ "<|action_1156|>": 152828,
182
+ "<|action_1157|>": 152829,
183
+ "<|action_1158|>": 152830,
184
+ "<|action_1159|>": 152831,
185
+ "<|action_115|>": 151787,
186
+ "<|action_1160|>": 152832,
187
+ "<|action_1161|>": 152833,
188
+ "<|action_1162|>": 152834,
189
+ "<|action_1163|>": 152835,
190
+ "<|action_1164|>": 152836,
191
+ "<|action_1165|>": 152837,
192
+ "<|action_1166|>": 152838,
193
+ "<|action_1167|>": 152839,
194
+ "<|action_1168|>": 152840,
195
+ "<|action_1169|>": 152841,
196
+ "<|action_116|>": 151788,
197
+ "<|action_1170|>": 152842,
198
+ "<|action_1171|>": 152843,
199
+ "<|action_1172|>": 152844,
200
+ "<|action_1173|>": 152845,
201
+ "<|action_1174|>": 152846,
202
+ "<|action_1175|>": 152847,
203
+ "<|action_1176|>": 152848,
204
+ "<|action_1177|>": 152849,
205
+ "<|action_1178|>": 152850,
206
+ "<|action_1179|>": 152851,
207
+ "<|action_117|>": 151789,
208
+ "<|action_1180|>": 152852,
209
+ "<|action_1181|>": 152853,
210
+ "<|action_1182|>": 152854,
211
+ "<|action_1183|>": 152855,
212
+ "<|action_1184|>": 152856,
213
+ "<|action_1185|>": 152857,
214
+ "<|action_1186|>": 152858,
215
+ "<|action_1187|>": 152859,
216
+ "<|action_1188|>": 152860,
217
+ "<|action_1189|>": 152861,
218
+ "<|action_118|>": 151790,
219
+ "<|action_1190|>": 152862,
220
+ "<|action_1191|>": 152863,
221
+ "<|action_1192|>": 152864,
222
+ "<|action_1193|>": 152865,
223
+ "<|action_1194|>": 152866,
224
+ "<|action_1195|>": 152867,
225
+ "<|action_1196|>": 152868,
226
+ "<|action_1197|>": 152869,
227
+ "<|action_1198|>": 152870,
228
+ "<|action_1199|>": 152871,
229
+ "<|action_119|>": 151791,
230
+ "<|action_11|>": 151683,
231
+ "<|action_1200|>": 152872,
232
+ "<|action_1201|>": 152873,
233
+ "<|action_1202|>": 152874,
234
+ "<|action_1203|>": 152875,
235
+ "<|action_1204|>": 152876,
236
+ "<|action_1205|>": 152877,
237
+ "<|action_1206|>": 152878,
238
+ "<|action_1207|>": 152879,
239
+ "<|action_1208|>": 152880,
240
+ "<|action_1209|>": 152881,
241
+ "<|action_120|>": 151792,
242
+ "<|action_1210|>": 152882,
243
+ "<|action_1211|>": 152883,
244
+ "<|action_1212|>": 152884,
245
+ "<|action_1213|>": 152885,
246
+ "<|action_1214|>": 152886,
247
+ "<|action_1215|>": 152887,
248
+ "<|action_1216|>": 152888,
249
+ "<|action_1217|>": 152889,
250
+ "<|action_1218|>": 152890,
251
+ "<|action_1219|>": 152891,
252
+ "<|action_121|>": 151793,
253
+ "<|action_1220|>": 152892,
254
+ "<|action_1221|>": 152893,
255
+ "<|action_1222|>": 152894,
256
+ "<|action_1223|>": 152895,
257
+ "<|action_1224|>": 152896,
258
+ "<|action_1225|>": 152897,
259
+ "<|action_1226|>": 152898,
260
+ "<|action_1227|>": 152899,
261
+ "<|action_1228|>": 152900,
262
+ "<|action_1229|>": 152901,
263
+ "<|action_122|>": 151794,
264
+ "<|action_1230|>": 152902,
265
+ "<|action_1231|>": 152903,
266
+ "<|action_1232|>": 152904,
267
+ "<|action_1233|>": 152905,
268
+ "<|action_1234|>": 152906,
269
+ "<|action_1235|>": 152907,
270
+ "<|action_1236|>": 152908,
271
+ "<|action_1237|>": 152909,
272
+ "<|action_1238|>": 152910,
273
+ "<|action_1239|>": 152911,
274
+ "<|action_123|>": 151795,
275
+ "<|action_1240|>": 152912,
276
+ "<|action_1241|>": 152913,
277
+ "<|action_1242|>": 152914,
278
+ "<|action_1243|>": 152915,
279
+ "<|action_1244|>": 152916,
280
+ "<|action_1245|>": 152917,
281
+ "<|action_1246|>": 152918,
282
+ "<|action_1247|>": 152919,
283
+ "<|action_1248|>": 152920,
284
+ "<|action_1249|>": 152921,
285
+ "<|action_124|>": 151796,
286
+ "<|action_1250|>": 152922,
287
+ "<|action_1251|>": 152923,
288
+ "<|action_1252|>": 152924,
289
+ "<|action_1253|>": 152925,
290
+ "<|action_1254|>": 152926,
291
+ "<|action_1255|>": 152927,
292
+ "<|action_1256|>": 152928,
293
+ "<|action_1257|>": 152929,
294
+ "<|action_1258|>": 152930,
295
+ "<|action_1259|>": 152931,
296
+ "<|action_125|>": 151797,
297
+ "<|action_1260|>": 152932,
298
+ "<|action_1261|>": 152933,
299
+ "<|action_1262|>": 152934,
300
+ "<|action_1263|>": 152935,
301
+ "<|action_1264|>": 152936,
302
+ "<|action_1265|>": 152937,
303
+ "<|action_1266|>": 152938,
304
+ "<|action_1267|>": 152939,
305
+ "<|action_1268|>": 152940,
306
+ "<|action_1269|>": 152941,
307
+ "<|action_126|>": 151798,
308
+ "<|action_1270|>": 152942,
309
+ "<|action_1271|>": 152943,
310
+ "<|action_1272|>": 152944,
311
+ "<|action_1273|>": 152945,
312
+ "<|action_1274|>": 152946,
313
+ "<|action_1275|>": 152947,
314
+ "<|action_1276|>": 152948,
315
+ "<|action_1277|>": 152949,
316
+ "<|action_1278|>": 152950,
317
+ "<|action_1279|>": 152951,
318
+ "<|action_127|>": 151799,
319
+ "<|action_1280|>": 152952,
320
+ "<|action_1281|>": 152953,
321
+ "<|action_1282|>": 152954,
322
+ "<|action_1283|>": 152955,
323
+ "<|action_1284|>": 152956,
324
+ "<|action_1285|>": 152957,
325
+ "<|action_1286|>": 152958,
326
+ "<|action_1287|>": 152959,
327
+ "<|action_1288|>": 152960,
328
+ "<|action_1289|>": 152961,
329
+ "<|action_128|>": 151800,
330
+ "<|action_1290|>": 152962,
331
+ "<|action_1291|>": 152963,
332
+ "<|action_1292|>": 152964,
333
+ "<|action_1293|>": 152965,
334
+ "<|action_1294|>": 152966,
335
+ "<|action_1295|>": 152967,
336
+ "<|action_1296|>": 152968,
337
+ "<|action_1297|>": 152969,
338
+ "<|action_1298|>": 152970,
339
+ "<|action_1299|>": 152971,
340
+ "<|action_129|>": 151801,
341
+ "<|action_12|>": 151684,
342
+ "<|action_1300|>": 152972,
343
+ "<|action_1301|>": 152973,
344
+ "<|action_1302|>": 152974,
345
+ "<|action_1303|>": 152975,
346
+ "<|action_1304|>": 152976,
347
+ "<|action_1305|>": 152977,
348
+ "<|action_1306|>": 152978,
349
+ "<|action_1307|>": 152979,
350
+ "<|action_1308|>": 152980,
351
+ "<|action_1309|>": 152981,
352
+ "<|action_130|>": 151802,
353
+ "<|action_1310|>": 152982,
354
+ "<|action_1311|>": 152983,
355
+ "<|action_1312|>": 152984,
356
+ "<|action_1313|>": 152985,
357
+ "<|action_1314|>": 152986,
358
+ "<|action_1315|>": 152987,
359
+ "<|action_1316|>": 152988,
360
+ "<|action_1317|>": 152989,
361
+ "<|action_1318|>": 152990,
362
+ "<|action_1319|>": 152991,
363
+ "<|action_131|>": 151803,
364
+ "<|action_1320|>": 152992,
365
+ "<|action_1321|>": 152993,
366
+ "<|action_1322|>": 152994,
367
+ "<|action_1323|>": 152995,
368
+ "<|action_1324|>": 152996,
369
+ "<|action_1325|>": 152997,
370
+ "<|action_1326|>": 152998,
371
+ "<|action_1327|>": 152999,
372
+ "<|action_1328|>": 153000,
373
+ "<|action_1329|>": 153001,
374
+ "<|action_132|>": 151804,
375
+ "<|action_1330|>": 153002,
376
+ "<|action_1331|>": 153003,
377
+ "<|action_1332|>": 153004,
378
+ "<|action_1333|>": 153005,
379
+ "<|action_1334|>": 153006,
380
+ "<|action_1335|>": 153007,
381
+ "<|action_1336|>": 153008,
382
+ "<|action_1337|>": 153009,
383
+ "<|action_1338|>": 153010,
384
+ "<|action_1339|>": 153011,
385
+ "<|action_133|>": 151805,
386
+ "<|action_1340|>": 153012,
387
+ "<|action_1341|>": 153013,
388
+ "<|action_1342|>": 153014,
389
+ "<|action_1343|>": 153015,
390
+ "<|action_1344|>": 153016,
391
+ "<|action_1345|>": 153017,
392
+ "<|action_1346|>": 153018,
393
+ "<|action_1347|>": 153019,
394
+ "<|action_1348|>": 153020,
395
+ "<|action_1349|>": 153021,
396
+ "<|action_134|>": 151806,
397
+ "<|action_1350|>": 153022,
398
+ "<|action_1351|>": 153023,
399
+ "<|action_1352|>": 153024,
400
+ "<|action_1353|>": 153025,
401
+ "<|action_1354|>": 153026,
402
+ "<|action_1355|>": 153027,
403
+ "<|action_1356|>": 153028,
404
+ "<|action_1357|>": 153029,
405
+ "<|action_1358|>": 153030,
406
+ "<|action_1359|>": 153031,
407
+ "<|action_135|>": 151807,
408
+ "<|action_1360|>": 153032,
409
+ "<|action_1361|>": 153033,
410
+ "<|action_1362|>": 153034,
411
+ "<|action_1363|>": 153035,
412
+ "<|action_1364|>": 153036,
413
+ "<|action_1365|>": 153037,
414
+ "<|action_1366|>": 153038,
415
+ "<|action_1367|>": 153039,
416
+ "<|action_1368|>": 153040,
417
+ "<|action_1369|>": 153041,
418
+ "<|action_136|>": 151808,
419
+ "<|action_1370|>": 153042,
420
+ "<|action_1371|>": 153043,
421
+ "<|action_1372|>": 153044,
422
+ "<|action_1373|>": 153045,
423
+ "<|action_1374|>": 153046,
424
+ "<|action_1375|>": 153047,
425
+ "<|action_1376|>": 153048,
426
+ "<|action_1377|>": 153049,
427
+ "<|action_1378|>": 153050,
428
+ "<|action_1379|>": 153051,
429
+ "<|action_137|>": 151809,
430
+ "<|action_1380|>": 153052,
431
+ "<|action_1381|>": 153053,
432
+ "<|action_1382|>": 153054,
433
+ "<|action_1383|>": 153055,
434
+ "<|action_1384|>": 153056,
435
+ "<|action_1385|>": 153057,
436
+ "<|action_1386|>": 153058,
437
+ "<|action_1387|>": 153059,
438
+ "<|action_1388|>": 153060,
439
+ "<|action_1389|>": 153061,
440
+ "<|action_138|>": 151810,
441
+ "<|action_1390|>": 153062,
442
+ "<|action_1391|>": 153063,
443
+ "<|action_1392|>": 153064,
444
+ "<|action_1393|>": 153065,
445
+ "<|action_1394|>": 153066,
446
+ "<|action_1395|>": 153067,
447
+ "<|action_1396|>": 153068,
448
+ "<|action_1397|>": 153069,
449
+ "<|action_1398|>": 153070,
450
+ "<|action_1399|>": 153071,
451
+ "<|action_139|>": 151811,
452
+ "<|action_13|>": 151685,
453
+ "<|action_1400|>": 153072,
454
+ "<|action_1401|>": 153073,
455
+ "<|action_1402|>": 153074,
456
+ "<|action_1403|>": 153075,
457
+ "<|action_1404|>": 153076,
458
+ "<|action_1405|>": 153077,
459
+ "<|action_1406|>": 153078,
460
+ "<|action_1407|>": 153079,
461
+ "<|action_1408|>": 153080,
462
+ "<|action_1409|>": 153081,
463
+ "<|action_140|>": 151812,
464
+ "<|action_1410|>": 153082,
465
+ "<|action_1411|>": 153083,
466
+ "<|action_1412|>": 153084,
467
+ "<|action_1413|>": 153085,
468
+ "<|action_1414|>": 153086,
469
+ "<|action_1415|>": 153087,
470
+ "<|action_1416|>": 153088,
471
+ "<|action_1417|>": 153089,
472
+ "<|action_1418|>": 153090,
473
+ "<|action_1419|>": 153091,
474
+ "<|action_141|>": 151813,
475
+ "<|action_1420|>": 153092,
476
+ "<|action_1421|>": 153093,
477
+ "<|action_1422|>": 153094,
478
+ "<|action_1423|>": 153095,
479
+ "<|action_1424|>": 153096,
480
+ "<|action_1425|>": 153097,
481
+ "<|action_1426|>": 153098,
482
+ "<|action_1427|>": 153099,
483
+ "<|action_1428|>": 153100,
484
+ "<|action_1429|>": 153101,
485
+ "<|action_142|>": 151814,
486
+ "<|action_1430|>": 153102,
487
+ "<|action_1431|>": 153103,
488
+ "<|action_1432|>": 153104,
489
+ "<|action_1433|>": 153105,
490
+ "<|action_1434|>": 153106,
491
+ "<|action_1435|>": 153107,
492
+ "<|action_1436|>": 153108,
493
+ "<|action_1437|>": 153109,
494
+ "<|action_1438|>": 153110,
495
+ "<|action_1439|>": 153111,
496
+ "<|action_143|>": 151815,
497
+ "<|action_1440|>": 153112,
498
+ "<|action_1441|>": 153113,
499
+ "<|action_1442|>": 153114,
500
+ "<|action_1443|>": 153115,
501
+ "<|action_1444|>": 153116,
502
+ "<|action_1445|>": 153117,
503
+ "<|action_1446|>": 153118,
504
+ "<|action_1447|>": 153119,
505
+ "<|action_1448|>": 153120,
506
+ "<|action_1449|>": 153121,
507
+ "<|action_144|>": 151816,
508
+ "<|action_1450|>": 153122,
509
+ "<|action_1451|>": 153123,
510
+ "<|action_1452|>": 153124,
511
+ "<|action_1453|>": 153125,
512
+ "<|action_1454|>": 153126,
513
+ "<|action_1455|>": 153127,
514
+ "<|action_1456|>": 153128,
515
+ "<|action_1457|>": 153129,
516
+ "<|action_1458|>": 153130,
517
+ "<|action_1459|>": 153131,
518
+ "<|action_145|>": 151817,
519
+ "<|action_1460|>": 153132,
520
+ "<|action_1461|>": 153133,
521
+ "<|action_1462|>": 153134,
522
+ "<|action_1463|>": 153135,
523
+ "<|action_1464|>": 153136,
524
+ "<|action_1465|>": 153137,
525
+ "<|action_1466|>": 153138,
526
+ "<|action_1467|>": 153139,
527
+ "<|action_1468|>": 153140,
528
+ "<|action_1469|>": 153141,
529
+ "<|action_146|>": 151818,
530
+ "<|action_1470|>": 153142,
531
+ "<|action_1471|>": 153143,
532
+ "<|action_1472|>": 153144,
533
+ "<|action_1473|>": 153145,
534
+ "<|action_1474|>": 153146,
535
+ "<|action_1475|>": 153147,
536
+ "<|action_1476|>": 153148,
537
+ "<|action_1477|>": 153149,
538
+ "<|action_1478|>": 153150,
539
+ "<|action_1479|>": 153151,
540
+ "<|action_147|>": 151819,
541
+ "<|action_1480|>": 153152,
542
+ "<|action_1481|>": 153153,
543
+ "<|action_1482|>": 153154,
544
+ "<|action_1483|>": 153155,
545
+ "<|action_1484|>": 153156,
546
+ "<|action_1485|>": 153157,
547
+ "<|action_1486|>": 153158,
548
+ "<|action_1487|>": 153159,
549
+ "<|action_1488|>": 153160,
550
+ "<|action_1489|>": 153161,
551
+ "<|action_148|>": 151820,
552
+ "<|action_1490|>": 153162,
553
+ "<|action_1491|>": 153163,
554
+ "<|action_1492|>": 153164,
555
+ "<|action_1493|>": 153165,
556
+ "<|action_1494|>": 153166,
557
+ "<|action_1495|>": 153167,
558
+ "<|action_1496|>": 153168,
559
+ "<|action_1497|>": 153169,
560
+ "<|action_1498|>": 153170,
561
+ "<|action_1499|>": 153171,
562
+ "<|action_149|>": 151821,
563
+ "<|action_14|>": 151686,
564
+ "<|action_1500|>": 153172,
565
+ "<|action_1501|>": 153173,
566
+ "<|action_1502|>": 153174,
567
+ "<|action_1503|>": 153175,
568
+ "<|action_1504|>": 153176,
569
+ "<|action_1505|>": 153177,
570
+ "<|action_1506|>": 153178,
571
+ "<|action_1507|>": 153179,
572
+ "<|action_1508|>": 153180,
573
+ "<|action_1509|>": 153181,
574
+ "<|action_150|>": 151822,
575
+ "<|action_1510|>": 153182,
576
+ "<|action_1511|>": 153183,
577
+ "<|action_1512|>": 153184,
578
+ "<|action_1513|>": 153185,
579
+ "<|action_1514|>": 153186,
580
+ "<|action_1515|>": 153187,
581
+ "<|action_1516|>": 153188,
582
+ "<|action_1517|>": 153189,
583
+ "<|action_1518|>": 153190,
584
+ "<|action_1519|>": 153191,
585
+ "<|action_151|>": 151823,
586
+ "<|action_1520|>": 153192,
587
+ "<|action_1521|>": 153193,
588
+ "<|action_1522|>": 153194,
589
+ "<|action_1523|>": 153195,
590
+ "<|action_1524|>": 153196,
591
+ "<|action_1525|>": 153197,
592
+ "<|action_1526|>": 153198,
593
+ "<|action_1527|>": 153199,
594
+ "<|action_1528|>": 153200,
595
+ "<|action_1529|>": 153201,
596
+ "<|action_152|>": 151824,
597
+ "<|action_1530|>": 153202,
598
+ "<|action_1531|>": 153203,
599
+ "<|action_1532|>": 153204,
600
+ "<|action_1533|>": 153205,
601
+ "<|action_1534|>": 153206,
602
+ "<|action_1535|>": 153207,
603
+ "<|action_1536|>": 153208,
604
+ "<|action_1537|>": 153209,
605
+ "<|action_1538|>": 153210,
606
+ "<|action_1539|>": 153211,
607
+ "<|action_153|>": 151825,
608
+ "<|action_1540|>": 153212,
609
+ "<|action_1541|>": 153213,
610
+ "<|action_1542|>": 153214,
611
+ "<|action_1543|>": 153215,
612
+ "<|action_1544|>": 153216,
613
+ "<|action_1545|>": 153217,
614
+ "<|action_1546|>": 153218,
615
+ "<|action_1547|>": 153219,
616
+ "<|action_1548|>": 153220,
617
+ "<|action_1549|>": 153221,
618
+ "<|action_154|>": 151826,
619
+ "<|action_1550|>": 153222,
620
+ "<|action_1551|>": 153223,
621
+ "<|action_1552|>": 153224,
622
+ "<|action_1553|>": 153225,
623
+ "<|action_1554|>": 153226,
624
+ "<|action_1555|>": 153227,
625
+ "<|action_1556|>": 153228,
626
+ "<|action_1557|>": 153229,
627
+ "<|action_1558|>": 153230,
628
+ "<|action_1559|>": 153231,
629
+ "<|action_155|>": 151827,
630
+ "<|action_1560|>": 153232,
631
+ "<|action_1561|>": 153233,
632
+ "<|action_1562|>": 153234,
633
+ "<|action_1563|>": 153235,
634
+ "<|action_1564|>": 153236,
635
+ "<|action_1565|>": 153237,
636
+ "<|action_1566|>": 153238,
637
+ "<|action_1567|>": 153239,
638
+ "<|action_1568|>": 153240,
639
+ "<|action_1569|>": 153241,
640
+ "<|action_156|>": 151828,
641
+ "<|action_1570|>": 153242,
642
+ "<|action_1571|>": 153243,
643
+ "<|action_1572|>": 153244,
644
+ "<|action_1573|>": 153245,
645
+ "<|action_1574|>": 153246,
646
+ "<|action_1575|>": 153247,
647
+ "<|action_1576|>": 153248,
648
+ "<|action_1577|>": 153249,
649
+ "<|action_1578|>": 153250,
650
+ "<|action_1579|>": 153251,
651
+ "<|action_157|>": 151829,
652
+ "<|action_1580|>": 153252,
653
+ "<|action_1581|>": 153253,
654
+ "<|action_1582|>": 153254,
655
+ "<|action_1583|>": 153255,
656
+ "<|action_1584|>": 153256,
657
+ "<|action_1585|>": 153257,
658
+ "<|action_1586|>": 153258,
659
+ "<|action_1587|>": 153259,
660
+ "<|action_1588|>": 153260,
661
+ "<|action_1589|>": 153261,
662
+ "<|action_158|>": 151830,
663
+ "<|action_1590|>": 153262,
664
+ "<|action_1591|>": 153263,
665
+ "<|action_1592|>": 153264,
666
+ "<|action_1593|>": 153265,
667
+ "<|action_1594|>": 153266,
668
+ "<|action_1595|>": 153267,
669
+ "<|action_1596|>": 153268,
670
+ "<|action_1597|>": 153269,
671
+ "<|action_1598|>": 153270,
672
+ "<|action_1599|>": 153271,
673
+ "<|action_159|>": 151831,
674
+ "<|action_15|>": 151687,
675
+ "<|action_1600|>": 153272,
676
+ "<|action_1601|>": 153273,
677
+ "<|action_1602|>": 153274,
678
+ "<|action_1603|>": 153275,
679
+ "<|action_1604|>": 153276,
680
+ "<|action_1605|>": 153277,
681
+ "<|action_1606|>": 153278,
682
+ "<|action_1607|>": 153279,
683
+ "<|action_1608|>": 153280,
684
+ "<|action_1609|>": 153281,
685
+ "<|action_160|>": 151832,
686
+ "<|action_1610|>": 153282,
687
+ "<|action_1611|>": 153283,
688
+ "<|action_1612|>": 153284,
689
+ "<|action_1613|>": 153285,
690
+ "<|action_1614|>": 153286,
691
+ "<|action_1615|>": 153287,
692
+ "<|action_1616|>": 153288,
693
+ "<|action_1617|>": 153289,
694
+ "<|action_1618|>": 153290,
695
+ "<|action_1619|>": 153291,
696
+ "<|action_161|>": 151833,
697
+ "<|action_1620|>": 153292,
698
+ "<|action_1621|>": 153293,
699
+ "<|action_1622|>": 153294,
700
+ "<|action_1623|>": 153295,
701
+ "<|action_1624|>": 153296,
702
+ "<|action_1625|>": 153297,
703
+ "<|action_1626|>": 153298,
704
+ "<|action_1627|>": 153299,
705
+ "<|action_1628|>": 153300,
706
+ "<|action_1629|>": 153301,
707
+ "<|action_162|>": 151834,
708
+ "<|action_1630|>": 153302,
709
+ "<|action_1631|>": 153303,
710
+ "<|action_1632|>": 153304,
711
+ "<|action_1633|>": 153305,
712
+ "<|action_1634|>": 153306,
713
+ "<|action_1635|>": 153307,
714
+ "<|action_1636|>": 153308,
715
+ "<|action_1637|>": 153309,
716
+ "<|action_1638|>": 153310,
717
+ "<|action_1639|>": 153311,
718
+ "<|action_163|>": 151835,
719
+ "<|action_1640|>": 153312,
720
+ "<|action_1641|>": 153313,
721
+ "<|action_1642|>": 153314,
722
+ "<|action_1643|>": 153315,
723
+ "<|action_1644|>": 153316,
724
+ "<|action_1645|>": 153317,
725
+ "<|action_1646|>": 153318,
726
+ "<|action_1647|>": 153319,
727
+ "<|action_1648|>": 153320,
728
+ "<|action_1649|>": 153321,
729
+ "<|action_164|>": 151836,
730
+ "<|action_1650|>": 153322,
731
+ "<|action_1651|>": 153323,
732
+ "<|action_1652|>": 153324,
733
+ "<|action_1653|>": 153325,
734
+ "<|action_1654|>": 153326,
735
+ "<|action_1655|>": 153327,
736
+ "<|action_1656|>": 153328,
737
+ "<|action_1657|>": 153329,
738
+ "<|action_1658|>": 153330,
739
+ "<|action_1659|>": 153331,
740
+ "<|action_165|>": 151837,
741
+ "<|action_1660|>": 153332,
742
+ "<|action_1661|>": 153333,
743
+ "<|action_1662|>": 153334,
744
+ "<|action_1663|>": 153335,
745
+ "<|action_1664|>": 153336,
746
+ "<|action_1665|>": 153337,
747
+ "<|action_1666|>": 153338,
748
+ "<|action_1667|>": 153339,
749
+ "<|action_1668|>": 153340,
750
+ "<|action_1669|>": 153341,
751
+ "<|action_166|>": 151838,
752
+ "<|action_1670|>": 153342,
753
+ "<|action_1671|>": 153343,
754
+ "<|action_1672|>": 153344,
755
+ "<|action_1673|>": 153345,
756
+ "<|action_1674|>": 153346,
757
+ "<|action_1675|>": 153347,
758
+ "<|action_1676|>": 153348,
759
+ "<|action_1677|>": 153349,
760
+ "<|action_1678|>": 153350,
761
+ "<|action_1679|>": 153351,
762
+ "<|action_167|>": 151839,
763
+ "<|action_1680|>": 153352,
764
+ "<|action_1681|>": 153353,
765
+ "<|action_1682|>": 153354,
766
+ "<|action_1683|>": 153355,
767
+ "<|action_1684|>": 153356,
768
+ "<|action_1685|>": 153357,
769
+ "<|action_1686|>": 153358,
770
+ "<|action_1687|>": 153359,
771
+ "<|action_1688|>": 153360,
772
+ "<|action_1689|>": 153361,
773
+ "<|action_168|>": 151840,
774
+ "<|action_1690|>": 153362,
775
+ "<|action_1691|>": 153363,
776
+ "<|action_1692|>": 153364,
777
+ "<|action_1693|>": 153365,
778
+ "<|action_1694|>": 153366,
779
+ "<|action_1695|>": 153367,
780
+ "<|action_1696|>": 153368,
781
+ "<|action_1697|>": 153369,
782
+ "<|action_1698|>": 153370,
783
+ "<|action_1699|>": 153371,
784
+ "<|action_169|>": 151841,
785
+ "<|action_16|>": 151688,
786
+ "<|action_1700|>": 153372,
787
+ "<|action_1701|>": 153373,
788
+ "<|action_1702|>": 153374,
789
+ "<|action_1703|>": 153375,
790
+ "<|action_1704|>": 153376,
791
+ "<|action_1705|>": 153377,
792
+ "<|action_1706|>": 153378,
793
+ "<|action_1707|>": 153379,
794
+ "<|action_1708|>": 153380,
795
+ "<|action_1709|>": 153381,
796
+ "<|action_170|>": 151842,
797
+ "<|action_1710|>": 153382,
798
+ "<|action_1711|>": 153383,
799
+ "<|action_1712|>": 153384,
800
+ "<|action_1713|>": 153385,
801
+ "<|action_1714|>": 153386,
802
+ "<|action_1715|>": 153387,
803
+ "<|action_1716|>": 153388,
804
+ "<|action_1717|>": 153389,
805
+ "<|action_1718|>": 153390,
806
+ "<|action_1719|>": 153391,
807
+ "<|action_171|>": 151843,
808
+ "<|action_1720|>": 153392,
809
+ "<|action_1721|>": 153393,
810
+ "<|action_1722|>": 153394,
811
+ "<|action_1723|>": 153395,
812
+ "<|action_1724|>": 153396,
813
+ "<|action_1725|>": 153397,
814
+ "<|action_1726|>": 153398,
815
+ "<|action_1727|>": 153399,
816
+ "<|action_1728|>": 153400,
817
+ "<|action_1729|>": 153401,
818
+ "<|action_172|>": 151844,
819
+ "<|action_1730|>": 153402,
820
+ "<|action_1731|>": 153403,
821
+ "<|action_1732|>": 153404,
822
+ "<|action_1733|>": 153405,
823
+ "<|action_1734|>": 153406,
824
+ "<|action_1735|>": 153407,
825
+ "<|action_1736|>": 153408,
826
+ "<|action_1737|>": 153409,
827
+ "<|action_1738|>": 153410,
828
+ "<|action_1739|>": 153411,
829
+ "<|action_173|>": 151845,
830
+ "<|action_1740|>": 153412,
831
+ "<|action_1741|>": 153413,
832
+ "<|action_1742|>": 153414,
833
+ "<|action_1743|>": 153415,
834
+ "<|action_1744|>": 153416,
835
+ "<|action_1745|>": 153417,
836
+ "<|action_1746|>": 153418,
837
+ "<|action_1747|>": 153419,
838
+ "<|action_1748|>": 153420,
839
+ "<|action_1749|>": 153421,
840
+ "<|action_174|>": 151846,
841
+ "<|action_1750|>": 153422,
842
+ "<|action_1751|>": 153423,
843
+ "<|action_1752|>": 153424,
844
+ "<|action_1753|>": 153425,
845
+ "<|action_1754|>": 153426,
846
+ "<|action_1755|>": 153427,
847
+ "<|action_1756|>": 153428,
848
+ "<|action_1757|>": 153429,
849
+ "<|action_1758|>": 153430,
850
+ "<|action_1759|>": 153431,
851
+ "<|action_175|>": 151847,
852
+ "<|action_1760|>": 153432,
853
+ "<|action_1761|>": 153433,
854
+ "<|action_1762|>": 153434,
855
+ "<|action_1763|>": 153435,
856
+ "<|action_1764|>": 153436,
857
+ "<|action_1765|>": 153437,
858
+ "<|action_1766|>": 153438,
859
+ "<|action_1767|>": 153439,
860
+ "<|action_1768|>": 153440,
861
+ "<|action_1769|>": 153441,
862
+ "<|action_176|>": 151848,
863
+ "<|action_1770|>": 153442,
864
+ "<|action_1771|>": 153443,
865
+ "<|action_1772|>": 153444,
866
+ "<|action_1773|>": 153445,
867
+ "<|action_1774|>": 153446,
868
+ "<|action_1775|>": 153447,
869
+ "<|action_1776|>": 153448,
870
+ "<|action_1777|>": 153449,
871
+ "<|action_1778|>": 153450,
872
+ "<|action_1779|>": 153451,
873
+ "<|action_177|>": 151849,
874
+ "<|action_1780|>": 153452,
875
+ "<|action_1781|>": 153453,
876
+ "<|action_1782|>": 153454,
877
+ "<|action_1783|>": 153455,
878
+ "<|action_1784|>": 153456,
879
+ "<|action_1785|>": 153457,
880
+ "<|action_1786|>": 153458,
881
+ "<|action_1787|>": 153459,
882
+ "<|action_1788|>": 153460,
883
+ "<|action_1789|>": 153461,
884
+ "<|action_178|>": 151850,
885
+ "<|action_1790|>": 153462,
886
+ "<|action_1791|>": 153463,
887
+ "<|action_1792|>": 153464,
888
+ "<|action_1793|>": 153465,
889
+ "<|action_1794|>": 153466,
890
+ "<|action_1795|>": 153467,
891
+ "<|action_1796|>": 153468,
892
+ "<|action_1797|>": 153469,
893
+ "<|action_1798|>": 153470,
894
+ "<|action_1799|>": 153471,
895
+ "<|action_179|>": 151851,
896
+ "<|action_17|>": 151689,
897
+ "<|action_1800|>": 153472,
898
+ "<|action_1801|>": 153473,
899
+ "<|action_1802|>": 153474,
900
+ "<|action_1803|>": 153475,
901
+ "<|action_1804|>": 153476,
902
+ "<|action_1805|>": 153477,
903
+ "<|action_1806|>": 153478,
904
+ "<|action_1807|>": 153479,
905
+ "<|action_1808|>": 153480,
906
+ "<|action_1809|>": 153481,
907
+ "<|action_180|>": 151852,
908
+ "<|action_1810|>": 153482,
909
+ "<|action_1811|>": 153483,
910
+ "<|action_1812|>": 153484,
911
+ "<|action_1813|>": 153485,
912
+ "<|action_1814|>": 153486,
913
+ "<|action_1815|>": 153487,
914
+ "<|action_1816|>": 153488,
915
+ "<|action_1817|>": 153489,
916
+ "<|action_1818|>": 153490,
917
+ "<|action_1819|>": 153491,
918
+ "<|action_181|>": 151853,
919
+ "<|action_1820|>": 153492,
920
+ "<|action_1821|>": 153493,
921
+ "<|action_1822|>": 153494,
922
+ "<|action_1823|>": 153495,
923
+ "<|action_1824|>": 153496,
924
+ "<|action_1825|>": 153497,
925
+ "<|action_1826|>": 153498,
926
+ "<|action_1827|>": 153499,
927
+ "<|action_1828|>": 153500,
928
+ "<|action_1829|>": 153501,
929
+ "<|action_182|>": 151854,
930
+ "<|action_1830|>": 153502,
931
+ "<|action_1831|>": 153503,
932
+ "<|action_1832|>": 153504,
933
+ "<|action_1833|>": 153505,
934
+ "<|action_1834|>": 153506,
935
+ "<|action_1835|>": 153507,
936
+ "<|action_1836|>": 153508,
937
+ "<|action_1837|>": 153509,
938
+ "<|action_1838|>": 153510,
939
+ "<|action_1839|>": 153511,
940
+ "<|action_183|>": 151855,
941
+ "<|action_1840|>": 153512,
942
+ "<|action_1841|>": 153513,
943
+ "<|action_1842|>": 153514,
944
+ "<|action_1843|>": 153515,
945
+ "<|action_1844|>": 153516,
946
+ "<|action_1845|>": 153517,
947
+ "<|action_1846|>": 153518,
948
+ "<|action_1847|>": 153519,
949
+ "<|action_1848|>": 153520,
950
+ "<|action_1849|>": 153521,
951
+ "<|action_184|>": 151856,
952
+ "<|action_1850|>": 153522,
953
+ "<|action_1851|>": 153523,
954
+ "<|action_1852|>": 153524,
955
+ "<|action_1853|>": 153525,
956
+ "<|action_1854|>": 153526,
957
+ "<|action_1855|>": 153527,
958
+ "<|action_1856|>": 153528,
959
+ "<|action_1857|>": 153529,
960
+ "<|action_1858|>": 153530,
961
+ "<|action_1859|>": 153531,
962
+ "<|action_185|>": 151857,
963
+ "<|action_1860|>": 153532,
964
+ "<|action_1861|>": 153533,
965
+ "<|action_1862|>": 153534,
966
+ "<|action_1863|>": 153535,
967
+ "<|action_1864|>": 153536,
968
+ "<|action_1865|>": 153537,
969
+ "<|action_1866|>": 153538,
970
+ "<|action_1867|>": 153539,
971
+ "<|action_1868|>": 153540,
972
+ "<|action_1869|>": 153541,
973
+ "<|action_186|>": 151858,
974
+ "<|action_1870|>": 153542,
975
+ "<|action_1871|>": 153543,
976
+ "<|action_1872|>": 153544,
977
+ "<|action_1873|>": 153545,
978
+ "<|action_1874|>": 153546,
979
+ "<|action_1875|>": 153547,
980
+ "<|action_1876|>": 153548,
981
+ "<|action_1877|>": 153549,
982
+ "<|action_1878|>": 153550,
983
+ "<|action_1879|>": 153551,
984
+ "<|action_187|>": 151859,
985
+ "<|action_1880|>": 153552,
986
+ "<|action_1881|>": 153553,
987
+ "<|action_1882|>": 153554,
988
+ "<|action_1883|>": 153555,
989
+ "<|action_1884|>": 153556,
990
+ "<|action_1885|>": 153557,
991
+ "<|action_1886|>": 153558,
992
+ "<|action_1887|>": 153559,
993
+ "<|action_1888|>": 153560,
994
+ "<|action_1889|>": 153561,
995
+ "<|action_188|>": 151860,
996
+ "<|action_1890|>": 153562,
997
+ "<|action_1891|>": 153563,
998
+ "<|action_1892|>": 153564,
999
+ "<|action_1893|>": 153565,
1000
+ "<|action_1894|>": 153566,
1001
+ "<|action_1895|>": 153567,
1002
+ "<|action_1896|>": 153568,
1003
+ "<|action_1897|>": 153569,
1004
+ "<|action_1898|>": 153570,
1005
+ "<|action_1899|>": 153571,
1006
+ "<|action_189|>": 151861,
1007
+ "<|action_18|>": 151690,
1008
+ "<|action_1900|>": 153572,
1009
+ "<|action_1901|>": 153573,
1010
+ "<|action_1902|>": 153574,
1011
+ "<|action_1903|>": 153575,
1012
+ "<|action_1904|>": 153576,
1013
+ "<|action_1905|>": 153577,
1014
+ "<|action_1906|>": 153578,
1015
+ "<|action_1907|>": 153579,
1016
+ "<|action_1908|>": 153580,
1017
+ "<|action_1909|>": 153581,
1018
+ "<|action_190|>": 151862,
1019
+ "<|action_1910|>": 153582,
1020
+ "<|action_1911|>": 153583,
1021
+ "<|action_1912|>": 153584,
1022
+ "<|action_1913|>": 153585,
1023
+ "<|action_1914|>": 153586,
1024
+ "<|action_1915|>": 153587,
1025
+ "<|action_1916|>": 153588,
1026
+ "<|action_1917|>": 153589,
1027
+ "<|action_1918|>": 153590,
1028
+ "<|action_1919|>": 153591,
1029
+ "<|action_191|>": 151863,
1030
+ "<|action_1920|>": 153592,
1031
+ "<|action_1921|>": 153593,
1032
+ "<|action_1922|>": 153594,
1033
+ "<|action_1923|>": 153595,
1034
+ "<|action_1924|>": 153596,
1035
+ "<|action_1925|>": 153597,
1036
+ "<|action_1926|>": 153598,
1037
+ "<|action_1927|>": 153599,
1038
+ "<|action_1928|>": 153600,
1039
+ "<|action_1929|>": 153601,
1040
+ "<|action_192|>": 151864,
1041
+ "<|action_1930|>": 153602,
1042
+ "<|action_1931|>": 153603,
1043
+ "<|action_1932|>": 153604,
1044
+ "<|action_1933|>": 153605,
1045
+ "<|action_1934|>": 153606,
1046
+ "<|action_1935|>": 153607,
1047
+ "<|action_1936|>": 153608,
1048
+ "<|action_1937|>": 153609,
1049
+ "<|action_1938|>": 153610,
1050
+ "<|action_1939|>": 153611,
1051
+ "<|action_193|>": 151865,
1052
+ "<|action_1940|>": 153612,
1053
+ "<|action_1941|>": 153613,
1054
+ "<|action_1942|>": 153614,
1055
+ "<|action_1943|>": 153615,
1056
+ "<|action_1944|>": 153616,
1057
+ "<|action_1945|>": 153617,
1058
+ "<|action_1946|>": 153618,
1059
+ "<|action_1947|>": 153619,
1060
+ "<|action_1948|>": 153620,
1061
+ "<|action_1949|>": 153621,
1062
+ "<|action_194|>": 151866,
1063
+ "<|action_1950|>": 153622,
1064
+ "<|action_1951|>": 153623,
1065
+ "<|action_1952|>": 153624,
1066
+ "<|action_1953|>": 153625,
1067
+ "<|action_1954|>": 153626,
1068
+ "<|action_1955|>": 153627,
1069
+ "<|action_1956|>": 153628,
1070
+ "<|action_1957|>": 153629,
1071
+ "<|action_1958|>": 153630,
1072
+ "<|action_1959|>": 153631,
1073
+ "<|action_195|>": 151867,
1074
+ "<|action_1960|>": 153632,
1075
+ "<|action_1961|>": 153633,
1076
+ "<|action_1962|>": 153634,
1077
+ "<|action_1963|>": 153635,
1078
+ "<|action_1964|>": 153636,
1079
+ "<|action_1965|>": 153637,
1080
+ "<|action_1966|>": 153638,
1081
+ "<|action_1967|>": 153639,
1082
+ "<|action_1968|>": 153640,
1083
+ "<|action_1969|>": 153641,
1084
+ "<|action_196|>": 151868,
1085
+ "<|action_1970|>": 153642,
1086
+ "<|action_1971|>": 153643,
1087
+ "<|action_1972|>": 153644,
1088
+ "<|action_1973|>": 153645,
1089
+ "<|action_1974|>": 153646,
1090
+ "<|action_1975|>": 153647,
1091
+ "<|action_1976|>": 153648,
1092
+ "<|action_1977|>": 153649,
1093
+ "<|action_1978|>": 153650,
1094
+ "<|action_1979|>": 153651,
1095
+ "<|action_197|>": 151869,
1096
+ "<|action_1980|>": 153652,
1097
+ "<|action_1981|>": 153653,
1098
+ "<|action_1982|>": 153654,
1099
+ "<|action_1983|>": 153655,
1100
+ "<|action_1984|>": 153656,
1101
+ "<|action_1985|>": 153657,
1102
+ "<|action_1986|>": 153658,
1103
+ "<|action_1987|>": 153659,
1104
+ "<|action_1988|>": 153660,
1105
+ "<|action_1989|>": 153661,
1106
+ "<|action_198|>": 151870,
1107
+ "<|action_1990|>": 153662,
1108
+ "<|action_1991|>": 153663,
1109
+ "<|action_1992|>": 153664,
1110
+ "<|action_1993|>": 153665,
1111
+ "<|action_1994|>": 153666,
1112
+ "<|action_1995|>": 153667,
1113
+ "<|action_1996|>": 153668,
1114
+ "<|action_1997|>": 153669,
1115
+ "<|action_1998|>": 153670,
1116
+ "<|action_1999|>": 153671,
1117
+ "<|action_199|>": 151871,
1118
+ "<|action_19|>": 151691,
1119
+ "<|action_1|>": 151673,
1120
+ "<|action_2000|>": 153672,
1121
+ "<|action_2001|>": 153673,
1122
+ "<|action_2002|>": 153674,
1123
+ "<|action_2003|>": 153675,
1124
+ "<|action_2004|>": 153676,
1125
+ "<|action_2005|>": 153677,
1126
+ "<|action_2006|>": 153678,
1127
+ "<|action_2007|>": 153679,
1128
+ "<|action_2008|>": 153680,
1129
+ "<|action_2009|>": 153681,
1130
+ "<|action_200|>": 151872,
1131
+ "<|action_2010|>": 153682,
1132
+ "<|action_2011|>": 153683,
1133
+ "<|action_2012|>": 153684,
1134
+ "<|action_2013|>": 153685,
1135
+ "<|action_2014|>": 153686,
1136
+ "<|action_2015|>": 153687,
1137
+ "<|action_2016|>": 153688,
1138
+ "<|action_2017|>": 153689,
1139
+ "<|action_2018|>": 153690,
1140
+ "<|action_2019|>": 153691,
1141
+ "<|action_201|>": 151873,
1142
+ "<|action_2020|>": 153692,
1143
+ "<|action_2021|>": 153693,
1144
+ "<|action_2022|>": 153694,
1145
+ "<|action_2023|>": 153695,
1146
+ "<|action_2024|>": 153696,
1147
+ "<|action_2025|>": 153697,
1148
+ "<|action_2026|>": 153698,
1149
+ "<|action_2027|>": 153699,
1150
+ "<|action_2028|>": 153700,
1151
+ "<|action_2029|>": 153701,
1152
+ "<|action_202|>": 151874,
1153
+ "<|action_2030|>": 153702,
1154
+ "<|action_2031|>": 153703,
1155
+ "<|action_2032|>": 153704,
1156
+ "<|action_2033|>": 153705,
1157
+ "<|action_2034|>": 153706,
1158
+ "<|action_2035|>": 153707,
1159
+ "<|action_2036|>": 153708,
1160
+ "<|action_2037|>": 153709,
1161
+ "<|action_2038|>": 153710,
1162
+ "<|action_2039|>": 153711,
1163
+ "<|action_203|>": 151875,
1164
+ "<|action_2040|>": 153712,
1165
+ "<|action_2041|>": 153713,
1166
+ "<|action_2042|>": 153714,
1167
+ "<|action_2043|>": 153715,
1168
+ "<|action_2044|>": 153716,
1169
+ "<|action_2045|>": 153717,
1170
+ "<|action_2046|>": 153718,
1171
+ "<|action_2047|>": 153719,
1172
+ "<|action_204|>": 151876,
1173
+ "<|action_205|>": 151877,
1174
+ "<|action_206|>": 151878,
1175
+ "<|action_207|>": 151879,
1176
+ "<|action_208|>": 151880,
1177
+ "<|action_209|>": 151881,
1178
+ "<|action_20|>": 151692,
1179
+ "<|action_210|>": 151882,
1180
+ "<|action_211|>": 151883,
1181
+ "<|action_212|>": 151884,
1182
+ "<|action_213|>": 151885,
1183
+ "<|action_214|>": 151886,
1184
+ "<|action_215|>": 151887,
1185
+ "<|action_216|>": 151888,
1186
+ "<|action_217|>": 151889,
1187
+ "<|action_218|>": 151890,
1188
+ "<|action_219|>": 151891,
1189
+ "<|action_21|>": 151693,
1190
+ "<|action_220|>": 151892,
1191
+ "<|action_221|>": 151893,
1192
+ "<|action_222|>": 151894,
1193
+ "<|action_223|>": 151895,
1194
+ "<|action_224|>": 151896,
1195
+ "<|action_225|>": 151897,
1196
+ "<|action_226|>": 151898,
1197
+ "<|action_227|>": 151899,
1198
+ "<|action_228|>": 151900,
1199
+ "<|action_229|>": 151901,
1200
+ "<|action_22|>": 151694,
1201
+ "<|action_230|>": 151902,
1202
+ "<|action_231|>": 151903,
1203
+ "<|action_232|>": 151904,
1204
+ "<|action_233|>": 151905,
1205
+ "<|action_234|>": 151906,
1206
+ "<|action_235|>": 151907,
1207
+ "<|action_236|>": 151908,
1208
+ "<|action_237|>": 151909,
1209
+ "<|action_238|>": 151910,
1210
+ "<|action_239|>": 151911,
1211
+ "<|action_23|>": 151695,
1212
+ "<|action_240|>": 151912,
1213
+ "<|action_241|>": 151913,
1214
+ "<|action_242|>": 151914,
1215
+ "<|action_243|>": 151915,
1216
+ "<|action_244|>": 151916,
1217
+ "<|action_245|>": 151917,
1218
+ "<|action_246|>": 151918,
1219
+ "<|action_247|>": 151919,
1220
+ "<|action_248|>": 151920,
1221
+ "<|action_249|>": 151921,
1222
+ "<|action_24|>": 151696,
1223
+ "<|action_250|>": 151922,
1224
+ "<|action_251|>": 151923,
1225
+ "<|action_252|>": 151924,
1226
+ "<|action_253|>": 151925,
1227
+ "<|action_254|>": 151926,
1228
+ "<|action_255|>": 151927,
1229
+ "<|action_256|>": 151928,
1230
+ "<|action_257|>": 151929,
1231
+ "<|action_258|>": 151930,
1232
+ "<|action_259|>": 151931,
1233
+ "<|action_25|>": 151697,
1234
+ "<|action_260|>": 151932,
1235
+ "<|action_261|>": 151933,
1236
+ "<|action_262|>": 151934,
1237
+ "<|action_263|>": 151935,
1238
+ "<|action_264|>": 151936,
1239
+ "<|action_265|>": 151937,
1240
+ "<|action_266|>": 151938,
1241
+ "<|action_267|>": 151939,
1242
+ "<|action_268|>": 151940,
1243
+ "<|action_269|>": 151941,
1244
+ "<|action_26|>": 151698,
1245
+ "<|action_270|>": 151942,
1246
+ "<|action_271|>": 151943,
1247
+ "<|action_272|>": 151944,
1248
+ "<|action_273|>": 151945,
1249
+ "<|action_274|>": 151946,
1250
+ "<|action_275|>": 151947,
1251
+ "<|action_276|>": 151948,
1252
+ "<|action_277|>": 151949,
1253
+ "<|action_278|>": 151950,
1254
+ "<|action_279|>": 151951,
1255
+ "<|action_27|>": 151699,
1256
+ "<|action_280|>": 151952,
1257
+ "<|action_281|>": 151953,
1258
+ "<|action_282|>": 151954,
1259
+ "<|action_283|>": 151955,
1260
+ "<|action_284|>": 151956,
1261
+ "<|action_285|>": 151957,
1262
+ "<|action_286|>": 151958,
1263
+ "<|action_287|>": 151959,
1264
+ "<|action_288|>": 151960,
1265
+ "<|action_289|>": 151961,
1266
+ "<|action_28|>": 151700,
1267
+ "<|action_290|>": 151962,
1268
+ "<|action_291|>": 151963,
1269
+ "<|action_292|>": 151964,
1270
+ "<|action_293|>": 151965,
1271
+ "<|action_294|>": 151966,
1272
+ "<|action_295|>": 151967,
1273
+ "<|action_296|>": 151968,
1274
+ "<|action_297|>": 151969,
1275
+ "<|action_298|>": 151970,
1276
+ "<|action_299|>": 151971,
1277
+ "<|action_29|>": 151701,
1278
+ "<|action_2|>": 151674,
1279
+ "<|action_300|>": 151972,
1280
+ "<|action_301|>": 151973,
1281
+ "<|action_302|>": 151974,
1282
+ "<|action_303|>": 151975,
1283
+ "<|action_304|>": 151976,
1284
+ "<|action_305|>": 151977,
1285
+ "<|action_306|>": 151978,
1286
+ "<|action_307|>": 151979,
1287
+ "<|action_308|>": 151980,
1288
+ "<|action_309|>": 151981,
1289
+ "<|action_30|>": 151702,
1290
+ "<|action_310|>": 151982,
1291
+ "<|action_311|>": 151983,
1292
+ "<|action_312|>": 151984,
1293
+ "<|action_313|>": 151985,
1294
+ "<|action_314|>": 151986,
1295
+ "<|action_315|>": 151987,
1296
+ "<|action_316|>": 151988,
1297
+ "<|action_317|>": 151989,
1298
+ "<|action_318|>": 151990,
1299
+ "<|action_319|>": 151991,
1300
+ "<|action_31|>": 151703,
1301
+ "<|action_320|>": 151992,
1302
+ "<|action_321|>": 151993,
1303
+ "<|action_322|>": 151994,
1304
+ "<|action_323|>": 151995,
1305
+ "<|action_324|>": 151996,
1306
+ "<|action_325|>": 151997,
1307
+ "<|action_326|>": 151998,
1308
+ "<|action_327|>": 151999,
1309
+ "<|action_328|>": 152000,
1310
+ "<|action_329|>": 152001,
1311
+ "<|action_32|>": 151704,
1312
+ "<|action_330|>": 152002,
1313
+ "<|action_331|>": 152003,
1314
+ "<|action_332|>": 152004,
1315
+ "<|action_333|>": 152005,
1316
+ "<|action_334|>": 152006,
1317
+ "<|action_335|>": 152007,
1318
+ "<|action_336|>": 152008,
1319
+ "<|action_337|>": 152009,
1320
+ "<|action_338|>": 152010,
1321
+ "<|action_339|>": 152011,
1322
+ "<|action_33|>": 151705,
1323
+ "<|action_340|>": 152012,
1324
+ "<|action_341|>": 152013,
1325
+ "<|action_342|>": 152014,
1326
+ "<|action_343|>": 152015,
1327
+ "<|action_344|>": 152016,
1328
+ "<|action_345|>": 152017,
1329
+ "<|action_346|>": 152018,
1330
+ "<|action_347|>": 152019,
1331
+ "<|action_348|>": 152020,
1332
+ "<|action_349|>": 152021,
1333
+ "<|action_34|>": 151706,
1334
+ "<|action_350|>": 152022,
1335
+ "<|action_351|>": 152023,
1336
+ "<|action_352|>": 152024,
1337
+ "<|action_353|>": 152025,
1338
+ "<|action_354|>": 152026,
1339
+ "<|action_355|>": 152027,
1340
+ "<|action_356|>": 152028,
1341
+ "<|action_357|>": 152029,
1342
+ "<|action_358|>": 152030,
1343
+ "<|action_359|>": 152031,
1344
+ "<|action_35|>": 151707,
1345
+ "<|action_360|>": 152032,
1346
+ "<|action_361|>": 152033,
1347
+ "<|action_362|>": 152034,
1348
+ "<|action_363|>": 152035,
1349
+ "<|action_364|>": 152036,
1350
+ "<|action_365|>": 152037,
1351
+ "<|action_366|>": 152038,
1352
+ "<|action_367|>": 152039,
1353
+ "<|action_368|>": 152040,
1354
+ "<|action_369|>": 152041,
1355
+ "<|action_36|>": 151708,
1356
+ "<|action_370|>": 152042,
1357
+ "<|action_371|>": 152043,
1358
+ "<|action_372|>": 152044,
1359
+ "<|action_373|>": 152045,
1360
+ "<|action_374|>": 152046,
1361
+ "<|action_375|>": 152047,
1362
+ "<|action_376|>": 152048,
1363
+ "<|action_377|>": 152049,
1364
+ "<|action_378|>": 152050,
1365
+ "<|action_379|>": 152051,
1366
+ "<|action_37|>": 151709,
1367
+ "<|action_380|>": 152052,
1368
+ "<|action_381|>": 152053,
1369
+ "<|action_382|>": 152054,
1370
+ "<|action_383|>": 152055,
1371
+ "<|action_384|>": 152056,
1372
+ "<|action_385|>": 152057,
1373
+ "<|action_386|>": 152058,
1374
+ "<|action_387|>": 152059,
1375
+ "<|action_388|>": 152060,
1376
+ "<|action_389|>": 152061,
1377
+ "<|action_38|>": 151710,
1378
+ "<|action_390|>": 152062,
1379
+ "<|action_391|>": 152063,
1380
+ "<|action_392|>": 152064,
1381
+ "<|action_393|>": 152065,
1382
+ "<|action_394|>": 152066,
1383
+ "<|action_395|>": 152067,
1384
+ "<|action_396|>": 152068,
1385
+ "<|action_397|>": 152069,
1386
+ "<|action_398|>": 152070,
1387
+ "<|action_399|>": 152071,
1388
+ "<|action_39|>": 151711,
1389
+ "<|action_3|>": 151675,
1390
+ "<|action_400|>": 152072,
1391
+ "<|action_401|>": 152073,
1392
+ "<|action_402|>": 152074,
1393
+ "<|action_403|>": 152075,
1394
+ "<|action_404|>": 152076,
1395
+ "<|action_405|>": 152077,
1396
+ "<|action_406|>": 152078,
1397
+ "<|action_407|>": 152079,
1398
+ "<|action_408|>": 152080,
1399
+ "<|action_409|>": 152081,
1400
+ "<|action_40|>": 151712,
1401
+ "<|action_410|>": 152082,
1402
+ "<|action_411|>": 152083,
1403
+ "<|action_412|>": 152084,
1404
+ "<|action_413|>": 152085,
1405
+ "<|action_414|>": 152086,
1406
+ "<|action_415|>": 152087,
1407
+ "<|action_416|>": 152088,
1408
+ "<|action_417|>": 152089,
1409
+ "<|action_418|>": 152090,
1410
+ "<|action_419|>": 152091,
1411
+ "<|action_41|>": 151713,
1412
+ "<|action_420|>": 152092,
1413
+ "<|action_421|>": 152093,
1414
+ "<|action_422|>": 152094,
1415
+ "<|action_423|>": 152095,
1416
+ "<|action_424|>": 152096,
1417
+ "<|action_425|>": 152097,
1418
+ "<|action_426|>": 152098,
1419
+ "<|action_427|>": 152099,
1420
+ "<|action_428|>": 152100,
1421
+ "<|action_429|>": 152101,
1422
+ "<|action_42|>": 151714,
1423
+ "<|action_430|>": 152102,
1424
+ "<|action_431|>": 152103,
1425
+ "<|action_432|>": 152104,
1426
+ "<|action_433|>": 152105,
1427
+ "<|action_434|>": 152106,
1428
+ "<|action_435|>": 152107,
1429
+ "<|action_436|>": 152108,
1430
+ "<|action_437|>": 152109,
1431
+ "<|action_438|>": 152110,
1432
+ "<|action_439|>": 152111,
1433
+ "<|action_43|>": 151715,
1434
+ "<|action_440|>": 152112,
1435
+ "<|action_441|>": 152113,
1436
+ "<|action_442|>": 152114,
1437
+ "<|action_443|>": 152115,
1438
+ "<|action_444|>": 152116,
1439
+ "<|action_445|>": 152117,
1440
+ "<|action_446|>": 152118,
1441
+ "<|action_447|>": 152119,
1442
+ "<|action_448|>": 152120,
1443
+ "<|action_449|>": 152121,
1444
+ "<|action_44|>": 151716,
1445
+ "<|action_450|>": 152122,
1446
+ "<|action_451|>": 152123,
1447
+ "<|action_452|>": 152124,
1448
+ "<|action_453|>": 152125,
1449
+ "<|action_454|>": 152126,
1450
+ "<|action_455|>": 152127,
1451
+ "<|action_456|>": 152128,
1452
+ "<|action_457|>": 152129,
1453
+ "<|action_458|>": 152130,
1454
+ "<|action_459|>": 152131,
1455
+ "<|action_45|>": 151717,
1456
+ "<|action_460|>": 152132,
1457
+ "<|action_461|>": 152133,
1458
+ "<|action_462|>": 152134,
1459
+ "<|action_463|>": 152135,
1460
+ "<|action_464|>": 152136,
1461
+ "<|action_465|>": 152137,
1462
+ "<|action_466|>": 152138,
1463
+ "<|action_467|>": 152139,
1464
+ "<|action_468|>": 152140,
1465
+ "<|action_469|>": 152141,
1466
+ "<|action_46|>": 151718,
1467
+ "<|action_470|>": 152142,
1468
+ "<|action_471|>": 152143,
1469
+ "<|action_472|>": 152144,
1470
+ "<|action_473|>": 152145,
1471
+ "<|action_474|>": 152146,
1472
+ "<|action_475|>": 152147,
1473
+ "<|action_476|>": 152148,
1474
+ "<|action_477|>": 152149,
1475
+ "<|action_478|>": 152150,
1476
+ "<|action_479|>": 152151,
1477
+ "<|action_47|>": 151719,
1478
+ "<|action_480|>": 152152,
1479
+ "<|action_481|>": 152153,
1480
+ "<|action_482|>": 152154,
1481
+ "<|action_483|>": 152155,
1482
+ "<|action_484|>": 152156,
1483
+ "<|action_485|>": 152157,
1484
+ "<|action_486|>": 152158,
1485
+ "<|action_487|>": 152159,
1486
+ "<|action_488|>": 152160,
1487
+ "<|action_489|>": 152161,
1488
+ "<|action_48|>": 151720,
1489
+ "<|action_490|>": 152162,
1490
+ "<|action_491|>": 152163,
1491
+ "<|action_492|>": 152164,
1492
+ "<|action_493|>": 152165,
1493
+ "<|action_494|>": 152166,
1494
+ "<|action_495|>": 152167,
1495
+ "<|action_496|>": 152168,
1496
+ "<|action_497|>": 152169,
1497
+ "<|action_498|>": 152170,
1498
+ "<|action_499|>": 152171,
1499
+ "<|action_49|>": 151721,
1500
+ "<|action_4|>": 151676,
1501
+ "<|action_500|>": 152172,
1502
+ "<|action_501|>": 152173,
1503
+ "<|action_502|>": 152174,
1504
+ "<|action_503|>": 152175,
1505
+ "<|action_504|>": 152176,
1506
+ "<|action_505|>": 152177,
1507
+ "<|action_506|>": 152178,
1508
+ "<|action_507|>": 152179,
1509
+ "<|action_508|>": 152180,
1510
+ "<|action_509|>": 152181,
1511
+ "<|action_50|>": 151722,
1512
+ "<|action_510|>": 152182,
1513
+ "<|action_511|>": 152183,
1514
+ "<|action_512|>": 152184,
1515
+ "<|action_513|>": 152185,
1516
+ "<|action_514|>": 152186,
1517
+ "<|action_515|>": 152187,
1518
+ "<|action_516|>": 152188,
1519
+ "<|action_517|>": 152189,
1520
+ "<|action_518|>": 152190,
1521
+ "<|action_519|>": 152191,
1522
+ "<|action_51|>": 151723,
1523
+ "<|action_520|>": 152192,
1524
+ "<|action_521|>": 152193,
1525
+ "<|action_522|>": 152194,
1526
+ "<|action_523|>": 152195,
1527
+ "<|action_524|>": 152196,
1528
+ "<|action_525|>": 152197,
1529
+ "<|action_526|>": 152198,
1530
+ "<|action_527|>": 152199,
1531
+ "<|action_528|>": 152200,
1532
+ "<|action_529|>": 152201,
1533
+ "<|action_52|>": 151724,
1534
+ "<|action_530|>": 152202,
1535
+ "<|action_531|>": 152203,
1536
+ "<|action_532|>": 152204,
1537
+ "<|action_533|>": 152205,
1538
+ "<|action_534|>": 152206,
1539
+ "<|action_535|>": 152207,
1540
+ "<|action_536|>": 152208,
1541
+ "<|action_537|>": 152209,
1542
+ "<|action_538|>": 152210,
1543
+ "<|action_539|>": 152211,
1544
+ "<|action_53|>": 151725,
1545
+ "<|action_540|>": 152212,
1546
+ "<|action_541|>": 152213,
1547
+ "<|action_542|>": 152214,
1548
+ "<|action_543|>": 152215,
1549
+ "<|action_544|>": 152216,
1550
+ "<|action_545|>": 152217,
1551
+ "<|action_546|>": 152218,
1552
+ "<|action_547|>": 152219,
1553
+ "<|action_548|>": 152220,
1554
+ "<|action_549|>": 152221,
1555
+ "<|action_54|>": 151726,
1556
+ "<|action_550|>": 152222,
1557
+ "<|action_551|>": 152223,
1558
+ "<|action_552|>": 152224,
1559
+ "<|action_553|>": 152225,
1560
+ "<|action_554|>": 152226,
1561
+ "<|action_555|>": 152227,
1562
+ "<|action_556|>": 152228,
1563
+ "<|action_557|>": 152229,
1564
+ "<|action_558|>": 152230,
1565
+ "<|action_559|>": 152231,
1566
+ "<|action_55|>": 151727,
1567
+ "<|action_560|>": 152232,
1568
+ "<|action_561|>": 152233,
1569
+ "<|action_562|>": 152234,
1570
+ "<|action_563|>": 152235,
1571
+ "<|action_564|>": 152236,
1572
+ "<|action_565|>": 152237,
1573
+ "<|action_566|>": 152238,
1574
+ "<|action_567|>": 152239,
1575
+ "<|action_568|>": 152240,
1576
+ "<|action_569|>": 152241,
1577
+ "<|action_56|>": 151728,
1578
+ "<|action_570|>": 152242,
1579
+ "<|action_571|>": 152243,
1580
+ "<|action_572|>": 152244,
1581
+ "<|action_573|>": 152245,
1582
+ "<|action_574|>": 152246,
1583
+ "<|action_575|>": 152247,
1584
+ "<|action_576|>": 152248,
1585
+ "<|action_577|>": 152249,
1586
+ "<|action_578|>": 152250,
1587
+ "<|action_579|>": 152251,
1588
+ "<|action_57|>": 151729,
1589
+ "<|action_580|>": 152252,
1590
+ "<|action_581|>": 152253,
1591
+ "<|action_582|>": 152254,
1592
+ "<|action_583|>": 152255,
1593
+ "<|action_584|>": 152256,
1594
+ "<|action_585|>": 152257,
1595
+ "<|action_586|>": 152258,
1596
+ "<|action_587|>": 152259,
1597
+ "<|action_588|>": 152260,
1598
+ "<|action_589|>": 152261,
1599
+ "<|action_58|>": 151730,
1600
+ "<|action_590|>": 152262,
1601
+ "<|action_591|>": 152263,
1602
+ "<|action_592|>": 152264,
1603
+ "<|action_593|>": 152265,
1604
+ "<|action_594|>": 152266,
1605
+ "<|action_595|>": 152267,
1606
+ "<|action_596|>": 152268,
1607
+ "<|action_597|>": 152269,
1608
+ "<|action_598|>": 152270,
1609
+ "<|action_599|>": 152271,
1610
+ "<|action_59|>": 151731,
1611
+ "<|action_5|>": 151677,
1612
+ "<|action_600|>": 152272,
1613
+ "<|action_601|>": 152273,
1614
+ "<|action_602|>": 152274,
1615
+ "<|action_603|>": 152275,
1616
+ "<|action_604|>": 152276,
1617
+ "<|action_605|>": 152277,
1618
+ "<|action_606|>": 152278,
1619
+ "<|action_607|>": 152279,
1620
+ "<|action_608|>": 152280,
1621
+ "<|action_609|>": 152281,
1622
+ "<|action_60|>": 151732,
1623
+ "<|action_610|>": 152282,
1624
+ "<|action_611|>": 152283,
1625
+ "<|action_612|>": 152284,
1626
+ "<|action_613|>": 152285,
1627
+ "<|action_614|>": 152286,
1628
+ "<|action_615|>": 152287,
1629
+ "<|action_616|>": 152288,
1630
+ "<|action_617|>": 152289,
1631
+ "<|action_618|>": 152290,
1632
+ "<|action_619|>": 152291,
1633
+ "<|action_61|>": 151733,
1634
+ "<|action_620|>": 152292,
1635
+ "<|action_621|>": 152293,
1636
+ "<|action_622|>": 152294,
1637
+ "<|action_623|>": 152295,
1638
+ "<|action_624|>": 152296,
1639
+ "<|action_625|>": 152297,
1640
+ "<|action_626|>": 152298,
1641
+ "<|action_627|>": 152299,
1642
+ "<|action_628|>": 152300,
1643
+ "<|action_629|>": 152301,
1644
+ "<|action_62|>": 151734,
1645
+ "<|action_630|>": 152302,
1646
+ "<|action_631|>": 152303,
1647
+ "<|action_632|>": 152304,
1648
+ "<|action_633|>": 152305,
1649
+ "<|action_634|>": 152306,
1650
+ "<|action_635|>": 152307,
1651
+ "<|action_636|>": 152308,
1652
+ "<|action_637|>": 152309,
1653
+ "<|action_638|>": 152310,
1654
+ "<|action_639|>": 152311,
1655
+ "<|action_63|>": 151735,
1656
+ "<|action_640|>": 152312,
1657
+ "<|action_641|>": 152313,
1658
+ "<|action_642|>": 152314,
1659
+ "<|action_643|>": 152315,
1660
+ "<|action_644|>": 152316,
1661
+ "<|action_645|>": 152317,
1662
+ "<|action_646|>": 152318,
1663
+ "<|action_647|>": 152319,
1664
+ "<|action_648|>": 152320,
1665
+ "<|action_649|>": 152321,
1666
+ "<|action_64|>": 151736,
1667
+ "<|action_650|>": 152322,
1668
+ "<|action_651|>": 152323,
1669
+ "<|action_652|>": 152324,
1670
+ "<|action_653|>": 152325,
1671
+ "<|action_654|>": 152326,
1672
+ "<|action_655|>": 152327,
1673
+ "<|action_656|>": 152328,
1674
+ "<|action_657|>": 152329,
1675
+ "<|action_658|>": 152330,
1676
+ "<|action_659|>": 152331,
1677
+ "<|action_65|>": 151737,
1678
+ "<|action_660|>": 152332,
1679
+ "<|action_661|>": 152333,
1680
+ "<|action_662|>": 152334,
1681
+ "<|action_663|>": 152335,
1682
+ "<|action_664|>": 152336,
1683
+ "<|action_665|>": 152337,
1684
+ "<|action_666|>": 152338,
1685
+ "<|action_667|>": 152339,
1686
+ "<|action_668|>": 152340,
1687
+ "<|action_669|>": 152341,
1688
+ "<|action_66|>": 151738,
1689
+ "<|action_670|>": 152342,
1690
+ "<|action_671|>": 152343,
1691
+ "<|action_672|>": 152344,
1692
+ "<|action_673|>": 152345,
1693
+ "<|action_674|>": 152346,
1694
+ "<|action_675|>": 152347,
1695
+ "<|action_676|>": 152348,
1696
+ "<|action_677|>": 152349,
1697
+ "<|action_678|>": 152350,
1698
+ "<|action_679|>": 152351,
1699
+ "<|action_67|>": 151739,
1700
+ "<|action_680|>": 152352,
1701
+ "<|action_681|>": 152353,
1702
+ "<|action_682|>": 152354,
1703
+ "<|action_683|>": 152355,
1704
+ "<|action_684|>": 152356,
1705
+ "<|action_685|>": 152357,
1706
+ "<|action_686|>": 152358,
1707
+ "<|action_687|>": 152359,
1708
+ "<|action_688|>": 152360,
1709
+ "<|action_689|>": 152361,
1710
+ "<|action_68|>": 151740,
1711
+ "<|action_690|>": 152362,
1712
+ "<|action_691|>": 152363,
1713
+ "<|action_692|>": 152364,
1714
+ "<|action_693|>": 152365,
1715
+ "<|action_694|>": 152366,
1716
+ "<|action_695|>": 152367,
1717
+ "<|action_696|>": 152368,
1718
+ "<|action_697|>": 152369,
1719
+ "<|action_698|>": 152370,
1720
+ "<|action_699|>": 152371,
1721
+ "<|action_69|>": 151741,
1722
+ "<|action_6|>": 151678,
1723
+ "<|action_700|>": 152372,
1724
+ "<|action_701|>": 152373,
1725
+ "<|action_702|>": 152374,
1726
+ "<|action_703|>": 152375,
1727
+ "<|action_704|>": 152376,
1728
+ "<|action_705|>": 152377,
1729
+ "<|action_706|>": 152378,
1730
+ "<|action_707|>": 152379,
1731
+ "<|action_708|>": 152380,
1732
+ "<|action_709|>": 152381,
1733
+ "<|action_70|>": 151742,
1734
+ "<|action_710|>": 152382,
1735
+ "<|action_711|>": 152383,
1736
+ "<|action_712|>": 152384,
1737
+ "<|action_713|>": 152385,
1738
+ "<|action_714|>": 152386,
1739
+ "<|action_715|>": 152387,
1740
+ "<|action_716|>": 152388,
1741
+ "<|action_717|>": 152389,
1742
+ "<|action_718|>": 152390,
1743
+ "<|action_719|>": 152391,
1744
+ "<|action_71|>": 151743,
1745
+ "<|action_720|>": 152392,
1746
+ "<|action_721|>": 152393,
1747
+ "<|action_722|>": 152394,
1748
+ "<|action_723|>": 152395,
1749
+ "<|action_724|>": 152396,
1750
+ "<|action_725|>": 152397,
1751
+ "<|action_726|>": 152398,
1752
+ "<|action_727|>": 152399,
1753
+ "<|action_728|>": 152400,
1754
+ "<|action_729|>": 152401,
1755
+ "<|action_72|>": 151744,
1756
+ "<|action_730|>": 152402,
1757
+ "<|action_731|>": 152403,
1758
+ "<|action_732|>": 152404,
1759
+ "<|action_733|>": 152405,
1760
+ "<|action_734|>": 152406,
1761
+ "<|action_735|>": 152407,
1762
+ "<|action_736|>": 152408,
1763
+ "<|action_737|>": 152409,
1764
+ "<|action_738|>": 152410,
1765
+ "<|action_739|>": 152411,
1766
+ "<|action_73|>": 151745,
1767
+ "<|action_740|>": 152412,
1768
+ "<|action_741|>": 152413,
1769
+ "<|action_742|>": 152414,
1770
+ "<|action_743|>": 152415,
1771
+ "<|action_744|>": 152416,
1772
+ "<|action_745|>": 152417,
1773
+ "<|action_746|>": 152418,
1774
+ "<|action_747|>": 152419,
1775
+ "<|action_748|>": 152420,
1776
+ "<|action_749|>": 152421,
1777
+ "<|action_74|>": 151746,
1778
+ "<|action_750|>": 152422,
1779
+ "<|action_751|>": 152423,
1780
+ "<|action_752|>": 152424,
1781
+ "<|action_753|>": 152425,
1782
+ "<|action_754|>": 152426,
1783
+ "<|action_755|>": 152427,
1784
+ "<|action_756|>": 152428,
1785
+ "<|action_757|>": 152429,
1786
+ "<|action_758|>": 152430,
1787
+ "<|action_759|>": 152431,
1788
+ "<|action_75|>": 151747,
1789
+ "<|action_760|>": 152432,
1790
+ "<|action_761|>": 152433,
1791
+ "<|action_762|>": 152434,
1792
+ "<|action_763|>": 152435,
1793
+ "<|action_764|>": 152436,
1794
+ "<|action_765|>": 152437,
1795
+ "<|action_766|>": 152438,
1796
+ "<|action_767|>": 152439,
1797
+ "<|action_768|>": 152440,
1798
+ "<|action_769|>": 152441,
1799
+ "<|action_76|>": 151748,
1800
+ "<|action_770|>": 152442,
1801
+ "<|action_771|>": 152443,
1802
+ "<|action_772|>": 152444,
1803
+ "<|action_773|>": 152445,
1804
+ "<|action_774|>": 152446,
1805
+ "<|action_775|>": 152447,
1806
+ "<|action_776|>": 152448,
1807
+ "<|action_777|>": 152449,
1808
+ "<|action_778|>": 152450,
1809
+ "<|action_779|>": 152451,
1810
+ "<|action_77|>": 151749,
1811
+ "<|action_780|>": 152452,
1812
+ "<|action_781|>": 152453,
1813
+ "<|action_782|>": 152454,
1814
+ "<|action_783|>": 152455,
1815
+ "<|action_784|>": 152456,
1816
+ "<|action_785|>": 152457,
1817
+ "<|action_786|>": 152458,
1818
+ "<|action_787|>": 152459,
1819
+ "<|action_788|>": 152460,
1820
+ "<|action_789|>": 152461,
1821
+ "<|action_78|>": 151750,
1822
+ "<|action_790|>": 152462,
1823
+ "<|action_791|>": 152463,
1824
+ "<|action_792|>": 152464,
1825
+ "<|action_793|>": 152465,
1826
+ "<|action_794|>": 152466,
1827
+ "<|action_795|>": 152467,
1828
+ "<|action_796|>": 152468,
1829
+ "<|action_797|>": 152469,
1830
+ "<|action_798|>": 152470,
1831
+ "<|action_799|>": 152471,
1832
+ "<|action_79|>": 151751,
1833
+ "<|action_7|>": 151679,
1834
+ "<|action_800|>": 152472,
1835
+ "<|action_801|>": 152473,
1836
+ "<|action_802|>": 152474,
1837
+ "<|action_803|>": 152475,
1838
+ "<|action_804|>": 152476,
1839
+ "<|action_805|>": 152477,
1840
+ "<|action_806|>": 152478,
1841
+ "<|action_807|>": 152479,
1842
+ "<|action_808|>": 152480,
1843
+ "<|action_809|>": 152481,
1844
+ "<|action_80|>": 151752,
1845
+ "<|action_810|>": 152482,
1846
+ "<|action_811|>": 152483,
1847
+ "<|action_812|>": 152484,
1848
+ "<|action_813|>": 152485,
1849
+ "<|action_814|>": 152486,
1850
+ "<|action_815|>": 152487,
1851
+ "<|action_816|>": 152488,
1852
+ "<|action_817|>": 152489,
1853
+ "<|action_818|>": 152490,
1854
+ "<|action_819|>": 152491,
1855
+ "<|action_81|>": 151753,
1856
+ "<|action_820|>": 152492,
1857
+ "<|action_821|>": 152493,
1858
+ "<|action_822|>": 152494,
1859
+ "<|action_823|>": 152495,
1860
+ "<|action_824|>": 152496,
1861
+ "<|action_825|>": 152497,
1862
+ "<|action_826|>": 152498,
1863
+ "<|action_827|>": 152499,
1864
+ "<|action_828|>": 152500,
1865
+ "<|action_829|>": 152501,
1866
+ "<|action_82|>": 151754,
1867
+ "<|action_830|>": 152502,
1868
+ "<|action_831|>": 152503,
1869
+ "<|action_832|>": 152504,
1870
+ "<|action_833|>": 152505,
1871
+ "<|action_834|>": 152506,
1872
+ "<|action_835|>": 152507,
1873
+ "<|action_836|>": 152508,
1874
+ "<|action_837|>": 152509,
1875
+ "<|action_838|>": 152510,
1876
+ "<|action_839|>": 152511,
1877
+ "<|action_83|>": 151755,
1878
+ "<|action_840|>": 152512,
1879
+ "<|action_841|>": 152513,
1880
+ "<|action_842|>": 152514,
1881
+ "<|action_843|>": 152515,
1882
+ "<|action_844|>": 152516,
1883
+ "<|action_845|>": 152517,
1884
+ "<|action_846|>": 152518,
1885
+ "<|action_847|>": 152519,
1886
+ "<|action_848|>": 152520,
1887
+ "<|action_849|>": 152521,
1888
+ "<|action_84|>": 151756,
1889
+ "<|action_850|>": 152522,
1890
+ "<|action_851|>": 152523,
1891
+ "<|action_852|>": 152524,
1892
+ "<|action_853|>": 152525,
1893
+ "<|action_854|>": 152526,
1894
+ "<|action_855|>": 152527,
1895
+ "<|action_856|>": 152528,
1896
+ "<|action_857|>": 152529,
1897
+ "<|action_858|>": 152530,
1898
+ "<|action_859|>": 152531,
1899
+ "<|action_85|>": 151757,
1900
+ "<|action_860|>": 152532,
1901
+ "<|action_861|>": 152533,
1902
+ "<|action_862|>": 152534,
1903
+ "<|action_863|>": 152535,
1904
+ "<|action_864|>": 152536,
1905
+ "<|action_865|>": 152537,
1906
+ "<|action_866|>": 152538,
1907
+ "<|action_867|>": 152539,
1908
+ "<|action_868|>": 152540,
1909
+ "<|action_869|>": 152541,
1910
+ "<|action_86|>": 151758,
1911
+ "<|action_870|>": 152542,
1912
+ "<|action_871|>": 152543,
1913
+ "<|action_872|>": 152544,
1914
+ "<|action_873|>": 152545,
1915
+ "<|action_874|>": 152546,
1916
+ "<|action_875|>": 152547,
1917
+ "<|action_876|>": 152548,
1918
+ "<|action_877|>": 152549,
1919
+ "<|action_878|>": 152550,
1920
+ "<|action_879|>": 152551,
1921
+ "<|action_87|>": 151759,
1922
+ "<|action_880|>": 152552,
1923
+ "<|action_881|>": 152553,
1924
+ "<|action_882|>": 152554,
1925
+ "<|action_883|>": 152555,
1926
+ "<|action_884|>": 152556,
1927
+ "<|action_885|>": 152557,
1928
+ "<|action_886|>": 152558,
1929
+ "<|action_887|>": 152559,
1930
+ "<|action_888|>": 152560,
1931
+ "<|action_889|>": 152561,
1932
+ "<|action_88|>": 151760,
1933
+ "<|action_890|>": 152562,
1934
+ "<|action_891|>": 152563,
1935
+ "<|action_892|>": 152564,
1936
+ "<|action_893|>": 152565,
1937
+ "<|action_894|>": 152566,
1938
+ "<|action_895|>": 152567,
1939
+ "<|action_896|>": 152568,
1940
+ "<|action_897|>": 152569,
1941
+ "<|action_898|>": 152570,
1942
+ "<|action_899|>": 152571,
1943
+ "<|action_89|>": 151761,
1944
+ "<|action_8|>": 151680,
1945
+ "<|action_900|>": 152572,
1946
+ "<|action_901|>": 152573,
1947
+ "<|action_902|>": 152574,
1948
+ "<|action_903|>": 152575,
1949
+ "<|action_904|>": 152576,
1950
+ "<|action_905|>": 152577,
1951
+ "<|action_906|>": 152578,
1952
+ "<|action_907|>": 152579,
1953
+ "<|action_908|>": 152580,
1954
+ "<|action_909|>": 152581,
1955
+ "<|action_90|>": 151762,
1956
+ "<|action_910|>": 152582,
1957
+ "<|action_911|>": 152583,
1958
+ "<|action_912|>": 152584,
1959
+ "<|action_913|>": 152585,
1960
+ "<|action_914|>": 152586,
1961
+ "<|action_915|>": 152587,
1962
+ "<|action_916|>": 152588,
1963
+ "<|action_917|>": 152589,
1964
+ "<|action_918|>": 152590,
1965
+ "<|action_919|>": 152591,
1966
+ "<|action_91|>": 151763,
1967
+ "<|action_920|>": 152592,
1968
+ "<|action_921|>": 152593,
1969
+ "<|action_922|>": 152594,
1970
+ "<|action_923|>": 152595,
1971
+ "<|action_924|>": 152596,
1972
+ "<|action_925|>": 152597,
1973
+ "<|action_926|>": 152598,
1974
+ "<|action_927|>": 152599,
1975
+ "<|action_928|>": 152600,
1976
+ "<|action_929|>": 152601,
1977
+ "<|action_92|>": 151764,
1978
+ "<|action_930|>": 152602,
1979
+ "<|action_931|>": 152603,
1980
+ "<|action_932|>": 152604,
1981
+ "<|action_933|>": 152605,
1982
+ "<|action_934|>": 152606,
1983
+ "<|action_935|>": 152607,
1984
+ "<|action_936|>": 152608,
1985
+ "<|action_937|>": 152609,
1986
+ "<|action_938|>": 152610,
1987
+ "<|action_939|>": 152611,
1988
+ "<|action_93|>": 151765,
1989
+ "<|action_940|>": 152612,
1990
+ "<|action_941|>": 152613,
1991
+ "<|action_942|>": 152614,
1992
+ "<|action_943|>": 152615,
1993
+ "<|action_944|>": 152616,
1994
+ "<|action_945|>": 152617,
1995
+ "<|action_946|>": 152618,
1996
+ "<|action_947|>": 152619,
1997
+ "<|action_948|>": 152620,
1998
+ "<|action_949|>": 152621,
1999
+ "<|action_94|>": 151766,
2000
+ "<|action_950|>": 152622,
2001
+ "<|action_951|>": 152623,
2002
+ "<|action_952|>": 152624,
2003
+ "<|action_953|>": 152625,
2004
+ "<|action_954|>": 152626,
2005
+ "<|action_955|>": 152627,
2006
+ "<|action_956|>": 152628,
2007
+ "<|action_957|>": 152629,
2008
+ "<|action_958|>": 152630,
2009
+ "<|action_959|>": 152631,
2010
+ "<|action_95|>": 151767,
2011
+ "<|action_960|>": 152632,
2012
+ "<|action_961|>": 152633,
2013
+ "<|action_962|>": 152634,
2014
+ "<|action_963|>": 152635,
2015
+ "<|action_964|>": 152636,
2016
+ "<|action_965|>": 152637,
2017
+ "<|action_966|>": 152638,
2018
+ "<|action_967|>": 152639,
2019
+ "<|action_968|>": 152640,
2020
+ "<|action_969|>": 152641,
2021
+ "<|action_96|>": 151768,
2022
+ "<|action_970|>": 152642,
2023
+ "<|action_971|>": 152643,
2024
+ "<|action_972|>": 152644,
2025
+ "<|action_973|>": 152645,
2026
+ "<|action_974|>": 152646,
2027
+ "<|action_975|>": 152647,
2028
+ "<|action_976|>": 152648,
2029
+ "<|action_977|>": 152649,
2030
+ "<|action_978|>": 152650,
2031
+ "<|action_979|>": 152651,
2032
+ "<|action_97|>": 151769,
2033
+ "<|action_980|>": 152652,
2034
+ "<|action_981|>": 152653,
2035
+ "<|action_982|>": 152654,
2036
+ "<|action_983|>": 152655,
2037
+ "<|action_984|>": 152656,
2038
+ "<|action_985|>": 152657,
2039
+ "<|action_986|>": 152658,
2040
+ "<|action_987|>": 152659,
2041
+ "<|action_988|>": 152660,
2042
+ "<|action_989|>": 152661,
2043
+ "<|action_98|>": 151770,
2044
+ "<|action_990|>": 152662,
2045
+ "<|action_991|>": 152663,
2046
+ "<|action_992|>": 152664,
2047
+ "<|action_993|>": 152665,
2048
+ "<|action_994|>": 152666,
2049
+ "<|action_995|>": 152667,
2050
+ "<|action_996|>": 152668,
2051
+ "<|action_997|>": 152669,
2052
+ "<|action_998|>": 152670,
2053
+ "<|action_999|>": 152671,
2054
+ "<|action_99|>": 151771,
2055
+ "<|action_9|>": 151681,
2056
+ "<|action_end|>": 151670,
2057
+ "<|action_placeholder|>": 151671,
2058
+ "<|action_start|>": 151669,
2059
+ "<|box_end|>": 151649,
2060
+ "<|box_start|>": 151648,
2061
+ "<|endoftext|>": 151643,
2062
+ "<|file_sep|>": 151664,
2063
+ "<|fim_middle|>": 151660,
2064
+ "<|fim_pad|>": 151662,
2065
+ "<|fim_prefix|>": 151659,
2066
+ "<|fim_suffix|>": 151661,
2067
+ "<|im_end|>": 151645,
2068
+ "<|im_start|>": 151644,
2069
+ "<|image_pad|>": 151655,
2070
+ "<|object_ref_end|>": 151647,
2071
+ "<|object_ref_start|>": 151646,
2072
+ "<|quad_end|>": 151651,
2073
+ "<|quad_start|>": 151650,
2074
+ "<|repo_name|>": 151663,
2075
+ "<|video_pad|>": 151656,
2076
+ "<|vision_end|>": 151653,
2077
+ "<|vision_pad|>": 151654,
2078
+ "<|vision_start|>": 151652
2079
+ }
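The map above closes added_tokens.json: besides the stock Qwen special tokens, it registers a block of numbered <|action_k|> tokens (up to <|action_999|> = 152671) framed by <|action_start|> (151669), <|action_end|> (151670) and <|action_placeholder|> (151671). A minimal sketch of reading those ids back, assuming a local checkout of this repository and the transformers AutoTokenizer; the regex and the example bins are illustrative, not part of the uploaded files.

import re
from transformers import AutoTokenizer

# Assumes the current directory is a checkout of this repository.
tokenizer = AutoTokenizer.from_pretrained(".")

def action_bins(text):
    # Recover the integer bin k from every <|action_k|> token in a decoded string.
    return [int(k) for k in re.findall(r"<\|action_(\d+)\|>", text)]

chunk = "<|action_start|>" + "".join("<|action_%d|>" % k for k in (12, 500, 999)) + "<|action_end|>"
print(tokenizer.convert_tokens_to_ids(["<|action_start|>", "<|action_end|>", "<|action_999|>"]))
# -> [151669, 151670, 152671] per the map above
print(action_bins(chunk))  # -> [12, 500, 999]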
chat_template.jinja ADDED
@@ -0,0 +1,120 @@
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {%- if messages[0].content is string %}
5
+ {{- messages[0].content }}
6
+ {%- else %}
7
+ {%- for content in messages[0].content %}
8
+ {%- if 'text' in content %}
9
+ {{- content.text }}
10
+ {%- endif %}
11
+ {%- endfor %}
12
+ {%- endif %}
13
+ {{- '\n\n' }}
14
+ {%- endif %}
15
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
16
+ {%- for tool in tools %}
17
+ {{- "\n" }}
18
+ {{- tool | tojson }}
19
+ {%- endfor %}
20
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
21
+ {%- else %}
22
+ {%- if messages[0].role == 'system' %}
23
+ {{- '<|im_start|>system\n' }}
24
+ {%- if messages[0].content is string %}
25
+ {{- messages[0].content }}
26
+ {%- else %}
27
+ {%- for content in messages[0].content %}
28
+ {%- if 'text' in content %}
29
+ {{- content.text }}
30
+ {%- endif %}
31
+ {%- endfor %}
32
+ {%- endif %}
33
+ {{- '<|im_end|>\n' }}
34
+ {%- endif %}
35
+ {%- endif %}
36
+ {%- set image_count = namespace(value=0) %}
37
+ {%- set video_count = namespace(value=0) %}
38
+ {%- for message in messages %}
39
+ {%- if message.role == "user" %}
40
+ {{- '<|im_start|>' + message.role + '\n' }}
41
+ {%- if message.content is string %}
42
+ {{- message.content }}
43
+ {%- else %}
44
+ {%- for content in message.content %}
45
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
46
+ {%- set image_count.value = image_count.value + 1 %}
47
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
48
+ <|vision_start|><|image_pad|><|vision_end|>
49
+ {%- elif content.type == 'video' or 'video' in content %}
50
+ {%- set video_count.value = video_count.value + 1 %}
51
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
52
+ <|vision_start|><|video_pad|><|vision_end|>
53
+ {%- elif 'text' in content %}
54
+ {{- content.text }}
55
+ {%- endif %}
56
+ {%- endfor %}
57
+ {%- endif %}
58
+ {{- '<|im_end|>\n' }}
59
+ {%- elif message.role == "assistant" %}
60
+ {{- '<|im_start|>' + message.role + '\n' }}
61
+ {%- if message.content is string %}
62
+ {{- message.content }}
63
+ {%- else %}
64
+ {%- for content_item in message.content %}
65
+ {%- if 'text' in content_item %}
66
+ {{- content_item.text }}
67
+ {%- endif %}
68
+ {%- endfor %}
69
+ {%- endif %}
70
+ {%- if message.tool_calls %}
71
+ {%- for tool_call in message.tool_calls %}
72
+ {%- if (loop.first and message.content) or (not loop.first) %}
73
+ {{- '\n' }}
74
+ {%- endif %}
75
+ {%- if tool_call.function %}
76
+ {%- set tool_call = tool_call.function %}
77
+ {%- endif %}
78
+ {{- '<tool_call>\n{"name": "' }}
79
+ {{- tool_call.name }}
80
+ {{- '", "arguments": ' }}
81
+ {%- if tool_call.arguments is string %}
82
+ {{- tool_call.arguments }}
83
+ {%- else %}
84
+ {{- tool_call.arguments | tojson }}
85
+ {%- endif %}
86
+ {{- '}\n</tool_call>' }}
87
+ {%- endfor %}
88
+ {%- endif %}
89
+ {{- '<|im_end|>\n' }}
90
+ {%- elif message.role == "tool" %}
91
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
92
+ {{- '<|im_start|>user' }}
93
+ {%- endif %}
94
+ {{- '\n<tool_response>\n' }}
95
+ {%- if message.content is string %}
96
+ {{- message.content }}
97
+ {%- else %}
98
+ {%- for content in message.content %}
99
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
100
+ {%- set image_count.value = image_count.value + 1 %}
101
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
102
+ <|vision_start|><|image_pad|><|vision_end|>
103
+ {%- elif content.type == 'video' or 'video' in content %}
104
+ {%- set video_count.value = video_count.value + 1 %}
105
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
106
+ <|vision_start|><|video_pad|><|vision_end|>
107
+ {%- elif 'text' in content %}
108
+ {{- content.text }}
109
+ {%- endif %}
110
+ {%- endfor %}
111
+ {%- endif %}
112
+ {{- '\n</tool_response>' }}
113
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
114
+ {{- '<|im_end|>\n' }}
115
+ {%- endif %}
116
+ {%- endif %}
117
+ {%- endfor %}
118
+ {%- if add_generation_prompt %}
119
+ {{- '<|im_start|>assistant\n' }}
120
+ {%- endif %}
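The Jinja template above follows the ChatML-style Qwen convention: optional system and <tools> sections, <|vision_start|><|image_pad|><|vision_end|> placeholders for image and video entries, tool-call and tool-response wrapping, and an optional trailing assistant header. A minimal sketch of rendering it, assuming a local checkout and that transformers picks up this chat_template.jinja with the tokenizer; the message payload is illustrative.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")

messages = [
    {"role": "system", "content": "You are a helpful robot policy."},
    {"role": "user", "content": [
        {"type": "image", "image": "frame_000.png"},   # rendered as <|vision_start|><|image_pad|><|vision_end|>
        {"type": "text", "text": "Pick up the red block."},
    ]},
]

prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # appends '<|im_start|>assistant\n' per the template's final branch
)
print(prompt)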
config.json ADDED
@@ -0,0 +1,68 @@
+ {
+   "architectures": [
+     "Qwen3VLForConditionalGeneration"
+   ],
+   "dtype": "bfloat16",
+   "eos_token_id": 151645,
+   "image_token_id": 151655,
+   "model_type": "qwen3_vl",
+   "pad_token_id": 151643,
+   "text_config": {
+     "attention_bias": false,
+     "attention_dropout": 0.0,
+     "bos_token_id": 151643,
+     "dtype": "bfloat16",
+     "eos_token_id": 151645,
+     "head_dim": 128,
+     "hidden_act": "silu",
+     "hidden_size": 4096,
+     "initializer_range": 0.02,
+     "intermediate_size": 12288,
+     "max_position_embeddings": 262144,
+     "model_type": "qwen3_vl_text",
+     "num_attention_heads": 32,
+     "num_hidden_layers": 36,
+     "num_key_value_heads": 8,
+     "rms_norm_eps": 1e-06,
+     "rope_parameters": {
+       "mrope_interleaved": true,
+       "mrope_section": [
+         24,
+         20,
+         20
+       ],
+       "rope_theta": 5000000,
+       "rope_type": "default"
+     },
+     "rope_theta": 5000000,
+     "use_cache": true,
+     "vocab_size": 153720
+   },
+   "tie_word_embeddings": false,
+   "transformers_version": "5.0.0.dev0",
+   "use_cache": false,
+   "video_token_id": 151656,
+   "vision_config": {
+     "deepstack_visual_indexes": [
+       8,
+       16,
+       24
+     ],
+     "depth": 27,
+     "dtype": "bfloat16",
+     "hidden_act": "gelu_pytorch_tanh",
+     "hidden_size": 1152,
+     "in_channels": 3,
+     "initializer_range": 0.02,
+     "intermediate_size": 4304,
+     "model_type": "qwen3_vl",
+     "num_heads": 16,
+     "num_position_embeddings": 2304,
+     "out_hidden_size": 4096,
+     "patch_size": 16,
+     "spatial_merge_size": 2,
+     "temporal_patch_size": 2
+   },
+   "vision_end_token_id": 151653,
+   "vision_start_token_id": 151652
+ }
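config.json above wires a 36-layer, 4096-wide text decoder (vocab_size 153720, large enough to cover the added action tokens) to a 27-block vision tower with 16-pixel patches and 2x2 spatial merging. A minimal sketch of inspecting it and estimating how many <|image_pad|> positions one image occupies after merging, assuming a transformers build that registers the qwen3_vl architecture; the patch-count formula follows the usual Qwen-VL convention and is an assumption here, not something stated in the file.

from transformers import AutoConfig

cfg = AutoConfig.from_pretrained(".")
vis = cfg.vision_config

def image_pad_tokens(height, width):
    # patch_size x patch_size pixel patches, merged in spatial_merge_size^2 groups
    patches = (height // vis.patch_size) * (width // vis.patch_size)
    return patches // (vis.spatial_merge_size ** 2)

print(cfg.text_config.num_hidden_layers, cfg.text_config.hidden_size)  # 36 4096
print(image_pad_tokens(448, 448))  # 28 * 28 patches -> 196 merged tokens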
contextvla.py ADDED
@@ -0,0 +1,126 @@
+ import os
+ import torch.nn as nn
+ import torch
+ import torch.distributed as dist
+
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
+ world_size = torch.cuda.device_count()
+
+ rank = local_rank
+
+ class LayerWrapper(nn.Module):
+     def __init__(
+         self,
+         layer,
+         layer_idx,
+         internal_projection=4,
+         img_pattern=[151652],
+         motion_token=0
+     ):
+         super().__init__()
+         self.layer = layer
+         self.layer_idx = layer_idx
+         self.internal_projection = internal_projection
+         self.motion_token = motion_token
+         self.img_pattern = img_pattern
+         assert motion_token == 1
+
+     def get_removing_indices(self, hidden_states, input_ids):
+         pat_len = len(self.img_pattern)
+
+         windows = input_ids.unfold(dimension=1, size=pat_len, step=1)
+         pattern_tensor = torch.tensor(self.img_pattern, device=hidden_states.device).view(1, 1, -1)
+         matches = (windows == pattern_tensor).all(dim=-1)
+
+         match_lists = [torch.nonzero(matches[b], as_tuple=False).squeeze(-1) for b in range(hidden_states.shape[0])]
+         begin_idx = torch.tensor([m[0] for m in match_lists], device=hidden_states.device).unsqueeze(1)
+         end_idx = torch.tensor([m[-1] for m in match_lists], device=hidden_states.device).unsqueeze(1)
+
+         return begin_idx, end_idx
+
+     def left_pad_emb_list(self, emb_list):
+         rev = [e.flip(0) for e in emb_list]
+         padded_rev = torch.nn.utils.rnn.pad_sequence(rev, batch_first=True, padding_value=0)
+         return padded_rev.flip(1)
+
+     def forward(self, hidden_states, input_ids, *args, **kwargs):
+         bsz, seq_len, dim = hidden_states.shape
+
+         is_incremental = (
+             "cache_position" in kwargs
+             and kwargs["cache_position"] is not None
+             and seq_len == 1
+         )
+         if self.layer_idx == self.internal_projection and not is_incremental:
+             device = hidden_states.device
+
+             token_indices = torch.arange(seq_len, device=device).view(1, -1).expand(bsz, -1)
+             begin_idx, end_idx = self.get_removing_indices(hidden_states, input_ids)
+
+             compress_mask = (end_idx > begin_idx).reshape(-1)
+
+             keep_mask_front = token_indices < begin_idx
+             keep_mask_back = token_indices >= end_idx
+             drop_mask = ~(keep_mask_front | keep_mask_back)
+
+             motion_token = (
+                 (hidden_states * drop_mask.unsqueeze(-1)).sum(dim=1)
+                 / drop_mask.sum(dim=1, keepdim=True).clamp(min=1)
+             ).reshape(bsz, self.motion_token, -1)
+
+             hidden_states = [
+                 torch.cat([
+                     hidden_states[b][keep_mask_front[b]],
+                     motion_token[b] if compress_mask[b] else torch.tensor([], device=hidden_states.device, dtype=hidden_states.dtype),
+                     hidden_states[b][keep_mask_back[b]]
+                 ], dim=0) for b in range(bsz)
+             ]
+
+             hidden_states = self.left_pad_emb_list(hidden_states)
+
+             if 'attention_mask' in kwargs and kwargs['attention_mask'] is not None:
+                 att_list = [
+                     torch.cat([
+                         kwargs["attention_mask"][b][keep_mask_front[b]],
+                         torch.ones(1, device=kwargs["attention_mask"].device, dtype=kwargs["attention_mask"].dtype) if compress_mask[b] else torch.tensor([], device=kwargs["attention_mask"].device, dtype=kwargs["attention_mask"].dtype),
+                         kwargs["attention_mask"][b][keep_mask_back[b]],
+                     ]) for b in range(bsz)
+                 ]
+                 kwargs["attention_mask"] = self.left_pad_emb_list(att_list)
+
+             if 'position_ids' in kwargs.keys() and kwargs['position_ids'] is not None:
+                 pos_list = [
+                     torch.cat([
+                         kwargs["position_ids"][b][keep_mask_front[b]],
+                         kwargs["position_ids"][b][begin_idx[b]:begin_idx[b]+1] if compress_mask[b] else torch.tensor([], device=kwargs["position_ids"].device, dtype=kwargs["position_ids"].dtype),
+                         kwargs["position_ids"][b][keep_mask_back[b]],
+                     ]) for b in range(bsz)
+                 ]
+                 kwargs["position_ids"] = self.left_pad_emb_list(pos_list)
+
+             if 'position_embeddings' in kwargs.keys() and kwargs['position_embeddings'] is not None:
+                 emb_x_list = [
+                     torch.cat([
+                         kwargs["position_embeddings"][0][b][keep_mask_front[b]],
+                         kwargs["position_embeddings"][0][b][begin_idx[b]:begin_idx[b]+1] if compress_mask[b] else torch.tensor([], device=kwargs["position_embeddings"][0].device, dtype=kwargs["position_embeddings"][0].dtype),
+                         kwargs["position_embeddings"][0][b][keep_mask_back[b]],
+                     ], dim=0) for b in range(bsz)
+                 ]
+
+                 emb_y_list = [
+                     torch.cat([
+                         kwargs["position_embeddings"][1][b][keep_mask_front[b]],
+                         kwargs["position_embeddings"][1][b][begin_idx[b]:begin_idx[b]+1] if compress_mask[b] else torch.tensor([], device=kwargs["position_embeddings"][0].device, dtype=kwargs["position_embeddings"][0].dtype),
+                         kwargs["position_embeddings"][1][b][keep_mask_back[b]],
+                     ], dim=0) for b in range(bsz)
+                 ]
+
+                 emb_x_padded = self.left_pad_emb_list(emb_x_list)
+                 emb_y_padded = self.left_pad_emb_list(emb_y_list)
+                 kwargs["position_embeddings"] = (emb_x_padded, emb_y_padded)
+
+             if "cache_position" in kwargs and kwargs["cache_position"] is not None:
+                 kwargs["cache_position"] = kwargs["cache_position"][: hidden_states.shape[1]]
+
+         return self.layer(hidden_states, *args, **kwargs), kwargs
+
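contextvla.py above defines LayerWrapper, which at one chosen decoder layer (internal_projection) mean-pools the hidden states spanning from the first to the last match of img_pattern (151652, the vision_start_token_id from config.json) into a single motion embedding, then rebuilds the sequence, attention mask, position ids and rotary embeddings around it with left padding. The weight_map below stores keys such as model.language_model.layers.0.layer.input_layernorm.weight, whose extra ".layer." segment is consistent with every decoder layer being wrapped this way. A minimal sketch of applying the wrapper, assuming the usual Qwen3-VL module path and that the surrounding forward pass is adapted to pass input_ids and unpack the (output, kwargs) tuple; the chosen values are illustrative.

import torch.nn as nn
from contextvla import LayerWrapper

def wrap_decoder_layers(model, internal_projection=4, img_pattern=(151652,)):
    # Assumed module path for Qwen3VLForConditionalGeneration; adjust if it differs.
    layers = model.model.language_model.layers
    model.model.language_model.layers = nn.ModuleList([
        LayerWrapper(
            layer,
            layer_idx=i,
            internal_projection=internal_projection,
            img_pattern=list(img_pattern),
            motion_token=1,  # LayerWrapper asserts motion_token == 1
        )
        for i, layer in enumerate(layers)
    ])
    # Note: the wrapped forward expects input_ids as its second argument and returns
    # (layer_output, kwargs), so the model's decoder loop must be patched accordingly.
    return model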
generation_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "do_sample": true,
+   "eos_token_id": [
+     151645,
+     151645,
+     151643
+   ],
+   "pad_token_id": 151643,
+   "temperature": 0.7,
+   "top_k": 20,
+   "top_p": 0.8,
+   "transformers_version": "5.0.0.dev0"
+ }
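generation_config.json above fixes the decoding defaults (sampling with temperature 0.7, top_p 0.8, top_k 20; note that 151645 is listed twice under eos_token_id). A minimal sketch of loading and inspecting it, assuming a local checkout; the commented generate call is illustrative and assumes a loaded model and prepared inputs.

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained(".")
print(gen_cfg.do_sample, gen_cfg.temperature, gen_cfg.top_p, gen_cfg.top_k)  # True 0.7 0.8 20
print(gen_cfg.eos_token_id)  # [151645, 151645, 151643]

# outputs = model.generate(**inputs, generation_config=gen_cfg, max_new_tokens=256)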
latest ADDED
@@ -0,0 +1 @@
+ global_step70000
merges.txt ADDED
The diff for this file is too large to render. See raw diff
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3f2607e81681f17af60e21c27c8ece74d54fdc00acdf306165af4a5135318d6a
+ size 4912008096
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:573b5c8e794a57e60078ea152ac8b553221636b0a0b0e6707e0c3ffb23cfe219
+ size 4915963312
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c58179bb7001dfd964833a6345795036c12d74f0ca8685554ff0bcc365c70264
+ size 4983071440
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5f906abc28a935826db5e7ed9e5b1bededb367fb7fee1c832ff8cbf70401012
+ size 2752528080
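The four shards above are stored as Git LFS pointers, so only the sha256 oid and the byte size live in the repository. A minimal sketch of summing those sizes; the roughly 94 kB gap versus the total_size recorded in model.safetensors.index.json below is most plausibly the per-shard safetensors headers, which is an interpretation on my part, not something the files state.

shard_sizes = {
    "model-00001-of-00004.safetensors": 4_912_008_096,
    "model-00002-of-00004.safetensors": 4_915_963_312,
    "model-00003-of-00004.safetensors": 4_983_071_440,
    "model-00004-of-00004.safetensors": 2_752_528_080,
}
total = sum(shard_sizes.values())
print(total)                       # 17563570928 bytes on disk
print(round(total / 1024**3, 2))   # ~16.36 GiB
print(total - 17_563_476_448)      # 94480 bytes above the index's total_size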
model.safetensors.index.json ADDED
@@ -0,0 +1,758 @@
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 770288,
4
+ "total_size": 17563476448
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.language_model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.language_model.layers.0.layer.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "model.language_model.layers.0.layer.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.language_model.layers.0.layer.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.language_model.layers.0.layer.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.language_model.layers.0.layer.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "model.language_model.layers.0.layer.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
15
+ "model.language_model.layers.0.layer.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.language_model.layers.0.layer.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.language_model.layers.0.layer.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
18
+ "model.language_model.layers.0.layer.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.language_model.layers.0.layer.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.language_model.layers.1.layer.input_layernorm.weight": "model-00001-of-00004.safetensors",
21
+ "model.language_model.layers.1.layer.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.language_model.layers.1.layer.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.language_model.layers.1.layer.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.language_model.layers.1.layer.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "model.language_model.layers.1.layer.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
26
+ "model.language_model.layers.1.layer.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.language_model.layers.1.layer.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.language_model.layers.1.layer.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
29
+ "model.language_model.layers.1.layer.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
30
+ "model.language_model.layers.1.layer.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.language_model.layers.10.layer.input_layernorm.weight": "model-00002-of-00004.safetensors",
32
+ "model.language_model.layers.10.layer.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
33
+ "model.language_model.layers.10.layer.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.language_model.layers.10.layer.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.language_model.layers.10.layer.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
36
+ "model.language_model.layers.10.layer.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
37
+ "model.language_model.layers.10.layer.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
38
+ "model.language_model.layers.10.layer.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.language_model.layers.10.layer.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
40
+ "model.language_model.layers.10.layer.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.language_model.layers.10.layer.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
42
+ "model.language_model.layers.11.layer.input_layernorm.weight": "model-00002-of-00004.safetensors",
43
+ "model.language_model.layers.11.layer.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.language_model.layers.11.layer.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.language_model.layers.11.layer.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.language_model.layers.11.layer.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
47
+ "model.language_model.layers.11.layer.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
48
+ "model.language_model.layers.11.layer.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.language_model.layers.11.layer.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
50
+ "model.language_model.layers.11.layer.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
51
+ "model.language_model.layers.11.layer.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.language_model.layers.11.layer.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.language_model.layers.12.layer.input_layernorm.weight": "model-00002-of-00004.safetensors",
54
+ "model.language_model.layers.12.layer.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.language_model.layers.12.layer.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
56
+ "model.language_model.layers.12.layer.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.language_model.layers.12.layer.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "model.language_model.layers.12.layer.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
59
+ "model.language_model.layers.12.layer.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.language_model.layers.12.layer.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.language_model.layers.12.layer.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
62
+ "model.language_model.layers.12.layer.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.language_model.layers.12.layer.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.language_model.layers.13.layer.input_layernorm.weight": "model-00002-of-00004.safetensors",
65
+ "model.language_model.layers.13.layer.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.language_model.layers.13.layer.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.language_model.layers.13.layer.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.language_model.layers.13.layer.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
69
+ "model.language_model.layers.13.layer.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
70
+ "model.language_model.layers.13.layer.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.language_model.layers.13.layer.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.language_model.layers.13.layer.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
73
+ "model.language_model.layers.13.layer.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
74
+ "model.language_model.layers.13.layer.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.language_model.layers.14.layer.input_layernorm.weight": "model-00002-of-00004.safetensors",
76
+ "model.language_model.layers.14.layer.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.language_model.layers.14.layer.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
78
+ "model.language_model.layers.14.layer.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.language_model.layers.14.layer.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
80
+ "model.language_model.layers.14.layer.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
81
+ "model.language_model.layers.14.layer.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.language_model.layers.14.layer.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.language_model.layers.14.layer.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
84
+ "model.language_model.layers.14.layer.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.language_model.layers.14.layer.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
86
+ "model.language_model.layers.15.layer.input_layernorm.weight": "model-00002-of-00004.safetensors",
87
+ "model.language_model.layers.15.layer.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.language_model.layers.15.layer.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.language_model.layers.15.layer.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.language_model.layers.15.layer.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
91
+ "model.language_model.layers.15.layer.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
92
+ "model.language_model.layers.15.layer.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.language_model.layers.15.layer.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.language_model.layers.15.layer.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
95
+ "model.language_model.layers.15.layer.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.language_model.layers.15.layer.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.language_model.layers.16.layer.input_layernorm.weight": "model-00002-of-00004.safetensors",
98
+ "model.language_model.layers.16.layer.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.language_model.layers.16.layer.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.language_model.layers.16.layer.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.language_model.layers.16.layer.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
102
+ "model.language_model.layers.16.layer.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
103
+ "model.language_model.layers.16.layer.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.language_model.layers.16.layer.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.language_model.layers.16.layer.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
106
+ "model.language_model.layers.16.layer.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.language_model.layers.16.layer.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.language_model.layers.17.layer.input_layernorm.weight": "model-00002-of-00004.safetensors",
109
+ "model.language_model.layers.17.layer.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
110
+ "model.language_model.layers.17.layer.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.language_model.layers.17.layer.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.language_model.layers.17.layer.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
113
+ "model.language_model.layers.17.layer.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
114
+ "model.language_model.layers.17.layer.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.language_model.layers.17.layer.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.language_model.layers.17.layer.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
117
+ "model.language_model.layers.17.layer.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
118
+ "model.language_model.layers.17.layer.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.language_model.layers.18.layer.input_layernorm.weight": "model-00002-of-00004.safetensors",
120
+ "model.language_model.layers.18.layer.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
121
+ "model.language_model.layers.18.layer.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
122
+ "model.language_model.layers.18.layer.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
123
+ "model.language_model.layers.18.layer.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
124
+ "model.language_model.layers.18.layer.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
125
+ "model.language_model.layers.18.layer.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
126
+ "model.language_model.layers.18.layer.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
127
+ "model.language_model.layers.18.layer.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
128
+ "model.language_model.layers.18.layer.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
129
+ "model.language_model.layers.18.layer.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
130
+ "model.language_model.layers.19.layer.input_layernorm.weight": "model-00003-of-00004.safetensors",
131
+ "model.language_model.layers.19.layer.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
132
+ "model.language_model.layers.19.layer.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
133
+ "model.language_model.layers.19.layer.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
134
+ "model.language_model.layers.19.layer.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
135
+ "model.language_model.layers.19.layer.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
136
+ "model.language_model.layers.19.layer.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.language_model.layers.19.layer.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
138
+ "model.language_model.layers.19.layer.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
139
+ "model.language_model.layers.19.layer.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.language_model.layers.19.layer.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
141
+ "model.language_model.layers.2.layer.input_layernorm.weight": "model-00001-of-00004.safetensors",
142
+ "model.language_model.layers.2.layer.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
143
+ "model.language_model.layers.2.layer.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
144
+ "model.language_model.layers.2.layer.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
145
+ "model.language_model.layers.2.layer.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
146
+ "model.language_model.layers.2.layer.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
147
+ "model.language_model.layers.2.layer.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
148
+ "model.language_model.layers.2.layer.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
149
+ "model.language_model.layers.2.layer.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
150
+ "model.language_model.layers.2.layer.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
151
+ "model.language_model.layers.2.layer.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
152
+ "model.language_model.layers.20.layer.input_layernorm.weight": "model-00003-of-00004.safetensors",
153
+ "model.language_model.layers.20.layer.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
154
+ "model.language_model.layers.20.layer.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
155
+ "model.language_model.layers.20.layer.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
156
+ "model.language_model.layers.20.layer.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
157
+ "model.language_model.layers.20.layer.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
158
+ "model.language_model.layers.20.layer.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
159
+ "model.language_model.layers.20.layer.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
160
+ "model.language_model.layers.20.layer.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
161
+ "model.language_model.layers.20.layer.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
162
+ "model.language_model.layers.20.layer.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
163
+ "model.language_model.layers.21.layer.input_layernorm.weight": "model-00003-of-00004.safetensors",
164
+ "model.language_model.layers.21.layer.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
165
+ "model.language_model.layers.21.layer.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
166
+ "model.language_model.layers.21.layer.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.language_model.layers.21.layer.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
168
+ "model.language_model.layers.21.layer.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
169
+ "model.language_model.layers.21.layer.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
170
+ "model.language_model.layers.21.layer.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
171
+ "model.language_model.layers.21.layer.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
172
+ "model.language_model.layers.21.layer.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.language_model.layers.21.layer.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.language_model.layers.22.layer.input_layernorm.weight": "model-00003-of-00004.safetensors",
175
+ "model.language_model.layers.22.layer.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.language_model.layers.22.layer.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.language_model.layers.22.layer.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.language_model.layers.22.layer.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
179
+ "model.language_model.layers.22.layer.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
180
+ "model.language_model.layers.22.layer.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.language_model.layers.22.layer.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
182
+ "model.language_model.layers.22.layer.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
183
+ "model.language_model.layers.22.layer.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.language_model.layers.22.layer.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.language_model.layers.23.layer.input_layernorm.weight": "model-00003-of-00004.safetensors",
186
+ "model.language_model.layers.23.layer.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.language_model.layers.23.layer.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.language_model.layers.23.layer.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.language_model.layers.23.layer.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "model.language_model.layers.23.layer.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
191
+ "model.language_model.layers.23.layer.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.language_model.layers.23.layer.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.language_model.layers.23.layer.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
194
+ "model.language_model.layers.23.layer.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
195
+ "model.language_model.layers.23.layer.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.language_model.layers.24.layer.input_layernorm.weight": "model-00003-of-00004.safetensors",
197
+ "model.language_model.layers.24.layer.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.language_model.layers.24.layer.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.language_model.layers.24.layer.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
200
+ "model.language_model.layers.24.layer.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
201
+ "model.language_model.layers.24.layer.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
202
+ "model.language_model.layers.24.layer.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.language_model.layers.24.layer.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.language_model.layers.24.layer.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
205
+ "model.language_model.layers.24.layer.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
206
+ "model.language_model.layers.24.layer.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
207
+ "model.language_model.layers.25.layer.input_layernorm.weight": "model-00003-of-00004.safetensors",
208
+ "model.language_model.layers.25.layer.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.language_model.layers.25.layer.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.language_model.layers.25.layer.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.language_model.layers.25.layer.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
212
+ "model.language_model.layers.25.layer.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
213
+ "model.language_model.layers.25.layer.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.language_model.layers.25.layer.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.language_model.layers.25.layer.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
216
+ "model.language_model.layers.25.layer.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.language_model.layers.25.layer.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
218
+ "model.language_model.layers.26.layer.input_layernorm.weight": "model-00003-of-00004.safetensors",
219
+ "model.language_model.layers.26.layer.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.language_model.layers.26.layer.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
221
+ "model.language_model.layers.26.layer.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
222
+ "model.language_model.layers.26.layer.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
223
+ "model.language_model.layers.26.layer.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
224
+ "model.language_model.layers.26.layer.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.language_model.layers.26.layer.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
226
+ "model.language_model.layers.26.layer.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
227
+ "model.language_model.layers.26.layer.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.language_model.layers.26.layer.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.language_model.layers.27.layer.input_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "model.language_model.layers.27.layer.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
231
+ "model.language_model.layers.27.layer.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.language_model.layers.27.layer.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.language_model.layers.27.layer.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
234
+ "model.language_model.layers.27.layer.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
235
+ "model.language_model.layers.27.layer.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
236
+ "model.language_model.layers.27.layer.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.language_model.layers.27.layer.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
238
+ "model.language_model.layers.27.layer.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.language_model.layers.27.layer.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.language_model.layers.28.layer.input_layernorm.weight": "model-00003-of-00004.safetensors",
241
+ "model.language_model.layers.28.layer.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
242
+ "model.language_model.layers.28.layer.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
243
+ "model.language_model.layers.28.layer.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.language_model.layers.28.layer.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
245
+ "model.language_model.layers.28.layer.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
246
+ "model.language_model.layers.28.layer.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.language_model.layers.28.layer.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
248
+ "model.language_model.layers.28.layer.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
249
+ "model.language_model.layers.28.layer.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
250
+ "model.language_model.layers.28.layer.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.language_model.layers.29.layer.input_layernorm.weight": "model-00003-of-00004.safetensors",
252
+ "model.language_model.layers.29.layer.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.language_model.layers.29.layer.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
254
+ "model.language_model.layers.29.layer.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
255
+ "model.language_model.layers.29.layer.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
256
+ "model.language_model.layers.29.layer.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
257
+ "model.language_model.layers.29.layer.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
258
+ "model.language_model.layers.29.layer.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
259
+ "model.language_model.layers.29.layer.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
260
+ "model.language_model.layers.29.layer.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
261
+ "model.language_model.layers.29.layer.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
262
+ "model.language_model.layers.3.layer.input_layernorm.weight": "model-00001-of-00004.safetensors",
263
+ "model.language_model.layers.3.layer.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.language_model.layers.3.layer.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
265
+ "model.language_model.layers.3.layer.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
266
+ "model.language_model.layers.3.layer.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
267
+ "model.language_model.layers.3.layer.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
268
+ "model.language_model.layers.3.layer.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.language_model.layers.3.layer.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
270
+ "model.language_model.layers.3.layer.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
271
+ "model.language_model.layers.3.layer.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
272
+ "model.language_model.layers.3.layer.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
273
+ "model.language_model.layers.30.layer.input_layernorm.weight": "model-00003-of-00004.safetensors",
274
+ "model.language_model.layers.30.layer.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
275
+ "model.language_model.layers.30.layer.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
276
+ "model.language_model.layers.30.layer.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
277
+ "model.language_model.layers.30.layer.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
278
+ "model.language_model.layers.30.layer.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
279
+ "model.language_model.layers.30.layer.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
280
+ "model.language_model.layers.30.layer.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
281
+ "model.language_model.layers.30.layer.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
282
+ "model.language_model.layers.30.layer.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
283
+ "model.language_model.layers.30.layer.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
284
+ "model.language_model.layers.31.layer.input_layernorm.weight": "model-00003-of-00004.safetensors",
285
+ "model.language_model.layers.31.layer.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
286
+ "model.language_model.layers.31.layer.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
287
+ "model.language_model.layers.31.layer.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
288
+ "model.language_model.layers.31.layer.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
289
+ "model.language_model.layers.31.layer.self_attn.k_norm.weight": "model-00003-of-00004.safetensors",
290
+ "model.language_model.layers.31.layer.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
291
+ "model.language_model.layers.31.layer.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
292
+ "model.language_model.layers.31.layer.self_attn.q_norm.weight": "model-00003-of-00004.safetensors",
293
+ "model.language_model.layers.31.layer.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
294
+ "model.language_model.layers.31.layer.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
295
+ "model.language_model.layers.32.layer.input_layernorm.weight": "model-00004-of-00004.safetensors",
296
+ "model.language_model.layers.32.layer.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
297
+ "model.language_model.layers.32.layer.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
298
+ "model.language_model.layers.32.layer.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
299
+ "model.language_model.layers.32.layer.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
300
+ "model.language_model.layers.32.layer.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
301
+ "model.language_model.layers.32.layer.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
302
+ "model.language_model.layers.32.layer.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
303
+ "model.language_model.layers.32.layer.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
304
+ "model.language_model.layers.32.layer.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
305
+ "model.language_model.layers.32.layer.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
306
+ "model.language_model.layers.33.layer.input_layernorm.weight": "model-00004-of-00004.safetensors",
307
+ "model.language_model.layers.33.layer.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
308
+ "model.language_model.layers.33.layer.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
309
+ "model.language_model.layers.33.layer.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
310
+ "model.language_model.layers.33.layer.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
311
+ "model.language_model.layers.33.layer.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
312
+ "model.language_model.layers.33.layer.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
313
+ "model.language_model.layers.33.layer.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
314
+ "model.language_model.layers.33.layer.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
315
+ "model.language_model.layers.33.layer.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
316
+ "model.language_model.layers.33.layer.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
317
+ "model.language_model.layers.34.layer.input_layernorm.weight": "model-00004-of-00004.safetensors",
318
+ "model.language_model.layers.34.layer.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
319
+ "model.language_model.layers.34.layer.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
320
+ "model.language_model.layers.34.layer.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
321
+ "model.language_model.layers.34.layer.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
322
+ "model.language_model.layers.34.layer.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
323
+ "model.language_model.layers.34.layer.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
324
+ "model.language_model.layers.34.layer.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
325
+ "model.language_model.layers.34.layer.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
326
+ "model.language_model.layers.34.layer.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
327
+ "model.language_model.layers.34.layer.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
328
+ "model.language_model.layers.35.layer.input_layernorm.weight": "model-00004-of-00004.safetensors",
329
+ "model.language_model.layers.35.layer.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
330
+ "model.language_model.layers.35.layer.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
331
+ "model.language_model.layers.35.layer.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
332
+ "model.language_model.layers.35.layer.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
333
+ "model.language_model.layers.35.layer.self_attn.k_norm.weight": "model-00004-of-00004.safetensors",
334
+ "model.language_model.layers.35.layer.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
335
+ "model.language_model.layers.35.layer.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
336
+ "model.language_model.layers.35.layer.self_attn.q_norm.weight": "model-00004-of-00004.safetensors",
337
+ "model.language_model.layers.35.layer.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
338
+ "model.language_model.layers.35.layer.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
339
+ "model.language_model.layers.4.layer.input_layernorm.weight": "model-00001-of-00004.safetensors",
340
+ "model.language_model.layers.4.layer.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
341
+ "model.language_model.layers.4.layer.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
342
+ "model.language_model.layers.4.layer.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
343
+ "model.language_model.layers.4.layer.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
344
+ "model.language_model.layers.4.layer.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
345
+ "model.language_model.layers.4.layer.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
346
+ "model.language_model.layers.4.layer.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
347
+ "model.language_model.layers.4.layer.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
348
+ "model.language_model.layers.4.layer.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
349
+ "model.language_model.layers.4.layer.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
350
+ "model.language_model.layers.5.layer.input_layernorm.weight": "model-00001-of-00004.safetensors",
351
+ "model.language_model.layers.5.layer.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
352
+ "model.language_model.layers.5.layer.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
353
+ "model.language_model.layers.5.layer.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
354
+ "model.language_model.layers.5.layer.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
355
+ "model.language_model.layers.5.layer.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
356
+ "model.language_model.layers.5.layer.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
357
+ "model.language_model.layers.5.layer.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
358
+ "model.language_model.layers.5.layer.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
359
+ "model.language_model.layers.5.layer.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
360
+ "model.language_model.layers.5.layer.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
361
+ "model.language_model.layers.6.layer.input_layernorm.weight": "model-00002-of-00004.safetensors",
362
+ "model.language_model.layers.6.layer.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
363
+ "model.language_model.layers.6.layer.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
364
+ "model.language_model.layers.6.layer.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
365
+ "model.language_model.layers.6.layer.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
366
+ "model.language_model.layers.6.layer.self_attn.k_norm.weight": "model-00001-of-00004.safetensors",
367
+ "model.language_model.layers.6.layer.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
368
+ "model.language_model.layers.6.layer.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
369
+ "model.language_model.layers.6.layer.self_attn.q_norm.weight": "model-00001-of-00004.safetensors",
370
+ "model.language_model.layers.6.layer.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
371
+ "model.language_model.layers.6.layer.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
372
+ "model.language_model.layers.7.layer.input_layernorm.weight": "model-00002-of-00004.safetensors",
373
+ "model.language_model.layers.7.layer.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
374
+ "model.language_model.layers.7.layer.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
375
+ "model.language_model.layers.7.layer.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
376
+ "model.language_model.layers.7.layer.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
377
+ "model.language_model.layers.7.layer.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
378
+ "model.language_model.layers.7.layer.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
379
+ "model.language_model.layers.7.layer.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
380
+ "model.language_model.layers.7.layer.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
381
+ "model.language_model.layers.7.layer.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
382
+ "model.language_model.layers.7.layer.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
383
+ "model.language_model.layers.8.layer.input_layernorm.weight": "model-00002-of-00004.safetensors",
384
+ "model.language_model.layers.8.layer.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
385
+ "model.language_model.layers.8.layer.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
386
+ "model.language_model.layers.8.layer.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
387
+ "model.language_model.layers.8.layer.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
388
+ "model.language_model.layers.8.layer.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
389
+ "model.language_model.layers.8.layer.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
390
+ "model.language_model.layers.8.layer.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
391
+ "model.language_model.layers.8.layer.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
392
+ "model.language_model.layers.8.layer.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
393
+ "model.language_model.layers.8.layer.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
394
+ "model.language_model.layers.9.layer.input_layernorm.weight": "model-00002-of-00004.safetensors",
395
+ "model.language_model.layers.9.layer.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
396
+ "model.language_model.layers.9.layer.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
397
+ "model.language_model.layers.9.layer.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
398
+ "model.language_model.layers.9.layer.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
399
+ "model.language_model.layers.9.layer.self_attn.k_norm.weight": "model-00002-of-00004.safetensors",
400
+ "model.language_model.layers.9.layer.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
401
+ "model.language_model.layers.9.layer.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
402
+ "model.language_model.layers.9.layer.self_attn.q_norm.weight": "model-00002-of-00004.safetensors",
403
+ "model.language_model.layers.9.layer.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
404
+ "model.language_model.layers.9.layer.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
405
+ "model.language_model.norm.weight": "model-00004-of-00004.safetensors",
406
+ "model.visual.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
407
+ "model.visual.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
408
+ "model.visual.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
409
+ "model.visual.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
410
+ "model.visual.blocks.0.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
411
+ "model.visual.blocks.0.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
412
+ "model.visual.blocks.0.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
413
+ "model.visual.blocks.0.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
414
+ "model.visual.blocks.0.norm1.bias": "model-00001-of-00004.safetensors",
415
+ "model.visual.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
416
+ "model.visual.blocks.0.norm2.bias": "model-00001-of-00004.safetensors",
417
+ "model.visual.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
418
+ "model.visual.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
419
+ "model.visual.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
420
+ "model.visual.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
421
+ "model.visual.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
422
+ "model.visual.blocks.1.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
423
+ "model.visual.blocks.1.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
424
+ "model.visual.blocks.1.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
425
+ "model.visual.blocks.1.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
426
+ "model.visual.blocks.1.norm1.bias": "model-00001-of-00004.safetensors",
427
+ "model.visual.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
428
+ "model.visual.blocks.1.norm2.bias": "model-00001-of-00004.safetensors",
429
+ "model.visual.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
430
+ "model.visual.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
431
+ "model.visual.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
432
+ "model.visual.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
433
+ "model.visual.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
434
+ "model.visual.blocks.10.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
435
+ "model.visual.blocks.10.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
436
+ "model.visual.blocks.10.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
437
+ "model.visual.blocks.10.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
438
+ "model.visual.blocks.10.norm1.bias": "model-00001-of-00004.safetensors",
439
+ "model.visual.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
440
+ "model.visual.blocks.10.norm2.bias": "model-00001-of-00004.safetensors",
441
+ "model.visual.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
442
+ "model.visual.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
443
+ "model.visual.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
444
+ "model.visual.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
445
+ "model.visual.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
446
+ "model.visual.blocks.11.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
447
+ "model.visual.blocks.11.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
448
+ "model.visual.blocks.11.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
449
+ "model.visual.blocks.11.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
450
+ "model.visual.blocks.11.norm1.bias": "model-00001-of-00004.safetensors",
451
+ "model.visual.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
452
+ "model.visual.blocks.11.norm2.bias": "model-00001-of-00004.safetensors",
453
+ "model.visual.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
454
+ "model.visual.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
455
+ "model.visual.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
456
+ "model.visual.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
457
+ "model.visual.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
458
+ "model.visual.blocks.12.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
459
+ "model.visual.blocks.12.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
460
+ "model.visual.blocks.12.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
461
+ "model.visual.blocks.12.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
462
+ "model.visual.blocks.12.norm1.bias": "model-00001-of-00004.safetensors",
463
+ "model.visual.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
464
+ "model.visual.blocks.12.norm2.bias": "model-00001-of-00004.safetensors",
465
+ "model.visual.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
466
+ "model.visual.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
467
+ "model.visual.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
468
+ "model.visual.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
469
+ "model.visual.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
470
+ "model.visual.blocks.13.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
471
+ "model.visual.blocks.13.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
472
+ "model.visual.blocks.13.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
473
+ "model.visual.blocks.13.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
474
+ "model.visual.blocks.13.norm1.bias": "model-00001-of-00004.safetensors",
475
+ "model.visual.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
476
+ "model.visual.blocks.13.norm2.bias": "model-00001-of-00004.safetensors",
477
+ "model.visual.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
478
+ "model.visual.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
479
+ "model.visual.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
480
+ "model.visual.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
481
+ "model.visual.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
482
+ "model.visual.blocks.14.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
483
+ "model.visual.blocks.14.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
484
+ "model.visual.blocks.14.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
485
+ "model.visual.blocks.14.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
486
+ "model.visual.blocks.14.norm1.bias": "model-00001-of-00004.safetensors",
487
+ "model.visual.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
488
+ "model.visual.blocks.14.norm2.bias": "model-00001-of-00004.safetensors",
489
+ "model.visual.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
490
+ "model.visual.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
491
+ "model.visual.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
492
+ "model.visual.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
493
+ "model.visual.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
494
+ "model.visual.blocks.15.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
495
+ "model.visual.blocks.15.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
496
+ "model.visual.blocks.15.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
497
+ "model.visual.blocks.15.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
498
+ "model.visual.blocks.15.norm1.bias": "model-00001-of-00004.safetensors",
499
+ "model.visual.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
500
+ "model.visual.blocks.15.norm2.bias": "model-00001-of-00004.safetensors",
501
+ "model.visual.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
502
+ "model.visual.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
503
+ "model.visual.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
504
+ "model.visual.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
505
+ "model.visual.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
506
+ "model.visual.blocks.16.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
507
+ "model.visual.blocks.16.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
508
+ "model.visual.blocks.16.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
509
+ "model.visual.blocks.16.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
510
+ "model.visual.blocks.16.norm1.bias": "model-00001-of-00004.safetensors",
511
+ "model.visual.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
512
+ "model.visual.blocks.16.norm2.bias": "model-00001-of-00004.safetensors",
513
+ "model.visual.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
514
+ "model.visual.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
515
+ "model.visual.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
516
+ "model.visual.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
517
+ "model.visual.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
518
+ "model.visual.blocks.17.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
519
+ "model.visual.blocks.17.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
520
+ "model.visual.blocks.17.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
521
+ "model.visual.blocks.17.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
522
+ "model.visual.blocks.17.norm1.bias": "model-00001-of-00004.safetensors",
523
+ "model.visual.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
524
+ "model.visual.blocks.17.norm2.bias": "model-00001-of-00004.safetensors",
525
+ "model.visual.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
526
+ "model.visual.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
527
+ "model.visual.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
528
+ "model.visual.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
529
+ "model.visual.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
530
+ "model.visual.blocks.18.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
531
+ "model.visual.blocks.18.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
532
+ "model.visual.blocks.18.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
533
+ "model.visual.blocks.18.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
534
+ "model.visual.blocks.18.norm1.bias": "model-00001-of-00004.safetensors",
535
+ "model.visual.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
536
+ "model.visual.blocks.18.norm2.bias": "model-00001-of-00004.safetensors",
537
+ "model.visual.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
538
+ "model.visual.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
539
+ "model.visual.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
540
+ "model.visual.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
541
+ "model.visual.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
542
+ "model.visual.blocks.19.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
543
+ "model.visual.blocks.19.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
544
+ "model.visual.blocks.19.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
545
+ "model.visual.blocks.19.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
546
+ "model.visual.blocks.19.norm1.bias": "model-00001-of-00004.safetensors",
547
+ "model.visual.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
548
+ "model.visual.blocks.19.norm2.bias": "model-00001-of-00004.safetensors",
549
+ "model.visual.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
550
+ "model.visual.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
551
+ "model.visual.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
552
+ "model.visual.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
553
+ "model.visual.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
554
+ "model.visual.blocks.2.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
555
+ "model.visual.blocks.2.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
556
+ "model.visual.blocks.2.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
557
+ "model.visual.blocks.2.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
558
+ "model.visual.blocks.2.norm1.bias": "model-00001-of-00004.safetensors",
559
+ "model.visual.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
560
+ "model.visual.blocks.2.norm2.bias": "model-00001-of-00004.safetensors",
561
+ "model.visual.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
562
+ "model.visual.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
563
+ "model.visual.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
564
+ "model.visual.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
565
+ "model.visual.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
566
+ "model.visual.blocks.20.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
567
+ "model.visual.blocks.20.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
568
+ "model.visual.blocks.20.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
569
+ "model.visual.blocks.20.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
570
+ "model.visual.blocks.20.norm1.bias": "model-00001-of-00004.safetensors",
571
+ "model.visual.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
572
+ "model.visual.blocks.20.norm2.bias": "model-00001-of-00004.safetensors",
573
+ "model.visual.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
574
+ "model.visual.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
575
+ "model.visual.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
576
+ "model.visual.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
577
+ "model.visual.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
578
+ "model.visual.blocks.21.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
579
+ "model.visual.blocks.21.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
580
+ "model.visual.blocks.21.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
581
+ "model.visual.blocks.21.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
582
+ "model.visual.blocks.21.norm1.bias": "model-00001-of-00004.safetensors",
583
+ "model.visual.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
584
+ "model.visual.blocks.21.norm2.bias": "model-00001-of-00004.safetensors",
585
+ "model.visual.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
586
+ "model.visual.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
587
+ "model.visual.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
588
+ "model.visual.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
589
+ "model.visual.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
590
+ "model.visual.blocks.22.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
591
+ "model.visual.blocks.22.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
592
+ "model.visual.blocks.22.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
593
+ "model.visual.blocks.22.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
594
+ "model.visual.blocks.22.norm1.bias": "model-00001-of-00004.safetensors",
595
+ "model.visual.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
596
+ "model.visual.blocks.22.norm2.bias": "model-00001-of-00004.safetensors",
597
+ "model.visual.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
598
+ "model.visual.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors",
599
+ "model.visual.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors",
600
+ "model.visual.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
601
+ "model.visual.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
602
+ "model.visual.blocks.23.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
603
+ "model.visual.blocks.23.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
604
+ "model.visual.blocks.23.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
605
+ "model.visual.blocks.23.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
606
+ "model.visual.blocks.23.norm1.bias": "model-00001-of-00004.safetensors",
607
+ "model.visual.blocks.23.norm1.weight": "model-00001-of-00004.safetensors",
608
+ "model.visual.blocks.23.norm2.bias": "model-00001-of-00004.safetensors",
609
+ "model.visual.blocks.23.norm2.weight": "model-00001-of-00004.safetensors",
610
+ "model.visual.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
611
+ "model.visual.blocks.24.attn.proj.weight": "model-00001-of-00004.safetensors",
612
+ "model.visual.blocks.24.attn.qkv.bias": "model-00001-of-00004.safetensors",
613
+ "model.visual.blocks.24.attn.qkv.weight": "model-00001-of-00004.safetensors",
614
+ "model.visual.blocks.24.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
615
+ "model.visual.blocks.24.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
616
+ "model.visual.blocks.24.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
617
+ "model.visual.blocks.24.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
618
+ "model.visual.blocks.24.norm1.bias": "model-00001-of-00004.safetensors",
619
+ "model.visual.blocks.24.norm1.weight": "model-00001-of-00004.safetensors",
620
+ "model.visual.blocks.24.norm2.bias": "model-00001-of-00004.safetensors",
621
+ "model.visual.blocks.24.norm2.weight": "model-00001-of-00004.safetensors",
622
+ "model.visual.blocks.25.attn.proj.bias": "model-00001-of-00004.safetensors",
623
+ "model.visual.blocks.25.attn.proj.weight": "model-00001-of-00004.safetensors",
624
+ "model.visual.blocks.25.attn.qkv.bias": "model-00001-of-00004.safetensors",
625
+ "model.visual.blocks.25.attn.qkv.weight": "model-00001-of-00004.safetensors",
626
+ "model.visual.blocks.25.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
627
+ "model.visual.blocks.25.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
628
+ "model.visual.blocks.25.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
629
+ "model.visual.blocks.25.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
630
+ "model.visual.blocks.25.norm1.bias": "model-00001-of-00004.safetensors",
631
+ "model.visual.blocks.25.norm1.weight": "model-00001-of-00004.safetensors",
632
+ "model.visual.blocks.25.norm2.bias": "model-00001-of-00004.safetensors",
633
+ "model.visual.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
634
+ "model.visual.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
635
+ "model.visual.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
636
+ "model.visual.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
637
+ "model.visual.blocks.26.attn.qkv.weight": "model-00001-of-00004.safetensors",
638
+ "model.visual.blocks.26.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
639
+ "model.visual.blocks.26.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
640
+ "model.visual.blocks.26.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
641
+ "model.visual.blocks.26.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
642
+ "model.visual.blocks.26.norm1.bias": "model-00001-of-00004.safetensors",
643
+ "model.visual.blocks.26.norm1.weight": "model-00001-of-00004.safetensors",
644
+ "model.visual.blocks.26.norm2.bias": "model-00001-of-00004.safetensors",
645
+ "model.visual.blocks.26.norm2.weight": "model-00001-of-00004.safetensors",
646
+ "model.visual.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
647
+ "model.visual.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
648
+ "model.visual.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
649
+ "model.visual.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
650
+ "model.visual.blocks.3.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
651
+ "model.visual.blocks.3.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
652
+ "model.visual.blocks.3.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
653
+ "model.visual.blocks.3.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
654
+ "model.visual.blocks.3.norm1.bias": "model-00001-of-00004.safetensors",
655
+ "model.visual.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
656
+ "model.visual.blocks.3.norm2.bias": "model-00001-of-00004.safetensors",
657
+ "model.visual.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
658
+ "model.visual.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
659
+ "model.visual.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
660
+ "model.visual.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
661
+ "model.visual.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
662
+ "model.visual.blocks.4.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
663
+ "model.visual.blocks.4.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
664
+ "model.visual.blocks.4.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
665
+ "model.visual.blocks.4.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
666
+ "model.visual.blocks.4.norm1.bias": "model-00001-of-00004.safetensors",
667
+ "model.visual.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
668
+ "model.visual.blocks.4.norm2.bias": "model-00001-of-00004.safetensors",
669
+ "model.visual.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
670
+ "model.visual.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
671
+ "model.visual.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
672
+ "model.visual.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
673
+ "model.visual.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
674
+ "model.visual.blocks.5.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
675
+ "model.visual.blocks.5.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
676
+ "model.visual.blocks.5.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
677
+ "model.visual.blocks.5.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
678
+ "model.visual.blocks.5.norm1.bias": "model-00001-of-00004.safetensors",
679
+ "model.visual.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
680
+ "model.visual.blocks.5.norm2.bias": "model-00001-of-00004.safetensors",
681
+ "model.visual.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
682
+ "model.visual.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
683
+ "model.visual.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
684
+ "model.visual.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
685
+ "model.visual.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
686
+ "model.visual.blocks.6.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
687
+ "model.visual.blocks.6.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
688
+ "model.visual.blocks.6.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
689
+ "model.visual.blocks.6.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
690
+ "model.visual.blocks.6.norm1.bias": "model-00001-of-00004.safetensors",
691
+ "model.visual.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
692
+ "model.visual.blocks.6.norm2.bias": "model-00001-of-00004.safetensors",
693
+ "model.visual.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
694
+ "model.visual.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
695
+ "model.visual.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
696
+ "model.visual.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
697
+ "model.visual.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
698
+ "model.visual.blocks.7.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
699
+ "model.visual.blocks.7.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
700
+ "model.visual.blocks.7.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
701
+ "model.visual.blocks.7.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
702
+ "model.visual.blocks.7.norm1.bias": "model-00001-of-00004.safetensors",
703
+ "model.visual.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
704
+ "model.visual.blocks.7.norm2.bias": "model-00001-of-00004.safetensors",
705
+ "model.visual.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
706
+ "model.visual.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
707
+ "model.visual.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
708
+ "model.visual.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
709
+ "model.visual.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
710
+ "model.visual.blocks.8.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
711
+ "model.visual.blocks.8.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
712
+ "model.visual.blocks.8.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
713
+ "model.visual.blocks.8.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
714
+ "model.visual.blocks.8.norm1.bias": "model-00001-of-00004.safetensors",
715
+ "model.visual.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
716
+ "model.visual.blocks.8.norm2.bias": "model-00001-of-00004.safetensors",
717
+ "model.visual.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
718
+ "model.visual.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
719
+ "model.visual.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
720
+ "model.visual.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
721
+ "model.visual.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
722
+ "model.visual.blocks.9.mlp.linear_fc1.bias": "model-00001-of-00004.safetensors",
723
+ "model.visual.blocks.9.mlp.linear_fc1.weight": "model-00001-of-00004.safetensors",
724
+ "model.visual.blocks.9.mlp.linear_fc2.bias": "model-00001-of-00004.safetensors",
725
+ "model.visual.blocks.9.mlp.linear_fc2.weight": "model-00001-of-00004.safetensors",
726
+ "model.visual.blocks.9.norm1.bias": "model-00001-of-00004.safetensors",
727
+ "model.visual.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
728
+ "model.visual.blocks.9.norm2.bias": "model-00001-of-00004.safetensors",
729
+ "model.visual.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
730
+ "model.visual.deepstack_merger_list.0.linear_fc1.bias": "model-00001-of-00004.safetensors",
731
+ "model.visual.deepstack_merger_list.0.linear_fc1.weight": "model-00001-of-00004.safetensors",
732
+ "model.visual.deepstack_merger_list.0.linear_fc2.bias": "model-00001-of-00004.safetensors",
733
+ "model.visual.deepstack_merger_list.0.linear_fc2.weight": "model-00001-of-00004.safetensors",
734
+ "model.visual.deepstack_merger_list.0.norm.bias": "model-00001-of-00004.safetensors",
735
+ "model.visual.deepstack_merger_list.0.norm.weight": "model-00001-of-00004.safetensors",
736
+ "model.visual.deepstack_merger_list.1.linear_fc1.bias": "model-00001-of-00004.safetensors",
737
+ "model.visual.deepstack_merger_list.1.linear_fc1.weight": "model-00001-of-00004.safetensors",
738
+ "model.visual.deepstack_merger_list.1.linear_fc2.bias": "model-00001-of-00004.safetensors",
739
+ "model.visual.deepstack_merger_list.1.linear_fc2.weight": "model-00001-of-00004.safetensors",
740
+ "model.visual.deepstack_merger_list.1.norm.bias": "model-00001-of-00004.safetensors",
741
+ "model.visual.deepstack_merger_list.1.norm.weight": "model-00001-of-00004.safetensors",
742
+ "model.visual.deepstack_merger_list.2.linear_fc1.bias": "model-00001-of-00004.safetensors",
743
+ "model.visual.deepstack_merger_list.2.linear_fc1.weight": "model-00001-of-00004.safetensors",
744
+ "model.visual.deepstack_merger_list.2.linear_fc2.bias": "model-00001-of-00004.safetensors",
745
+ "model.visual.deepstack_merger_list.2.linear_fc2.weight": "model-00001-of-00004.safetensors",
746
+ "model.visual.deepstack_merger_list.2.norm.bias": "model-00001-of-00004.safetensors",
747
+ "model.visual.deepstack_merger_list.2.norm.weight": "model-00001-of-00004.safetensors",
748
+ "model.visual.merger.linear_fc1.bias": "model-00001-of-00004.safetensors",
749
+ "model.visual.merger.linear_fc1.weight": "model-00001-of-00004.safetensors",
750
+ "model.visual.merger.linear_fc2.bias": "model-00001-of-00004.safetensors",
751
+ "model.visual.merger.linear_fc2.weight": "model-00001-of-00004.safetensors",
752
+ "model.visual.merger.norm.bias": "model-00001-of-00004.safetensors",
753
+ "model.visual.merger.norm.weight": "model-00001-of-00004.safetensors",
754
+ "model.visual.patch_embed.proj.bias": "model-00001-of-00004.safetensors",
755
+ "model.visual.patch_embed.proj.weight": "model-00001-of-00004.safetensors",
756
+ "model.visual.pos_embed.weight": "model-00001-of-00004.safetensors"
757
+ }
758
+ }
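The index above maps every tensor name in the checkpoint to one of the four safetensors shards. As a minimal illustration (not part of the uploaded files), the mapping can be used to locate and lazily read a single tensor; the local directory path is a placeholder, and only the standard json / safetensors APIs are assumed:

    import json
    from safetensors import safe_open

    repo_dir = "path/to/local/checkpoint"  # placeholder: wherever this repository was downloaded

    # The index holds a "weight_map" from tensor name to shard file name.
    with open(f"{repo_dir}/model.safetensors.index.json") as f:
        index = json.load(f)

    name = "model.language_model.layers.26.layer.self_attn.q_proj.weight"
    shard = index["weight_map"][name]  # "model-00003-of-00004.safetensors" per the map above

    # Open only that shard and read the single tensor without loading the rest.
    with safe_open(f"{repo_dir}/{shard}", framework="pt") as f:
        tensor = f.get_tensor(name)
    print(shard, tuple(tensor.shape))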
modeling_contextvla.py ADDED
@@ -0,0 +1,58 @@
1
+ import os
2
+
3
+ from torch import nn
4
+ import torch
5
+
6
+ from huggingface_hub import snapshot_download
7
+ from transformers.trainer_utils import load_sharded_checkpoint
8
+ from transformers import AutoConfig, AutoProcessor
9
+
10
+ from qwenvl.model.modeling_qwen3_vl import Qwen3VLForConditionalGeneration
11
+ from qwenvl.model.contextvla import LayerWrapper
12
+
13
+
14
+ ACTION_START_TOKEN = "<|action_start|>"
15
+ ACTION_END_TOKEN = "<|action_end|>"
16
+ ACTION_PLACEHOLDER_TOKEN = "<|action_placeholder|>"
17
+
18
+
19
+ def add_action_to_processor(processor):
20
+ custom_tokens = [ACTION_START_TOKEN, ACTION_END_TOKEN, ACTION_PLACEHOLDER_TOKEN]
21
+ for i in range(2048):
22
+ custom_tokens.append(f"<|action_{i}|>")
23
+
24
+ num_added = processor.tokenizer.add_tokens(custom_tokens, special_tokens=True)
25
+ print(f"Added {num_added} custom tokens")
26
+
27
+ return processor
28
+
29
+
30
+ class ContextVLA_Qwen3VL(Qwen3VLForConditionalGeneration):
31
+ @classmethod
32
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
33
+ base_config = AutoConfig.from_pretrained("Qwen/Qwen3-VL-8B-Instruct")
34
+ model = Qwen3VLForConditionalGeneration._from_config(base_config, **kwargs)
35
+ for layer_idx in range(len(model.model.language_model.layers)):
36
+ model.model.language_model.layers[layer_idx] = LayerWrapper(
37
+ model.model.language_model.layers[layer_idx],
38
+ layer_idx=layer_idx,
39
+ internal_projection=4,
40
+ img_pattern=[151652],
41
+ motion_token=1
42
+ )
43
+
44
+ processor = AutoProcessor.from_pretrained(
45
+ "Qwen/Qwen3-VL-8B-Instruct",
46
+ )
47
+ processor = add_action_to_processor(processor)
48
+ model.resize_token_embeddings(len(processor.tokenizer))
49
+
50
+ if os.path.isdir(pretrained_model_name_or_path):
51
+ local_dir = pretrained_model_name_or_path
52
+ else:
53
+ local_dir = snapshot_download(pretrained_model_name_or_path)
54
+
55
+ load_sharded_checkpoint(model, local_dir)
56
+ print(f"[ContextVLA] weights loaded from {local_dir}")
57
+
58
+ return model
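A brief usage sketch for the class defined above (assumptions: this file is importable from the downloaded repository, the checkpoint location is a placeholder, and the `qwenvl` package is on the path as the imports in the file require):

    from transformers import AutoProcessor

    from modeling_contextvla import ContextVLA_Qwen3VL, add_action_to_processor

    checkpoint = "path/or/hub-id/of/this/checkpoint"  # placeholder for the actual location

    # from_pretrained builds the Qwen3-VL-8B backbone, wraps each decoder layer in
    # LayerWrapper, adds the action tokens, resizes the embeddings, and then loads
    # the sharded weights listed in model.safetensors.index.json.
    model = ContextVLA_Qwen3VL.from_pretrained(checkpoint)

    # The processor used at inference time must carry the same action tokens that
    # the embeddings were resized for.
    processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-8B-Instruct")
    processor = add_action_to_processor(processor)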
modeling_qwen3_vl.py ADDED
@@ -0,0 +1,1617 @@
1
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
2
+ # This file was automatically generated from src/transformers/models/qwen3_vl/modular_qwen3_vl.py.
3
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
4
+ # the file from the modular. If any change should be done, please apply the change to the
5
+ # modular_qwen3_vl.py file directly. One of our CI enforces this.
6
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
7
+ # coding=utf-8
8
+ # Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+ import os
22
+
23
+ from collections.abc import Callable
24
+ from dataclasses import dataclass
25
+ from typing import Any, Optional, Union
26
+
27
+ import torch
28
+ import torch.nn as nn
29
+ import torch.nn.functional as F
30
+
31
+ from transformers.activations import ACT2FN
32
+ from transformers.cache_utils import Cache, DynamicCache
33
+ from transformers.generation import GenerationMixin
34
+ from transformers.integrations import use_kernel_forward_from_hub
35
+ from transformers.masking_utils import create_causal_mask
36
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
37
+ from transformers.modeling_layers import GradientCheckpointingLayer
38
+ from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
39
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
40
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
41
+ from transformers.processing_utils import Unpack
42
+ from transformers.utils import TransformersKwargs, auto_docstring, is_torchdynamo_compiling
43
+ from transformers.utils.generic import check_model_inputs
44
+ from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig, Qwen3VLTextConfig, Qwen3VLVisionConfig
45
+
46
+ local_rank = int(os.getenv("LOCAL_RANK", "0"))
47
+ world_size = torch.cuda.device_count()
48
+
49
+ rank = local_rank
50
+
51
+ class Qwen3VLVisionMLP(nn.Module):
52
+ def __init__(self, config):
53
+ super().__init__()
54
+ self.hidden_size = config.hidden_size
55
+ self.intermediate_size = config.intermediate_size
56
+ self.linear_fc1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=True)
57
+ self.linear_fc2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=True)
58
+ self.act_fn = ACT2FN[config.hidden_act]
59
+
60
+ def forward(self, hidden_state):
61
+ return self.linear_fc2(self.act_fn(self.linear_fc1(hidden_state)))
62
+
63
+
64
+ class Qwen3VLVisionPatchEmbed(nn.Module):
65
+ def __init__(self, config) -> None:
66
+ super().__init__()
67
+ self.patch_size = config.patch_size
68
+ self.temporal_patch_size = config.temporal_patch_size
69
+ self.in_channels = config.in_channels
70
+ self.embed_dim = config.hidden_size
71
+
72
+ kernel_size = [self.temporal_patch_size, self.patch_size, self.patch_size]
73
+ self.proj = nn.Conv3d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=True)
74
+
75
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
76
+ target_dtype = self.proj.weight.dtype
77
+ hidden_states = hidden_states.view(
78
+ -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
79
+ )
80
+ hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
81
+ return hidden_states
82
+
83
+
84
+ class Qwen3VLVisionRotaryEmbedding(nn.Module):
85
+ inv_freq: torch.Tensor # fix linting for `register_buffer`
86
+
87
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
88
+ super().__init__()
89
+ inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
90
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
91
+
92
+ def forward(self, seqlen: int) -> torch.Tensor:
93
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
94
+ freqs = torch.outer(seq, self.inv_freq)
95
+ return freqs
96
+
97
+
98
+ class Qwen3VLVisionPatchMerger(nn.Module):
99
+ def __init__(self, config: Qwen3VLVisionConfig, use_postshuffle_norm=False) -> None:
100
+ super().__init__()
101
+ self.hidden_size = config.hidden_size * (config.spatial_merge_size**2)
102
+ self.use_postshuffle_norm = use_postshuffle_norm
103
+ self.norm = nn.LayerNorm(self.hidden_size if use_postshuffle_norm else config.hidden_size, eps=1e-6)
104
+ self.linear_fc1 = nn.Linear(self.hidden_size, self.hidden_size)
105
+ self.act_fn = nn.GELU()
106
+ self.linear_fc2 = nn.Linear(self.hidden_size, config.out_hidden_size)
107
+
108
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
109
+ x = self.norm(x.view(-1, self.hidden_size) if self.use_postshuffle_norm else x).view(-1, self.hidden_size)
110
+ x = self.linear_fc2(self.act_fn(self.linear_fc1(x)))
111
+ return x
112
+
113
+
114
+ def rotate_half(x):
115
+ """Rotates half the hidden dims of the input."""
116
+ x1 = x[..., : x.shape[-1] // 2]
117
+ x2 = x[..., x.shape[-1] // 2 :]
118
+ return torch.cat((-x2, x1), dim=-1)
119
+
120
+
121
+ def apply_rotary_pos_emb_vision(
122
+ q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
123
+ ) -> tuple[torch.Tensor, torch.Tensor]:
124
+ orig_q_dtype = q.dtype
125
+ orig_k_dtype = k.dtype
126
+ q, k = q.float(), k.float()
127
+ cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float()
128
+ q_embed = (q * cos) + (rotate_half(q) * sin)
129
+ k_embed = (k * cos) + (rotate_half(k) * sin)
130
+ q_embed = q_embed.to(orig_q_dtype)
131
+ k_embed = k_embed.to(orig_k_dtype)
132
+ return q_embed, k_embed
133
+
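+ # Rotary sketch: the vision model builds `cos`/`sin` from cat((freqs, freqs), dim=-1), so the
+ # pair (q[..., i], q[..., i + d/2]) is rotated by the same angle theta_i:
+ # q'[..., i] = q[..., i] * cos(theta_i) - q[..., i + d/2] * sin(theta_i)
+ # q'[..., i + d/2] = q[..., i + d/2] * cos(theta_i) + q[..., i] * sin(theta_i)
+ # i.e. a plain 2-D rotation per frequency; the helpers above only assume an even head dim d.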
134
+
135
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
136
+ """
137
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
138
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
139
+ """
140
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
141
+ if n_rep == 1:
142
+ return hidden_states
143
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
144
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
145
+
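+ # Shape sketch for `repeat_kv` (illustrative values): with key/value states of shape
+ # (batch=2, num_key_value_heads=4, seqlen=7, head_dim=64) and n_rep=3, the output has shape
+ # (2, 12, 7, 64) -- each KV head is duplicated n_rep times so grouped-query attention can be
+ # computed against all 12 query heads with a plain matmul.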
146
+
147
+ def eager_attention_forward(
148
+ module: nn.Module,
149
+ query: torch.Tensor,
150
+ key: torch.Tensor,
151
+ value: torch.Tensor,
152
+ attention_mask: Optional[torch.Tensor],
153
+ scaling: float,
154
+ dropout: float = 0.0,
155
+ **kwargs: Unpack[TransformersKwargs],
156
+ ):
157
+ key_states = repeat_kv(key, module.num_key_value_groups)
158
+ value_states = repeat_kv(value, module.num_key_value_groups)
159
+
160
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
161
+ if attention_mask is not None:
162
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
163
+ attn_weights = attn_weights + causal_mask
164
+
165
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
166
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
167
+ attn_output = torch.matmul(attn_weights, value_states)
168
+ attn_output = attn_output.transpose(1, 2).contiguous()
169
+
170
+ return attn_output, attn_weights
171
+
172
+
173
+ class Qwen3VLVisionAttention(nn.Module):
174
+ def __init__(self, config: Qwen3VLVisionConfig) -> None:
175
+ super().__init__()
176
+ self.dim = config.hidden_size
177
+ self.num_heads = config.num_heads
178
+ self.head_dim = self.dim // self.num_heads
179
+ self.num_key_value_groups = 1 # needed for eager attention
180
+ self.qkv = nn.Linear(self.dim, self.dim * 3, bias=True)
181
+ self.proj = nn.Linear(self.dim, self.dim)
182
+ self.scaling = self.head_dim**-0.5
183
+ self.config = config
184
+ self.attention_dropout = 0.0
185
+ self.is_causal = False
186
+
187
+ def forward(
188
+ self,
189
+ hidden_states: torch.Tensor,
190
+ cu_seqlens: torch.Tensor,
191
+ rotary_pos_emb: Optional[torch.Tensor] = None,
192
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
193
+ **kwargs,
194
+ ) -> torch.Tensor:
195
+ seq_length = hidden_states.shape[0]
196
+ query_states, key_states, value_states = (
197
+ self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
198
+ )
199
+ cos, sin = position_embeddings
200
+ query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)
201
+
202
+ query_states = query_states.transpose(0, 1).unsqueeze(0)
203
+ key_states = key_states.transpose(0, 1).unsqueeze(0)
204
+ value_states = value_states.transpose(0, 1).unsqueeze(0)
205
+
206
+ attention_interface: Callable = eager_attention_forward
207
+ if self.config._attn_implementation != "eager":
208
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
209
+
210
+ if self.config._attn_implementation == "flash_attention_2":
211
+ # Flash Attention 2: Use cu_seqlens for variable length attention
212
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
213
+ attn_output, _ = attention_interface(
214
+ self,
215
+ query_states,
216
+ key_states,
217
+ value_states,
218
+ attention_mask=None,
219
+ scaling=self.scaling,
220
+ dropout=0.0 if not self.training else self.attention_dropout,
221
+ cu_seq_lens_q=cu_seqlens,
222
+ cu_seq_lens_k=cu_seqlens,
223
+ max_length_q=max_seqlen,
224
+ max_length_k=max_seqlen,
225
+ is_causal=False,
226
+ **kwargs,
227
+ )
228
+ else:
229
+ # Other implementations: Process each chunk separately
230
+ lengths = cu_seqlens[1:] - cu_seqlens[:-1]
231
+ splits = [
232
+ torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)
233
+ ]
234
+
235
+ attn_outputs = [
236
+ attention_interface(
237
+ self,
238
+ q,
239
+ k,
240
+ v,
241
+ attention_mask=None,
242
+ scaling=self.scaling,
243
+ dropout=0.0 if not self.training else self.attention_dropout,
244
+ is_causal=False,
245
+ **kwargs,
246
+ )[0]
247
+ for q, k, v in zip(*splits)
248
+ ]
249
+ attn_output = torch.cat(attn_outputs, dim=1)
250
+
251
+ attn_output = attn_output.reshape(seq_length, -1).contiguous()
252
+ attn_output = self.proj(attn_output)
253
+
254
+ return attn_output
255
+
256
+
257
+ class Qwen3VLVisionBlock(GradientCheckpointingLayer):
258
+ def __init__(self, config, attn_implementation: str = "sdpa") -> None:
259
+ super().__init__()
260
+ self.norm1 = nn.LayerNorm(config.hidden_size, eps=1e-6)
261
+ self.norm2 = nn.LayerNorm(config.hidden_size, eps=1e-6)
262
+ self.attn = Qwen3VLVisionAttention(config=config)
263
+ self.mlp = Qwen3VLVisionMLP(config=config)
264
+
265
+ def forward(
266
+ self,
267
+ hidden_states: torch.Tensor,
268
+ cu_seqlens: torch.Tensor,
269
+ rotary_pos_emb: Optional[torch.Tensor] = None,
270
+ position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
271
+ **kwargs,
272
+ ) -> torch.Tensor:
273
+ hidden_states = hidden_states + self.attn(
274
+ self.norm1(hidden_states),
275
+ cu_seqlens=cu_seqlens,
276
+ rotary_pos_emb=rotary_pos_emb,
277
+ position_embeddings=position_embeddings,
278
+ **kwargs,
279
+ )
280
+ hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
281
+ return hidden_states
282
+
283
+
284
+ class Qwen3VLTextRotaryEmbedding(nn.Module):
285
+ inv_freq: torch.Tensor # fix linting for `register_buffer`
286
+
287
+ def __init__(self, config: Qwen3VLTextConfig, device=None):
288
+ super().__init__()
289
+ self.max_seq_len_cached = config.max_position_embeddings
290
+ self.original_max_seq_len = config.max_position_embeddings
291
+
292
+ self.config = config
293
+
294
+ self.rope_type = self.config.rope_parameters["rope_type"]
295
+ rope_init_fn: Callable = self.compute_default_rope_parameters
296
+ if self.rope_type != "default":
297
+ rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
298
+ inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
299
+
300
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
301
+ self.original_inv_freq = inv_freq
302
+
303
+ self.mrope_section = config.rope_parameters.get("mrope_section", [24, 20, 20])
304
+
305
+ @staticmethod
306
+ def compute_default_rope_parameters(
307
+ config: Optional[Qwen3VLTextConfig] = None,
308
+ device: Optional["torch.device"] = None,
309
+ seq_len: Optional[int] = None,
310
+ ) -> tuple["torch.Tensor", float]:
311
+ """
312
+ Computes the inverse frequencies according to the original RoPE implementation
313
+ Args:
314
+ config ([`~transformers.PreTrainedConfig`]):
315
+ The model configuration.
316
+ device (`torch.device`):
317
+ The device to use for initialization of the inverse frequencies.
318
+ seq_len (`int`, *optional*):
319
+ The current sequence length. Unused for this type of RoPE.
320
+ Returns:
321
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
322
+ post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
323
+ """
324
+ base = config.rope_parameters["rope_theta"]
325
+ dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
326
+
327
+ attention_factor = 1.0 # Unused in this type of RoPE
328
+
329
+ # Compute the inverse frequencies
330
+ inv_freq = 1.0 / (
331
+ base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
332
+ )
333
+ return inv_freq, attention_factor
334
+
335
+ @torch.no_grad()
336
+ @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
337
+ def forward(self, x, position_ids):
338
+ # In contrast to other models, Qwen3VL has different position ids for the grids
339
+ # So we expand the inv_freq to shape (3, ...)
340
+ if position_ids.ndim == 2:
341
+ position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
342
+ inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
343
+ position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions)
344
+
345
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
346
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
347
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
348
+ freqs = self.apply_interleaved_mrope(freqs, self.mrope_section)
349
+ emb = torch.cat((freqs, freqs), dim=-1)
350
+ cos = emb.cos() * self.attention_scaling
351
+ sin = emb.sin() * self.attention_scaling
352
+
353
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
354
+
355
+ def apply_interleaved_mrope(self, freqs, mrope_section):
356
+ """Apply interleaved MRoPE to 3D rotary embeddings.
357
+ Reorganizes frequency layout from chunked [TTT...HHH...WWW] to
358
+ interleaved [THWTHWTHW...TT], preserving frequency continuity.
359
+ Args:
360
+ freqs: (3, bs, seq_len, head_dim // 2)
361
+ mrope_section: (3,)
362
+ Returns:
363
+ freqs_t: (bs, seq_len, head_dim // 2)
364
+ """
365
+ freqs_t = freqs[0] # just overwrite the first dimension T
366
+ for dim, offset in enumerate((1, 2), start=1): # H, W
367
+ length = mrope_section[dim] * 3
368
+ idx = slice(offset, length, 3)
369
+ freqs_t[..., idx] = freqs[dim, ..., idx]
370
+ return freqs_t
371
+
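+ # Worked example (assuming the default mrope_section = [24, 20, 20] and head_dim // 2 = 64):
+ # indices 1, 4, ..., 58 take the height (H) frequencies, indices 2, 5, ..., 59 take the
+ # width (W) frequencies, and the remaining 24 indices (0, 3, ..., 57 plus 60..63) keep the
+ # temporal (T) frequencies, giving the interleaved T/H/W layout described above.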
372
+
373
+ @use_kernel_forward_from_hub("RMSNorm")
374
+ class Qwen3VLTextRMSNorm(nn.Module):
375
+ def __init__(self, hidden_size, eps: float = 1e-6) -> None:
376
+ """
377
+ Qwen3VLTextRMSNorm is equivalent to T5LayerNorm
378
+ """
379
+ super().__init__()
380
+ self.weight = nn.Parameter(torch.ones(hidden_size))
381
+ self.variance_epsilon = eps
382
+
383
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
384
+ input_dtype = hidden_states.dtype
385
+ hidden_states = hidden_states.to(torch.float32)
386
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
387
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
388
+ return self.weight * hidden_states.to(input_dtype)
389
+
390
+ def extra_repr(self):
391
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
392
+
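+ # RMSNorm in one line: y = weight * x / sqrt(mean(x**2, dim=-1) + eps), computed in float32
+ # for numerical stability and cast back to the input dtype (see forward above).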
393
+
394
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
395
+ """Applies Rotary Position Embedding to the query and key tensors.
396
+
397
+ Args:
398
+ q (`torch.Tensor`): The query tensor.
399
+ k (`torch.Tensor`): The key tensor.
400
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
401
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
402
+ position_ids (`torch.Tensor`, *optional*):
403
+ Deprecated and unused.
404
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
405
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
406
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
407
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
408
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
409
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
410
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
411
+ Returns:
412
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
413
+ """
414
+ cos = cos.unsqueeze(unsqueeze_dim)
415
+ sin = sin.unsqueeze(unsqueeze_dim)
416
+ q_embed = (q * cos) + (rotate_half(q) * sin)
417
+ k_embed = (k * cos) + (rotate_half(k) * sin)
418
+ return q_embed, k_embed
419
+
420
+
421
+ class Qwen3VLTextAttention(nn.Module):
422
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
423
+
424
+ def __init__(self, config: Qwen3VLTextConfig, layer_idx: int):
425
+ super().__init__()
426
+ self.layer_type = config.layer_types[layer_idx] if hasattr(config, "layer_types") else None
427
+ self.config = config
428
+ self.layer_idx = layer_idx
429
+ self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
430
+ self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
431
+ self.scaling = self.head_dim**-0.5
432
+ self.attention_dropout = config.attention_dropout
433
+ self.is_causal = True
434
+
435
+ self.q_proj = nn.Linear(
436
+ config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
437
+ )
438
+ self.k_proj = nn.Linear(
439
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
440
+ )
441
+ self.v_proj = nn.Linear(
442
+ config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
443
+ )
444
+ self.o_proj = nn.Linear(
445
+ config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
446
+ )
447
+ self.q_norm = Qwen3VLTextRMSNorm(self.head_dim, eps=config.rms_norm_eps) # unlike olmo, only on the head dim!
448
+ self.k_norm = Qwen3VLTextRMSNorm(
449
+ self.head_dim, eps=config.rms_norm_eps
450
+ ) # thus post q_norm does not need reshape
451
+
452
+ def forward(
453
+ self,
454
+ hidden_states: torch.Tensor,
455
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
456
+ attention_mask: Optional[torch.Tensor],
457
+ past_key_values: Optional[Cache] = None,
458
+ cache_position: Optional[torch.LongTensor] = None,
459
+ **kwargs: Unpack[FlashAttentionKwargs],
460
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
461
+ input_shape = hidden_states.shape[:-1]
462
+ hidden_shape = (*input_shape, -1, self.head_dim)
463
+
464
+ query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
465
+ key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
466
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
467
+
468
+ cos, sin = position_embeddings
469
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
470
+
471
+ if past_key_values is not None:
472
+ # sin and cos are specific to RoPE models; cache_position needed for the static cache
473
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
474
+ key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
475
+
476
+ attention_interface: Callable = eager_attention_forward
477
+ if self.config._attn_implementation != "eager":
478
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
479
+
480
+ attn_output, attn_weights = attention_interface(
481
+ self,
482
+ query_states,
483
+ key_states,
484
+ value_states,
485
+ attention_mask,
486
+ dropout=0.0 if not self.training else self.attention_dropout,
487
+ scaling=self.scaling,
488
+ **kwargs,
489
+ )
490
+
491
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
492
+ attn_output = self.o_proj(attn_output)
493
+ return attn_output, attn_weights
494
+
495
+
496
+ class Qwen3VLTextMLP(nn.Module):
497
+ def __init__(self, config):
498
+ super().__init__()
499
+ self.config = config
500
+ self.hidden_size = config.hidden_size
501
+ self.intermediate_size = config.intermediate_size
502
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
503
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
504
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
505
+ self.act_fn = ACT2FN[config.hidden_act]
506
+
507
+ def forward(self, x):
508
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
509
+ return down_proj
510
+
511
+
512
+ class Qwen3VLTextDecoderLayer(GradientCheckpointingLayer):
513
+ def __init__(self, config: Qwen3VLTextConfig, layer_idx: int):
514
+ super().__init__()
515
+ self.hidden_size = config.hidden_size
516
+
517
+ self.self_attn = Qwen3VLTextAttention(config=config, layer_idx=layer_idx)
518
+
519
+ self.mlp = Qwen3VLTextMLP(config)
520
+ self.input_layernorm = Qwen3VLTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
521
+ self.post_attention_layernorm = Qwen3VLTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
522
+
523
+ def forward(
524
+ self,
525
+ hidden_states: torch.Tensor,
526
+ position_embeddings: tuple[torch.Tensor, torch.Tensor],
527
+ attention_mask: Optional[torch.Tensor] = None,
528
+ position_ids: Optional[torch.LongTensor] = None,
529
+ past_key_values: Optional[Cache] = None,
530
+ use_cache: Optional[bool] = False,
531
+ cache_position: Optional[torch.LongTensor] = None,
532
+ **kwargs: Unpack[TransformersKwargs],
533
+ ) -> torch.Tensor:
534
+ residual = hidden_states
535
+ hidden_states = self.input_layernorm(hidden_states)
536
+ # Self Attention
537
+ hidden_states, _ = self.self_attn(
538
+ hidden_states=hidden_states,
539
+ attention_mask=attention_mask,
540
+ position_ids=position_ids,
541
+ past_key_values=past_key_values,
542
+ use_cache=use_cache,
543
+ cache_position=cache_position,
544
+ position_embeddings=position_embeddings,
545
+ **kwargs,
546
+ )
547
+ hidden_states = residual + hidden_states
548
+
549
+ # Fully Connected
550
+ residual = hidden_states
551
+ hidden_states = self.post_attention_layernorm(hidden_states)
552
+ hidden_states = self.mlp(hidden_states)
553
+ hidden_states = residual + hidden_states
554
+ return hidden_states
555
+
556
+
557
+ @dataclass
558
+ @auto_docstring(
559
+ custom_intro="""
560
+ Base class for Qwen3VL model outputs, with hidden states and attentions.
561
+ """
562
+ )
563
+ class Qwen3VLModelOutputWithPast(ModelOutput):
564
+ r"""
565
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
566
+ It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
567
+
568
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
569
+ `past_key_values` input) to speed up sequential decoding.
570
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
571
+ The rope index difference between sequence length and multimodal rope.
572
+ """
573
+
574
+ last_hidden_state: Optional[torch.FloatTensor] = None
575
+ past_key_values: Optional[Cache] = None
576
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
577
+ attentions: Optional[tuple[torch.FloatTensor]] = None
578
+ rope_deltas: Optional[torch.LongTensor] = None
579
+
580
+
581
+ @auto_docstring
582
+ class Qwen3VLPreTrainedModel(PreTrainedModel):
583
+ config: Qwen3VLConfig
584
+ base_model_prefix = "model"
585
+ input_modalities = ["image", "video", "text"]
586
+ supports_gradient_checkpointing = True
587
+ _no_split_modules = ["Qwen3VLTextDecoderLayer", "Qwen3VLVisionBlock"]
588
+ _skip_keys_device_placement = "past_key_values"
589
+ _supports_flash_attn = True
590
+ _supports_sdpa = True
591
+
592
+ _can_compile_fullgraph = True
593
+ _supports_attention_backend = True
594
+ _can_record_outputs = {
595
+ "hidden_states": Qwen3VLTextDecoderLayer,
596
+ "attentions": Qwen3VLTextAttention,
597
+ }
598
+
599
+
600
+ class Qwen3VLVisionModel(Qwen3VLPreTrainedModel):
601
+ config: Qwen3VLVisionConfig
602
+ _no_split_modules = ["Qwen3VLVisionBlock"]
603
+
604
+ def __init__(self, config, *inputs, **kwargs) -> None:
605
+ super().__init__(config, *inputs, **kwargs)
606
+ self.spatial_merge_size = config.spatial_merge_size
607
+ self.patch_size = config.patch_size
608
+ self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
609
+
610
+ self.patch_embed = Qwen3VLVisionPatchEmbed(
611
+ config=config,
612
+ )
613
+
614
+ self.pos_embed = nn.Embedding(config.num_position_embeddings, config.hidden_size)
615
+ self.num_grid_per_side = int(config.num_position_embeddings**0.5)
616
+
617
+ head_dim = config.hidden_size // config.num_heads
618
+ self.rotary_pos_emb = Qwen3VLVisionRotaryEmbedding(head_dim // 2)
619
+
620
+ self.blocks = nn.ModuleList([Qwen3VLVisionBlock(config) for _ in range(config.depth)])
621
+ self.merger = Qwen3VLVisionPatchMerger(
622
+ config=config,
623
+ use_postshuffle_norm=False,
624
+ )
625
+
626
+ self.deepstack_visual_indexes = config.deepstack_visual_indexes
627
+ self.deepstack_merger_list = nn.ModuleList(
628
+ [
629
+ Qwen3VLVisionPatchMerger(
630
+ config=config,
631
+ use_postshuffle_norm=True,
632
+ )
633
+ for _ in range(len(config.deepstack_visual_indexes))
634
+ ]
635
+ )
636
+
637
+ self.gradient_checkpointing = False
638
+
639
+ def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
640
+ merge_size = self.spatial_merge_size
641
+
642
+ max_hw = int(grid_thw[:, 1:].max().item())
643
+ freq_table = self.rotary_pos_emb(max_hw) # (max_hw, dim // 2)
644
+ device = freq_table.device
645
+
646
+ total_tokens = int(torch.prod(grid_thw, dim=1).sum().item())
647
+ pos_ids = torch.empty((total_tokens, 2), dtype=torch.long, device=device)
648
+
649
+ offset = 0
650
+ for num_frames, height, width in grid_thw:
651
+ merged_h, merged_w = height // merge_size, width // merge_size
652
+
653
+ block_rows = torch.arange(merged_h, device=device) # block row indices
654
+ block_cols = torch.arange(merged_w, device=device) # block col indices
655
+ intra_row = torch.arange(merge_size, device=device) # intra-block row offsets
656
+ intra_col = torch.arange(merge_size, device=device) # intra-block col offsets
657
+
658
+ # Compute full-resolution positions
659
+ row_idx = block_rows[:, None, None, None] * merge_size + intra_row[None, None, :, None]
660
+ col_idx = block_cols[None, :, None, None] * merge_size + intra_col[None, None, None, :]
661
+
662
+ row_idx = row_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)
663
+ col_idx = col_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)
664
+
665
+ coords = torch.stack((row_idx, col_idx), dim=-1)
666
+
667
+ if num_frames > 1:
668
+ coords = coords.repeat(num_frames, 1)
669
+
670
+ num_tokens = coords.shape[0]
671
+ pos_ids[offset : offset + num_tokens] = coords
672
+ offset += num_tokens
673
+
674
+ embeddings = freq_table[pos_ids] # lookup rotary embeddings
675
+ embeddings = embeddings.flatten(1)
676
+ return embeddings
677
+
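+ # The lookup above yields, per patch, its (row, col) rotary frequencies; after flatten(1) the
+ # first half of the last dimension encodes the row position and the second half the column
+ # position, giving a tensor of shape (total_tokens, head_dim // 2).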
678
+ def fast_pos_embed_interpolate(self, grid_thw):
679
+ grid_ts, grid_hs, grid_ws = grid_thw[:, 0], grid_thw[:, 1], grid_thw[:, 2]
680
+ device = self.pos_embed.weight.device
681
+
682
+ idx_list = [[] for _ in range(4)]
683
+ weight_list = [[] for _ in range(4)]
684
+
685
+ for t, h, w in zip(grid_ts, grid_hs, grid_ws):
686
+ h_idxs = torch.linspace(0, self.num_grid_per_side - 1, h)
687
+ w_idxs = torch.linspace(0, self.num_grid_per_side - 1, w)
688
+
689
+ h_idxs_floor = h_idxs.int()
690
+ w_idxs_floor = w_idxs.int()
691
+ h_idxs_ceil = (h_idxs.int() + 1).clip(max=self.num_grid_per_side - 1)
692
+ w_idxs_ceil = (w_idxs.int() + 1).clip(max=self.num_grid_per_side - 1)
693
+
694
+ dh = h_idxs - h_idxs_floor
695
+ dw = w_idxs - w_idxs_floor
696
+
697
+ base_h = h_idxs_floor * self.num_grid_per_side
698
+ base_h_ceil = h_idxs_ceil * self.num_grid_per_side
699
+
700
+ indices = [
701
+ (base_h[None].T + w_idxs_floor[None]).flatten(),
702
+ (base_h[None].T + w_idxs_ceil[None]).flatten(),
703
+ (base_h_ceil[None].T + w_idxs_floor[None]).flatten(),
704
+ (base_h_ceil[None].T + w_idxs_ceil[None]).flatten(),
705
+ ]
706
+
707
+ weights = [
708
+ ((1 - dh)[None].T * (1 - dw)[None]).flatten(),
709
+ ((1 - dh)[None].T * dw[None]).flatten(),
710
+ (dh[None].T * (1 - dw)[None]).flatten(),
711
+ (dh[None].T * dw[None]).flatten(),
712
+ ]
713
+
714
+ for i in range(4):
715
+ idx_list[i].extend(indices[i].tolist())
716
+ weight_list[i].extend(weights[i].tolist())
717
+
718
+ idx_tensor = torch.tensor(idx_list, dtype=torch.long, device=device)
719
+ weight_tensor = torch.tensor(weight_list, dtype=self.pos_embed.weight.dtype, device=device)
720
+ pos_embeds = self.pos_embed(idx_tensor).to(device) * weight_tensor[:, :, None]
721
+ patch_pos_embeds = pos_embeds[0] + pos_embeds[1] + pos_embeds[2] + pos_embeds[3]
722
+
723
+ patch_pos_embeds = patch_pos_embeds.split([h * w for h, w in zip(grid_hs, grid_ws)])
724
+
725
+ patch_pos_embeds_permute = []
726
+ merge_size = self.config.spatial_merge_size
727
+ for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws):
728
+ pos_embed = pos_embed.repeat(t, 1)
729
+ pos_embed = (
730
+ pos_embed.view(t, h // merge_size, merge_size, w // merge_size, merge_size, -1)
731
+ .permute(0, 1, 3, 2, 4, 5)
732
+ .flatten(0, 4)
733
+ )
734
+ patch_pos_embeds_permute.append(pos_embed)
735
+ patch_pos_embeds = torch.cat(patch_pos_embeds_permute)
736
+ return patch_pos_embeds
737
+
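+ # Interpolation sketch: for every target (h, w) location, the four nearest entries of the
+ # learned (num_grid_per_side x num_grid_per_side) position table are mixed with bilinear
+ # weights (1-dh)(1-dw), (1-dh)dw, dh(1-dw) and dh*dw; the result is then repeated over the t
+ # frames and permuted into the spatial-merge block order expected by the patch merger.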
738
+ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> torch.Tensor:
739
+ """
740
+ Args:
741
+ hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
742
+ The final hidden states of the model.
743
+ grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
744
+ The temporal, height and width of feature shape of each image in LLM.
745
+
746
+ Returns:
747
+ `torch.Tensor`: hidden_states.
748
+ """
749
+ hidden_states = self.patch_embed(hidden_states)
750
+
751
+ pos_embeds = self.fast_pos_embed_interpolate(grid_thw)
752
+ hidden_states = hidden_states + pos_embeds
753
+
754
+ rotary_pos_emb = self.rot_pos_emb(grid_thw)
755
+
756
+ seq_len, _ = hidden_states.size()
757
+ hidden_states = hidden_states.reshape(seq_len, -1)
758
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
759
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
760
+ position_embeddings = (emb.cos(), emb.sin())
761
+
762
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
763
+ dim=0,
764
+ # Select dtype based on the following factors:
765
+ # - FA2 requires that cu_seqlens_q must have dtype int32
766
+ # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw
767
+ # See https://github.com/huggingface/transformers/pull/34852 for more information
768
+ dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
769
+ )
770
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
771
+
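+ # Example: grid_thw = [[1, 8, 8], [2, 4, 4]] gives per-frame lengths [64, 16, 16] and
+ # cu_seqlens = [0, 64, 80, 96], i.e. attention is restricted to tokens of the same frame.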
772
+ deepstack_feature_lists = []
773
+ for layer_num, blk in enumerate(self.blocks):
774
+ hidden_states = blk(
775
+ hidden_states,
776
+ cu_seqlens=cu_seqlens,
777
+ position_embeddings=position_embeddings,
778
+ **kwargs,
779
+ )
780
+ if layer_num in self.deepstack_visual_indexes:
781
+ deepstack_feature = self.deepstack_merger_list[self.deepstack_visual_indexes.index(layer_num)](
782
+ hidden_states
783
+ )
784
+ deepstack_feature_lists.append(deepstack_feature)
785
+
786
+ hidden_states = self.merger(hidden_states)
787
+
788
+ return hidden_states, deepstack_feature_lists
789
+
790
+
791
+ @auto_docstring(
792
+ custom_intro=(
793
+ "Text part of Qwen3VL, "
794
+ "not a pure text-only model, as DeepStack integrates visual features into the early hidden states."
795
+ )
796
+ )
797
+ class Qwen3VLTextModel(Qwen3VLPreTrainedModel):
798
+ config: Qwen3VLTextConfig
799
+ _no_split_modules = ["Qwen3VLTextDecoderLayer"]
800
+
801
+ def __init__(self, config: Qwen3VLTextConfig):
802
+ super().__init__(config)
803
+ self.padding_idx = config.pad_token_id
804
+ self.vocab_size = config.vocab_size
805
+
806
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
807
+ self.layers = nn.ModuleList(
808
+ [Qwen3VLTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
809
+ )
810
+ self.norm = Qwen3VLTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
811
+ self.rotary_emb = Qwen3VLTextRotaryEmbedding(config=config)
812
+ self.gradient_checkpointing = False
813
+
814
+ # Initialize weights and apply final processing
815
+ self.post_init()
816
+
817
+ @check_model_inputs()
818
+ @auto_docstring
819
+ def forward(
820
+ self,
821
+ input_ids: Optional[torch.LongTensor] = None,
822
+ attention_mask: Optional[torch.Tensor] = None,
823
+ position_ids: Optional[torch.LongTensor] = None,
824
+ past_key_values: Optional[Cache] = None,
825
+ inputs_embeds: Optional[torch.FloatTensor] = None,
826
+ use_cache: Optional[bool] = None,
827
+ cache_position: Optional[torch.LongTensor] = None,
828
+ # args for deepstack
829
+ visual_pos_masks: Optional[torch.Tensor] = None,
830
+ deepstack_visual_embeds: Optional[list[torch.Tensor]] = None,
831
+ **kwargs: Unpack[FlashAttentionKwargs],
832
+ ) -> Union[tuple, BaseModelOutputWithPast]:
833
+ r"""
834
+ visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
835
+ The mask of the visual positions.
836
+ deepstack_visual_embeds (`list[torch.Tensor]`, *optional*):
837
+ The deepstack visual embeddings. The shape is (num_layers, visual_seqlen, embed_dim).
838
+ The feature is extracted from the different visual encoder layers, and fed to the decoder
839
+ hidden states. It is from the paper DeepStack (https://arxiv.org/abs/2406.04334).
840
+ """
841
+
842
+ # torch.jit.trace() doesn't support cache objects in the output
843
+ if use_cache and past_key_values is None and not torch.jit.is_tracing():
844
+ past_key_values = DynamicCache(config=self.config)
845
+
846
+ if inputs_embeds is None:
847
+ inputs_embeds = self.embed_tokens(input_ids)
848
+
849
+ if cache_position is None:
850
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
851
+ cache_position = torch.arange(
852
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
853
+ )
854
+
855
+ # the hard coded `3` is for temporal, height and width.
856
+ if position_ids is None:
857
+ position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)
858
+ elif position_ids.ndim == 2:
859
+ position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
860
+
861
+ if position_ids.ndim == 3 and position_ids.shape[0] == 4:
862
+ text_position_ids = position_ids[0]
863
+ position_ids = position_ids[1:]
864
+ else:
865
+ text_position_ids = position_ids[0]
866
+
867
+ attention_mask = create_causal_mask(
868
+ config=self.config,
869
+ input_embeds=inputs_embeds,
870
+ attention_mask=attention_mask,
871
+ cache_position=cache_position,
872
+ past_key_values=past_key_values,
873
+ position_ids=text_position_ids,
874
+ )
875
+
876
+ hidden_states = inputs_embeds
877
+
878
+ # create position embeddings to be shared across the decoder layers
879
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
880
+
881
+ # decoder layers. FIXME (hard-coded): each layer is assumed to return a (hidden_states, updated_kwargs) pair and to accept input_ids as an extra positional argument, unlike the stock Qwen3VLTextDecoderLayer above.
882
+ for layer_idx, decoder_layer in enumerate(self.layers):
883
+ layer_outputs = decoder_layer(
884
+ hidden_states,
885
+ input_ids,
886
+ attention_mask=attention_mask,
887
+ position_ids=text_position_ids,
888
+ past_key_values=past_key_values,
889
+ cache_position=cache_position,
890
+ position_embeddings=position_embeddings,
891
+ **kwargs,
892
+ )
893
+ ## FIXME (hard-coded): unpack the (hidden_states, updated_kwargs) pair returned by the wrapped layer
894
+ hidden_states = layer_outputs[0]
895
+ if 'attention_mask' in layer_outputs[1]:
896
+ attention_mask = layer_outputs[1]['attention_mask']
897
+ if 'position_ids' in layer_outputs[1]:
898
+ text_position_ids = layer_outputs[1]['position_ids']
899
+ if 'past_key_values' in layer_outputs[1]:
900
+ past_key_values = layer_outputs[1]['past_key_values']
901
+ if 'cache_position' in layer_outputs[1]:
902
+ cache_position = layer_outputs[1]['cache_position']
903
+ if 'position_embeddings' in layer_outputs[1]:
904
+ position_embeddings = layer_outputs[1]['position_embeddings']
905
+
906
+ # add visual features to the hidden states of first several layers
907
+ if deepstack_visual_embeds is not None and layer_idx in range(len(deepstack_visual_embeds)):
908
+ hidden_states = self._deepstack_process(
909
+ hidden_states,
910
+ visual_pos_masks,
911
+ deepstack_visual_embeds[layer_idx],
912
+ )
913
+
914
+ hidden_states = self.norm(hidden_states)
915
+
916
+ return BaseModelOutputWithPast(
917
+ last_hidden_state=hidden_states,
918
+ past_key_values=past_key_values,
919
+ )
920
+
921
+ def _deepstack_process(
922
+ self, hidden_states: torch.Tensor, visual_pos_masks: torch.Tensor, visual_embeds: torch.Tensor
923
+ ):
924
+ visual_pos_masks = visual_pos_masks.to(hidden_states.device)
925
+ visual_embeds = visual_embeds.to(hidden_states.device, hidden_states.dtype)
926
+ hidden_states = hidden_states.clone()
927
+ local_this = hidden_states[visual_pos_masks, :] + visual_embeds
928
+ hidden_states[visual_pos_masks, :] = local_this
929
+ return hidden_states
930
+
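+ # DeepStack sketch: features taken from intermediate vision-encoder blocks are added
+ # residually to the decoder hidden states, but only at positions flagged by `visual_pos_masks`;
+ # decoder layer i consumes deepstack_visual_embeds[i], so only the first
+ # len(deepstack_visual_embeds) decoder layers receive these extra visual features.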
931
+
932
+ @auto_docstring
933
+ class Qwen3VLModel(Qwen3VLPreTrainedModel):
934
+ base_model_prefix = ""
935
+ _checkpoint_conversion_mapping = {}
936
+ # Reference: fix gemma3 grad acc #37208
937
+ accepts_loss_kwargs = False
938
+ config: Qwen3VLConfig
939
+ _no_split_modules = ["Qwen3VLTextDecoderLayer", "Qwen3VLVisionBlock"]
940
+
941
+ def __init__(self, config):
942
+ super().__init__(config)
943
+ self.visual = Qwen3VLVisionModel._from_config(config.vision_config)
944
+ self.language_model = Qwen3VLTextModel._from_config(config.text_config)
945
+ self.rope_deltas = None # cache rope_deltas here
946
+
947
+ # Initialize weights and apply final processing
948
+ self.post_init()
949
+
950
+ def get_input_embeddings(self):
951
+ return self.language_model.get_input_embeddings()
952
+
953
+ def set_input_embeddings(self, value):
954
+ self.language_model.set_input_embeddings(value)
955
+
956
+ def set_decoder(self, decoder):
957
+ self.language_model = decoder
958
+
959
+ def get_decoder(self):
960
+ return self.language_model
961
+
962
+ def get_rope_index(
963
+ self,
964
+ input_ids: Optional[torch.LongTensor] = None,
965
+ image_grid_thw: Optional[torch.LongTensor] = None,
966
+ video_grid_thw: Optional[torch.LongTensor] = None,
967
+ attention_mask: Optional[torch.Tensor] = None,
968
+ ) -> tuple[torch.Tensor, torch.Tensor]:
969
+ """Different from the original implementation, Qwen3VL use timestamps rather than absolute time position ids."""
970
+
971
+ # Since we use timestamps to separate videos, like <t1> <vision_start> <frame1> <vision_end> <t2> <vision_start> <frame2> <vision_end>, the video_grid_thw must also be split into one row per frame
972
+ if video_grid_thw is not None:
973
+ video_grid_thw = torch.repeat_interleave(video_grid_thw, video_grid_thw[:, 0], dim=0)
974
+ video_grid_thw[:, 0] = 1
975
+
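+ # Example: a video whose grid is (t=3, h=24, w=24) is expanded to three rows
+ # [[1, 24, 24], [1, 24, 24], [1, 24, 24]] -- one per frame -- since each frame is wrapped by
+ # its own timestamp / vision tokens in the prompt.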
976
+ spatial_merge_size = self.config.vision_config.spatial_merge_size
977
+ image_token_id = self.config.image_token_id
978
+ video_token_id = self.config.video_token_id
979
+ vision_start_token_id = self.config.vision_start_token_id
980
+ mrope_position_deltas = []
981
+ if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
982
+ total_input_ids = input_ids
983
+ if attention_mask is None:
984
+ attention_mask = torch.ones_like(total_input_ids)
985
+ position_ids = torch.ones(
986
+ 3,
987
+ input_ids.shape[0],
988
+ input_ids.shape[1],
989
+ dtype=input_ids.dtype,
990
+ device=input_ids.device,
991
+ )
992
+ image_index, video_index = 0, 0
993
+ attention_mask = attention_mask.to(total_input_ids.device)
994
+ for i, input_ids in enumerate(total_input_ids):
995
+ input_ids = input_ids[attention_mask[i] == 1]
996
+ image_nums, video_nums = 0, 0
997
+ vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
998
+ vision_tokens = input_ids[vision_start_indices + 1]
999
+ image_nums = (vision_tokens == image_token_id).sum()
1000
+ video_nums = (vision_tokens == video_token_id).sum()
1001
+ input_tokens = input_ids.tolist()
1002
+ llm_pos_ids_list: list = []
1003
+ st = 0
1004
+ remain_images, remain_videos = image_nums, video_nums
1005
+ for _ in range(image_nums + video_nums):
1006
+ if image_token_id in input_tokens and remain_images > 0:
1007
+ ed_image = input_tokens.index(image_token_id, st)
1008
+ else:
1009
+ ed_image = len(input_tokens) + 1
1010
+ if video_token_id in input_tokens and remain_videos > 0:
1011
+ ed_video = input_tokens.index(video_token_id, st)
1012
+ else:
1013
+ ed_video = len(input_tokens) + 1
1014
+ if ed_image < ed_video:
1015
+ t, h, w = (
1016
+ image_grid_thw[image_index][0],
1017
+ image_grid_thw[image_index][1],
1018
+ image_grid_thw[image_index][2],
1019
+ )
1020
+ image_index += 1
1021
+ remain_images -= 1
1022
+ ed = ed_image
1023
+
1024
+ else:
1025
+ t, h, w = (
1026
+ video_grid_thw[video_index][0],
1027
+ video_grid_thw[video_index][1],
1028
+ video_grid_thw[video_index][2],
1029
+ )
1030
+ video_index += 1
1031
+ remain_videos -= 1
1032
+ ed = ed_video
1033
+ llm_grid_t, llm_grid_h, llm_grid_w = (
1034
+ t.item(),
1035
+ h.item() // spatial_merge_size,
1036
+ w.item() // spatial_merge_size,
1037
+ )
1038
+ text_len = ed - st
1039
+
1040
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
1041
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
1042
+
1043
+ # t_index is always 0 because llm_grid_t is always 1 (we use timestamps to encode the temporal information for videos)
1044
+ t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
1045
+ h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
1046
+ w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
1047
+ llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
1048
+ st = ed + llm_grid_t * llm_grid_h * llm_grid_w
1049
+
1050
+ if st < len(input_tokens):
1051
+ st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
1052
+ text_len = len(input_tokens) - st
1053
+ llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
1054
+
1055
+ llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
1056
+ position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
1057
+ mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
1058
+ mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
1059
+ return position_ids, mrope_position_deltas
1060
+ else:
1061
+ if attention_mask is not None:
1062
+ position_ids = attention_mask.long().cumsum(-1) - 1
1063
+ position_ids.masked_fill_(attention_mask == 0, 1)
1064
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
1065
+ max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
1066
+ mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
1067
+ else:
1068
+ position_ids = (
1069
+ torch.arange(input_ids.shape[1], device=input_ids.device)
1070
+ .view(1, 1, -1)
1071
+ .expand(3, input_ids.shape[0], -1)
1072
+ )
1073
+ mrope_position_deltas = torch.zeros(
1074
+ [input_ids.shape[0], 1],
1075
+ device=input_ids.device,
1076
+ dtype=input_ids.dtype,
1077
+ )
1078
+
1079
+ return position_ids, mrope_position_deltas
1080
+
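+ # Worked example for get_rope_index (assuming spatial_merge_size = 2): with 5 tokens preceding
+ # the image placeholders and one image of grid (1, 8, 8) -> a 4 x 4 merged patch grid, the
+ # leading tokens get positions 0..4 on all three (t, h, w) axes, the 16 image tokens all get
+ # t = 5 with h and w ranging over 5..8, and the next text token resumes at position 9.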
1081
+ def get_video_features(
1082
+ self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
1083
+ ):
1084
+ """
1085
+ Encodes videos into continuous embeddings that can be forwarded to the language model. The deepstack visual features are also returned.
1086
+
1087
+ Args:
1088
+ pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
1089
+ The tensors corresponding to the input videos.
1090
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1091
+ The temporal, height and width of feature shape of each video in LLM.
1092
+ """
1093
+ # Same implementation as for images
1094
+ return self.get_image_features(pixel_values_videos, video_grid_thw)
1095
+
1096
+ def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
1097
+ """
1098
+ Encodes images into continuous embeddings that can be forwarded to the language model. The deepstack visual features are also returned.
1099
+
1100
+ Args:
1101
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
1102
+ The tensors corresponding to the input images.
1103
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1104
+ The temporal, height and width of feature shape of each image in LLM.
1105
+ """
1106
+ pixel_values = pixel_values.type(self.visual.dtype)
1107
+ image_embeds, deepstack_image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
1108
+ split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
1109
+ image_embeds = torch.split(image_embeds, split_sizes)
1110
+ return image_embeds, deepstack_image_embeds
1111
+
1112
+ def get_placeholder_mask(
1113
+ self,
1114
+ input_ids: torch.LongTensor,
1115
+ inputs_embeds: torch.FloatTensor,
1116
+ image_features: Optional[torch.FloatTensor] = None,
1117
+ video_features: Optional[torch.FloatTensor] = None,
1118
+ ):
1119
+ """
1120
+ Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
1121
+ equal to the length of multimodal features. If the lengths are different, an error is raised.
1122
+ """
1123
+ if input_ids is None:
1124
+ special_image_mask = inputs_embeds == self.get_input_embeddings()(
1125
+ torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
1126
+ )
1127
+ special_image_mask = special_image_mask.all(-1)
1128
+ special_video_mask = inputs_embeds == self.get_input_embeddings()(
1129
+ torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
1130
+ )
1131
+ special_video_mask = special_video_mask.all(-1)
1132
+ else:
1133
+ special_image_mask = input_ids == self.config.image_token_id
1134
+ special_video_mask = input_ids == self.config.video_token_id
1135
+
1136
+ n_image_tokens = special_image_mask.sum()
1137
+ special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
1138
+ if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
1139
+ raise ValueError(
1140
+ f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
1141
+ )
1142
+
1143
+ n_video_tokens = special_video_mask.sum()
1144
+ special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
1145
+ if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel():
1146
+ raise ValueError(
1147
+ f"Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}"
1148
+ )
1149
+
1150
+ return special_image_mask, special_video_mask
1151
+
1152
+ @auto_docstring
1153
+ @check_model_inputs()
1154
+ def forward(
1155
+ self,
1156
+ input_ids: torch.LongTensor = None,
1157
+ attention_mask: Optional[torch.Tensor] = None,
1158
+ position_ids: Optional[torch.LongTensor] = None,
1159
+ past_key_values: Optional[Cache] = None,
1160
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1161
+ pixel_values: Optional[torch.Tensor] = None,
1162
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
1163
+ image_grid_thw: Optional[torch.LongTensor] = None,
1164
+ video_grid_thw: Optional[torch.LongTensor] = None,
1165
+ cache_position: Optional[torch.LongTensor] = None,
1166
+ **kwargs: Unpack[TransformersKwargs],
1167
+ ) -> Union[tuple, Qwen3VLModelOutputWithPast]:
1168
+ r"""
1169
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
1170
+ The temporal, height and width of feature shape of each image in LLM.
1171
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
1172
+ The temporal, height and width of feature shape of each video in LLM.
1173
+ """
1174
+ if (input_ids is None) ^ (inputs_embeds is not None):
1175
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
1176
+
1177
+ if inputs_embeds is None:
1178
+ inputs_embeds = self.get_input_embeddings()(input_ids)
1179
+
1180
+ image_mask = None
1181
+ video_mask = None
1182
+
1183
+ if pixel_values is not None:
1184
+ image_embeds, deepstack_image_embeds = self.get_image_features(pixel_values, image_grid_thw)
1185
+ image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
1186
+ image_mask, _ = self.get_placeholder_mask(
1187
+ input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
1188
+ )
1189
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
1190
+
1191
+ if pixel_values_videos is not None:
1192
+ video_embeds, deepstack_video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
1193
+ video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
1194
+ _, video_mask = self.get_placeholder_mask(
1195
+ input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
1196
+ )
1197
+ inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
1198
+
1199
+
1200
+ visual_pos_masks = None
1201
+ deepstack_visual_embeds = None
1202
+ if image_mask is not None and video_mask is not None:
1203
+ # aggregate visual_pos_masks and deepstack_visual_embeds
1204
+ image_mask = image_mask[..., 0]
1205
+ video_mask = video_mask[..., 0]
1206
+ visual_pos_masks = image_mask | video_mask
1207
+ deepstack_visual_embeds = []
1208
+ image_mask_joint = image_mask[visual_pos_masks]
1209
+ video_mask_joint = video_mask[visual_pos_masks]
1210
+ for img_embed, vid_embed in zip(deepstack_image_embeds, deepstack_video_embeds):
1211
+ embed_joint = img_embed.new_zeros(visual_pos_masks.sum(), img_embed.shape[-1]).to(img_embed.device)
1212
+ embed_joint[image_mask_joint, :] = img_embed
1213
+ embed_joint[video_mask_joint, :] = vid_embed
1214
+ deepstack_visual_embeds.append(embed_joint)
1215
+ elif image_mask is not None:
1216
+ image_mask = image_mask[..., 0]
1217
+ visual_pos_masks = image_mask
1218
+ deepstack_visual_embeds = deepstack_image_embeds
1219
+ elif video_mask is not None:
1220
+ video_mask = video_mask[..., 0]
1221
+ visual_pos_masks = video_mask
1222
+ deepstack_visual_embeds = deepstack_video_embeds
1223
+
1224
+ if position_ids is None:
1225
+ past_key_values_length = 0 if past_key_values is None else past_key_values.get_seq_length()
1226
+ if self.rope_deltas is None or past_key_values_length == 0:
1227
+ position_ids, rope_deltas = self.get_rope_index(
1228
+ input_ids,
1229
+ image_grid_thw,
1230
+ video_grid_thw,
1231
+ attention_mask=attention_mask,
1232
+ )
1233
+ self.rope_deltas = rope_deltas
1234
+ # then use the prev pre-calculated rope-deltas to get the correct position ids
1235
+ else:
1236
+ batch_size, seq_length, _ = inputs_embeds.shape
1237
+ delta = (past_key_values_length + self.rope_deltas).to(inputs_embeds.device)
1238
+ position_ids = torch.arange(seq_length, device=inputs_embeds.device)
1239
+ position_ids = position_ids.view(1, -1).expand(batch_size, -1)
1240
+ if cache_position is not None: # otherwise `deltas` is an int `0`
1241
+ delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
1242
+ position_ids = position_ids.add(delta)
1243
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
1244
+
1245
+ outputs = self.language_model(
1246
+ input_ids=input_ids,
1247
+ position_ids=position_ids,
1248
+ attention_mask=attention_mask,
1249
+ past_key_values=past_key_values,
1250
+ inputs_embeds=inputs_embeds,
1251
+ cache_position=cache_position,
1252
+ visual_pos_masks=visual_pos_masks,
1253
+ deepstack_visual_embeds=deepstack_visual_embeds,
1254
+ **kwargs,
1255
+ )
1256
+
1257
+ return Qwen3VLModelOutputWithPast(
1258
+ last_hidden_state=outputs.last_hidden_state,
1259
+ past_key_values=outputs.past_key_values,
1260
+ rope_deltas=self.rope_deltas,
1261
+ )
1262
+
1263
+
1264
+ @dataclass
1265
+ @auto_docstring(
1266
+ custom_intro="""
1267
+ Base class for Qwen3VL causal language model (or autoregressive) outputs.
1268
+ """
1269
+ )
1270
+ class Qwen3VLCausalLMOutputWithPast(ModelOutput):
1271
+ r"""
1272
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
1273
+ Language modeling loss (for next-token prediction).
1274
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
1275
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
1276
+ past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
1277
+ It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
1278
+
1279
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
1280
+ `past_key_values` input) to speed up sequential decoding.
1281
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
1282
+ The rope index difference between sequence length and multimodal rope.
1283
+ """
1284
+
1285
+ loss: Optional[torch.FloatTensor] = None
1286
+ logits: Optional[torch.FloatTensor] = None
1287
+ past_key_values: Optional[Cache] = None
1288
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
1289
+ attentions: Optional[tuple[torch.FloatTensor]] = None
1290
+ rope_deltas: Optional[torch.LongTensor] = None
1291
+
1292
+
1293
+ class Qwen3VLForConditionalGeneration(Qwen3VLPreTrainedModel, GenerationMixin):
1294
+ _checkpoint_conversion_mapping = {}
1295
+ _tied_weights_keys = {"lm_head.weight": "model.language_model.embed_tokens.weight"}
1296
+ # Reference: fix gemma3 grad acc #37208
1297
+ accepts_loss_kwargs = False
1298
+ config: Qwen3VLConfig
1299
+
1300
+ def __init__(self, config):
1301
+ super().__init__(config)
1302
+ self.model = Qwen3VLModel(config)
1303
+ self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
1304
+
1305
+ self.post_init()
1306
+
1307
+ def get_input_embeddings(self):
1308
+ return self.model.get_input_embeddings()
1309
+
1310
+ def set_input_embeddings(self, value):
1311
+ self.model.set_input_embeddings(value)
1312
+
1313
+ def set_decoder(self, decoder):
1314
+ self.model.set_decoder(decoder)
1315
+
1316
+ def get_decoder(self):
1317
+ return self.model.get_decoder()
1318
+
1319
+ def get_video_features(
1320
+ self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
1321
+ ):
1322
+ return self.model.get_video_features(pixel_values_videos, video_grid_thw)
1323
+
1324
+ def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None):
1325
+ return self.model.get_image_features(pixel_values, image_grid_thw)
1326
+
1327
+ # Make modules available through conditional class for BC
1328
+ @property
1329
+ def language_model(self):
1330
+ return self.model.language_model
1331
+
1332
+ @property
1333
+ def visual(self):
1334
+ return self.model.visual
1335
+
1336
+ @check_model_inputs()
1337
+ def forward(
1338
+ self,
1339
+ input_ids: torch.LongTensor = None,
1340
+ attention_mask: Optional[torch.Tensor] = None,
1341
+ position_ids: Optional[torch.LongTensor] = None,
1342
+ past_key_values: Optional[Cache] = None,
1343
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1344
+ labels: Optional[torch.LongTensor] = None,
1345
+ pixel_values: Optional[torch.Tensor] = None,
1346
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
1347
+ image_grid_thw: Optional[torch.LongTensor] = None,
1348
+ video_grid_thw: Optional[torch.LongTensor] = None,
1349
+ cache_position: Optional[torch.LongTensor] = None,
1350
+ logits_to_keep: Union[int, torch.Tensor] = 0,
1351
+ **kwargs: Unpack[TransformersKwargs],
1352
+ ) -> Union[tuple, Qwen3VLCausalLMOutputWithPast]:
1353
+
1354
+ outputs = self.model(
1355
+ input_ids=input_ids,
1356
+ pixel_values=pixel_values,
1357
+ pixel_values_videos=pixel_values_videos,
1358
+ image_grid_thw=image_grid_thw,
1359
+ video_grid_thw=video_grid_thw,
1360
+ position_ids=position_ids,
1361
+ attention_mask=attention_mask,
1362
+ past_key_values=past_key_values,
1363
+ inputs_embeds=inputs_embeds,
1364
+ cache_position=cache_position,
1365
+ **kwargs,
1366
+ )
1367
+
1368
+ hidden_states = outputs[0]
1369
+
1370
+ # Compute logits over the full sequence (`logits_to_keep` is accepted for API compatibility but not used here)
1371
+ logits = self.lm_head(hidden_states)
1372
+
1373
+ loss = None
1374
+ if labels is not None:
1375
+ loss = self.loss_function(logits=logits, labels=labels[..., -logits.shape[1] :], vocab_size=self.config.text_config.vocab_size)
1376
+
1377
+ return Qwen3VLCausalLMOutputWithPast(
1378
+ loss=loss,
1379
+ logits=logits,
1380
+ past_key_values=outputs.past_key_values,
1381
+ rope_deltas=outputs.rope_deltas,
1382
+ )
1383
+
1384
+ def prepare_inputs_for_generation(
1385
+ self,
1386
+ input_ids,
1387
+ past_key_values=None,
1388
+ attention_mask=None,
1389
+ inputs_embeds=None,
1390
+ cache_position=None,
1391
+ position_ids=None,
1392
+ use_cache=True,
1393
+ pixel_values=None,
1394
+ pixel_values_videos=None,
1395
+ image_grid_thw=None,
1396
+ video_grid_thw=None,
1397
+ **kwargs,
1398
+ ):
1399
+ # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
1400
+
1401
+ model_inputs = super().prepare_inputs_for_generation(
1402
+ input_ids,
1403
+ past_key_values=past_key_values,
1404
+ attention_mask=attention_mask,
1405
+ inputs_embeds=inputs_embeds,
1406
+ cache_position=cache_position,
1407
+ position_ids=position_ids,
1408
+ pixel_values=pixel_values,
1409
+ pixel_values_videos=pixel_values_videos,
1410
+ image_grid_thw=image_grid_thw,
1411
+ video_grid_thw=video_grid_thw,
1412
+ use_cache=use_cache,
1413
+ **kwargs,
1414
+ )
1415
+
1431
+ # Qwen3VL position_ids are prepared with rope_deltas
1432
+ if position_ids is None:
1433
+ # Calculate RoPE index once per generation in the pre-fill stage only.
1434
+ # When compiling, we can't check tensor values, so we check only the input length.
1435
+ # It is safe to assume that `length != 1` means we're in pre-fill, because compiled
1436
+ # models currently cannot do assisted decoding
1437
+ if model_inputs["cache_position"][0] == 0 or self.model.rope_deltas is None:
1438
+ vision_positions, rope_deltas = self.model.get_rope_index(
1439
+ model_inputs.get("input_ids", None),
1440
+ image_grid_thw=image_grid_thw,
1441
+ video_grid_thw=video_grid_thw,
1442
+ attention_mask=attention_mask,
1443
+ )
1444
+ self.model.rope_deltas = rope_deltas
1445
+ # then use the prev pre-calculated rope-deltas to get the correct position ids
1446
+ elif "position_ids" in model_inputs:
1447
+ batch_size, seq_length = model_inputs["position_ids"].shape
1448
+ device = model_inputs["position_ids"].device
1449
+ position_ids = torch.arange(seq_length, device=device)
1450
+ position_ids = position_ids.view(1, 1, -1).expand(3, batch_size, -1)
1451
+ delta = cache_position[0] + self.model.rope_deltas
1452
+ delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
1453
+ vision_positions = position_ids + delta.expand_as(position_ids)
1454
+
1455
+ # Concatenate "text + vision" positions into [4, bs, seq-len]
1456
+ text_positions = model_inputs["position_ids"][None, ...]
1457
+ model_inputs["position_ids"] = torch.cat([text_positions, vision_positions], dim=0)
1458
+
1459
+ if cache_position[0] != 0:
1460
+ model_inputs["pixel_values"] = None
1461
+ model_inputs["pixel_values_videos"] = None
1462
+
1463
+ return model_inputs
1464
+
1465
+ def _get_image_nums_and_video_nums(
1466
+ self,
1467
+ input_ids: Optional[torch.LongTensor],
1468
+ inputs_embeds: Optional[torch.Tensor] = None,
1469
+ ) -> tuple[torch.Tensor, torch.Tensor]:
1470
+ """
1471
+ Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
1472
+ These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.
1473
+
1474
+ Args:
1475
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1476
+ Indices of input sequence tokens in the vocabulary.
1477
+
1478
+ Returns:
1479
+ image_nums (`torch.LongTensor` of shape `(batch_size,)`): number of images in each sample
1480
+ video_nums (`torch.LongTensor` of shape `(batch_size,)`): number of videos in each sample
1481
+ """
1482
+ image_token_id = self.config.image_token_id
1483
+ video_token_id = self.config.video_token_id
1484
+ vision_start_token_id = self.config.vision_start_token_id
1485
+
1486
+ if inputs_embeds is not None:
1487
+ vision_start_mask = (
1488
+ inputs_embeds
1489
+ == self.get_input_embeddings()(
1490
+ torch.tensor(vision_start_token_id, dtype=torch.long, device=inputs_embeds.device)
1491
+ )
1492
+ )[..., 0]
1493
+ image_mask = (
1494
+ inputs_embeds
1495
+ == self.get_input_embeddings()(
1496
+ torch.tensor(image_token_id, dtype=torch.long, device=inputs_embeds.device)
1497
+ )
1498
+ )[..., 0]
1499
+ video_mask = (
1500
+ inputs_embeds
1501
+ == self.get_input_embeddings()(
1502
+ torch.tensor(video_token_id, dtype=torch.long, device=inputs_embeds.device)
1503
+ )
1504
+ )[..., 0]
1505
+ else:
1506
+ vision_start_mask = input_ids == vision_start_token_id
1507
+ image_mask = input_ids == image_token_id
1508
+ video_mask = input_ids == video_token_id
1509
+
1510
+ vision_first_mask = torch.roll(vision_start_mask, shifts=1, dims=1)
1511
+ image_nums = torch.sum(vision_first_mask & image_mask, dim=1)
1512
+ video_nums = torch.sum(vision_first_mask & video_mask, dim=1)
1513
+
1514
+ return image_nums, video_nums
1515
+
1516
+ def _expand_inputs_for_generation(
1517
+ self,
1518
+ expand_size: int = 1,
1519
+ is_encoder_decoder: bool = False,
1520
+ input_ids: Optional[torch.LongTensor] = None,
1521
+ **model_kwargs,
1522
+ ) -> tuple[torch.LongTensor, dict[str, Any]]:
1523
+ # Overwritten -- Support for expanding tensors without a batch size dimension
1524
+ # e.g., pixel_values, image_grid_thw, pixel_values_videos, video_grid_thw, second_per_grid_t
1525
+ # pixel_values.shape[0] is sum(seqlen_images for samples)
1526
+ # image_grid_thw.shape[0] is sum(num_images for samples)
1527
+
1528
+ if expand_size == 1:
1529
+ return input_ids, model_kwargs
1530
+
1531
+ visual_keys = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]
1532
+
1533
+ def _expand_dict_for_generation_visual(dict_to_expand):
1534
+ image_grid_thw = model_kwargs.get("image_grid_thw", None)
1535
+ video_grid_thw = model_kwargs.get("video_grid_thw", None)
1536
+ image_nums, video_nums = self._get_image_nums_and_video_nums(
1537
+ input_ids, inputs_embeds=model_kwargs.get("inputs_embeds", None)
1538
+ )
1539
+
1540
+ # video_nums: (batch_size,)
1541
+ # since video_nums is derived from the input_ids (it counts vision_start tokens),
1542
+ # but Qwen3VL inserts a vision_start before every frame of every video, so we recover the real video_nums from video_grid_thw
1543
+ if video_grid_thw is not None:
1544
+ cumulative_frame_counts = torch.cumsum(video_grid_thw[:, 0], dim=0)
1545
+ cumulative_token_video_counts = torch.cumsum(video_nums, dim=0)
1546
+ # Find video boundaries in cumulative_frame_counts
1547
+ video_boundary_indices = torch.searchsorted(cumulative_frame_counts, cumulative_token_video_counts)
1548
+ # example: video_boundary_indices = [3, 5] means video_nums = [4, 2]
1549
+ video_nums = torch.diff(torch.cat([-video_boundary_indices.new_ones(1), video_boundary_indices]))
1550
+
1551
+ def _repeat_interleave_samples(x, lengths, repeat_times):
1552
+ samples = torch.split(x, lengths)
1553
+ repeat_args = [repeat_times] + [1] * (x.dim() - 1)
1554
+ result = torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0)
1555
+ return result
1556
+
1557
+ for key in dict_to_expand:
1558
+ if key == "pixel_values":
1559
+ # split images into samples
1560
+ samples = torch.split(image_grid_thw, list(image_nums))
1561
+ # compute the sequence length of images for each sample
1562
+ lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
1563
+ dict_to_expand[key] = _repeat_interleave_samples(
1564
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1565
+ )
1566
+ elif key == "image_grid_thw":
1567
+ # get the num of images for each sample
1568
+ lengths = list(image_nums)
1569
+ dict_to_expand[key] = _repeat_interleave_samples(
1570
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1571
+ )
1572
+ elif key == "pixel_values_videos":
1573
+ samples = torch.split(video_grid_thw, list(video_nums))
1574
+ lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
1575
+ dict_to_expand[key] = _repeat_interleave_samples(
1576
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1577
+ )
1578
+ elif key == "video_grid_thw":
1579
+ lengths = list(video_nums)
1580
+ dict_to_expand[key] = _repeat_interleave_samples(
1581
+ dict_to_expand[key], lengths=lengths, repeat_times=expand_size
1582
+ )
1583
+ return dict_to_expand
1584
+
1585
+ def _expand_dict_for_generation(dict_to_expand):
1586
+ for key in dict_to_expand:
1587
+ if (
1588
+ key != "cache_position"
1589
+ and dict_to_expand[key] is not None
1590
+ and isinstance(dict_to_expand[key], torch.Tensor)
1591
+ and key not in visual_keys
1592
+ ):
1593
+ dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
1594
+ return dict_to_expand
1595
+
1596
+ model_kwargs = _expand_dict_for_generation_visual(model_kwargs)
1597
+
1598
+ if input_ids is not None:
1599
+ input_ids = input_ids.repeat_interleave(expand_size, dim=0)
1600
+
1601
+ model_kwargs = _expand_dict_for_generation(model_kwargs)
1602
+
1603
+ if is_encoder_decoder:
1604
+ if model_kwargs.get("encoder_outputs") is None:
1605
+ raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
1606
+ model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
1607
+
1608
+ return input_ids, model_kwargs
1609
+
1610
+
1611
+ __all__ = [
1612
+ "Qwen3VLVisionModel",
1613
+ "Qwen3VLForConditionalGeneration",
1614
+ "Qwen3VLModel",
1615
+ "Qwen3VLPreTrainedModel",
1616
+ "Qwen3VLTextModel",
1617
+ ]
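
The cumsum/searchsorted recovery of the real per-sample video counts in `_expand_dict_for_generation_visual` above is compact, so here is a small self-contained sketch of the same step with made-up frame counts (the tensors below are illustrative, not taken from this checkpoint):

import torch

# Hypothetical batch: sample 1 holds two videos of 3 and 2 frames, sample 2 holds one video of 4 frames.
video_grid_thw = torch.tensor([[3, 2, 2], [2, 2, 2], [4, 2, 2]])  # one row per video: (t, h, w)
video_nums = torch.tensor([5, 4])  # vision_start-derived counts are per frame: 3 + 2 and 4

cumulative_frame_counts = torch.cumsum(video_grid_thw[:, 0], dim=0)         # tensor([3, 5, 9])
cumulative_token_video_counts = torch.cumsum(video_nums, dim=0)             # tensor([5, 9])
video_boundary_indices = torch.searchsorted(cumulative_frame_counts,
                                            cumulative_token_video_counts)  # tensor([1, 2])
real_video_nums = torch.diff(
    torch.cat([-video_boundary_indices.new_ones(1), video_boundary_indices]))
print(real_video_nums)  # tensor([2, 1]): two videos in sample 1, one in sample 2
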
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
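
For reference, a minimal sketch (the local path below is an assumption) of how the special tokens declared above resolve once this folder is loaded with the standard tokenizer API:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("./path/to/this/checkpoint")  # hypothetical local clone of this repo
print(tok.eos_token, tok.eos_token_id)              # "<|im_end|>" and its id
print(tok.pad_token, tok.pad_token_id)              # "<|endoftext|>" and its id
print(tok.convert_tokens_to_ids("<|image_pad|>"))   # id of the image placeholder token
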
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78113b4ebba2cf35807c8b5277d635e4940fee06c39a0eda6d913c7c7f9edbf1
3
+ size 11815343
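
tokenizer.json is stored as a Git LFS pointer (spec version, sha256 oid, byte size) rather than the file contents. A small sketch, assuming the blob has been resolved to a local file, of verifying a download against the pointer above:

import hashlib
import os

path = "tokenizer.json"  # hypothetical local copy of the resolved LFS blob
digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)
print(digest.hexdigest())     # should match the hex after "sha256:" in the pointer
print(os.path.getsize(path))  # should match the recorded size
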
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
trainer_state.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1475be9f09ec148da85bbe25c4595c6416527ff01e26fb2976cc14377b5c397d
3
+ size 11351594
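
trainer_state.json (also an LFS pointer here) is the Trainer's saved state. A hedged sketch of inspecting it, assuming the blob has been resolved locally; the keys below are the usual Trainer fields, not verified against this particular file:

import json

with open("trainer_state.json") as f:  # hypothetical resolved local copy
    state = json.load(f)
print(state.get("global_step"))
for entry in state.get("log_history", [])[:3]:
    print(entry)  # typically per-step records with loss, learning_rate, epoch
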
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c81a80ebcd627171a70a382e22a64c162c34370fa9d42260e3bf782beb3383ae
3
+ size 7121
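
training_args.bin is the pickled TrainingArguments object saved by the Trainer. A hedged sketch of inspecting it (it is a pickle, so it needs weights_only=False and a compatible transformers install, and should only be loaded if you trust the file):

import torch

args = torch.load("training_args.bin", weights_only=False)  # hypothetical local copy
print(type(args).__name__)  # typically TrainingArguments or a subclass
print(args.learning_rate, args.per_device_train_batch_size)
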
vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
zero_to_fp32.py ADDED
@@ -0,0 +1,760 @@
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from ZeRO 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example:
14
+ # python zero_to_fp32.py . output_dir/
15
+ # or
16
+ # python zero_to_fp32.py . output_dir/ --safe_serialization
17
+
18
+ import argparse
19
+ import torch
20
+ import glob
21
+ import math
22
+ import os
23
+ import re
24
+ import gc
25
+ import json
26
+ import numpy as np
27
+ from tqdm import tqdm
28
+ from collections import OrderedDict
29
+ from dataclasses import dataclass
30
+
31
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
32
+ # DeepSpeed data structures it has to be available in the current python environment.
33
+ from deepspeed.utils import logger
34
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
35
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
36
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
37
+
38
+
39
+ @dataclass
40
+ class zero_model_state:
41
+ buffers: dict()
42
+ param_shapes: dict()
43
+ shared_params: list
44
+ ds_version: int
45
+ frozen_param_shapes: dict()
46
+ frozen_param_fragments: dict()
47
+
48
+
49
+ debug = 0
50
+
51
+ # load to cpu
52
+ device = torch.device('cpu')
53
+
54
+
55
+ def atoi(text):
56
+ return int(text) if text.isdigit() else text
57
+
58
+
59
+ def natural_keys(text):
60
+ '''
61
+ alist.sort(key=natural_keys) sorts in human order
62
+ http://nedbatchelder.com/blog/200712/human_sorting.html
63
+ (See Toothy's implementation in the comments)
64
+ '''
65
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
66
+
67
+
68
+ def get_model_state_file(checkpoint_dir, zero_stage):
69
+ if not os.path.isdir(checkpoint_dir):
70
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
71
+
72
+ # there should be only one file
73
+ if zero_stage <= 2:
74
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
75
+ elif zero_stage == 3:
76
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
77
+
78
+ if not os.path.exists(file):
79
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
80
+
81
+ return file
82
+
83
+
84
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
85
+ # XXX: need to test that this simple glob rule works for multi-node setup too
86
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
87
+
88
+ if len(ckpt_files) == 0:
89
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
90
+
91
+ return ckpt_files
92
+
93
+
94
+ def get_optim_files(checkpoint_dir):
95
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
96
+
97
+
98
+ def get_model_state_files(checkpoint_dir):
99
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
100
+
101
+
102
+ def parse_model_states(files):
103
+ zero_model_states = []
104
+ for file in files:
105
+ state_dict = torch.load(file, map_location=device, weights_only=False)
106
+
107
+ if BUFFER_NAMES not in state_dict:
108
+ raise ValueError(f"{file} is not a model state checkpoint")
109
+ buffer_names = state_dict[BUFFER_NAMES]
110
+ if debug:
111
+ print("Found buffers:", buffer_names)
112
+
113
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
114
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
115
+ param_shapes = state_dict[PARAM_SHAPES]
116
+
117
+ # collect parameters that are included in param_shapes
118
+ param_names = []
119
+ for s in param_shapes:
120
+ for name in s.keys():
121
+ param_names.append(name)
122
+
123
+ # update with frozen parameters
124
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
125
+ if frozen_param_shapes is not None:
126
+ if debug:
127
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
128
+ param_names += list(frozen_param_shapes.keys())
129
+
130
+ # handle shared params
131
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
132
+
133
+ ds_version = state_dict.get(DS_VERSION, None)
134
+
135
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
136
+
137
+ z_model_state = zero_model_state(buffers=buffers,
138
+ param_shapes=param_shapes,
139
+ shared_params=shared_params,
140
+ ds_version=ds_version,
141
+ frozen_param_shapes=frozen_param_shapes,
142
+ frozen_param_fragments=frozen_param_fragments)
143
+ zero_model_states.append(z_model_state)
144
+
145
+ return zero_model_states
146
+
147
+
148
+ def parse_optim_states(files, ds_checkpoint_dir):
149
+ total_files = len(files)
150
+ state_dicts = []
151
+ for f in tqdm(files, desc='Loading checkpoint shards'):
152
+ state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
153
+ # immediately discard the two potentially huge optimizer states, as we only care about the fp32 master weights
154
+ # and also handle the case where it was already removed by another helper script
155
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
156
+ state_dicts.append(state_dict)
157
+
158
+ if ZERO_STAGE not in state_dicts[0][OPTIMIZER_STATE_DICT]:
159
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
160
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
161
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
162
+
163
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
164
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
165
+ # use the max of the partition_count to get the dp world_size.
166
+
167
+ if type(world_size) is list:
168
+ world_size = max(world_size)
169
+
170
+ if world_size != total_files:
171
+ raise ValueError(
172
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
173
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
174
+ )
175
+
176
+ # the groups are named differently in each stage
177
+ if zero_stage <= 2:
178
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
179
+ elif zero_stage == 3:
180
+ fp32_groups_key = FP32_FLAT_GROUPS
181
+ else:
182
+ raise ValueError(f"unknown zero stage {zero_stage}")
183
+
184
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
185
+ return zero_stage, world_size, fp32_flat_groups
186
+
187
+
188
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
189
+ """
190
+ Returns fp32 state_dict reconstructed from ds checkpoint
191
+
192
+ Args:
193
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
194
+
195
+ """
196
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
197
+
198
+ optim_files = get_optim_files(ds_checkpoint_dir)
199
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
200
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
201
+
202
+ model_files = get_model_state_files(ds_checkpoint_dir)
203
+
204
+ zero_model_states = parse_model_states(model_files)
205
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
206
+
207
+ if zero_stage <= 2:
208
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
209
+ exclude_frozen_parameters)
210
+ elif zero_stage == 3:
211
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
212
+ exclude_frozen_parameters)
213
+
214
+
215
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
216
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
217
+ return
218
+
219
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
220
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
221
+
222
+ if debug:
223
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
224
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
225
+
226
+ wanted_params = len(frozen_param_shapes)
227
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
228
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
229
+ print(f'Frozen params: Have {avail_numel} numels to process.')
230
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
231
+
232
+ total_params = 0
233
+ total_numel = 0
234
+ for name, shape in frozen_param_shapes.items():
235
+ total_params += 1
236
+ unpartitioned_numel = shape.numel()
237
+ total_numel += unpartitioned_numel
238
+
239
+ state_dict[name] = frozen_param_fragments[name]
240
+
241
+ if debug:
242
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
243
+
244
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
245
+
246
+
247
+ def _has_callable(obj, fn):
248
+ attr = getattr(obj, fn, None)
249
+ return callable(attr)
250
+
251
+
252
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
253
+ param_shapes = zero_model_states[0].param_shapes
254
+
255
+ # Reconstruction protocol:
256
+ #
257
+ # XXX: document this
258
+
259
+ if debug:
260
+ for i in range(world_size):
261
+ for j in range(len(fp32_flat_groups[0])):
262
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
263
+
264
+ # XXX: memory usage doubles here (zero2)
265
+ num_param_groups = len(fp32_flat_groups[0])
266
+ merged_single_partition_of_fp32_groups = []
267
+ for i in range(num_param_groups):
268
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
269
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
270
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
271
+ avail_numel = sum(
272
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
273
+
274
+ if debug:
275
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
276
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
277
+ # not asserting if there is a mismatch due to possible padding
278
+ print(f"Have {avail_numel} numels to process.")
279
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
280
+
281
+ # params
282
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
283
+ # out-of-core computing solution
284
+ total_numel = 0
285
+ total_params = 0
286
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
287
+ offset = 0
288
+ avail_numel = full_single_fp32_vector.numel()
289
+ for name, shape in shapes.items():
290
+
291
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
292
+ total_numel += unpartitioned_numel
293
+ total_params += 1
294
+
295
+ if debug:
296
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
297
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
298
+ offset += unpartitioned_numel
299
+
300
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
301
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
302
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
303
+ # live optimizer object, so we are checking that the numbers are within the right range
304
+ align_to = 2 * world_size
305
+
306
+ def zero2_align(x):
307
+ return align_to * math.ceil(x / align_to)
308
+
309
+ if debug:
310
+ print(f"original offset={offset}, avail_numel={avail_numel}")
311
+
312
+ offset = zero2_align(offset)
313
+ avail_numel = zero2_align(avail_numel)
314
+
315
+ if debug:
316
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
317
+
318
+ # Sanity check
319
+ if offset != avail_numel:
320
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
321
+
322
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
323
+
324
+
325
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
326
+ exclude_frozen_parameters):
327
+ state_dict = OrderedDict()
328
+
329
+ # buffers
330
+ buffers = zero_model_states[0].buffers
331
+ state_dict.update(buffers)
332
+ if debug:
333
+ print(f"added {len(buffers)} buffers")
334
+
335
+ if not exclude_frozen_parameters:
336
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
337
+
338
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
339
+
340
+ # recover shared parameters
341
+ for pair in zero_model_states[0].shared_params:
342
+ if pair[1] in state_dict:
343
+ state_dict[pair[0]] = state_dict[pair[1]]
344
+
345
+ return state_dict
346
+
347
+
348
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
349
+ remainder = unpartitioned_numel % world_size
350
+ padding_numel = (world_size - remainder) if remainder else 0
351
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
352
+ return partitioned_numel, padding_numel
353
+
354
+
355
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
356
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
357
+ return
358
+
359
+ if debug:
360
+ for i in range(world_size):
361
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
362
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
363
+
364
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
365
+ wanted_params = len(frozen_param_shapes)
366
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
367
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
368
+ print(f'Frozen params: Have {avail_numel} numels to process.')
369
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
370
+
371
+ total_params = 0
372
+ total_numel = 0
373
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
374
+ total_params += 1
375
+ unpartitioned_numel = shape.numel()
376
+ total_numel += unpartitioned_numel
377
+
378
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
379
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
380
+
381
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
382
+
383
+ if debug:
384
+ print(
385
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
386
+ )
387
+
388
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
389
+
390
+
391
+ class GatheredTensor:
392
+ """
393
+ A pseudo tensor that collects partitioned weights.
394
+ It is more memory efficient when there are multiple groups.
395
+ """
396
+
397
+ def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
398
+ self.flat_groups = flat_groups
399
+ self.flat_groups_offset = flat_groups_offset
400
+ self.offset = offset
401
+ self.partitioned_numel = partitioned_numel
402
+ self.shape = shape
403
+ self.dtype = self.flat_groups[0][0].dtype
404
+
405
+ def contiguous(self):
406
+ """
407
+ Merge partitioned weights from flat_groups into a single tensor.
408
+ """
409
+ end_idx = self.offset + self.partitioned_numel
410
+ world_size = len(self.flat_groups)
411
+ pad_flat_param_chunks = []
412
+
413
+ for rank_i in range(world_size):
414
+ # for each rank, we need to collect weights from related group/groups
415
+ flat_groups_at_rank_i = self.flat_groups[rank_i]
416
+ start_group_id = None
417
+ end_group_id = None
418
+ for group_id in range(len(self.flat_groups_offset)):
419
+ if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
420
+ start_group_id = group_id
421
+ if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
422
+ end_group_id = group_id
423
+ break
424
+ # collect weights from related group/groups
425
+ for group_id in range(start_group_id, end_group_id + 1):
426
+ flat_tensor = flat_groups_at_rank_i[group_id]
427
+ start_offset = self.offset - self.flat_groups_offset[group_id]
428
+ end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
429
+ pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
430
+
431
+ # collect weights from all ranks
432
+ pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
433
+ param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
434
+ return param
435
+
436
+
437
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
438
+ param_shapes = zero_model_states[0].param_shapes
439
+ avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
440
+
441
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
442
+ # param, re-consolidating each param, while dealing with padding if any
443
+
444
+ # merge list of dicts, preserving order
445
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
446
+
447
+ if debug:
448
+ for i in range(world_size):
449
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
450
+
451
+ wanted_params = len(param_shapes)
452
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
453
+ # not asserting if there is a mismatch due to possible padding
454
+ avail_numel = fp32_flat_groups[0].numel() * world_size
455
+ print(f"Trainable params: Have {avail_numel} numels to process.")
456
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
457
+
458
+ # params
459
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
460
+ # out-of-core computing solution
461
+ offset = 0
462
+ total_numel = 0
463
+ total_params = 0
464
+ flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
465
+ for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
466
+ unpartitioned_numel = shape.numel()
467
+ total_numel += unpartitioned_numel
468
+ total_params += 1
469
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
470
+
471
+ if debug:
472
+ print(
473
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
474
+ )
475
+
476
+ # memory efficient tensor
477
+ tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
478
+ state_dict[name] = tensor
479
+ offset += partitioned_numel
480
+
481
+ offset *= world_size
482
+
483
+ # Sanity check
484
+ if offset != avail_numel:
485
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
486
+
487
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
488
+
489
+
490
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
491
+ exclude_frozen_parameters):
492
+ state_dict = OrderedDict()
493
+
494
+ # buffers
495
+ buffers = zero_model_states[0].buffers
496
+ state_dict.update(buffers)
497
+ if debug:
498
+ print(f"added {len(buffers)} buffers")
499
+
500
+ if not exclude_frozen_parameters:
501
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
502
+
503
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
504
+
505
+ # recover shared parameters
506
+ for pair in zero_model_states[0].shared_params:
507
+ if pair[1] in state_dict:
508
+ state_dict[pair[0]] = state_dict[pair[1]]
509
+
510
+ return state_dict
511
+
512
+
513
+ def to_torch_tensor(state_dict, return_empty_tensor=False):
514
+ """
515
+ Convert state_dict of GatheredTensor to torch tensor
516
+ """
517
+ torch_state_dict = {}
518
+ converted_tensors = {}
519
+ for name, tensor in state_dict.items():
520
+ tensor_id = id(tensor)
521
+ if tensor_id in converted_tensors: # shared tensors
522
+ shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
523
+ torch_state_dict[name] = shared_tensor
524
+ else:
525
+ converted_tensors[tensor_id] = name
526
+ if return_empty_tensor:
527
+ torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
528
+ else:
529
+ torch_state_dict[name] = tensor.contiguous()
530
+ return torch_state_dict
531
+
532
+
533
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
534
+ tag=None,
535
+ exclude_frozen_parameters=False,
536
+ lazy_mode=False):
537
+ """
538
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
539
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
540
+ via a model hub.
541
+
542
+ Args:
543
+ - ``checkpoint_dir``: path to the desired checkpoint folder
544
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
545
+ - ``exclude_frozen_parameters``: exclude frozen parameters
546
+ - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pesduo tensor instead of torch tensor, which is more memory efficient.
547
+ Convert the pesduo tensor to torch tensor by ``.contiguous()``
548
+
549
+ Returns:
550
+ - pytorch ``state_dict``
551
+
552
+ A typical usage might be ::
553
+
554
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
555
+ # do the training and checkpoint saving
556
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
557
+ model = model.cpu() # move to cpu
558
+ model.load_state_dict(state_dict)
559
+ # submit to model hub or save the model to share with others
560
+
561
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
562
+ application. i.e. you will need to re-initialize the deepspeed engine, since
563
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
564
+
565
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
566
+
567
+ Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
568
+ You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
569
+ the checkpoint. Or you can load state_dict in lazy mode ::
570
+
571
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
572
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
573
+ for name, lazy_tensor in state_dict.items():
574
+ tensor = lazy_tensor.contiguous() # to cpu
575
+ print(name, tensor)
576
+ # del tensor to release memory if it is no longer in use
577
+ """
578
+ if tag is None:
579
+ latest_path = os.path.join(checkpoint_dir, 'latest')
580
+ if os.path.isfile(latest_path):
581
+ with open(latest_path, 'r') as fd:
582
+ tag = fd.read().strip()
583
+ else:
584
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
585
+
586
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
587
+
588
+ if not os.path.isdir(ds_checkpoint_dir):
589
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
590
+
591
+ state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
592
+ if lazy_mode:
593
+ return state_dict
594
+ else:
595
+ return to_torch_tensor(state_dict)
596
+
597
+
598
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
599
+ output_dir,
600
+ max_shard_size="5GB",
601
+ safe_serialization=False,
602
+ tag=None,
603
+ exclude_frozen_parameters=False):
604
+ """
605
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
606
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
607
+
608
+ Args:
609
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
610
+ - ``output_dir``: directory to the pytorch fp32 state_dict output files
611
+ - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
612
+ - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
613
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
614
+ - ``exclude_frozen_parameters``: exclude frozen parameters
615
+ """
616
+
617
+ # Dependency pre-check
618
+ if safe_serialization:
619
+ try:
620
+ from safetensors.torch import save_file
621
+ except ImportError:
622
+ print('If you want to use `safe_serialization`, please `pip install safetensors`')
623
+ raise
624
+ if max_shard_size is not None:
625
+ try:
626
+ from huggingface_hub import split_torch_state_dict_into_shards
627
+ except ImportError:
628
+ print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
629
+ raise
630
+
631
+ # Convert zero checkpoint to state_dict
632
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
633
+ tag,
634
+ exclude_frozen_parameters,
635
+ lazy_mode=True)
636
+
637
+ # Shard the model if it is too big.
638
+ weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
639
+ if max_shard_size is not None:
640
+ filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
641
+ # a memory-efficient approach for sharding
642
+ empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
643
+ state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
644
+ filename_pattern=filename_pattern,
645
+ max_shard_size=max_shard_size)
646
+ else:
647
+ from collections import namedtuple
648
+ StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
649
+ state_dict_split = StateDictSplit(is_sharded=False,
650
+ filename_to_tensors={weights_name: list(state_dict.keys())})
651
+
652
+ # Save the model by shard
653
+ os.makedirs(output_dir, exist_ok=True)
654
+ filename_to_tensors = state_dict_split.filename_to_tensors.items()
655
+ for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
656
+ shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
657
+ shard_state_dict = to_torch_tensor(shard_state_dict)
658
+ output_path = os.path.join(output_dir, shard_file)
659
+ if safe_serialization:
660
+ save_file(shard_state_dict, output_path, metadata={"format": "pt"})
661
+ else:
662
+ torch.save(shard_state_dict, output_path)
663
+ # release the memory of current shard
664
+ for tensor_name in list(shard_state_dict.keys()):
665
+ del state_dict[tensor_name]
666
+ del shard_state_dict[tensor_name]
667
+ del shard_state_dict
668
+ gc.collect()
669
+
670
+ # Save index if sharded
671
+ if state_dict_split.is_sharded:
672
+ index = {
673
+ "metadata": state_dict_split.metadata,
674
+ "weight_map": state_dict_split.tensor_to_filename,
675
+ }
676
+ save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
677
+ save_index_file = os.path.join(output_dir, save_index_file)
678
+ with open(save_index_file, "w", encoding="utf-8") as f:
679
+ content = json.dumps(index, indent=2, sort_keys=True) + "\n"
680
+ f.write(content)
681
+
682
+
683
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
684
+ """
685
+ 1. Put the provided model to cpu
686
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
687
+ 3. Load it into the provided model
688
+
689
+ Args:
690
+ - ``model``: the model object to update
691
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
692
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
693
+
694
+ Returns:
695
+ - ``model``: modified model
696
+
697
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
698
+ have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
699
+ conveniently placed for you in the checkpoint folder.
700
+
701
+ A typical usage might be ::
702
+
703
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
704
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
705
+ # submit to model hub or save the model to share with others
706
+
707
+ Note that once this was run, the ``model`` will no longer be usable in the deepspeed context
708
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
709
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
710
+
711
+ """
712
+ logger.info("Extracting fp32 weights")
713
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
714
+
715
+ logger.info("Overwriting model with fp32 weights")
716
+ model = model.cpu()
717
+ model.load_state_dict(state_dict, strict=False)
718
+
719
+ return model
720
+
721
+
722
+ if __name__ == "__main__":
723
+ parser = argparse.ArgumentParser()
724
+ parser.add_argument("checkpoint_dir",
725
+ type=str,
726
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
727
+ parser.add_argument("output_dir",
728
+ type=str,
729
+ help="directory to the pytorch fp32 state_dict output files"
730
+ "(e.g. path/checkpoint-12-output/)")
731
+ parser.add_argument(
732
+ "--max_shard_size",
733
+ type=str,
734
+ default="5GB",
735
+ help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size"
736
+ "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`"
737
+ "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances"
738
+ "without CPU OOM issues.")
739
+ parser.add_argument(
740
+ "--safe_serialization",
741
+ default=False,
742
+ action='store_true',
743
+ help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
744
+ parser.add_argument("-t",
745
+ "--tag",
746
+ type=str,
747
+ default=None,
748
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
749
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
750
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
751
+ args = parser.parse_args()
752
+
753
+ debug = args.debug
754
+
755
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
756
+ args.output_dir,
757
+ max_shard_size=args.max_shard_size,
758
+ safe_serialization=args.safe_serialization,
759
+ tag=args.tag,
760
+ exclude_frozen_parameters=args.exclude_frozen_parameters)
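
As a closing illustration of the padding arithmetic used by zero3_partitioned_param_info above, a tiny worked example with made-up numbers:

import math

def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    # same arithmetic as in the script above
    remainder = unpartitioned_numel % world_size
    padding_numel = (world_size - remainder) if remainder else 0
    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
    return partitioned_numel, padding_numel

# A 10-element parameter sharded across 4 ZeRO-3 ranks: each rank stores 3 elements,
# 4 * 3 = 12 slots in total, so 2 of them are padding.
print(zero3_partitioned_param_info(10, 4))  # (3, 2)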