zstriving committed on May 12

Commit

f313e88

verified ·

1 Parent(s): 113b391

Add files using upload-large-folder tool

Browse files

Files changed (24) hide show

.gitattributes +1 -0
README.md +88 -0
added_tokens.json +2081 -0
chat_template.jinja +120 -0
config.json +155 -0
configuration_prts_qwen3_vl.py +345 -0
dit_action_head.py +1230 -0
generation_config.json +12 -0
merges.txt +0 -0
model-00001-of-00003.safetensors +3 -0
model-00002-of-00003.safetensors +3 -0
model-00003-of-00003.safetensors +3 -0
model.safetensors.index.json +0 -0
modeling_prts_qwen3_vl.py +935 -0
modeling_qwen3_vl.py +1645 -0
preprocessor_config.json +39 -0
processing_prts_qwen3_vl.py +352 -0
special_tokens_map.json +31 -0
statistics.json +333 -0
tokenizer.json +3 -0
tokenizer_config.json +0 -0
training_args.bin +3 -0
video_preprocessor_config.json +41 -0
vocab.json +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,88 @@

+---
+license: cc-by-nc-4.0
+library_name: transformers
+pipeline_tag: robotics
+tags:
+- robotics
+- vision-language-action
+- vla
+- libero
+- qwen3-vl
+- prts
+- custom_code
+language:
+- en
+base_model: TeleEmbodied/PRTS-4B
+---
+<h1 align="center">PRTS-4B-LIBERO</h1>
+<p align="center">
+  <a href="https://arxiv.org/abs/2604.27472"><img src="https://img.shields.io/badge/arXiv-2604.27472-b31b1b.svg" alt="arXiv"></a>
+  &nbsp;
+  <a href="https://github.com/TeleHuman/PRTS"><img src="https://img.shields.io/badge/GitHub-PRTS-181717.svg" alt="GitHub"></a>
+  &nbsp;
+  <a href="https://huggingface.co/TeleEmbodied/PRTS-4B"><img src="https://img.shields.io/badge/Base-PRTS--4B-yellow.svg" alt="Base model"></a>
+</p>
+**PRTS-4B-LIBERO** is the LIBERO fine-tuned variant of [`TeleEmbodied/PRTS-4B`](https://huggingface.co/TeleEmbodied/PRTS-4B). This is the exact checkpoint used to report the LIBERO numbers in the PRTS paper. For the base model card (architecture, prompt format, contrastive RL design), please refer to the parent [PRTS-4B](https://huggingface.co/TeleEmbodied/PRTS-4B) repository.
+## Post-training budget
+Fine-tuned from `TeleEmbodied/PRTS-4B` with the launch script [`scripts/ft/launch_finetune.sh`](https://github.com/TeleHuman/PRTS/blob/main/scripts/ft/launch_finetune.sh) in the open-source repo. Key settings:
+| Setting | Value |
+| :--- | :--- |
+| Base model | `TeleEmbodied/PRTS-4B` |
+| Dataset config | `configs/post-train/libero.yaml` |
+| Embodiment tag | `libero_panda` |
+| Hardware | 4 GPUs, DeepSpeed ZeRO-2, bf16, `flash_attention_3`, no gradient checkpointing |
+| Steps | 30,000 total, 5,000 warmup, save every 10,000 |
+| Effective batch | 8 (per-device) × 4 GPUs × 1 (grad-acc) = **32** |
+| LRs | `1e-5` for vision / merger / LLM; `1e-4` for the action head |
+| Scheduler | `cosine_with_min_lr` (min `1e-6`) |
+| Optimizer | AdamW (β1=0.9, β2=0.95, ε=1e-8), weight decay `1e-8`, grad clip `1.0` |
+| Action head | DiT-L + MoT action expert, chunk size `20`, max action dim `32` |
+| Action normalization | `QUANTILE` (stats bundled in this checkpoint) |
+| Seed | 42 |
+## Loading for evaluation
+This checkpoint plugs into the policy server [`scripts/serve_policy.py`](https://github.com/TeleHuman/PRTS/blob/main/scripts/serve_policy.py). Update the `EnvMode.LIBERO` entry in `DEFAULT_CHECKPOINT` so that `dir=` points to your local download of this repo. Normalization stats are already bundled in the checkpoint, so `dataset_path` can be left as `None`:
+```python
+EnvMode.LIBERO: Checkpoint(
+    config="prts_libero",
+    dir="/path/to/PRTS-4B-libero",  # local download path of this repo
+    action_dim=7,
+    dataset_path=None,  # normalization stats are bundled in the checkpoint
+    state_mode="QUANTILE",
+),
+```
+## Running LIBERO evaluation
+Follow the LIBERO simulation setup in [`examples/libero/README.md`](https://github.com/TeleHuman/PRTS/blob/main/examples/libero/README.md), then start the policy server from the PRTS repo root with [`examples/libero/run_libero_server.sh`](https://github.com/TeleHuman/PRTS/blob/main/examples/libero/run_libero_server.sh):
+```bash
+bash examples/libero/run_libero_server.sh
+# which runs:
+# CUDA_VISIBLE_DEVICES=0 python scripts/serve_policy.py --env LIBERO --port 10000
+```
+The LIBERO simulator (Terminal 1 in the example README) connects to this server over a WebSocket connection and rolls out the four LIBERO task suites.
+## License
+Released under [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) — free for academic and non-commercial research; commercial use is **not** permitted.
+## Citation
+```bibtex
+@article{zhang2026prts,
+  title   = {PRTS: A Primitive Reasoning and Tasking System via Contrastive Representations},
+  author  = {Yang Zhang and Jiangyuan Zhao and Chenyou Fan and Fangzheng Yan and Tian Li and Haitong Tang and Sen Fu and Xuan'er Wu and Qizhen Weng and Weinan Zhang and Xiu Li and Chi Zhang and Chenjia Bai and Xuelong Li},
+  journal = {arXiv preprint arXiv:2604.27472},
+  year    = {2026},
+}
+```

added_tokens.json ADDED Viewed

	@@ -0,0 +1,2081 @@

+{
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|action_end|>": 151671,
+  "<|action_pad|>": 151670,
+  "<|action_start|>": 151669,
+  "<|action_token_0|>": 151674,
+  "<|action_token_1000|>": 152674,
+  "<|action_token_1001|>": 152675,
+  "<|action_token_1002|>": 152676,
+  "<|action_token_1003|>": 152677,
+  "<|action_token_1004|>": 152678,
+  "<|action_token_1005|>": 152679,
+  "<|action_token_1006|>": 152680,
+  "<|action_token_1007|>": 152681,
+  "<|action_token_1008|>": 152682,
+  "<|action_token_1009|>": 152683,
+  "<|action_token_100|>": 151774,
+  "<|action_token_1010|>": 152684,
+  "<|action_token_1011|>": 152685,
+  "<|action_token_1012|>": 152686,
+  "<|action_token_1013|>": 152687,
+  "<|action_token_1014|>": 152688,
+  "<|action_token_1015|>": 152689,
+  "<|action_token_1016|>": 152690,
+  "<|action_token_1017|>": 152691,
+  "<|action_token_1018|>": 152692,
+  "<|action_token_1019|>": 152693,
+  "<|action_token_101|>": 151775,
+  "<|action_token_1020|>": 152694,
+  "<|action_token_1021|>": 152695,
+  "<|action_token_1022|>": 152696,
+  "<|action_token_1023|>": 152697,
+  "<|action_token_1024|>": 152698,
+  "<|action_token_1025|>": 152699,
+  "<|action_token_1026|>": 152700,
+  "<|action_token_1027|>": 152701,
+  "<|action_token_1028|>": 152702,
+  "<|action_token_1029|>": 152703,
+  "<|action_token_102|>": 151776,
+  "<|action_token_1030|>": 152704,
+  "<|action_token_1031|>": 152705,
+  "<|action_token_1032|>": 152706,
+  "<|action_token_1033|>": 152707,
+  "<|action_token_1034|>": 152708,
+  "<|action_token_1035|>": 152709,
+  "<|action_token_1036|>": 152710,
+  "<|action_token_1037|>": 152711,
+  "<|action_token_1038|>": 152712,
+  "<|action_token_1039|>": 152713,
+  "<|action_token_103|>": 151777,
+  "<|action_token_1040|>": 152714,
+  "<|action_token_1041|>": 152715,
+  "<|action_token_1042|>": 152716,
+  "<|action_token_1043|>": 152717,
+  "<|action_token_1044|>": 152718,
+  "<|action_token_1045|>": 152719,
+  "<|action_token_1046|>": 152720,
+  "<|action_token_1047|>": 152721,
+  "<|action_token_1048|>": 152722,
+  "<|action_token_1049|>": 152723,
+  "<|action_token_104|>": 151778,
+  "<|action_token_1050|>": 152724,
+  "<|action_token_1051|>": 152725,
+  "<|action_token_1052|>": 152726,
+  "<|action_token_1053|>": 152727,
+  "<|action_token_1054|>": 152728,
+  "<|action_token_1055|>": 152729,
+  "<|action_token_1056|>": 152730,
+  "<|action_token_1057|>": 152731,
+  "<|action_token_1058|>": 152732,
+  "<|action_token_1059|>": 152733,
+  "<|action_token_105|>": 151779,
+  "<|action_token_1060|>": 152734,
+  "<|action_token_1061|>": 152735,
+  "<|action_token_1062|>": 152736,
+  "<|action_token_1063|>": 152737,
+  "<|action_token_1064|>": 152738,
+  "<|action_token_1065|>": 152739,
+  "<|action_token_1066|>": 152740,
+  "<|action_token_1067|>": 152741,
+  "<|action_token_1068|>": 152742,
+  "<|action_token_1069|>": 152743,
+  "<|action_token_106|>": 151780,
+  "<|action_token_1070|>": 152744,
+  "<|action_token_1071|>": 152745,
+  "<|action_token_1072|>": 152746,
+  "<|action_token_1073|>": 152747,
+  "<|action_token_1074|>": 152748,
+  "<|action_token_1075|>": 152749,
+  "<|action_token_1076|>": 152750,
+  "<|action_token_1077|>": 152751,
+  "<|action_token_1078|>": 152752,
+  "<|action_token_1079|>": 152753,
+  "<|action_token_107|>": 151781,
+  "<|action_token_1080|>": 152754,
+  "<|action_token_1081|>": 152755,
+  "<|action_token_1082|>": 152756,
+  "<|action_token_1083|>": 152757,
+  "<|action_token_1084|>": 152758,
+  "<|action_token_1085|>": 152759,
+  "<|action_token_1086|>": 152760,
+  "<|action_token_1087|>": 152761,
+  "<|action_token_1088|>": 152762,
+  "<|action_token_1089|>": 152763,
+  "<|action_token_108|>": 151782,
+  "<|action_token_1090|>": 152764,
+  "<|action_token_1091|>": 152765,
+  "<|action_token_1092|>": 152766,
+  "<|action_token_1093|>": 152767,
+  "<|action_token_1094|>": 152768,
+  "<|action_token_1095|>": 152769,
+  "<|action_token_1096|>": 152770,
+  "<|action_token_1097|>": 152771,
+  "<|action_token_1098|>": 152772,
+  "<|action_token_1099|>": 152773,
+  "<|action_token_109|>": 151783,
+  "<|action_token_10|>": 151684,
+  "<|action_token_1100|>": 152774,
+  "<|action_token_1101|>": 152775,
+  "<|action_token_1102|>": 152776,
+  "<|action_token_1103|>": 152777,
+  "<|action_token_1104|>": 152778,
+  "<|action_token_1105|>": 152779,
+  "<|action_token_1106|>": 152780,
+  "<|action_token_1107|>": 152781,
+  "<|action_token_1108|>": 152782,
+  "<|action_token_1109|>": 152783,
+  "<|action_token_110|>": 151784,
+  "<|action_token_1110|>": 152784,
+  "<|action_token_1111|>": 152785,
+  "<|action_token_1112|>": 152786,
+  "<|action_token_1113|>": 152787,
+  "<|action_token_1114|>": 152788,
+  "<|action_token_1115|>": 152789,
+  "<|action_token_1116|>": 152790,
+  "<|action_token_1117|>": 152791,
+  "<|action_token_1118|>": 152792,
+  "<|action_token_1119|>": 152793,
+  "<|action_token_111|>": 151785,
+  "<|action_token_1120|>": 152794,
+  "<|action_token_1121|>": 152795,
+  "<|action_token_1122|>": 152796,
+  "<|action_token_1123|>": 152797,
+  "<|action_token_1124|>": 152798,
+  "<|action_token_1125|>": 152799,
+  "<|action_token_1126|>": 152800,
+  "<|action_token_1127|>": 152801,
+  "<|action_token_1128|>": 152802,
+  "<|action_token_1129|>": 152803,
+  "<|action_token_112|>": 151786,
+  "<|action_token_1130|>": 152804,
+  "<|action_token_1131|>": 152805,
+  "<|action_token_1132|>": 152806,
+  "<|action_token_1133|>": 152807,
+  "<|action_token_1134|>": 152808,
+  "<|action_token_1135|>": 152809,
+  "<|action_token_1136|>": 152810,
+  "<|action_token_1137|>": 152811,
+  "<|action_token_1138|>": 152812,
+  "<|action_token_1139|>": 152813,
+  "<|action_token_113|>": 151787,
+  "<|action_token_1140|>": 152814,
+  "<|action_token_1141|>": 152815,
+  "<|action_token_1142|>": 152816,
+  "<|action_token_1143|>": 152817,
+  "<|action_token_1144|>": 152818,
+  "<|action_token_1145|>": 152819,
+  "<|action_token_1146|>": 152820,
+  "<|action_token_1147|>": 152821,
+  "<|action_token_1148|>": 152822,
+  "<|action_token_1149|>": 152823,
+  "<|action_token_114|>": 151788,
+  "<|action_token_1150|>": 152824,
+  "<|action_token_1151|>": 152825,
+  "<|action_token_1152|>": 152826,
+  "<|action_token_1153|>": 152827,
+  "<|action_token_1154|>": 152828,
+  "<|action_token_1155|>": 152829,
+  "<|action_token_1156|>": 152830,
+  "<|action_token_1157|>": 152831,
+  "<|action_token_1158|>": 152832,
+  "<|action_token_1159|>": 152833,
+  "<|action_token_115|>": 151789,
+  "<|action_token_1160|>": 152834,
+  "<|action_token_1161|>": 152835,
+  "<|action_token_1162|>": 152836,
+  "<|action_token_1163|>": 152837,
+  "<|action_token_1164|>": 152838,
+  "<|action_token_1165|>": 152839,
+  "<|action_token_1166|>": 152840,
+  "<|action_token_1167|>": 152841,
+  "<|action_token_1168|>": 152842,
+  "<|action_token_1169|>": 152843,
+  "<|action_token_116|>": 151790,
+  "<|action_token_1170|>": 152844,
+  "<|action_token_1171|>": 152845,
+  "<|action_token_1172|>": 152846,
+  "<|action_token_1173|>": 152847,
+  "<|action_token_1174|>": 152848,
+  "<|action_token_1175|>": 152849,
+  "<|action_token_1176|>": 152850,
+  "<|action_token_1177|>": 152851,
+  "<|action_token_1178|>": 152852,
+  "<|action_token_1179|>": 152853,
+  "<|action_token_117|>": 151791,
+  "<|action_token_1180|>": 152854,
+  "<|action_token_1181|>": 152855,
+  "<|action_token_1182|>": 152856,
+  "<|action_token_1183|>": 152857,
+  "<|action_token_1184|>": 152858,
+  "<|action_token_1185|>": 152859,
+  "<|action_token_1186|>": 152860,
+  "<|action_token_1187|>": 152861,
+  "<|action_token_1188|>": 152862,
+  "<|action_token_1189|>": 152863,
+  "<|action_token_118|>": 151792,
+  "<|action_token_1190|>": 152864,
+  "<|action_token_1191|>": 152865,
+  "<|action_token_1192|>": 152866,
+  "<|action_token_1193|>": 152867,
+  "<|action_token_1194|>": 152868,
+  "<|action_token_1195|>": 152869,
+  "<|action_token_1196|>": 152870,
+  "<|action_token_1197|>": 152871,
+  "<|action_token_1198|>": 152872,
+  "<|action_token_1199|>": 152873,
+  "<|action_token_119|>": 151793,
+  "<|action_token_11|>": 151685,
+  "<|action_token_1200|>": 152874,
+  "<|action_token_1201|>": 152875,
+  "<|action_token_1202|>": 152876,
+  "<|action_token_1203|>": 152877,
+  "<|action_token_1204|>": 152878,
+  "<|action_token_1205|>": 152879,
+  "<|action_token_1206|>": 152880,
+  "<|action_token_1207|>": 152881,
+  "<|action_token_1208|>": 152882,
+  "<|action_token_1209|>": 152883,
+  "<|action_token_120|>": 151794,
+  "<|action_token_1210|>": 152884,
+  "<|action_token_1211|>": 152885,
+  "<|action_token_1212|>": 152886,
+  "<|action_token_1213|>": 152887,
+  "<|action_token_1214|>": 152888,
+  "<|action_token_1215|>": 152889,
+  "<|action_token_1216|>": 152890,
+  "<|action_token_1217|>": 152891,
+  "<|action_token_1218|>": 152892,
+  "<|action_token_1219|>": 152893,
+  "<|action_token_121|>": 151795,
+  "<|action_token_1220|>": 152894,
+  "<|action_token_1221|>": 152895,
+  "<|action_token_1222|>": 152896,
+  "<|action_token_1223|>": 152897,
+  "<|action_token_1224|>": 152898,
+  "<|action_token_1225|>": 152899,
+  "<|action_token_1226|>": 152900,
+  "<|action_token_1227|>": 152901,
+  "<|action_token_1228|>": 152902,
+  "<|action_token_1229|>": 152903,
+  "<|action_token_122|>": 151796,
+  "<|action_token_1230|>": 152904,
+  "<|action_token_1231|>": 152905,
+  "<|action_token_1232|>": 152906,
+  "<|action_token_1233|>": 152907,
+  "<|action_token_1234|>": 152908,
+  "<|action_token_1235|>": 152909,
+  "<|action_token_1236|>": 152910,
+  "<|action_token_1237|>": 152911,
+  "<|action_token_1238|>": 152912,
+  "<|action_token_1239|>": 152913,
+  "<|action_token_123|>": 151797,
+  "<|action_token_1240|>": 152914,
+  "<|action_token_1241|>": 152915,
+  "<|action_token_1242|>": 152916,
+  "<|action_token_1243|>": 152917,
+  "<|action_token_1244|>": 152918,
+  "<|action_token_1245|>": 152919,
+  "<|action_token_1246|>": 152920,
+  "<|action_token_1247|>": 152921,
+  "<|action_token_1248|>": 152922,
+  "<|action_token_1249|>": 152923,
+  "<|action_token_124|>": 151798,
+  "<|action_token_1250|>": 152924,
+  "<|action_token_1251|>": 152925,
+  "<|action_token_1252|>": 152926,
+  "<|action_token_1253|>": 152927,
+  "<|action_token_1254|>": 152928,
+  "<|action_token_1255|>": 152929,
+  "<|action_token_1256|>": 152930,
+  "<|action_token_1257|>": 152931,
+  "<|action_token_1258|>": 152932,
+  "<|action_token_1259|>": 152933,
+  "<|action_token_125|>": 151799,
+  "<|action_token_1260|>": 152934,
+  "<|action_token_1261|>": 152935,
+  "<|action_token_1262|>": 152936,
+  "<|action_token_1263|>": 152937,
+  "<|action_token_1264|>": 152938,
+  "<|action_token_1265|>": 152939,
+  "<|action_token_1266|>": 152940,
+  "<|action_token_1267|>": 152941,
+  "<|action_token_1268|>": 152942,
+  "<|action_token_1269|>": 152943,
+  "<|action_token_126|>": 151800,
+  "<|action_token_1270|>": 152944,
+  "<|action_token_1271|>": 152945,
+  "<|action_token_1272|>": 152946,
+  "<|action_token_1273|>": 152947,
+  "<|action_token_1274|>": 152948,
+  "<|action_token_1275|>": 152949,
+  "<|action_token_1276|>": 152950,
+  "<|action_token_1277|>": 152951,
+  "<|action_token_1278|>": 152952,
+  "<|action_token_1279|>": 152953,
+  "<|action_token_127|>": 151801,
+  "<|action_token_1280|>": 152954,
+  "<|action_token_1281|>": 152955,
+  "<|action_token_1282|>": 152956,
+  "<|action_token_1283|>": 152957,
+  "<|action_token_1284|>": 152958,
+  "<|action_token_1285|>": 152959,
+  "<|action_token_1286|>": 152960,
+  "<|action_token_1287|>": 152961,
+  "<|action_token_1288|>": 152962,
+  "<|action_token_1289|>": 152963,
+  "<|action_token_128|>": 151802,
+  "<|action_token_1290|>": 152964,
+  "<|action_token_1291|>": 152965,
+  "<|action_token_1292|>": 152966,
+  "<|action_token_1293|>": 152967,
+  "<|action_token_1294|>": 152968,
+  "<|action_token_1295|>": 152969,
+  "<|action_token_1296|>": 152970,
+  "<|action_token_1297|>": 152971,
+  "<|action_token_1298|>": 152972,
+  "<|action_token_1299|>": 152973,
+  "<|action_token_129|>": 151803,
+  "<|action_token_12|>": 151686,
+  "<|action_token_1300|>": 152974,
+  "<|action_token_1301|>": 152975,
+  "<|action_token_1302|>": 152976,
+  "<|action_token_1303|>": 152977,
+  "<|action_token_1304|>": 152978,
+  "<|action_token_1305|>": 152979,
+  "<|action_token_1306|>": 152980,
+  "<|action_token_1307|>": 152981,
+  "<|action_token_1308|>": 152982,
+  "<|action_token_1309|>": 152983,
+  "<|action_token_130|>": 151804,
+  "<|action_token_1310|>": 152984,
+  "<|action_token_1311|>": 152985,
+  "<|action_token_1312|>": 152986,
+  "<|action_token_1313|>": 152987,
+  "<|action_token_1314|>": 152988,
+  "<|action_token_1315|>": 152989,
+  "<|action_token_1316|>": 152990,
+  "<|action_token_1317|>": 152991,
+  "<|action_token_1318|>": 152992,
+  "<|action_token_1319|>": 152993,
+  "<|action_token_131|>": 151805,
+  "<|action_token_1320|>": 152994,
+  "<|action_token_1321|>": 152995,
+  "<|action_token_1322|>": 152996,
+  "<|action_token_1323|>": 152997,
+  "<|action_token_1324|>": 152998,
+  "<|action_token_1325|>": 152999,
+  "<|action_token_1326|>": 153000,
+  "<|action_token_1327|>": 153001,
+  "<|action_token_1328|>": 153002,
+  "<|action_token_1329|>": 153003,
+  "<|action_token_132|>": 151806,
+  "<|action_token_1330|>": 153004,
+  "<|action_token_1331|>": 153005,
+  "<|action_token_1332|>": 153006,
+  "<|action_token_1333|>": 153007,
+  "<|action_token_1334|>": 153008,
+  "<|action_token_1335|>": 153009,
+  "<|action_token_1336|>": 153010,
+  "<|action_token_1337|>": 153011,
+  "<|action_token_1338|>": 153012,
+  "<|action_token_1339|>": 153013,
+  "<|action_token_133|>": 151807,
+  "<|action_token_1340|>": 153014,
+  "<|action_token_1341|>": 153015,
+  "<|action_token_1342|>": 153016,
+  "<|action_token_1343|>": 153017,
+  "<|action_token_1344|>": 153018,
+  "<|action_token_1345|>": 153019,
+  "<|action_token_1346|>": 153020,
+  "<|action_token_1347|>": 153021,
+  "<|action_token_1348|>": 153022,
+  "<|action_token_1349|>": 153023,
+  "<|action_token_134|>": 151808,
+  "<|action_token_1350|>": 153024,
+  "<|action_token_1351|>": 153025,
+  "<|action_token_1352|>": 153026,
+  "<|action_token_1353|>": 153027,
+  "<|action_token_1354|>": 153028,
+  "<|action_token_1355|>": 153029,
+  "<|action_token_1356|>": 153030,
+  "<|action_token_1357|>": 153031,
+  "<|action_token_1358|>": 153032,
+  "<|action_token_1359|>": 153033,
+  "<|action_token_135|>": 151809,
+  "<|action_token_1360|>": 153034,
+  "<|action_token_1361|>": 153035,
+  "<|action_token_1362|>": 153036,
+  "<|action_token_1363|>": 153037,
+  "<|action_token_1364|>": 153038,
+  "<|action_token_1365|>": 153039,
+  "<|action_token_1366|>": 153040,
+  "<|action_token_1367|>": 153041,
+  "<|action_token_1368|>": 153042,
+  "<|action_token_1369|>": 153043,
+  "<|action_token_136|>": 151810,
+  "<|action_token_1370|>": 153044,
+  "<|action_token_1371|>": 153045,
+  "<|action_token_1372|>": 153046,
+  "<|action_token_1373|>": 153047,
+  "<|action_token_1374|>": 153048,
+  "<|action_token_1375|>": 153049,
+  "<|action_token_1376|>": 153050,
+  "<|action_token_1377|>": 153051,
+  "<|action_token_1378|>": 153052,
+  "<|action_token_1379|>": 153053,
+  "<|action_token_137|>": 151811,
+  "<|action_token_1380|>": 153054,
+  "<|action_token_1381|>": 153055,
+  "<|action_token_1382|>": 153056,
+  "<|action_token_1383|>": 153057,
+  "<|action_token_1384|>": 153058,
+  "<|action_token_1385|>": 153059,
+  "<|action_token_1386|>": 153060,
+  "<|action_token_1387|>": 153061,
+  "<|action_token_1388|>": 153062,
+  "<|action_token_1389|>": 153063,
+  "<|action_token_138|>": 151812,
+  "<|action_token_1390|>": 153064,
+  "<|action_token_1391|>": 153065,
+  "<|action_token_1392|>": 153066,
+  "<|action_token_1393|>": 153067,
+  "<|action_token_1394|>": 153068,
+  "<|action_token_1395|>": 153069,
+  "<|action_token_1396|>": 153070,
+  "<|action_token_1397|>": 153071,
+  "<|action_token_1398|>": 153072,
+  "<|action_token_1399|>": 153073,
+  "<|action_token_139|>": 151813,
+  "<|action_token_13|>": 151687,
+  "<|action_token_1400|>": 153074,
+  "<|action_token_1401|>": 153075,
+  "<|action_token_1402|>": 153076,
+  "<|action_token_1403|>": 153077,
+  "<|action_token_1404|>": 153078,
+  "<|action_token_1405|>": 153079,
+  "<|action_token_1406|>": 153080,
+  "<|action_token_1407|>": 153081,
+  "<|action_token_1408|>": 153082,
+  "<|action_token_1409|>": 153083,
+  "<|action_token_140|>": 151814,
+  "<|action_token_1410|>": 153084,
+  "<|action_token_1411|>": 153085,
+  "<|action_token_1412|>": 153086,
+  "<|action_token_1413|>": 153087,
+  "<|action_token_1414|>": 153088,
+  "<|action_token_1415|>": 153089,
+  "<|action_token_1416|>": 153090,
+  "<|action_token_1417|>": 153091,
+  "<|action_token_1418|>": 153092,
+  "<|action_token_1419|>": 153093,
+  "<|action_token_141|>": 151815,
+  "<|action_token_1420|>": 153094,
+  "<|action_token_1421|>": 153095,
+  "<|action_token_1422|>": 153096,
+  "<|action_token_1423|>": 153097,
+  "<|action_token_1424|>": 153098,
+  "<|action_token_1425|>": 153099,
+  "<|action_token_1426|>": 153100,
+  "<|action_token_1427|>": 153101,
+  "<|action_token_1428|>": 153102,
+  "<|action_token_1429|>": 153103,
+  "<|action_token_142|>": 151816,
+  "<|action_token_1430|>": 153104,
+  "<|action_token_1431|>": 153105,
+  "<|action_token_1432|>": 153106,
+  "<|action_token_1433|>": 153107,
+  "<|action_token_1434|>": 153108,
+  "<|action_token_1435|>": 153109,
+  "<|action_token_1436|>": 153110,
+  "<|action_token_1437|>": 153111,
+  "<|action_token_1438|>": 153112,
+  "<|action_token_1439|>": 153113,
+  "<|action_token_143|>": 151817,
+  "<|action_token_1440|>": 153114,
+  "<|action_token_1441|>": 153115,
+  "<|action_token_1442|>": 153116,
+  "<|action_token_1443|>": 153117,
+  "<|action_token_1444|>": 153118,
+  "<|action_token_1445|>": 153119,
+  "<|action_token_1446|>": 153120,
+  "<|action_token_1447|>": 153121,
+  "<|action_token_1448|>": 153122,
+  "<|action_token_1449|>": 153123,
+  "<|action_token_144|>": 151818,
+  "<|action_token_1450|>": 153124,
+  "<|action_token_1451|>": 153125,
+  "<|action_token_1452|>": 153126,
+  "<|action_token_1453|>": 153127,
+  "<|action_token_1454|>": 153128,
+  "<|action_token_1455|>": 153129,
+  "<|action_token_1456|>": 153130,
+  "<|action_token_1457|>": 153131,
+  "<|action_token_1458|>": 153132,
+  "<|action_token_1459|>": 153133,
+  "<|action_token_145|>": 151819,
+  "<|action_token_1460|>": 153134,
+  "<|action_token_1461|>": 153135,
+  "<|action_token_1462|>": 153136,
+  "<|action_token_1463|>": 153137,
+  "<|action_token_1464|>": 153138,
+  "<|action_token_1465|>": 153139,
+  "<|action_token_1466|>": 153140,
+  "<|action_token_1467|>": 153141,
+  "<|action_token_1468|>": 153142,
+  "<|action_token_1469|>": 153143,
+  "<|action_token_146|>": 151820,
+  "<|action_token_1470|>": 153144,
+  "<|action_token_1471|>": 153145,
+  "<|action_token_1472|>": 153146,
+  "<|action_token_1473|>": 153147,
+  "<|action_token_1474|>": 153148,
+  "<|action_token_1475|>": 153149,
+  "<|action_token_1476|>": 153150,
+  "<|action_token_1477|>": 153151,
+  "<|action_token_1478|>": 153152,
+  "<|action_token_1479|>": 153153,
+  "<|action_token_147|>": 151821,
+  "<|action_token_1480|>": 153154,
+  "<|action_token_1481|>": 153155,
+  "<|action_token_1482|>": 153156,
+  "<|action_token_1483|>": 153157,
+  "<|action_token_1484|>": 153158,
+  "<|action_token_1485|>": 153159,
+  "<|action_token_1486|>": 153160,
+  "<|action_token_1487|>": 153161,
+  "<|action_token_1488|>": 153162,
+  "<|action_token_1489|>": 153163,
+  "<|action_token_148|>": 151822,
+  "<|action_token_1490|>": 153164,
+  "<|action_token_1491|>": 153165,
+  "<|action_token_1492|>": 153166,
+  "<|action_token_1493|>": 153167,
+  "<|action_token_1494|>": 153168,
+  "<|action_token_1495|>": 153169,
+  "<|action_token_1496|>": 153170,
+  "<|action_token_1497|>": 153171,
+  "<|action_token_1498|>": 153172,
+  "<|action_token_1499|>": 153173,
+  "<|action_token_149|>": 151823,
+  "<|action_token_14|>": 151688,
+  "<|action_token_1500|>": 153174,
+  "<|action_token_1501|>": 153175,
+  "<|action_token_1502|>": 153176,
+  "<|action_token_1503|>": 153177,
+  "<|action_token_1504|>": 153178,
+  "<|action_token_1505|>": 153179,
+  "<|action_token_1506|>": 153180,
+  "<|action_token_1507|>": 153181,
+  "<|action_token_1508|>": 153182,
+  "<|action_token_1509|>": 153183,
+  "<|action_token_150|>": 151824,
+  "<|action_token_1510|>": 153184,
+  "<|action_token_1511|>": 153185,
+  "<|action_token_1512|>": 153186,
+  "<|action_token_1513|>": 153187,
+  "<|action_token_1514|>": 153188,
+  "<|action_token_1515|>": 153189,
+  "<|action_token_1516|>": 153190,
+  "<|action_token_1517|>": 153191,
+  "<|action_token_1518|>": 153192,
+  "<|action_token_1519|>": 153193,
+  "<|action_token_151|>": 151825,
+  "<|action_token_1520|>": 153194,
+  "<|action_token_1521|>": 153195,
+  "<|action_token_1522|>": 153196,
+  "<|action_token_1523|>": 153197,
+  "<|action_token_1524|>": 153198,
+  "<|action_token_1525|>": 153199,
+  "<|action_token_1526|>": 153200,
+  "<|action_token_1527|>": 153201,
+  "<|action_token_1528|>": 153202,
+  "<|action_token_1529|>": 153203,
+  "<|action_token_152|>": 151826,
+  "<|action_token_1530|>": 153204,
+  "<|action_token_1531|>": 153205,
+  "<|action_token_1532|>": 153206,
+  "<|action_token_1533|>": 153207,
+  "<|action_token_1534|>": 153208,
+  "<|action_token_1535|>": 153209,
+  "<|action_token_1536|>": 153210,
+  "<|action_token_1537|>": 153211,
+  "<|action_token_1538|>": 153212,
+  "<|action_token_1539|>": 153213,
+  "<|action_token_153|>": 151827,
+  "<|action_token_1540|>": 153214,
+  "<|action_token_1541|>": 153215,
+  "<|action_token_1542|>": 153216,
+  "<|action_token_1543|>": 153217,
+  "<|action_token_1544|>": 153218,
+  "<|action_token_1545|>": 153219,
+  "<|action_token_1546|>": 153220,
+  "<|action_token_1547|>": 153221,
+  "<|action_token_1548|>": 153222,
+  "<|action_token_1549|>": 153223,
+  "<|action_token_154|>": 151828,
+  "<|action_token_1550|>": 153224,
+  "<|action_token_1551|>": 153225,
+  "<|action_token_1552|>": 153226,
+  "<|action_token_1553|>": 153227,
+  "<|action_token_1554|>": 153228,
+  "<|action_token_1555|>": 153229,
+  "<|action_token_1556|>": 153230,
+  "<|action_token_1557|>": 153231,
+  "<|action_token_1558|>": 153232,
+  "<|action_token_1559|>": 153233,
+  "<|action_token_155|>": 151829,
+  "<|action_token_1560|>": 153234,
+  "<|action_token_1561|>": 153235,
+  "<|action_token_1562|>": 153236,
+  "<|action_token_1563|>": 153237,
+  "<|action_token_1564|>": 153238,
+  "<|action_token_1565|>": 153239,
+  "<|action_token_1566|>": 153240,
+  "<|action_token_1567|>": 153241,
+  "<|action_token_1568|>": 153242,
+  "<|action_token_1569|>": 153243,
+  "<|action_token_156|>": 151830,
+  "<|action_token_1570|>": 153244,
+  "<|action_token_1571|>": 153245,
+  "<|action_token_1572|>": 153246,
+  "<|action_token_1573|>": 153247,
+  "<|action_token_1574|>": 153248,
+  "<|action_token_1575|>": 153249,
+  "<|action_token_1576|>": 153250,
+  "<|action_token_1577|>": 153251,
+  "<|action_token_1578|>": 153252,
+  "<|action_token_1579|>": 153253,
+  "<|action_token_157|>": 151831,
+  "<|action_token_1580|>": 153254,
+  "<|action_token_1581|>": 153255,
+  "<|action_token_1582|>": 153256,
+  "<|action_token_1583|>": 153257,
+  "<|action_token_1584|>": 153258,
+  "<|action_token_1585|>": 153259,
+  "<|action_token_1586|>": 153260,
+  "<|action_token_1587|>": 153261,
+  "<|action_token_1588|>": 153262,
+  "<|action_token_1589|>": 153263,
+  "<|action_token_158|>": 151832,
+  "<|action_token_1590|>": 153264,
+  "<|action_token_1591|>": 153265,
+  "<|action_token_1592|>": 153266,
+  "<|action_token_1593|>": 153267,
+  "<|action_token_1594|>": 153268,
+  "<|action_token_1595|>": 153269,
+  "<|action_token_1596|>": 153270,
+  "<|action_token_1597|>": 153271,
+  "<|action_token_1598|>": 153272,
+  "<|action_token_1599|>": 153273,
+  "<|action_token_159|>": 151833,
+  "<|action_token_15|>": 151689,
+  "<|action_token_1600|>": 153274,
+  "<|action_token_1601|>": 153275,
+  "<|action_token_1602|>": 153276,
+  "<|action_token_1603|>": 153277,
+  "<|action_token_1604|>": 153278,
+  "<|action_token_1605|>": 153279,
+  "<|action_token_1606|>": 153280,
+  "<|action_token_1607|>": 153281,
+  "<|action_token_1608|>": 153282,
+  "<|action_token_1609|>": 153283,
+  "<|action_token_160|>": 151834,
+  "<|action_token_1610|>": 153284,
+  "<|action_token_1611|>": 153285,
+  "<|action_token_1612|>": 153286,
+  "<|action_token_1613|>": 153287,
+  "<|action_token_1614|>": 153288,
+  "<|action_token_1615|>": 153289,
+  "<|action_token_1616|>": 153290,
+  "<|action_token_1617|>": 153291,
+  "<|action_token_1618|>": 153292,
+  "<|action_token_1619|>": 153293,
+  "<|action_token_161|>": 151835,
+  "<|action_token_1620|>": 153294,
+  "<|action_token_1621|>": 153295,
+  "<|action_token_1622|>": 153296,
+  "<|action_token_1623|>": 153297,
+  "<|action_token_1624|>": 153298,
+  "<|action_token_1625|>": 153299,
+  "<|action_token_1626|>": 153300,
+  "<|action_token_1627|>": 153301,
+  "<|action_token_1628|>": 153302,
+  "<|action_token_1629|>": 153303,
+  "<|action_token_162|>": 151836,
+  "<|action_token_1630|>": 153304,
+  "<|action_token_1631|>": 153305,
+  "<|action_token_1632|>": 153306,
+  "<|action_token_1633|>": 153307,
+  "<|action_token_1634|>": 153308,
+  "<|action_token_1635|>": 153309,
+  "<|action_token_1636|>": 153310,
+  "<|action_token_1637|>": 153311,
+  "<|action_token_1638|>": 153312,
+  "<|action_token_1639|>": 153313,
+  "<|action_token_163|>": 151837,
+  "<|action_token_1640|>": 153314,
+  "<|action_token_1641|>": 153315,
+  "<|action_token_1642|>": 153316,
+  "<|action_token_1643|>": 153317,
+  "<|action_token_1644|>": 153318,
+  "<|action_token_1645|>": 153319,
+  "<|action_token_1646|>": 153320,
+  "<|action_token_1647|>": 153321,
+  "<|action_token_1648|>": 153322,
+  "<|action_token_1649|>": 153323,
+  "<|action_token_164|>": 151838,
+  "<|action_token_1650|>": 153324,
+  "<|action_token_1651|>": 153325,
+  "<|action_token_1652|>": 153326,
+  "<|action_token_1653|>": 153327,
+  "<|action_token_1654|>": 153328,
+  "<|action_token_1655|>": 153329,
+  "<|action_token_1656|>": 153330,
+  "<|action_token_1657|>": 153331,
+  "<|action_token_1658|>": 153332,
+  "<|action_token_1659|>": 153333,
+  "<|action_token_165|>": 151839,
+  "<|action_token_1660|>": 153334,
+  "<|action_token_1661|>": 153335,
+  "<|action_token_1662|>": 153336,
+  "<|action_token_1663|>": 153337,
+  "<|action_token_1664|>": 153338,
+  "<|action_token_1665|>": 153339,
+  "<|action_token_1666|>": 153340,
+  "<|action_token_1667|>": 153341,
+  "<|action_token_1668|>": 153342,
+  "<|action_token_1669|>": 153343,
+  "<|action_token_166|>": 151840,
+  "<|action_token_1670|>": 153344,
+  "<|action_token_1671|>": 153345,
+  "<|action_token_1672|>": 153346,
+  "<|action_token_1673|>": 153347,
+  "<|action_token_1674|>": 153348,
+  "<|action_token_1675|>": 153349,
+  "<|action_token_1676|>": 153350,
+  "<|action_token_1677|>": 153351,
+  "<|action_token_1678|>": 153352,
+  "<|action_token_1679|>": 153353,
+  "<|action_token_167|>": 151841,
+  "<|action_token_1680|>": 153354,
+  "<|action_token_1681|>": 153355,
+  "<|action_token_1682|>": 153356,
+  "<|action_token_1683|>": 153357,
+  "<|action_token_1684|>": 153358,
+  "<|action_token_1685|>": 153359,
+  "<|action_token_1686|>": 153360,
+  "<|action_token_1687|>": 153361,
+  "<|action_token_1688|>": 153362,
+  "<|action_token_1689|>": 153363,
+  "<|action_token_168|>": 151842,
+  "<|action_token_1690|>": 153364,
+  "<|action_token_1691|>": 153365,
+  "<|action_token_1692|>": 153366,
+  "<|action_token_1693|>": 153367,
+  "<|action_token_1694|>": 153368,
+  "<|action_token_1695|>": 153369,
+  "<|action_token_1696|>": 153370,
+  "<|action_token_1697|>": 153371,
+  "<|action_token_1698|>": 153372,
+  "<|action_token_1699|>": 153373,
+  "<|action_token_169|>": 151843,
+  "<|action_token_16|>": 151690,
+  "<|action_token_1700|>": 153374,
+  "<|action_token_1701|>": 153375,
+  "<|action_token_1702|>": 153376,
+  "<|action_token_1703|>": 153377,
+  "<|action_token_1704|>": 153378,
+  "<|action_token_1705|>": 153379,
+  "<|action_token_1706|>": 153380,
+  "<|action_token_1707|>": 153381,
+  "<|action_token_1708|>": 153382,
+  "<|action_token_1709|>": 153383,
+  "<|action_token_170|>": 151844,
+  "<|action_token_1710|>": 153384,
+  "<|action_token_1711|>": 153385,
+  "<|action_token_1712|>": 153386,
+  "<|action_token_1713|>": 153387,
+  "<|action_token_1714|>": 153388,
+  "<|action_token_1715|>": 153389,
+  "<|action_token_1716|>": 153390,
+  "<|action_token_1717|>": 153391,
+  "<|action_token_1718|>": 153392,
+  "<|action_token_1719|>": 153393,
+  "<|action_token_171|>": 151845,
+  "<|action_token_1720|>": 153394,
+  "<|action_token_1721|>": 153395,
+  "<|action_token_1722|>": 153396,
+  "<|action_token_1723|>": 153397,
+  "<|action_token_1724|>": 153398,
+  "<|action_token_1725|>": 153399,
+  "<|action_token_1726|>": 153400,
+  "<|action_token_1727|>": 153401,
+  "<|action_token_1728|>": 153402,
+  "<|action_token_1729|>": 153403,
+  "<|action_token_172|>": 151846,
+  "<|action_token_1730|>": 153404,
+  "<|action_token_1731|>": 153405,
+  "<|action_token_1732|>": 153406,
+  "<|action_token_1733|>": 153407,
+  "<|action_token_1734|>": 153408,
+  "<|action_token_1735|>": 153409,
+  "<|action_token_1736|>": 153410,
+  "<|action_token_1737|>": 153411,
+  "<|action_token_1738|>": 153412,
+  "<|action_token_1739|>": 153413,
+  "<|action_token_173|>": 151847,
+  "<|action_token_1740|>": 153414,
+  "<|action_token_1741|>": 153415,
+  "<|action_token_1742|>": 153416,
+  "<|action_token_1743|>": 153417,
+  "<|action_token_1744|>": 153418,
+  "<|action_token_1745|>": 153419,
+  "<|action_token_1746|>": 153420,
+  "<|action_token_1747|>": 153421,
+  "<|action_token_1748|>": 153422,
+  "<|action_token_1749|>": 153423,
+  "<|action_token_174|>": 151848,
+  "<|action_token_1750|>": 153424,
+  "<|action_token_1751|>": 153425,
+  "<|action_token_1752|>": 153426,
+  "<|action_token_1753|>": 153427,
+  "<|action_token_1754|>": 153428,
+  "<|action_token_1755|>": 153429,
+  "<|action_token_1756|>": 153430,
+  "<|action_token_1757|>": 153431,
+  "<|action_token_1758|>": 153432,
+  "<|action_token_1759|>": 153433,
+  "<|action_token_175|>": 151849,
+  "<|action_token_1760|>": 153434,
+  "<|action_token_1761|>": 153435,
+  "<|action_token_1762|>": 153436,
+  "<|action_token_1763|>": 153437,
+  "<|action_token_1764|>": 153438,
+  "<|action_token_1765|>": 153439,
+  "<|action_token_1766|>": 153440,
+  "<|action_token_1767|>": 153441,
+  "<|action_token_1768|>": 153442,
+  "<|action_token_1769|>": 153443,
+  "<|action_token_176|>": 151850,
+  "<|action_token_1770|>": 153444,
+  "<|action_token_1771|>": 153445,
+  "<|action_token_1772|>": 153446,
+  "<|action_token_1773|>": 153447,
+  "<|action_token_1774|>": 153448,
+  "<|action_token_1775|>": 153449,
+  "<|action_token_1776|>": 153450,
+  "<|action_token_1777|>": 153451,
+  "<|action_token_1778|>": 153452,
+  "<|action_token_1779|>": 153453,
+  "<|action_token_177|>": 151851,
+  "<|action_token_1780|>": 153454,
+  "<|action_token_1781|>": 153455,
+  "<|action_token_1782|>": 153456,
+  "<|action_token_1783|>": 153457,
+  "<|action_token_1784|>": 153458,
+  "<|action_token_1785|>": 153459,
+  "<|action_token_1786|>": 153460,
+  "<|action_token_1787|>": 153461,
+  "<|action_token_1788|>": 153462,
+  "<|action_token_1789|>": 153463,
+  "<|action_token_178|>": 151852,
+  "<|action_token_1790|>": 153464,
+  "<|action_token_1791|>": 153465,
+  "<|action_token_1792|>": 153466,
+  "<|action_token_1793|>": 153467,
+  "<|action_token_1794|>": 153468,
+  "<|action_token_1795|>": 153469,
+  "<|action_token_1796|>": 153470,
+  "<|action_token_1797|>": 153471,
+  "<|action_token_1798|>": 153472,
+  "<|action_token_1799|>": 153473,
+  "<|action_token_179|>": 151853,
+  "<|action_token_17|>": 151691,
+  "<|action_token_1800|>": 153474,
+  "<|action_token_1801|>": 153475,
+  "<|action_token_1802|>": 153476,
+  "<|action_token_1803|>": 153477,
+  "<|action_token_1804|>": 153478,
+  "<|action_token_1805|>": 153479,
+  "<|action_token_1806|>": 153480,
+  "<|action_token_1807|>": 153481,
+  "<|action_token_1808|>": 153482,
+  "<|action_token_1809|>": 153483,
+  "<|action_token_180|>": 151854,
+  "<|action_token_1810|>": 153484,
+  "<|action_token_1811|>": 153485,
+  "<|action_token_1812|>": 153486,
+  "<|action_token_1813|>": 153487,
+  "<|action_token_1814|>": 153488,
+  "<|action_token_1815|>": 153489,
+  "<|action_token_1816|>": 153490,
+  "<|action_token_1817|>": 153491,
+  "<|action_token_1818|>": 153492,
+  "<|action_token_1819|>": 153493,
+  "<|action_token_181|>": 151855,
+  "<|action_token_1820|>": 153494,
+  "<|action_token_1821|>": 153495,
+  "<|action_token_1822|>": 153496,
+  "<|action_token_1823|>": 153497,
+  "<|action_token_1824|>": 153498,
+  "<|action_token_1825|>": 153499,
+  "<|action_token_1826|>": 153500,
+  "<|action_token_1827|>": 153501,
+  "<|action_token_1828|>": 153502,
+  "<|action_token_1829|>": 153503,
+  "<|action_token_182|>": 151856,
+  "<|action_token_1830|>": 153504,
+  "<|action_token_1831|>": 153505,
+  "<|action_token_1832|>": 153506,
+  "<|action_token_1833|>": 153507,
+  "<|action_token_1834|>": 153508,
+  "<|action_token_1835|>": 153509,
+  "<|action_token_1836|>": 153510,
+  "<|action_token_1837|>": 153511,
+  "<|action_token_1838|>": 153512,
+  "<|action_token_1839|>": 153513,
+  "<|action_token_183|>": 151857,
+  "<|action_token_1840|>": 153514,
+  "<|action_token_1841|>": 153515,
+  "<|action_token_1842|>": 153516,
+  "<|action_token_1843|>": 153517,
+  "<|action_token_1844|>": 153518,
+  "<|action_token_1845|>": 153519,
+  "<|action_token_1846|>": 153520,
+  "<|action_token_1847|>": 153521,
+  "<|action_token_1848|>": 153522,
+  "<|action_token_1849|>": 153523,
+  "<|action_token_184|>": 151858,
+  "<|action_token_1850|>": 153524,
+  "<|action_token_1851|>": 153525,
+  "<|action_token_1852|>": 153526,
+  "<|action_token_1853|>": 153527,
+  "<|action_token_1854|>": 153528,
+  "<|action_token_1855|>": 153529,
+  "<|action_token_1856|>": 153530,
+  "<|action_token_1857|>": 153531,
+  "<|action_token_1858|>": 153532,
+  "<|action_token_1859|>": 153533,
+  "<|action_token_185|>": 151859,
+  "<|action_token_1860|>": 153534,
+  "<|action_token_1861|>": 153535,
+  "<|action_token_1862|>": 153536,
+  "<|action_token_1863|>": 153537,
+  "<|action_token_1864|>": 153538,
+  "<|action_token_1865|>": 153539,
+  "<|action_token_1866|>": 153540,
+  "<|action_token_1867|>": 153541,
+  "<|action_token_1868|>": 153542,
+  "<|action_token_1869|>": 153543,
+  "<|action_token_186|>": 151860,
+  "<|action_token_1870|>": 153544,
+  "<|action_token_1871|>": 153545,
+  "<|action_token_1872|>": 153546,
+  "<|action_token_1873|>": 153547,
+  "<|action_token_1874|>": 153548,
+  "<|action_token_1875|>": 153549,
+  "<|action_token_1876|>": 153550,
+  "<|action_token_1877|>": 153551,
+  "<|action_token_1878|>": 153552,
+  "<|action_token_1879|>": 153553,
+  "<|action_token_187|>": 151861,
+  "<|action_token_1880|>": 153554,
+  "<|action_token_1881|>": 153555,
+  "<|action_token_1882|>": 153556,
+  "<|action_token_1883|>": 153557,
+  "<|action_token_1884|>": 153558,
+  "<|action_token_1885|>": 153559,
+  "<|action_token_1886|>": 153560,
+  "<|action_token_1887|>": 153561,
+  "<|action_token_1888|>": 153562,
+  "<|action_token_1889|>": 153563,
+  "<|action_token_188|>": 151862,
+  "<|action_token_1890|>": 153564,
+  "<|action_token_1891|>": 153565,
+  "<|action_token_1892|>": 153566,
+  "<|action_token_1893|>": 153567,
+  "<|action_token_1894|>": 153568,
+  "<|action_token_1895|>": 153569,
+  "<|action_token_1896|>": 153570,
+  "<|action_token_1897|>": 153571,
+  "<|action_token_1898|>": 153572,
+  "<|action_token_1899|>": 153573,
+  "<|action_token_189|>": 151863,
+  "<|action_token_18|>": 151692,
+  "<|action_token_1900|>": 153574,
+  "<|action_token_1901|>": 153575,
+  "<|action_token_1902|>": 153576,
+  "<|action_token_1903|>": 153577,
+  "<|action_token_1904|>": 153578,
+  "<|action_token_1905|>": 153579,
+  "<|action_token_1906|>": 153580,
+  "<|action_token_1907|>": 153581,
+  "<|action_token_1908|>": 153582,
+  "<|action_token_1909|>": 153583,
+  "<|action_token_190|>": 151864,
+  "<|action_token_1910|>": 153584,
+  "<|action_token_1911|>": 153585,
+  "<|action_token_1912|>": 153586,
+  "<|action_token_1913|>": 153587,
+  "<|action_token_1914|>": 153588,
+  "<|action_token_1915|>": 153589,
+  "<|action_token_1916|>": 153590,
+  "<|action_token_1917|>": 153591,
+  "<|action_token_1918|>": 153592,
+  "<|action_token_1919|>": 153593,
+  "<|action_token_191|>": 151865,
+  "<|action_token_1920|>": 153594,
+  "<|action_token_1921|>": 153595,
+  "<|action_token_1922|>": 153596,
+  "<|action_token_1923|>": 153597,
+  "<|action_token_1924|>": 153598,
+  "<|action_token_1925|>": 153599,
+  "<|action_token_1926|>": 153600,
+  "<|action_token_1927|>": 153601,
+  "<|action_token_1928|>": 153602,
+  "<|action_token_1929|>": 153603,
+  "<|action_token_192|>": 151866,
+  "<|action_token_1930|>": 153604,
+  "<|action_token_1931|>": 153605,
+  "<|action_token_1932|>": 153606,
+  "<|action_token_1933|>": 153607,
+  "<|action_token_1934|>": 153608,
+  "<|action_token_1935|>": 153609,
+  "<|action_token_1936|>": 153610,
+  "<|action_token_1937|>": 153611,
+  "<|action_token_1938|>": 153612,
+  "<|action_token_1939|>": 153613,
+  "<|action_token_193|>": 151867,
+  "<|action_token_1940|>": 153614,
+  "<|action_token_1941|>": 153615,
+  "<|action_token_1942|>": 153616,
+  "<|action_token_1943|>": 153617,
+  "<|action_token_1944|>": 153618,
+  "<|action_token_1945|>": 153619,
+  "<|action_token_1946|>": 153620,
+  "<|action_token_1947|>": 153621,
+  "<|action_token_1948|>": 153622,
+  "<|action_token_1949|>": 153623,
+  "<|action_token_194|>": 151868,
+  "<|action_token_1950|>": 153624,
+  "<|action_token_1951|>": 153625,
+  "<|action_token_1952|>": 153626,
+  "<|action_token_1953|>": 153627,
+  "<|action_token_1954|>": 153628,
+  "<|action_token_1955|>": 153629,
+  "<|action_token_1956|>": 153630,
+  "<|action_token_1957|>": 153631,
+  "<|action_token_1958|>": 153632,
+  "<|action_token_1959|>": 153633,
+  "<|action_token_195|>": 151869,
+  "<|action_token_1960|>": 153634,
+  "<|action_token_1961|>": 153635,
+  "<|action_token_1962|>": 153636,
+  "<|action_token_1963|>": 153637,
+  "<|action_token_1964|>": 153638,
+  "<|action_token_1965|>": 153639,
+  "<|action_token_1966|>": 153640,
+  "<|action_token_1967|>": 153641,
+  "<|action_token_1968|>": 153642,
+  "<|action_token_1969|>": 153643,
+  "<|action_token_196|>": 151870,
+  "<|action_token_1970|>": 153644,
+  "<|action_token_1971|>": 153645,
+  "<|action_token_1972|>": 153646,
+  "<|action_token_1973|>": 153647,
+  "<|action_token_1974|>": 153648,
+  "<|action_token_1975|>": 153649,
+  "<|action_token_1976|>": 153650,
+  "<|action_token_1977|>": 153651,
+  "<|action_token_1978|>": 153652,
+  "<|action_token_1979|>": 153653,
+  "<|action_token_197|>": 151871,
+  "<|action_token_1980|>": 153654,
+  "<|action_token_1981|>": 153655,
+  "<|action_token_1982|>": 153656,
+  "<|action_token_1983|>": 153657,
+  "<|action_token_1984|>": 153658,
+  "<|action_token_1985|>": 153659,
+  "<|action_token_1986|>": 153660,
+  "<|action_token_1987|>": 153661,
+  "<|action_token_1988|>": 153662,
+  "<|action_token_1989|>": 153663,
+  "<|action_token_198|>": 151872,
+  "<|action_token_1990|>": 153664,
+  "<|action_token_1991|>": 153665,
+  "<|action_token_1992|>": 153666,
+  "<|action_token_1993|>": 153667,
+  "<|action_token_1994|>": 153668,
+  "<|action_token_1995|>": 153669,
+  "<|action_token_1996|>": 153670,
+  "<|action_token_1997|>": 153671,
+  "<|action_token_1998|>": 153672,
+  "<|action_token_1999|>": 153673,
+  "<|action_token_199|>": 151873,
+  "<|action_token_19|>": 151693,
+  "<|action_token_1|>": 151675,
+  "<|action_token_2000|>": 153674,
+  "<|action_token_2001|>": 153675,
+  "<|action_token_2002|>": 153676,
+  "<|action_token_2003|>": 153677,
+  "<|action_token_2004|>": 153678,
+  "<|action_token_2005|>": 153679,
+  "<|action_token_2006|>": 153680,
+  "<|action_token_2007|>": 153681,
+  "<|action_token_2008|>": 153682,
+  "<|action_token_2009|>": 153683,
+  "<|action_token_200|>": 151874,
+  "<|action_token_2010|>": 153684,
+  "<|action_token_2011|>": 153685,
+  "<|action_token_2012|>": 153686,
+  "<|action_token_2013|>": 153687,
+  "<|action_token_2014|>": 153688,
+  "<|action_token_2015|>": 153689,
+  "<|action_token_2016|>": 153690,
+  "<|action_token_2017|>": 153691,
+  "<|action_token_2018|>": 153692,
+  "<|action_token_2019|>": 153693,
+  "<|action_token_201|>": 151875,
+  "<|action_token_2020|>": 153694,
+  "<|action_token_2021|>": 153695,
+  "<|action_token_2022|>": 153696,
+  "<|action_token_2023|>": 153697,
+  "<|action_token_2024|>": 153698,
+  "<|action_token_2025|>": 153699,
+  "<|action_token_2026|>": 153700,
+  "<|action_token_2027|>": 153701,
+  "<|action_token_2028|>": 153702,
+  "<|action_token_2029|>": 153703,
+  "<|action_token_202|>": 151876,
+  "<|action_token_2030|>": 153704,
+  "<|action_token_2031|>": 153705,
+  "<|action_token_2032|>": 153706,
+  "<|action_token_2033|>": 153707,
+  "<|action_token_2034|>": 153708,
+  "<|action_token_2035|>": 153709,
+  "<|action_token_2036|>": 153710,
+  "<|action_token_2037|>": 153711,
+  "<|action_token_2038|>": 153712,
+  "<|action_token_2039|>": 153713,
+  "<|action_token_203|>": 151877,
+  "<|action_token_2040|>": 153714,
+  "<|action_token_2041|>": 153715,
+  "<|action_token_2042|>": 153716,
+  "<|action_token_2043|>": 153717,
+  "<|action_token_2044|>": 153718,
+  "<|action_token_2045|>": 153719,
+  "<|action_token_2046|>": 153720,
+  "<|action_token_2047|>": 153721,
+  "<|action_token_204|>": 151878,
+  "<|action_token_205|>": 151879,
+  "<|action_token_206|>": 151880,
+  "<|action_token_207|>": 151881,
+  "<|action_token_208|>": 151882,
+  "<|action_token_209|>": 151883,
+  "<|action_token_20|>": 151694,
+  "<|action_token_210|>": 151884,
+  "<|action_token_211|>": 151885,
+  "<|action_token_212|>": 151886,
+  "<|action_token_213|>": 151887,
+  "<|action_token_214|>": 151888,
+  "<|action_token_215|>": 151889,
+  "<|action_token_216|>": 151890,
+  "<|action_token_217|>": 151891,
+  "<|action_token_218|>": 151892,
+  "<|action_token_219|>": 151893,
+  "<|action_token_21|>": 151695,
+  "<|action_token_220|>": 151894,
+  "<|action_token_221|>": 151895,
+  "<|action_token_222|>": 151896,
+  "<|action_token_223|>": 151897,
+  "<|action_token_224|>": 151898,
+  "<|action_token_225|>": 151899,
+  "<|action_token_226|>": 151900,
+  "<|action_token_227|>": 151901,
+  "<|action_token_228|>": 151902,
+  "<|action_token_229|>": 151903,
+  "<|action_token_22|>": 151696,
+  "<|action_token_230|>": 151904,
+  "<|action_token_231|>": 151905,
+  "<|action_token_232|>": 151906,
+  "<|action_token_233|>": 151907,
+  "<|action_token_234|>": 151908,
+  "<|action_token_235|>": 151909,
+  "<|action_token_236|>": 151910,
+  "<|action_token_237|>": 151911,
+  "<|action_token_238|>": 151912,
+  "<|action_token_239|>": 151913,
+  "<|action_token_23|>": 151697,
+  "<|action_token_240|>": 151914,
+  "<|action_token_241|>": 151915,
+  "<|action_token_242|>": 151916,
+  "<|action_token_243|>": 151917,
+  "<|action_token_244|>": 151918,
+  "<|action_token_245|>": 151919,
+  "<|action_token_246|>": 151920,
+  "<|action_token_247|>": 151921,
+  "<|action_token_248|>": 151922,
+  "<|action_token_249|>": 151923,
+  "<|action_token_24|>": 151698,
+  "<|action_token_250|>": 151924,
+  "<|action_token_251|>": 151925,
+  "<|action_token_252|>": 151926,
+  "<|action_token_253|>": 151927,
+  "<|action_token_254|>": 151928,
+  "<|action_token_255|>": 151929,
+  "<|action_token_256|>": 151930,
+  "<|action_token_257|>": 151931,
+  "<|action_token_258|>": 151932,
+  "<|action_token_259|>": 151933,
+  "<|action_token_25|>": 151699,
+  "<|action_token_260|>": 151934,
+  "<|action_token_261|>": 151935,
+  "<|action_token_262|>": 151936,
+  "<|action_token_263|>": 151937,
+  "<|action_token_264|>": 151938,
+  "<|action_token_265|>": 151939,
+  "<|action_token_266|>": 151940,
+  "<|action_token_267|>": 151941,
+  "<|action_token_268|>": 151942,
+  "<|action_token_269|>": 151943,
+  "<|action_token_26|>": 151700,
+  "<|action_token_270|>": 151944,
+  "<|action_token_271|>": 151945,
+  "<|action_token_272|>": 151946,
+  "<|action_token_273|>": 151947,
+  "<|action_token_274|>": 151948,
+  "<|action_token_275|>": 151949,
+  "<|action_token_276|>": 151950,
+  "<|action_token_277|>": 151951,
+  "<|action_token_278|>": 151952,
+  "<|action_token_279|>": 151953,
+  "<|action_token_27|>": 151701,
+  "<|action_token_280|>": 151954,
+  "<|action_token_281|>": 151955,
+  "<|action_token_282|>": 151956,
+  "<|action_token_283|>": 151957,
+  "<|action_token_284|>": 151958,
+  "<|action_token_285|>": 151959,
+  "<|action_token_286|>": 151960,
+  "<|action_token_287|>": 151961,
+  "<|action_token_288|>": 151962,
+  "<|action_token_289|>": 151963,
+  "<|action_token_28|>": 151702,
+  "<|action_token_290|>": 151964,
+  "<|action_token_291|>": 151965,
+  "<|action_token_292|>": 151966,
+  "<|action_token_293|>": 151967,
+  "<|action_token_294|>": 151968,
+  "<|action_token_295|>": 151969,
+  "<|action_token_296|>": 151970,
+  "<|action_token_297|>": 151971,
+  "<|action_token_298|>": 151972,
+  "<|action_token_299|>": 151973,
+  "<|action_token_29|>": 151703,
+  "<|action_token_2|>": 151676,
+  "<|action_token_300|>": 151974,
+  "<|action_token_301|>": 151975,
+  "<|action_token_302|>": 151976,
+  "<|action_token_303|>": 151977,
+  "<|action_token_304|>": 151978,
+  "<|action_token_305|>": 151979,
+  "<|action_token_306|>": 151980,
+  "<|action_token_307|>": 151981,
+  "<|action_token_308|>": 151982,
+  "<|action_token_309|>": 151983,
+  "<|action_token_30|>": 151704,
+  "<|action_token_310|>": 151984,
+  "<|action_token_311|>": 151985,
+  "<|action_token_312|>": 151986,
+  "<|action_token_313|>": 151987,
+  "<|action_token_314|>": 151988,
+  "<|action_token_315|>": 151989,
+  "<|action_token_316|>": 151990,
+  "<|action_token_317|>": 151991,
+  "<|action_token_318|>": 151992,
+  "<|action_token_319|>": 151993,
+  "<|action_token_31|>": 151705,
+  "<|action_token_320|>": 151994,
+  "<|action_token_321|>": 151995,
+  "<|action_token_322|>": 151996,
+  "<|action_token_323|>": 151997,
+  "<|action_token_324|>": 151998,
+  "<|action_token_325|>": 151999,
+  "<|action_token_326|>": 152000,
+  "<|action_token_327|>": 152001,
+  "<|action_token_328|>": 152002,
+  "<|action_token_329|>": 152003,
+  "<|action_token_32|>": 151706,
+  "<|action_token_330|>": 152004,
+  "<|action_token_331|>": 152005,
+  "<|action_token_332|>": 152006,
+  "<|action_token_333|>": 152007,
+  "<|action_token_334|>": 152008,
+  "<|action_token_335|>": 152009,
+  "<|action_token_336|>": 152010,
+  "<|action_token_337|>": 152011,
+  "<|action_token_338|>": 152012,
+  "<|action_token_339|>": 152013,
+  "<|action_token_33|>": 151707,
+  "<|action_token_340|>": 152014,
+  "<|action_token_341|>": 152015,
+  "<|action_token_342|>": 152016,
+  "<|action_token_343|>": 152017,
+  "<|action_token_344|>": 152018,
+  "<|action_token_345|>": 152019,
+  "<|action_token_346|>": 152020,
+  "<|action_token_347|>": 152021,
+  "<|action_token_348|>": 152022,
+  "<|action_token_349|>": 152023,
+  "<|action_token_34|>": 151708,
+  "<|action_token_350|>": 152024,
+  "<|action_token_351|>": 152025,
+  "<|action_token_352|>": 152026,
+  "<|action_token_353|>": 152027,
+  "<|action_token_354|>": 152028,
+  "<|action_token_355|>": 152029,
+  "<|action_token_356|>": 152030,
+  "<|action_token_357|>": 152031,
+  "<|action_token_358|>": 152032,
+  "<|action_token_359|>": 152033,
+  "<|action_token_35|>": 151709,
+  "<|action_token_360|>": 152034,
+  "<|action_token_361|>": 152035,
+  "<|action_token_362|>": 152036,
+  "<|action_token_363|>": 152037,
+  "<|action_token_364|>": 152038,
+  "<|action_token_365|>": 152039,
+  "<|action_token_366|>": 152040,
+  "<|action_token_367|>": 152041,
+  "<|action_token_368|>": 152042,
+  "<|action_token_369|>": 152043,
+  "<|action_token_36|>": 151710,
+  "<|action_token_370|>": 152044,
+  "<|action_token_371|>": 152045,
+  "<|action_token_372|>": 152046,
+  "<|action_token_373|>": 152047,
+  "<|action_token_374|>": 152048,
+  "<|action_token_375|>": 152049,
+  "<|action_token_376|>": 152050,
+  "<|action_token_377|>": 152051,
+  "<|action_token_378|>": 152052,
+  "<|action_token_379|>": 152053,
+  "<|action_token_37|>": 151711,
+  "<|action_token_380|>": 152054,
+  "<|action_token_381|>": 152055,
+  "<|action_token_382|>": 152056,
+  "<|action_token_383|>": 152057,
+  "<|action_token_384|>": 152058,
+  "<|action_token_385|>": 152059,
+  "<|action_token_386|>": 152060,
+  "<|action_token_387|>": 152061,
+  "<|action_token_388|>": 152062,
+  "<|action_token_389|>": 152063,
+  "<|action_token_38|>": 151712,
+  "<|action_token_390|>": 152064,
+  "<|action_token_391|>": 152065,
+  "<|action_token_392|>": 152066,
+  "<|action_token_393|>": 152067,
+  "<|action_token_394|>": 152068,
+  "<|action_token_395|>": 152069,
+  "<|action_token_396|>": 152070,
+  "<|action_token_397|>": 152071,
+  "<|action_token_398|>": 152072,
+  "<|action_token_399|>": 152073,
+  "<|action_token_39|>": 151713,
+  "<|action_token_3|>": 151677,
+  "<|action_token_400|>": 152074,
+  "<|action_token_401|>": 152075,
+  "<|action_token_402|>": 152076,
+  "<|action_token_403|>": 152077,
+  "<|action_token_404|>": 152078,
+  "<|action_token_405|>": 152079,
+  "<|action_token_406|>": 152080,
+  "<|action_token_407|>": 152081,
+  "<|action_token_408|>": 152082,
+  "<|action_token_409|>": 152083,
+  "<|action_token_40|>": 151714,
+  "<|action_token_410|>": 152084,
+  "<|action_token_411|>": 152085,
+  "<|action_token_412|>": 152086,
+  "<|action_token_413|>": 152087,
+  "<|action_token_414|>": 152088,
+  "<|action_token_415|>": 152089,
+  "<|action_token_416|>": 152090,
+  "<|action_token_417|>": 152091,
+  "<|action_token_418|>": 152092,
+  "<|action_token_419|>": 152093,
+  "<|action_token_41|>": 151715,
+  "<|action_token_420|>": 152094,
+  "<|action_token_421|>": 152095,
+  "<|action_token_422|>": 152096,
+  "<|action_token_423|>": 152097,
+  "<|action_token_424|>": 152098,
+  "<|action_token_425|>": 152099,
+  "<|action_token_426|>": 152100,
+  "<|action_token_427|>": 152101,
+  "<|action_token_428|>": 152102,
+  "<|action_token_429|>": 152103,
+  "<|action_token_42|>": 151716,
+  "<|action_token_430|>": 152104,
+  "<|action_token_431|>": 152105,
+  "<|action_token_432|>": 152106,
+  "<|action_token_433|>": 152107,
+  "<|action_token_434|>": 152108,
+  "<|action_token_435|>": 152109,
+  "<|action_token_436|>": 152110,
+  "<|action_token_437|>": 152111,
+  "<|action_token_438|>": 152112,
+  "<|action_token_439|>": 152113,
+  "<|action_token_43|>": 151717,
+  "<|action_token_440|>": 152114,
+  "<|action_token_441|>": 152115,
+  "<|action_token_442|>": 152116,
+  "<|action_token_443|>": 152117,
+  "<|action_token_444|>": 152118,
+  "<|action_token_445|>": 152119,
+  "<|action_token_446|>": 152120,
+  "<|action_token_447|>": 152121,
+  "<|action_token_448|>": 152122,
+  "<|action_token_449|>": 152123,
+  "<|action_token_44|>": 151718,
+  "<|action_token_450|>": 152124,
+  "<|action_token_451|>": 152125,
+  "<|action_token_452|>": 152126,
+  "<|action_token_453|>": 152127,
+  "<|action_token_454|>": 152128,
+  "<|action_token_455|>": 152129,
+  "<|action_token_456|>": 152130,
+  "<|action_token_457|>": 152131,
+  "<|action_token_458|>": 152132,
+  "<|action_token_459|>": 152133,
+  "<|action_token_45|>": 151719,
+  "<|action_token_460|>": 152134,
+  "<|action_token_461|>": 152135,
+  "<|action_token_462|>": 152136,
+  "<|action_token_463|>": 152137,
+  "<|action_token_464|>": 152138,
+  "<|action_token_465|>": 152139,
+  "<|action_token_466|>": 152140,
+  "<|action_token_467|>": 152141,
+  "<|action_token_468|>": 152142,
+  "<|action_token_469|>": 152143,
+  "<|action_token_46|>": 151720,
+  "<|action_token_470|>": 152144,
+  "<|action_token_471|>": 152145,
+  "<|action_token_472|>": 152146,
+  "<|action_token_473|>": 152147,
+  "<|action_token_474|>": 152148,
+  "<|action_token_475|>": 152149,
+  "<|action_token_476|>": 152150,
+  "<|action_token_477|>": 152151,
+  "<|action_token_478|>": 152152,
+  "<|action_token_479|>": 152153,
+  "<|action_token_47|>": 151721,
+  "<|action_token_480|>": 152154,
+  "<|action_token_481|>": 152155,
+  "<|action_token_482|>": 152156,
+  "<|action_token_483|>": 152157,
+  "<|action_token_484|>": 152158,
+  "<|action_token_485|>": 152159,
+  "<|action_token_486|>": 152160,
+  "<|action_token_487|>": 152161,
+  "<|action_token_488|>": 152162,
+  "<|action_token_489|>": 152163,
+  "<|action_token_48|>": 151722,
+  "<|action_token_490|>": 152164,
+  "<|action_token_491|>": 152165,
+  "<|action_token_492|>": 152166,
+  "<|action_token_493|>": 152167,
+  "<|action_token_494|>": 152168,
+  "<|action_token_495|>": 152169,
+  "<|action_token_496|>": 152170,
+  "<|action_token_497|>": 152171,
+  "<|action_token_498|>": 152172,
+  "<|action_token_499|>": 152173,
+  "<|action_token_49|>": 151723,
+  "<|action_token_4|>": 151678,
+  "<|action_token_500|>": 152174,
+  "<|action_token_501|>": 152175,
+  "<|action_token_502|>": 152176,
+  "<|action_token_503|>": 152177,
+  "<|action_token_504|>": 152178,
+  "<|action_token_505|>": 152179,
+  "<|action_token_506|>": 152180,
+  "<|action_token_507|>": 152181,
+  "<|action_token_508|>": 152182,
+  "<|action_token_509|>": 152183,
+  "<|action_token_50|>": 151724,
+  "<|action_token_510|>": 152184,
+  "<|action_token_511|>": 152185,
+  "<|action_token_512|>": 152186,
+  "<|action_token_513|>": 152187,
+  "<|action_token_514|>": 152188,
+  "<|action_token_515|>": 152189,
+  "<|action_token_516|>": 152190,
+  "<|action_token_517|>": 152191,
+  "<|action_token_518|>": 152192,
+  "<|action_token_519|>": 152193,
+  "<|action_token_51|>": 151725,
+  "<|action_token_520|>": 152194,
+  "<|action_token_521|>": 152195,
+  "<|action_token_522|>": 152196,
+  "<|action_token_523|>": 152197,
+  "<|action_token_524|>": 152198,
+  "<|action_token_525|>": 152199,
+  "<|action_token_526|>": 152200,
+  "<|action_token_527|>": 152201,
+  "<|action_token_528|>": 152202,
+  "<|action_token_529|>": 152203,
+  "<|action_token_52|>": 151726,
+  "<|action_token_530|>": 152204,
+  "<|action_token_531|>": 152205,
+  "<|action_token_532|>": 152206,
+  "<|action_token_533|>": 152207,
+  "<|action_token_534|>": 152208,
+  "<|action_token_535|>": 152209,
+  "<|action_token_536|>": 152210,
+  "<|action_token_537|>": 152211,
+  "<|action_token_538|>": 152212,
+  "<|action_token_539|>": 152213,
+  "<|action_token_53|>": 151727,
+  "<|action_token_540|>": 152214,
+  "<|action_token_541|>": 152215,
+  "<|action_token_542|>": 152216,
+  "<|action_token_543|>": 152217,
+  "<|action_token_544|>": 152218,
+  "<|action_token_545|>": 152219,
+  "<|action_token_546|>": 152220,
+  "<|action_token_547|>": 152221,
+  "<|action_token_548|>": 152222,
+  "<|action_token_549|>": 152223,
+  "<|action_token_54|>": 151728,
+  "<|action_token_550|>": 152224,
+  "<|action_token_551|>": 152225,
+  "<|action_token_552|>": 152226,
+  "<|action_token_553|>": 152227,
+  "<|action_token_554|>": 152228,
+  "<|action_token_555|>": 152229,
+  "<|action_token_556|>": 152230,
+  "<|action_token_557|>": 152231,
+  "<|action_token_558|>": 152232,
+  "<|action_token_559|>": 152233,
+  "<|action_token_55|>": 151729,
+  "<|action_token_560|>": 152234,
+  "<|action_token_561|>": 152235,
+  "<|action_token_562|>": 152236,
+  "<|action_token_563|>": 152237,
+  "<|action_token_564|>": 152238,
+  "<|action_token_565|>": 152239,
+  "<|action_token_566|>": 152240,
+  "<|action_token_567|>": 152241,
+  "<|action_token_568|>": 152242,
+  "<|action_token_569|>": 152243,
+  "<|action_token_56|>": 151730,
+  "<|action_token_570|>": 152244,
+  "<|action_token_571|>": 152245,
+  "<|action_token_572|>": 152246,
+  "<|action_token_573|>": 152247,
+  "<|action_token_574|>": 152248,
+  "<|action_token_575|>": 152249,
+  "<|action_token_576|>": 152250,
+  "<|action_token_577|>": 152251,
+  "<|action_token_578|>": 152252,
+  "<|action_token_579|>": 152253,
+  "<|action_token_57|>": 151731,
+  "<|action_token_580|>": 152254,
+  "<|action_token_581|>": 152255,
+  "<|action_token_582|>": 152256,
+  "<|action_token_583|>": 152257,
+  "<|action_token_584|>": 152258,
+  "<|action_token_585|>": 152259,
+  "<|action_token_586|>": 152260,
+  "<|action_token_587|>": 152261,
+  "<|action_token_588|>": 152262,
+  "<|action_token_589|>": 152263,
+  "<|action_token_58|>": 151732,
+  "<|action_token_590|>": 152264,
+  "<|action_token_591|>": 152265,
+  "<|action_token_592|>": 152266,
+  "<|action_token_593|>": 152267,
+  "<|action_token_594|>": 152268,
+  "<|action_token_595|>": 152269,
+  "<|action_token_596|>": 152270,
+  "<|action_token_597|>": 152271,
+  "<|action_token_598|>": 152272,
+  "<|action_token_599|>": 152273,
+  "<|action_token_59|>": 151733,
+  "<|action_token_5|>": 151679,
+  "<|action_token_600|>": 152274,
+  "<|action_token_601|>": 152275,
+  "<|action_token_602|>": 152276,
+  "<|action_token_603|>": 152277,
+  "<|action_token_604|>": 152278,
+  "<|action_token_605|>": 152279,
+  "<|action_token_606|>": 152280,
+  "<|action_token_607|>": 152281,
+  "<|action_token_608|>": 152282,
+  "<|action_token_609|>": 152283,
+  "<|action_token_60|>": 151734,
+  "<|action_token_610|>": 152284,
+  "<|action_token_611|>": 152285,
+  "<|action_token_612|>": 152286,
+  "<|action_token_613|>": 152287,
+  "<|action_token_614|>": 152288,
+  "<|action_token_615|>": 152289,
+  "<|action_token_616|>": 152290,
+  "<|action_token_617|>": 152291,
+  "<|action_token_618|>": 152292,
+  "<|action_token_619|>": 152293,
+  "<|action_token_61|>": 151735,
+  "<|action_token_620|>": 152294,
+  "<|action_token_621|>": 152295,
+  "<|action_token_622|>": 152296,
+  "<|action_token_623|>": 152297,
+  "<|action_token_624|>": 152298,
+  "<|action_token_625|>": 152299,
+  "<|action_token_626|>": 152300,
+  "<|action_token_627|>": 152301,
+  "<|action_token_628|>": 152302,
+  "<|action_token_629|>": 152303,
+  "<|action_token_62|>": 151736,
+  "<|action_token_630|>": 152304,
+  "<|action_token_631|>": 152305,
+  "<|action_token_632|>": 152306,
+  "<|action_token_633|>": 152307,
+  "<|action_token_634|>": 152308,
+  "<|action_token_635|>": 152309,
+  "<|action_token_636|>": 152310,
+  "<|action_token_637|>": 152311,
+  "<|action_token_638|>": 152312,
+  "<|action_token_639|>": 152313,
+  "<|action_token_63|>": 151737,
+  "<|action_token_640|>": 152314,
+  "<|action_token_641|>": 152315,
+  "<|action_token_642|>": 152316,
+  "<|action_token_643|>": 152317,
+  "<|action_token_644|>": 152318,
+  "<|action_token_645|>": 152319,
+  "<|action_token_646|>": 152320,
+  "<|action_token_647|>": 152321,
+  "<|action_token_648|>": 152322,
+  "<|action_token_649|>": 152323,
+  "<|action_token_64|>": 151738,
+  "<|action_token_650|>": 152324,
+  "<|action_token_651|>": 152325,
+  "<|action_token_652|>": 152326,
+  "<|action_token_653|>": 152327,
+  "<|action_token_654|>": 152328,
+  "<|action_token_655|>": 152329,
+  "<|action_token_656|>": 152330,
+  "<|action_token_657|>": 152331,
+  "<|action_token_658|>": 152332,
+  "<|action_token_659|>": 152333,
+  "<|action_token_65|>": 151739,
+  "<|action_token_660|>": 152334,
+  "<|action_token_661|>": 152335,
+  "<|action_token_662|>": 152336,
+  "<|action_token_663|>": 152337,
+  "<|action_token_664|>": 152338,
+  "<|action_token_665|>": 152339,
+  "<|action_token_666|>": 152340,
+  "<|action_token_667|>": 152341,
+  "<|action_token_668|>": 152342,
+  "<|action_token_669|>": 152343,
+  "<|action_token_66|>": 151740,
+  "<|action_token_670|>": 152344,
+  "<|action_token_671|>": 152345,
+  "<|action_token_672|>": 152346,
+  "<|action_token_673|>": 152347,
+  "<|action_token_674|>": 152348,
+  "<|action_token_675|>": 152349,
+  "<|action_token_676|>": 152350,
+  "<|action_token_677|>": 152351,
+  "<|action_token_678|>": 152352,
+  "<|action_token_679|>": 152353,
+  "<|action_token_67|>": 151741,
+  "<|action_token_680|>": 152354,
+  "<|action_token_681|>": 152355,
+  "<|action_token_682|>": 152356,
+  "<|action_token_683|>": 152357,
+  "<|action_token_684|>": 152358,
+  "<|action_token_685|>": 152359,
+  "<|action_token_686|>": 152360,
+  "<|action_token_687|>": 152361,
+  "<|action_token_688|>": 152362,
+  "<|action_token_689|>": 152363,
+  "<|action_token_68|>": 151742,
+  "<|action_token_690|>": 152364,
+  "<|action_token_691|>": 152365,
+  "<|action_token_692|>": 152366,
+  "<|action_token_693|>": 152367,
+  "<|action_token_694|>": 152368,
+  "<|action_token_695|>": 152369,
+  "<|action_token_696|>": 152370,
+  "<|action_token_697|>": 152371,
+  "<|action_token_698|>": 152372,
+  "<|action_token_699|>": 152373,
+  "<|action_token_69|>": 151743,
+  "<|action_token_6|>": 151680,
+  "<|action_token_700|>": 152374,
+  "<|action_token_701|>": 152375,
+  "<|action_token_702|>": 152376,
+  "<|action_token_703|>": 152377,
+  "<|action_token_704|>": 152378,
+  "<|action_token_705|>": 152379,
+  "<|action_token_706|>": 152380,
+  "<|action_token_707|>": 152381,
+  "<|action_token_708|>": 152382,
+  "<|action_token_709|>": 152383,
+  "<|action_token_70|>": 151744,
+  "<|action_token_710|>": 152384,
+  "<|action_token_711|>": 152385,
+  "<|action_token_712|>": 152386,
+  "<|action_token_713|>": 152387,
+  "<|action_token_714|>": 152388,
+  "<|action_token_715|>": 152389,
+  "<|action_token_716|>": 152390,
+  "<|action_token_717|>": 152391,
+  "<|action_token_718|>": 152392,
+  "<|action_token_719|>": 152393,
+  "<|action_token_71|>": 151745,
+  "<|action_token_720|>": 152394,
+  "<|action_token_721|>": 152395,
+  "<|action_token_722|>": 152396,
+  "<|action_token_723|>": 152397,
+  "<|action_token_724|>": 152398,
+  "<|action_token_725|>": 152399,
+  "<|action_token_726|>": 152400,
+  "<|action_token_727|>": 152401,
+  "<|action_token_728|>": 152402,
+  "<|action_token_729|>": 152403,
+  "<|action_token_72|>": 151746,
+  "<|action_token_730|>": 152404,
+  "<|action_token_731|>": 152405,
+  "<|action_token_732|>": 152406,
+  "<|action_token_733|>": 152407,
+  "<|action_token_734|>": 152408,
+  "<|action_token_735|>": 152409,
+  "<|action_token_736|>": 152410,
+  "<|action_token_737|>": 152411,
+  "<|action_token_738|>": 152412,
+  "<|action_token_739|>": 152413,
+  "<|action_token_73|>": 151747,
+  "<|action_token_740|>": 152414,
+  "<|action_token_741|>": 152415,
+  "<|action_token_742|>": 152416,
+  "<|action_token_743|>": 152417,
+  "<|action_token_744|>": 152418,
+  "<|action_token_745|>": 152419,
+  "<|action_token_746|>": 152420,
+  "<|action_token_747|>": 152421,
+  "<|action_token_748|>": 152422,
+  "<|action_token_749|>": 152423,
+  "<|action_token_74|>": 151748,
+  "<|action_token_750|>": 152424,
+  "<|action_token_751|>": 152425,
+  "<|action_token_752|>": 152426,
+  "<|action_token_753|>": 152427,
+  "<|action_token_754|>": 152428,
+  "<|action_token_755|>": 152429,
+  "<|action_token_756|>": 152430,
+  "<|action_token_757|>": 152431,
+  "<|action_token_758|>": 152432,
+  "<|action_token_759|>": 152433,
+  "<|action_token_75|>": 151749,
+  "<|action_token_760|>": 152434,
+  "<|action_token_761|>": 152435,
+  "<|action_token_762|>": 152436,
+  "<|action_token_763|>": 152437,
+  "<|action_token_764|>": 152438,
+  "<|action_token_765|>": 152439,
+  "<|action_token_766|>": 152440,
+  "<|action_token_767|>": 152441,
+  "<|action_token_768|>": 152442,
+  "<|action_token_769|>": 152443,
+  "<|action_token_76|>": 151750,
+  "<|action_token_770|>": 152444,
+  "<|action_token_771|>": 152445,
+  "<|action_token_772|>": 152446,
+  "<|action_token_773|>": 152447,
+  "<|action_token_774|>": 152448,
+  "<|action_token_775|>": 152449,
+  "<|action_token_776|>": 152450,
+  "<|action_token_777|>": 152451,
+  "<|action_token_778|>": 152452,
+  "<|action_token_779|>": 152453,
+  "<|action_token_77|>": 151751,
+  "<|action_token_780|>": 152454,
+  "<|action_token_781|>": 152455,
+  "<|action_token_782|>": 152456,
+  "<|action_token_783|>": 152457,
+  "<|action_token_784|>": 152458,
+  "<|action_token_785|>": 152459,
+  "<|action_token_786|>": 152460,
+  "<|action_token_787|>": 152461,
+  "<|action_token_788|>": 152462,
+  "<|action_token_789|>": 152463,
+  "<|action_token_78|>": 151752,
+  "<|action_token_790|>": 152464,
+  "<|action_token_791|>": 152465,
+  "<|action_token_792|>": 152466,
+  "<|action_token_793|>": 152467,
+  "<|action_token_794|>": 152468,
+  "<|action_token_795|>": 152469,
+  "<|action_token_796|>": 152470,
+  "<|action_token_797|>": 152471,
+  "<|action_token_798|>": 152472,
+  "<|action_token_799|>": 152473,
+  "<|action_token_79|>": 151753,
+  "<|action_token_7|>": 151681,
+  "<|action_token_800|>": 152474,
+  "<|action_token_801|>": 152475,
+  "<|action_token_802|>": 152476,
+  "<|action_token_803|>": 152477,
+  "<|action_token_804|>": 152478,
+  "<|action_token_805|>": 152479,
+  "<|action_token_806|>": 152480,
+  "<|action_token_807|>": 152481,
+  "<|action_token_808|>": 152482,
+  "<|action_token_809|>": 152483,
+  "<|action_token_80|>": 151754,
+  "<|action_token_810|>": 152484,
+  "<|action_token_811|>": 152485,
+  "<|action_token_812|>": 152486,
+  "<|action_token_813|>": 152487,
+  "<|action_token_814|>": 152488,
+  "<|action_token_815|>": 152489,
+  "<|action_token_816|>": 152490,
+  "<|action_token_817|>": 152491,
+  "<|action_token_818|>": 152492,
+  "<|action_token_819|>": 152493,
+  "<|action_token_81|>": 151755,
+  "<|action_token_820|>": 152494,
+  "<|action_token_821|>": 152495,
+  "<|action_token_822|>": 152496,
+  "<|action_token_823|>": 152497,
+  "<|action_token_824|>": 152498,
+  "<|action_token_825|>": 152499,
+  "<|action_token_826|>": 152500,
+  "<|action_token_827|>": 152501,
+  "<|action_token_828|>": 152502,
+  "<|action_token_829|>": 152503,
+  "<|action_token_82|>": 151756,
+  "<|action_token_830|>": 152504,
+  "<|action_token_831|>": 152505,
+  "<|action_token_832|>": 152506,
+  "<|action_token_833|>": 152507,
+  "<|action_token_834|>": 152508,
+  "<|action_token_835|>": 152509,
+  "<|action_token_836|>": 152510,
+  "<|action_token_837|>": 152511,
+  "<|action_token_838|>": 152512,
+  "<|action_token_839|>": 152513,
+  "<|action_token_83|>": 151757,
+  "<|action_token_840|>": 152514,
+  "<|action_token_841|>": 152515,
+  "<|action_token_842|>": 152516,
+  "<|action_token_843|>": 152517,
+  "<|action_token_844|>": 152518,
+  "<|action_token_845|>": 152519,
+  "<|action_token_846|>": 152520,
+  "<|action_token_847|>": 152521,
+  "<|action_token_848|>": 152522,
+  "<|action_token_849|>": 152523,
+  "<|action_token_84|>": 151758,
+  "<|action_token_850|>": 152524,
+  "<|action_token_851|>": 152525,
+  "<|action_token_852|>": 152526,
+  "<|action_token_853|>": 152527,
+  "<|action_token_854|>": 152528,
+  "<|action_token_855|>": 152529,
+  "<|action_token_856|>": 152530,
+  "<|action_token_857|>": 152531,
+  "<|action_token_858|>": 152532,
+  "<|action_token_859|>": 152533,
+  "<|action_token_85|>": 151759,
+  "<|action_token_860|>": 152534,
+  "<|action_token_861|>": 152535,
+  "<|action_token_862|>": 152536,
+  "<|action_token_863|>": 152537,
+  "<|action_token_864|>": 152538,
+  "<|action_token_865|>": 152539,
+  "<|action_token_866|>": 152540,
+  "<|action_token_867|>": 152541,
+  "<|action_token_868|>": 152542,
+  "<|action_token_869|>": 152543,
+  "<|action_token_86|>": 151760,
+  "<|action_token_870|>": 152544,
+  "<|action_token_871|>": 152545,
+  "<|action_token_872|>": 152546,
+  "<|action_token_873|>": 152547,
+  "<|action_token_874|>": 152548,
+  "<|action_token_875|>": 152549,
+  "<|action_token_876|>": 152550,
+  "<|action_token_877|>": 152551,
+  "<|action_token_878|>": 152552,
+  "<|action_token_879|>": 152553,
+  "<|action_token_87|>": 151761,
+  "<|action_token_880|>": 152554,
+  "<|action_token_881|>": 152555,
+  "<|action_token_882|>": 152556,
+  "<|action_token_883|>": 152557,
+  "<|action_token_884|>": 152558,
+  "<|action_token_885|>": 152559,
+  "<|action_token_886|>": 152560,
+  "<|action_token_887|>": 152561,
+  "<|action_token_888|>": 152562,
+  "<|action_token_889|>": 152563,
+  "<|action_token_88|>": 151762,
+  "<|action_token_890|>": 152564,
+  "<|action_token_891|>": 152565,
+  "<|action_token_892|>": 152566,
+  "<|action_token_893|>": 152567,
+  "<|action_token_894|>": 152568,
+  "<|action_token_895|>": 152569,
+  "<|action_token_896|>": 152570,
+  "<|action_token_897|>": 152571,
+  "<|action_token_898|>": 152572,
+  "<|action_token_899|>": 152573,
+  "<|action_token_89|>": 151763,
+  "<|action_token_8|>": 151682,
+  "<|action_token_900|>": 152574,
+  "<|action_token_901|>": 152575,
+  "<|action_token_902|>": 152576,
+  "<|action_token_903|>": 152577,
+  "<|action_token_904|>": 152578,
+  "<|action_token_905|>": 152579,
+  "<|action_token_906|>": 152580,
+  "<|action_token_907|>": 152581,
+  "<|action_token_908|>": 152582,
+  "<|action_token_909|>": 152583,
+  "<|action_token_90|>": 151764,
+  "<|action_token_910|>": 152584,
+  "<|action_token_911|>": 152585,
+  "<|action_token_912|>": 152586,
+  "<|action_token_913|>": 152587,
+  "<|action_token_914|>": 152588,
+  "<|action_token_915|>": 152589,
+  "<|action_token_916|>": 152590,
+  "<|action_token_917|>": 152591,
+  "<|action_token_918|>": 152592,
+  "<|action_token_919|>": 152593,
+  "<|action_token_91|>": 151765,
+  "<|action_token_920|>": 152594,
+  "<|action_token_921|>": 152595,
+  "<|action_token_922|>": 152596,
+  "<|action_token_923|>": 152597,
+  "<|action_token_924|>": 152598,
+  "<|action_token_925|>": 152599,
+  "<|action_token_926|>": 152600,
+  "<|action_token_927|>": 152601,
+  "<|action_token_928|>": 152602,
+  "<|action_token_929|>": 152603,
+  "<|action_token_92|>": 151766,
+  "<|action_token_930|>": 152604,
+  "<|action_token_931|>": 152605,
+  "<|action_token_932|>": 152606,
+  "<|action_token_933|>": 152607,
+  "<|action_token_934|>": 152608,
+  "<|action_token_935|>": 152609,
+  "<|action_token_936|>": 152610,
+  "<|action_token_937|>": 152611,
+  "<|action_token_938|>": 152612,
+  "<|action_token_939|>": 152613,
+  "<|action_token_93|>": 151767,
+  "<|action_token_940|>": 152614,
+  "<|action_token_941|>": 152615,
+  "<|action_token_942|>": 152616,
+  "<|action_token_943|>": 152617,
+  "<|action_token_944|>": 152618,
+  "<|action_token_945|>": 152619,
+  "<|action_token_946|>": 152620,
+  "<|action_token_947|>": 152621,
+  "<|action_token_948|>": 152622,
+  "<|action_token_949|>": 152623,
+  "<|action_token_94|>": 151768,
+  "<|action_token_950|>": 152624,
+  "<|action_token_951|>": 152625,
+  "<|action_token_952|>": 152626,
+  "<|action_token_953|>": 152627,
+  "<|action_token_954|>": 152628,
+  "<|action_token_955|>": 152629,
+  "<|action_token_956|>": 152630,
+  "<|action_token_957|>": 152631,
+  "<|action_token_958|>": 152632,
+  "<|action_token_959|>": 152633,
+  "<|action_token_95|>": 151769,
+  "<|action_token_960|>": 152634,
+  "<|action_token_961|>": 152635,
+  "<|action_token_962|>": 152636,
+  "<|action_token_963|>": 152637,
+  "<|action_token_964|>": 152638,
+  "<|action_token_965|>": 152639,
+  "<|action_token_966|>": 152640,
+  "<|action_token_967|>": 152641,
+  "<|action_token_968|>": 152642,
+  "<|action_token_969|>": 152643,
+  "<|action_token_96|>": 151770,
+  "<|action_token_970|>": 152644,
+  "<|action_token_971|>": 152645,
+  "<|action_token_972|>": 152646,
+  "<|action_token_973|>": 152647,
+  "<|action_token_974|>": 152648,
+  "<|action_token_975|>": 152649,
+  "<|action_token_976|>": 152650,
+  "<|action_token_977|>": 152651,
+  "<|action_token_978|>": 152652,
+  "<|action_token_979|>": 152653,
+  "<|action_token_97|>": 151771,
+  "<|action_token_980|>": 152654,
+  "<|action_token_981|>": 152655,
+  "<|action_token_982|>": 152656,
+  "<|action_token_983|>": 152657,
+  "<|action_token_984|>": 152658,
+  "<|action_token_985|>": 152659,
+  "<|action_token_986|>": 152660,
+  "<|action_token_987|>": 152661,
+  "<|action_token_988|>": 152662,
+  "<|action_token_989|>": 152663,
+  "<|action_token_98|>": 151772,
+  "<|action_token_990|>": 152664,
+  "<|action_token_991|>": 152665,
+  "<|action_token_992|>": 152666,
+  "<|action_token_993|>": 152667,
+  "<|action_token_994|>": 152668,
+  "<|action_token_995|>": 152669,
+  "<|action_token_996|>": 152670,
+  "<|action_token_997|>": 152671,
+  "<|action_token_998|>": 152672,
+  "<|action_token_999|>": 152673,
+  "<|action_token_99|>": 151773,
+  "<|action_token_9|>": 151683,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|goal_repr|>": 151672,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|obs_repr|>": 151673,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,120 @@

+{#- Chat template: renders `messages` as '<|im_start|>{role}\n ... <|im_end|>\n' turns. #}
+{#- Variables read: `messages`, `tools`, `add_vision_id`, `add_generation_prompt`. #}
+{#- Only messages[0] is rendered as a system turn (here, at the top); the main loop further down has branches for the user, assistant and tool roles only. #}
+{#- When `tools` is set, a system turn is always emitted: the optional system text first, then the tool signatures and the <tool_call> calling convention. #}
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {%- if messages[0].content is string %}
+            {{- messages[0].content }}
+        {%- else %}
+            {%- for content in messages[0].content %}
+                {%- if 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' }}
+        {%- if messages[0].content is string %}
+            {{- messages[0].content }}
+        {%- else %}
+            {%- for content in messages[0].content %}
+                {%- if 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{#- Running image/video counters for the optional 'Picture N: ' / 'Video N: ' labels. They live in namespace objects so that increments made inside the nested loops carry over from one message to the next. #}
+{%- set image_count = namespace(value=0) %}
+{%- set video_count = namespace(value=0) %}
+{%- for message in messages %}
+    {%- if message.role == "user" %}
+        {{- '<|im_start|>' + message.role + '\n' }}
+        {%- if message.content is string %}
+            {{- message.content }}
+        {%- else %}
+            {%- for content in message.content %}
+                {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                    {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
+                    <|vision_start|><|image_pad|><|vision_end|>
+                {%- elif content.type == 'video' or 'video' in content %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                    {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
+                    <|vision_start|><|video_pad|><|vision_end|>
+                {%- elif 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role + '\n' }}
+        {%- if message.content is string %}
+            {{- message.content }}
+        {%- else %}
+            {%- for content_item in message.content %}
+                {%- if 'text' in content_item %}
+                    {{- content_item.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {#- Each tool call is written as a <tool_call> JSON object; a newline separates it from preceding content or from the previous call. A call wrapped as {function: {...}} is unwrapped first, and string arguments are emitted verbatim while other values go through tojson. #}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and message.content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {#- A run of consecutive tool messages shares a single user turn: the turn is opened only when the previous message is not a tool message and closed only when the next one is not. Each message gets its own <tool_response> wrapper. #}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {%- if message.content is string %}
+            {{- message.content }}
+        {%- else %}
+            {%- for content in message.content %}
+                {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
+                    {%- set image_count.value = image_count.value + 1 %}
+                    {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
+                    <|vision_start|><|image_pad|><|vision_end|>
+                {%- elif content.type == 'video' or 'video' in content %}
+                    {%- set video_count.value = video_count.value + 1 %}
+                    {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
+                    <|vision_start|><|video_pad|><|vision_end|>
+                {%- elif 'text' in content %}
+                    {{- content.text }}
+                {%- endif %}
+            {%- endfor %}
+        {%- endif %}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{#- With add_generation_prompt set, finish by opening an assistant turn for the model to complete. #}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

config.json ADDED Viewed

	@@ -0,0 +1,155 @@

+{
+  "action_chunk_size": 20,
+  "action_expert_config": {
+    "action_end_token_id": null,
+    "action_start_token_id": 151669,
+    "action_token_id": 151670,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "crl_goal_repr_token_id": 151672,
+    "crl_obs_repr_token_id": 151673,
+    "dtype": "bfloat16",
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 1280,
+    "image_token_id": 151655,
+    "initializer_range": 0.02,
+    "intermediate_size": 2432,
+    "max_position_embeddings": 262144,
+    "model_type": "prts_qwen3_vl_text",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": {
+      "mrope_interleaved": true,
+      "mrope_section": [
+        24,
+        20,
+        20
+      ],
+      "rope_type": "default"
+    },
+    "rope_theta": 5000000,
+    "tie_word_embeddings": true,
+    "use_cache": true,
+    "video_token_id": 151656,
+    "vision_start_token_id": 151652,
+    "vocab_size": 153722
+  },
+  "action_start_token_id": 151669,
+  "architectures": [
+    "PRTS_Qwen3VL"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_prts_qwen3_vl.PRTS_FlowMatchingConfig_Qwen3VL",
+    "AutoModel": "modeling_prts_qwen3_vl.PRTS_Qwen3VL"
+  },
+  "crl_embed_dim": 256,
+  "crl_encoder_init_w": 0.001,
+  "crl_goal_repr_token_id": 151672,
+  "crl_logsumexp_reg_weight": 0.0,
+  "crl_loss_weight": 0.0,
+  "crl_obs_repr_token_id": 151673,
+  "crl_repr_norm": true,
+  "dit_action_head_config": {
+    "add_pos_embed": true,
+    "attend_text_every_n_blocks": 2,
+    "attention_head_dim": 48,
+    "attn_implementation": "sdpa",
+    "dropout": 0.2,
+    "final_dropout": true,
+    "interleave_self_attention": true,
+    "mlp_mult": 4,
+    "noise_beta_alpha": 1.5,
+    "noise_beta_beta": 1.0,
+    "noise_s": 0.999,
+    "norm_type": "ada_norm",
+    "num_attention_heads": 32,
+    "num_layers": 16,
+    "num_timestep_buckets": 1000,
+    "output_dim": 1024,
+    "use_alternate_vl_dit": true,
+    "use_mot_action_expert": true
+  },
+  "dtype": "bfloat16",
+  "embodiment_tag": "libero_panda",
+  "flow_matching_action_loss_weight": 1.0,
+  "flow_matching_sub_goal_loss_weight": 0.0,
+  "image_token_id": 151655,
+  "label2id": null,
+  "max_action_dim": 32,
+  "model_type": "prts_qwen3_vl",
+  "num_denoise_steps": 5,
+  "pad_token_id": 151643,
+  "text_config": {
+    "action_end_token_id": null,
+    "action_start_token_id": 151669,
+    "action_token_id": 151670,
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "crl_goal_repr_token_id": 151672,
+    "crl_obs_repr_token_id": 151673,
+    "dtype": "bfloat16",
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "image_token_id": 151655,
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "max_position_embeddings": 262144,
+    "model_type": "prts_qwen3_vl_text",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": {
+      "mrope_interleaved": true,
+      "mrope_section": [
+        24,
+        20,
+        20
+      ],
+      "rope_type": "default"
+    },
+    "rope_theta": 5000000,
+    "tie_word_embeddings": true,
+    "use_cache": false,
+    "video_token_id": 151656,
+    "vision_start_token_id": 151652,
+    "vocab_size": 153722
+  },
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.3",
+  "use_cache": true,
+  "use_fast_action_tokenizer": true,
+  "video_token_id": 151656,
+  "vision_config": {
+    "deepstack_visual_indexes": [
+      5,
+      11,
+      17
+    ],
+    "depth": 24,
+    "dtype": "bfloat16",
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1024,
+    "in_channels": 3,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "model_type": "qwen3_vl",
+    "num_heads": 16,
+    "num_position_embeddings": 2304,
+    "out_hidden_size": 2560,
+    "patch_size": 16,
+    "spatial_merge_size": 2,
+    "temporal_patch_size": 2
+  },
+  "vision_end_token_id": 151653,
+  "vision_start_token_id": 151652,
+  "vocab_size": 153722
+}

configuration_prts_qwen3_vl.py ADDED Viewed

	@@ -0,0 +1,345 @@

+# Copyright 2025 TeleAI Rhodes Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Configuration classes for PRTS built on Qwen3-VL."""
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_rope_utils import rope_config_validation
+from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLVisionConfig
+class PRTS_Qwen3VLTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a PRTS Text Model based on Qwen3-VL.
+    It extends PretrainedConfig with Qwen3-VL text model parameters and PRTS-specific parameters.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 151936):
+            Vocabulary size of the Qwen3VL model.
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 22016):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer.
+        num_key_value_heads (`int`, *optional*, defaults to 32):
+            Number of key-value heads for Grouped Query Attention.
+        head_dim (`int`, *optional*, defaults to 128):
+            The dimension of the head.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function.
+        max_position_embeddings (`int`, *optional*, defaults to 128000):
+            The maximum sequence length.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 5000000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        image_token_id (`int`, *optional*):
+            Token index used as placeholder for image embeddings.
+        video_token_id (`int`, *optional*):
+            Token index used as placeholder for video embeddings.
+        action_token_id (`int`, *optional*):
+            Token index used as placeholder for action embeddings.
+        action_start_token_id (`int`, *optional*):
+            Token index for action sequence start.
+        action_end_token_id (`int`, *optional*):
+            Token index for action sequence end.
+        vision_start_token_id (`int`, *optional*):
+            Token index for vision sequence start.
+        **kwargs:
+            Additional keyword arguments passed to PretrainedConfig.
+    """
+    model_type = "prts_qwen3_vl_text"   # TODO (zy): check if this is correct
+    base_config_key = "text_config"
+    def __init__(
+        self,
+        vocab_size=151936,
+        hidden_size=4096,
+        intermediate_size=22016,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=32,
+        head_dim=128,
+        hidden_act="silu",
+        max_position_embeddings=128000,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=5000000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        # PRTS specific
+        action_token_id=None,
+        action_start_token_id=None,
+        action_end_token_id=None,
+        crl_goal_repr_token_id=None,
+        crl_obs_repr_token_id=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        # Validate rope config
+        rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"})
+        # PRTS specific token IDs
+        self.action_token_id = action_token_id
+        self.action_start_token_id = action_start_token_id
+        self.action_end_token_id = action_end_token_id
+        self.crl_goal_repr_token_id = crl_goal_repr_token_id
+        self.crl_obs_repr_token_id = crl_obs_repr_token_id
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+class PRTS_FlowMatchingConfig_Qwen3VL(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a PRTS model based on Qwen3-VL.
+    It extends PretrainedConfig with Qwen3-VL model parameters and PRTS-specific parameters for action prediction.
+    [`PRTS_FlowMatchingConfig_Qwen3VL`] is the configuration class to store the configuration of a PRTS model. It is used to
+    instantiate a PRTS model according to the specified arguments, defining the vision encoder, text encoder,
+    action expert, and flow matching components.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
+    Attribute lookup note: once ``text_config`` is set, any attribute that exists in
+    ``text_config.__dict__`` is served from the text config (see ``__getattribute__``),
+    except for ``dtype`` and ``_attn_implementation_internal``.
+    Args:
+        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `PRTS_Qwen3VLTextConfig`):
+            The config object or dictionary of the text backbone. When `None`, a text config is
+            built from `**kwargs` (backward-compatibility path).
+        vision_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Qwen3VLVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 151655):
+            Token id for image placeholders.
+        video_token_id (`int`, *optional*, defaults to 151656):
+            Token id for video placeholders.
+        vision_start_token_id (`int`, *optional*, defaults to 151652):
+            Token id for vision start marker.
+        vision_end_token_id (`int`, *optional*, defaults to 151653):
+            Token id for vision end marker.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Forwarded to `PretrainedConfig.__init__`.
+        max_action_dim (`int`, *optional*, defaults to 32):
+            Maximum dimension of action vectors. Used for padding different robot action spaces.
+        action_chunk_size (`int`, *optional*, defaults to 50):
+            Number of action timesteps to predict in each forward pass.
+        num_denoise_steps (`int`, *optional*, defaults to 4):
+            Number of denoising steps for flow matching during inference.
+        flow_matching_action_loss_weight (`float`, *optional*, defaults to 0.0):
+            Weight for the flow matching action loss.
+        use_fast_action_tokenizer (`bool`, *optional*, defaults to `True`):
+            Flag stored on the config as-is. Presumably toggles the FAST action tokenizer path;
+            confirm against the model code.
+        embodiment_tag (*optional*, defaults to `None`):
+            Identifies the robot embodiment used for finetuning. Stores the delta_action_mask key
+            so eval code can recover it without needing the training dataset config.
+        dit_action_head_config (`dict`, *optional*):
+            Overrides merged on top of the built-in DiT action head defaults defined in `__init__`.
+            The merged dictionary is stored as `self.dit_action_head_config`; the caller's
+            dictionary is not modified.
+        crl_loss_weight (`float`, *optional*, defaults to 0.0):
+            Weight for the Contrastive Reinforcement Learning (CRL) loss. Set to 0 to disable.
+        crl_embed_dim (`int`, *optional*, defaults to 256):
+            Dimension of the CRL embedding space for action and goal encoders.
+        crl_logsumexp_reg_weight (`float`, *optional*, defaults to 0.0):
+            Weight for logsumexp regularization on CRL logits.
+        crl_encoder_init_w (`float`, *optional*, defaults to 1e-12):
+            Cold initialization weight for the encoder last layer.
+        crl_repr_norm (`bool`, *optional*, defaults to `True`):
+            Whether to L2-normalize CRL representations.
+        **kwargs:
+            Additional keyword arguments passed to PretrainedConfig (and, when `text_config` is
+            `None`, also used to build the text config).
+    Example:
+    ```python
+    >>> from prts.models import PRTS_FlowMatchingConfig_Qwen3VL, PRTS_Qwen3VL
+    >>> # Initializing a PRTS Qwen3-VL configuration
+    >>> configuration = PRTS_FlowMatchingConfig_Qwen3VL()
+    >>> # Initializing a model from the configuration
+    >>> model = PRTS_Qwen3VL(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+    model_type = "prts_qwen3_vl"
+    sub_configs = {
+        "vision_config": Qwen3VLVisionConfig,
+        "text_config": PRTS_Qwen3VLTextConfig,
+    }
+    keys_to_ignore_at_inference = ["past_key_values"]
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        image_token_id=151655,
+        video_token_id=151656,
+        vision_start_token_id=151652,
+        vision_end_token_id=151653,
+        tie_word_embeddings=False,
+        # PRTS specific
+        max_action_dim=32,
+        action_chunk_size=50,
+        num_denoise_steps=4,
+        flow_matching_action_loss_weight=0.,
+        use_fast_action_tokenizer=True,
+        # Embodiment tag: identifies the robot embodiment used for finetuning.
+        # Stores the delta_action_mask key so eval code can recover it without
+        # needing the training dataset config.
+        embodiment_tag=None,
+        # DiT action head config
+        dit_action_head_config=None,
+        # CRL (Contrastive Reinforcement Learning) parameters
+        crl_loss_weight=0.,
+        crl_embed_dim=256,
+        crl_logsumexp_reg_weight=0.0,
+        crl_encoder_init_w=1e-12,  # Cold initialization weight for encoder last layer
+        crl_repr_norm=True,  # Whether to L2-normalize CRL representations
+        **kwargs,
+    ):
+        # Initialize vision config
+        if isinstance(vision_config, dict):
+            self.vision_config = self.sub_configs["vision_config"](**vision_config)
+        elif vision_config is None:
+            self.vision_config = self.sub_configs["vision_config"]()
+        else:
+            # Already-instantiated config object: keep it as is. Without this branch the
+            # attribute would never be set and later accesses would raise AttributeError.
+            self.vision_config = vision_config
+        # Initialize text config
+        if isinstance(text_config, dict):
+            self.text_config = self.sub_configs["text_config"](**text_config)
+        elif text_config is None:
+            # For BC use all kwargs to init `TextConfig`
+            self.text_config = self.sub_configs["text_config"](**kwargs)
+        else:
+            # Already-instantiated config object (see the vision branch above).
+            self.text_config = text_config
+        # PRTS-specific parameters
+        self.max_action_dim = max_action_dim
+        self.action_chunk_size = action_chunk_size
+        self.num_denoise_steps = num_denoise_steps
+        self.flow_matching_action_loss_weight = flow_matching_action_loss_weight
+        self.use_fast_action_tokenizer = use_fast_action_tokenizer
+        self.embodiment_tag = embodiment_tag
+        # DiT action head config (nested dict)
+        # cross_attention_dim defaults to text_config.hidden_size at model init time
+        _default_dit_config = {
+            # Architecture — aligned with GR00T N1.6 (32 layers, inner_dim=32×48=1536)
+            "num_layers": 16,   # 32
+            "num_attention_heads": 32,
+            "attention_head_dim": 48,
+            "output_dim": 1024,
+            # Regularisation
+            "dropout": 0.2,
+            "interleave_self_attention": True,
+            "norm_type": "ada_norm",
+            "final_dropout": True,
+            # Action-head specifics
+            "add_pos_embed": True,
+            # Noise schedule
+            "noise_beta_alpha": 1.5,
+            "noise_beta_beta": 1.0,
+            "noise_s": 0.999,
+            "num_timestep_buckets": 1000,
+            # Attention backend
+            "attn_implementation": "sdpa",
+            # AlternateVLDiT — separate visual / text token cross-attention
+            "use_alternate_vl_dit": True,
+            "attend_text_every_n_blocks": 2,
+            # MoT-style action expert: forwards full VLM ``past_key_values`` into the head;
+            # expert depth defaults to text_config.num_hidden_layers (override with expert_num_layers).
+            "use_mot_action_expert": False,
+            "mlp_mult": 4,  # FFN hidden dim = inner_dim * mlp_mult (standard DiT only)
+        }
+        # User-supplied keys override the defaults; the caller's dict is left untouched.
+        if dit_action_head_config is not None:
+            _default_dit_config.update(dit_action_head_config)
+        self.dit_action_head_config = _default_dit_config
+        # CRL (Contrastive Reinforcement Learning) parameters
+        self.crl_loss_weight = crl_loss_weight
+        self.crl_embed_dim = crl_embed_dim
+        self.crl_logsumexp_reg_weight = crl_logsumexp_reg_weight
+        self.crl_encoder_init_w = crl_encoder_init_w
+        self.crl_repr_norm = crl_repr_norm
+        # Token IDs
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        self.vision_start_token_id = vision_start_token_id
+        self.vision_end_token_id = vision_end_token_id
+        # # Propagate token IDs to text config
+        # if self.image_token_id is not None:
+        #     self.text_config.image_token_id = self.image_token_id
+        # if self.video_token_id is not None:
+        #     self.text_config.video_token_id = self.video_token_id
+        # if self.vision_start_token_id is not None:
+        #     self.text_config.vision_start_token_id = self.vision_start_token_id
+        # Ensure vocab sizes are consistent
+        # if hasattr(self.text_config, 'vocab_size'):
+        #     self.vocab_size = self.text_config.vocab_size
+        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
+    # TODO (zy): check whether passing these state/action special tokens through the
+    # VL config itself would be more appropriate and more flexible.
+    @property
+    def action_token_id(self):
+        """Get action token id from text config (``None`` when the text config has none)."""
+        return getattr(self.text_config, 'action_token_id', None)
+    @action_token_id.setter
+    def action_token_id(self, value):
+        """Set action token id in text config.
+        The assignment is silently skipped when the text config does not already
+        expose an ``action_token_id`` attribute.
+        """
+        if hasattr(self.text_config, 'action_token_id'):
+            self.text_config.action_token_id = value
+    def __getattribute__(self, key):
+        """Serve attributes stored on the text config before this config's own.
+        Once ``text_config`` exists in the instance ``__dict__``, any ``key`` found in
+        ``text_config.__dict__`` is returned from the text config. ``dtype`` and
+        ``_attn_implementation_internal`` are excluded and always resolved on this object.
+        """
+        if "text_config" in super().__getattribute__("__dict__") and key not in [
+            "dtype",
+            "_attn_implementation_internal",
+        ]:
+            text_config = super().__getattribute__("text_config")
+            if key in text_config.__dict__:
+                return getattr(text_config, key)
+        return super().__getattribute__(key)
+# Register the composite config with the transformers auto-class machinery
+# (presumably so the custom class is resolvable via AutoConfig when shipped as custom code — confirm).
+PRTS_FlowMatchingConfig_Qwen3VL.register_for_auto_class()
+# Public names exported by this module.
+__all__ = ["PRTS_FlowMatchingConfig_Qwen3VL", "PRTS_Qwen3VLTextConfig"]

dit_action_head.py ADDED Viewed

	@@ -0,0 +1,1230 @@

+"""
+DiT (Diffusion Transformer) based flow matching action head for PRTS.
+Replaces the Qwen3VLTextModel-based fm_action_expert with a lightweight DiT
+that uses explicit cross-attention to VLM hidden states, following the architecture
+from GR00T / pi05.
+Architecture:
+    ActionEncoder(noisy_actions + dof_mask, timestep)
+    → action_features
+    → DiT(cross-attn to VLM hidden states, ada-norm timestep conditioning)
+    → ActionDecoder → predicted velocity
+"""
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.distributions import Beta
+from typing import Optional
+from transformers.cache_utils import Cache
+from transformers.modeling_flash_attention_utils import _flash_attention_forward
+# DIT_PRESETS = {
+#     "DiT-B": {"num_attention_heads": 12, "attention_head_dim": 64, "output_dim": 768},
+#     "DiT-L": {"num_attention_heads": 32, "attention_head_dim": 48, "output_dim": 1536},
+# }
+class SinusoidalPositionalEncoding(nn.Module):
+    """Parameter-free sin/cos encoding of positions or timesteps.
+    The input is cast to float and a trailing feature axis is appended: an input
+    of shape ``(...)`` produces ``(..., 2 * (embedding_dim // 2))`` with all sine
+    components first, followed by all cosine components.
+    """
+    def __init__(self, embedding_dim: int):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+    def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
+        positions = timesteps.float()
+        n_freqs = self.embedding_dim // 2
+        # Geometric frequency ladder: exp(-i * ln(10000) / n_freqs) for i in [0, n_freqs).
+        log_freqs = -torch.arange(n_freqs, dtype=torch.float, device=positions.device) * (
+            math.log(10000.0) / n_freqs
+        )
+        # Broadcasting over a new last axis covers 1-D and N-D inputs alike.
+        angles = positions.unsqueeze(-1) * log_freqs.exp()
+        return torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)
+class TimestepEncoder(nn.Module):
+    """Embeds timesteps: 256-wide sinusoidal features → Linear → SiLU → Linear.
+    The sinusoidal features are cast to the dtype of the first linear layer's
+    weights before projection, so the output follows the module's parameter dtype.
+    """
+    def __init__(self, embedding_dim: int):
+        super().__init__()
+        sinusoid_width = 256
+        self.sinusoidal = SinusoidalPositionalEncoding(sinusoid_width)
+        self.linear_1 = nn.Linear(sinusoid_width, embedding_dim)
+        self.act = nn.SiLU()
+        self.linear_2 = nn.Linear(embedding_dim, embedding_dim)
+    def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
+        features = self.sinusoidal(timesteps).to(dtype=self.linear_1.weight.dtype)
+        return self.linear_2(self.act(self.linear_1(features)))
+class AdaLayerNorm(nn.Module):
+    """LayerNorm whose affine modulation is predicted from a conditioning embedding.
+    ``temb`` goes through SiLU and a linear layer that emits ``2 * embedding_dim``
+    values, split into (scale, shift) in that order.  The result is
+    ``norm(x) * (1 + scale) + shift`` where the normalisation itself has no
+    learnable affine parameters; scale/shift get a new axis at position 1 so they
+    broadcast across that axis of ``x``.
+    """
+    def __init__(self, embedding_dim: int, eps: float = 1e-5):
+        super().__init__()
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
+        self.norm = nn.LayerNorm(embedding_dim, eps=eps, elementwise_affine=False)
+    def forward(self, x: torch.Tensor, temb: torch.Tensor) -> torch.Tensor:
+        modulation = self.linear(self.silu(temb))
+        scale, shift = modulation.chunk(2, dim=-1)
+        normed = self.norm(x)
+        return normed * (1 + scale[:, None]) + shift[:, None]
+class DiTAttention(nn.Module):
+    """Multi-head attention supporting both self-attention and cross-attention.
+    Queries are projected from ``hidden_states``; keys/values are projected from
+    ``encoder_hidden_states`` when given (cross-attention) and from
+    ``hidden_states`` otherwise (self-attention).
+    Supports two backends selected via ``attn_implementation``:
+    * ``"flash_attention_2"`` – routed through HuggingFace's
+      ``_flash_attention_forward`` helper with ``is_causal=False``; the mask is
+      forwarded to that helper unchanged.
+    * any other value (default ``"sdpa"``) – uses
+      :func:`F.scaled_dot_product_attention`.  A 2-D ``(B, S)`` key-padding mask
+      is expanded to ``(B, 1, 1, S)``; masks of other ranks are passed through.
+      Non-floating masks are cast to bool (True = attend).  Query rows whose
+      bool mask allows no key at all produce an all-zero attention output, so
+      the result never depends on how the installed PyTorch build treats
+      fully-masked rows (some versions/backends return NaN).
+    Attention-probability dropout is not applied (``dropout_p=0.0``); the
+    ``dropout`` argument only affects the output projection.
+    """
+    def __init__(
+        self,
+        query_dim: int,
+        num_heads: int,
+        head_dim: int,
+        cross_attention_dim: Optional[int] = None,
+        dropout: float = 0.0,
+        bias: bool = True,
+        attn_implementation: str = "sdpa",
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.attn_implementation = attn_implementation
+        inner_dim = num_heads * head_dim
+        self.to_q = nn.Linear(query_dim, inner_dim, bias=bias)
+        # Keys/values read the encoder width for cross-attention, the query width otherwise.
+        kv_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
+        self.to_k = nn.Linear(kv_dim, inner_dim, bias=bias)
+        self.to_v = nn.Linear(kv_dim, inner_dim, bias=bias)
+        self.to_out = nn.Sequential(
+            nn.Linear(inner_dim, query_dim, bias=bias),
+            nn.Dropout(dropout),
+        )
+    # ------------------------------------------------------------------
+    # Flash-Attention backend
+    # ------------------------------------------------------------------
+    def _flash_attn_forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        attention_mask: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        """Run Flash Attention via HuggingFace's ``_flash_attention_forward``.
+        Args:
+            q: ``(B, T_q, H, D)``
+            k: ``(B, T_k, H, D)``
+            v: ``(B, T_k, H, D)``
+            attention_mask: ``(B, T_k)`` bool, True = valid token.
+        Returns:
+            ``(B, T_q, H*D)``
+        """
+        B, T_q, H, D = q.shape
+        # NOTE(review): how the helper unpads queries when T_q != T_k and a mask is
+        # given depends on the installed transformers version — verify cross-attention
+        # with a padding mask before relying on this backend.
+        out = _flash_attention_forward(
+            q, k, v,
+            attention_mask=attention_mask,
+            query_length=T_q,
+            is_causal=False,
+            dropout=0.0,
+        )
+        return out.reshape(B, T_q, H * D)
+    # ------------------------------------------------------------------
+    # Forward
+    # ------------------------------------------------------------------
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Attend from ``hidden_states`` to itself or to ``encoder_hidden_states``.
+        Args:
+            hidden_states: ``(B, T, query_dim)`` query source.
+            encoder_hidden_states: optional ``(B, S, kv_dim)`` key/value source;
+                ``None`` selects self-attention.
+            attention_mask: optional mask over keys.  2-D ``(B, S)`` masks mark
+                valid keys with True / non-zero; other ranks are handed to the
+                backend as given.
+        Returns:
+            ``(B, T, query_dim)`` tensor after the output projection.
+        """
+        B, T, _ = hidden_states.shape
+        q = self.to_q(hidden_states)
+        kv_input = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
+        k = self.to_k(kv_input)
+        v = self.to_v(kv_input)
+        if self.attn_implementation == "flash_attention_2":
+            # Flash Attention expects (B, S, H, D)
+            q = q.view(B, T, self.num_heads, self.head_dim)
+            k = k.view(B, -1, self.num_heads, self.head_dim)
+            v = v.view(B, -1, self.num_heads, self.head_dim)
+            attn_output = self._flash_attn_forward(q, k, v, attention_mask)
+        else:
+            # SDPA expects (B, H, S, D)
+            q = q.view(B, T, self.num_heads, self.head_dim).transpose(1, 2)
+            k = k.view(B, -1, self.num_heads, self.head_dim).transpose(1, 2)
+            v = v.view(B, -1, self.num_heads, self.head_dim).transpose(1, 2)
+            sdpa_mask = None
+            has_valid_key = None
+            if attention_mask is not None:
+                # Expand (B, S) key mask → (B, 1, 1, S) for broadcasting.
+                if attention_mask.dim() == 2:
+                    sdpa_mask = attention_mask[:, None, None, :]
+                else:
+                    sdpa_mask = attention_mask
+                if not sdpa_mask.is_floating_point():
+                    # SDPA only accepts bool or float masks; 0/1 integer masks are cast.
+                    sdpa_mask = sdpa_mask.bool()
+                    if sdpa_mask.dim() == 4:
+                        # Rows with no visible key would make softmax undefined.  Let such
+                        # rows see every key for the kernel call, then zero their output.
+                        has_valid_key = sdpa_mask.any(dim=-1, keepdim=True)
+                        sdpa_mask = sdpa_mask | ~has_valid_key
+            attn_output = F.scaled_dot_product_attention(
+                q, k, v, attn_mask=sdpa_mask, dropout_p=0.0
+            )
+            if has_valid_key is not None:
+                attn_output = attn_output * has_valid_key.to(attn_output.dtype)
+            attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, -1)
+        return self.to_out(attn_output)
+class FeedForward(nn.Module):
+    """Position-wise MLP: Linear → tanh-approximated GELU → Dropout → Linear → Dropout.
+    The hidden width is ``dim * mult``; input and output widths are both ``dim``.
+    """
+    def __init__(self, dim: int, dropout: float = 0.0, mult: int = 4):
+        super().__init__()
+        hidden_width = dim * mult
+        layers = [
+            nn.Linear(dim, hidden_width),
+            nn.GELU(approximate="tanh"),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_width, dim),
+            nn.Dropout(dropout),
+        ]
+        self.net = nn.Sequential(*layers)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        out = self.net(x)
+        return out
+class BasicTransformerBlock(nn.Module):
+    """Pre-norm transformer block: attention sub-layer followed by a feed-forward sub-layer.
+    The attention sub-layer is cross-attention when ``cross_attention_dim`` is
+    given (keys/values come from ``encoder_hidden_states``) and self-attention
+    otherwise.  With ``norm_type == "ada_norm"`` the first norm is an
+    :class:`AdaLayerNorm` conditioned on ``temb``; any other value selects a plain
+    ``nn.LayerNorm``.  Both sub-layers are wrapped in residual connections.
+    """
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout: float = 0.0,
+        cross_attention_dim: Optional[int] = None,
+        norm_type: str = "ada_norm",
+        final_dropout: bool = False,
+        attn_implementation: str = "sdpa",
+    ):
+        super().__init__()
+        self.norm_type = norm_type
+        self.norm1 = AdaLayerNorm(dim) if norm_type == "ada_norm" else nn.LayerNorm(dim)
+        attn_kwargs = dict(
+            query_dim=dim,
+            num_heads=num_attention_heads,
+            head_dim=attention_head_dim,
+            cross_attention_dim=cross_attention_dim,
+            dropout=dropout,
+            attn_implementation=attn_implementation,
+        )
+        self.attn1 = DiTAttention(**attn_kwargs)
+        self.norm3 = nn.LayerNorm(dim)
+        self.ff = FeedForward(dim, dropout=dropout)
+        # Optional extra dropout on the attention output (applied before the residual add).
+        if final_dropout:
+            self.final_dropout = nn.Dropout(dropout)
+        else:
+            self.final_dropout = None
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        temb: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        # --- attention sub-layer ---
+        conditioned = self.norm_type == "ada_norm"
+        normed = self.norm1(hidden_states, temb) if conditioned else self.norm1(hidden_states)
+        attended = self.attn1(
+            normed,
+            encoder_hidden_states=encoder_hidden_states,
+            attention_mask=encoder_attention_mask,
+        )
+        if self.final_dropout is not None:
+            attended = self.final_dropout(attended)
+        hidden_states = attended + hidden_states
+        # --- feed-forward sub-layer ---
+        return self.ff(self.norm3(hidden_states)) + hidden_states
+class DiT(nn.Module):
+    """Diffusion Transformer that conditions on encoder (VLM) features.
+    Every block receives the timestep embedding ``temb``.  With
+    ``interleave_self_attention=True`` odd-indexed blocks are self-attention
+    blocks and even-indexed blocks cross-attend to ``encoder_hidden_states``;
+    otherwise every block cross-attends.  The output stage applies a
+    timestep-conditioned shift/scale on a non-affine LayerNorm and then projects
+    from ``inner_dim`` (= heads × head_dim) to ``output_dim``.
+    """
+    def __init__(
+        self,
+        num_attention_heads: int = 12,
+        attention_head_dim: int = 64,
+        output_dim: int = 768,
+        num_layers: int = 12,
+        dropout: float = 0.1,
+        norm_type: str = "ada_norm",
+        final_dropout: bool = True,
+        interleave_self_attention: bool = False,
+        cross_attention_dim: Optional[int] = None,
+        attn_implementation: str = "sdpa",
+    ):
+        super().__init__()
+        self.inner_dim = num_attention_heads * attention_head_dim
+        self.output_dim = output_dim
+        self.num_layers = num_layers
+        self.interleave_self_attention = interleave_self_attention
+        self.timestep_encoder = TimestepEncoder(self.inner_dim)
+        def _make_block(layer_idx: int) -> BasicTransformerBlock:
+            # Self-attention blocks get no cross-attention width.
+            is_self_attn = interleave_self_attention and layer_idx % 2 == 1
+            return BasicTransformerBlock(
+                dim=self.inner_dim,
+                num_attention_heads=num_attention_heads,
+                attention_head_dim=attention_head_dim,
+                dropout=dropout,
+                cross_attention_dim=None if is_self_attn else cross_attention_dim,
+                norm_type=norm_type,
+                final_dropout=final_dropout,
+                attn_implementation=attn_implementation,
+            )
+        self.transformer_blocks = nn.ModuleList(
+            [_make_block(layer_idx) for layer_idx in range(num_layers)]
+        )
+        self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
+        self.proj_out_1 = nn.Linear(self.inner_dim, 2 * self.inner_dim)
+        self.proj_out_2 = nn.Linear(self.inner_dim, output_dim)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        timestep: torch.LongTensor,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        temb = self.timestep_encoder(timestep)
+        hidden_states = hidden_states.contiguous()
+        encoder_hidden_states = encoder_hidden_states.contiguous()
+        for layer_idx, block in enumerate(self.transformer_blocks):
+            is_self_attn = self.interleave_self_attention and layer_idx % 2 == 1
+            # Self-attention blocks see neither the encoder states nor their mask.
+            hidden_states = block(
+                hidden_states,
+                encoder_hidden_states=None if is_self_attn else encoder_hidden_states,
+                encoder_attention_mask=None if is_self_attn else encoder_attention_mask,
+                temb=temb,
+            )
+        # Output stage: the projection emits (shift, scale) in that order.
+        modulation = self.proj_out_1(F.silu(temb))
+        shift, scale = modulation.chunk(2, dim=-1)
+        hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+        return self.proj_out_2(hidden_states)
+class AlternateVLDiT(DiT):
+    """DiT whose cross-attention blocks alternate between text and visual tokens.
+    Odd-indexed blocks are self-attention blocks (``interleave_self_attention``
+    must be True).  Even-indexed blocks cross-attend to the encoder states with a
+    mask restricted to text tokens when ``idx`` is a multiple of
+    ``2 * attend_text_every_n_blocks`` and to visual tokens otherwise.
+    When ``image_mask`` is None or contains no True entry, the text mask equals
+    the full valid-token mask and the visual mask is all-False.
+    """
+    def __init__(self, *args, attend_text_every_n_blocks: int = 2, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.interleave_self_attention, (
+            "AlternateVLDiT requires interleave_self_attention=True"
+        )
+        self.attend_text_every_n_blocks = attend_text_every_n_blocks
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        timestep: torch.LongTensor,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        image_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            encoder_attention_mask: (B, S) bool – True = valid VLM token.
+                None means every encoder position is valid.
+            image_mask: (B, S) bool – True = visual token position.
+                If None, all valid tokens are treated as text.
+        """
+        temb = self.timestep_encoder(timestep)
+        hidden_states = hidden_states.contiguous()
+        encoder_hidden_states = encoder_hidden_states.contiguous()
+        B, S, _ = encoder_hidden_states.shape
+        if encoder_attention_mask is None:
+            valid_mask = torch.ones(B, S, dtype=torch.bool, device=hidden_states.device)
+        else:
+            valid_mask = encoder_attention_mask.bool()
+        # Split the valid encoder positions into visual and text subsets.
+        if image_mask is not None and image_mask.any():
+            is_image = image_mask.bool()
+            vis_mask = is_image & valid_mask
+            text_mask = (~is_image) & valid_mask
+        else:
+            vis_mask = torch.zeros_like(valid_mask)
+            text_mask = valid_mask
+        for idx, block in enumerate(self.transformer_blocks):
+            if idx % 2 == 1:
+                # Self-attention block: no encoder context.
+                context, context_mask = None, None
+            else:
+                # Cross-attention block: text on every N-th cross block, visual otherwise.
+                context = encoder_hidden_states
+                attends_text = idx % (2 * self.attend_text_every_n_blocks) == 0
+                context_mask = text_mask if attends_text else vis_mask
+            hidden_states = block(
+                hidden_states,
+                encoder_hidden_states=context,
+                encoder_attention_mask=context_mask,
+                temb=temb,
+            )
+        # Output stage: the projection emits (shift, scale) in that order.
+        modulation = self.proj_out_1(F.silu(temb))
+        shift, scale = modulation.chunk(2, dim=-1)
+        hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+        return self.proj_out_2(hidden_states)
+class ActionEncoder(nn.Module):
+    """Fuses per-step action inputs with a sinusoidal timestep encoding.
+    Pipeline: ``layer1`` projects the actions to ``hidden_size``; the result is
+    concatenated with the sinusoidal encoding of the (per-step broadcast)
+    timestep; ``layer2`` maps the ``2 * hidden_size`` concat back to
+    ``hidden_size`` followed by SiLU; ``layer3`` is a final linear projection.
+    """
+    def __init__(self, action_input_dim: int, hidden_size: int):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.layer1 = nn.Linear(action_input_dim, hidden_size)
+        self.layer2 = nn.Linear(2 * hidden_size, hidden_size)
+        self.layer3 = nn.Linear(hidden_size, hidden_size)
+        self.pos_encoding = SinusoidalPositionalEncoding(hidden_size)
+    def forward(self, actions: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            actions: (B, T, action_input_dim) noisy actions (+ DOF mask)
+            timesteps: (B,) discretized timesteps
+        Returns:
+            (B, T, hidden_size) action features.
+        """
+        _, horizon, _ = actions.shape
+        # Repeat each sample's timestep for every step of its action chunk.
+        per_step_t = timesteps.unsqueeze(1).expand(-1, horizon)
+        action_emb = self.layer1(actions)
+        time_emb = self.pos_encoding(per_step_t).to(dtype=action_emb.dtype)
+        fused = torch.cat([action_emb, time_emb], dim=-1)
+        return self.layer3(F.silu(self.layer2(fused)))
+class ActionDecoder(nn.Module):
+    """Two-layer MLP (Linear → ReLU → Linear) mapping ``input_dim`` features to ``output_dim``."""
+    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
+        super().__init__()
+        self.layer1 = nn.Linear(input_dim, hidden_dim)
+        self.layer2 = nn.Linear(hidden_dim, output_dim)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        hidden = F.relu(self.layer1(x))
+        return self.layer2(hidden)
+class FlowMatchingDiTHead(nn.Module):
+    """Flow matching action head using DiT (Diffusion Transformer).
+    Replaces the fm_action_expert (Qwen3VLTextModel-based) with a DiT that uses
+    explicit cross-attention to VLM hidden states instead of KV cache continuation.
+    Training:
+        1. Sample noise and timestep from Beta distribution
+        2. Compute noisy trajectory: x_t = (1-t)*noise + t*actions
+        3. Compute velocity target: v = actions - noise
+        4. Encode noisy actions + DOF mask + timestep → action features
+        5. Prepend learned future query tokens
+        6. Run DiT with cross-attention to VLM hidden states
+        7. Decode to action-space velocity prediction
+    Inference:
+        Euler integration from pure noise (t=0) to clean actions (t=1)
+        over num_inference_timesteps steps.
+    """
+    def __init__(
+        self,
+        action_dim: int,
+        action_chunk_size: int,
+        cross_attention_dim: int,
+        num_inference_timesteps: int = 4,
+        config: Optional[dict] = None,
+    ):
+        """Build the DiT backbone, the action encoder/decoder and the optional position table.
+        Args:
+            action_dim: width of one action vector; the encoder input is twice this
+                (noisy action concatenated with the DOF mask).
+            action_chunk_size: number of action steps per chunk; also a lower bound
+                for the position-embedding table size.
+            cross_attention_dim: feature width of the encoder states the DiT attends to.
+            num_inference_timesteps: stored for use at inference time.
+            config: optional overrides merged over the built-in defaults below.
+        """
+        super().__init__()
+        # Built-in hyper-parameters; any key present in ``config`` takes precedence.
+        settings = dict(
+            num_layers=16,
+            num_attention_heads=12,
+            attention_head_dim=64,
+            output_dim=1024,
+            dropout=0.2,
+            interleave_self_attention=True,
+            norm_type="ada_norm",
+            final_dropout=True,
+            add_pos_embed=True,
+            noise_beta_alpha=1.5,
+            noise_beta_beta=1.0,
+            noise_s=0.999,
+            num_timestep_buckets=1000,
+            attn_implementation="sdpa",
+            use_alternate_vl_dit=False,
+            attend_text_every_n_blocks=2,
+        )
+        if config is not None:
+            settings.update(config)
+        self.action_dim = action_dim
+        self.action_chunk_size = action_chunk_size
+        self.num_inference_timesteps = num_inference_timesteps
+        self.num_timestep_buckets = settings["num_timestep_buckets"]
+        self.noise_s = settings["noise_s"]
+        self.use_alternate_vl_dit = settings["use_alternate_vl_dit"]
+        self.add_pos_embed = settings["add_pos_embed"]
+        n_heads = settings["num_attention_heads"]
+        head_width = settings["attention_head_dim"]
+        dit_out_dim = settings["output_dim"]
+        inner_dim = n_heads * head_width
+        # Pick the backbone class; only the alternate variant takes the text-period argument.
+        if self.use_alternate_vl_dit:
+            backbone_cls = AlternateVLDiT
+            variant_kwargs = {"attend_text_every_n_blocks": settings["attend_text_every_n_blocks"]}
+        else:
+            backbone_cls = DiT
+            variant_kwargs = {}
+        self.dit = backbone_cls(
+            num_attention_heads=n_heads,
+            attention_head_dim=head_width,
+            output_dim=dit_out_dim,
+            num_layers=settings["num_layers"],
+            dropout=settings["dropout"],
+            norm_type=settings["norm_type"],
+            final_dropout=settings["final_dropout"],
+            interleave_self_attention=settings["interleave_self_attention"],
+            cross_attention_dim=cross_attention_dim,
+            attn_implementation=settings["attn_implementation"],
+            **variant_kwargs,
+        )
+        # Encoder input = noisy action concatenated with its DOF mask, hence 2 * action_dim.
+        self.action_encoder = ActionEncoder(action_dim * 2, inner_dim)
+        self.action_decoder = ActionDecoder(dit_out_dim, inner_dim, action_dim)
+        if self.add_pos_embed:
+            # Table holds at least 256 positions, more when the chunk is longer.
+            table_size = max(action_chunk_size, 256)
+            self.position_embedding = nn.Embedding(table_size, inner_dim)
+            nn.init.normal_(self.position_embedding.weight, mean=0.0, std=0.02)
+        # Only the Beta parameters are stored; no distribution object is kept on the module.
+        self._beta_alpha = settings["noise_beta_alpha"]
+        self._beta_beta = settings["noise_beta_beta"]
+    def reset_parameters(self):
+        """Restore the head's default parameter initialisation.
+        HuggingFace from_pretrained calls _init_weights on modules whose
+        parameters are absent from the checkpoint, overwriting any custom
+        init done in __init__.  Call this after from_pretrained when loading
+        from a base VLM checkpoint that does not contain DiT weights.
+        Effects: the position table (if present) is redrawn from N(0, 0.02);
+        every ``nn.Linear`` gets PyTorch's stock Linear initialisation; every
+        affine ``nn.LayerNorm`` is reset to weight 1 / bias 0.
+        """
+        if self.add_pos_embed:
+            nn.init.normal_(self.position_embedding.weight, mean=0.0, std=0.02)
+        for submodule in self.modules():
+            if isinstance(submodule, nn.Linear):
+                # nn.Linear.reset_parameters is the stock scheme: kaiming-uniform
+                # weights (a=sqrt(5)) and bias uniform in ±1/sqrt(fan_in).
+                submodule.reset_parameters()
+                continue
+            if isinstance(submodule, nn.LayerNorm) and submodule.elementwise_affine:
+                nn.init.ones_(submodule.weight)
+                nn.init.zeros_(submodule.bias)
+    def sample_time(self, batch_size: int, device, dtype) -> torch.Tensor:
+        beta_dist = Beta(self._beta_alpha, self._beta_beta)
+        sample = beta_dist.sample([batch_size]).to(device, dtype=dtype).clamp(max=self.noise_s)
+        return (self.noise_s - sample) / self.noise_s
+    def _encode_actions(
+        self,
+        noisy_actions: torch.Tensor,
+        t_discretized: torch.Tensor,
+        action_dof_mask: Optional[torch.Tensor],
+        device,
+    ) -> torch.Tensor:
+        """Encode noisy actions with DOF mask and timestep, add position embeddings."""
+        if action_dof_mask is not None:
+            encoder_input = torch.cat(
+                [noisy_actions, action_dof_mask.to(noisy_actions.dtype)], dim=-1
+            )
+        else:
+            encoder_input = torch.cat(
+                [noisy_actions, torch.ones_like(noisy_actions)], dim=-1
+            )
+        action_features = self.action_encoder(encoder_input, t_discretized)
+        if self.add_pos_embed:
+            pos_ids = torch.arange(action_features.shape[1], dtype=torch.long, device=device)
+            pos_embs = self.position_embedding(pos_ids).unsqueeze(0)
+            action_features = action_features + pos_embs
+        return action_features
+    def _dit_forward(
+        self,
+        sa_embs: torch.Tensor,
+        vl_embs: torch.Tensor,
+        t_discretized: torch.LongTensor,
+        encoder_attention_mask: Optional[torch.Tensor],
+        image_mask: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        if self.use_alternate_vl_dit:
+            return self.dit(
+                hidden_states=sa_embs,
+                encoder_hidden_states=vl_embs,
+                timestep=t_discretized,
+                encoder_attention_mask=encoder_attention_mask,
+                image_mask=image_mask,
+            )
+        return self.dit(
+            hidden_states=sa_embs,
+            encoder_hidden_states=vl_embs,
+            timestep=t_discretized,
+            encoder_attention_mask=encoder_attention_mask,
+        )
+    def forward(
+        self,
+        vl_embs: torch.Tensor,
+        actions: torch.Tensor,
+        action_dof_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        image_mask: Optional[torch.Tensor] = None,
+    ) -> tuple:
+        """Training forward pass.
+        Args:
+            vl_embs: (B, S, D) VLM hidden states for cross-attention
+            actions: (B, T, action_dim) ground truth action trajectories
+            action_dof_mask: (B, T, action_dim) DOF validity mask
+            encoder_attention_mask: (B, S) bool – True = valid VLM token
+            image_mask: (B, S) bool – True = visual token (used by AlternateVLDiT)
+        Returns:
+            (pred_v, velocity): predicted velocity and target velocity, both (B, T, action_dim)
+        """
+        device = vl_embs.device
+        B = actions.shape[0]
+        noise = torch.randn(actions.shape, device=device, dtype=actions.dtype)
+        t = self.sample_time(B, device=device, dtype=actions.dtype)
+        t_expanded = t[:, None, None]
+        noisy_trajectory = (1 - t_expanded) * noise + t_expanded * actions
+        velocity = actions - noise
+        t_discretized = (t * self.num_timestep_buckets).long()
+        action_features = self._encode_actions(noisy_trajectory, t_discretized, action_dof_mask, device)
+        model_output = self._dit_forward(
+            action_features, vl_embs, t_discretized, encoder_attention_mask, image_mask
+        )
+        pred = self.action_decoder(model_output)
+        pred_v = pred[:, :actions.shape[1]]
+        return pred_v, velocity
+    @torch.no_grad()
+    def predict_action(
+        self,
+        vl_embs: torch.Tensor,
+        action_dof_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        image_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Inference: denoise actions from noise using Euler integration.
+        Args:
+            vl_embs: (B, S, D) VLM hidden states
+            action_dof_mask: optional (B, T, action_dim) or (1, T, action_dim) DOF mask
+            encoder_attention_mask: (B, S) bool – True = valid VLM token
+            image_mask: (B, S) bool – True = visual token (used by AlternateVLDiT)
+        Returns:
+            (B, T, action_dim) denoised action trajectories
+        """
+        B = vl_embs.shape[0]
+        device = vl_embs.device
+        dtype = vl_embs.dtype
+        actions = torch.randn(
+            (B, self.action_chunk_size, self.action_dim),
+            device=device, dtype=dtype,
+        )
+        dt = 1.0 / self.num_inference_timesteps
+        for step in range(self.num_inference_timesteps):
+            t_cont = step / float(self.num_inference_timesteps)
+            t_discretized_val = int(t_cont * self.num_timestep_buckets)
+            timesteps_tensor = torch.full((B,), t_discretized_val, device=device, dtype=torch.long)
+            action_features = self._encode_actions(actions, timesteps_tensor, action_dof_mask, device)
+            model_output = self._dit_forward(
+                action_features, vl_embs, timesteps_tensor, encoder_attention_mask, image_mask
+            )
+            pred = self.action_decoder(model_output)
+            pred_velocity = pred[:, :self.action_chunk_size]
+            actions = actions + dt * pred_velocity
+        return actions
+# ============================================================================
+# Pi0.5-style KV-cache action expert (VLM K/V concat + GQA + SwiGLU FFN)
+# ============================================================================
+class AdaRMSNorm(nn.Module):
+    """Adaptive RMS normalization: (scale, shift, gate) from cond; zero-init."""
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.eps = eps
+        self.modulation = nn.Linear(dim, dim * 3)
+        nn.init.zeros_(self.modulation.weight)
+        nn.init.zeros_(self.modulation.bias)
+    def forward(self, x: torch.Tensor, cond: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        var = x.float().pow(2).mean(-1, keepdim=True)
+        normed = (x * torch.rsqrt(var + self.eps)).to(x.dtype)
+        scale, shift, gate = self.modulation(cond).chunk(3, dim=-1)
+        normed = normed * (1 + scale[:, None]) + shift[:, None]
+        return normed, gate[:, None]
+class SwiGLUFeedForward(nn.Module):
+    """SiLU(gate_proj(x)) * up_proj(x) → down_proj."""
+    def __init__(self, dim: int, hidden_dim: int, dropout: float = 0.0, bias: bool = True):
+        super().__init__()
+        self.gate_proj = nn.Linear(dim, hidden_dim, bias=bias)
+        self.up_proj = nn.Linear(dim, hidden_dim, bias=bias)
+        self.down_proj = nn.Linear(hidden_dim, dim, bias=bias)
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.down_proj(self.dropout(F.silu(self.gate_proj(x)) * self.up_proj(x)))
+class MoTAttention(nn.Module):
+    """Action Q attends to concatenated [VLM KV cache ; action KV]; GQA expand for SDPA."""
+    def __init__(
+        self,
+        hidden_size: int,
+        num_attention_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        dropout: float = 0.0,
+        bias: bool = True,
+    ):
+        super().__init__()
+        if num_attention_heads % num_kv_heads != 0:
+            raise ValueError(
+                f"num_attention_heads ({num_attention_heads}) must be divisible by "
+                f"num_kv_heads ({num_kv_heads})"
+            )
+        self.num_attention_heads = num_attention_heads
+        self.num_kv_heads = num_kv_heads
+        self.head_dim = head_dim
+        q_dim = num_attention_heads * head_dim
+        kv_dim = num_kv_heads * head_dim
+        self.q_proj = nn.Linear(hidden_size, q_dim, bias=bias)
+        self.k_proj = nn.Linear(hidden_size, kv_dim, bias=bias)
+        self.v_proj = nn.Linear(hidden_size, kv_dim, bias=bias)
+        self.o_proj = nn.Linear(q_dim, hidden_size, bias=bias)
+        self.dropout = nn.Dropout(dropout)
+    def forward(
+        self,
+        action_hidden: torch.Tensor,
+        vlm_cached_k: torch.Tensor,
+        vlm_cached_v: torch.Tensor,
+        vlm_attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        B, T_a, _ = action_hidden.shape
+        q = self.q_proj(action_hidden)
+        act_k = self.k_proj(action_hidden)
+        act_v = self.v_proj(action_hidden)
+        q = q.view(B, T_a, self.num_attention_heads, self.head_dim).transpose(1, 2)
+        act_k = act_k.view(B, T_a, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        act_v = act_v.view(B, T_a, self.num_kv_heads, self.head_dim).transpose(1, 2)
+        k = torch.cat([vlm_cached_k, act_k], dim=2)
+        v = torch.cat([vlm_cached_v, act_v], dim=2)
+        repeat_factor = self.num_attention_heads // self.num_kv_heads
+        k = k.repeat_interleave(repeat_factor, dim=1)
+        v = v.repeat_interleave(repeat_factor, dim=1)
+        sdpa_mask = None
+        if vlm_attention_mask is not None:
+            action_mask = vlm_attention_mask.new_ones(B, T_a)
+            combined_mask = torch.cat([vlm_attention_mask, action_mask], dim=1)
+            sdpa_mask = combined_mask[:, None, None, :]
+        attn_out = F.scaled_dot_product_attention(
+            q, k, v, attn_mask=sdpa_mask, dropout_p=0.0,
+        )
+        attn_out = attn_out.transpose(1, 2).contiguous().view(B, T_a, -1)
+        return self.dropout(self.o_proj(attn_out))
+class MoTBlock(nn.Module):
+    """AdaRMSNorm → attention → gated residual → AdaRMSNorm → SwiGLU FFN → gated residual."""
+    def __init__(
+        self,
+        hidden_size: int,
+        num_attention_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        intermediate_size: int,
+        dropout: float = 0.0,
+    ):
+        super().__init__()
+        self.pre_attn_norm = AdaRMSNorm(hidden_size)
+        self.attn = MoTAttention(
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            num_kv_heads=num_kv_heads,
+            head_dim=head_dim,
+            dropout=dropout,
+        )
+        self.pre_ffn_norm = AdaRMSNorm(hidden_size)
+        self.ffn = SwiGLUFeedForward(hidden_size, intermediate_size, dropout=dropout)
+    def forward(
+        self,
+        action_hidden: torch.Tensor,
+        vlm_cached_k: torch.Tensor,
+        vlm_cached_v: torch.Tensor,
+        adarms_cond: torch.Tensor,
+        vlm_attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        normed, gate1 = self.pre_attn_norm(action_hidden, adarms_cond)
+        attn_out = self.attn(normed, vlm_cached_k, vlm_cached_v, vlm_attention_mask)
+        action_hidden = action_hidden + attn_out * gate1
+        normed2, gate2 = self.pre_ffn_norm(action_hidden, adarms_cond)
+        action_hidden = action_hidden + self.ffn(normed2) * gate2
+        return action_hidden
+class MoTDiT(nn.Module):
+    """Stack of ActionBlocks; each block uses one VLM layer's KV pair."""
+    def __init__(
+        self,
+        hidden_size: int,
+        num_attention_heads: int,
+        num_kv_heads: int,
+        head_dim: int,
+        intermediate_size: int,
+        num_layers: int,
+        dropout: float = 0.2,
+    ):
+        super().__init__()
+        self.num_layers = num_layers
+        self.blocks = nn.ModuleList([
+            MoTBlock(
+                hidden_size=hidden_size,
+                num_attention_heads=num_attention_heads,
+                num_kv_heads=num_kv_heads,
+                head_dim=head_dim,
+                intermediate_size=intermediate_size,
+                dropout=dropout,
+            )
+            for _ in range(num_layers)
+        ])
+        self.final_norm = AdaRMSNorm(hidden_size)
+    def forward(
+        self,
+        action_hidden: torch.Tensor,
+        vlm_kv_cache: list,
+        adarms_cond: torch.Tensor,
+        vlm_attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        for idx, block in enumerate(self.blocks):
+            cached_k, cached_v = vlm_kv_cache[idx]
+            action_hidden = block(
+                action_hidden, cached_k, cached_v, adarms_cond, vlm_attention_mask,
+            )
+        action_hidden, _ = self.final_norm(action_hidden, adarms_cond)
+        return action_hidden
+def _kv_pairs_from_past_key_values(past_key_values: Cache) -> list[tuple[torch.Tensor, torch.Tensor]]:
+    """Per-layer (K, V) from a HuggingFace decoder KV cache (order matches transformer layers)."""
+    return [
+        (past_key_values[i][0], past_key_values[i][1])
+        for i in range(len(past_key_values))
+    ]
+class MoTFlowMatchingHead(nn.Module):
+    """Flow matching head: MoT-style action expert over VLM KV cache (concat + GQA)."""
+    def __init__(
+        self,
+        action_dim: int,
+        action_chunk_size: int,
+        vlm_config,
+        num_inference_timesteps: int = 10,
+        config: Optional[dict] = None,
+    ):
+        super().__init__()
+        _vlm_num_q_heads  = 8 # vlm_config.num_attention_heads // 2 # optional: 8
+        _vlm_num_kv_heads = vlm_config.num_key_value_heads   # 8
+        _vlm_head_dim     = getattr(
+            vlm_config, "head_dim", vlm_config.hidden_size // vlm_config.num_attention_heads
+        )  # 128
+        cfg = {
+            "hidden_size": 1024, # vlm_config.hidden_size // 2,
+            # "hidden_size": vlm_config.hidden_size // 2,
+            "intermediate_size": vlm_config.intermediate_size // 4,
+            "expert_num_layers": vlm_config.num_hidden_layers,
+            # Attention dims default to VLM values (required for KV cache compat)
+            "num_attention_heads": _vlm_num_q_heads,
+            "num_kv_heads": _vlm_num_kv_heads,
+            "head_dim": _vlm_head_dim,
+            # Noise schedule
+            "dropout": 0.2,
+            "add_pos_embed": True,
+            "noise_beta_alpha": 1.5,
+            "noise_beta_beta": 1.0,
+            "noise_s": 0.999,
+            "num_timestep_buckets": 1000,
+        }
+        if config is not None:
+            config = cfg.copy()
+        num_attention_heads = cfg["num_attention_heads"]
+        num_kv_heads        = cfg["num_kv_heads"]
+        head_dim            = cfg["head_dim"]
+        hidden_size         = cfg["hidden_size"]
+        intermediate_size   = cfg["intermediate_size"]
+        num_layers          = cfg["expert_num_layers"]
+        self.action_dim = action_dim
+        self.action_chunk_size = action_chunk_size
+        self.num_inference_timesteps = num_inference_timesteps
+        self.num_timestep_buckets = cfg["num_timestep_buckets"]
+        self.noise_s = cfg["noise_s"]
+        self.add_pos_embed = cfg["add_pos_embed"]
+        self.action_in_proj = nn.Linear(action_dim * 2, hidden_size)
+        self.action_out_proj = nn.Linear(hidden_size, action_dim)
+        self.time_sinusoidal = SinusoidalPositionalEncoding(hidden_size)
+        self.time_mlp_1 = nn.Linear(hidden_size, hidden_size)
+        self.time_mlp_2 = nn.Linear(hidden_size, hidden_size)
+        if self.add_pos_embed:
+            max_seq = max(action_chunk_size, 256)
+            self.position_embedding = nn.Embedding(max_seq, hidden_size)
+            nn.init.normal_(self.position_embedding.weight, mean=0.0, std=0.02)
+        self.dit = MoTDiT(
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            num_kv_heads=num_kv_heads,
+            head_dim=head_dim,
+            intermediate_size=intermediate_size,
+            num_layers=num_layers,
+            dropout=cfg["dropout"],
+        )
+        self._beta_alpha = cfg["noise_beta_alpha"]
+        self._beta_beta = cfg["noise_beta_beta"]
+    @property
+    def num_dit_layers(self) -> int:
+        """Number of expert blocks; must match ``len(past_key_values.key_cache)``."""
+        return self.dit.num_layers
+    def _vlm_kv_list_from_past(self, past_key_values: Cache) -> list[tuple[torch.Tensor, torch.Tensor]]:
+        n = len(past_key_values)
+        if n != self.num_dit_layers:
+            raise ValueError(
+                f"MoT expert has {self.num_dit_layers} blocks but `past_key_values` has {n} "
+                "layers. Set `dit_action_head_config['expert_num_layers']` to match "
+                "`text_config.num_hidden_layers`."
+            )
+        return _kv_pairs_from_past_key_values(past_key_values)
+    def reset_parameters(self):
+        """Re-apply proper initialization after from_pretrained."""
+        if self.add_pos_embed:
+            nn.init.normal_(self.position_embedding.weight, mean=0.0, std=0.02)
+        for module in self.modules():
+            if isinstance(module, AdaRMSNorm):
+                nn.init.zeros_(module.modulation.weight)
+                nn.init.zeros_(module.modulation.bias)
+            elif isinstance(module, nn.Linear):
+                nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+                if module.bias is not None:
+                    fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
+                    bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+                    nn.init.uniform_(module.bias, -bound, bound)
+    def _compute_adarms_cond(self, t_discretized: torch.Tensor) -> torch.Tensor:
+        t_emb = self.time_sinusoidal(t_discretized.float())
+        t_emb = t_emb.to(dtype=self.time_mlp_1.weight.dtype)
+        t_emb = F.silu(self.time_mlp_1(t_emb))
+        t_emb = F.silu(self.time_mlp_2(t_emb))
+        return t_emb
+    def sample_time(self, batch_size: int, device, dtype) -> torch.Tensor:
+        beta_dist = Beta(self._beta_alpha, self._beta_beta)
+        sample = beta_dist.sample([batch_size]).to(device, dtype=dtype).clamp(max=self.noise_s)
+        return (self.noise_s - sample) / self.noise_s
+    def _prepare_action_embeds(
+        self,
+        noisy_actions: torch.Tensor,
+        action_dof_mask: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        if action_dof_mask is not None:
+            x = torch.cat(
+                [noisy_actions, action_dof_mask.to(noisy_actions.dtype)], dim=-1,
+            )
+        else:
+            x = torch.cat([noisy_actions, torch.ones_like(noisy_actions)], dim=-1)
+        tokens = self.action_in_proj(x)
+        if self.add_pos_embed:
+            pos_ids = torch.arange(tokens.shape[1], dtype=torch.long, device=noisy_actions.device)
+            tokens = tokens + self.position_embedding(pos_ids).unsqueeze(0)
+        return tokens
+    def forward(
+        self,
+        past_key_values: Cache,
+        actions: torch.Tensor,
+        action_dof_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+    ) -> tuple:
+        """Training: returns (pred_velocity, target_velocity).
+        Args:
+            past_key_values: VLM decoder KV cache; layer count must equal ``num_dit_layers``.
+        """
+        vlm_kv_cache = self._vlm_kv_list_from_past(past_key_values)
+        device = actions.device
+        B = actions.shape[0]
+        noise = torch.randn(actions.shape, device=device, dtype=actions.dtype)
+        t = self.sample_time(B, device=device, dtype=actions.dtype)
+        t_expanded = t[:, None, None]
+        noisy_trajectory = (1 - t_expanded) * noise + t_expanded * actions
+        velocity = actions - noise
+        t_discretized = (t * self.num_timestep_buckets).long()
+        adarms_cond = self._compute_adarms_cond(t_discretized)
+        action_tokens = self._prepare_action_embeds(noisy_trajectory, action_dof_mask)
+        output = self.dit(
+            action_tokens, vlm_kv_cache, adarms_cond, encoder_attention_mask,
+        )
+        pred = self.action_out_proj(output)
+        pred_v = pred[:, :actions.shape[1]]
+        return pred_v, velocity
+    def compute_velocity(
+        self,
+        past_key_values: Cache,
+        actions: torch.Tensor,
+        noise: torch.Tensor,
+        t: torch.Tensor,
+        action_dof_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Compute velocity prediction for pre-sampled noise and timestep.
+        Used by DiffusionNFT where noise and timestep must be shared between
+        the current policy (v_θ) and the reference policy (v_old).
+        Args:
+            past_key_values: VLM decoder KV cache
+            actions: (B, T, action_dim) ground truth actions (x_0)
+            noise: (B, T, action_dim) pre-sampled noise (ε)
+            t: (B,) continuous timesteps in [0, 1)
+            action_dof_mask, encoder_attention_mask,
+        Returns:
+            pred_v: (B, T, action_dim) predicted velocity
+        """
+        vlm_kv_cache = self._vlm_kv_list_from_past(past_key_values)
+        device = actions.device
+        t_expanded = t[:, None, None]
+        noisy_trajectory = (1 - t_expanded) * noise + t_expanded * actions
+        t_discretized = (t * self.num_timestep_buckets).long()
+        adarms_cond = self._compute_adarms_cond(t_discretized)
+        action_tokens = self._prepare_action_embeds(noisy_trajectory, action_dof_mask)
+        output = self.dit(
+            action_tokens, vlm_kv_cache, adarms_cond, encoder_attention_mask,
+        )
+        pred = self.action_out_proj(output)
+        return pred[:, :actions.shape[1]]
+    @torch.no_grad()
+    def predict_action(
+        self,
+        past_key_values: Cache,
+        action_dof_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Inference: Euler integration, returns (B, chunk_size, action_dim)."""
+        k0 = past_key_values[0][0]
+        B = k0.shape[0]
+        device = k0.device
+        dtype = k0.dtype
+        vlm_kv_cache = self._vlm_kv_list_from_past(past_key_values)
+        actions = torch.randn(
+            (B, self.action_chunk_size, self.action_dim),
+            device=device, dtype=dtype,
+        )
+        dt = 1.0 / self.num_inference_timesteps
+        for step in range(self.num_inference_timesteps):
+            t_cont = step / float(self.num_inference_timesteps)
+            t_disc_val = int(t_cont * self.num_timestep_buckets)
+            t_tensor = torch.full((B,), t_disc_val, device=device, dtype=torch.long)
+            adarms_cond = self._compute_adarms_cond(t_tensor)
+            action_tokens = self._prepare_action_embeds(actions, action_dof_mask)
+            output = self.dit(
+                action_tokens, vlm_kv_cache, adarms_cond, encoder_attention_mask,
+            )
+            pred_velocity = self.action_out_proj(output)[:, :self.action_chunk_size]
+            actions = actions + dt * pred_velocity
+        return actions

generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.57.3"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

model-00001-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:543d8bd40ff00d487b24b85c724587b96839434f61bc51d438319042e0cf0fcb
+size 4999639200

model-00002-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:303b0fe49a58b0397c013f89c7047483c0910b77b8dc665ccb6f0e410cdd848c
+size 4995750056

model-00003-of-00003.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:31a73b4447c20c0bb0cf72b8a999e01ff08a0df3354686b449bdcb560a010c66
+size 981882944

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

modeling_prts_qwen3_vl.py ADDED Viewed

	@@ -0,0 +1,935 @@

+# Copyright 2025 TeleAI Rhodes Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Main VLA model architecture based on Qwen3-VL."""
+from dataclasses import dataclass
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn import CrossEntropyLoss, MSELoss
+from typing import Any, Dict, List, Optional, Tuple, Union
+from transformers.modeling_outputs import ModelOutput
+from transformers.cache_utils import Cache
+from transformers.processing_utils import Unpack
+from transformers.utils import TransformersKwargs, is_torchdynamo_compiling
+from .modeling_qwen3_vl import (
+    Qwen3VLForConditionalGeneration,
+    Qwen3VLTextModel,
+    Qwen3VLVisionModel,
+)
+from .configuration_prts_qwen3_vl import PRTS_FlowMatchingConfig_Qwen3VL
+from .dit_action_head import FlowMatchingDiTHead, MoTFlowMatchingHead
+ACTION_DATASET_NAMES = []
+# ----------------------------- Print Customization -----------------------------
+from colorama import init, Fore, Style
+from datetime import datetime
+# Initialize colorama
+init(autoreset=True)
+class CustomPrinter:
+    """Custom colored printer."""
+    # Define message type configuration
+    TYPE_CONFIG = {
+        'normal': {
+            'color': Fore.WHITE,
+            'icon': '',
+            'prefix': '',
+            'style': Style.NORMAL
+        },
+        'important': {
+            'color': Fore.CYAN,
+            'icon': '💡',
+            'prefix': 'IMPORTANT',
+            'style': Style.BRIGHT
+        }
+    }
+    @classmethod
+    def print(cls, message, msg_type='normal', show_time=True, show_icon=True, end='\n'):
+        """
+        Custom print function.
+        Args:
+            message: The message content to print
+            msg_type: Message type ('normal', 'info', 'success', 'warning', 'error', 'fail', 'debug', 'important')
+            show_time: Whether to display a timestamp
+            show_icon: Whether to display the icon
+            end: Line terminator
+        """
+        # Get configuration for the message type
+        config = cls.TYPE_CONFIG.get(msg_type, cls.TYPE_CONFIG['normal'])
+        # Build prefix parts
+        prefix_parts = []
+        # Add timestamp
+        if show_time:
+            timestamp = datetime.now().strftime('%H:%M:%S')
+            prefix_parts.append(f"[{timestamp}]")
+        # Add icon and prefix text
+        icon_text = f"{config['icon']} " if show_icon else ""
+        prefix_parts.append(f"{icon_text}{config['prefix']}")
+        if config['prefix'] == '':
+            full_message = message
+        else:
+            # Combine prefix parts
+            prefix = " ".join(prefix_parts)
+            # Construct full message
+            full_message = f"{prefix}: {message}"
+        # Apply color and style and print
+        formatted_message = f"{config['style']}{config['color']}{full_message}"
+        print(formatted_message, end=end)
+    @classmethod
+    def normal(cls, message, **kwargs):
+        """Convenience: normal-level print."""
+        cls.print(message, 'normal', **kwargs)
+    @classmethod
+    def important(cls, message, **kwargs):
+        """Convenience: important-level print."""
+        cls.print(message, 'important', **kwargs)
+def important(message, **kwargs):
+    CustomPrinter.important(message, **kwargs)
+# -------------------------------------------------------------
+def create_sinusoidal_pos_embedding(
+    time: torch.Tensor,
+    dimension: int,
+    min_period: float = 4e-3,
+    max_period: float = 4.0,
+    device="cpu",
+) -> torch.Tensor:
+    """
+    Computes sine-cosine positional embedding vectors for scalar positions (diffusion timesteps).
+    Args:
+        time: Tensor of shape (batch_size,) containing timestep values
+        dimension: Embedding dimension (must be even)
+        min_period: Minimum period for sinusoidal encoding
+        max_period: Maximum period for sinusoidal encoding
+        device: Device to create tensors on
+    Returns:
+        Positional embeddings of shape (batch_size, dimension)
+    """
+    if dimension % 2 != 0:
+        raise ValueError(f"dimension ({dimension}) must be divisible by 2")
+    if time.ndim != 1:
+        raise ValueError("The time tensor is expected to be of shape `(batch_size, )`.")
+    fraction = torch.linspace(0.0, 1.0, dimension // 2, device=device)
+    period = min_period * (max_period / min_period) ** fraction
+    scaling_factor = 1.0 / period * 2 * math.pi
+    sin_input = scaling_factor[None, :] * time[:, None]
+    pos_emb = torch.cat([torch.sin(sin_input), torch.cos(sin_input)], dim=1)
+    return pos_emb
+class ContrastiveEncoder(nn.Module):
+    """
+    MLP projector for Contrastive Reinforcement Learning (CRL) embeddings.
+    Projects hidden states to a shared latent space for contrastive learning,
+    with L2 normalization for stable similarity computation.
+    Architecture: N-layer MLP with LayerNorm and Swish activation,
+                  followed by a cold-initialized output projection.
+                  [Linear -> LayerNorm -> Swish] x N -> Linear (cold init)
+    Matches stable_contrastive_rl's Q network structure (default: 4 hidden layers).
+    Args:
+        input_dim: Dimension of input hidden states
+        output_dim: Dimension of output embeddings (default: 256)
+        hidden_dim: Dimension of hidden layers (default: 1024)
+        num_layers: Number of hidden layers (default: 4)
+        repr_norm: Whether to L2-normalize outputs (default: False)
+        init_w: Small value for last layer weight initialization for cold init (default: 1e-12)
+    """
+    def __init__(
+        self,
+        input_dim: int,
+        output_dim: int = 256,
+        hidden_dim: int = 1024,
+        num_layers: int = 4,
+        repr_norm: bool = False,
+        init_w: float = 1e-12,
+    ):
+        super().__init__()
+        self.num_layers = num_layers
+        self.repr_norm = repr_norm
+        # Build hidden layers with LayerNorm
+        self.hidden_layers = nn.ModuleList()
+        self.layer_norms = nn.ModuleList()
+        for i in range(num_layers):
+            in_dim = input_dim if i == 0 else hidden_dim
+            self.hidden_layers.append(nn.Linear(in_dim, hidden_dim))
+            self.layer_norms.append(nn.LayerNorm(hidden_dim))
+        # Output projection layer with cold initialization
+        self.output_proj = nn.Linear(hidden_dim, output_dim)
+        self.output_proj.weight.data.uniform_(-init_w, init_w)
+        self.output_proj.bias.data.fill_(0)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Project input to L2-normalized embedding space.
+        Args:
+            x: Input tensor of shape (batch_size, input_dim)
+        Returns:
+            L2-normalized embeddings of shape (batch_size, output_dim)
+        """
+        # Pass through hidden layers
+        for fc, norm in zip(self.hidden_layers, self.layer_norms):
+            x = fc(x)
+            x = norm(x)
+            x = F.silu(x)
+        # Output projection
+        x = self.output_proj(x)
+        # Optional L2 normalization
+        if self.repr_norm:
+            x = F.normalize(x, dim=-1)
+        return x
+@dataclass
+class PRTS_Qwen3VL_ModelOutputWithPast(ModelOutput):
+    """
+    Output class for PRTS model based on Qwen3-VL.
+    Every field is optional and defaults to None.
+    Args:
+        loss: Combined total loss
+        flow_loss: Flow matching loss for action prediction
+        cross_entropy_loss: Standard language modeling loss
+        crl_loss: Contrastive Reinforcement Learning loss for goal-action alignment
+        logits: Language model logits
+        past_key_values: Cached key-value states
+        hidden_states: Hidden states from all layers (if output_hidden_states=True)
+        attentions: Attention weights (if output_attentions=True)
+        rope_deltas: RoPE position delta information
+        crl_num_samples: Presumably the number of samples that contributed to crl_loss
+            (not documented by the original author — TODO confirm)
+        channel_loss_dict: Per-key (e.g. per-dataset) loss/metric values for logging
+        channel_loss_count_dict: Per-key counts paired with channel_loss_dict, used to
+            normalize the logged values
+    """
+    loss: Optional[torch.FloatTensor] = None
+    flow_loss: Optional[torch.FloatTensor] = None
+    cross_entropy_loss: Optional[torch.FloatTensor] = None
+    crl_loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[List[torch.FloatTensor]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    rope_deltas: Optional[torch.LongTensor] = None
+    crl_num_samples: Optional[torch.LongTensor] = None
+    channel_loss_dict: Optional[dict] = None
+    channel_loss_count_dict: Optional[dict] = None
+class PRTS_Qwen3VL(Qwen3VLForConditionalGeneration):
+    """
+    Vision-Language-Action model based on Qwen3-VL.
+    This model extends Qwen3-VL to support:
+    1. Proprioceptive state embedding and prediction
+    2. Sub-task description generation (language format)
+    3. Action chunk prediction via flow matching (continuous actions)
+    4. Optional discrete action tokenization (fast mode)
+    The model uses a flow matching approach for continuous action prediction, with a DiT
+    (Diffusion Transformer) action head that cross-attends to VLM hidden states.
+    """
+    config: PRTS_FlowMatchingConfig_Qwen3VL
+    _tied_weights_keys = ["lm_head.weight"]
+    _no_split_modules = ["Qwen3VLTextDecoderLayer", "Qwen3VLVisionBlock"]
+    def __init__(
+        self,
+        config: PRTS_FlowMatchingConfig_Qwen3VL,
+    ):
+        """
+        Initialize the PRTS Qwen3-VL model for action processing.
+        All PRTS-specific options are read from ``config`` rather than passed as arguments:
+        ``max_action_dim``, ``use_fast_action_tokenizer``, ``flow_matching_action_loss_weight``,
+        ``action_chunk_size``, ``num_denoise_steps``, ``dit_action_head_config`` and the
+        ``crl_*`` fields.
+        The DiT action head is only created when ``flow_matching_action_loss_weight > 0``;
+        the CRL encoders and temperature only when ``crl_loss_weight > 0``.
+        Args:
+            config: Model configuration
+        """
+        super().__init__(config)
+        # The parent class initializes:
+        # - self.visual: Qwen3VLVisionModel
+        # - self.language_model: Qwen3VLTextModel
+        # - self.lm_head: Language model head
+        # - self.rope_deltas: Cached rope deltas
+        # We keep these and add PRTS-specific components
+        # PRTS-specific parameters
+        self.action_dim = config.max_action_dim
+        self.use_fast_tokenizer = config.use_fast_action_tokenizer
+        self.flow_matching_action_loss_weight = config.flow_matching_action_loss_weight
+        # Loss functions (unreduced, so callers can mask before averaging)
+        self.loss_fct = CrossEntropyLoss(reduction="none")
+        self.loss_mse = MSELoss(reduction="none")
+        # DiT-based flow matching action head: standard (+ AlternateVLDiT) or pi0.5 KV expert
+        # NOTE(review): assumes config.dit_action_head_config is a dict (never None) — confirm.
+        self.use_mot_action_expert = config.dit_action_head_config.get(
+            "use_mot_action_expert", False
+        )
+        if config.flow_matching_action_loss_weight > 0.:
+            if self.use_mot_action_expert:
+                # MoT variant: receives the full text config of the VLM
+                self.dit_action_head = MoTFlowMatchingHead(
+                    action_dim=self.action_dim,
+                    action_chunk_size=config.action_chunk_size,
+                    vlm_config=config.text_config,
+                    num_inference_timesteps=config.num_denoise_steps,
+                    config=config.dit_action_head_config,
+                )
+            else:
+                # Standard variant: cross-attends to features of width text_config.hidden_size
+                self.dit_action_head = FlowMatchingDiTHead(
+                    action_dim=self.action_dim,
+                    action_chunk_size=config.action_chunk_size,
+                    cross_attention_dim=config.text_config.hidden_size,
+                    num_inference_timesteps=config.num_denoise_steps,
+                    config=config.dit_action_head_config,
+                )
+        # CRL (Contrastive Reinforcement Learning) components
+        if config.crl_loss_weight > 0.:
+            hidden_size = config.text_config.hidden_size
+            # Current encoders (trainable)
+            self.crl_action_encoder = ContrastiveEncoder(
+                input_dim=hidden_size,
+                output_dim=config.crl_embed_dim,
+                init_w=config.crl_encoder_init_w,
+                repr_norm=config.crl_repr_norm,
+            )
+            self.crl_goal_encoder = ContrastiveEncoder(
+                input_dim=hidden_size,
+                output_dim=config.crl_embed_dim,
+                init_w=config.crl_encoder_init_w,
+                repr_norm=config.crl_repr_norm,
+            )
+            # Learnable temperature (log-space for numerical stability, CLIP recipe).
+            # Initial value is log(1 / 0.2), i.e. a temperature of 0.2.
+            self.crl_logit_scale = nn.Parameter(
+                torch.ones([], requires_grad=True) * math.log(1 / 0.2)
+            )
+        # Initialize weights
+        # NOTE(review): post_init() runs after the heads above are built; it may re-initialize
+        # their nn.Linear weights (including the encoders' cold-initialized output layer) — confirm.
+        self.post_init()
+        # Print parameter counts
+        visual_params = sum(p.numel() for p in self.visual.parameters())
+        language_params = sum(p.numel() for p in self.language_model.parameters())
+        model_params = visual_params + language_params
+        important(f"Backbone VLM (visual + language_model) parameters: {model_params / 1e6:.2f}M")
+        important(f"Flow Matching Loss coefficient: {self.flow_matching_action_loss_weight}")
+        if config.flow_matching_action_loss_weight > 0.:
+            dit_params = sum(p.numel() for p in self.dit_action_head.parameters())
+            # Get the inner model type name for logging
+            if hasattr(self.dit_action_head, 'dit'):
+                dit_head_type = type(self.dit_action_head.dit).__name__
+            else:
+                dit_head_type = type(self.dit_action_head).__name__
+            important(f"DiT Action Head ({dit_head_type}) parameters: {dit_params / 1e6:.2f}M")
+        if config.crl_loss_weight > 0.:
+            crl_params = sum(p.numel() for p in self.crl_action_encoder.parameters())
+            crl_params += sum(p.numel() for p in self.crl_goal_encoder.parameters())
+            important(f"CRL Encoders (action + goal) parameters: {crl_params / 1e6:.2f}M")
+            important(f"CRL Loss coefficient: {config.crl_loss_weight}")
+            important(f"CRL Encoder init_w: {config.crl_encoder_init_w}")
+            important(f"CRL Repr Norm: {config.crl_repr_norm}")
+        # Default id of the first discrete (FAST) action token; labels >= this value are treated
+        # as action tokens in forward(). Can be overridden via set_fast_action_info().
+        self.fast_action_token_start_idx = 200000
+        # NOTE(review): not read anywhere in the visible part of this file — presumably a CRL option.
+        self.use_multi_positive = True
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+    def set_decoder(self, decoder):
+        self.language_model = decoder
+    def get_decoder(self):
+        return self.language_model
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def to_float32_flow_matching_head(self):
+        """Convert flow matching heads to float32 for numerical stability."""
+        if hasattr(self, 'dit_action_head'):
+            self.dit_action_head = self.dit_action_head.to(dtype=torch.float32)
+    def set_fast_action_info(self, action_mapper, fast_action_token_start_idx):
+        """Set information for fast (discrete) action tokenization."""
+        self.action_mapper = action_mapper
+        self.fast_action_token_start_idx = fast_action_token_start_idx
+    def get_placeholder_mask_with_special_token(
+        self,
+        input_ids: torch.LongTensor,
+        inputs_embeds: torch.FloatTensor,
+        special_features: torch.FloatTensor,
+        special_pad_token_id: int,
+    ):
+        """
+        Get placeholder mask for a specific special token (e.g., state tokens).
+        Similar to get_placeholder_mask but for custom special tokens beyond image/video.
+        """
+        if input_ids is None:
+            special_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(special_pad_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_mask = special_mask.all(-1)
+        else:
+            special_mask = input_ids == special_pad_token_id
+        n_special_tokens = special_mask.sum()
+        special_mask = special_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if special_features is not None and inputs_embeds[special_mask].numel() != special_features.numel():
+            raise ValueError(
+                f"Features and tokens do not match: tokens: {n_special_tokens}, features {special_features.shape[0]}"
+            )
+        return special_mask
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        # use_cache: Optional[bool] = None,
+        # output_attentions: Optional[bool] = None,
+        # output_hidden_states: Optional[bool] = None,
+        # return_dict: Optional[bool] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        # rope_deltas: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        actions: Optional[torch.Tensor] = None,
+        action_is_pad: torch.Tensor | None = None,
+        action_dof_mask: Optional[torch.Tensor] = None,
+        dataset_names: Optional[List[str]] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, PRTS_Qwen3VL_ModelOutputWithPast]:
+        """
+        Forward pass for PRTS_Qwen3VL model.
+        This extends Qwen3VLForConditionalGeneration.forward with:
+        - State embedding injection
+        - Action chunk flow matching
+        - DeepStack visual feature handling
+        """
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        # 1. Prepare input embeddings
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+        image_mask = None
+        video_mask = None
+        # 2. Process images with deepstack features
+        deepstack_image_embeds = None
+        if pixel_values is not None:
+            image_embeds, deepstack_image_embeds = self.get_image_features(pixel_values, image_grid_thw, image_max_seqlen=kwargs['image_max_seqlen'])
+            image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            image_mask, _ = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
+        # 3. Process videos with deepstack features
+        deepstack_video_embeds = None
+        if pixel_values_videos is not None:
+            video_embeds, deepstack_video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
+            video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            _, video_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
+        # 4. Aggregate deepstack visual features
+        visual_pos_masks = None
+        deepstack_visual_embeds = None
+        if image_mask is not None and video_mask is not None:
+            # aggregate visual_pos_masks and deepstack_visual_embeds
+            image_mask = image_mask[..., 0]
+            video_mask = video_mask[..., 0]
+            visual_pos_masks = image_mask | video_mask
+            deepstack_visual_embeds = []
+            image_mask_joint = image_mask[visual_pos_masks]
+            video_mask_joint = video_mask[visual_pos_masks]
+            for img_embed, vid_embed in zip(deepstack_image_embeds, deepstack_video_embeds):
+                embed_joint = img_embed.new_zeros(visual_pos_masks.sum(), img_embed.shape[-1]).to(img_embed.device)
+                embed_joint[image_mask_joint, :] = img_embed
+                embed_joint[video_mask_joint, :] = vid_embed
+                deepstack_visual_embeds.append(embed_joint)
+        elif image_mask is not None:
+            image_mask = image_mask[..., 0]
+            visual_pos_masks = image_mask
+            deepstack_visual_embeds = deepstack_image_embeds
+        elif video_mask is not None:
+            video_mask = video_mask[..., 0]
+            visual_pos_masks = video_mask
+            deepstack_visual_embeds = deepstack_video_embeds
+        if attention_mask is not None:
+            attention_mask = attention_mask.to(inputs_embeds.device)
+        # 7. Calculate position IDs using Qwen3VL's rope index
+        if position_ids is None:
+            attention_mask_tensor = (
+                attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"]
+            )
+            if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
+                attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2)
+                if attention_mask_tensor.dtype.is_floating_point:
+                    attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
+                    attention_mask_tensor = (1.0 - attention_mask_tensor).int()
+            prefill_compiled_stage = is_torchdynamo_compiling() and (
+                (input_ids is not None and input_ids.shape[1] != 1)
+                or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
+            )
+            prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
+                (cache_position is not None and cache_position[0] == 0)
+                or (past_key_values is None or past_key_values.get_seq_length() == 0)
+            )
+            if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
+                position_ids, rope_deltas = self.get_rope_index(
+                    input_ids,
+                    image_grid_thw,
+                    video_grid_thw,
+                    attention_mask=attention_mask_tensor,
+                )
+                self.rope_deltas = rope_deltas
+            else:
+                batch_size, seq_length, _ = inputs_embeds.shape
+                delta = (
+                    (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
+                    if cache_position is not None
+                    else 0
+                )
+                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
+                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
+                if cache_position is not None:  # otherwise `deltas` is an int `0`
+                    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
+                position_ids = position_ids.add(delta)
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
+        _lm_extra_kwargs: dict = {}
+        _use_cache = (
+            self.use_mot_action_expert
+            and self.flow_matching_action_loss_weight > 0.
+            and actions is not None
+        )
+        vlm_outputs = self.language_model(
+            input_ids=None,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=_use_cache,
+            cache_position=cache_position,
+            visual_pos_masks=visual_pos_masks,
+            deepstack_visual_embeds=deepstack_visual_embeds,
+            output_hidden_states=False,
+            **_lm_extra_kwargs,
+            **kwargs,
+        )
+        vlm_hidden_states = vlm_outputs.last_hidden_state
+        # 11. Run DiT action head if actions are present
+        dit_pred_v = None
+        dit_velocity = None
+        if actions is not None and self.flow_matching_action_loss_weight > 0:
+            # vlm_hidden_states shape: bs, seq_length, hidden_size
+            actions_for_dit = actions.to(vlm_hidden_states.device, dtype=vlm_hidden_states.dtype)
+            dof_mask_for_dit = action_dof_mask.to(vlm_hidden_states.device, dtype=vlm_hidden_states.dtype) if action_dof_mask is not None else None
+            # Pass attention_mask so DiT cross-attention ignores padding tokens
+            dit_encoder_attention_mask = attention_mask.bool() if attention_mask is not None else None
+            if self.use_mot_action_expert and vlm_outputs.past_key_values is not None:
+                dit_pred_v, dit_velocity = self.dit_action_head(
+                    vlm_outputs.past_key_values,
+                    actions_for_dit,
+                    dof_mask_for_dit,
+                    encoder_attention_mask=dit_encoder_attention_mask,
+                )
+            else:
+                # Standard: pass single (last-layer) VLM hidden states
+                dit_image_mask = visual_pos_masks.bool() if visual_pos_masks is not None else None
+                dit_pred_v, dit_velocity = self.dit_action_head(
+                    vlm_hidden_states, actions_for_dit, dof_mask_for_dit,
+                    encoder_attention_mask=dit_encoder_attention_mask,
+                    image_mask=dit_image_mask,
+                )
+        # 12. Compute logits
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(vlm_hidden_states[:, slice_indices, :])
+        # 13. Compute losses
+        loss = None
+        cross_entropy_loss, flow_loss = None, None
+        channel_loss_dict = None
+        channel_loss_count_dict = None
+        if labels is not None:
+            loss = 0
+            action_accuracy = 0
+            unique_datasets_name = list(set(dataset_names)) if dataset_names is not None else []
+            # Compute cross-entropy loss
+            shift_logits = logits[..., :-1, :].float().contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            shift_labels = shift_labels.to(shift_logits.device)
+            non_ignored_mask = shift_labels != -100
+            _cross_entropy_loss = self.loss_fct(shift_logits, shift_labels)
+            cross_entropy_loss = (
+                _cross_entropy_loss[non_ignored_mask].mean()
+                if non_ignored_mask.any()
+                else (_cross_entropy_loss.sum() * 0.0)
+            )
+            # Add cross-entropy loss to total
+            if not torch.isnan(cross_entropy_loss):
+                loss += cross_entropy_loss
+            else:
+                with torch.no_grad():
+                    cross_entropy_loss.detach()
+            # Compute action token prediction accuracy (for logging)
+            shift_logits_for_acc = logits[..., :-1, :].contiguous()
+            action_preds = shift_logits_for_acc.argmax(dim=-1)
+            shift_labels_for_acc = labels[..., 1:].contiguous()
+            action_mask = (
+                shift_labels_for_acc >= self.fast_action_token_start_idx
+            )
+            if self.use_fast_tokenizer and action_mask.any():
+                correct_preds = (action_preds == shift_labels_for_acc) & action_mask
+                action_accuracy = (
+                    correct_preds.sum().float() / action_mask.sum().float()
+                )
+                if channel_loss_dict is None:
+                    channel_loss_dict = {}
+                    channel_loss_count_dict = {}
+                channel_loss_dict["action_accuracy"] = action_accuracy.detach()
+                channel_loss_count_dict["action_accuracy"] = torch.tensor(1, device=action_accuracy.device)
+        # 14. Compute flow matching loss (DiT action head)
+        if dit_pred_v is not None and self.flow_matching_action_loss_weight > 0:
+            if channel_loss_dict is not None:
+                channel_loss_dict.update(
+                    {
+                        f"flow_matching/{dataset_name}": torch.tensor(0.0, device=logits.device)
+                        for dataset_name in ACTION_DATASET_NAMES
+                    }
+                )
+                channel_loss_count_dict.update(
+                    {
+                        f"flow_matching/{dataset_name}": torch.tensor(0, device=logits.device)
+                        for dataset_name in ACTION_DATASET_NAMES
+                    }
+                )
+            else:
+                channel_loss_dict = {
+                    f"flow_matching/{dataset_name}": torch.tensor(0.0, device=logits.device)
+                    for dataset_name in ACTION_DATASET_NAMES
+                }
+                channel_loss_count_dict = {
+                    f"flow_matching/{dataset_name}": torch.tensor(0, device=logits.device)
+                    for dataset_name in ACTION_DATASET_NAMES
+                }
+            # Compute flow matching loss: MSE between predicted and target velocity
+            _fm_loss = self.loss_mse(dit_pred_v, dit_velocity)
+            # Apply DOF mask (zero out invalid action dimensions)
+            if action_dof_mask is not None:
+                valid_action_dim = int(action_dof_mask[0, 0, :].sum(dim=-1).item())     # NOTE: only support 单种具身实体数据微调
+                _fm_loss = _fm_loss[:, :, :valid_action_dim]
+            # Apply action_is_pad mask: exclude padding timesteps from loss
+            # action_is_pad: (B, T), True = pad timestep → should not contribute to loss
+            if action_is_pad is not None:
+                valid_timestep_mask = ~action_is_pad[:, :_fm_loss.shape[1]]  # align length
+                _fm_loss = _fm_loss * valid_timestep_mask.unsqueeze(-1)
+                flow_loss = _fm_loss.sum() / (valid_timestep_mask.sum() * _fm_loss.shape[-1])
+            else:
+                flow_loss = _fm_loss.mean()
+            if not torch.isnan(flow_loss):
+                loss = loss + self.flow_matching_action_loss_weight * flow_loss if loss is not None else self.flow_matching_action_loss_weight * flow_loss
+            else:
+                with torch.no_grad():
+                    flow_loss.detach()
+            # Per-dataset flow matching loss logging
+            logging_fm_loss = _fm_loss.detach().mean(dim=(1, 2))  # Sum over chunk_size and action_dim
+            action_dataset_names = dataset_names if dataset_names is not None else []
+            unique_action_datasets = list(set(action_dataset_names))
+            for dataset_name_i in unique_action_datasets:
+                action_dataset_mask = torch.tensor(
+                    [name == dataset_name_i for name in action_dataset_names],
+                    device=logits.device,
+                )
+                if action_dataset_mask.any():
+                    dataset_fm_loss = logging_fm_loss[action_dataset_mask].sum()
+                    dataset_fm_count = action_dataset_mask.sum()
+                    prefixed_key = f"flow_matching/{dataset_name_i}"
+                    channel_loss_dict[prefixed_key] += dataset_fm_loss
+                    channel_loss_count_dict[prefixed_key] += dataset_fm_count
+        elif self.flow_matching_action_loss_weight > 0:
+            # Dummy loss to keep all DiT parameters in computation graph
+            dummy_params = [p.sum() * 0.0 for p in self.dit_action_head.parameters() if p.requires_grad]
+            dummy_loss = sum(dummy_params) if len(dummy_params) > 0 else torch.tensor(0.0, device=logits.device)
+            loss = (loss + dummy_loss) if loss is not None else dummy_loss
+        return PRTS_Qwen3VL_ModelOutputWithPast(
+            loss=loss,
+            cross_entropy_loss=(
+                cross_entropy_loss.detach() if cross_entropy_loss is not None else None
+            ),
+            flow_loss=(
+                flow_loss.detach() if flow_loss is not None else None
+            ),
+            crl_loss=None,
+            logits=logits,
+            past_key_values=vlm_outputs.past_key_values,
+            # hidden_states=vlm_outputs.hidden_states,
+            # attentions=vlm_outputs.attentions,
+            crl_num_samples=None,
+            rope_deltas=self.rope_deltas,
+            channel_loss_dict=channel_loss_dict,
+            channel_loss_count_dict=channel_loss_count_dict,
+        )
+    def embed_prefix(
+        self,
+        input_ids: torch.LongTensor,
+        inputs_embeds: torch.FloatTensor | None = None,
+        pixel_values: torch.Tensor | None = None,
+        pixel_values_videos: torch.FloatTensor | None = None,
+        image_grid_thw: torch.LongTensor | None = None,
+        video_grid_thw: torch.LongTensor | None = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[List[torch.Tensor]]]:
+        """
+        Embed prefix tokens including vision, DeepStack, and (optionally) state features.
+        Returns:
+            (inputs_embeds, visual_pos_masks, deepstack_visual_embeds)
+        """
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+        image_mask = None
+        video_mask = None
+        deepstack_image_embeds = None
+        deepstack_video_embeds = None
+        if pixel_values is not None:
+            image_embeds, deepstack_image_embeds = self.get_image_features(
+                pixel_values, image_grid_thw,
+                image_max_seqlen=kwargs.get('image_max_seqlen'),
+            )
+            image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            image_mask, _ = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
+        if pixel_values_videos is not None:
+            video_embeds, deepstack_video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
+            video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            _, video_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
+        visual_pos_masks = None
+        deepstack_visual_embeds = None
+        if image_mask is not None and video_mask is not None:
+            image_mask = image_mask[..., 0]
+            video_mask = video_mask[..., 0]
+            visual_pos_masks = image_mask | video_mask
+            deepstack_visual_embeds = []
+            image_mask_joint = image_mask[visual_pos_masks]
+            video_mask_joint = video_mask[visual_pos_masks]
+            for img_embed, vid_embed in zip(deepstack_image_embeds, deepstack_video_embeds):
+                embed_joint = img_embed.new_zeros(visual_pos_masks.sum(), img_embed.shape[-1]).to(img_embed.device)
+                embed_joint[image_mask_joint, :] = img_embed
+                embed_joint[video_mask_joint, :] = vid_embed
+                deepstack_visual_embeds.append(embed_joint)
+        elif image_mask is not None:
+            image_mask = image_mask[..., 0]
+            visual_pos_masks = image_mask
+            deepstack_visual_embeds = deepstack_image_embeds
+        elif video_mask is not None:
+            video_mask = video_mask[..., 0]
+            visual_pos_masks = video_mask
+            deepstack_visual_embeds = deepstack_video_embeds
+        return inputs_embeds, visual_pos_masks, deepstack_visual_embeds
+    @torch.no_grad()
+    def sample_actions(
+        self,
+        input_ids: torch.LongTensor | None = None,
+        position_ids: torch.LongTensor | None = None,
+        attention_mask: torch.Tensor | None = None,
+        past_key_values: list[torch.FloatTensor] | None = None,
+        inputs_embeds: torch.FloatTensor | None = None,
+        cache_position: torch.LongTensor | None = None,
+        pixel_values: torch.Tensor | None = None,
+        pixel_values_videos: torch.FloatTensor | None = None,
+        image_grid_thw: torch.LongTensor | None = None,
+        video_grid_thw: torch.LongTensor | None = None,
+        action_dof_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Any]:
+        """
+        Sample actions using DiT-based flow matching denoising.
+        1. Computes position_ids via get_rope_index
+        2. Embeds the prefix (with DeepStack visual features)
+        3. Runs the language model to get hidden states
+        4. Uses DiT action head to denoise actions via cross-attention to VLM features
+        Returns:
+            (x_t, outputs) — denoised action trajectories and language-model outputs
+        """
+        if position_ids is None:
+            position_ids, _ = self.get_rope_index(
+                input_ids,
+                image_grid_thw=image_grid_thw,
+                video_grid_thw=video_grid_thw,
+                attention_mask=attention_mask,
+            )
+        visual_pos_masks = None
+        deepstack_visual_embeds = None
+        if inputs_embeds is None:
+            inputs_embeds, visual_pos_masks, deepstack_visual_embeds = self.embed_prefix(
+                input_ids,
+                pixel_values=pixel_values,
+                pixel_values_videos=pixel_values_videos,
+                image_grid_thw=image_grid_thw,
+                video_grid_thw=video_grid_thw,
+                **kwargs,
+            )
+        _sample_use_cache = (
+            self.use_mot_action_expert and self.flow_matching_action_loss_weight > 0
+        )
+        outputs = self.language_model(
+            input_ids=None,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=_sample_use_cache,
+            cache_position=cache_position,
+            visual_pos_masks=visual_pos_masks,
+            deepstack_visual_embeds=deepstack_visual_embeds,
+            output_hidden_states=False,
+        )
+        vlm_hidden_states = outputs.last_hidden_state
+        dit_encoder_attention_mask = attention_mask.bool() if attention_mask is not None else None
+        if self.use_mot_action_expert and outputs.past_key_values is not None:
+            x_t = self.dit_action_head.predict_action(
+                outputs.past_key_values,
+                action_dof_mask,
+                encoder_attention_mask=dit_encoder_attention_mask,
+            )
+        else:
+            dit_image_mask = visual_pos_masks.bool() if visual_pos_masks is not None else None
+            x_t = self.dit_action_head.predict_action(
+                vlm_hidden_states, action_dof_mask,
+                encoder_attention_mask=dit_encoder_attention_mask,
+                image_mask=dit_image_mask,
+            )
+        return x_t, outputs
+# Register PRTS_Qwen3VL for auto-class loading (see transformers `register_for_auto_class`).
+PRTS_Qwen3VL.register_for_auto_class()
+# Names exported by `from <module> import *`.
+__all__ = ["PRTS_Qwen3VL", "PRTS_Qwen3VL_ModelOutputWithPast"]

modeling_qwen3_vl.py ADDED Viewed

	@@ -0,0 +1,1645 @@

+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/qwen3_vl/modular_qwen3_vl.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_qwen3_vl.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Callable, Optional, Union
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers.activations import ACT2FN
+from transformers.cache_utils import Cache, DynamicCache
+from transformers.generation import GenerationMixin
+from transformers.integrations import use_kernel_forward_from_hub
+from transformers.masking_utils import create_causal_mask
+from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
+from transformers.modeling_layers import GradientCheckpointingLayer
+from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from transformers.processing_utils import Unpack
+from transformers.utils import TransformersKwargs, auto_docstring, is_torchdynamo_compiling
+from transformers.utils.deprecation import deprecate_kwarg
+from transformers.utils.generic import check_model_inputs
+from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig, Qwen3VLTextConfig, Qwen3VLVisionConfig
+# Imported at the top of the file: optional fused kernels. Each HAS_* flag records whether the
+# corresponding import succeeded; callers fall back to the pure-PyTorch path when it is False.
+try:
+    from qwen_rope_kernel_2 import fused_qwen_rope as fused_qwen_rope_v2
+    HAS_QWEN_ROPE_V2 = True
+except ImportError:
+    # Kernel package not installed: report it and disable the fused RoPE path.
+    print("No qwen_rope_kernel_2 found")
+    HAS_QWEN_ROPE_V2 = False
+try:
+    from fused_rmsnorm import RMSNormModelFunction as _FUSED_RMSFUNC
+    HAS_FUSED_RMSNORM = True
+except ImportError:
+    # Kernel package not installed: report it and disable the fused RMSNorm path.
+    print("No fused_rmsnorm found")
+    HAS_FUSED_RMSNORM = False
+class Qwen3VLVisionMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.linear_fc1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=True)
+        self.linear_fc2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=True)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, hidden_state):
+        return self.linear_fc2(self.act_fn(self.linear_fc1(hidden_state)))
+class Qwen3VLVisionPatchEmbed(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        self.patch_size = config.patch_size
+        self.temporal_patch_size = config.temporal_patch_size
+        self.in_channels = config.in_channels
+        self.embed_dim = config.hidden_size
+        kernel_size = [self.temporal_patch_size, self.patch_size, self.patch_size]
+        self.proj = nn.Conv3d(self.in_channels, self.embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=True)
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        target_dtype = self.proj.weight.dtype
+        hidden_states = hidden_states.view(
+            -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
+        )
+        hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
+        return hidden_states
+class Qwen3VLVisionRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+    def __init__(self, dim: int, theta: float = 10000.0) -> None:
+        super().__init__()
+        inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+    def forward(self, seqlen: int) -> torch.Tensor:
+        seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        freqs = torch.outer(seq, self.inv_freq)
+        return freqs
+class Qwen3VLVisionPatchMerger(nn.Module):
+    def __init__(self, config: Qwen3VLVisionConfig, use_postshuffle_norm=False) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size * (config.spatial_merge_size**2)
+        self.use_postshuffle_norm = use_postshuffle_norm
+        self.norm = nn.LayerNorm(self.hidden_size if use_postshuffle_norm else config.hidden_size, eps=1e-6)
+        self.linear_fc1 = nn.Linear(self.hidden_size, self.hidden_size)
+        self.act_fn = nn.GELU()
+        self.linear_fc2 = nn.Linear(self.hidden_size, config.out_hidden_size)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.norm(x.view(-1, self.hidden_size) if self.use_postshuffle_norm else x).view(-1, self.hidden_size)
+        x = self.linear_fc2(self.act_fn(self.linear_fc1(x)))
+        return x
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb_vision(
+    q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor]:
+    if HAS_QWEN_ROPE_V2 and q.is_cuda and q.dtype == torch.bfloat16 and q.shape[-1] in (64, 128):
+        # qwen_rope_kernel_2 handles (S, D) cos/sin for (S, H, D) input naturally.
+        # The kernel REQUIRES cos/sin to be 2D [S, D] if input is 3D [S, H, D].
+        # It DOES NOT support 3D [S, 1, D] for cos/sin.
+        if cos.dtype != torch.float32:
+            cos = cos.to(torch.float32)
+        if sin.dtype != torch.float32:
+            sin = sin.to(torch.float32)
+        # Proactively squeeze [S, 1, D] -> [S, D] to satisfy kernel requirements
+        # This is a view operation, zero memory copy overhead.
+        if cos.ndim == 3 and cos.shape[1] == 1:
+            cos = cos.squeeze(1)
+            sin = sin.squeeze(1)
+        return fused_qwen_rope_v2(q, cos, sin), fused_qwen_rope_v2(k, cos, sin)
+    orig_q_dtype = q.dtype
+    orig_k_dtype = k.dtype
+    q, k = q.float(), k.float()
+    if cos.ndim == 2:
+        cos = cos.unsqueeze(-2)
+        sin = sin.unsqueeze(-2)
+    if cos.dtype != torch.float32:
+        cos = cos.to(torch.float32)
+    if sin.dtype != torch.float32:
+        sin = sin.to(torch.float32)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed.to(orig_q_dtype), k_embed.to(orig_k_dtype)
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+    return attn_output, attn_weights
+class Qwen3VLVisionAttention(nn.Module):
+    """Non-causal self-attention over a packed sequence of vision patches.
+    `cu_seqlens` marks the boundaries of the individual sequences packed along the token axis;
+    attention is restricted to tokens inside the same segment, either through the varlen
+    flash-attention arguments or by splitting the sequence and attending chunk by chunk.
+    """
+    def __init__(self, config: Qwen3VLVisionConfig) -> None:
+        super().__init__()
+        self.dim = config.hidden_size
+        self.num_heads = config.num_heads
+        self.head_dim = self.dim // self.num_heads
+        self.num_key_value_groups = 1  # needed for eager attention
+        # A single projection produces query, key and value together.
+        self.qkv = nn.Linear(self.dim, self.dim * 3, bias=True)
+        self.proj = nn.Linear(self.dim, self.dim)
+        self.scaling = self.head_dim**-0.5
+        self.config = config
+        self.attention_dropout = 0.0
+        self.is_causal = False
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """Run self-attention over the packed patch sequence.
+        Args:
+            hidden_states: tensor whose first axis is the packed token axis.
+            cu_seqlens: segment boundaries; consecutive differences give the segment lengths.
+            rotary_pos_emb: accepted but not read by this method.
+            position_embeddings: `(cos, sin)` pair. NOTE(review): annotated Optional, but it is
+                unpacked unconditionally, so passing None raises.
+            **kwargs: forwarded unchanged to the attention implementation. `image_max_seqlen`,
+                when present and not None, is used as the max segment length on the flash path.
+        Returns:
+            Tensor of shape (seq_length, -1) after the output projection.
+        """
+        seq_length = hidden_states.shape[0]
+        # (seq, 3*dim) -> (seq, 3, heads, head_dim) -> (3, seq, heads, head_dim) -> three tensors.
+        query_states, key_states, value_states = (
+            self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+        )
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin)
+        # (seq, heads, head_dim) -> (1, heads, seq, head_dim): a batch of one packed sequence.
+        query_states = query_states.transpose(0, 1).unsqueeze(0)
+        key_states = key_states.transpose(0, 1).unsqueeze(0)
+        value_states = value_states.transpose(0, 1).unsqueeze(0)
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        if self.config._attn_implementation in ["flash_attention_2", "flash_attention_3"]:
+            # Flash Attention 2: Use cu_seqlens for variable length attention
+            if "image_max_seqlen" in kwargs and kwargs["image_max_seqlen"] is not None:
+                # Caller supplied the max segment length; no need to derive it from cu_seqlens.
+                max_seqlen = kwargs["image_max_seqlen"]
+            else:
+                max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
+            attn_output, _ = attention_interface(
+                self,
+                query_states,
+                key_states,
+                value_states,
+                attention_mask=None,
+                scaling=self.scaling,
+                dropout=0.0 if not self.training else self.attention_dropout,
+                cu_seq_lens_q=cu_seqlens,
+                cu_seq_lens_k=cu_seqlens,
+                max_length_q=max_seqlen,
+                max_length_k=max_seqlen,
+                is_causal=False,
+                **kwargs,
+            )
+        else:
+            # Other implementations: Process each chunk separately
+            lengths = cu_seqlens[1:] - cu_seqlens[:-1]
+            # Split q, k and v along the token axis (dim=2) into one piece per segment.
+            splits = [
+                torch.split(tensor, lengths.tolist(), dim=2) for tensor in (query_states, key_states, value_states)
+            ]
+            attn_outputs = [
+                attention_interface(
+                    self,
+                    q,
+                    k,
+                    v,
+                    attention_mask=None,
+                    scaling=self.scaling,
+                    dropout=0.0 if not self.training else self.attention_dropout,
+                    is_causal=False,
+                    **kwargs,
+                )[0]
+                for q, k, v in zip(*splits)
+            ]
+            # Per-segment outputs are concatenated back along dim=1.
+            attn_output = torch.cat(attn_outputs, dim=1)
+        attn_output = attn_output.reshape(seq_length, -1).contiguous()
+        attn_output = self.proj(attn_output)
+        return attn_output
+class Qwen3VLVisionBlock(GradientCheckpointingLayer):
+    def __init__(self, config, attn_implementation: str = "sdpa") -> None:
+        super().__init__()
+        self.norm1 = nn.LayerNorm(config.hidden_size, eps=1e-6)
+        self.norm2 = nn.LayerNorm(config.hidden_size, eps=1e-6)
+        self.attn = Qwen3VLVisionAttention(config=config)
+        self.mlp = Qwen3VLVisionMLP(config=config)
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cu_seqlens: torch.Tensor,
+        rotary_pos_emb: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        hidden_states = hidden_states + self.attn(
+            self.norm1(hidden_states),
+            cu_seqlens=cu_seqlens,
+            rotary_pos_emb=rotary_pos_emb,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
+        return hidden_states
+class Qwen3VLTextRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+    def __init__(self, config: Qwen3VLTextConfig, device=None):
+        super().__init__()
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get("rope_type", "default")
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+        self.mrope_section = config.rope_scaling.get("mrope_section", [24, 20, 20])
+    def apply_interleaved_mrope(self, freqs, mrope_section):
+        """Apply interleaved MRoPE to 3D rotary embeddings.
+        Reorganizes frequency layout from chunked [TTT...HHH...WWW] to
+        interleaved [THTHWHTHW...TT], preserving frequency continuity.
+        args:
+            x: (3, bs, seq_len, head_dim // 2)
+            mrope_section: (3,)
+        returns:
+            x_t: (bs, seq_len, head_dim // 2)
+        """
+        freqs_t = freqs[0]  # just overwrite the first dimension T
+        for dim, offset in enumerate((1, 2), start=1):  # H, W
+            length = mrope_section[dim] * 3
+            idx = slice(offset, length, 3)
+            freqs_t[..., idx] = freqs[dim, ..., idx]
+        return freqs_t
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        if position_ids.ndim == 2:
+            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
+        inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
+        device = inv_freq_expanded.device
+        position_ids_expanded = position_ids[:, :, None, :].float().to(device)
+        freqs = (inv_freq_expanded @ position_ids_expanded).transpose(2, 3)
+        freqs = self.apply_interleaved_mrope(freqs, self.mrope_section)
+        emb = torch.cat((freqs, freqs), dim=-1)
+        cos = emb.cos() * self.attention_scaling
+        sin = emb.sin() * self.attention_scaling
+        return cos.contiguous(), sin.contiguous()
+@use_kernel_forward_from_hub("RMSNorm")
+class Qwen3VLTextRMSNorm(nn.Module):
+    """RMS normalization with a learned per-channel scale and an optional fused-kernel path."""
+    def __init__(self, hidden_size, eps: float = 1e-6) -> None:
+        """
+        Qwen3VLTextRMSNorm is equivalent to T5LayerNorm
+        Args:
+            hidden_size: size of the normalized (last) dimension.
+            eps: value added to the variance before the reciprocal square root.
+        """
+        super().__init__()
+        # NOTE(review): the scale is created in bfloat16 rather than the default dtype,
+        # presumably to match the fused kernel; confirm this is intended for non-bf16 runs.
+        self.weight = nn.Parameter(torch.ones(hidden_size, dtype=torch.bfloat16))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Normalize `hidden_states` over its last dimension and scale by `self.weight`."""
+        if HAS_FUSED_RMSNORM and hidden_states.is_cuda:
+            # Fused path: the input is cast to bfloat16 (if needed) and made contiguous first.
+            # NOTE(review): the output dtype of the fused function is not visible here and may
+            # differ from the input dtype when the input was not bfloat16; confirm.
+            x = hidden_states if hidden_states.dtype == torch.bfloat16 else hidden_states.to(torch.bfloat16)
+            x = x.contiguous()
+            return _FUSED_RMSFUNC.apply(x, self.weight, self.variance_epsilon, self.weight.shape[0])
+        # Fallback: compute the statistics in float32, cast back, then apply the scale.
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+    def extra_repr(self):
+        """Show the weight shape and epsilon in the module repr."""
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    if HAS_QWEN_ROPE_V2 and q.is_cuda and q.dtype == torch.bfloat16 and q.shape[-1] in (64, 128):
+        # qwen_rope_kernel_2 handles (S, D) cos/sin for (S, H, D) input naturally.
+        if cos.dtype != torch.float32:
+            cos = cos.to(torch.float32)
+        if sin.dtype != torch.float32:
+            sin = sin.to(torch.float32)
+        return fused_qwen_rope_v2(q, cos, sin), fused_qwen_rope_v2(k, cos, sin)
+    if cos.ndim != q.ndim:
+        cos = cos.unsqueeze(unsqueeze_dim)
+        sin = sin.unsqueeze(unsqueeze_dim)
+    if cos.dtype != q.dtype:
+        cos = cos.to(q.dtype)
+        sin = sin.to(q.dtype)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+class Qwen3VLTextAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper
+    Queries and keys are RMS-normalized per head (over `head_dim`) before rotary embeddings are
+    applied. The number of key/value heads may be smaller than the number of query heads.
+    """
+    def __init__(self, config: Qwen3VLTextConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        # Falls back to hidden_size // num_attention_heads when the config has no `head_dim`.
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.q_norm = Qwen3VLTextRMSNorm(self.head_dim, eps=config.rms_norm_eps)  # unlike olmo, only on the head dim!
+        self.k_norm = Qwen3VLTextRMSNorm(
+            self.head_dim, eps=config.rms_norm_eps
+        )  # thus post q_norm does not need reshape
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Compute self-attention for one decoder layer.
+        Args:
+            hidden_states: input whose last axis is the hidden size.
+            position_embeddings: `(cos, sin)` rotary tables.
+            attention_mask: mask passed through to the attention implementation.
+            past_key_values: optional cache; when given it is updated with this layer's
+                keys/values and the returned tensors are used for attention.
+            cache_position: forwarded to the cache update.
+            **kwargs: forwarded to the attention implementation.
+        Returns:
+            `(attn_output, attn_weights)` where `attn_output` has passed through `o_proj`.
+        """
+        input_shape = hidden_states.shape[:-1]
+        # Split the projected width into (heads, head_dim); -1 lets the head count differ for q and k/v.
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+        # Eager attention unless the config selects another registered implementation.
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+        # Merge the head axes back into a single feature axis before the output projection.
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+class Qwen3VLTextMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+class Qwen3VLTextDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Qwen3VLTextConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Qwen3VLTextAttention(config=config, layer_idx=layer_idx)
+        self.mlp = Qwen3VLTextMLP(config)
+        self.input_layernorm = Qwen3VLTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Qwen3VLTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention. DEBUG: When we use packing mode, here we would enter `qwen3vl_forward` in `train_utils.py`
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Llava outputs, with hidden states and attentions.
+    """
+)
+class Qwen3VLModelOutputWithPast(ModelOutput):
+    r"""
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+        The rope index difference between sequence length and multimodal rope.
+    """
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    rope_deltas: Optional[torch.LongTensor] = None
+@auto_docstring
+class Qwen3VLPreTrainedModel(PreTrainedModel):
+    # Shared base class for the Qwen3VL models in this file. It only declares class-level
+    # settings; their exact semantics are defined by transformers' `PreTrainedModel`.
+    config: Qwen3VLConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    # Module classes named here are presumably kept whole when the model is sharded across devices.
+    _no_split_modules = ["Qwen3VLTextDecoderLayer", "Qwen3VLVisionBlock"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    # Maps output names to the module classes associated with them.
+    _can_record_outputs = {
+        "hidden_states": Qwen3VLTextDecoderLayer,
+        "attentions": Qwen3VLTextAttention,
+    }
+class Qwen3VLVisionModel(Qwen3VLPreTrainedModel):
+    config: Qwen3VLVisionConfig
+    _no_split_modules = ["Qwen3VLVisionBlock"]
+    def __init__(self, config, *inputs, **kwargs) -> None:
+        super().__init__(config, *inputs, **kwargs)
+        self.spatial_merge_size = config.spatial_merge_size
+        self.patch_size = config.patch_size
+        self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
+        self.patch_embed = Qwen3VLVisionPatchEmbed(
+            config=config,
+        )
+        self.pos_embed = nn.Embedding(config.num_position_embeddings, config.hidden_size)
+        self.num_grid_per_side = int(config.num_position_embeddings**0.5)
+        head_dim = config.hidden_size // config.num_heads
+        self.rotary_pos_emb = Qwen3VLVisionRotaryEmbedding(head_dim // 2)
+        self.blocks = nn.ModuleList([Qwen3VLVisionBlock(config) for _ in range(config.depth)])
+        self.merger = Qwen3VLVisionPatchMerger(
+            config=config,
+            use_postshuffle_norm=False,
+        )
+        self.deepstack_visual_indexes = config.deepstack_visual_indexes
+        self.deepstack_merger_list = nn.ModuleList(
+            [
+                Qwen3VLVisionPatchMerger(
+                    config=config,
+                    use_postshuffle_norm=True,
+                )
+                for _ in range(len(config.deepstack_visual_indexes))
+            ]
+        )
+        self.gradient_checkpointing = False
+    def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor:
+        merge_size = self.spatial_merge_size
+        max_hw = int(grid_thw[:, 1:].max().item())
+        freq_table = self.rotary_pos_emb(max_hw)  # (max_hw, dim // 2)
+        device = freq_table.device
+        total_tokens = int(torch.prod(grid_thw, dim=1).sum().item())
+        # pos_ids = torch.empty((total_tokens, 2), dtype=torch.long, device=device)
+        pos_ids_cpu = torch.empty((total_tokens, 2) , dtype=torch.long , device="cpu")
+        offset = 0
+        for num_frames, height, width in grid_thw.numpy():
+            merged_h, merged_w = height // merge_size, width // merge_size
+            block_rows = torch.arange(merged_h, device="cpu")  # block row indices
+            block_cols = torch.arange(merged_w, device="cpu")  # block col indices
+            intra_row = torch.arange(merge_size, device="cpu")  # intra-block row offsets
+            intra_col = torch.arange(merge_size, device="cpu")  # intra-block col offsets
+            # Compute full-resolution positions
+            row_idx = block_rows[:, None, None, None] * merge_size + intra_row[None, None, :, None]
+            col_idx = block_cols[None, :, None, None] * merge_size + intra_col[None, None, None, :]
+            row_idx = row_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)
+            col_idx = col_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1)
+            coords = torch.stack((row_idx, col_idx), dim=-1)
+            if num_frames > 1:
+                coords = coords.repeat(num_frames, 1)
+            num_tokens = coords.shape[0]
+            pos_ids_cpu[offset : offset + num_tokens] = coords
+            offset += num_tokens
+        pos_ids = pos_ids_cpu.to(device , non_blocking=True)
+        embeddings = freq_table[pos_ids]  # lookup rotary embeddings
+        embeddings = embeddings.flatten(1)
+        return embeddings
+    def fast_pos_embed_interpolate(self, grid_thw):
+        """
+        Bilinearly interpolate the learned `pos_embed` table to each image's (h, w) patch grid.
+
+        For every image, each target patch position is mapped onto the
+        `num_grid_per_side x num_grid_per_side` table, the four surrounding table entries are
+        looked up and blended with bilinear weights. The result is then reordered so that patches
+        of one `spatial_merge_size x spatial_merge_size` block are contiguous, and repeated for
+        every temporal frame.
+
+        Args:
+            grid_thw: tensor with one `(t, h, w)` row per image/video. The caller in this file
+                passes a CPU copy.
+        Returns:
+            Tensor of interpolated position embeddings, one row per patch, on the device of
+            `self.pos_embed.weight`.
+        """
+        # grid_thw is already a CPU tensor; unpack its columns directly
+        grid_ts, grid_hs, grid_ws = grid_thw[:, 0], grid_thw[:, 1], grid_thw[:, 2]
+        # One accumulator per bilinear corner: [floor/floor, floor/ceil, ceil/floor, ceil/ceil]
+        idx_accum = [[] for _ in range(4)]
+        weight_accum = [[] for _ in range(4)]
+        # Fetch the config value up front to avoid attribute lookups inside the loop
+        num_grid = self.num_grid_per_side
+        # This step still has to loop on the CPU because H/W vary per image, but it is pure arithmetic and fast
+        for h, w in zip(grid_hs, grid_ws):
+            # Fractional coordinates of the h (resp. w) target positions in the source table
+            h_idxs = torch.linspace(0, num_grid - 1, h)
+            w_idxs = torch.linspace(0, num_grid - 1, w)
+            h_idxs_floor = h_idxs.int()
+            w_idxs_floor = w_idxs.int()
+            # Clamp so the upper neighbour never falls outside the table
+            h_idxs_ceil = (h_idxs_floor + 1).clamp(max=num_grid - 1)
+            w_idxs_ceil = (w_idxs_floor + 1).clamp(max=num_grid - 1)
+            # Fractional parts act as the interpolation weights
+            dh = h_idxs - h_idxs_floor
+            dw = w_idxs - w_idxs_floor
+            # Row-major flattening of the table: index = row * num_grid + col
+            base_h = h_idxs_floor * num_grid
+            base_h_ceil = h_idxs_ceil * num_grid
+            indices = [
+                (base_h[:, None] + w_idxs_floor[None, :]).flatten(),
+                (base_h[:, None] + w_idxs_ceil[None, :]).flatten(),
+                (base_h_ceil[:, None] + w_idxs_floor[None, :]).flatten(),
+                (base_h_ceil[:, None] + w_idxs_ceil[None, :]).flatten(),
+            ]
+            weights = [
+                ((1 - dh)[:, None] * (1 - dw)[None, :]).flatten(),
+                ((1 - dh)[:, None] * dw[None, :]).flatten(),
+                (dh[:, None] * (1 - dw)[None, :]).flatten(),
+                (dh[:, None] * dw[None, :]).flatten(),
+            ]
+            # Append the tensors directly, without tolist()
+            for i in range(4):
+                idx_accum[i].append(indices[i])
+                weight_accum[i].append(weights[i])
+        target_device = self.pos_embed.weight.device
+        target_dtype = self.pos_embed.weight.dtype
+        # Shape (4, total_hw): one row per corner, all images concatenated
+        idx_tensor = torch.stack([torch.cat(acc) for acc in idx_accum]).to(device=target_device, dtype=torch.long)
+        weight_tensor = torch.stack([torch.cat(acc) for acc in weight_accum]).to(device=target_device, dtype=target_dtype)
+        pos_embeds = self.pos_embed(idx_tensor) * weight_tensor[:, :, None]
+        # Sum the four weighted corners -> one embedding per (image, row, col) in row-major order
+        patch_pos_embeds = pos_embeds.sum(dim=0)
+        merge_size = self.config.spatial_merge_size
+        indices_list = []
+        current_offset = 0
+        for t, h, w in zip(grid_ts.tolist(), grid_hs.tolist(), grid_ws.tolist()):
+            local_ids = torch.arange(h * w, device='cpu').view(h, w)
+            # Reorder from row-major to merge-block-major: (h_blk, w_blk, intra_h, intra_w)
+            local_ids_permuted = (
+                local_ids.view(h // merge_size, merge_size, w // merge_size, merge_size)
+                .permute(0, 2, 1, 3)
+                .reshape(-1)
+            )
+            # Shift into this image's slice of the concatenated embedding rows
+            global_ids = local_ids_permuted + current_offset
+            if t > 1:
+                # All frames of a video reuse the same spatial embeddings
+                global_ids = global_ids.repeat(t)
+            indices_list.append(global_ids)
+            current_offset += h * w
+        all_indices = torch.cat(indices_list).to(target_device)
+        # Single gather that applies both the block reordering and the temporal repetition
+        patch_pos_embeds = patch_pos_embeds[all_indices]
+        return patch_pos_embeds
+    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor, **kwargs) -> tuple[torch.Tensor, list[torch.Tensor]]:
+        """
+        Run the vision encoder over a flat sequence of patches.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                Patch inputs; they are passed through `self.patch_embed` first, after which the
+                tensor is treated as `(seq_len, hidden_size)`.
+            grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
+                The temporal, height and width of feature shape of each image in LLM.
+            **kwargs:
+                Forwarded unchanged to every vision block.
+        Returns:
+            `tuple`: `(hidden_states, deepstack_feature_lists)` where `hidden_states` is the output
+            of `self.merger` and `deepstack_feature_lists` holds one merged feature tensor for each
+            layer index listed in `self.deepstack_visual_indexes`, in the order the layers run.
+        """
+        hidden_states = self.patch_embed(hidden_states)
+        # Position-index construction runs on the CPU, so take one CPU copy of grid_thw up front
+        grid_thw_cpu = grid_thw.cpu()
+        pos_embeds = self.fast_pos_embed_interpolate(grid_thw_cpu)
+        hidden_states = hidden_states + pos_embeds
+        rotary_pos_emb = self.rot_pos_emb(grid_thw_cpu)
+        seq_len, _ = hidden_states.size()
+        hidden_states = hidden_states.reshape(seq_len, -1)
+        rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
+        # Duplicate the frequencies along the last dim before taking cos/sin
+        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+        # cos/sin are kept in float32 and get an extra axis at position -2
+        # NOTE(review): presumably the extra axis broadcasts over attention heads — confirm against the block
+        cos = emb.cos().to(torch.float32).unsqueeze(-2).contiguous()
+        sin = emb.sin().to(torch.float32).unsqueeze(-2).contiguous()
+        cos = cos.to(device=hidden_states.device, non_blocking=True)
+        sin = sin.to(device=hidden_states.device, non_blocking=True)
+        position_embeddings = (cos, sin)
+        # Use the original (non-CPU-copied) grid_thw here.
+        # cu_seqlens: cumulative patch counts with one entry per frame (h * w repeated t times),
+        # prefixed with 0 by the pad below.
+        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
+            dim=0,
+            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
+        )
+        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
+        cu_seqlens = cu_seqlens.to(device=hidden_states.device)
+        deepstack_feature_lists = []
+        for layer_num, blk in enumerate(self.blocks):
+            if self.gradient_checkpointing and self.training:
+                # NOTE(review): presumably switches off the block's own checkpointing so that only
+                # the explicit wrapper below checkpoints it — confirm against the block class.
+                blk.gradient_checkpointing = False
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # inputs: hidden_states, cu_seqlens, None, position_embeddings, kwargs dict
+                        return module(inputs[0], inputs[1], inputs[2], inputs[3], **inputs[4])
+                    return custom_forward
+                hidden_states = self._gradient_checkpointing_func(
+                    create_custom_forward(blk),
+                    hidden_states,
+                    cu_seqlens,
+                    None,
+                    position_embeddings,
+                    kwargs,
+                )
+            else:
+                hidden_states = blk(
+                    hidden_states,
+                    cu_seqlens=cu_seqlens,
+                    position_embeddings=position_embeddings,
+                    **kwargs,
+                )
+            # Tap the output of selected layers and pass it through that layer's dedicated merger
+            if layer_num in self.deepstack_visual_indexes:
+                deepstack_feature = self.deepstack_merger_list[self.deepstack_visual_indexes.index(layer_num)](
+                    hidden_states
+                )
+                deepstack_feature_lists.append(deepstack_feature)
+        hidden_states = self.merger(hidden_states)
+        return hidden_states, deepstack_feature_lists
+@auto_docstring(
+    custom_intro=(
+        "Text part of Qwen3VL, "
+        "not a pure text-only model, as DeepStack integrates visual features into the early hidden states."
+    )
+)
+class Qwen3VLTextModel(Qwen3VLPreTrainedModel):
+    # Decoder stack: token embedding -> decoder layers (with optional DeepStack visual
+    # injection after the first few layers) -> final RMSNorm.
+    config: Qwen3VLTextConfig
+    _no_split_modules = ["Qwen3VLTextDecoderLayer"]
+    def __init__(self, config: Qwen3VLTextConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Qwen3VLTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Qwen3VLTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Qwen3VLTextRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_input_embeddings(self):
+        # Return the token embedding module.
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        # Replace the token embedding module.
+        self.embed_tokens = value
+    @check_model_inputs()
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        # args for deepstack
+        visual_pos_masks: Optional[torch.Tensor] = None,
+        deepstack_visual_embeds: Optional[list[torch.Tensor]] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        r"""
+        visual_pos_masks (`torch.Tensor` of shape `(batch_size, seqlen)`, *optional*):
+            The mask of the visual positions.
+        deepstack_visual_embeds (`list[torch.Tensor]`, *optional*):
+            The deepstack visual embeddings. The shape is (num_layers, visual_seqlen, embed_dim).
+            The feature is extracted from the different visual encoder layers, and fed to the decoder
+            hidden states. It's from the paper DeepStack(https://arxiv.org/abs/2406.04334).
+        """
+        # Exactly one of input_ids / inputs_embeds must be supplied
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        # torch.jit.trace() doesn't support cache objects in the output
+        if use_cache and past_key_values is None and not torch.jit.is_tracing():
+            past_key_values = DynamicCache(config=self.config)
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        if cache_position is None:
+            # Continue counting from however many tokens the cache already holds
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        # the hard coded `3` is for temporal, height and width.
+        if position_ids is None:
+            position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1)  # (3, bs, seq_length)
+        elif position_ids.ndim == 2:
+            position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1)
+        # A leading dimension of 4 means row 0 holds the text positions and rows 1: hold the
+        # three rotary axes; otherwise row 0 of the 3-axis ids doubles as the text positions.
+        if position_ids.ndim == 3 and position_ids.shape[0] == 4:
+            text_position_ids = position_ids[0]
+            position_ids = position_ids[1:]
+        else:
+            text_position_ids = position_ids[0]
+        # NOTE: Attention! When we use packing mode, this `create_causal_mask` is overwritten, and directly returns `attention_mask`.
+        attention_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=text_position_ids,
+        )
+        hidden_states = inputs_embeds
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        cos, sin = position_embeddings
+        # Insert an extra axis at dim 1 once here so every layer receives the same prepared tensors
+        # NOTE(review): presumably this is the attention-head axis — confirm against the attention module
+        cos = cos.to(device=hidden_states.device, non_blocking=True).unsqueeze(1).contiguous()
+        sin = sin.to(device=hidden_states.device, non_blocking=True).unsqueeze(1).contiguous()
+        position_embeddings = (cos, sin)
+        # decoder layers
+        for layer_idx, decoder_layer in enumerate(self.layers):
+            if self.gradient_checkpointing and self.training:
+                # NOTE(review): presumably switches off the layer's own checkpointing so that only
+                # the explicit wrapper below checkpoints it — confirm against the layer class.
+                decoder_layer.gradient_checkpointing = False
+                def create_custom_forward(module):  # DEBUG: Here we enter the Qwen3VLTextDecoderLayer forward
+                    def custom_forward(*inputs):
+                        # inputs: hidden_states, position_embeddings, attention_mask, position_ids, past_key_values, use_cache, cache_position, kwargs_dict
+                        return module(
+                            inputs[0],
+                            inputs[1],
+                            attention_mask=inputs[2],
+                            position_ids=inputs[3],
+                            past_key_values=inputs[4],
+                            use_cache=inputs[5],
+                            cache_position=inputs[6],
+                            **inputs[7]
+                        )
+                    return custom_forward
+                layer_outputs = self._gradient_checkpointing_func(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    position_embeddings,
+                    attention_mask,
+                    text_position_ids,
+                    past_key_values,
+                    False, # use_cache
+                    cache_position,
+                    kwargs,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=text_position_ids,
+                    past_key_values=past_key_values,
+                    cache_position=cache_position,
+                    position_embeddings=position_embeddings,
+                    **kwargs,
+                )
+            hidden_states = layer_outputs
+            # add visual features to the hidden states of first several layers
+            # (layer i receives deepstack_visual_embeds[i] for as many entries as the list holds)
+            if deepstack_visual_embeds is not None and layer_idx in range(len(deepstack_visual_embeds)):
+                hidden_states = self._deepstack_process(
+                    hidden_states,
+                    visual_pos_masks,
+                    deepstack_visual_embeds[layer_idx],
+                )
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+    def _deepstack_process(
+        self, hidden_states: torch.Tensor, visual_pos_masks: torch.Tensor, visual_embeds: torch.Tensor
+    ):
+        # Add `visual_embeds` onto the hidden states at the positions selected by
+        # `visual_pos_masks`. `hidden_states` is modified in place and also returned.
+        visual_pos_masks = visual_pos_masks.to(hidden_states.device)
+        visual_embeds = visual_embeds.to(hidden_states.device, hidden_states.dtype)
+        # Clone the gathered rows before adding so the sum is built on a separate tensor
+        local_this = hidden_states[visual_pos_masks, :].clone() + visual_embeds
+        hidden_states[visual_pos_masks, :] = local_this
+        return hidden_states
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Qwen3VL causal language model (or autoregressive) outputs.
+    """
+)
+class Qwen3VLCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
+        The rope index difference between sequence length and multimodal rope.
+    """
+    # Every field defaults to None, so an output can be built with only the values that were computed.
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    # NOTE(review): hidden_states / attentions are not described in the docstring above;
+    # presumably per-layer hidden states and attention weights — confirm against the producer.
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    rope_deltas: Optional[torch.LongTensor] = None
+class Qwen3VLForConditionalGeneration(Qwen3VLPreTrainedModel, GenerationMixin):
+    # Vision encoder + text decoder + LM head held directly on this module (see __init__).
+    _checkpoint_conversion_mapping = {}
+    _tied_weights_keys = ["lm_head.weight"]
+    # Reference: fix gemma3 grad acc #37208
+    accepts_loss_kwargs = False
+    config: Qwen3VLConfig
+    # Names of the text decoder layer and vision block classes
+    # NOTE(review): presumably modules that must not be split across devices — confirm against the base class
+    _no_split_modules = ["Qwen3VLTextDecoderLayer", "Qwen3VLVisionBlock"]
+    def __init__(self, config):
+        """
+        Build the vision encoder, the text decoder and the LM head from `config`.
+
+        Args:
+            config: model configuration exposing `vision_config` and `text_config`.
+        """
+        super().__init__(config)
+        # Directly initialize visual and language_model instead of using Qwen3VLModel
+        self.visual = Qwen3VLVisionModel._from_config(config.vision_config)
+        self.language_model = Qwen3VLTextModel._from_config(config.text_config)
+        self.rope_deltas = None  # cache rope_deltas here
+        # Projects decoder hidden states to vocabulary logits (no bias)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.post_init()
+    def get_input_embeddings(self):
+        """Return the input embedding module of the wrapped language model."""
+        return self.language_model.get_input_embeddings()
+    def set_input_embeddings(self, value):
+        """Replace the input embedding module of the wrapped language model with `value`."""
+        self.language_model.set_input_embeddings(value)
+    def set_decoder(self, decoder):
+        """Replace the text decoder (`self.language_model`) with `decoder`."""
+        self.language_model = decoder
+    def get_decoder(self):
+        """Return the text decoder (`self.language_model`)."""
+        return self.language_model
+    def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
+        """
+        Turn on gradient checkpointing for this module and both sub-models.
+
+        Sets the flag on this wrapper, then delegates to the vision encoder and the text decoder
+        with the same `gradient_checkpointing_kwargs`.
+        """
+        self.gradient_checkpointing = True
+        self.visual.gradient_checkpointing_enable(gradient_checkpointing_kwargs)
+        self.language_model.gradient_checkpointing_enable(gradient_checkpointing_kwargs)
+    def get_rope_index(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Different from the original implementation, Qwen3VL use timestamps rather than absolute time position ids.
+
+        Builds 3-axis (temporal, height, width) rotary position ids for a batch.
+
+        Text tokens get the same increasing index on all three axes. Each image / video-frame
+        span gets a (t, h, w) grid of indices offset by the position reached so far. When no
+        vision grid is supplied (or `input_ids` is None), plain sequential positions derived
+        from `attention_mask` (or from the sequence length) are returned instead.
+
+        Returns:
+            `(position_ids, mrope_position_deltas)`: `position_ids` has shape
+            `(3, batch_size, seq_len)`; `mrope_position_deltas` has shape `(batch_size, 1)` and holds,
+            per sample, `max position + 1 - sequence length`.
+        """
+        # Since we use timestamps to separate videos, like <t1> <vision_start> <frame1> <vision_end> <t2> <vision_start> <frame2> <vision_end>, the video_grid_thw should also be split
+        if video_grid_thw is not None:
+            # One row per frame; repeat_interleave returns a new tensor, so the caller's grid is not modified
+            video_grid_thw = torch.repeat_interleave(video_grid_thw, video_grid_thw[:, 0], dim=0)
+            video_grid_thw[:, 0] = 1
+        spatial_merge_size = self.config.vision_config.spatial_merge_size
+        image_token_id = self.config.image_token_id
+        video_token_id = self.config.video_token_id
+        vision_start_token_id = self.config.vision_start_token_id
+        mrope_position_deltas = []
+        if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
+            total_input_ids = input_ids
+            if attention_mask is None:
+                attention_mask = torch.ones_like(total_input_ids)
+            # Masked-out (padding) positions keep this initial value of 1
+            position_ids = torch.ones(
+                3,
+                input_ids.shape[0],
+                input_ids.shape[1],
+                dtype=input_ids.dtype,
+                device=input_ids.device,
+            )
+            # Running cursors into the grid tensors; they advance across the whole batch
+            image_index, video_index = 0, 0
+            attention_mask = attention_mask.to(total_input_ids.device)
+            # NOTE: the loop variable shadows the `input_ids` argument; the full batch stays in `total_input_ids`
+            for i, input_ids in enumerate(total_input_ids):
+                # Drop masked-out tokens before locating vision spans
+                input_ids = input_ids[attention_mask[i] == 1]
+                image_nums, video_nums = 0, 0
+                # The token right after each vision-start marker tells whether the span is an image or a video
+                vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1)
+                vision_tokens = input_ids[vision_start_indices + 1]
+                image_nums = (vision_tokens == image_token_id).sum()
+                video_nums = (vision_tokens == video_token_id).sum()
+                input_tokens = input_ids.tolist()
+                llm_pos_ids_list: list = []
+                st = 0
+                remain_images, remain_videos = image_nums, video_nums
+                for _ in range(image_nums + video_nums):
+                    # Find the next image / video placeholder at or after `st`; a value past the
+                    # end of the sequence means "none left of this kind"
+                    if image_token_id in input_tokens and remain_images > 0:
+                        ed_image = input_tokens.index(image_token_id, st)
+                    else:
+                        ed_image = len(input_tokens) + 1
+                    if video_token_id in input_tokens and remain_videos > 0:
+                        ed_video = input_tokens.index(video_token_id, st)
+                    else:
+                        ed_video = len(input_tokens) + 1
+                    # Whichever placeholder comes first is the span handled in this iteration
+                    if ed_image < ed_video:
+                        t, h, w = (
+                            image_grid_thw[image_index][0],
+                            image_grid_thw[image_index][1],
+                            image_grid_thw[image_index][2],
+                        )
+                        image_index += 1
+                        remain_images -= 1
+                        ed = ed_image
+                    else:
+                        t, h, w = (
+                            video_grid_thw[video_index][0],
+                            video_grid_thw[video_index][1],
+                            video_grid_thw[video_index][2],
+                        )
+                        video_index += 1
+                        remain_videos -= 1
+                        ed = ed_video
+                    # Grid size as seen by the LLM, i.e. after spatial merging
+                    llm_grid_t, llm_grid_h, llm_grid_w = (
+                        t.item(),
+                        h.item() // spatial_merge_size,
+                        w.item() // spatial_merge_size,
+                    )
+                    # Text between the previous span and this one: same index on all three axes
+                    text_len = ed - st
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                    llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+                    # t_index is always 0 because llm_grid_t is always 1 (we use timestamps to encode the temporal information for videos)
+                    t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
+                    h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
+                    w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
+                    # Vision positions start right after the preceding text
+                    llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
+                    # Skip past all placeholder tokens of this span
+                    st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+                # Trailing text after the last vision span
+                if st < len(input_tokens):
+                    st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
+                    text_len = len(input_tokens) - st
+                    llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)
+                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+                # Write positions back only at the unmasked slots of row i
+                position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device)
+                mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i]))
+            # `input_ids` here is the last row from the loop above; only its device is used
+            mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1)
+            return position_ids, mrope_position_deltas
+        else:
+            # No vision grids (or no input_ids): plain sequential positions on all three axes
+            if attention_mask is not None:
+                position_ids = attention_mask.long().cumsum(-1) - 1
+                position_ids.masked_fill_(attention_mask == 0, 1)
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device)
+                max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0]
+                mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
+            else:
+                position_ids = (
+                    torch.arange(input_ids.shape[1], device=input_ids.device)
+                    .view(1, 1, -1)
+                    .expand(3, input_ids.shape[0], -1)
+                )
+                mrope_position_deltas = torch.zeros(
+                    [input_ids.shape[0], 1],
+                    device=input_ids.device,
+                    dtype=input_ids.dtype,
+                )
+            return position_ids, mrope_position_deltas
+    def get_video_features(
+        self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None
+    ):
+        """
+        Encodes videos into continuous embeddings that can be forwarded to the language model. The deepstack visual features are also returned.
+        Args:
+            pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input videos.
+            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+                The temporal, height and width of feature shape of each video in LLM.
+        Returns:
+            Whatever `get_image_features` returns for the same inputs: a tuple of per-video
+            embedding tensors and the list of deepstack features.
+        """
+        # Same implementation as for images
+        return self.get_image_features(pixel_values_videos, video_grid_thw)
+    def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None, **kwargs):
+        """
+        Encodes images into continuous embeddings that can be forwarded to the language model.
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+                The tensors corresponding to the input images.
+            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+                The temporal, height and width of feature shape of each image in LLM.
+        """
+        pixel_values = pixel_values.type(self.visual.dtype)
+        image_embeds, deepstack_feature_lists = self.visual(pixel_values, grid_thw=image_grid_thw, **kwargs)
+        split_sizes = (image_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist()
+        image_embeds = torch.split(image_embeds, split_sizes)
+        return image_embeds, deepstack_feature_lists
+    def get_placeholder_mask(
+        self,
+        input_ids: torch.LongTensor,
+        inputs_embeds: torch.FloatTensor,
+        image_features: Optional[torch.FloatTensor] = None,
+        video_features: Optional[torch.FloatTensor] = None,
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+            special_video_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_video_mask = special_video_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+            special_video_mask = input_ids == self.config.video_token_id
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}"
+            )
+        n_video_tokens = special_video_mask.sum()
+        special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel():
+            raise ValueError(
+                f"Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}"
+            )
+        return special_image_mask, special_video_mask
+    @check_model_inputs()
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        image_grid_thw: Optional[torch.LongTensor] = None,
+        video_grid_thw: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, Qwen3VLCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
+            The temporal, height and width of feature shape of each image in LLM.
+        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
+            The temporal, height and width of feature shape of each video in LLM.
+        Example:
+            TODO: Add example
+        """
+        # Inlined from Qwen3VLModel.forward
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+        image_mask = None
+        video_mask = None
+        if pixel_values is not None:
+            image_embeds, deepstack_image_embeds = self.get_image_features(pixel_values, image_grid_thw, image_max_seqlen=kwargs.get("image_max_seqlen"))
+            image_embeds = torch.cat(image_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            image_mask, _ = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
+        if pixel_values_videos is not None:
+            video_embeds, deepstack_video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw)
+            video_embeds = torch.cat(video_embeds, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            _, video_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
+        visual_pos_masks = None
+        deepstack_visual_embeds = None
+        if image_mask is not None and video_mask is not None:
+            # aggregate visual_pos_masks and deepstack_visual_embeds
+            image_mask = image_mask[..., 0]
+            video_mask = video_mask[..., 0]
+            visual_pos_masks = image_mask | video_mask
+            deepstack_visual_embeds = []
+            image_mask_joint = image_mask[visual_pos_masks]
+            video_mask_joint = video_mask[visual_pos_masks]
+            for img_embed, vid_embed in zip(deepstack_image_embeds, deepstack_video_embeds):
+                embed_joint = img_embed.new_zeros(visual_pos_masks.sum(), img_embed.shape[-1]).to(img_embed.device)
+                embed_joint[image_mask_joint, :] = img_embed
+                embed_joint[video_mask_joint, :] = vid_embed
+                deepstack_visual_embeds.append(embed_joint)
+        elif image_mask is not None:
+            image_mask = image_mask[..., 0]
+            visual_pos_masks = image_mask
+            deepstack_visual_embeds = deepstack_image_embeds
+        elif video_mask is not None:
+            video_mask = video_mask[..., 0]
+            visual_pos_masks = video_mask
+            deepstack_visual_embeds = deepstack_video_embeds
+        if position_ids is None:
+            attention_mask_tensor = (
+                attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"]
+            )
+            if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4:
+                attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2)
+                # Only apply conversion for floating point tensors (inverted masks)
+                if attention_mask_tensor.dtype.is_floating_point:
+                    attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min
+                    attention_mask_tensor = (1.0 - attention_mask_tensor).int()
+            # Calculate RoPE index once per generation in the pre-fill stage only.
+            # When compiling, we can't check tensor values thus we check only input length
+            # It is safe to assume that `length!=1` means we're in pre-fill because compiled
+            # models currently cannot do assisted decoding
+            prefill_compiled_stage = is_torchdynamo_compiling() and (
+                (input_ids is not None and input_ids.shape[1] != 1)
+                or (inputs_embeds is not None and inputs_embeds.shape[1] != 1)
+            )
+            prefill_noncompiled_stage = not is_torchdynamo_compiling() and (
+                (cache_position is not None and cache_position[0] == 0)
+                or (past_key_values is None or past_key_values.get_seq_length() == 0)
+            )
+            if (prefill_compiled_stage or prefill_noncompiled_stage) or self.rope_deltas is None:
+                position_ids, rope_deltas = self.get_rope_index(
+                    input_ids,
+                    image_grid_thw,
+                    video_grid_thw,
+                    attention_mask=attention_mask_tensor,
+                )
+                self.rope_deltas = rope_deltas
+            # then use the prev pre-calculated rope-deltas to get the correct position ids
+            else:
+                batch_size, seq_length, _ = inputs_embeds.shape
+                delta = (
+                    (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
+                    if cache_position is not None
+                    else 0
+                )
+                position_ids = torch.arange(seq_length, device=inputs_embeds.device)
+                position_ids = position_ids.view(1, -1).expand(batch_size, -1)
+                if cache_position is not None:  # otherwise `deltas` is an int `0`
+                    delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
+                position_ids = position_ids.add(delta)
+                position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
+        if kwargs.get("max_seqlen") is not None:
+            try:
+                self.language_model.config.max_seqlen = int(kwargs.get("max_seqlen"))
+            except Exception:
+                self.language_model.config.max_seqlen = kwargs.get("max_seqlen")
+        outputs = self.language_model(
+            input_ids=None,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            visual_pos_masks=visual_pos_masks,
+            deepstack_visual_embeds=deepstack_visual_embeds,
+            **kwargs,
+        )
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)
+        return Qwen3VLCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            rope_deltas=self.rope_deltas,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        pixel_values=None,
+        pixel_values_videos=None,
+        image_grid_thw=None,
+        video_grid_thw=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        # Qwen3VL position_ids are prepareed with rope_deltas in forward
+        model_inputs["position_ids"] = None
+        if cache_position[0] != 0:
+            model_inputs["pixel_values"] = None
+            model_inputs["pixel_values_videos"] = None
+        return model_inputs
+    def _get_image_nums_and_video_nums(
+        self,
+        input_ids: Optional[torch.LongTensor],
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
+        These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+        Returns:
+            image_nums (`torch.LongTensor` of shape `(batch_size, num_images_sample)`)
+            video_nums (`torch.LongTensor` of shape `(batch_size, num_videos_sample)`)
+        """
+        image_token_id = self.config.image_token_id
+        video_token_id = self.config.video_token_id
+        vision_start_token_id = self.config.vision_start_token_id
+        if inputs_embeds is not None:
+            vision_start_mask = (
+                inputs_embeds
+                == self.get_input_embeddings()(
+                    torch.tensor(vision_start_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+            )[..., 0]
+            image_mask = (
+                inputs_embeds
+                == self.get_input_embeddings()(
+                    torch.tensor(image_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+            )[..., 0]
+            video_mask = (
+                inputs_embeds
+                == self.get_input_embeddings()(
+                    torch.tensor(video_token_id, dtype=torch.long, device=inputs_embeds.device)
+                )
+            )[..., 0]
+        else:
+            vision_start_mask = input_ids == vision_start_token_id
+            image_mask = input_ids == image_token_id
+            video_mask = input_ids == video_token_id
+        vision_first_mask = torch.roll(vision_start_mask, shifts=1, dims=1)
+        image_nums = torch.sum(vision_first_mask & image_mask, dim=1)
+        video_nums = torch.sum(vision_first_mask & video_mask, dim=1)
+        return image_nums, video_nums
+    def _expand_inputs_for_generation(
+        self,
+        expand_size: int = 1,
+        is_encoder_decoder: bool = False,
+        input_ids: Optional[torch.LongTensor] = None,
+        **model_kwargs,
+    ) -> tuple[torch.LongTensor, dict[str, Any]]:
+        # Overwritten -- Support for expanding tensors without a batch size dimension
+        # e.g., pixel_values, image_grid_thw, pixel_values_videos, video_grid_thw, second_per_grid_t
+        # pixel_values.shape[0] is sum(seqlen_images for samples)
+        # image_grid_thw.shape[0] is sum(num_images for samples)
+        if expand_size == 1:
+            return input_ids, model_kwargs
+        visual_keys = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw", "second_per_grid_ts"]
+        def _expand_dict_for_generation_visual(dict_to_expand):
+            image_grid_thw = model_kwargs.get("image_grid_thw", None)
+            video_grid_thw = model_kwargs.get("video_grid_thw", None)
+            image_nums, video_nums = self._get_image_nums_and_video_nums(
+                input_ids, inputs_embeds=model_kwargs.get("inputs_embeds", None)
+            )
+            def _repeat_interleave_samples(x, lengths, repeat_times):
+                samples = torch.split(x, lengths)
+                repeat_args = [repeat_times] + [1] * (x.dim() - 1)
+                result = torch.cat([sample.repeat(*repeat_args) for sample in samples], dim=0)
+                return result
+            for key in dict_to_expand:
+                if key == "pixel_values":
+                    # split images into samples
+                    samples = torch.split(image_grid_thw, list(image_nums))
+                    # compute the sequence length of images for each sample
+                    lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "image_grid_thw":
+                    # get the num of images for each sample
+                    lengths = list(image_nums)
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "pixel_values_videos":
+                    samples = torch.split(video_grid_thw, list(video_nums))
+                    lengths = [torch.prod(sample, dim=1).sum() for sample in samples]
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "video_grid_thw":
+                    lengths = list(video_nums)
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=lengths, repeat_times=expand_size
+                    )
+                elif key == "second_per_grid_ts":
+                    dict_to_expand[key] = _repeat_interleave_samples(
+                        dict_to_expand[key], lengths=list(video_nums), repeat_times=expand_size
+                    )
+            return dict_to_expand
+        def _expand_dict_for_generation(dict_to_expand):
+            for key in dict_to_expand:
+                if (
+                    key != "cache_position"
+                    and dict_to_expand[key] is not None
+                    and isinstance(dict_to_expand[key], torch.Tensor)
+                    and key not in visual_keys
+                ):
+                    dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
+            return dict_to_expand
+        model_kwargs = _expand_dict_for_generation_visual(model_kwargs)
+        if input_ids is not None:
+            input_ids = input_ids.repeat_interleave(expand_size, dim=0)
+        model_kwargs = _expand_dict_for_generation(model_kwargs)
+        if is_encoder_decoder:
+            if model_kwargs.get("encoder_outputs") is None:
+                raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
+            model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
+        return input_ids, model_kwargs
+# Names exported by `from <module> import *`.
+__all__ = [
+    "Qwen3VLVisionModel",
+    "Qwen3VLForConditionalGeneration",
+    "Qwen3VLPreTrainedModel",
+    "Qwen3VLTextModel",
+]

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,39 @@

+{
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "disable_grouping": null,
+  "do_center_crop": null,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_pad": null,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "Qwen2VLImageProcessorFast",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "input_data_format": null,
+  "max_pixels": 147456,
+  "merge_size": 2,
+  "min_pixels": 65536,
+  "pad_size": null,
+  "patch_size": 16,
+  "processor_class": "PRTS_Qwen3VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "return_tensors": null,
+  "size": {
+    "longest_edge": 147456,
+    "shortest_edge": 65536
+  },
+  "temporal_patch_size": 2
+}

processing_prts_qwen3_vl.py ADDED Viewed

	@@ -0,0 +1,352 @@

+# Copyright 2025 TeleAI Rhodes Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Processor for PRTS built on Qwen3-VL (hub / trust_remote_code; no prts package required)."""
+from __future__ import annotations
+import logging
+from typing import Optional, Union
+import numpy as np
+import torch
+from transformers.feature_extraction_utils import BatchFeature
+from transformers.image_utils import ImageInput
+from transformers.processing_utils import (
+    ImagesKwargs,
+    MultiModalData,
+    ProcessingKwargs,
+    ProcessorMixin,
+    Unpack,
+    VideosKwargs,
+)
+from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+from transformers.utils.logging import get_logger
+from transformers.video_utils import VideoInput
+# Special-token strings. The action and CRL tokens are added to the tokenizer in
+# `PRTS_Qwen3VLProcessor.__init__`; the vision tokens are only looked up there.
+ACTION_START_TOKEN = "<|action_start|>"
+ACTION_PLACEHOLDER_TOKEN = "<|action_pad|>"
+ACTION_END_TOKEN = "<|action_end|>"
+CRL_GOAL_REPR_TOKEN = "<|goal_repr|>"
+CRL_OBS_REPR_TOKEN = "<|obs_repr|>"
+VISION_START_TOKEN = "<|vision_start|>"         # beginning of vision input
+IMAGE_PLACEHOLDER_TOKEN = "<|image_pad|>"       # image placeholder
+VIDEO_PLACEHOLDER_TOKEN = "<|video_pad|>"       # video placeholder
+logger = get_logger(__name__)
+# Attach a stream handler only when this logger has none yet, so the handler is not added twice.
+if not logger.handlers:
+    handler = logging.StreamHandler()
+    handler.setLevel(logging.INFO)
+    handler.setFormatter(logging.Formatter("%(levelname)s:%(name)s:%(message)s"))
+    logger.addHandler(handler)
+# Video keyword arguments for the processor; adds no keys beyond those of `VideosKwargs`.
+class Qwen3VLVideosProcessorKwargs(VideosKwargs, total=False):
+    pass
+# Image keyword arguments for the processor: the keys of `ImagesKwargs` plus the optional integer keys below.
+class Qwen3VLImagesKwargs(ImagesKwargs):
+    min_pixels: Optional[int]
+    max_pixels: Optional[int]
+    patch_size: Optional[int]
+    temporal_patch_size: Optional[int]
+    merge_size: Optional[int]
+# Keyword-argument schema passed to `_merge_kwargs` in `PRTS_Qwen3VLProcessor.__call__`.
+class Qwen3VLProcessorKwargs(ProcessingKwargs, total=False):
+    images_kwargs: Qwen3VLImagesKwargs
+    videos_kwargs: Qwen3VLVideosProcessorKwargs
+    # Class-level defaults. This dict is shared by every call, so readers must not mutate it in place.
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+            "return_token_type_ids": False,
+            "return_mm_token_type_ids": False,
+        },
+        "videos_kwargs": {"return_metadata": True},
+    }
+class PRTS_Qwen3VLProcessor(ProcessorMixin):
+    r"""
+    Constructs a PRTS processor which wraps a Qwen3-VL image processor and a Qwen2 tokenizer into a single processor.
+    This processor is built independently (not inheriting from Qwen3VLProcessor) to avoid tight coupling,
+    while maintaining compatibility with Qwen3-VL's timestamp-based video processing approach.
+    [`PRTS_Qwen3VLProcessor`] offers all the functionalities needed for PRTS model with:
+    - Action token handling (discrete and continuous)
+    - State token handling for proprioceptive inputs
+    - Expert trigger tokens for flow matching action prediction
+    - Qwen3-VL compatible image/video processing with timestamp-based video handling
+    Args:
+        image_processor ([`Qwen2VLImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`Qwen2TokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        video_processor ([`Qwen3VLVideoProcessor`], *optional*):
+            The video processor is a required input.
+        chat_template (`str`, *optional*):
+            A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
+    """
+    attributes = ["image_processor", "tokenizer", "video_processor"]
+    image_processor_class = "AutoImageProcessor"
+    video_processor_class = "AutoVideoProcessor"
+    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
+    def __init__(self, image_processor=None, tokenizer=None, video_processor=None,
+                 chat_template=None, **kwargs):
+        """
+        Wire up the sub-processors, resolve the vision token strings/ids and register the PRTS special tokens.
+        Note that `tokenizer` is dereferenced unconditionally below, and that `**kwargs` is accepted but not
+        forwarded to the base class.
+        Side effect: the five PRTS special tokens are added to `tokenizer` via `add_tokens`.
+        """
+        # Initialize base ProcessorMixin
+        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
+        # Get image/video tokens from tokenizer
+        self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
+        self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
+        # Prefer an id exposed by the tokenizer; a missing or falsy value falls back to a vocabulary lookup.
+        self.image_token_id = (
+            tokenizer.image_token_id
+            if getattr(tokenizer, "image_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.image_token)
+        )
+        self.video_token_id = (
+            tokenizer.video_token_id
+            if getattr(tokenizer, "video_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.video_token)
+        )
+        # Qwen3-VL vision tokens
+        self.vision_start_token = (
+            "<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token
+        )
+        self.vision_end_token = (
+            "<|vision_end|>" if not hasattr(tokenizer, "vision_end_token") else tokenizer.vision_end_token
+        )
+        self.vision_start_token_id = (
+            tokenizer.vision_start_token_id
+            if getattr(tokenizer, "vision_start_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.vision_start_token)
+        )
+        self.vision_end_token_id = (
+            tokenizer.vision_end_token_id
+            if getattr(tokenizer, "vision_end_token_id", None)
+            else tokenizer.convert_tokens_to_ids(self.vision_end_token)
+        )
+        # Register the PRTS tokens before any of their ids are looked up below.
+        prts_special_tokens = [
+            ACTION_START_TOKEN,
+            ACTION_PLACEHOLDER_TOKEN,
+            ACTION_END_TOKEN,
+            CRL_GOAL_REPR_TOKEN,
+            CRL_OBS_REPR_TOKEN,
+        ]
+        num_new_tokens = tokenizer.add_tokens(prts_special_tokens, special_tokens=True)
+        logger.info(f"Added {num_new_tokens} new special tokens to the tokenizer.")
+        self.action_token = getattr(tokenizer, "action_token", ACTION_PLACEHOLDER_TOKEN)
+        self.action_token_id = tokenizer.convert_tokens_to_ids(self.action_token)
+        # Map attribute-style names to token strings, then resolve all of them to ids in one pass.
+        token_dict = {
+            "action_start_token_id": ACTION_START_TOKEN,
+            "action_token_id": ACTION_PLACEHOLDER_TOKEN,
+            "vision_start_token_id": VISION_START_TOKEN,
+            "image_token_id": IMAGE_PLACEHOLDER_TOKEN,
+            "video_token_id": VIDEO_PLACEHOLDER_TOKEN,
+            "crl_goal_repr_token_id": CRL_GOAL_REPR_TOKEN,
+            "crl_obs_repr_token_id": CRL_OBS_REPR_TOKEN,
+        }
+        self.token_ids = {key: tokenizer.convert_tokens_to_ids(value) for key, value in token_dict.items()}
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        videos: Optional[VideoInput] = None,
+        actions: Union[torch.Tensor] = None,
+        **kwargs: Unpack[Qwen3VLProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Run the image/video processors, expand the visual placeholders in `text`, and tokenize.
+        Args:
+            images: passed to `self.image_processor` when not `None`.
+            text: a single prompt or a list of prompts; a non-list value is wrapped in a list.
+            videos: passed to `self.video_processor` when not `None`.
+            actions: stored unchanged under the `"actions"` key of the output when not `None`.
+            **kwargs: merged with the defaults of `Qwen3VLProcessorKwargs`.
+        Returns:
+            A `BatchFeature` holding the tokenizer, image and video outputs (plus `actions` if given).
+        """
+        output_kwargs = self._merge_kwargs(
+            Qwen3VLProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        image_inputs = {}
+        if images is not None:
+            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+            image_grid_thw = image_inputs["image_grid_thw"]
+        else:
+            image_grid_thw = None
+        videos_inputs = {}
+        if videos is not None:
+            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
+            video_grid_thw = videos_inputs["video_grid_thw"]
+            # Metadata is removed from the outputs unless the caller passed `return_metadata` explicitly.
+            if "return_metadata" not in kwargs:
+                video_metadata = videos_inputs.pop("video_metadata", None)
+            else:
+                video_metadata = videos_inputs.get("video_metadata", None)
+        else:
+            video_grid_thw = None
+            video_metadata = None
+        if not isinstance(text, list):
+            text = [text]
+        # Shallow copy so the caller's list is not modified by the in-place replacements below.
+        text = text.copy()
+        if image_grid_thw is not None:
+            merge_length = self.image_processor.merge_size**2
+            index = 0
+            for i in range(len(text)):
+                # Expand each image token into a temporary placeholder run so it is not matched again.
+                while self.image_token in text[i]:
+                    num_image_tokens = image_grid_thw[index].prod() // merge_length
+                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
+                    index += 1
+                text[i] = text[i].replace("<|placeholder|>", self.image_token)
+        if video_grid_thw is not None:
+            merge_length = self.video_processor.merge_size**2
+            index = 0
+            for i in range(len(text)):
+                while self.video_token in text[i]:
+                    if video_metadata is not None and index < len(video_metadata):
+                        # Timestamped form: one "<t seconds>" label plus a vision-start/end wrapped
+                        # placeholder run for each entry along the first grid dimension.
+                        metadata = video_metadata[index]
+                        if metadata.fps is None:
+                            logger.warning_once(
+                                "Qwen3VL requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. "
+                                "Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. "
+                                "Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results."
+                            )
+                            metadata.fps = 24 if metadata.fps is None else metadata.fps
+                        curr_timestamp = self._calculate_timestamps(
+                            metadata.frames_indices,
+                            metadata.fps,
+                            self.video_processor.merge_size,
+                        )
+                        video_placeholder = ""
+                        frame_seqlen = video_grid_thw[index][1:].prod() // merge_length
+                        for frame_idx in range(video_grid_thw[index][0]):
+                            curr_time = curr_timestamp[frame_idx]
+                            video_placeholder += f"<{curr_time:.1f} seconds>"
+                            video_placeholder += (
+                                self.vision_start_token + "<|placeholder|>" * frame_seqlen + self.vision_end_token
+                            )
+                        # Replace the whole start/video/end triple when present so the markers are not doubled.
+                        if f"{self.vision_start_token}{self.video_token}{self.vision_end_token}" in text[i]:
+                            text[i] = text[i].replace(
+                                f"{self.vision_start_token}{self.video_token}{self.vision_end_token}",
+                                video_placeholder,
+                                1,
+                            )
+                        else:
+                            text[i] = text[i].replace(self.video_token, video_placeholder, 1)
+                    else:
+                        # No metadata for this video: expand to a flat placeholder run without timestamps.
+                        num_video_tokens = video_grid_thw[index].prod() // merge_length
+                        text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
+                    index += 1
+                text[i] = text[i].replace("<|placeholder|>", self.video_token)
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
+        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
+        if return_mm_token_type_ids:
+            # Only image-token positions are marked with 1; every other position stays 0.
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+        output_data = {**text_inputs, **image_inputs, **videos_inputs}
+        if actions is not None:
+            output_data["actions"] = actions
+        return BatchFeature(data=output_data, tensor_type=return_tensors)
+    def _calculate_timestamps(self, indices: Union[list[int], np.ndarray], video_fps: float, merge_size: int = 2):
+        if not isinstance(indices, list):
+            indices = indices.tolist()
+        if len(indices) % merge_size != 0:
+            indices.extend(indices[-1] for _ in range(merge_size - len(indices) % merge_size))
+        timestamps = [idx / video_fps for idx in indices]
+        timestamps = [
+            (timestamps[i] + timestamps[i + merge_size - 1]) / 2 for i in range(0, len(timestamps), merge_size)
+        ]
+        return timestamps
+    def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
+        vision_data = {}
+        if image_sizes is not None:
+            images_kwargs = Qwen3VLProcessorKwargs._defaults.get("images_kwargs", {})
+            images_kwargs.update(kwargs)
+            merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size
+            num_image_patches = [
+                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
+                for image_size in image_sizes
+            ]
+            num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+        if video_sizes is not None:
+            videos_kwargs = Qwen3VLProcessorKwargs._defaults.get("videos_kwargs", {})
+            videos_kwargs.update(kwargs)
+            merge_size = videos_kwargs.get("merge_size", None) or self.video_processor.merge_size
+            num_video_patches = [
+                self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
+                for video_size in video_sizes
+            ]
+            num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches]
+            vision_data["num_video_tokens"] = num_video_tokens
+        return MultiModalData(**vision_data)
+    def set_action_tokenizer(self, action_tokenizer):
+        self.action_tokenizer = action_tokenizer
+        prts_fast_action_tokens = [f"<|action_token_{i}|>" for i in range(action_tokenizer.vocab_size)]
+        num_new_tokens = self.tokenizer.add_tokens(prts_fast_action_tokens, special_tokens=True)
+        logger.info(f"Added {num_new_tokens} FAST action tokens to the tokenizer.")
+        self.action_token_start_index = self.tokenizer.convert_tokens_to_ids("<|action_token_0|>")
+        self.action_vocab_size = action_tokenizer.vocab_size
+        token_ids = self.tokenizer.convert_tokens_to_ids(prts_fast_action_tokens)
+        self.action_mapper = {k: v for k, v in zip(prts_fast_action_tokens, token_ids, strict=True)}
+    def preprocess_action(self, actions, **kwargs):
+        """Placeholder hook for action preprocessing; always raises `NotImplementedError`."""
+        raise NotImplementedError
+    def post_process_image_text_to_text(
+        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
+    ):
+        return self.tokenizer.batch_decode(
+            generated_outputs,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+# Register the processor for auto-class loading at import time.
+PRTS_Qwen3VLProcessor.register_for_auto_class()
+# Names exported by `from <module> import *`.
+__all__ = ["PRTS_Qwen3VLProcessor"]

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

statistics.json ADDED Viewed

	@@ -0,0 +1,333 @@

+{
+  "state_mode": "QUANTILE",
+  "features": {
+    "observation.state": {
+      "dtype": "float32",
+      "shape": [
+        8
+      ],
+      "names": {
+        "motors": [
+          "x",
+          "y",
+          "z",
+          "roll",
+          "pitch",
+          "yaw",
+          "pad",
+          "gripper"
+        ]
+      }
+    },
+    "action": {
+      "dtype": "float32",
+      "shape": [
+        7
+      ],
+      "names": {
+        "motors": [
+          "x",
+          "y",
+          "z",
+          "roll",
+          "pitch",
+          "yaw",
+          "gripper"
+        ]
+      }
+    }
+  },
+  "stats": {
+    "observation.state": {
+      "min": [
+        -0.4828203022480011,
+        -0.3255046010017395,
+        0.008128180168569088,
+        0.35277295112609863,
+        -3.641430377960205,
+        -1.842738389968872,
+        -0.0013586411951109767,
+        -0.042040832340717316
+      ],
+      "max": [
+        0.21031762659549713,
+        0.39128610491752625,
+        1.3660105466842651,
+        3.6714255809783936,
+        3.560650587081909,
+        1.386339545249939,
+        0.04233968257904053,
+        0.0013633022317662835
+      ],
+      "mean": [
+        -0.046518828719854355,
+        0.034408919513225555,
+        0.7645694613456726,
+        2.9716713428497314,
+        -0.2204727977514267,
+        -0.12557993829250336,
+        0.026915358379483223,
+        -0.027191326022148132
+      ],
+      "std": [
+        0.10494082421064377,
+        0.1517619788646698,
+        0.3785194456577301,
+        0.3442671298980713,
+        0.9068173766136169,
+        0.32538288831710815,
+        0.014175750315189362,
+        0.014058776199817657
+      ],
+      "count": [
+        273465
+      ],
+      "q01": [
+        -0.3991248905658722,
+        -0.2688351273536682,
+        0.03826696425676346,
+        1.508958101272583,
+        -2.7197911739349365,
+        -1.0805085897445679,
+        0.0017423711251467466,
+        -0.04002561420202255
+      ],
+      "q99": [
+        0.13556525111198425,
+        0.33566486835479736,
+        1.2706660032272339,
+        3.277346134185791,
+        2.406111240386963,
+        0.5977716445922852,
+        0.04031316190958023,
+        -0.00177810771856457
+      ]
+    },
+    "action": {
+      "min": [
+        -0.9375,
+        -0.9375,
+        -0.9375,
+        -0.2582142949104309,
+        -0.375,
+        -0.3675000071525574,
+        -1.0
+      ],
+      "max": [
+        0.9375,
+        0.9375,
+        0.9375,
+        0.3557142913341522,
+        0.375,
+        0.375,
+        1.0
+      ],
+      "mean": [
+        0.06278152763843536,
+        0.08684158325195312,
+        -0.0903734639286995,
+        0.0005407554563134909,
+        0.005643464159220457,
+        -0.005229106638580561,
+        -0.0496407225728035
+      ],
+      "std": [
+        0.33551836013793945,
+        0.37847793102264404,
+        0.4446770250797272,
+        0.03924214467406273,
+        0.06341660022735596,
+        0.07792268693447113,
+        1.000144362449646
+      ],
+      "count": [
+        273465
+      ],
+      "q01": [
+        -0.7044642567634583,
+        -0.8008928298950195,
+        -0.9375,
+        -0.11464285850524902,
+        -0.1639285683631897,
+        -0.2239285707473755,
+        -1.0
+      ],
+      "q99": [
+        0.9375,
+        0.8678571581840515,
+        0.9375,
+        0.13178572058677673,
+        0.19285714626312256,
+        0.335357129573822,
+        1.0
+      ]
+    }
+  },
+  "datasets": {
+    "libero_4_suites": {
+      "features": {
+        "observation.state": {
+          "dtype": "float32",
+          "shape": [
+            8
+          ],
+          "names": {
+            "motors": [
+              "x",
+              "y",
+              "z",
+              "roll",
+              "pitch",
+              "yaw",
+              "pad",
+              "gripper"
+            ]
+          }
+        },
+        "action": {
+          "dtype": "float32",
+          "shape": [
+            7
+          ],
+          "names": {
+            "motors": [
+              "x",
+              "y",
+              "z",
+              "roll",
+              "pitch",
+              "yaw",
+              "gripper"
+            ]
+          }
+        }
+      },
+      "stats": {
+        "observation.state": {
+          "min": [
+            -0.4828203022480011,
+            -0.3255046010017395,
+            0.008128180168569088,
+            0.35277295112609863,
+            -3.641430377960205,
+            -1.842738389968872,
+            -0.0013586411951109767,
+            -0.042040832340717316
+          ],
+          "max": [
+            0.21031762659549713,
+            0.39128610491752625,
+            1.3660105466842651,
+            3.6714255809783936,
+            3.560650587081909,
+            1.386339545249939,
+            0.04233968257904053,
+            0.0013633022317662835
+          ],
+          "mean": [
+            -0.046518828719854355,
+            0.034408919513225555,
+            0.7645694613456726,
+            2.9716713428497314,
+            -0.2204727977514267,
+            -0.12557993829250336,
+            0.026915358379483223,
+            -0.027191326022148132
+          ],
+          "std": [
+            0.10494082421064377,
+            0.1517619788646698,
+            0.3785194456577301,
+            0.3442671298980713,
+            0.9068173766136169,
+            0.32538288831710815,
+            0.014175750315189362,
+            0.014058776199817657
+          ],
+          "count": [
+            273465
+          ],
+          "q01": [
+            -0.3991248905658722,
+            -0.2688351273536682,
+            0.03826696425676346,
+            1.508958101272583,
+            -2.7197911739349365,
+            -1.0805085897445679,
+            0.0017423711251467466,
+            -0.04002561420202255
+          ],
+          "q99": [
+            0.13556525111198425,
+            0.33566486835479736,
+            1.2706660032272339,
+            3.277346134185791,
+            2.406111240386963,
+            0.5977716445922852,
+            0.04031316190958023,
+            -0.00177810771856457
+          ]
+        },
+        "action": {
+          "min": [
+            -0.9375,
+            -0.9375,
+            -0.9375,
+            -0.2582142949104309,
+            -0.375,
+            -0.3675000071525574,
+            -1.0
+          ],
+          "max": [
+            0.9375,
+            0.9375,
+            0.9375,
+            0.3557142913341522,
+            0.375,
+            0.375,
+            1.0
+          ],
+          "mean": [
+            0.06278152763843536,
+            0.08684158325195312,
+            -0.0903734639286995,
+            0.0005407554563134909,
+            0.005643464159220457,
+            -0.005229106638580561,
+            -0.0496407225728035
+          ],
+          "std": [
+            0.33551836013793945,
+            0.37847793102264404,
+            0.4446770250797272,
+            0.03924214467406273,
+            0.06341660022735596,
+            0.07792268693447113,
+            1.000144362449646
+          ],
+          "count": [
+            273465
+          ],
+          "q01": [
+            -0.7044642567634583,
+            -0.8008928298950195,
+            -0.9375,
+            -0.11464285850524902,
+            -0.1639285683631897,
+            -0.2239285707473755,
+            -1.0
+          ],
+          "q99": [
+            0.9375,
+            0.8678571581840515,
+            0.9375,
+            0.13178572058677673,
+            0.19285714626312256,
+            0.335357129573822,
+            1.0
+          ]
+        }
+      }
+    }
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5482df2482307db564c0595428d3dfdad4bf5dbd9d3d5156052ca12f93b7d3ed
+size 11828002

tokenizer_config.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb6e322ff4c32859f3014cdd5e49182ec932f2b10cc2e365df3439522af926a7
+size 10129

video_preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "crop_size": null,
+  "data_format": "channels_first",
+  "default_to_square": true,
+  "device": null,
+  "do_center_crop": null,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "do_sample_frames": true,
+  "fps": 2.0,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "input_data_format": null,
+  "max_frames": 8,
+  "merge_size": 2,
+  "min_frames": 4,
+  "num_frames": null,
+  "pad_size": null,
+  "patch_size": 16,
+  "processor_class": "PRTS_Qwen3VLProcessor",
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "return_metadata": false,
+  "size": {
+    "longest_edge": 147456,
+    "shortest_edge": 65536
+  },
+  "temporal_patch_size": 2,
+  "video_metadata": null,
+  "video_processor_type": "Qwen3VLVideoProcessor"
+}

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff