XinBB committed
Commit ee53828 · verified · 1 Parent(s): dbf0378

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.

Files changed (50)
  1. eval/logs/rec_results_android_studio_macos_8k_SFT_0.json +1605 -0
  2. eval/test_grounding_r1.py +318 -0
  3. eval/test_grounding_r1_nothink.py +331 -0
  4. eval/test_grounding_r1_nothink_ssv2.py +335 -0
  5. eval/test_od_r1.py +178 -0
  6. eval/test_rec_baseline.py +225 -0
  7. eval/test_rec_r1.py +232 -0
  8. open-r1-multimodal/.gitignore +178 -0
  9. open-r1-multimodal/LICENSE +201 -0
  10. open-r1-multimodal/Makefile +20 -0
  11. open-r1-multimodal/configs/ddp.yaml +16 -0
  12. open-r1-multimodal/configs/zero2.yaml +21 -0
  13. open-r1-multimodal/configs/zero3.yaml +22 -0
  14. open-r1-multimodal/data_config/rec.yaml +4 -0
  15. open-r1-multimodal/data_jsonl/gui_multi-image.jsonl +0 -0
  16. open-r1-multimodal/local_scripts/lmms_eval_qwen2vl.sh +61 -0
  17. open-r1-multimodal/local_scripts/prepare_hf_data.py +166 -0
  18. open-r1-multimodal/local_scripts/train_aria_moe.sh +68 -0
  19. open-r1-multimodal/local_scripts/train_qwen2_vl.sh +61 -0
  20. open-r1-multimodal/local_scripts/zero2.json +41 -0
  21. open-r1-multimodal/local_scripts/zero3.json +41 -0
  22. open-r1-multimodal/local_scripts/zero3_offload.json +48 -0
  23. open-r1-multimodal/run_scripts/multinode_training_args.yaml +21 -0
  24. open-r1-multimodal/run_scripts/multinode_training_demo.sh +145 -0
  25. open-r1-multimodal/run_scripts/run_grpo_gui.sh +34 -0
  26. open-r1-multimodal/run_scripts/run_grpo_gui_grounding.sh +34 -0
  27. open-r1-multimodal/run_scripts/run_grpo_rec.sh +33 -0
  28. open-r1-multimodal/run_scripts/run_grpo_rec_internvl.sh +36 -0
  29. open-r1-multimodal/run_scripts/run_grpo_rec_lora.sh +43 -0
  30. open-r1-multimodal/setup.cfg +41 -0
  31. open-r1-multimodal/src/open_r1.egg-info/PKG-INFO +63 -0
  32. open-r1-multimodal/src/open_r1.egg-info/dependency_links.txt +1 -0
  33. open-r1-multimodal/src/open_r1.egg-info/requires.txt +36 -0
  34. open-r1-multimodal/src/open_r1.egg-info/top_level.txt +1 -0
  35. open-r1-multimodal/src/open_r1/__init__.py +0 -0
  36. open-r1-multimodal/src/open_r1/__pycache__/__init__.cpython-310.pyc +0 -0
  37. open-r1-multimodal/src/open_r1/configs.py +82 -0
  38. open-r1-multimodal/src/open_r1/evaluate.py +85 -0
  39. open-r1-multimodal/src/open_r1/generate.py +156 -0
  40. open-r1-multimodal/src/open_r1/grpo_gui_grounding.py +357 -0
  41. open-r1-multimodal/src/open_r1/grpo_jsonl.py +649 -0
  42. open-r1-multimodal/src/open_r1/grpo_rec.py +291 -0
  43. open-r1-multimodal/src/open_r1/trainer/__pycache__/__init__.cpython-310.pyc +0 -0
  44. open-r1-multimodal/src/open_r1/trainer/__pycache__/grpo_config.cpython-310.pyc +0 -0
  45. open-r1-multimodal/src/open_r1/trainer/__pycache__/grpo_trainer.cpython-310.pyc +0 -0
  46. open-r1-multimodal/src/open_r1/trainer/__pycache__/qwen_grpo_trainer.cpython-310.pyc +0 -0
  47. open-r1-multimodal/src/open_r1/trainer/grpo_config.py +286 -0
  48. open-r1-multimodal/src/open_r1/trainer/grpo_trainer.py +849 -0
  49. open-r1-multimodal/src/open_r1/trainer/qwen_grpo_trainer.py +877 -0
  50. open-r1-multimodal/src/open_r1/trainer/vllm_grpo_trainer.py +828 -0
eval/logs/rec_results_android_studio_macos_8k_SFT_0.json ADDED
@@ -0,0 +1,1605 @@
+ {
+   "accuracy": 15.0,
+   "results": [
+     {"image": "android_studio_mac/screenshot_2024-11-28_11-52-28.png", "question": "go to previous page in virtual Android machine in Android studio", "resized_size": [1400, 2492], "ground_truth": [0.5294270833333333, 0.4152777777777778, 0.5348958333333333, 0.425], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [548, 503]}}\n</tool_call>", "extracted_answer": [0.21990369181380418, 0.35928571428571426], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-07_15-03-23.png", "question": "reverse the sorting by device name in AVD in android studio", "resized_size": [1400, 2492], "ground_truth": [0.12604166666666666, 0.12129629629629629, 0.1296875, 0.12685185185185185], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [337, 156]}}\n</tool_call>", "extracted_answer": [0.13523274478330657, 0.11142857142857143], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-07_15-30-52.png", "question": "open physical device panel of the device manager in android studio", "resized_size": [1400, 2492], "ground_truth": [0.12291666666666666, 0.09212962962962963, 0.13932291666666666, 0.10324074074074074], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [305, 156]}}\n</tool_call>", "extracted_answer": [0.1223916532905297, 0.11142857142857143], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-05_16-07-20.png", "question": "where is the stop button in android studio?", "resized_size": [1400, 2492], "ground_truth": [0.54609375, 0.07083333333333333, 0.5520833333333334, 0.08055555555555556], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 86]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.06142857142857143], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_17-59-02.png", "question": "open marscode ai logger in android studio", "resized_size": [1400, 2492], "ground_truth": [0.44244791666666666, 0.8569444444444444, 0.47942708333333334, 0.8680555555555556], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [875, 1183]}}\n</tool_call>", "extracted_answer": [0.351123595505618, 0.845], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_17-04-51.png", "question": "Increase the volume of the Android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5861979166666667, 0.3388888888888889, 0.5911458333333334, 0.34953703703703703], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1586, 473]}}\n</tool_call>", "extracted_answer": [0.6364365971107544, 0.33785714285714286], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_17-31-44.png", "question": "switch the android virtual machine to pixel xl api 34 in android studio", "resized_size": [1400, 2492], "ground_truth": [0.41041666666666665, 0.29953703703703705, 0.4401041666666667, 0.30972222222222223], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [608, 537]}}\n</tool_call>", "extracted_answer": [0.24398073836276082, 0.38357142857142856], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_11-19-16.png", "question": "login with Marscode in android studio", "resized_size": [1400, 2492], "ground_truth": [0.4942708333333333, 0.30925925925925923, 0.5182291666666666, 0.3351851851851852], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1273, 450]}}\n</tool_call>", "extracted_answer": [0.5108346709470305, 0.32142857142857145], "correct": 1},
+     {"image": "android_studio_mac/screenshot_2024-11-28_14-43-59.png", "question": "turn off bluetooth of virtual Android machine in Android studio ", "resized_size": [1400, 2492], "ground_truth": [0.57421875, 0.3638888888888889, 0.6270833333333333, 0.39444444444444443], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1387, 526]}}\n</tool_call>", "extracted_answer": [0.5565810593900482, 0.3757142857142857], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-07_15-10-30.png", "question": "refresh virtual device of device manager in android studio ", "resized_size": [1400, 2492], "ground_truth": [0.13802083333333334, 0.10694444444444444, 0.14635416666666667, 0.11805555555555555], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [327, 156]}}\n</tool_call>", "extracted_answer": [0.1312199036918138, 0.11142857142857143], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_11-50-55.png", "question": "open Recent Apps in virtual Android machine in Android studio", "resized_size": [1400, 2492], "ground_truth": [0.5372395833333333, 0.3347222222222222, 0.5427083333333333, 0.3453703703703704], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [375, 568]}}\n</tool_call>", "extracted_answer": [0.15048154093097912, 0.4057142857142857], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_11-18-32.png", "question": "record virtual android phone screen in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5106770833333333, 0.4759259259259259, 0.5174479166666667, 0.48564814814814816], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1385, 876]}}\n</tool_call>", "extracted_answer": [0.5557784911717496, 0.6257142857142857], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-05_16-04-38.png", "question": "search information of project in android studio", "resized_size": [1400, 2492], "ground_truth": [0.56328125, 0.0699074074074074, 0.5684895833333333, 0.08287037037037037], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 142]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.10142857142857142], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-28-57.png", "question": "search emoji whthin the virtual android studio in android studio", "resized_size": [1400, 2492], "ground_truth": [0.52265625, 0.5958333333333333, 0.57734375, 0.6111111111111112], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1375, 816]}}\n</tool_call>", "extracted_answer": [0.5517656500802568, 0.5828571428571429], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-39-12.png", "question": "undo the action of the picture within the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.57109375, 0.7337962962962963, 0.5809895833333333, 0.7523148148148148], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1437, 1012]}}\n</tool_call>", "extracted_answer": [0.576645264847512, 0.7228571428571429], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-07_14-57-01.png", "question": "close bookmark panel", "resized_size": [1400, 2492], "ground_truth": [0.25442708333333336, 0.33287037037037037, 0.2604166666666667, 0.3435185185185185], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [607, 473]}}\n</tool_call>", "extracted_answer": [0.24357945425361155, 0.33785714285714286], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-07_15-33-17.png", "question": "open settings of terminal in android studio", "resized_size": [1400, 2492], "ground_truth": [0.14921875, 0.612037037037037, 0.17786458333333333, 0.6236111111111111], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 816]}}\n</tool_call>", "extracted_answer": [0.14325842696629212, 0.5828571428571429], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_17-12-20.png", "question": "turn off notification dot of google in android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.6489583333333333, 0.6916666666666667, 0.6614583333333334, 0.7032407407407407], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1635, 978]}}\n</tool_call>", "extracted_answer": [0.656099518459069, 0.6985714285714286], "correct": 1},
+     {"image": "android_studio_mac/screenshot_2024-11-28_16-09-39.png", "question": "turn on dark theme of the virtual android macine", "resized_size": [1400, 2492], "ground_truth": [0.5903645833333333, 0.6708333333333333, 0.60234375, 0.6824074074074075], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1478, 950]}}\n</tool_call>", "extracted_answer": [0.5930979133226324, 0.6785714285714286], "correct": 1},
+     {"image": "android_studio_mac/screenshot_2024-11-28_14-56-16.png", "question": "open contacts of the android machine", "resized_size": [1400, 2492], "ground_truth": [0.5348958333333333, 0.7754629629629629, 0.5575520833333333, 0.8050925925925926], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 1086]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.7757142857142857], "correct": 1},
+     {"image": "android_studio_mac/screenshot_2024-11-07_14-49-07.png", "question": "add new service in android studio using the icon", "resized_size": [1400, 2492], "ground_truth": [0.14453125, 0.600925925925926, 0.15130208333333334, 0.6143518518518518], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [356, 822]}}\n</tool_call>", "extracted_answer": [0.14285714285714285, 0.5871428571428572], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_11-31-04.png", "question": "install missing platform and fix project in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5153645833333333, 0.5990740740740741, 0.5791666666666667, 0.6083333333333333], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 816]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.5828571428571429], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-06-40.png", "question": "share the photo in virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.58046875, 0.6995370370370371, 0.5890625, 0.7180555555555556], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1473, 986]}}\n</tool_call>", "extracted_answer": [0.591091492776886, 0.7042857142857143], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-05_16-05-52.png", "question": "run the project in android studio", "resized_size": [1400, 2492], "ground_truth": [0.4984375, 0.07037037037037037, 0.5036458333333333, 0.08194444444444444], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 106]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.07571428571428572], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-51-23.png", "question": "like the video on youtube within the virtual android machine in android studio ", "resized_size": [1400, 2492], "ground_truth": [0.5859375, 0.6314814814814815, 0.6005208333333333, 0.6430555555555556], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [2037, 806]}}\n</tool_call>", "extracted_answer": [0.8174157303370787, 0.5757142857142857], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-05_16-05-07.png", "question": "open device manager in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5549479166666667, 0.07083333333333333, 0.56015625, 0.0824074074074074], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1378, 205]}}\n</tool_call>", "extracted_answer": [0.5529695024077047, 0.14642857142857144], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-24-57.png", "question": "change the pen color to green with in the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5018229166666667, 0.7347222222222223, 0.5104166666666666, 0.7472222222222222], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 1040]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.7428571428571429], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_17-17-15.png", "question": "open the os settings of android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.6151041666666667, 0.5416666666666666, 0.625, 0.5606481481481481], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1518, 763]}}\n</tool_call>", "extracted_answer": [0.6091492776886035, 0.545], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_16-11-18.png", "question": "take a screenshot of the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.57109375, 0.3402777777777778, 0.5778645833333333, 0.35185185185185186], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1378, 825]}}\n</tool_call>", "extracted_answer": [0.5529695024077047, 0.5892857142857143], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_18-00-07.png", "question": "open notifications panel on the right in android studio", "resized_size": [1400, 2492], "ground_truth": [0.80859375, 0.3, 0.8127604166666667, 0.3509259259259259], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [2378, 406]}}\n</tool_call>", "extracted_answer": [0.9542536115569823, 0.29], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-48-39.png", "question": "subscribe the adidas skateboarding on toutube with in the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.6028645833333334, 0.6226851851851852, 0.63046875, 0.6416666666666667], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1536, 879]}}\n</tool_call>", "extracted_answer": [0.6163723916532905, 0.6278571428571429], "correct": 1},
+     {"image": "android_studio_mac/screenshot_2024-11-28_17-58-25.png", "question": "open version control in android studio", "resized_size": [1400, 2492], "ground_truth": [0.24973958333333332, 0.8560185185185185, 0.2796875, 0.8671296296296296], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [657, 1183]}}\n</tool_call>", "extracted_answer": [0.26364365971107545, 0.845], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-07_14-47-06.png", "question": "open app inspection in android studio ", "resized_size": [1400, 2492], "ground_truth": [0.29739583333333336, 0.8592592592592593, 0.3307291666666667, 0.8708333333333333], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [807, 1183]}}\n</tool_call>", "extracted_answer": [0.3238362760834671, 0.845], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-05_16-06-34.png", "question": "attach debugger to android process in android studio", "resized_size": [1400, 2492], "ground_truth": [0.53984375, 0.0699074074074074, 0.5447916666666667, 0.08148148148148149], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1387, 131]}}\n</tool_call>", "extracted_answer": [0.5565810593900482, 0.09357142857142857], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-03-59.png", "question": "turn on the flash of the camera in virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.69921875, 0.6097222222222223, 0.7078125, 0.6236111111111111], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [657, 438]}}\n</tool_call>", "extracted_answer": [0.26364365971107545, 0.31285714285714283], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_17-07-21.png", "question": "clear all notification of the android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.6408854166666667, 0.6712962962962963, 0.6565104166666667, 0.6847222222222222], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [637, 895]}}\n</tool_call>", "extracted_answer": [0.2556179775280899, 0.6392857142857142], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-07_15-06-51.png", "question": "collapse all bookmarks in android studio", "resized_size": [1400, 2492], "ground_truth": [0.23932291666666666, 0.33240740740740743, 0.24583333333333332, 0.3449074074074074], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [553, 472]}}\n</tool_call>", "extracted_answer": [0.22191011235955055, 0.33714285714285713], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-07_15-42-10.png", "question": "expand TODO items in android studio", "resized_size": [1400, 2492], "ground_truth": [0.10625, 0.649074074074074, 0.11197916666666667, 0.6606481481481481], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [305, 826]}}\n</tool_call>", "extracted_answer": [0.1223916532905297, 0.59], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_18-02-57.png", "question": "click Profiler to open services window in the bottom of the android studio", "resized_size": [1400, 2492], "ground_truth": [0.28020833333333334, 0.8564814814814815, 0.2981770833333333, 0.8671296296296296], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [678, 1179]}}\n</tool_call>", "extracted_answer": [0.27207062600321025, 0.8421428571428572], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_17-19-51.png", "question": "stop the bilibili download in android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.4661458333333333, 0.4666666666666667, 0.47630208333333335, 0.4847222222222222], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1538, 667]}}\n</tool_call>", "extracted_answer": [0.6171749598715891, 0.4764285714285714], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-40-13.png", "question": "clear all the modification of the photo with in the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5263020833333333, 0.7342592592592593, 0.5424479166666667, 0.7509259259259259], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1317, 985]}}\n</tool_call>", "extracted_answer": [0.5284911717495987, 0.7035714285714286], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_17-39-17.png", "question": "zoom in the android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5559895833333334, 0.6717592592592593, 0.5625, 0.6837962962962963], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1395, 947]}}\n</tool_call>", "extracted_answer": [0.5597913322632424, 0.6764285714285714], "correct": 1},
+     {"image": "android_studio_mac/screenshot_2024-11-28_11-43-20.png", "question": "back to home in virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.60078125, 0.3800925925925926, 0.6065104166666667, 0.3902777777777778], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [37, 825]}}\n</tool_call>", "extracted_answer": [0.014847512038523275, 0.5892857142857143], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_17-09-50.png", "question": "turn on the falshlight of the android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5861979166666667, 0.47638888888888886, 0.62265625, 0.5], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1537, 668]}}\n</tool_call>", "extracted_answer": [0.6167736757624398, 0.47714285714285715], "correct": 1},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-31-55.png", "question": "click the third crying emoji with in the virtual android machine in android machine", "resized_size": [1400, 2492], "ground_truth": [0.553125, 0.5347222222222222, 0.5645833333333333, 0.5550925925925926], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1875, 693]}}\n</tool_call>", "extracted_answer": [0.7524077046548957, 0.495], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_16-15-46.png", "question": "change video to camera in virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5247395833333334, 0.5532407407407407, 0.5546875, 0.5791666666666667], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 786]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.5614285714285714], "correct": 1},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-10-02.png", "question": "close the detail of the photo in virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.6291666666666667, 0.6939814814814815, 0.6432291666666666, 0.7092592592592593], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1567, 980]}}\n</tool_call>", "extracted_answer": [0.6288121990369181, 0.7], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-15-42.png", "question": "modify the shadows of the picture witn in virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5369791666666667, 0.6925925925925925, 0.55234375, 0.7240740740740741], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1367, 1035]}}\n</tool_call>", "extracted_answer": [0.5485553772070626, 0.7392857142857143], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-05-13.png", "question": "turn off the hdr of virtual android camera in android studio", "resized_size": [1400, 2492], "ground_truth": [0.6895833333333333, 0.6106481481481482, 0.6984375, 0.625], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1730, 899]}}\n</tool_call>", "extracted_answer": [0.6942215088282504, 0.6421428571428571], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_14-58-52.png", "question": "take a picture of virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.51796875, 0.7638888888888888, 0.54296875, 0.7962962962962963], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1317, 856]}}\n</tool_call>", "extracted_answer": [0.5284911717495987, 0.6114285714285714], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-16-55.png", "question": "modify the highlights of the photo with in the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.46197916666666666, 0.7342592592592593, 0.5502604166666667, 0.7490740740740741], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1315, 982]}}\n</tool_call>", "extracted_answer": [0.5276886035313002, 0.7014285714285714], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-07_14-48-21.png", "question": "open setting of app inspection in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5557291666666667, 0.5884259259259259, 0.5604166666666667, 0.5981481481481481], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [367, 835]}}\n</tool_call>", "extracted_answer": [0.1472712680577849, 0.5964285714285714], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-02-38.png", "question": "open more items of virtual android machine in android studio ", "resized_size": [1400, 2492], "ground_truth": [0.6875, 0.6402777777777777, 0.70234375, 0.6680555555555555], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 468]}}\n</tool_call>", "extracted_answer": [0.14325842696629212, 0.3342857142857143], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-07_15-38-30.png", "question": "open options of terminal in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5546875, 0.5884259259259259, 0.5611979166666666, 0.600462962962963], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 806]}}\n</tool_call>", "extracted_answer": [0.14325842696629212, 0.5757142857142857], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_17-11-07.png", "question": "turn off the notification permission of google in android virtual machine", "resized_size": [1400, 2492], "ground_truth": [0.6505208333333333, 0.5648148148148148, 0.6635416666666667, 0.5828703703703704], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1617, 803]}}\n</tool_call>", "extracted_answer": [0.648876404494382, 0.5735714285714286], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-05_16-09-54.png", "question": "supress warnings in android studio", "resized_size": [1400, 2492], "ground_truth": [0.4434895833333333, 0.2722222222222222, 0.5223958333333333, 0.2810185185185185], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1308, 386]}}\n</tool_call>", "extracted_answer": [0.5248796147672552, 0.2757142857142857], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-26-41.png", "question": "open settings of the keyboard within the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.76015625, 0.5703703703703704, 0.7763020833333333, 0.5875], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1836, 795]}}\n</tool_call>", "extracted_answer": [0.7367576243980738, 0.5678571428571428], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_14-55-23.png", "question": "open recent call of Android machine in Android studio", "resized_size": [1400, 2492], "ground_truth": [0.578125, 0.6560185185185186, 0.59765625, 0.6875], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1463, 935]}}\n</tool_call>", "extracted_answer": [0.5870786516853933, 0.6678571428571428], "correct": 1},
+     {"image": "android_studio_mac/screenshot_2024-11-28_11-23-32.png", "question": "shutdown the virtual android phone in android studio", "resized_size": [1400, 2492], "ground_truth": [0.45026041666666666, 0.47638888888888886, 0.4557291666666667, 0.487962962962963], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1387, 925]}}\n</tool_call>", "extracted_answer": [0.5565810593900482, 0.6607142857142857], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_17-55-00.png", "question": "make current file read-only in android studio", "resized_size": [1400, 2492], "ground_truth": [0.7734375, 0.8680555555555556, 0.7796875, 0.8759259259259259], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1753, 748]}}\n</tool_call>", "extracted_answer": [0.7034510433386838, 0.5342857142857143], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_18-01-01.png", "question": "open app quality insights window in the bootom of the android studio", "resized_size": [1400, 2492], "ground_truth": [0.32057291666666665, 0.8555555555555555, 0.3572916666666667, 0.8671296296296296], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [875, 1180]}}\n</tool_call>", "extracted_answer": [0.351123595505618, 0.8428571428571429], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-07_15-22-21.png", "question": "rename bookmark name in android studio", "resized_size": [1400, 2492], "ground_truth": [0.2234375, 0.3314814814814815, 0.23046875, 0.3453703703703704], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [553, 472]}}\n</tool_call>", "extracted_answer": [0.22191011235955055, 0.33714285714285713], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_11-26-41.png", "question": "log in to marscode AI in android studio", "resized_size": [1400, 2492], "ground_truth": [0.51640625, 0.5166666666666667, 0.5276041666666667, 0.5254629629629629], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1378, 695]}}\n</tool_call>", "extracted_answer": [0.5529695024077047, 0.49642857142857144], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_11-25-07.png", "question": "clear all notifications in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5440104166666667, 0.2791666666666667, 0.56015625, 0.2916666666666667], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 360]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.2571428571428571], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-05_17-58-41.png", "question": "edit selected AVD in android studio", "resized_size": [1400, 2492], "ground_truth": [0.25677083333333334, 0.13935185185185187, 0.26171875, 0.15231481481481482], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [657, 203]}}\n</tool_call>", "extracted_answer": [0.26364365971107545, 0.145], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-14-25.png", "question": "crop the photo witnin the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.525, 0.7384259259259259, 0.5377604166666666, 0.75], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1367, 1041]}}\n</tool_call>", "extracted_answer": [0.5485553772070626, 0.7435714285714285], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_17-01-46.png", "question": "open settings within the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.6526041666666667, 0.6694444444444444, 0.6622395833333333, 0.6861111111111111], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [637, 805]}}\n</tool_call>", "extracted_answer": [0.2556179775280899, 0.575], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_14-45-58.png", "question": "turn off internet of Android machine in Android studio", "resized_size": [1400, 2492], "ground_truth": [0.5200520833333333, 0.3638888888888889, 0.5723958333333333, 0.3958333333333333], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1357, 528]}}\n</tool_call>", "extracted_answer": [0.5445425361155698, 0.37714285714285717], "correct": 1},
+     {"image": "android_studio_mac/screenshot_2024-11-05_16-01-19.png", "question": "filter resources in android studio", "resized_size": [1400, 2492], "ground_truth": [0.2234375, 0.09814814814814815, 0.228125, 0.1087962962962963], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [473, 168]}}\n</tool_call>", "extracted_answer": [0.18980738362760835, 0.12], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-07_15-47-45.png", "question": "filter TODO item in android studio", "resized_size": [1400, 2492], "ground_truth": [0.10651041666666666, 0.625, 0.11223958333333334, 0.6361111111111111], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 816]}}\n</tool_call>", "extracted_answer": [0.14325842696629212, 0.5828571428571429], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_11-33-15.png", "question": "sign in google message in virtual android machine in android studio ", "resized_size": [1400, 2492], "ground_truth": [0.54609375, 0.7574074074074074, 0.58203125, 0.774537037037037], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1378, 1056]}}\n</tool_call>", "extracted_answer": [0.5529695024077047, 0.7542857142857143], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_18-03-58.png", "question": "open terminal windows in the bottom of the android studio", "resized_size": [1400, 2492], "ground_truth": [0.39817708333333335, 0.8560185185185185, 0.4171875, 0.8675925925925926], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1035, 1181]}}\n</tool_call>", "extracted_answer": [0.4153290529695024, 0.8435714285714285], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-05_16-08-03.png", "question": "sign in to android studio", "resized_size": [1400, 2492], "ground_truth": [0.5770833333333333, 0.07222222222222222, 0.5822916666666667, 0.08055555555555556], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1378, 1125]}}\n</tool_call>", "extracted_answer": [0.5529695024077047, 0.8035714285714286], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_16-03-53.png", "question": "like the QwQ-32b-preview on huggingface within the virtual android machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5341145833333333, 0.6101851851851852, 0.54765625, 0.6212962962962963], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1358, 867]}}\n</tool_call>", "extracted_answer": [0.5449438202247191, 0.6192857142857143], "correct": 1},
+     {"image": "android_studio_mac/screenshot_2024-11-05_18-03-37.png", "question": "open the device in the device explorer in android studio", "resized_size": [1400, 2492], "ground_truth": [0.2484375, 0.1398148148148148, 0.2552083333333333, 0.1537037037037037], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [658, 230]}}\n</tool_call>", "extracted_answer": [0.2640449438202247, 0.16428571428571428], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-13-10.png", "question": "Modify the filters of a photo to warm within the virtual Android machine in Android Studio.", "resized_size": [1400, 2492], "ground_truth": [0.59453125, 0.638425925925926, 0.61796875, 0.6694444444444444], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1537, 916]}}\n</tool_call>", "extracted_answer": [0.6167736757624398, 0.6542857142857142], "correct": 1},
+     {"image": "android_studio_mac/screenshot_2024-11-28_17-05-33.png", "question": "decrease the volume of the android virtual machine in android studio", "resized_size": [1400, 2492], "ground_truth": [0.5924479166666666, 0.3375, 0.5979166666666667, 0.3490740740740741], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [1650, 1037]}}\n</tool_call>", "extracted_answer": [0.6621187800963082, 0.7407142857142858], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-28_15-01-29.png", "question": "open the picture taked in virtual android machine in android studio ", "resized_size": [1400, 2492], "ground_truth": [0.5697916666666667, 0.6884259259259259, 0.5864583333333333, 0.7162037037037037], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 648]}}\n</tool_call>", "extracted_answer": [0.14325842696629212, 0.46285714285714286], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-07_15-42-48.png", "question": "collapse all TODO list in android studio", "resized_size": [1400, 2492], "ground_truth": [0.10572916666666667, 0.6606481481481481, 0.11197916666666667, 0.6722222222222223], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [357, 816]}}\n</tool_call>", "extracted_answer": [0.14325842696629212, 0.5828571428571429], "correct": 0},
+     {"image": "android_studio_mac/screenshot_2024-11-07_15-31-49.png", "question": "add new terminal in android studio", "resized_size": [1400, 2492], "ground_truth": [0.14270833333333333, 0.5875, 0.14817708333333332, 0.5990740740740741], "model_output": "<tool_call>\n{\"name\": \"computer_use\", \"arguments\": {\"action\": \"left_click\", \"coordinate\": [356, 807]}}\n</tool_call>", "extracted_answer": [0.14285714285714285, 0.5764285714285714], "correct": 0}
+   ]
+ }
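Reading the log above: "ground_truth" is a normalized [x1, y1, x2, y2] box, "extracted_answer" is the predicted click point normalized by the resized image size ("resized_size" is [height, width], so x is divided by 2492 and y by 1400), and "correct" is 1 exactly when the point falls inside the box. A minimal sketch (not part of the commit) that recomputes the summary accuracy under that assumption, using the log file added here:

import json

def point_in_box(point, box):
    # point = [x, y]; box = [x1, y1, x2, y2]; all values normalized to [0, 1]
    x, y = point
    x1, y1, x2, y2 = box
    return x1 <= x <= x2 and y1 <= y <= y2

with open("eval/logs/rec_results_android_studio_macos_8k_SFT_0.json") as f:
    log = json.load(f)

hits = sum(point_in_box(r["extracted_answer"], r["ground_truth"]) for r in log["results"])
print(f"accuracy: {100.0 * hits / len(log['results']):.1f}")  # 12/80 -> 15.0, matching the header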
eval/test_grounding_r1.py ADDED
@@ -0,0 +1,318 @@
1
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
+ from qwen_vl_utils import process_vision_info
3
+ import torch
4
+ import json
5
+ from tqdm import tqdm
6
+ import re
7
+ import os
8
+ from pprint import pprint
9
+ import random
10
+ from PIL import Image
11
+ from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
12
+ import torch.distributed as dist
13
+ from torch.nn.parallel import DistributedDataParallel as DDP
14
+ import argparse
15
+
16
+ import warnings
17
+
18
+ warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
19
+
20
+ def setup_distributed():
21
+ local_rank = int(os.environ.get("LOCAL_RANK", 0))
22
+ torch.cuda.set_device(local_rank)
23
+
24
+ dist.init_process_group(backend="nccl")
25
+
26
+ world_size = dist.get_world_size()
27
+ rank = dist.get_rank()
28
+
29
+ return local_rank, world_size, rank
30
+
31
+ local_rank, world_size, rank = setup_distributed()
32
+ device = f"cuda:{local_rank}"
33
+ print(f"Process {rank} using {device}")
34
+
35
+ steps = 4100
36
+ if rank == 0:
37
+ print("Steps: ", steps)
38
+
39
+ RUN_NAME="Qwen2.5-VL-7B-GRPO-GUI-Grounding_showui_desktop_no_position_high_quality_continual_dense_reward_quadratic_decay_0.5_format_bs16_kl0.004_think_4e"
40
+ MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
41
+ OUTPUT_PATH="./logs/rec_results_{DATASET}_{RUN_NAME}_{STEPS}.json"
42
+
43
+ BSZ=32
44
+ DATA_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-Pro/annotations"
45
+
46
+ TEST_DATASETS = ['android_studio_macos', 'autocad_windows', 'blender_windows','davinci_macos','eviews_windows','excel_macos','fruitloops_windows','illustrator_windows','inventor_windows','linux_common_linux','macos_common_macos','matlab_macos','origin_windows','photoshop_windows','powerpoint_windows','premiere_windows','pycharm_macos','quartus_windows','solidworks_windows','stata_windows','unreal_engine_windows','vivado_windows','vmware_macos','vscode_macos','windows_common_windows','word_macos']
47
+ IMAGE_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-Pro/images"
48
+
49
+
50
+ # TEST_DATASETS = ['lisa_test']
51
+ # IMAGE_ROOT = "/data10/shz/dataset/lisa"
52
+
53
+
54
+ #We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
55
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
56
+ MODEL_PATH,
57
+ torch_dtype=torch.bfloat16,
58
+ attn_implementation="flash_attention_2",
59
+ device_map={"": local_rank},
60
+ )
61
+
62
+ # default processor
63
+ processor = AutoProcessor.from_pretrained(MODEL_PATH)
64
+ print(processor.image_processor.min_pixels)
65
+ print(processor.image_processor.max_pixels)
66
+ # def extract_point_answer(content):
67
+ # # Try to find the bbox within <answer> tags, if can not find, return [0, 0, 0, 0]
68
+ # answer_tag_pattern = r'<answer>(.*?)</answer>'
69
+ # content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
70
+ # if content_answer_match:
71
+ # content_answer = content_answer_match.group(1).strip()
72
+ # tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content_answer, re.DOTALL)
73
+ # if tool_call_match:
74
+ # tool_call_content = tool_call_match.group(1).strip()
75
+ # # Parse the JSON
76
+ # tool_call_json = json.loads(tool_call_content)
77
+ # arguments = tool_call_json.get("arguments", {})
78
+ # coordinate = arguments.get("coordinate", None)
79
+ # if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
80
+ # x, y = coordinate
81
+ # extracted_coordinate = [x, y]
82
+ # return extracted_coordinate
83
+ # return [0, 0]
84
+
85
+
86
+ def extract_point_answer(content):
87
+ # Look for content inside the <answer> tags; return [0, 0] if nothing can be extracted
88
+ answer_tag_pattern = r'<answer>(.*?)</answer>' # corrected regex pattern
89
+ content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
90
+ if content_answer_match:
91
+ content_answer = content_answer_match.group(1).strip()
92
+ tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content_answer, re.DOTALL)
93
+ if tool_call_match:
94
+ tool_call_content = tool_call_match.group(1).strip()
95
+ # First try to parse tool_call_content as JSON
96
+ try:
97
+ tool_call_json = json.loads(tool_call_content)
98
+ arguments = tool_call_json.get("arguments", {})
99
+ coordinate = arguments.get("coordinate", None)
100
+ if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
101
+ try:
102
+ x = float(coordinate[0])
103
+ y = float(coordinate[1])
104
+ return [x, y]
105
+ except (ValueError, TypeError):
106
+ pass # conversion failed; fall through to regex extraction
107
+ except json.JSONDecodeError:
108
+ pass # JSON parsing failed; fall through to regex extraction
109
+ # Fallback: pull the last two numbers out with a regex
110
+ numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
111
+ if len(numbers) >= 2:
112
+ x = float(numbers[-2])
113
+ y = float(numbers[-1])
114
+ return [x, y]
115
+ return [0, 0]
116
+
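+ # Example: a completion such as
+ #   '<answer><tool_call>{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [1537, 916]}}</tool_call></answer>'
+ # yields [1537.0, 916.0]; anything that cannot be parsed falls back to [0, 0].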
117
+ def point_in_box(point, box):
118
+ x,y = point
119
+ if box[0] <= x < box[2] and box[1] <= y < box[3]:
120
+ return 1
121
+ else:
122
+ return 0
123
+
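+ # Note: point_in_box assumes the point and box share one coordinate frame; in the loop
+ # below both are normalized to [0, 1], so the containment check is resolution-independent.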
124
+ num_samples = 2000
125
+ num_all_sample = 0
126
+ num_correct_sample = 0
127
+ for ds in TEST_DATASETS:
128
+ if rank == 0:
129
+ print(f"Processing {ds}...")
130
+ ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
131
+ data = json.load(open(ds_path, "r"))
132
+ random.seed(42)
133
+ random.shuffle(data)
134
+ data = data[:num_samples]
135
+
136
+ # Split data for distributed evaluation
137
+ per_rank_data = len(data) // world_size
138
+ start_idx = rank * per_rank_data
139
+ end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
140
+ rank_data = data[start_idx:end_idx]
141
+
142
+ messages = []
143
+
144
+ for x in rank_data:
145
+ image_path = os.path.join(IMAGE_ROOT, x['img_filename'])
146
+ width,height = x['img_size'][0],x['img_size'][1]
147
+ resized_height, resized_width = smart_resize(
148
+ height,
149
+ width,
150
+ factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
151
+ min_pixels = processor.image_processor.min_pixels,
152
+ max_pixels = processor.image_processor.max_pixels,
153
+ )
154
+ system_content = """You are a helpful assistant.
155
+ #Tools
156
+
157
+ You may call one or more functions to assist with the user query.
158
+
159
+ You are provided with function signatures within <tools></tools> XML tags:
160
+ <tools>
161
+ {"type": "function", "function": {"name_for_human": "computer_use", "name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n* The screen's resolution is {{screen_width}}x{{screen_height}}.\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\n* key: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\n* type: Type a string of text on the keyboard.\n* mouse_move: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* left_click: Click the left mouse button.\n* left_click_drag: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\n* right_click: Click the right mouse button.\n* middle_click: Click the middle mouse button.\n* double_click: Double-click the left mouse button.\n* scroll: Performs a scroll of the mouse scroll wheel.\n* wait: Wait specified seconds for the change to happen.\n* terminate: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], "type": "string"}, "keys": {"description": "Required only by action=key.", "type": "array"}, "text": {"description": "Required only by action=type.", "type": "string"}, "coordinate": {"description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by action=mouse_move and action=left_click_drag.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by action=scroll.", "type": "number"}, "time": {"description": "The seconds to wait. Required only by action=wait.", "type": "number"}, "status": {"description": "The status of the task. Required only by action=terminate.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}, "args_format": "Format the arguments as a JSON object."}}
162
+ </tools>
163
+
164
+ For each function call, first provide a detailed step-by-step thought process within <think></think> XML tags to analyze the user query and determine the action location, then return a json object with function name and arguments within <tool_call></tool_call> XML tags inside <answer></answer> XML tags:
165
+ <think>
166
+ [Your detailed step-by-step thought process here]
167
+ </think>
168
+ <answer>
169
+ <tool_call>
170
+ {"name": <function-name>, "arguments": <args-json-object>}
171
+ </tool_call>
172
+ </answer>""".replace("{{screen_width}}", str(resized_width)).replace("{{screen_height}}", str(resized_height))
173
+ message = [
174
+ {
175
+ "role": "system",
176
+ "content": [
177
+ {
178
+ "type": "text",
179
+ "text": system_content
180
+ }
181
+ ]
182
+ },
183
+ {
184
+ "role": "user",
185
+ "content": [
186
+ {
187
+ "type": "image",
188
+ "image": f"file://{image_path}"
189
+ },
190
+ {
191
+ "type": "text",
192
+ "text": x['instruction']
193
+ }
194
+ ]
195
+ },
196
+ ]
197
+ # print(message)
198
+ messages.append(message)
199
+
200
+
201
+ rank_outputs = [] # List to store answers for this rank
202
+ all_outputs = [] # List to store all answers
203
+
204
+ # Process data
205
+ for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
206
+ batch_messages = messages[i:i + BSZ]
207
+
208
+ # Preparation for inference
209
+ text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
210
+
211
+ image_inputs, video_inputs = process_vision_info(batch_messages)
212
+ inputs = processor(
213
+ text=text,
214
+ images=image_inputs,
215
+ videos=video_inputs,
216
+ padding=True,
217
+ padding_side="left",
218
+ return_tensors="pt",
219
+ )
220
+ inputs = inputs.to(device)
221
+
222
+ # Inference: Generation of the output
223
+ generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
224
+
225
+ generated_ids_trimmed = [
226
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
227
+ ]
228
+ batch_output_text = processor.batch_decode(
229
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
230
+ )
231
+
232
+ rank_outputs.extend(batch_output_text)
233
+
234
+ print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")
235
+
236
+ # Gather all outputs from all ranks
237
+ all_outputs = [None] * len(data)
238
+ rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]
239
+
240
+ gathered_results = [None] * world_size
241
+ dist.all_gather_object(gathered_results, rank_results)
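+ # Each rank contributes (global_index, output) pairs, so rank 0 can rebuild the
+ # dataset-ordered list no matter how the examples were sharded across ranks.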
242
+
243
+ assert gathered_results[-1][-1][0] == len(data) - 1
244
+
245
+ # The main process will collect all results
246
+ if rank == 0:
247
+ for results in gathered_results:
248
+ for idx, output in results:
249
+ assert idx < len(all_outputs)
250
+ all_outputs[idx] = output
251
+ assert all_outputs[-1] is not None
252
+
253
+ final_output = []
254
+ correct_number = 0
255
+
256
+ for input_example, model_output in zip(data, all_outputs):
257
+ original_output = model_output
258
+ ground_truth = input_example['bbox']
259
+ ground_truth = [ground_truth[0] / input_example['img_size'][0], ground_truth[1] / input_example['img_size'][1], ground_truth[2] / input_example['img_size'][0], ground_truth[3] / input_example['img_size'][1]]
260
+ model_answer = extract_point_answer(original_output)
261
+ resized_height, resized_width = smart_resize(
262
+ input_example['img_size'][1],
263
+ input_example['img_size'][0],
264
+ factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
265
+ min_pixels = processor.image_processor.min_pixels,
266
+ max_pixels = processor.image_processor.max_pixels,
267
+ )
268
+ model_answer = [model_answer[0]/resized_width,model_answer[1]/resized_height]
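+ # Predictions are pixel coordinates in the smart_resize'd frame; dividing by the
+ # resized width/height maps them into the same normalized frame as ground_truth.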
269
+ # Count correct answers
270
+ correct = 0
271
+ if model_answer is not None:
272
+ correct = point_in_box(model_answer, ground_truth)
273
+ correct_number += correct
274
+ num_all_sample +=1
275
+ num_correct_sample += correct
276
+
277
+ # Create a result dictionary for this example
278
+ result = {
279
+ 'image': input_example['img_filename'],
280
+ 'question': input_example['instruction'],
281
+ 'resized_size': [resized_height, resized_width],
282
+ 'ground_truth': ground_truth,
283
+ 'model_output': original_output,
284
+ 'extracted_answer': model_answer,
285
+ 'correct': correct
286
+ }
287
+ final_output.append(result)
288
+
289
+ # Calculate and print accuracy
290
+ accuracy = correct_number / len(data) * 100
291
+ print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
292
+
293
+ # Save results to a JSON file
294
+ output_path = OUTPUT_PATH.format(DATASET=ds, RUN_NAME=RUN_NAME, STEPS=steps)
295
+ output_dir = os.path.dirname(output_path)
296
+ if not os.path.exists(output_dir):
297
+ os.makedirs(output_dir)
298
+ with open(output_path, "w") as f:
299
+ json.dump({
300
+ 'accuracy': accuracy,
301
+ 'results': final_output
302
+ }, f, indent=2)
303
+
304
+ print(f"Results saved to {output_path}")
305
+ print("-"*100)
306
+ # Final statistics and printing are done inside the rank == 0 block
307
+ if rank == 0:
308
+ accuracy = num_correct_sample / num_all_sample * 100
309
+ print(f"\nnumber of correct samples: {num_correct_sample}")
310
+ print(f"number of all samples: {num_all_sample}")
311
+ print(f"Accuracy of all datasets: {accuracy:.2f}%")
312
+
313
+ # Synchronize all processes
314
+ dist.barrier()
315
+
316
+
317
+
318
+
eval/test_grounding_r1_nothink.py ADDED
@@ -0,0 +1,331 @@
1
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
+ from qwen_vl_utils import process_vision_info
3
+ import torch
4
+ import json
5
+ from tqdm import tqdm
6
+ import re
7
+ import os
8
+ from pprint import pprint
9
+ import random
10
+ from PIL import Image
11
+ from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
12
+ import torch.distributed as dist
13
+ from torch.nn.parallel import DistributedDataParallel as DDP
14
+ import argparse
15
+
16
+ import warnings
17
+
18
+ warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
19
+
20
+ def setup_distributed():
21
+ local_rank = int(os.environ.get("LOCAL_RANK", 0))
22
+ torch.cuda.set_device(local_rank)
23
+
24
+ dist.init_process_group(backend="nccl")
25
+
26
+ world_size = dist.get_world_size()
27
+ rank = dist.get_rank()
28
+
29
+ return local_rank, world_size, rank
30
+
31
+ local_rank, world_size, rank = setup_distributed()
32
+ device = f"cuda:{local_rank}"
33
+ print(f"Process {rank} using {device}")
34
+
35
+ steps = 3860
36
+ if rank == 0:
37
+ print("Steps: ", steps)
38
+
39
+ RUN_NAME = "Qwen2.5-VL-3B-GRPO-GUI-Grounding_showui_desktop_high_quality_attention_0.2_filtered_continual_dense_reward_quadratic_decay_0.5_format_bs16_kl0.004_nothink_10e_max_pixel_4028160"
40
+ MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
41
+ #MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_acadmic/public_data/11178625/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
42
+ OUTPUT_PATH="./logs/rec_results_{DATASET}_{RUN_NAME}_{STEPS}.json"
43
+
44
+ BSZ=8
45
+ DATA_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-Pro/annotations"
46
+
47
+ TEST_DATASETS = ['android_studio_macos', 'autocad_windows', 'blender_windows','davinci_macos','eviews_windows','excel_macos','fruitloops_windows','illustrator_windows','inventor_windows','linux_common_linux','macos_common_macos','matlab_macos','origin_windows','photoshop_windows','powerpoint_windows','premiere_windows','pycharm_macos','quartus_windows','solidworks_windows','stata_windows','unreal_engine_windows','vivado_windows','vmware_macos','vscode_macos','windows_common_windows','word_macos']
48
+ IMAGE_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-Pro/images"
49
+
50
+
51
+ # TEST_DATASETS = ['lisa_test']
52
+ # IMAGE_ROOT = "/data10/shz/dataset/lisa"
53
+
54
+
55
+ #We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
56
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
57
+ MODEL_PATH,
58
+ torch_dtype=torch.bfloat16,
59
+ attn_implementation="flash_attention_2",
60
+ device_map={"": local_rank},
61
+ )
62
+ # default processor
63
+ processor = AutoProcessor.from_pretrained(MODEL_PATH,max_pixels=7049280,min_pixels=3136)
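+ # A higher max_pixels budget lets smart_resize preserve more of the original
+ # resolution, which matters for dense high-resolution ScreenSpot-Pro screenshots.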
64
+ # processor.image_processor.min_pixels=3136
65
+ # processor.image_processor.max_pixels=2007040
66
+ print(processor.image_processor.min_pixels)
67
+ print(processor.image_processor.max_pixels)
68
+ # def extract_point_answer(content):
69
+ # # Try to find the bbox within <answer> tags, if can not find, return [0, 0, 0, 0]
70
+ # answer_tag_pattern = r'<answer>(.*?)</answer>'
71
+ # content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
72
+ # if content_answer_match:
73
+ # content_answer = content_answer_match.group(1).strip()
74
+ # tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content_answer, re.DOTALL)
75
+ # if tool_call_match:
76
+ # tool_call_content = tool_call_match.group(1).strip()
77
+ # # Parse the JSON
78
+ # tool_call_json = json.loads(tool_call_content)
79
+ # arguments = tool_call_json.get("arguments", {})
80
+ # coordinate = arguments.get("coordinate", None)
81
+ # if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
82
+ # x, y = coordinate
83
+ # extracted_coordinate = [x, y]
84
+ # return extracted_coordinate
85
+ # return [0, 0]
86
+
87
+
88
+ # def extract_point_answer(content):
89
+ # # Look for content inside the <answer> tags; return [0, 0] if nothing is found
90
+ # tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
91
+ # if tool_call_match:
92
+ # tool_call_content = tool_call_match.group(1).strip()
93
+ # # First try to parse tool_call_content as JSON
94
+ # try:
95
+ # tool_call_json = json.loads(tool_call_content)
96
+ # print(tool_call_json)
97
+ # arguments = tool_call_json.get("arguments", {})
98
+ # coordinate = arguments.get("coordinate", None)
99
+ # if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
100
+ # try:
101
+ # x = float(coordinate[0])
102
+ # y = float(coordinate[1])
103
+ # return [x, y]
104
+ # except (ValueError, TypeError):
105
+ # pass # conversion failed; fall through to regex extraction
106
+ # except json.JSONDecodeError:
107
+ # pass # JSON parsing failed; fall through to regex extraction
108
+ # # Fallback: pull the last two numbers out with a regex
109
+ # numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
110
+ # if len(numbers) >= 2:
111
+ # x = float(numbers[-2])
112
+ # y = float(numbers[-1])
113
+ # return [x, y]
114
+ # return [0, 0]
115
+
116
+
117
+
118
+ def extract_point_answer(content):
119
+ # Look for a <tool_call> block; return [0, 0] if nothing can be extracted
120
+ tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
121
+ if tool_call_match:
122
+ tool_call_content = tool_call_match.group(1).strip()
123
+ # Pull the last two numbers out of the tool call with a regex
124
+ try:
125
+ numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
126
+ if len(numbers) >= 2:
127
+ x = float(numbers[-2])
128
+ y = float(numbers[-1])
129
+ return [x, y]
130
+ except json.JSONDecodeError:
131
+ pass # defensive only: re.findall never raises JSONDecodeError
132
+ # Nothing extracted; fall back to [0, 0]
133
+ return [0, 0]
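+ # Example (output format as in the SFT logs above):
+ #   '<tool_call>\n{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [1650, 1037]}}\n</tool_call>'
+ #   -> [1650.0, 1037.0] (the regex keeps the last two numbers in the call).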
134
+
135
+ def point_in_box(point, box):
136
+ x,y = point
137
+ if box[0] <= x < box[2] and box[1] <= y < box[3]:
138
+ return 1
139
+ else:
140
+ return 0
141
+
142
+ num_samples = 2000
143
+ num_all_sample = 0
144
+ num_correct_sample = 0
145
+ for ds in TEST_DATASETS:
146
+ if rank == 0:
147
+ print(f"Processing {ds}...")
148
+ ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
149
+ data = json.load(open(ds_path, "r"))
150
+ random.seed(42)
151
+ random.shuffle(data)
152
+ data = data[:num_samples]
153
+
154
+ # Split data for distributed evaluation
155
+ per_rank_data = len(data) // world_size
156
+ start_idx = rank * per_rank_data
157
+ end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
158
+ rank_data = data[start_idx:end_idx]
159
+
160
+ messages = []
161
+
162
+ for x in rank_data:
163
+ image_path = os.path.join(IMAGE_ROOT, x['img_filename'])
164
+ width,height = x['img_size'][0],x['img_size'][1]
165
+ resized_height, resized_width = smart_resize(
166
+ height,
167
+ width,
168
+ factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
169
+ min_pixels = processor.image_processor.min_pixels,
170
+ max_pixels = processor.image_processor.max_pixels,
171
+ )
172
+ system_content = """You are a helpful assistant.
173
+ #Tools
174
+
175
+ You may call one or more functions to assist with the user query.
176
+
177
+ You are provided with function signatures within <tools></tools> XML tags:
178
+ <tools>
179
+ {"type": "function", "function": {"name_for_human": "computer_use", "name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n* The screen's resolution is {{screen_width}}x{{screen_height}}.\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\n* key: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\n* type: Type a string of text on the keyboard.\n* mouse_move: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* left_click: Click the left mouse button.\n* left_click_drag: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\n* right_click: Click the right mouse button.\n* middle_click: Click the middle mouse button.\n* double_click: Double-click the left mouse button.\n* scroll: Performs a scroll of the mouse scroll wheel.\n* wait: Wait specified seconds for the change to happen.\n* terminate: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], "type": "string"}, "keys": {"description": "Required only by action=key.", "type": "array"}, "text": {"description": "Required only by action=type.", "type": "string"}, "coordinate": {"description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by action=mouse_move and action=left_click_drag.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by action=scroll.", "type": "number"}, "time": {"description": "The seconds to wait. Required only by action=wait.", "type": "number"}, "status": {"description": "The status of the task. Required only by action=terminate.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}, "args_format": "Format the arguments as a JSON object."}}
180
+ </tools>
181
+
182
+ For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
183
+ <tool_call>
184
+ {"name": <function-name>, "arguments": <args-json-object>}
185
+ </tool_call>""".replace("{{screen_width}}", str(resized_width)).replace("{{screen_height}}", str(resized_height))
186
+ message = [
187
+ {
188
+ "role": "system",
189
+ "content": [
190
+ {
191
+ "type": "text",
192
+ "text": system_content
193
+ }
194
+ ]
195
+ },
196
+ {
197
+ "role": "user",
198
+ "content": [
199
+ {
200
+ "type": "image",
201
+ "image": f"file://{image_path}"
202
+ },
203
+ {
204
+ "type": "text",
205
+ "text": x['instruction']
206
+ }
207
+ ]
208
+ },
209
+ ]
210
+ # print(message)
211
+ messages.append(message)
212
+
213
+
214
+ rank_outputs = [] # List to store answers for this rank
215
+ all_outputs = [] # List to store all answers
216
+
217
+ # Process data
218
+ for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
219
+ batch_messages = messages[i:i + BSZ]
220
+
221
+ # Preparation for inference
222
+ text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
223
+
224
+ image_inputs, video_inputs = process_vision_info(batch_messages)
225
+ inputs = processor(
226
+ text=text,
227
+ images=image_inputs,
228
+ videos=video_inputs,
229
+ padding=True,
230
+ padding_side="left",
231
+ return_tensors="pt",
232
+ )
233
+ inputs = inputs.to(device)
234
+
235
+ # Inference: Generation of the output
236
+ generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
237
+
238
+ generated_ids_trimmed = [
239
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
240
+ ]
241
+ batch_output_text = processor.batch_decode(
242
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
243
+ )
244
+
245
+ rank_outputs.extend(batch_output_text)
246
+
247
+ print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")
248
+
249
+ # Gather all outputs from all ranks
250
+ all_outputs = [None] * len(data)
251
+ rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]
252
+
253
+ gathered_results = [None] * world_size
254
+ dist.all_gather_object(gathered_results, rank_results)
255
+
256
+ assert gathered_results[-1][-1][0] == len(data) - 1
257
+
258
+ # The main process will collect all results
259
+ if rank == 0:
260
+ for results in gathered_results:
261
+ for idx, output in results:
262
+ assert idx < len(all_outputs)
263
+ all_outputs[idx] = output
264
+ assert all_outputs[-1] is not None
265
+
266
+ final_output = []
267
+ correct_number = 0
268
+
269
+ for input_example, model_output in zip(data, all_outputs):
270
+ original_output = model_output
271
+ ground_truth = input_example['bbox']
272
+ ground_truth = [ground_truth[0] / input_example['img_size'][0], ground_truth[1] / input_example['img_size'][1], ground_truth[2] / input_example['img_size'][0], ground_truth[3] / input_example['img_size'][1]]
273
+ model_answer = extract_point_answer(original_output)
274
+ resized_height, resized_width = smart_resize(
275
+ input_example['img_size'][1],
276
+ input_example['img_size'][0],
277
+ factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
278
+ min_pixels = processor.image_processor.min_pixels,
279
+ max_pixels = processor.image_processor.max_pixels,
280
+ )
281
+ model_answer = [model_answer[0]/resized_width,model_answer[1]/resized_height]
282
+ # Count correct answers
283
+ correct = 0
284
+ if model_answer is not None:
285
+ correct = point_in_box(model_answer, ground_truth)
286
+ correct_number += correct
287
+ num_all_sample +=1
288
+ num_correct_sample += correct
289
+
290
+ # Create a result dictionary for this example
291
+ result = {
292
+ 'image': input_example['img_filename'],
293
+ 'question': input_example['instruction'],
294
+ 'resized_size': [resized_height, resized_width],
295
+ 'ground_truth': ground_truth,
296
+ 'model_output': original_output,
297
+ 'extracted_answer': model_answer,
298
+ 'correct': correct
299
+ }
300
+ final_output.append(result)
301
+
302
+ # Calculate and print accuracy
303
+ accuracy = correct_number / len(data) * 100
304
+ print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
305
+
306
+ # Save results to a JSON file
307
+ output_path = OUTPUT_PATH.format(DATASET=ds, RUN_NAME=RUN_NAME, STEPS=steps)
308
+ output_dir = os.path.dirname(output_path)
309
+ if not os.path.exists(output_dir):
310
+ os.makedirs(output_dir)
311
+ with open(output_path, "w") as f:
312
+ json.dump({
313
+ 'accuracy': accuracy,
314
+ 'results': final_output
315
+ }, f, indent=2)
316
+
317
+ print(f"Results saved to {output_path}")
318
+ print("-"*100)
319
+ # Final statistics and printing are done inside the rank == 0 block
320
+ if rank == 0:
321
+ accuracy = num_correct_sample / num_all_sample * 100
322
+ print(f"\nnumber of correct samples: {num_correct_sample}")
323
+ print(f"number of all samples: {num_all_sample}")
324
+ print(f"Accuracy of all datasets: {accuracy:.2f}%")
325
+
326
+ # Synchronize all processes
327
+ dist.barrier()
328
+
329
+
330
+
331
+
eval/test_grounding_r1_nothink_ssv2.py ADDED
@@ -0,0 +1,335 @@
1
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
+ from qwen_vl_utils import process_vision_info
3
+ import torch
4
+ import json
5
+ from tqdm import tqdm
6
+ import re
7
+ import os
8
+ from pprint import pprint
9
+ import random
10
+ from PIL import Image
11
+ from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
12
+ import torch.distributed as dist
13
+ from torch.nn.parallel import DistributedDataParallel as DDP
14
+ import argparse
15
+
16
+ import warnings
17
+
18
+ warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
19
+
20
+ def setup_distributed():
21
+ local_rank = int(os.environ.get("LOCAL_RANK", 0))
22
+ torch.cuda.set_device(local_rank)
23
+
24
+ dist.init_process_group(backend="nccl")
25
+
26
+ world_size = dist.get_world_size()
27
+ rank = dist.get_rank()
28
+
29
+ return local_rank, world_size, rank
30
+
31
+ local_rank, world_size, rank = setup_distributed()
32
+ device = f"cuda:{local_rank}"
33
+ print(f"Process {rank} using {device}")
34
+
35
+ steps = 3800
36
+ if rank == 0:
37
+ print("Steps: ", steps)
38
+ # #RUN_NAME = "base"
39
+ # RUN_NAME = "Qwen2.5-VL-7B-GRPO-GUI-Grounding_showui_desktop_high_quality_attention_filtered_only_one_continual_dense_reward_quadratic_decay_0.5_format_bs16_kl0.004_nothink_10e"
40
+ # #MODEL_PATH="/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/Qwen2.5-VL-7B-Instruct"
41
+ # MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
42
+ # OUTPUT_PATH="./logs/rec_results_{DATASET}_{RUN_NAME}_{STEPS}.json"
43
+ #RUN_NAME = "base"
44
+
45
+ MODEL_PATH= "ByteDance-Seed/UI-TARS-2B-SFT"
46
+ OUTPUT_PATH="./logs/rec_results_ui_tras_2B.json"
47
+
48
+ BSZ=32
49
+ DATA_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-v2"
50
+
51
+ TEST_DATASETS = ['screenspot_desktop_v2','screenspot_mobile_v2','screenspot_web_v2']
52
+ IMAGE_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot-v2/screenspotv2_image"
53
+
54
+
55
+ # TEST_DATASETS = ['lisa_test']
56
+ # IMAGE_ROOT = "/data10/shz/dataset/lisa"
57
+
58
+
59
+ #We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
60
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
61
+ MODEL_PATH,
62
+ torch_dtype=torch.bfloat16,
63
+ attn_implementation="flash_attention_2",
64
+ device_map={"": local_rank},
65
+ )
66
+ # default processor
67
+ processor = AutoProcessor.from_pretrained(MODEL_PATH,max_pixels=3512320,min_pixels=3136)
68
+ # processor.image_processor.min_pixels=3136
69
+ # processor.image_processor.max_pixels=2007040
70
+ print(processor.image_processor.min_pixels)
71
+ print(processor.image_processor.max_pixels)
72
+ # def extract_point_answer(content):
73
+ # # Try to find the bbox within <answer> tags, if can not find, return [0, 0, 0, 0]
74
+ # answer_tag_pattern = r'<answer>(.*?)</answer>'
75
+ # content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
76
+ # if content_answer_match:
77
+ # content_answer = content_answer_match.group(1).strip()
78
+ # tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content_answer, re.DOTALL)
79
+ # if tool_call_match:
80
+ # tool_call_content = tool_call_match.group(1).strip()
81
+ # # Parse the JSON
82
+ # tool_call_json = json.loads(tool_call_content)
83
+ # arguments = tool_call_json.get("arguments", {})
84
+ # coordinate = arguments.get("coordinate", None)
85
+ # if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
86
+ # x, y = coordinate
87
+ # extracted_coordinate = [x, y]
88
+ # return extracted_coordinate
89
+ # return [0, 0]
90
+
91
+
92
+ # def extract_point_answer(content):
93
+ # # Look for content inside the <answer> tags; return [0, 0] if nothing is found
94
+ # tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
95
+ # if tool_call_match:
96
+ # tool_call_content = tool_call_match.group(1).strip()
97
+ # # First try to parse tool_call_content as JSON
98
+ # try:
99
+ # tool_call_json = json.loads(tool_call_content)
100
+ # print(tool_call_json)
101
+ # arguments = tool_call_json.get("arguments", {})
102
+ # coordinate = arguments.get("coordinate", None)
103
+ # if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
104
+ # try:
105
+ # x = float(coordinate[0])
106
+ # y = float(coordinate[1])
107
+ # return [x, y]
108
+ # except (ValueError, TypeError):
109
+ # pass # conversion failed; fall through to regex extraction
110
+ # except json.JSONDecodeError:
111
+ # pass # JSON parsing failed; fall through to regex extraction
112
+ # # Fallback: pull the last two numbers out with a regex
113
+ # numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
114
+ # if len(numbers) >= 2:
115
+ # x = float(numbers[-2])
116
+ # y = float(numbers[-1])
117
+ # return [x, y]
118
+ # return [0, 0]
119
+
120
+
121
+
122
+ def extract_point_answer(content):
123
+ # Look for a <tool_call> block; return [0, 0] if nothing can be extracted
124
+ tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
125
+ if tool_call_match:
126
+ tool_call_content = tool_call_match.group(1).strip()
127
+ # Pull the last two numbers out of the tool call with a regex
128
+ try:
129
+ numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
130
+ if len(numbers) >= 2:
131
+ x = float(numbers[-2])
132
+ y = float(numbers[-1])
133
+ return [x, y]
134
+ except json.JSONDecodeError:
135
+ pass # defensive only: re.findall never raises JSONDecodeError
136
+ # Nothing extracted; fall back to [0, 0]
137
+ return [0, 0]
138
+
139
+ def point_in_box(point, box):
140
+ x,y = point
141
+ if box[0] <= x < box[2] and box[1] <= y < box[3]:
142
+ return 1
143
+ else:
144
+ return 0
145
+
146
+ num_samples = 2000
147
+ num_all_sample = 0
148
+ num_correct_sample = 0
149
+ for ds in TEST_DATASETS:
150
+ if rank == 0:
151
+ print(f"Processing {ds}...")
152
+ ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
153
+ data = json.load(open(ds_path, "r"))
154
+ random.seed(42)
155
+ random.shuffle(data)
156
+ data = data[:num_samples]
157
+
158
+ # Split data for distributed evaluation
159
+ per_rank_data = len(data) // world_size
160
+ start_idx = rank * per_rank_data
161
+ end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
162
+ rank_data = data[start_idx:end_idx]
163
+
164
+ messages = []
165
+
166
+ for x in rank_data:
167
+ image_path = os.path.join(IMAGE_ROOT, x['img_filename'])
168
+ width,height = x['img_size'][0],x['img_size'][1]
169
+ resized_height, resized_width = smart_resize(
170
+ height,
171
+ width,
172
+ factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
173
+ min_pixels = processor.image_processor.min_pixels,
174
+ max_pixels = processor.image_processor.max_pixels,
175
+ )
176
+ system_content = """You are a helpful assistant.
177
+ #Tools
178
+
179
+ You may call one or more functions to assist with the user query.
180
+
181
+ You are provided with function signatures within <tools></tools> XML tags:
182
+ <tools>
183
+ {"type": "function", "function": {"name_for_human": "computer_use", "name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n* The screen's resolution is {{screen_width}}x{{screen_height}}.\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\n* key: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\n* type: Type a string of text on the keyboard.\n* mouse_move: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* left_click: Click the left mouse button.\n* left_click_drag: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\n* right_click: Click the right mouse button.\n* middle_click: Click the middle mouse button.\n* double_click: Double-click the left mouse button.\n* scroll: Performs a scroll of the mouse scroll wheel.\n* wait: Wait specified seconds for the change to happen.\n* terminate: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], "type": "string"}, "keys": {"description": "Required only by action=key.", "type": "array"}, "text": {"description": "Required only by action=type.", "type": "string"}, "coordinate": {"description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by action=mouse_move and action=left_click_drag.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by action=scroll.", "type": "number"}, "time": {"description": "The seconds to wait. Required only by action=wait.", "type": "number"}, "status": {"description": "The status of the task. Required only by action=terminate.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}, "args_format": "Format the arguments as a JSON object."}}
184
+ </tools>
185
+
186
+ For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
187
+ <tool_call>
188
+ {"name": <function-name>, "arguments": <args-json-object>}
189
+ </tool_call>""".replace("{{screen_width}}", str(resized_width)).replace("{{screen_height}}", str(resized_height))
190
+ message = [
191
+ {
192
+ "role": "system",
193
+ "content": [
194
+ {
195
+ "type": "text",
196
+ "text": system_content
197
+ }
198
+ ]
199
+ },
200
+ {
201
+ "role": "user",
202
+ "content": [
203
+ {
204
+ "type": "image",
205
+ "image": f"file://{image_path}"
206
+ },
207
+ {
208
+ "type": "text",
209
+ "text": x['instruction']
210
+ }
211
+ ]
212
+ },
213
+ ]
214
+ # print(message)
215
+ messages.append(message)
216
+
217
+
218
+ rank_outputs = [] # List to store answers for this rank
219
+ all_outputs = [] # List to store all answers
220
+
221
+ # Process data
222
+ for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
223
+ batch_messages = messages[i:i + BSZ]
224
+
225
+ # Preparation for inference
226
+ text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
227
+
228
+ image_inputs, video_inputs = process_vision_info(batch_messages)
229
+ inputs = processor(
230
+ text=text,
231
+ images=image_inputs,
232
+ videos=video_inputs,
233
+ padding=True,
234
+ padding_side="left",
235
+ return_tensors="pt",
236
+ )
237
+ inputs = inputs.to(device)
238
+
239
+ # Inference: Generation of the output
240
+ generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
241
+
242
+ generated_ids_trimmed = [
243
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
244
+ ]
245
+ batch_output_text = processor.batch_decode(
246
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
247
+ )
248
+
249
+ rank_outputs.extend(batch_output_text)
250
+
251
+ print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")
252
+
253
+ # Gather all outputs from all ranks
254
+ all_outputs = [None] * len(data)
255
+ rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]
256
+
257
+ gathered_results = [None] * world_size
258
+ dist.all_gather_object(gathered_results, rank_results)
259
+
260
+ assert gathered_results[-1][-1][0] == len(data) - 1
261
+
262
+ # The main process will collect all results
263
+ if rank == 0:
264
+ for results in gathered_results:
265
+ for idx, output in results:
266
+ assert idx < len(all_outputs)
267
+ all_outputs[idx] = output
268
+ assert all_outputs[-1] is not None
269
+
270
+ final_output = []
271
+ correct_number = 0
272
+
273
+ for input_example, model_output in zip(data, all_outputs):
274
+ original_output = model_output
275
+ ground_truth = input_example['bbox']
276
+ ground_truth = [ground_truth[0] / input_example['img_size'][0], ground_truth[1] / input_example['img_size'][1], (ground_truth[0]+ground_truth[2]) / input_example['img_size'][0], (ground_truth[1]+ground_truth[3]) / input_example['img_size'][1]]
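+ # ScreenSpot-v2 boxes arrive as [x, y, w, h]; adding w and h gives the bottom-right
+ # corner, after which all four values are normalized by the original image size.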
277
+ model_answer = extract_point_answer(original_output)
278
+ resized_height, resized_width = smart_resize(
279
+ input_example['img_size'][1],
280
+ input_example['img_size'][0],
281
+ factor = processor.image_processor.patch_size * processor.image_processor.merge_size,
282
+ min_pixels = processor.image_processor.min_pixels,
283
+ max_pixels = processor.image_processor.max_pixels,
284
+ )
285
+ model_answer = [model_answer[0]/resized_width,model_answer[1]/resized_height]
286
+ # Count correct answers
287
+ correct = 0
288
+ if model_answer is not None:
289
+ correct = point_in_box(model_answer, ground_truth)
290
+ correct_number += correct
291
+ num_all_sample +=1
292
+ num_correct_sample += correct
293
+
294
+ # Create a result dictionary for this example
295
+ result = {
296
+ 'image': input_example['img_filename'],
297
+ 'question': input_example['instruction'],
298
+ 'resized_size': [resized_height, resized_width],
299
+ 'ground_truth': ground_truth,
300
+ 'model_output': original_output,
301
+ 'extracted_answer': model_answer,
302
+ 'correct': correct
303
+ }
304
+ final_output.append(result)
305
+
306
+ # Calculate and print accuracy
307
+ accuracy = correct_number / len(data) * 100
308
+ print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
309
+
310
+ # Save results to a JSON file
311
+ output_path = OUTPUT_PATH.format(DATASET=ds, STEPS=steps)
312
+ output_dir = os.path.dirname(output_path)
313
+ if not os.path.exists(output_dir):
314
+ os.makedirs(output_dir)
315
+ with open(output_path, "w") as f:
316
+ json.dump({
317
+ 'accuracy': accuracy,
318
+ 'results': final_output
319
+ }, f, indent=2)
320
+
321
+ print(f"Results saved to {output_path}")
322
+ print("-"*100)
323
+ # Final statistics and printing are done inside the rank == 0 block
324
+ if rank == 0:
325
+ accuracy = num_correct_sample / num_all_sample * 100
326
+ print(f"\nnumber of correct samples: {num_correct_sample}")
327
+ print(f"number of all samples: {num_all_sample}")
328
+ print(f"Accuracy of all datasets: {accuracy:.2f}%")
329
+
330
+ # Synchronize all processes
331
+ dist.barrier()
332
+
333
+
334
+
335
+
eval/test_od_r1.py ADDED
@@ -0,0 +1,178 @@
1
+ import re
2
+ import os
3
+ import json
4
+ import torch
5
+ import random
6
+
7
+ from tqdm import tqdm
8
+ from pprint import pprint
9
+ from qwen_vl_utils import process_vision_info
10
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
11
+
12
+
13
+ def extract_bbox_answer(content):
14
+ pattern = r'```json(.*?)```'
15
+ json_match = re.search(pattern, content, re.DOTALL)
16
+ bbox_json = json_match.group(1).strip() if json_match else None
17
+
18
+ if bbox_json:
19
+ try:
20
+ bbox = json.loads(bbox_json)[0]['bbox_2d']
21
+ return bbox, False
22
+ except (json.JSONDecodeError, KeyError, IndexError, TypeError):
23
+ return [0, 0, 0, 0], False
24
+ else:
25
+ return [0, 0, 0, 0], False
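+ # Expects a fenced completion in the Qwen2.5-VL grounding style, e.g.:
+ #   ```json
+ #   [{"bbox_2d": [x1, y1, x2, y2], "label": "..."}]
+ #   ```
+ # The boolean flags whether the box is normalized; this parser only ever returns
+ # pixel-space boxes, hence the constant False.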
26
+
27
+
28
+ def iou(box1, box2):
29
+ inter_x1 = max(box1[0], box2[0])
30
+ inter_y1 = max(box1[1], box2[1])
31
+ inter_x2 = min(box1[2] - 1, box2[2] - 1)
32
+ inter_y2 = min(box1[3] - 1, box2[3] - 1)
33
+ if inter_x1 < inter_x2 and inter_y1 < inter_y2:
34
+ inter = (inter_x2 - inter_x1 + 1) * (inter_y2 - inter_y1 + 1)
35
+ else:
36
+ inter = 0
37
+ union = (box1[2] - box1[0]) * (box1[3] - box1[1]) + (box2[2] - box2[0]) * (box2[3] - box2[1]) - inter
38
+ return float(inter) / union
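+ # Worked example: iou([0, 0, 10, 10], [5, 5, 15, 15]) = 25 / 175 ≈ 0.143; the +1/-1
+ # terms treat boxes as inclusive pixel grids rather than continuous rectangles.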
39
+
40
+
41
+ def load_model(model_path, device_map):
42
+ #We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
43
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
44
+ model_path,
45
+ torch_dtype=torch.bfloat16,
46
+ attn_implementation="flash_attention_2",
47
+ device_map=device_map,
48
+ )
49
+
50
+ # default processor
51
+ processor = AutoProcessor.from_pretrained(model_path)
52
+
53
+ return model, processor
54
+
55
+
56
+ def eval_od_r1(
57
+ model_path, test_datasets, data_root, image_root, question_template, output_dir, batch_size=32, sample_num=500, seed=42, device_map="cuda:0"
58
+ ):
59
+ random.seed(seed)
60
+ model, processor = load_model(model_path, device_map)
61
+
62
+ for ds in test_datasets:
63
+ print(f"Processing {ds}...")
64
+
65
+ ds_path = os.path.join(data_root, f"{ds}.json")
66
+ data = json.load(open(ds_path, "r"))
67
+ random.shuffle(data)
68
+ data = data[:sample_num]
69
+ messages = []
70
+
71
+ for x in data:
72
+ image_path = os.path.join(image_root, x['image'])
73
+ messages.append(
74
+ [
75
+ {
76
+ "role":
77
+ "user",
78
+ "content":
79
+ [
80
+ {
81
+ "type": "image",
82
+ "image": f"file://{image_path}"
83
+ }, {
84
+ "type": "text",
85
+ "text": question_template.format(Question=x['normal_caption'])
86
+ }
87
+ ]
88
+ }
89
+ ]
90
+ )
91
+
92
+ all_outputs = [] # List to store all answers
93
+
94
+ # Process data
95
+ for i in tqdm(range(0, len(messages), batch_size)):
96
+ batch_messages = messages[i:i + batch_size]
97
+
98
+ # Preparation for inference
99
+ text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
100
+
101
+ image_inputs, video_inputs = process_vision_info(batch_messages)
102
+ inputs = processor(
103
+ text=text,
104
+ images=image_inputs,
105
+ videos=video_inputs,
106
+ padding=True,
107
+ return_tensors="pt",
108
+ )
109
+ inputs = inputs.to(device_map)
110
+
111
+ # Inference: Generation of the output
112
+ generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
113
+
114
+ generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
115
+ batch_output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
116
+ all_outputs.extend(batch_output_text)
117
+
118
+ final_output = []
119
+ correct_number = 0
120
+
121
+ for input_example, model_output in zip(data, all_outputs):
122
+ original_output = model_output
123
+ ground_truth = input_example['solution']
124
+ ground_truth_normalized = input_example['normalized_solution']
125
+ model_answer, normalized = extract_bbox_answer(original_output)
126
+
127
+ # Count correct answers
128
+ correct = 0
129
+ if model_answer is not None:
130
+ iou_value = iou(model_answer, ground_truth_normalized if normalized else ground_truth)
131
+ if iou_value > 0.5:
132
+ correct = 1
133
+ correct_number += correct
134
+
135
+ # Create a result dictionary for this example
136
+ result = {
137
+ "question": question_template.format(Question=input_example['normal_caption']),
138
+ "ground_truth": ground_truth if not normalized else ground_truth_normalized,
139
+ "model_output": original_output,
140
+ "extracted_answer": model_answer,
141
+ "correct": correct,
142
+ "iou": iou_value
143
+ }
144
+ final_output.append(result)
145
+
146
+ # Calculate and print accuracy
147
+ accuracy = correct_number / len(data) * 100
148
+ print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
149
+
150
+ # Save results to a JSON file
151
+ result_path = os.path.join(output_dir, f"{os.path.basename(model_path)}", f"{ds}_od_r1.json")
152
+ os.makedirs(os.path.dirname(result_path), exist_ok=True)
153
+ with open(result_path, "w") as f:
154
+ json.dump({"accuracy": accuracy, "results": final_output}, f, indent=2)
155
+
156
+ print(f"Results saved to {result_path}")
157
+ print('-' * 100)
158
+
159
+
160
+ if __name__ == "__main__":
161
+ model_path = '' # Add the path to the model
162
+ data_root = '' # Add the data root
163
+ test_datasets = ['refcoco_val', 'refcocop_val', 'refcocog_val'] # modify the datasets
164
+ image_root = '' # Add the image root
165
+ output_dir = 'logs' # Add the output directory, default is logs
166
+ device_map = 'cuda:0' # select the device, default is cuda:0
167
+
168
+ question_template = '{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags. Output the final answer in JSON format.' # modify the question template which must contain {Question}, {Question} will be replaced by the caption
169
+
170
+ eval_od_r1(
171
+ model_path=model_path,
172
+ data_root=data_root,
173
+ test_datasets=test_datasets,
174
+ image_root=image_root,
175
+ question_template=question_template,
176
+ output_dir=output_dir,
177
+ device_map=device_map
178
+ )
eval/test_rec_baseline.py ADDED
@@ -0,0 +1,225 @@
1
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
+ from qwen_vl_utils import process_vision_info
3
+ import torch
4
+ import json
5
+ from tqdm import tqdm
6
+ import re
7
+ import os
8
+ from pprint import pprint
9
+ import random
10
+
11
+
12
+ import torch.distributed as dist
13
+ from torch.nn.parallel import DistributedDataParallel as DDP
14
+ import argparse
15
+
16
+ import warnings
17
+
18
+ warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
19
+
20
+ def setup_distributed():
21
+ local_rank = int(os.environ.get("LOCAL_RANK", 0))
22
+ torch.cuda.set_device(local_rank)
23
+
24
+ dist.init_process_group(backend="nccl")
25
+
26
+ world_size = dist.get_world_size()
27
+ rank = dist.get_rank()
28
+
29
+ print(f"Process {rank}/{world_size} initialized on cuda:{local_rank}")
30
+ return local_rank, world_size, rank
31
+
32
+ local_rank, world_size, rank = setup_distributed()
33
+ device = f"cuda:{local_rank}"
34
+
35
+ steps = 100
36
+ MODEL_PATH=f"/data10/shz/project/LLaMA-Factory/saves/qwen2_5_vl-3b/full/sft/checkpoint-{steps}"
37
+ OUTPUT_PATH="./logs/rec_results_{DATASET}_qwen2_5vl_3b_instruct_sft_{STEPS}.json"
38
+
39
+ # MODEL_PATH = "/data10/shz/ckpt/vlm-r1-related/Qwen2.5-VL-3B-Instruct"
40
+ # OUTPUT_PATH = "./logs/rec_results_{DATASET}_qwen2_5vl_3b_instruct_baseline_{STEPS}.json"
41
+
42
+ BSZ=4
43
+ DATA_ROOT = "/data10/shz/dataset/rec/rec_jsons_processed"
44
+
45
+ TEST_DATASETS = ['refcoco_val', 'refcocop_val', 'refcocog_val']
46
+ IMAGE_ROOT = "/data10/shz/dataset/coco"
47
+
48
+ # TEST_DATASETS = ['lisa_test']
49
+ # IMAGE_ROOT = "/data10/shz/dataset/lisa"
50
+
51
+ #We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
52
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
53
+ MODEL_PATH,
54
+ torch_dtype=torch.bfloat16,
55
+ attn_implementation="flash_attention_2",
56
+ device_map={"": local_rank},
57
+ )
58
+
59
+ # default processor
60
+ processor = AutoProcessor.from_pretrained(MODEL_PATH)
61
+
62
+ def extract_bbox_answer(content):
63
+ bbox_pattern = r'\[(\s*-?\d*\.?\d+\s*),\s*(\s*-?\d*\.?\d+\s*),\s*(\s*-?\d*\.?\d+\s*),\s*(\s*-?\d*\.?\d+\s*)\]'
64
+ # bbox_pattern = r'\[(-?\d*\.?\d+),\s*(-?\d*\.?\d+),\s*(-?\d*\.?\d+),\s*(-?\d*\.?\d+)\]'
65
+ bbox_match = re.search(bbox_pattern, content)
66
+
67
+ if bbox_match:
68
+ bbox = [float(bbox_match.group(1)), float(bbox_match.group(2)), float(bbox_match.group(3)), float(bbox_match.group(4))]
69
+ return bbox
70
+ return [0, 0, 0, 0]
71
+
72
+ def iou(box1, box2):
73
+ inter_x1 = max(box1[0], box2[0])
74
+ inter_y1 = max(box1[1], box2[1])
75
+ inter_x2 = min(box1[2]-1, box2[2]-1)
76
+ inter_y2 = min(box1[3]-1, box2[3]-1)
77
+ if inter_x1 < inter_x2 and inter_y1 < inter_y2:
78
+ inter = (inter_x2-inter_x1+1)*(inter_y2-inter_y1+1)
79
+ else:
80
+ inter = 0
81
+ union = (box1[2]-box1[0])*(box1[3]-box1[1]) + (box2[2]-box2[0])*(box2[3]-box2[1]) - inter
82
+ return float(inter)/union
83
+
84
+ num_samples = 2000
85
+ for ds in TEST_DATASETS:
86
+ if rank == 0:
87
+ print(f"Processing {ds}...")
88
+ ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
89
+ data = json.load(open(ds_path, "r"))
90
+ random.seed(42)
91
+ random.shuffle(data)
92
+ data = data[:num_samples]
93
+ # QUESTION_TEMPLATE = "{Question}" if steps > 0 else "{Question} Please provide the bounding box coordinate in JSON format."
94
+ QUESTION_TEMPLATE = "{Question} Please provide the bounding box coordinate in JSON format."
95
+
96
+ # Split data for distributed evaluation
97
+ per_rank_data = len(data) // world_size
98
+ start_idx = rank * per_rank_data
99
+ end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
100
+ rank_data = data[start_idx:end_idx]
101
+
102
+ messages = []
103
+
104
+ for x in rank_data:
105
+ image_path = os.path.join(IMAGE_ROOT, x['image'])
106
+ message = [
107
+ # {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
108
+ {
109
+ "role": "user",
110
+ "content": [
111
+ {
112
+ "type": "image",
113
+ "image": f"file://{image_path}"
114
+ },
115
+ {
116
+ "type": "text",
117
+ "text": QUESTION_TEMPLATE.format(Question=x['problem'])
118
+ }
119
+ ]
120
+ }]
121
+ messages.append(message)
122
+
123
+ rank_outputs = [] # List to store answers for this rank
124
+ all_outputs = [] # List to store all answers
125
+
126
+ # Process data
127
+ for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
128
+ batch_messages = messages[i:i + BSZ]
129
+
130
+ # Preparation for inference
131
+ text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
132
+
133
+ image_inputs, video_inputs = process_vision_info(batch_messages)
134
+ inputs = processor(
135
+ text=text,
136
+ images=image_inputs,
137
+ videos=video_inputs,
138
+ padding=True,
139
+ padding_side="left",
140
+ return_tensors="pt",
141
+ )
142
+ inputs = inputs.to(device)
143
+
144
+ # Inference: Generation of the output
145
+ generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
146
+
147
+ generated_ids_trimmed = [
148
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
149
+ ]
150
+ batch_output_text = processor.batch_decode(
151
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
152
+ )
153
+
154
+ rank_outputs.extend(batch_output_text)
155
+
156
+ print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")
157
+
158
+ # Gather all outputs from all ranks
159
+ all_outputs = [None] * len(data)
160
+ rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]
161
+
162
+ gathered_results = [None] * world_size
163
+ dist.all_gather_object(gathered_results, rank_results)
164
+
165
+ assert gathered_results[-1][-1][0] == len(data) - 1
166
+
167
+ # The main process will collect all results
168
+ if rank == 0:
169
+ for results in gathered_results:
170
+ for idx, output in results:
171
+ assert idx < len(all_outputs)
172
+ all_outputs[idx] = output
173
+ assert all_outputs[-1] is not None
174
+
175
+ final_output = []
176
+ correct_number = 0
177
+
178
+ for input_example, model_output in zip(data, all_outputs):
179
+ original_output = model_output
180
+ ground_truth = input_example['solution']
181
+ model_answer = extract_bbox_answer(original_output)
182
+
183
+ # Count correct answers
184
+ correct = 0
185
+ if model_answer is not None:
186
+ if iou(model_answer, ground_truth) > 0.5:
187
+ correct = 1
188
+ correct_number += correct
189
+
190
+ # Create a result dictionary for this example
191
+ result = {
192
+ 'image': input_example['image'],
193
+ 'question': input_example['problem'],
194
+ 'ground_truth': ground_truth,
195
+ 'model_output': original_output,
196
+ 'extracted_answer': model_answer,
197
+ 'correct': correct
198
+ }
199
+ final_output.append(result)
200
+
201
+ # Calculate and print accuracy
202
+ accuracy = correct_number / len(data) * 100
203
+ print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
204
+
205
+ # Save results to a JSON file
206
+ output_path = OUTPUT_PATH.format(DATASET=ds, STEPS=steps)
207
+ output_dir = os.path.dirname(output_path)
208
+ if not os.path.exists(output_dir):
209
+ os.makedirs(output_dir)
210
+ with open(output_path, "w") as f:
211
+ json.dump({
212
+ 'accuracy': accuracy,
213
+ 'results': final_output
214
+ }, f, indent=2)
215
+
216
+ print(f"Results saved to {output_path}")
217
+ print("-"*100)
218
+
219
+ # Synchronize all processes
220
+ dist.barrier()
221
+
222
+
223
+
224
+
225
+
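The extract_bbox_answer and iou helpers above are pure functions, so they are easy to sanity-check in isolation. A small worked example (values computed by hand from the definitions above; run it after the helpers are defined):

    content = "The plate is at [10, 20, 110, 220] in the image."
    print(extract_bbox_answer(content))    # [10.0, 20.0, 110.0, 220.0]
    print(extract_bbox_answer("no box"))   # falls back to [0, 0, 0, 0]

    # Two 10x10 boxes overlapping in a 5x5 corner:
    # inter = 5*5 = 25, union = 100 + 100 - 25 = 175, IoU = 25/175 ≈ 0.143
    print(iou([0, 0, 10, 10], [5, 5, 15, 15]))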
eval/test_rec_r1.py ADDED
@@ -0,0 +1,232 @@
1
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
2
+ from qwen_vl_utils import process_vision_info
3
+ import torch
4
+ import json
5
+ from tqdm import tqdm
6
+ import re
7
+ import os
8
+ from pprint import pprint
9
+ import random
10
+
11
+
12
+ import torch.distributed as dist
13
+ from torch.nn.parallel import DistributedDataParallel as DDP
14
+ import argparse
15
+
16
+ import warnings
17
+
18
+ warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
19
+
20
+ def setup_distributed():
21
+ local_rank = int(os.environ.get("LOCAL_RANK", 0))
22
+ torch.cuda.set_device(local_rank)
23
+
24
+ dist.init_process_group(backend="nccl")
25
+
26
+ world_size = dist.get_world_size()
27
+ rank = dist.get_rank()
28
+
29
+ return local_rank, world_size, rank
30
+
31
+ local_rank, world_size, rank = setup_distributed()
32
+ device = f"cuda:{local_rank}"
33
+ print(f"Process {rank} using {device}")
34
+
35
+ steps = 100
36
+ if rank == 0:
37
+ print("Steps: ", steps)
38
+
39
+ RUN_NAME = "Qwen2.5-VL-7B-GRPO-GUI-Grounding_image_size"
40
+
41
+ MODEL_PATH=f"/data/vjuicefs_ai_camera_jgroup_research/public_data/11179904/code/VLM_R1_CUSTOM/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
42
+ OUTPUT_PATH="./logs/rec_results_{DATASET}_{RUN_NAME}_{STEPS}.json"
43
+
44
+ BSZ=16
45
+ DATA_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/data/rec_jsons_processed"
46
+
47
+ TEST_DATASETS = ['refcoco_val', 'refcocop_val', 'refcocog_val']
48
+ IMAGE_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/data"
49
+
50
+
51
+ # TEST_DATASETS = ['lisa_test']
52
+ # IMAGE_ROOT = "/data10/shz/dataset/lisa"
53
+
54
+
55
+ # We recommend enabling flash_attention_2 for better acceleration and memory savings, especially in multi-image and video scenarios.
56
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
57
+ MODEL_PATH,
58
+ torch_dtype=torch.bfloat16,
59
+ attn_implementation="flash_attention_2",
60
+ device_map={"": local_rank},
61
+ )
62
+
63
+ # default processor
64
+ processor = AutoProcessor.from_pretrained(MODEL_PATH)
65
+
66
+ def extract_bbox_answer(content):
67
+ # Try to find the bbox within <answer> tags; if none is found, return [0, 0, 0, 0]
68
+ answer_tag_pattern = r'<answer>(.*?)</answer>'
69
+ bbox_pattern = r'\{.*\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)]\s*.*\}'
70
+ content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
71
+ if content_answer_match:
72
+ content_answer = content_answer_match.group(1).strip()
73
+ bbox_match = re.search(bbox_pattern, content_answer, re.DOTALL)
74
+ if bbox_match:
75
+ bbox = [int(bbox_match.group(1)), int(bbox_match.group(2)), int(bbox_match.group(3)), int(bbox_match.group(4))]
76
+ return bbox
77
+ return [0, 0, 0, 0]
78
+
79
+ def iou(box1, box2):
80
+ inter_x1 = max(box1[0], box2[0])
81
+ inter_y1 = max(box1[1], box2[1])
82
+ inter_x2 = min(box1[2]-1, box2[2]-1)
83
+ inter_y2 = min(box1[3]-1, box2[3]-1)
84
+ if inter_x1 < inter_x2 and inter_y1 < inter_y2:
85
+ inter = (inter_x2-inter_x1+1)*(inter_y2-inter_y1+1)
86
+ else:
87
+ inter = 0
88
+ union = (box1[2]-box1[0])*(box1[3]-box1[1]) + (box2[2]-box2[0])*(box2[3]-box2[1]) - inter
89
+ return float(inter)/union
90
+
91
+ num_samples = 2000
92
+ for ds in TEST_DATASETS:
93
+ if rank == 0:
94
+ print(f"Processing {ds}...")
95
+ ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
96
+ data = json.load(open(ds_path, "r"))
97
+ random.seed(42)
98
+ random.shuffle(data)
99
+ data = data[:num_samples]
100
+
101
+ QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags. Output the final answer in JSON format."
102
+
103
+ # Split data for distributed evaluation
104
+ per_rank_data = len(data) // world_size
105
+ start_idx = rank * per_rank_data
106
+ end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
107
+ rank_data = data[start_idx:end_idx]
108
+
109
+ messages = []
110
+
111
+ for x in rank_data:
112
+ image_path = os.path.join(IMAGE_ROOT, x['image'])
113
+ message = [
114
+ # {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
115
+ {
116
+ "role": "user",
117
+ "content": [
118
+ {
119
+ "type": "image",
120
+ "image": f"file://{image_path}"
121
+ },
122
+ {
123
+ "type": "text",
124
+ "text": QUESTION_TEMPLATE.format(Question=x['problem'])
125
+ }
126
+ ]
127
+ }]
128
+ messages.append(message)
129
+
130
+ rank_outputs = [] # List to store answers for this rank
131
+ all_outputs = [] # List to store all answers
132
+
133
+ # Process data
134
+ for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
135
+ batch_messages = messages[i:i + BSZ]
136
+
137
+ # Preparation for inference
138
+ text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
139
+
140
+ image_inputs, video_inputs = process_vision_info(batch_messages)
141
+ inputs = processor(
142
+ text=text,
143
+ images=image_inputs,
144
+ videos=video_inputs,
145
+ padding=True,
146
+ padding_side="left",
147
+ return_tensors="pt",
148
+ )
149
+ inputs = inputs.to(device)
150
+
151
+ # Inference: Generation of the output
152
+ generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
153
+
154
+ generated_ids_trimmed = [
155
+ out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
156
+ ]
157
+ batch_output_text = processor.batch_decode(
158
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
159
+ )
160
+
161
+ rank_outputs.extend(batch_output_text)
162
+
163
+ print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")
164
+
165
+ # Gather all outputs from all ranks
166
+ all_outputs = [None] * len(data)
167
+ rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]
168
+
169
+ gathered_results = [None] * world_size
170
+ dist.all_gather_object(gathered_results, rank_results)
171
+
172
+ assert gathered_results[-1][-1][0] == len(data) - 1
173
+
174
+ # The main process will collect all results
175
+ if rank == 0:
176
+ for results in gathered_results:
177
+ for idx, output in results:
178
+ assert idx < len(all_outputs)
179
+ all_outputs[idx] = output
180
+ assert all_outputs[-1] is not None
181
+
182
+ final_output = []
183
+ correct_number = 0
184
+
185
+ for input_example, model_output in zip(data, all_outputs):
186
+ original_output = model_output
187
+ ground_truth = input_example['solution']
188
+ model_answer = extract_bbox_answer(original_output)
189
+
190
+ # Count correct answers
191
+ correct = 0
192
+ if model_answer is not None:
193
+ if iou(model_answer, ground_truth) > 0.5:
194
+ correct = 1
195
+ correct_number += correct
196
+
197
+ # Create a result dictionary for this example
198
+ result = {
199
+ 'image': input_example['image'],
200
+ 'question': input_example['problem'],
201
+ 'ground_truth': ground_truth,
202
+ 'model_output': original_output,
203
+ 'extracted_answer': model_answer,
204
+ 'correct': correct
205
+ }
206
+ final_output.append(result)
207
+
208
+ # Calculate and print accuracy
209
+ accuracy = correct_number / len(data) * 100
210
+ print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
211
+
212
+ # Save results to a JSON file
213
+ output_path = OUTPUT_PATH.format(DATASET=ds, RUN_NAME=RUN_NAME, STEPS=steps)
214
+ output_dir = os.path.dirname(output_path)
215
+ if not os.path.exists(output_dir):
216
+ os.makedirs(output_dir)
217
+ with open(output_path, "w") as f:
218
+ json.dump({
219
+ 'accuracy': accuracy,
220
+ 'results': final_output
221
+ }, f, indent=2)
222
+
223
+ print(f"Results saved to {output_path}")
224
+ print("-"*100)
225
+
226
+ # Synchronize all processes
227
+ dist.barrier()
228
+
229
+
230
+
231
+
232
+
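Unlike the baseline script, extract_bbox_answer here only accepts a box that sits inside an <answer> block and inside a JSON-style object, which matches the QUESTION_TEMPLATE above. A hand-checked example (the "bbox" key name is illustrative; the regex accepts any JSON object containing a four-integer list):

    response = (
        "<think>The leftmost plate matches the description.</think>"
        '<answer>{"bbox": [10, 20, 110, 220]}</answer>'
    )
    print(extract_bbox_answer(response))              # [10, 20, 110, 220]
    print(extract_bbox_answer("[10, 20, 110, 220]"))  # [0, 0, 0, 0]: no <answer> tag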
open-r1-multimodal/.gitignore ADDED
@@ -0,0 +1,178 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # PyPI configuration file
171
+ .pypirc
172
+
173
+ # Temp folders
174
+ data/
175
+ wandb/
176
+ scripts/
177
+ checkpoints/
178
+ .vscode/
open-r1-multimodal/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
open-r1-multimodal/Makefile ADDED
@@ -0,0 +1,20 @@
1
+ .PHONY: style quality
2
+
3
+ # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!)
4
+ export PYTHONPATH = src
5
+
6
+ check_dirs := src
7
+
8
+ style:
9
+ black --line-length 119 --target-version py310 $(check_dirs) setup.py
10
+ isort $(check_dirs) setup.py
11
+
12
+ quality:
13
+ black --check --line-length 119 --target-version py310 $(check_dirs) setup.py
14
+ isort --check-only $(check_dirs) setup.py
15
+ flake8 --max-line-length 119 $(check_dirs) setup.py
16
+
17
+
18
+ # Evaluation
19
+
20
+ evaluate:
open-r1-multimodal/configs/ddp.yaml ADDED
@@ -0,0 +1,16 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ distributed_type: MULTI_GPU
4
+ downcast_bf16: 'no'
5
+ gpu_ids: all
6
+ machine_rank: 0
7
+ main_training_function: main
8
+ mixed_precision: bf16
9
+ num_machines: 1
10
+ num_processes: 8
11
+ rdzv_backend: static
12
+ same_network: true
13
+ tpu_env: []
14
+ tpu_use_cluster: false
15
+ tpu_use_sudo: false
16
+ use_cpu: false
open-r1-multimodal/configs/zero2.yaml ADDED
@@ -0,0 +1,21 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ deepspeed_config:
4
+ deepspeed_multinode_launcher: standard
5
+ offload_optimizer_device: none
6
+ offload_param_device: none
7
+ zero3_init_flag: false
8
+ zero_stage: 2
9
+ distributed_type: DEEPSPEED
10
+ downcast_bf16: 'no'
11
+ machine_rank: 0
12
+ main_training_function: main
13
+ mixed_precision: bf16
14
+ num_machines: 1
15
+ num_processes: 8
16
+ rdzv_backend: static
17
+ same_network: true
18
+ tpu_env: []
19
+ tpu_use_cluster: false
20
+ tpu_use_sudo: false
21
+ use_cpu: false
open-r1-multimodal/configs/zero3.yaml ADDED
@@ -0,0 +1,22 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ debug: false
3
+ deepspeed_config:
4
+ deepspeed_multinode_launcher: standard
5
+ offload_optimizer_device: none
6
+ offload_param_device: none
7
+ zero3_init_flag: true
8
+ zero3_save_16bit_model: true
9
+ zero_stage: 3
10
+ distributed_type: DEEPSPEED
11
+ downcast_bf16: 'no'
12
+ machine_rank: 0
13
+ main_training_function: main
14
+ mixed_precision: bf16
15
+ num_machines: 1
16
+ num_processes: 8
17
+ rdzv_backend: static
18
+ same_network: true
19
+ tpu_env: []
20
+ tpu_use_cluster: false
21
+ tpu_use_sudo: false
22
+ use_cpu: false
open-r1-multimodal/data_config/rec.yaml ADDED
@@ -0,0 +1,4 @@
1
+ datasets:
2
+ - json_path: /data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/data/rec_jsons_processed/refcoco_train.json
3
+ - json_path: /data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/data/rec_jsons_processed/refcocop_train.json
4
+ - json_path: /data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/data/rec_jsons_processed/refcocog_train.json
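A minimal sketch of how a datasets config in this shape can be consumed (the real loader lives in grpo_rec.py; this assumes each json_path points to a JSON list of samples, as the eval scripts above suggest):

    import json
    import yaml

    with open("data_config/rec.yaml") as f:
        config = yaml.safe_load(f)

    samples = []
    for entry in config["datasets"]:
        with open(entry["json_path"]) as jf:
            samples.extend(json.load(jf))  # each file is a list of REC samples
    print(f"loaded {len(samples)} training samples")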
open-r1-multimodal/data_jsonl/gui_multi-image.jsonl ADDED
The diff for this file is too large to render. See raw diff
open-r1-multimodal/local_scripts/lmms_eval_qwen2vl.sh ADDED
@@ -0,0 +1,61 @@
1
+ export HF_HOME="<CACHE_DIR>"
2
+ export HF_TOKEN="<HF_TOKEN>"
3
+ export HF_HUB_ENABLE_HF_TRANSFER="1"
4
+
5
+ export API_TYPE="<API_TYPE>"
6
+ export AZURE_ENDPOINT="<AZURE_ENDPOINT>"
7
+ export AZURE_API_KEY="<API_KEY>"
8
+ export API_VERSION="<API_VERSION>"
9
+ export MODEL_VERSION="<MODEL_VERSION>"
10
+ export NAVIT_ATTENTION_IMPLEMENTATION="eager"
11
+
12
+ # Prompt for installation with 3-second timeout
13
+ read -t 3 -p "Do you want to install dependencies? (YES/no, timeout in 3s): " install_deps || true
14
+ if [ "$install_deps" = "YES" ]; then
15
+ # Prepare the environment
16
+ pip3 install --upgrade pip
17
+ pip3 install -U setuptools
18
+
19
+ cd <PROJECT_ROOT>
20
+ if [ ! -d "maas_engine" ]; then
21
+ git clone <REPO_URL>
22
+ else
23
+ echo "maas_engine directory already exists, skipping clone"
24
+ fi
25
+ cd maas_engine
26
+ git pull
27
+ git checkout <BRANCH_NAME>
28
+ pip3 install --no-cache-dir --no-build-isolation -e ".[standalone]"
29
+
30
+ current_version=$(pip3 show transformers | grep Version | cut -d' ' -f2)
31
+ if [ "$current_version" != "4.46.2" ]; then
32
+ echo "Installing transformers 4.46.2 (current version: $current_version)"
33
+ pip3 install transformers==4.46.2
34
+ else
35
+ echo "transformers 4.46.2 is already installed"
36
+ fi
37
+
38
+ cd <LMMS_EVAL_DIR>
39
+ rm -rf <TARGET_DIR>
40
+ pip3 install -e .
41
+ pip3 install -U pydantic
42
+ pip3 install Levenshtein
43
+ pip3 install nltk
44
+ python3 -c "import nltk; nltk.download('wordnet', quiet=True); nltk.download('punkt', quiet=True)"
45
+ fi
46
+
47
+ TASKS=mmmu_val,mathvista_testmini,mmmu_pro
48
+ MODEL_BASENAME=qwen2_vl
49
+
50
+ model_checkpoint="<MODEL_CHECKPOINT_PATH>"
51
+ echo "MODEL_BASENAME: ${MODEL_BASENAME}"
52
+ cd <LMMS_EVAL_DIR>
53
+
54
+ python3 -m accelerate.commands.launch --num_processes=8 --main_process_port=12345 lmms_eval \
55
+ --model qwen2_vl \
56
+ --model_args=pretrained=${model_checkpoint},max_pixels=2359296 \
57
+ --tasks ${TASKS} \
58
+ --batch_size 1 \
59
+ --log_samples \
60
+ --log_samples_suffix ${MODEL_BASENAME} \
61
+ --output_path ./logs
open-r1-multimodal/local_scripts/prepare_hf_data.py ADDED
@@ -0,0 +1,166 @@
1
+ import matplotlib.pyplot as plt
2
+ import seaborn as sns
3
+ import pandas as pd
4
+ import random
5
+ from typing import List, Dict
6
+ import numpy as np
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ from tqdm import tqdm
9
+ import datasets
10
+
11
+ import io
12
+ from datasets import load_dataset, load_from_disk, concatenate_datasets
13
+ from PIL import Image
14
+ from tqdm import tqdm
15
+ from functools import partial
16
+ from pillow_avif import AvifImagePlugin
17
+ from datasets import Dataset
18
+ import json
19
+ import yaml
20
+ import os
21
+ import re
22
+ import time
23
+ import random
24
+ import base64
25
+ from openai import AzureOpenAI
26
+ import concurrent.futures
27
+ from typing import List, Dict
28
+ import argparse
29
+ import time
30
+
31
+
32
+ def extract_problem_solution(gpt4o_response):
33
+ # Split the response into parts
34
+ parts = gpt4o_response.split("<think>")
35
+
36
+ # Extract the problem (first part before any <think> tags)
37
+ problem = parts[0].strip()
38
+ # Remove "Question:" prefix if it exists
39
+ problem = re.sub(r"^Question:\s*", "", problem)
40
+ # Remove "Answer:" at the end of the problem
41
+ problem = re.sub(r"\s*Answer:\s*$", "", problem).strip()
42
+
43
+ # Combine all the reasoning steps into a single <think> block
44
+ think_parts = [p.split("</think>")[0].strip() for p in parts[1:] if "</think>" in p]
45
+ solution = f"<think>{' '.join(think_parts)}</think>"
46
+
47
+ # Add the final answer if it exists, removing "Answer:" prefix
48
+ if "<answer>" in gpt4o_response:
49
+ final_answer = (
50
+ gpt4o_response.split("<answer>")[-1].split("</answer>")[0].strip()
51
+ )
52
+ final_answer = re.sub(r"^Answer:\s*", "", final_answer)
53
+ solution += f"\n\n<answer>{final_answer}</answer>"
54
+
55
+ return problem, solution
56
+
57
+
58
+ def load_image_from_path(image_path):
59
+ try:
60
+ img = Image.open(image_path)
61
+ return img
62
+ except Exception as e:
63
+ print(f"Error loading image {image_path}: {str(e)}")
64
+ return None
65
+
66
+
67
+ def process_raw_data(raw_data):
68
+ # Parse the raw data if it's a string
69
+ if isinstance(raw_data, str):
70
+ data = json.loads(raw_data)
71
+ else:
72
+ data = raw_data
73
+
74
+ # Extract problem and solution
75
+ try:
76
+ problem, solution = extract_problem_solution(data["gpt4o_response"])
77
+ image = load_image_from_path(data["image_path"])
78
+
79
+ return {
80
+ "image": image,
81
+ "problem": problem,
82
+ "solution": solution,
83
+ "original_question": data["question"],
84
+ "original_answer": data["answer"],
85
+ }
86
+ except Exception as e:
87
+ print(f"Error processing data {data}: {str(e)}")
88
+ return {
89
+ "image": None,
90
+ "problem": None,
91
+ "solution": None,
92
+ "original_question": None,
93
+ "original_answer": None,
94
+ }
95
+
96
+
97
+ raw_data_list = [
98
+ "/path/to/reasoning_data_with_response_90k_verified",
99
+ ]
100
+
101
+ raw_data = concatenate_datasets([load_from_disk(path) for path in raw_data_list])
102
+
103
+ processed_data = raw_data.map(process_raw_data, num_proc=128).shuffle(seed=42)
104
+
105
+ hf_dict = {
106
+ "image": [],
107
+ "problem": [],
108
+ "solution": [],
109
+ "original_question": [],
110
+ "original_answer": [],
111
+ }
112
+
113
+ for item in tqdm(processed_data):
114
+ hf_dict["image"].append(item["image"])
115
+ hf_dict["problem"].append(item["problem"])
116
+ hf_dict["solution"].append(item["solution"])
117
+ hf_dict["original_question"].append(item["original_question"])
118
+ hf_dict["original_answer"].append(item["original_answer"])
119
+
120
+
121
+ features = datasets.Features(
122
+ {
123
+ "image": datasets.Image(),
124
+ "problem": datasets.Value("string"),
125
+ "solution": datasets.Value("string"),
126
+ "original_question": datasets.Value("string"),
127
+ "original_answer": datasets.Value("string"),
128
+ }
129
+ )
130
+
131
+
132
+ def has_empty_tags(text):
133
+ # Pattern to match empty tags like <tag></tag>
134
+ pattern = r"<[^>]+></[^>]+>"
135
+ return bool(re.search(pattern, text))
136
+
137
+
138
+ def has_answer_pattern(text):
139
+ if "Answer:" in text:
140
+ return True
141
+ return False
142
+
143
+
144
+ def has_valid_image_size(example): # for Qwen2-VL-2B's processor requirement
145
+ # Assuming the image is in a format that can be checked for dimensions
146
+ # You might need to adjust this depending on how the image is stored in your dataset
147
+ try:
148
+ image = example["image"] # or however your image is accessed
149
+ if isinstance(image, dict) and "height" in image and "width" in image:
150
+ return image["height"] >= 28 and image["width"] >= 28
151
+ # If image is a PIL Image or similar
152
+ return image.height >= 28 and image.width >= 28
153
+ except Exception:
154
+ return False
155
+
156
+
157
+ ds = datasets.Dataset.from_dict(hf_dict, features=features)
158
+ ds = ds.filter(
159
+ lambda x: not has_empty_tags(x["solution"])
160
+ and not has_answer_pattern(x["problem"])
161
+ and has_valid_image_size(x)
162
+ and x["image"] is not None,
163
+ num_proc=128,
164
+ )
165
+ # Push to Hugging Face Hub
166
+ ds.push_to_hub("path/to/your/dataset")
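extract_problem_solution above is pure string manipulation, so its contract is easiest to see on a toy input (worked through by hand against the regexes above):

    resp = (
        "Question: What is 2 + 2? Answer: "
        "<think>Add the two numbers.</think> "
        "<answer>Answer: 4</answer>"
    )
    problem, solution = extract_problem_solution(resp)
    print(problem)   # 'What is 2 + 2?'
    print(solution)  # '<think>Add the two numbers.</think>\n\n<answer>4</answer>'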
open-r1-multimodal/local_scripts/train_aria_moe.sh ADDED
@@ -0,0 +1,68 @@
1
+ #!/bin/bash
2
+
3
+ export NCCL_BLOCKING_WAIT=0
4
+ export TOKENIZERS_PARALLELISM=false
5
+ export OMP_NUM_THREADS=8
6
+ export NCCL_IB_DISABLE=0
7
+ export NCCL_IB_GID_INDEX=3
8
+ export NCCL_SOCKET_IFNAME=eth0
9
+ export NCCL_DEBUG=INFO
10
+
11
+ # CONFIG Huggingface
12
+ # export HF_TOKEN="<PLACEHOLDER_HF_TOKEN_1>"
13
+ export HF_TOKEN="<PLACEHOLDER_HF_TOKEN_2>"
14
+ export HF_HOME="$HOME/.cache/huggingface"
15
+ export HF_HUB_ENABLE_HF_TRANSFER="1"
16
+
17
+ export NCCL_DEBUG=INFO
18
+
19
+ GPUS="0,1,2,3,4,5,6,7"
20
+
21
+ # Take the first port of worker0
22
+ ports=($(echo $METIS_WORKER_0_PORT | tr ',' ' '))
23
+ port=${ports[0]}
24
+ port_in_cmd="$(echo "${METIS_WORKER_0_PORT:-2000}" | awk -F',' '{print $1}')"
25
+
26
+ echo "total workers: ${ARNOLD_WORKER_NUM}"
27
+ echo "cur worker id: ${ARNOLD_ID}"
28
+ echo "gpus per worker: ${ARNOLD_WORKER_GPU}"
29
+ echo "master ip: ${METIS_WORKER_0_HOST}"
30
+ echo "master port: ${port}"
31
+ echo "master port in cmd: ${port_in_cmd}"
32
+
33
+ # export WANDB_BASE_URL=https://api.wandb.ai
34
+ # export WANDB_API_KEY="<PLACEHOLDER_WANDB_KEY_1>"
35
+ # wandb login $WANDB_API_KEY
36
+
37
+ export WANDB_BASE_URL=https://api.wandb.ai
38
+ export WANDB_PROJECT=vision-reasoning
39
+ export WANDB_API_KEY="<PLACEHOLDER_WANDB_KEY_2>"
40
+ export WANDB_RUN_NAME=Qwen-VL-2B-GRPO-$(date +%Y-%m-%d-%H-%M-%S)
41
+ wandb login $WANDB_API_KEY
42
+
43
+ cd /home/tiger/multimodal-open-r1
44
+ # pip3 install vllm==0.6.6.post1
45
+ pip3 install -e ".[dev]"
46
+ pip3 install wandb==0.18.3
47
+
48
+ torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" \
49
+ --nnodes="${ARNOLD_WORKER_NUM}" \
50
+ --node_rank="${ARNOLD_ID}" \
51
+ --master_addr="${METIS_WORKER_0_HOST}" \
52
+ --master_port="${port_in_cmd}" \
53
+ src/open_r1/grpo.py \
54
+ --deepspeed scripts/zero3.json \
55
+ --output_dir Aria-GRPO-mini_cot_80k \
56
+ --model_name_or_path rhymes-ai/Aria \
57
+ --dataset_name luodian/mini_cot_80k \
58
+ --max_prompt_length 8192 \
59
+ --per_device_train_batch_size 1 \
60
+ --gradient_accumulation_steps 1 \
61
+ --logging_steps 1 \
62
+ --bf16 \
63
+ --report_to wandb \
64
+ --gradient_checkpointing true \
65
+ --attn_implementation eager \
66
+ --save_total_limit 8 \
67
+ --num_train_epochs 1 \
68
+ --run_name $WANDB_RUN_NAME
open-r1-multimodal/local_scripts/train_qwen2_vl.sh ADDED
@@ -0,0 +1,61 @@
1
+ #!/bin/bash
2
+
3
+ export NCCL_BLOCKING_WAIT=0
4
+ export TOKENIZERS_PARALLELISM=false
5
+ export OMP_NUM_THREADS=8
6
+ export NCCL_IB_DISABLE=0
7
+ export NCCL_IB_GID_INDEX=3
8
+ export NCCL_SOCKET_IFNAME=eth0
9
+ export NCCL_DEBUG=INFO
10
+
11
+ GPUS="0,1,2,3,4,5,6,7"
12
+
13
+ # Take the first port of worker0
14
+ ports=($(echo $METIS_WORKER_0_PORT | tr ',' ' '))
15
+ port=${ports[0]}
16
+ port_in_cmd="$(echo "${METIS_WORKER_0_PORT:-2000}" | awk -F',' '{print $1}')"
17
+
18
+ echo "total workers: ${ARNOLD_WORKER_NUM}"
19
+ echo "cur worker id: ${ARNOLD_ID}"
20
+ echo "gpus per worker: ${ARNOLD_WORKER_GPU}"
21
+ echo "master ip: ${METIS_WORKER_0_HOST}"
22
+ echo "master port: ${port}"
23
+ echo "master port in cmd: ${port_in_cmd}"
24
+
25
+ # export WANDB_BASE_URL=https://api.wandb.ai
26
+ # export WANDB_API_KEY="<PLACEHOLDER_WANDB_KEY_1>"
27
+ # wandb login $WANDB_API_KEY
28
+
29
+ export WANDB_BASE_URL=https://api.wandb.ai
30
+ export WANDB_PROJECT=vision-reasoning
31
+ export WANDB_API_KEY="<PLACEHOLDER_WANDB_KEY_2>"
32
+ export WANDB_RUN_NAME=Qwen-VL-2B-GRPO-$(date +%Y-%m-%d-%H-%M-%S)
33
+ wandb login $WANDB_API_KEY
34
+
35
+ cd /home/tiger/multimodal-open-r1
36
+ # pip3 install vllm==0.6.6.post1
37
+ pip3 install -e ".[dev]"
38
+ pip3 install wandb==0.18.3
39
+
40
+ torchrun --nproc_per_node="${ARNOLD_WORKER_GPU}" \
41
+ --nnodes="${ARNOLD_WORKER_NUM}" \
42
+ --node_rank="${ARNOLD_ID}" \
43
+ --master_addr="${METIS_WORKER_0_HOST}" \
44
+ --master_port="${port_in_cmd}" \
45
+ src/open_r1/grpo.py \
46
+ --deepspeed scripts/zero3.json \
47
+ --output_dir checkpoints/${WANDB_RUN_NAME} \
48
+ --model_name_or_path Qwen/Qwen2-VL-2B-Instruct \
49
+ --dataset_name luodian/${DATASET_NAME} \
50
+ --max_prompt_length 8192 \
51
+ --per_device_train_batch_size 1 \
52
+ --gradient_accumulation_steps 1 \
53
+ --logging_steps 1 \
54
+ --bf16 \
55
+ --report_to wandb \
56
+ --gradient_checkpointing true \
57
+ --attn_implementation flash_attention_2 \
58
+ --max_pixels 2359296 \
59
+ --save_total_limit 8 \
60
+ --num_train_epochs 1 \
61
+ --run_name $WANDB_RUN_NAME
open-r1-multimodal/local_scripts/zero2.json ADDED
@@ -0,0 +1,41 @@
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "optimizer": {
14
+ "type": "AdamW",
15
+ "params": {
16
+ "lr": "auto",
17
+ "betas": "auto",
18
+ "eps": "auto",
19
+ "weight_decay": "auto"
20
+ }
21
+ },
22
+ "zero_optimization": {
23
+ "stage": 2,
24
+ "offload_optimizer": {
25
+ "device": "none",
26
+ "pin_memory": true
27
+ },
28
+ "allgather_partitions": true,
29
+ "allgather_bucket_size": 2e8,
30
+ "overlap_comm": false,
31
+ "reduce_scatter": true,
32
+ "reduce_bucket_size": 2e8,
33
+ "contiguous_gradients": true
34
+ },
35
+ "gradient_accumulation_steps": "auto",
36
+ "gradient_clipping": "auto",
37
+ "steps_per_print": 100,
38
+ "train_batch_size": "auto",
39
+ "train_micro_batch_size_per_gpu": "auto",
40
+ "wall_clock_breakdown": false
41
+ }
open-r1-multimodal/local_scripts/zero3.json ADDED
@@ -0,0 +1,41 @@
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+
14
+ "zero_optimization": {
15
+ "stage": 3,
16
+ "offload_optimizer": {
17
+ "device": "none",
18
+ "pin_memory": true
19
+ },
20
+ "offload_param": {
21
+ "device": "none",
22
+ "pin_memory": true
23
+ },
24
+ "overlap_comm": true,
25
+ "contiguous_gradients": true,
26
+ "sub_group_size": 1e9,
27
+ "reduce_bucket_size": "auto",
28
+ "stage3_prefetch_bucket_size": "auto",
29
+ "stage3_param_persistence_threshold": "auto",
30
+ "stage3_max_live_parameters": 1e9,
31
+ "stage3_max_reuse_distance": 1e9,
32
+ "stage3_gather_16bit_weights_on_model_save": true
33
+ },
34
+
35
+ "gradient_accumulation_steps": "auto",
36
+ "gradient_clipping": "auto",
37
+ "steps_per_print": 100,
38
+ "train_batch_size": "auto",
39
+ "train_micro_batch_size_per_gpu": "auto",
40
+ "wall_clock_breakdown": false
41
+ }
open-r1-multimodal/local_scripts/zero3_offload.json ADDED
@@ -0,0 +1,48 @@
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "optimizer": {
14
+ "type": "AdamW",
15
+ "params": {
16
+ "lr": "auto",
17
+ "betas": "auto",
18
+ "eps": "auto",
19
+ "weight_decay": "auto"
20
+ }
21
+ },
22
+ "zero_optimization": {
23
+ "stage": 3,
24
+ "offload_optimizer": {
25
+ "device": "cpu",
26
+ "pin_memory": true
27
+ },
28
+ "offload_param": {
29
+ "device": "cpu",
30
+ "pin_memory": true
31
+ },
32
+ "overlap_comm": true,
33
+ "contiguous_gradients": true,
34
+ "sub_group_size": 1e9,
35
+ "reduce_bucket_size": "auto",
36
+ "stage3_prefetch_bucket_size": "auto",
37
+ "stage3_param_persistence_threshold": "auto",
38
+ "stage3_max_live_parameters": 1e9,
39
+ "stage3_max_reuse_distance": 1e9,
40
+ "gather_16bit_weights_on_model_save": true
41
+ },
42
+ "gradient_accumulation_steps": "auto",
43
+ "gradient_clipping": "auto",
44
+ "train_batch_size": "auto",
45
+ "train_micro_batch_size_per_gpu": "auto",
46
+ "steps_per_print": 1e5,
47
+ "wall_clock_breakdown": false
48
+ }
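These DeepSpeed JSON files are not used on their own: they are passed to the HF trainer, which resolves every "auto" field (learning rate, batch sizes, precision flags, bucket sizes) from its own arguments. A minimal sketch, with placeholder paths and hyperparameters:

    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="output/zero3-offload-run",         # placeholder
        deepspeed="local_scripts/zero3_offload.json",  # this config file
        per_device_train_batch_size=1,
        gradient_accumulation_steps=2,
        learning_rate=1e-5,
        bf16=True,  # fills in the "bf16": {"enabled": "auto"} block
    )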
open-r1-multimodal/run_scripts/multinode_training_args.yaml ADDED
@@ -0,0 +1,21 @@
1
+ output_dir: /path/to/output/runs/Qwen2.5-VL-3B-Idefics-V3-RSN-ai2d-500steps
2
+ model_name_or_path: /path/to/models/Qwen2.5-VL-3B-Instruct
3
+ dataset_name: Idefics-ai2d
4
+ data_file_paths: /path/to/data/ai2d.jsonl
5
+ image_folders: /path/to/images
6
+ max_prompt_length: 1024
7
+ per_device_train_batch_size: 1
8
+ gradient_accumulation_steps: 2
9
+ logging_steps: 1
10
+ bf16: true
11
+ report_to: wandb
12
+ gradient_checkpointing: false
13
+ deepspeed: /path/to/config/zero3.json
14
+ attn_implementation: flash_attention_2
15
+ max_pixels: 401408
16
+ max_steps: 500
17
+ run_name: Qwen2.5-VL-3B-Idefics-V3-RSN-ai2d-500steps-multinode
18
+ save_steps: 100
19
+ save_total_limit: 3
20
+ save_only_model: true
21
+ num_generations: 8
open-r1-multimodal/run_scripts/multinode_training_demo.sh ADDED
@@ -0,0 +1,145 @@
1
+ #!/bin/bash
2
+
3
+ RUN_NAME=multinode_training # assume there is a ${RUN_NAME}_args.yaml file in the current directory
4
+
5
+ declare -A node2ip_map
6
+ node2ip_map=(
7
+ ["node1"]="192.168.1.101"
8
+ ["node2"]="192.168.1.102"
9
+ ["node3"]="192.168.1.103"
10
+ ["node4"]="192.168.1.104"
11
+ )
12
+
13
+ # Default nodes if no arguments provided
14
+ DEFAULT_NODES=("node1" "node2")
15
+
16
+ # Local codebase path in file system
17
+ LOCAL_CODEBASE_PATH="/path/to/your/codebase"
18
+
19
+ # Use provided nodes or default nodes
20
+ if [ "$#" -ge 1 ]; then
21
+ NODES=("$@")
22
+ else
23
+ NODES=("${DEFAULT_NODES[@]}")
24
+ echo "Using default nodes: ${NODES[*]}"
25
+ fi
26
+
27
+ # Add this debug line
28
+ echo "All nodes in order: ${NODES[@]}"
29
+
30
+ TOTAL_NODES=${#NODES[@]}
31
+ MASTER_NODE=${NODES[0]}
32
+ MASTER_PORT=12345
33
+
34
+ # Get project root directory (using the directory where this script is located)
35
+ PROJECT_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
36
+ echo "Project root directory: $PROJECT_ROOT"
37
+
38
+ # Get master node IP address
39
+ echo "MASTER_NODE: $MASTER_NODE"
40
+ MASTER_IP="${node2ip_map[$MASTER_NODE]}"
41
+ echo "Master node IP: $MASTER_IP"
42
+
43
+ # Create log directory for each node
44
+ LOG_DIR="path/to/your/log/dir"
45
+ mkdir -p $LOG_DIR
46
+
47
+ # Generate docker-compose.yml
48
+ echo "Generating docker-compose.yml..."
49
+ cat > docker-compose.yml << EOL
50
+ version: '3.8'
51
+
52
+ services:
53
+ trainer:
54
+ image: your/training-image:tag
55
+ deploy:
56
+ resources:
57
+ reservations:
58
+ devices:
59
+ - driver: nvidia
60
+ count: all
61
+ capabilities: [gpu]
62
+ shm_size: '8gb'
63
+ volumes:
64
+ - /path/to/data:/data
65
+ - $LOCAL_CODEBASE_PATH/src:/workspace/src
66
+ environment:
67
+ - MASTER_ADDR=\${MASTER_ADDR:-$MASTER_IP}
68
+ - MASTER_PORT=\${MASTER_PORT:-12345}
69
+ - NODE_RANK=\${NODE_RANK:-0}
70
+ - WORLD_SIZE=\${WORLD_SIZE:-4}
71
+ - DEBUG_MODE=true
72
+ - LOG_PATH=${LOG_DIR}/debug_log.txt
73
+ - WANDB_API_KEY=your_wandb_api_key # Optional: for logging with weights & biases
74
+ - WANDB_PROJECT=your_project_name
75
+ - WANDB_RUN_NAME=${RUN_NAME}-$(date +%Y-%m-%d-%H-%M-%S)
76
+ - PYTHONPATH=/workspace/src
77
+ network_mode: "host"
78
+ command: /bin/bash
79
+ working_dir: /workspace
80
+ EOL
81
+
82
+ # Function to build training arguments from yaml
83
+ build_train_args() {
84
+ args=""
85
+ while IFS=": " read -r key value; do
86
+ [[ -z "$key" || "$key" =~ ^[[:space:]]*# ]] && continue
87
+ value=$(echo "$value" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e 's/^"//' -e 's/"$//')
88
+ if [[ "$value" == "true" ]]; then
89
+ args="$args --$key"
90
+ elif [[ "$value" == "false" ]]; then
91
+ continue
92
+ else
93
+ args="$args --$key $value"
94
+ fi
95
+ done < ${RUN_NAME}_args.yaml
96
+ echo "$args"
97
+ }
98
+
99
+ # Get training arguments
100
+ TRAIN_ARGS=$(build_train_args)
101
+ echo "TRAIN_ARGS: $TRAIN_ARGS"
102
+
103
+ # Launch containers on each node
104
+ NODE_RANK=0
105
+ for host in "${NODES[@]}"; do
106
+ LOG_FILE="$LOG_DIR/${host}_rank${NODE_RANK}.log"
107
+ if [ "$host" = "$MASTER_NODE" ]; then
108
+ echo "Launching on master $host with rank $NODE_RANK, logging to $LOG_FILE"
109
+ ssh $host "cd $PROJECT_ROOT && \
110
+ MASTER_ADDR=$MASTER_IP \
111
+ NODE_RANK=$NODE_RANK \
112
+ WORLD_SIZE=$TOTAL_NODES \
113
+ sudo -E docker-compose -f docker-compose.yml run --rm trainer \
114
+ torchrun --nproc_per_node=8 \
115
+ --nnodes=$TOTAL_NODES \
116
+ --node_rank=$NODE_RANK \
117
+ --master_addr=$MASTER_IP \
118
+ --master_port=$MASTER_PORT \
119
+ src/train.py \
120
+ $TRAIN_ARGS" > "$LOG_FILE" 2>&1 &
121
+ else
122
+ echo "Launching on $host with rank $NODE_RANK, logging to $LOG_FILE"
123
+ ssh $host "cd $PROJECT_ROOT && \
124
+ MASTER_ADDR=$MASTER_IP \
125
+ NODE_RANK=$NODE_RANK \
126
+ WORLD_SIZE=$TOTAL_NODES \
127
+ sudo -E docker-compose -f docker-compose.yml run --rm trainer \
128
+ torchrun --nproc_per_node=8 \
129
+ --nnodes=$TOTAL_NODES \
130
+ --node_rank=$NODE_RANK \
131
+ --master_addr=$MASTER_IP \
132
+ --master_port=$MASTER_PORT \
133
+ src/train.py \
134
+ $TRAIN_ARGS" > "$LOG_FILE" 2>&1 &
135
+ fi
136
+
137
+ NODE_RANK=$((NODE_RANK + 1))
138
+ done
139
+
140
+ echo "Jobs launched. To monitor the logs, you can:"
141
+ echo "1. Use 'tail -f $LOG_DIR/*.log' to watch all logs"
142
+ echo "2. Use 'tail -f $LOG_DIR/<node_name>_rank<N>.log' to watch a specific node"
143
+
144
+ # Wait for all background processes to complete
145
+ wait
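For reference, build_train_args above flattens ${RUN_NAME}_args.yaml into CLI flags: true becomes a bare --key, false is dropped, and everything else becomes --key value. An equivalent Python sketch, shown only to make that mapping explicit:

    import yaml

    def build_train_args(path):
        with open(path) as f:
            cfg = yaml.safe_load(f)
        args = []
        for key, value in cfg.items():
            if value is True:
                args.append(f"--{key}")               # booleans become bare flags
            elif value is False:
                continue                              # false flags are dropped
            else:
                args.extend([f"--{key}", str(value)])
        return " ".join(args)

    print(build_train_args("multinode_training_args.yaml"))

One caveat the Python version sidesteps: the bash loop parses each line with IFS=": ", so values that themselves contain a colon (e.g. URLs) would be truncated; a YAML parser handles them correctly.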
open-r1-multimodal/run_scripts/run_grpo_gui.sh ADDED
@@ -0,0 +1,34 @@
1
+ cd src/open-r1-multimodal
2
+ export DEBUG_MODE="true"
3
+ # export CUDA_VISIBLE_DEVICES=4,5,6,7
4
+ RUN_NAME="Qwen2.5-VL-3B-GRPO-GUI_multi-image"
5
+ export LOG_PATH="./debug_log_$RUN_NAME.txt"
6
+
+ torchrun --nproc_per_node="8" \
+     --nnodes="1" \
+     --node_rank="0" \
+     --master_addr="127.0.0.1" \
+     --master_port="12346" \
+     src/open_r1/grpo_jsonl.py \
+     --deepspeed local_scripts/zero3.json \
+     --output_dir output/$RUN_NAME \
+     --model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \
+     --dataset_name none \
+     --image_folders /path/to/images/ \
+     --data_file_paths data_jsonl/gui_multi-image.jsonl \
+     --freeze_vision_modules true \
+     --max_prompt_length 1024 \
+     --num_generations 8 \
+     --per_device_train_batch_size 8 \
+     --gradient_accumulation_steps 2 \
+     --logging_steps 1 \
+     --bf16 \
+     --torch_dtype bfloat16 \
+     --data_seed 42 \
+     --report_to wandb \
+     --gradient_checkpointing true \
+     --attn_implementation flash_attention_2 \
+     --num_train_epochs 2 \
+     --run_name $RUN_NAME \
+     --save_steps 100 \
+     --save_only_model true
open-r1-multimodal/run_scripts/run_grpo_gui_grounding.sh ADDED
@@ -0,0 +1,34 @@
+ cd src/open-r1-multimodal
+ export DEBUG_MODE="true"
+ # export CUDA_VISIBLE_DEVICES=4,5,6,7
+
+ RUN_NAME="Qwen2.5-VL-3B-GRPO-GUI-Grounding_showui_desktop_high_quality_attention_0.2_filtered_continual_dense_reward_quadratic_decay_0.5_format_bs16_kl0.004_nothink_10e_max_pixel_4028160"
+ export LOG_PATH="./debug_log_$RUN_NAME.txt"
+
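+ # Rewards are wired up in grpo_gui_grounding.py: "accuracy" -> point_reward and
+ # "format" -> format_reward_rec; "max_pixel_4028160" in the run name matches the
+ # default smart_resize() pixel budget used there.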
+ torchrun --nproc_per_node="8" \
+     --nnodes="1" \
+     --node_rank="0" \
+     --master_addr="127.0.0.1" \
+     --master_port="12346" \
+     src/open_r1/grpo_gui_grounding.py \
+     --deepspeed local_scripts/zero3.json \
+     --output_dir output/$RUN_NAME \
+     --model_name_or_path /data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/Qwen2.5-VL-3B-Instruct \
+     --dataset_name data_config/gui_grounding.yaml \
+     --image_root /data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/data \
+     --max_prompt_length 4096 \
+     --max_completion_length 1400 \
+     --num_generations 8 \
+     --per_device_train_batch_size 1 \
+     --gradient_accumulation_steps 2 \
+     --logging_steps 1 \
+     --bf16 \
+     --torch_dtype bfloat16 \
+     --data_seed 42 \
+     --report_to wandb \
+     --gradient_checkpointing false \
+     --attn_implementation flash_attention_2 \
+     --num_train_epochs 10 \
+     --run_name $RUN_NAME \
+     --save_steps 100 \
+     --save_only_model true
open-r1-multimodal/run_scripts/run_grpo_rec.sh ADDED
@@ -0,0 +1,33 @@
+ cd src/open-r1-multimodal
+ export DEBUG_MODE="true"
+ # export CUDA_VISIBLE_DEVICES=4,5,6,7
+
+ RUN_NAME="Qwen2.5-VL-7B-GRPO-REC"
+ export LOG_PATH="./debug_log_$RUN_NAME.txt"
+
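+ # NOTE: the run name says 7B while --model_name_or_path points at a local 3B
+ # checkpoint; align the two before launching so logs stay unambiguous.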
+ torchrun --nproc_per_node="8" \
+     --nnodes="1" \
+     --node_rank="0" \
+     --master_addr="127.0.0.1" \
+     --master_port="12346" \
+     src/open_r1/grpo_rec.py \
+     --deepspeed local_scripts/zero3.json \
+     --output_dir output/$RUN_NAME \
+     --model_name_or_path /data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/Qwen2.5-VL-3B-Instruct \
+     --dataset_name data_config/rec.yaml \
+     --image_root /data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/data \
+     --max_prompt_length 1024 \
+     --num_generations 8 \
+     --per_device_train_batch_size 4 \
+     --gradient_accumulation_steps 4 \
+     --logging_steps 1 \
+     --bf16 \
+     --torch_dtype bfloat16 \
+     --data_seed 42 \
+     --report_to wandb \
+     --gradient_checkpointing false \
+     --attn_implementation flash_attention_2 \
+     --num_train_epochs 2 \
+     --run_name $RUN_NAME \
+     --save_steps 100 \
+     --save_only_model true
open-r1-multimodal/run_scripts/run_grpo_rec_internvl.sh ADDED
@@ -0,0 +1,36 @@
+ cd src/open-r1-multimodal
+
+ export DEBUG_MODE="true"
+ # export CUDA_VISIBLE_DEVICES=4,5,6,7
+
+ RUN_NAME="InternVL-4B-GRPO-REC"
+ export LOG_PATH="./debug_log_$RUN_NAME.txt"
+
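+ # NOTE: verify the DeepSpeed config path below; the other run scripts reference
+ # local_scripts/zero2.json or zero3.json rather than zero_stage2_config.json.
+ # --max_anyres_num caps InternVL's dynamic tiling (the script default is 12).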
+ torchrun --nproc_per_node="8" \
+     --nnodes="1" \
+     --node_rank="0" \
+     --master_addr="127.0.0.1" \
+     --master_port="12346" \
+     src/open_r1/grpo_rec.py \
+     --deepspeed local_scripts/zero_stage2_config.json \
+     --output_dir output/$RUN_NAME \
+     --model_name_or_path /data10/shz/ckpt/vlm-r1-related/InternVL2_5-4B \
+     --dataset_name data_config/rec_internvl.yaml \
+     --image_root /data10/shz/dataset/coco \
+     --freeze_vision_modules true \
+     --max_anyres_num 6 \
+     --max_prompt_length 1024 \
+     --num_generations 8 \
+     --per_device_train_batch_size 8 \
+     --gradient_accumulation_steps 2 \
+     --logging_steps 1 \
+     --bf16 \
+     --torch_dtype bfloat16 \
+     --data_seed 42 \
+     --report_to wandb \
+     --gradient_checkpointing true \
+     --attn_implementation flash_attention_2 \
+     --num_train_epochs 2 \
+     --run_name $RUN_NAME \
+     --save_steps 100 \
+     --save_only_model true
open-r1-multimodal/run_scripts/run_grpo_rec_lora.sh ADDED
@@ -0,0 +1,43 @@
+ cd src/open-r1-multimodal
+
+ export DEBUG_MODE="true"
+ # export CUDA_VISIBLE_DEVICES=4,5,6,7
+
+ RUN_NAME="Qwen2.5-VL-7B-GRPO-REC-lora"
+ export LOG_PATH="./debug_log_$RUN_NAME.txt"
+
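+ # LoRA config: rank 64 with alpha 128 (scaling alpha/r = 2), dropout 0.05, and
+ # the vision tower frozen via --freeze_vision_modules, so only the language
+ # model receives adapters.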
+ torchrun --nproc_per_node="8" \
+     --nnodes="1" \
+     --node_rank="0" \
+     --master_addr="127.0.0.1" \
+     --master_port="12346" \
+     src/open_r1/grpo_rec.py \
+     --deepspeed local_scripts/zero2.json \
+     --output_dir output/$RUN_NAME \
+     --model_name_or_path Qwen/Qwen2.5-VL-7B-Instruct \
+     --dataset_name data_config/rec.yaml \
+     --image_root <your_image_root> \
+     --max_prompt_length 1024 \
+     --num_generations 8 \
+     --per_device_train_batch_size 1 \
+     --gradient_accumulation_steps 2 \
+     --logging_steps 1 \
+     --bf16 \
+     --torch_dtype bfloat16 \
+     --data_seed 42 \
+     --report_to wandb \
+     --gradient_checkpointing true \
+     --attn_implementation flash_attention_2 \
+     --num_train_epochs 2 \
+     --run_name $RUN_NAME \
+     --save_steps 100 \
+     --save_only_model true \
+     --learning_rate 1e-5 \
+     --use_peft true \
+     --lora_r 64 \
+     --lora_alpha 128 \
+     --lora_dropout 0.05 \
+     --lora_task_type CAUSAL_LM \
+     --freeze_vision_modules true
open-r1-multimodal/setup.cfg ADDED
@@ -0,0 +1,41 @@
+ [isort]
+ default_section = FIRSTPARTY
+ ensure_newline_before_comments = True
+ force_grid_wrap = 0
+ include_trailing_comma = True
+ known_first_party = open_r1
+ known_third_party =
+     transformers
+     datasets
+     fugashi
+     git
+     h5py
+     matplotlib
+     nltk
+     numpy
+     packaging
+     pandas
+     psutil
+     pytest
+     rouge_score
+     sacrebleu
+     seqeval
+     sklearn
+     streamlit
+     torch
+     tqdm
+
+ line_length = 119
+ lines_after_imports = 2
+ multi_line_output = 3
+ use_parentheses = True
+
+ [flake8]
+ ignore = E203, E501, E741, W503, W605
+ max-line-length = 119
+ per-file-ignores =
+     # imported but unused
+     __init__.py: F401
+
+ [tool:pytest]
+ doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS
open-r1-multimodal/src/open_r1.egg-info/PKG-INFO ADDED
@@ -0,0 +1,63 @@
+ Metadata-Version: 2.2
+ Name: open-r1
+ Version: 0.1.0.dev0
+ Summary: Open R1
+ Home-page: https://github.com/huggingface/open-r1
+ Author: The Hugging Face team (past and future)
+ Author-email: lewis@huggingface.co
+ License: Apache
+ Keywords: llm inference-time compute reasoning
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Education
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: Apache Software License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Requires-Python: >=3.10.9
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: accelerate>=1.2.1
+ Requires-Dist: bitsandbytes>=0.43.0
+ Requires-Dist: einops>=0.8.0
+ Requires-Dist: datasets>=3.2.0
+ Requires-Dist: deepspeed==0.15.4
+ Requires-Dist: hf_transfer>=0.1.4
+ Requires-Dist: huggingface-hub[cli]<1.0,>=0.19.2
+ Requires-Dist: liger_kernel==0.5.2
+ Requires-Dist: packaging>=23.0
+ Requires-Dist: safetensors>=0.3.3
+ Requires-Dist: sentencepiece>=0.1.99
+ Requires-Dist: transformers>=4.49.0
+ Requires-Dist: trl@ git+https://github.com/huggingface/trl.git@main
+ Provides-Extra: tests
+ Requires-Dist: pytest; extra == "tests"
+ Requires-Dist: parameterized>=0.9.0; extra == "tests"
+ Provides-Extra: torch
+ Requires-Dist: torch>=2.5.1; extra == "torch"
+ Provides-Extra: quality
+ Requires-Dist: black>=24.4.2; extra == "quality"
+ Requires-Dist: isort>=5.12.0; extra == "quality"
+ Requires-Dist: flake8>=6.0.0; extra == "quality"
+ Provides-Extra: eval
+ Requires-Dist: math-verify; extra == "eval"
+ Provides-Extra: dev
+ Requires-Dist: black>=24.4.2; extra == "dev"
+ Requires-Dist: isort>=5.12.0; extra == "dev"
+ Requires-Dist: flake8>=6.0.0; extra == "dev"
+ Requires-Dist: pytest; extra == "dev"
+ Requires-Dist: parameterized>=0.9.0; extra == "dev"
+ Requires-Dist: math-verify; extra == "dev"
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: keywords
+ Dynamic: license
+ Dynamic: provides-extra
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
open-r1-multimodal/src/open_r1.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
+
open-r1-multimodal/src/open_r1.egg-info/requires.txt ADDED
@@ -0,0 +1,36 @@
+ accelerate>=1.2.1
+ bitsandbytes>=0.43.0
+ einops>=0.8.0
+ datasets>=3.2.0
+ deepspeed==0.15.4
+ hf_transfer>=0.1.4
+ huggingface-hub[cli]<1.0,>=0.19.2
+ liger_kernel==0.5.2
+ packaging>=23.0
+ safetensors>=0.3.3
+ sentencepiece>=0.1.99
+ transformers>=4.49.0
+ trl@ git+https://github.com/huggingface/trl.git@main
+
+ [dev]
+ black>=24.4.2
+ isort>=5.12.0
+ flake8>=6.0.0
+ pytest
+ parameterized>=0.9.0
+ math-verify
+
+ [eval]
+ math-verify
+
+ [quality]
+ black>=24.4.2
+ isort>=5.12.0
+ flake8>=6.0.0
+
+ [tests]
+ pytest
+ parameterized>=0.9.0
+
+ [torch]
+ torch>=2.5.1
open-r1-multimodal/src/open_r1.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
+ open_r1
open-r1-multimodal/src/open_r1/__init__.py ADDED
File without changes
open-r1-multimodal/src/open_r1/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (222 Bytes).
open-r1-multimodal/src/open_r1/configs.py ADDED
@@ -0,0 +1,82 @@
+ # coding=utf-8
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ import trl
+
+
+ # TODO: add the shared options with a mixin to reduce code duplication
+ @dataclass
+ class GRPOConfig(trl.GRPOConfig):
+     """trl.GRPOConfig extended with callback, benchmark, Hub-revision, and W&B options."""
+
+     benchmarks: list[str] = field(
+         default_factory=lambda: [], metadata={"help": "The benchmarks to run after training."}
+     )
+     callbacks: list[str] = field(
+         default_factory=lambda: [], metadata={"help": "The callbacks to run during training."}
+     )
+     system_prompt: Optional[str] = field(
+         default=None, metadata={"help": "The optional system prompt to use for benchmarking."}
+     )
+     hub_model_revision: Optional[str] = field(
+         default="main", metadata={"help": "The Hub model branch to push the model to."}
+     )
+     overwrite_hub_revision: bool = field(default=False, metadata={"help": "Whether to overwrite the Hub revision."})
+     push_to_hub_revision: bool = field(default=False, metadata={"help": "Whether to push to a Hub revision/branch."})
+     wandb_entity: Optional[str] = field(
+         default=None,
+         metadata={"help": "The entity to store runs under."},
+     )
+     wandb_project: Optional[str] = field(
+         default=None,
+         metadata={"help": "The project to store runs under."},
+     )
+
+
+ @dataclass
+ class SFTConfig(trl.SFTConfig):
+     """trl.SFTConfig extended with callback, benchmark, Hub-revision, and W&B options."""
+
+     benchmarks: list[str] = field(
+         default_factory=lambda: [], metadata={"help": "The benchmarks to run after training."}
+     )
+     callbacks: list[str] = field(
+         default_factory=lambda: [], metadata={"help": "The callbacks to run during training."}
+     )
+     system_prompt: Optional[str] = field(
+         default=None,
+         metadata={"help": "The optional system prompt to use for benchmarking."},
+     )
+     hub_model_revision: Optional[str] = field(
+         default="main",
+         metadata={"help": "The Hub model branch to push the model to."},
+     )
+     overwrite_hub_revision: bool = field(default=False, metadata={"help": "Whether to overwrite the Hub revision."})
+     push_to_hub_revision: bool = field(default=False, metadata={"help": "Whether to push to a Hub revision/branch."})
+     wandb_entity: Optional[str] = field(
+         default=None,
+         metadata={"help": "The entity to store runs under."},
+     )
+     wandb_project: Optional[str] = field(
+         default=None,
+         metadata={"help": "The project to store runs under."},
+     )
open-r1-multimodal/src/open_r1/evaluate.py ADDED
@@ -0,0 +1,85 @@
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Custom evaluation tasks for LightEval."""
+
+ from lighteval.metrics.dynamic_metrics import (
+     ExprExtractionConfig,
+     LatexExtractionConfig,
+     multilingual_extractive_match_metric,
+ )
+ from lighteval.tasks.lighteval_task import LightevalTaskConfig
+ from lighteval.tasks.requests import Doc
+ from lighteval.utils.language import Language
+
+
+ metric = multilingual_extractive_match_metric(
+     language=Language.ENGLISH,
+     fallback_mode="first_match",
+     precision=5,
+     gold_extraction_target=(LatexExtractionConfig(),),
+     pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig()),
+     aggregation_function=max,
+ )
+
+
+ def prompt_fn(line, task_name: str = None):
+     """Assumes the model is either prompted to emit \\boxed{answer} or does so automatically"""
+     return Doc(
+         task_name=task_name,
+         query=line["problem"],
+         choices=[line["solution"]],
+         gold_index=0,
+     )
+
+
+ # Define tasks
+ aime24 = LightevalTaskConfig(
+     name="aime24",
+     suite=["custom"],
+     prompt_function=prompt_fn,
+     hf_repo="HuggingFaceH4/aime_2024",
+     hf_subset="default",
+     hf_avail_splits=["train"],
+     evaluation_splits=["train"],
+     few_shots_split=None,
+     few_shots_select=None,
+     generation_size=32768,
+     metric=[metric],
+     version=1,
+ )
+ math_500 = LightevalTaskConfig(
+     name="math_500",
+     suite=["custom"],
+     prompt_function=prompt_fn,
+     hf_repo="HuggingFaceH4/MATH-500",
+     hf_subset="default",
+     hf_avail_splits=["test"],
+     evaluation_splits=["test"],
+     few_shots_split=None,
+     few_shots_select=None,
+     generation_size=32768,
+     metric=[metric],
+     version=1,
+ )
+
+ # Add tasks to the table
+ TASKS_TABLE = []
+ TASKS_TABLE.append(aime24)
+ TASKS_TABLE.append(math_500)
+
+ # MODULE LOGIC
+ if __name__ == "__main__":
+     # LightevalTaskConfig is not a mapping, so read the attribute rather than indexing.
+     print([t.name for t in TASKS_TABLE])
+     print(len(TASKS_TABLE))
open-r1-multimodal/src/open_r1/generate.py ADDED
@@ -0,0 +1,156 @@
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from typing import Optional
+
+ from distilabel.llms import OpenAILLM
+ from distilabel.pipeline import Pipeline
+ from distilabel.steps.tasks import TextGeneration
+
+
+ def build_distilabel_pipeline(
+     model: str,
+     base_url: str = "http://localhost:8000/v1",
+     prompt_column: Optional[str] = None,
+     temperature: Optional[float] = None,
+     top_p: Optional[float] = None,
+     max_new_tokens: int = 8192,
+     num_generations: int = 1,
+ ) -> Pipeline:
+     generation_kwargs = {"max_new_tokens": max_new_tokens}
+
+     if temperature is not None:
+         generation_kwargs["temperature"] = temperature
+
+     if top_p is not None:
+         generation_kwargs["top_p"] = top_p
+
+     with Pipeline().ray() as pipeline:
+         TextGeneration(
+             llm=OpenAILLM(
+                 base_url=base_url,
+                 api_key="something",
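+                 # Placeholder credential: a local vLLM OpenAI-compatible server
+                 # typically does not validate API keys, but the client requires a
+                 # non-empty string.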
+                 model=model,
+                 # thinking can take some time...
+                 timeout=10 * 60,
+                 generation_kwargs=generation_kwargs,
+             ),
+             input_mappings={"instruction": prompt_column} if prompt_column is not None else {},
+             input_batch_size=64,  # on 4 nodes bs ~60+ leads to preemption due to KV cache exhaustion
+             num_generations=num_generations,
+         )
+
+     return pipeline
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     from datasets import load_dataset
+
+     parser = argparse.ArgumentParser(description="Run distilabel pipeline for generating responses with DeepSeek R1")
+     parser.add_argument(
+         "--hf-dataset",
+         type=str,
+         required=True,
+         help="HuggingFace dataset to load",
+     )
+     parser.add_argument(
+         "--hf-dataset-config",
+         type=str,
+         required=False,
+         help="Dataset config to use",
+     )
+     parser.add_argument(
+         "--hf-dataset-split",
+         type=str,
+         default="train",
+         help="Dataset split to use",
+     )
+     parser.add_argument("--prompt-column", type=str, default="prompt")
+     parser.add_argument(
+         "--model",
+         type=str,
+         required=True,
+         help="Model name to use for generation",
+     )
+     parser.add_argument(
+         "--vllm-server-url",
+         type=str,
+         default="http://localhost:8000/v1",
+         help="URL of the vLLM server",
+     )
+     parser.add_argument(
+         "--temperature",
+         type=float,
+         help="Temperature for generation",
+     )
+     parser.add_argument(
+         "--top-p",
+         type=float,
+         help="Top-p value for generation",
+     )
+     parser.add_argument(
+         "--max-new-tokens",
+         type=int,
+         default=8192,
+         help="Maximum number of new tokens to generate",
+     )
+     parser.add_argument(
+         "--num-generations",
+         type=int,
+         default=1,
+         help="Number of generations per problem",
+     )
+     parser.add_argument(
+         "--hf-output-dataset",
+         type=str,
+         required=False,
+         help="HuggingFace repo to push results to",
+     )
+     parser.add_argument(
+         "--private",
+         action="store_true",
+         help="Whether to make the output dataset private when pushing to HF Hub",
+     )
+
+     args = parser.parse_args()
+
+     print("\nRunning with arguments:")
+     for arg, value in vars(args).items():
+         print(f"  {arg}: {value}")
+     print()
+
+     print(f"Loading '{args.hf_dataset}' (config: {args.hf_dataset_config}, split: {args.hf_dataset_split}) dataset...")
+     # Forward the optional config name; it was printed above but previously unused.
+     dataset = load_dataset(args.hf_dataset, args.hf_dataset_config, split=args.hf_dataset_split)
+     print("Dataset loaded!")
+
+     pipeline = build_distilabel_pipeline(
+         model=args.model,
+         base_url=args.vllm_server_url,
+         prompt_column=args.prompt_column,
+         temperature=args.temperature,
+         top_p=args.top_p,
+         max_new_tokens=args.max_new_tokens,
+         num_generations=args.num_generations,
+     )
+
+     print("Running generation pipeline...")
+     distiset = pipeline.run(dataset=dataset, use_cache=False)
+     print("Generation pipeline finished!")
+
+     if args.hf_output_dataset:
+         print(f"Pushing resulting dataset to '{args.hf_output_dataset}'...")
+         distiset.push_to_hub(args.hf_output_dataset, private=args.private)
+         print("Dataset pushed!")
open-r1-multimodal/src/open_r1/grpo_gui_grounding.py ADDED
@@ -0,0 +1,357 @@
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # import debugpy
+ # try:
+ #     # 5678 is the default attach port in the VS Code debug configurations. Unless a host and port are specified, host defaults to 127.0.0.1
+ #     debugpy.listen(("localhost", 9501))
+ #     print("Waiting for debugger attach")
+ #     debugpy.wait_for_client()
+ # except Exception as e:
+ #     pass
+
+ import os
+ import re
+ from datetime import datetime
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ from PIL import Image
+ from torch.utils.data import Dataset
+ from transformers import Qwen2VLForConditionalGeneration
+
+ from math_verify import parse, verify
+ from open_r1.trainer import VLMGRPOTrainer, GRPOConfig, Qwen2VLGRPOVLLMTrainer, Qwen2VLGRPOTrainer
+ from open_r1.vlm_modules import *
+ from trl import ModelConfig, ScriptArguments, TrlParser, get_peft_config
+ from transformers import TrainingArguments
+ import yaml
+ import json
+ import random
+ import math
+
+ # ----------------------- Fix the flash attention bug in the current version of transformers -----------------------
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLVisionFlashAttention2, apply_rotary_pos_emb_flashatt, flash_attn_varlen_func
+ import torch
+ from typing import Tuple
+ from transformers.utils import logging
+
+ # Needed by the warning branch in custom_forward below (it was previously undefined here).
+ logger = logging.get_logger(__name__)
+
+ def custom_forward(
+     self,
+     hidden_states: torch.Tensor,
+     cu_seqlens: torch.Tensor,
+     rotary_pos_emb: Optional[torch.Tensor] = None,
+     position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ ) -> torch.Tensor:
+     seq_length = hidden_states.shape[0]
+     q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+     if position_embeddings is None:
+         logger.warning_once(
+             "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+             "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
+             "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
+             "removed and `position_embeddings` will be mandatory."
+         )
+         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+         cos = emb.cos().float()
+         sin = emb.sin().float()
+     else:
+         cos, sin = position_embeddings
+         # Add this: cast to float32 before the flash-attention rotary kernel
+         cos = cos.to(torch.float)
+         sin = sin.to(torch.float)
+     q, k = apply_rotary_pos_emb_flashatt(q.unsqueeze(0), k.unsqueeze(0), cos, sin)
+     q = q.squeeze(0)
+     k = k.squeeze(0)
+
+     max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+     attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
+         seq_length, -1
+     )
+     attn_output = self.proj(attn_output)
+     return attn_output
+
+ def smart_resize(
+     height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 4028160
+ ):
+     """Rescales the image so that the following conditions are met:
+
+     1. Both dimensions (height and width) are divisible by 'factor'.
+
+     2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+     3. The aspect ratio of the image is maintained as closely as possible.
+     """
+     if height < factor or width < factor:
+         raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
+     elif max(height, width) / min(height, width) > 200:
+         raise ValueError(
+             f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
+         )
+     h_bar = round(height / factor) * factor
+     w_bar = round(width / factor) * factor
+     if h_bar * w_bar > max_pixels:
+         beta = math.sqrt((height * width) / max_pixels)
+         h_bar = math.floor(height / beta / factor) * factor
+         w_bar = math.floor(width / beta / factor) * factor
+     elif h_bar * w_bar < min_pixels:
+         beta = math.sqrt(min_pixels / (height * width))
+         h_bar = math.ceil(height * beta / factor) * factor
+         w_bar = math.ceil(width * beta / factor) * factor
+     return h_bar, w_bar
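+ # Worked example (values assumed for illustration): smart_resize(1080, 1920) == (1092, 1932);
+ # both sides are divisible by 28, and 1092 * 1932 (~2.1 MP) sits inside [3136, 4028160].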
+
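+ # Monkey-patch applied at import time: every Qwen2.5-VL vision attention block
+ # now runs the fp32-RoPE forward defined above.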
+ Qwen2_5_VLVisionFlashAttention2.forward = custom_forward
+
+
+ # ----------------------- Main Script -----------------------
+ @dataclass
+ class GRPOScriptArguments(ScriptArguments):
+     """
+     Script arguments for the GRPO training script.
+
+     Args:
+         reward_funcs (`list[str]`):
+             List of reward functions. Possible values: 'accuracy', 'format'.
+     """
+
+     reward_funcs: list[str] = field(
+         default_factory=lambda: ["accuracy", "format"],
+         metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"},
+     )
+     max_pixels: Optional[int] = field(
+         default=4028160,
+         metadata={"help": "Maximum number of pixels for the image (for QwenVL)"},
+     )
+     min_pixels: Optional[int] = field(
+         default=3136,
+         metadata={"help": "Minimum number of pixels for the image (for QwenVL)"},
+     )
+     max_anyres_num: Optional[int] = field(
+         default=12,
+         metadata={"help": "Maximum number of anyres blocks for the image (for InternVL)"},
+     )
+     image_root: Optional[str] = field(
+         default=None,
+         metadata={"help": "Root directory of the image"},
+     )
+
+
+ @dataclass
+ class GRPOModelConfig(ModelConfig):
+     freeze_vision_modules: bool = False
+
+
+ SYSTEM_PROMPT = (
+     "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
+     "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
+     "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
+     "<think> reasoning process here </think><answer> answer here </answer>"
+ )
+
+
+ class LazySupervisedDataset(Dataset):
+     """A dataset class to process conversations with system, human, and GPT messages, including images."""
+
+     def __init__(self, data_path: str, script_args, question_template: str = None):
+         """
+         Initialize the dataset.
+
+         Args:
+             data_path (str): Path to the data file (.json or .yaml).
+             script_args: Arguments containing image_root and other configurations.
+             question_template (str, optional): Kept for compatibility, not used here.
+         """
+         super(LazySupervisedDataset, self).__init__()
+         self.script_args = script_args
+         self.list_data_dict = []
+         self.question_template = question_template  # Unused but kept for compatibility
+
+         # Load data based on file type
+         if data_path.endswith(".json"):
+             # Direct JSON file containing conversations
+             with open(data_path, "r") as json_file:
+                 self.list_data_dict = json.load(json_file)
+             print(f"Loaded {len(self.list_data_dict)} samples from {data_path}")
+         elif data_path.endswith(".yaml"):
+             # Original YAML-based loading (for backward compatibility)
+             with open(data_path, "r") as file:
+                 yaml_data = yaml.safe_load(file)
+                 datasets = yaml_data.get("datasets", [])
+                 for data in datasets:
+                     json_path = data.get("json_path")
+                     if json_path.endswith(".jsonl"):
+                         cur_data_dict = [json.loads(line.strip()) for line in open(json_path, "r")]
+                     elif json_path.endswith(".json"):
+                         with open(json_path, "r") as json_file:
+                             cur_data_dict = json.load(json_file)
+                     else:
+                         raise ValueError(f"Unsupported file type: {json_path}")
+                     self.list_data_dict.extend(cur_data_dict)
+             print(f"Loaded {len(self.list_data_dict)} samples from YAML config")
+         else:
+             raise ValueError(f"Unsupported file type: {data_path}")
+
+     def __len__(self):
+         """Return the number of samples in the dataset."""
+         return len(self.list_data_dict)
+
+     def __getitem__(self, i):
+         """
+         Retrieve a processed sample by index.
+
+         Args:
+             i (int): Index of the sample.
+
+         Returns:
+             dict: Contains 'image', 'prompt', and 'solution'.
+         """
+         example = self.list_data_dict[i]
+         conversations = example["conversations"]
+         images = example.get("images", [])
+         bbox = example.get("bbox", [])
+
+         # Extract messages (assuming one of each role)
+         try:
+             system_message = next(msg["value"] for msg in conversations if msg["from"] == "system")
+             human_message = next(msg["value"] for msg in conversations if msg["from"] == "human")
+             gpt_message = next(msg["value"] for msg in conversations if msg["from"] == "gpt")
+         except StopIteration:
+             raise ValueError("Conversation missing required system, human, or gpt message.")
+
+         # Handle image if present
+         image = None
+         image_root = self.script_args.image_root
+         if "<image>" in human_message and images:
+             image_path = os.path.join(image_root, images[0])
+             # Fallback: try another sample if image is missing
+             tries = 0
+             max_tries = 10
+             while tries < max_tries and not os.path.exists(image_path):
+                 print(f"Warning: Image {image_path} not found, selecting another sample")
+                 i = random.randint(0, len(self.list_data_dict) - 1)
+                 example = self.list_data_dict[i]
+                 conversations = example["conversations"]
+                 images = example.get("images", [])
+                 bbox = example.get("bbox", [])  # keep the solution in sync with the resampled example
+                 try:
+                     system_message = next(msg["value"] for msg in conversations if msg["from"] == "system")
+                     human_message = next(msg["value"] for msg in conversations if msg["from"] == "human")
+                     gpt_message = next(msg["value"] for msg in conversations if msg["from"] == "gpt")
+                 except StopIteration:
+                     tries += 1
+                     continue
+                 if "<image>" not in human_message or not images:
+                     image_path = None
+                     break
+                 image_path = os.path.join(image_root, images[0])
+                 tries += 1
+             if image_path and os.path.exists(image_path):
+                 image = Image.open(image_path).convert("RGB")
+             elif tries >= max_tries:
+                 print("Warning: No valid image found after max tries, proceeding without image")
+                 image = None
+         # PIL's Image.size is (width, height). Guard against image=None: the original
+         # unconditional resize would crash here when no valid image was found.
+         if image is not None:
+             width, height = image.size
+             resized_height, resized_width = smart_resize(height, width)
+             image = image.resize((resized_width, resized_height))
+             print(f"Image size: {image.size}")
+         # Construct user content with image if applicable
+         if image and "<image>" in human_message:
+             # Split human message around <image> placeholder
+             parts = human_message.split("<image>", 1)
+             user_content = []
+             if parts[0]:  # Text before <image>
+                 user_content.append({"type": "text", "text": parts[0]})
+             user_content.append({"type": "image"})  # Image placeholder
+             if len(parts) > 1 and parts[1]:  # Text after <image>
+                 user_content.append({"type": "text", "text": parts[1]})
+         else:
+             user_content = human_message  # Plain text if no image
+
+         # Build the prompt
+         prompt = [
+             {"role": "system", "content": system_message},
+             {"role": "user", "content": user_content}
+         ]
+
+         # Return processed sample
+         return {
+             "image": image,      # PIL Image or None
+             "prompt": prompt,    # List of messages for the model
+             "solution": bbox,    # ground-truth bbox consumed by the reward functions
+         }
+
+
+ def get_vlm_module(model_name_or_path):
+     if "qwen" in model_name_or_path.lower():
+         return Qwen2VLModule
+     elif "internvl" in model_name_or_path.lower():
+         return InvernVLModule
+     else:
+         raise ValueError(f"Unsupported model: {model_name_or_path}")
+
+ def main(script_args, training_args, model_args):
+     # Load the VLM module
+     vlm_module_cls = get_vlm_module(model_args.model_name_or_path)
+     # print("Module file:", vlm_module_cls.__module__)
+     # print("available attributes:", dir(vlm_module_cls))
+     # print("using vlm module:", vlm_module_cls.__name__)
+
+     # Load the reward functions
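+     # As the names suggest: "accuracy" scores the predicted point against the
+     # ground-truth bbox ("solution" above), "format" checks the output structure.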
+     reward_funcs_registry = {
+         "accuracy": vlm_module_cls.point_reward,
+         # "accuracy_v2": vlm_module_cls.point_reward_v2,
+         "format": vlm_module_cls.format_reward_rec,
+     }
+     reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]
+     print("reward_funcs:", reward_funcs)
+
+     # Load the dataset
+     dataset = LazySupervisedDataset(script_args.dataset_name, script_args, question_template=vlm_module_cls.get_question_template(task_type="rec"))
+
+     trainer_cls = Qwen2VLGRPOTrainer
+     print('-' * 100)
+     print(script_args.max_pixels)
+     print(script_args.min_pixels)
+     print('-' * 100)
+     # Initialize the GRPO trainer
+     trainer = trainer_cls(
+         model=model_args.model_name_or_path,
+         reward_funcs=reward_funcs,
+         args=training_args,
+         train_dataset=dataset,
+         eval_dataset=None,
+         peft_config=get_peft_config(model_args),
+         max_pixels=script_args.max_pixels,
+         min_pixels=script_args.min_pixels,
+         attn_implementation=model_args.attn_implementation,
+     )
+
+     # Train and push the model to the Hub
+     trainer.train()
+
+     # Save and push to hub
+     trainer.save_model(training_args.output_dir)
+     if training_args.push_to_hub:
+         trainer.push_to_hub(dataset_name=script_args.dataset_name)
+
+
+ if __name__ == "__main__":
+     parser = TrlParser((GRPOScriptArguments, GRPOConfig, GRPOModelConfig))
+     script_args, training_args, model_args = parser.parse_args_and_config()
+     main(script_args, training_args, model_args)
open-r1-multimodal/src/open_r1/grpo_jsonl.py ADDED
@@ -0,0 +1,649 @@
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ import re
+ import pathlib
+ from datetime import datetime
+ from dataclasses import dataclass, field
+ from typing import Optional
+ from babel.numbers import parse_decimal
+ from utils.math import compute_score
+ from datasets import load_dataset, load_from_disk
+ from transformers import Qwen2VLForConditionalGeneration
+
+ from math_verify import parse, verify
+ from open_r1.trainer import VLMGRPOTrainer, GRPOConfig
+ from trl import ModelConfig, ScriptArguments, TrlParser, get_peft_config
+ import PIL
+ from Levenshtein import ratio
+ from open_r1.utils.pycocotools.coco import COCO
+ from open_r1.utils.pycocotools.cocoeval import COCOeval
+ import json
+
+ from open_r1.vlm_modules import *
+
+ # ----------------------- Fix the flash attention bug in the current version of transformers -----------------------
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLVisionFlashAttention2, apply_rotary_pos_emb_flashatt, flash_attn_varlen_func
+ import torch
+ from typing import Tuple
+ from transformers.utils import logging
+
+ from openai import OpenAI
+
+ logger = logging.get_logger(__name__)
+
+ client = OpenAI(
+     api_key=os.getenv("OPENAI_API_KEY", "sk-proj-1234567890"),
+     base_url=os.getenv("OPENAI_API_BASE", "https://api.openai.com/v1")
+ )
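+ # Used only by llm_reward() below; point OPENAI_API_BASE at any OpenAI-compatible
+ # endpoint -- the hard-coded model name "qwen2.5:7b" assumes such a local server.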
+
+ def custom_forward(
+     self,
+     hidden_states: torch.Tensor,
+     cu_seqlens: torch.Tensor,
+     rotary_pos_emb: Optional[torch.Tensor] = None,
+     position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ ) -> torch.Tensor:
+     seq_length = hidden_states.shape[0]
+     q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
+     if position_embeddings is None:
+         logger.warning_once(
+             "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+             "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
+             "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
+             "removed and `position_embeddings` will be mandatory."
+         )
+         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
+         cos = emb.cos().float()
+         sin = emb.sin().float()
+     else:
+         cos, sin = position_embeddings
+         # Add this: cast to float32 before the flash-attention rotary kernel
+         cos = cos.to(torch.float)
+         sin = sin.to(torch.float)
+     q, k = apply_rotary_pos_emb_flashatt(q.unsqueeze(0), k.unsqueeze(0), cos, sin)
+     q = q.squeeze(0)
+     k = k.squeeze(0)
+
+     max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+     attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
+         seq_length, -1
+     )
+     attn_output = self.proj(attn_output)
+     return attn_output
+
+ Qwen2_5_VLVisionFlashAttention2.forward = custom_forward
+
+ @dataclass
+ class GRPOScriptArguments(ScriptArguments):
+     """
+     Script arguments for the GRPO training script.
+     """
+     data_file_paths: str = field(
+         default=None,
+         metadata={"help": "Paths to data files, separated by ':'"},
+     )
+     image_folders: str = field(
+         default=None,
+         metadata={"help": "Paths to image folders, separated by ':'"},
+     )
+     arrow_cache_dir: str = field(
+         default=None,
+         metadata={"help": "Path to arrow cache directory"},
+     )
+     val_split_ratio: float = field(
+         default=0.0,
+         metadata={"help": "Ratio of validation split, default 0.0"},
+     )
+     reward_funcs: list[str] = field(
+         default_factory=lambda: ["accuracy", "format"],
+         metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"},
+     )
+     max_pixels: Optional[int] = field(
+         default=12845056,
+         metadata={"help": "Maximum number of pixels for the image (for QwenVL)"},
+     )
+     min_pixels: Optional[int] = field(
+         default=3136,
+         metadata={"help": "Minimum number of pixels for the image (for QwenVL)"},
+     )
+     max_anyres_num: Optional[int] = field(
+         default=12,
+         metadata={"help": "Maximum number of anyres blocks for the image (for InternVL)"},
+     )
+     reward_method: Optional[str] = field(
+         default=None,
+         metadata={
+             "help": "Choose reward method: 'default', 'mcp', ..."
+         },
+     )
+
+ def extract_choice(text):
+     # 1. Clean and normalize text
+     text = text.upper()  # Convert to uppercase
+     text = re.sub(r'\s+', ' ', text)  # Normalize spaces
+
+     # 2. Choice should not have uppercase letters before or after
+     choices = re.findall(r'(?<![A-Z])([A-Z])(?=[\.\,\?\!\:\;]|$)', text)
+
+     if not choices:
+         return None
+
+     # 3. If only one choice, return it directly
+     if len(choices) == 1:
+         return choices[0]
+
+     # 4. If multiple choices, use heuristic rules
+     choice_scores = {choice: 0 for choice in choices}
+
+     # 4.1 Keywords around choices get points
+     keywords = [
+         '答案', '选择', '正确', '是', '对',  # Chinese: answer, choose, correct, is, right
+         'answer', 'correct', 'choose', 'select', 'right',
+         '认为', '应该', '觉得', 'think', 'believe', 'should'  # Chinese: think, should, feel
+     ]
+
+     # Get context for each choice (20 chars before and after)
+     for choice in choices:
+         pos = text.find(choice)
+         context = text[max(0, pos-20):min(len(text), pos+20)]
+
+         # Add points for keywords
+         for keyword in keywords:
+             if keyword.upper() in context:
+                 choice_scores[choice] += 1
+
+         # Add points if choice is near the end (usually final answer)
+         if pos > len(text) * 0.7:  # In last 30% of text
+             choice_scores[choice] += 2
+
+         # Add points if followed by punctuation
+         if pos < len(text) - 1 and text[pos+1] in '。.!!,,':
+             choice_scores[choice] += 1
+
+     # Return highest scoring choice
+     return max(choice_scores.items(), key=lambda x: x[1])[0]
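+ # Example: extract_choice("I think the answer is B.") -> "B". A single candidate is
+ # returned directly; otherwise the keyword/position heuristics above break the tie.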
+
+ def evaluate_answer_similarity(student_answer, ground_truth):
+     """Use an LLM to evaluate answer similarity."""
+     try:
+         response = client.chat.completions.create(
+             model="qwen2.5:7b",
+             messages=[
+                 {
+                     "role": "user",
+                     "content": "You are an evaluation expert. First, analyze the student's response to identify and extract their final answer. Then, compare the extracted answer with the correct solution. Output ONLY '1.0' if the extracted answer matches the correct solution in meaning, or '0.0' if the student's response does not contain a clear or correct answer. No other output is allowed."
+                 },
+                 {
+                     "role": "user",
+                     "content": f"Student's response: {student_answer}\nCorrect solution: {ground_truth}\nOutput only 1.0 or 0.0:"
+                 }
+             ],
+             temperature=0
+         )
+         result = response.choices[0].message.content.strip()
+         return float(result)
+
+     except Exception as e:
+         print(f"Error in GPT evaluation: {e}")
+         # If the API call fails, fall back to simple text matching
+         return 1.0 if student_answer == ground_truth else 0.0
+
+ def llm_reward(content, sol, **kwargs):
+     # Extract answer from solution if it has think/answer tags
+     sol_match = re.search(r'<answer>(.*?)</answer>', sol)
+     ground_truth = sol_match.group(1).strip() if sol_match else sol.strip()
+
+     # Extract answer from content if it has think/answer tags
+     content_matches = re.findall(r'<answer>(.*?)</answer>', content, re.DOTALL)
+     student_answer = content_matches[-1].strip() if content_matches else content.strip()
+     return evaluate_answer_similarity(student_answer, ground_truth)
+
+ def mcq_reward(content, sol, **kwargs):
+     # For multiple choice, extract and compare choices
+     has_choices = extract_choice(sol)
+     correct_choice = has_choices.upper() if has_choices else sol.strip()
+
+     # Extract answer from content if it has think/answer tags
+     content_match = re.search(r'<answer>(.*?)</answer>', content, re.DOTALL)
+     student_answer = content_match.group(1).strip() if content_match else content.strip()
+     student_choice = extract_choice(student_answer)
+     if student_choice:
+         reward = 1.0 if student_choice == correct_choice else 0.0
+     else:
+         reward = 0.0
+
+     return reward
+
+
+ def yes_no_reward(content, sol, **kwargs):
+     content = content.lower()
+     sol = sol.lower()
+
+     # Extract answer from solution if it has think/answer tags
+     sol_match = re.search(r'<answer>(.*?)</answer>', sol)
+     ground_truth = sol_match.group(1).strip() if sol_match else sol.strip()
+
+     # Extract answer from content if it has think/answer tags
+     content_match = re.search(r'<answer>(.*?)</answer>', content, re.DOTALL)
+     student_answer = content_match.group(1).strip() if content_match else content.strip()
+
+     ground_yes_no = re.search(r'(yes|no)', ground_truth)
+     ground_yes_no = ground_yes_no.group(1) if ground_yes_no else ''
+     student_yes_no = re.search(r'(yes|no)', student_answer)
+     student_yes_no = student_yes_no.group(1) if student_yes_no else ''
+
+     reward = 1.0 if ground_yes_no == student_yes_no else 0.0
+
+     return reward
+
+ def calculate_map(pred_bbox_list, gt_bbox_list):
+     # Calculate mAP
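+     # Wraps predictions and ground truth in a synthetic one-image COCO dataset
+     # (fixed 2048x2048 canvas, score 1.0 for every prediction) and returns
+     # coco_eval.stats[1], i.e. AP at IoU=0.50.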
+
+     # Initialize COCO object for ground truth
+     gt_json = {"annotations": [], "images": [], "categories": []}
+     gt_json["images"] = [{
+         "id": 0,
+         "width": 2048,
+         "height": 2048,
+         "file_name": "image_0.jpg"
+     }]
+
+     gt_json["categories"] = []
+
+     cats2id = {}
+     cat_count = 0
+     for idx, gt_bbox in enumerate(gt_bbox_list):
+         if gt_bbox["label"] not in cats2id:
+             cats2id[gt_bbox["label"]] = cat_count
+             gt_json["categories"].append({
+                 "id": cat_count,
+                 "name": gt_bbox["label"]
+             })
+             cat_count += 1
+
+         gt_json["annotations"].append({
+             "id": idx+1,
+             "image_id": 0,
+             "category_id": cats2id[gt_bbox["label"]],
+             "bbox": [gt_bbox["bbox_2d"][0], gt_bbox["bbox_2d"][1], gt_bbox["bbox_2d"][2] - gt_bbox["bbox_2d"][0], gt_bbox["bbox_2d"][3] - gt_bbox["bbox_2d"][1]],
+             "area": (gt_bbox["bbox_2d"][2] - gt_bbox["bbox_2d"][0]) * (gt_bbox["bbox_2d"][3] - gt_bbox["bbox_2d"][1]),
+             "iscrowd": 0
+         })
+     coco_gt = COCO(gt_json)
+
+     dt_json = []
+     for idx, pred_bbox in enumerate(pred_bbox_list):
+         try:
+             dt_json.append({
+                 "image_id": 0,
+                 "category_id": cats2id[pred_bbox["label"]],
+                 "bbox": [pred_bbox["bbox_2d"][0], pred_bbox["bbox_2d"][1], pred_bbox["bbox_2d"][2] - pred_bbox["bbox_2d"][0], pred_bbox["bbox_2d"][3] - pred_bbox["bbox_2d"][1]],
+                 "score": 1.0,
+                 "area": (pred_bbox["bbox_2d"][2] - pred_bbox["bbox_2d"][0]) * (pred_bbox["bbox_2d"][3] - pred_bbox["bbox_2d"][1])
+             })
+         except Exception:
+             # Skip malformed predictions or labels absent from the ground truth
+             pass
+
+     if len(dt_json) == 0:
+         return 0.0
+
+     coco_dt = coco_gt.loadRes(dt_json)
+     coco_eval = COCOeval(coco_gt, coco_dt, "bbox")
+
+     coco_eval.evaluate()
+     coco_eval.accumulate()
+     coco_eval.summarize()
+     return coco_eval.stats[1]
+
+ def map_reward(content, sol, **kwargs):
+     """
+     Calculate mean average precision (mAP) reward between predicted and ground truth bounding boxes
+
+     Args:
+         content: String containing predicted bounding boxes in JSON format
+         sol: String containing ground truth bounding boxes in JSON format
+
+     Returns:
+         float: mAP reward score between 0 and 1
+     """
+     # Extract JSON content between ```json tags
+     pattern = r'```json(.*?)```'
+     json_match = re.search(pattern, sol, re.DOTALL)
+     bbox_json = json_match.group(1).strip() if json_match else None
+
+     # Parse ground truth JSON to get bbox list
+     gt_bbox_list = []
+     if bbox_json:
+         bbox_data = json.loads(bbox_json)
+         gt_bbox_list = [item for item in bbox_data]
+
+     # Parse predicted JSON to get bbox list
+     pred_bbox_list = []
+     json_match = re.search(pattern, content, re.DOTALL)
+     if json_match:
+         try:
+             bbox_data = json.loads(json_match.group(1).strip())
+             pred_bbox_list = [item for item in bbox_data]
+         except Exception:
+             # Return empty list if JSON parsing fails
+             pred_bbox_list = []
+
+     # Calculate mAP if both prediction and ground truth exist
+     if len(pred_bbox_list) > 0 and len(gt_bbox_list) > 0:
+         bbox_reward = calculate_map(pred_bbox_list, gt_bbox_list)
+     else:
+         bbox_reward = 0.0
+
+     return bbox_reward
+
+
+ def numeric_reward(content, sol, **kwargs):
+     content = clean_text(content)
+     sol = clean_text(sol)
+     try:
+         content, sol = float(content), float(sol)
+         return 1.0 if content == sol else 0.0
+     except (TypeError, ValueError):
+         return None
+
+ def math_reward(content, sol, **kwargs):
+     content = clean_text(content)
+     sol = clean_text(sol)
+     return compute_score(content, sol)
+
+ def clean_text(text, exclude_chars=['\n', '\r']):
+     # Extract content between <answer> and </answer> if present
+     answer_matches = re.findall(r'<answer>(.*?)</answer>', text, re.DOTALL)
+     if answer_matches:
+         # Use the last match
+         text = answer_matches[-1]
+
+     for char in exclude_chars:
+         if char in ['\n', '\r']:
+             # If there is a space before the newline, remove the newline
+             text = re.sub(r'(?<=\s)' + re.escape(char), '', text)
+             # If there is no space before the newline, replace it with a space
+             text = re.sub(r'(?<!\s)' + re.escape(char), ' ', text)
+         else:
+             text = text.replace(char, ' ')
+
+     # Remove leading and trailing spaces and convert to lowercase
+     return text.strip().rstrip('.').lower()
+
385
+ def default_accuracy_reward(content, sol, **kwargs):
386
+ reward = 0.0
387
+ # Extract answer from solution if it has think/answer tags
388
+ sol_match = re.search(r'<answer>(.*?)</answer>', sol)
389
+ ground_truth = sol_match.group(1).strip() if sol_match else sol.strip()
390
+
391
+ # Extract answer from content if it has think/answer tags
392
+ content_matches = re.findall(r'<answer>(.*?)</answer>', content, re.DOTALL)
393
+ student_answer = content_matches[-1].strip() if content_matches else content.strip()
394
+
395
+ # Try symbolic verification first for numeric answers
396
+ try:
397
+ answer = parse(student_answer)
398
+ if float(verify(answer, parse(ground_truth))) > 0:
399
+ reward = 1.0
400
+ except Exception:
401
+ pass # Continue to next verification method if this fails
402
+
403
+ # If symbolic verification failed, try string matching or fuzzy matching
404
+ if reward == 0.0:
405
+ try:
406
+ # Check if ground truth contains numbers
407
+ has_numbers = bool(re.search(r'\d', ground_truth))
408
+ # Check if it's a multiple choice question
409
+ has_choices = extract_choice(ground_truth)
410
+
411
+ if has_numbers:
412
+ # For numeric answers, use exact matching
413
+ reward = numeric_reward(student_answer, ground_truth)
414
+ if reward is None:
415
+ reward = ratio(clean_text(student_answer), clean_text(ground_truth))
416
+ elif has_choices:
417
+ # For multiple choice, extract and compare choices
418
+ correct_choice = has_choices.upper()
419
+ student_choice = extract_choice(student_answer)
420
+ if student_choice:
421
+ reward = 1.0 if student_choice == correct_choice else 0.0
422
+ else:
423
+ # For text answers, use fuzzy matching
424
+ reward = ratio(clean_text(student_answer), clean_text(ground_truth))
425
+ except Exception:
426
+ pass # Keep reward as 0.0 if all methods fail
427
+
428
+ return reward
429
+
430
+ def accuracy_reward(completions, solution, **kwargs):
431
+ """Reward function that checks if the completion is correct using symbolic verification, exact string matching, or fuzzy matching."""
432
+ contents = [completion[0]["content"] for completion in completions]
433
+ rewards = []
434
+ for content, sol, accu_reward_method in zip(contents, solution, kwargs.get("accu_reward_method")):
435
+ # if accu_reward_method is defined, use the corresponding reward function, otherwise use the default reward function
436
+ if accu_reward_method == "mcq":
437
+ reward = mcq_reward(content, sol)
438
+ elif accu_reward_method == 'yes_no':
439
+ reward = yes_no_reward(content, sol)
440
+ elif accu_reward_method == 'llm':
441
+ reward = llm_reward(content, sol)
442
+ elif accu_reward_method == 'map':
443
+ reward = map_reward(content, sol)
444
+ elif accu_reward_method == 'math':
445
+ reward = math_reward(content, sol)
446
+ else:
447
+ reward = default_accuracy_reward(content, sol)
448
+ rewards.append(reward)
449
+
450
+ if os.getenv("DEBUG_MODE") == "true":
451
+ log_path = os.getenv("LOG_PATH")
452
+ current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
453
+ image_path = kwargs.get("image_path")[0] if "image_path" in kwargs else None
454
+ problem = kwargs.get("problem")[0]
455
+ if reward <= 1.0: # this condition can be changed for debug
456
+ with open(log_path, "a", encoding='utf-8') as f:
457
+ f.write(f"------------- {current_time} Accuracy reward: {reward} -------------\n")
458
+ f.write(f"accu_reward_method: {accu_reward_method}\n")
459
+ f.write(f"image_path: {image_path}\n")
460
+ f.write(f"problem: {problem}\n")
461
+ f.write(f"Content: {content}\n")
462
+ f.write(f"Solution: {sol}\n")
463
+
464
+
465
+ return rewards
466
+
467
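A hedged sketch of how `accuracy_reward` is driven per sample; the `accu_reward_method` list arrives through `**kwargs`, and the values below are illustrative:

```python
# Assumes the helper functions defined earlier in this file (clean_text, ratio, ...).
completions = [[{"content": "<think>The toggle is on.</think><answer>yes</answer>"}]]
solution = ["<answer>yes</answer>"]
rewards = accuracy_reward(completions, solution, accu_reward_method=["default"])
# Expected: [1.0], via default_accuracy_reward's exact/fuzzy string path
```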
+
468
+ def format_reward(completions, **kwargs):
469
+ """Reward function that checks if the completion has a specific format."""
470
+ pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
471
+ completion_contents = [completion[0]["content"] for completion in completions]
472
+ matches = [re.fullmatch(pattern, content, re.DOTALL) for content in completion_contents]
473
+
474
+ current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
475
+ if os.getenv("DEBUG_MODE") == "true":
476
+ log_path = os.getenv("LOG_PATH")
477
+ with open(log_path.replace(".txt", "_format.txt"), "a", encoding='utf-8') as f:
478
+ f.write(f"------------- {current_time} Format reward -------------\n")
479
+ for content, match in zip(completion_contents, matches):
480
+ f.write(f"Content: {content}\n")
481
+ f.write(f"Has format: {bool(match)}\n")
482
+
483
+ return [1.0 if match else 0.0 for match in matches]
484
+
485
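The pattern above requires the whole completion to be a single think/answer pair; a quick check with illustrative strings:

```python
import re

pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
good = "<think>locate the button</think> <answer>(10, 20)</answer>"
bad = "answer: (10, 20)"
assert re.fullmatch(pattern, good, re.DOTALL) is not None
assert re.fullmatch(pattern, bad, re.DOTALL) is None
```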
+
486
+ reward_funcs_registry = {
487
+ "accuracy": accuracy_reward,
488
+ "format": format_reward,
489
+ }
490
+
491
+ @dataclass
492
+ class GRPOModelConfig(ModelConfig):
493
+ freeze_vision_modules: bool = False
494
+
495
+ SYSTEM_PROMPT = (
496
+ "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
497
+ "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
498
+ "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
499
+ "<think> reasoning process here </think><answer> answer here </answer>"
500
+ )
501
+
502
+
503
+ def get_vlm_module(model_name_or_path):
504
+ if "qwen" in model_name_or_path.lower():
505
+ return Qwen2VLModule
506
+ elif "internvl" in model_name_or_path.lower():
507
+ return InvernVLModule
508
+ else:
509
+ raise ValueError(f"Unsupported model: {model_name_or_path}")
510
+
511
+ def main(script_args, training_args, model_args):
512
+ # Load the VLM module
513
+ vlm_module_cls = get_vlm_module(model_args.model_name_or_path)
514
+ print("using vlm module:", vlm_module_cls.__name__)
515
+ question_prompt = vlm_module_cls.get_question_template(task_type="default")
516
+
517
+ # Get reward functions
518
+ reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]
519
+ print("reward_funcs:", reward_funcs)
520
+
521
+ # Load the JSONL datasets
522
+ import json
523
+ from datasets import Dataset
524
+
525
+ data_files = script_args.data_file_paths.split(":")
526
+ image_folders = script_args.image_folders.split(":")
527
+
528
+ if len(data_files) != len(image_folders):
529
+ raise ValueError("Number of data files must match number of image folders")
530
+
531
+ if script_args.reward_method is None:
532
+ accu_reward_methods = ["default"] * len(data_files)
533
+ else:
534
+ accu_reward_methods = script_args.reward_method.split(":")
535
+ assert len(accu_reward_methods) == len(data_files), f"Number of reward methods must match number of data files: {len(accu_reward_methods)} != {len(data_files)}"
536
+
537
+

540
+
541
+ all_data = []
542
+ for data_file, image_folder, accu_reward_method in zip(data_files, image_folders, accu_reward_methods):
543
+ with open(data_file, 'r') as f:
544
+ for line in f:
545
+ item = json.loads(line)
546
+ if 'image' in item:
547
+ if isinstance(item['image'], str):
548
+ # Store image path instead of loading the image
549
+ item['image_path'] = [os.path.join(image_folder, item['image'])]
550
+ del item['image'] # remove the image column so that it can be loaded later
551
+ elif isinstance(item['image'], list):
552
+ # if the image is a list, then it is a list of images (for multi-image input)
553
+ item['image_path'] = [os.path.join(image_folder, image) for image in item['image']]
554
+ del item['image'] # remove the image column so that it can be loaded later
555
+ else:
556
+ raise ValueError(f"Unsupported image type: {type(item['image'])}")
557
+ # Remove immediate image loading
558
+ item['problem'] = item['conversations'][0]['value'].replace('<image>', '')
559
+
560
+ # Handle solution that could be a float or string
561
+ solution_value = item['conversations'][1]['value']
562
+ if isinstance(solution_value, str):
563
+ item['solution'] = solution_value.replace('<answer>', '').replace('</answer>', '').strip()
564
+ else:
565
+ # If it's a float or other non-string type, keep it as is
566
+ item['solution'] = str(solution_value)
567
+
568
+ del item['conversations']
569
+ item['accu_reward_method'] = item.get('accu_reward_method', accu_reward_method) # if accu_reward_method is in the data jsonl, use the value in the data jsonl, otherwise use the defined value
570
+ all_data.append(item)
571
+
572
+ dataset = Dataset.from_list(all_data)
573
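For reference, a hypothetical JSONL record in the shape the loop above expects (field names match the loader; file names and values are invented):

```python
record = {
    "image": "screens/0001.png",  # or a list of paths for multi-image samples
    "conversations": [
        {"from": "human", "value": "<image>Click the Run button."},
        {"from": "gpt", "value": "<answer>(112, 86)</answer>"},
    ],
}
```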
+
574
+ def make_conversation_from_jsonl(example):
575
+ if 'image_path' in example and example['image_path'] is not None:
576
+ # Don't load image here, just store the path
577
+ return {
578
+ 'image_path': [p for p in example['image_path']], # Store path instead of loaded image
579
+ 'problem': example['problem'],
580
+ 'solution': f"<answer> {example['solution']} </answer>",
581
+ 'accu_reward_method': example['accu_reward_method'],
582
+ 'prompt': [{
583
+ 'role': 'user',
584
+ 'content': [
585
+ *({'type': 'image', 'text': None} for _ in range(len(example['image_path']))),
586
+ {'type': 'text', 'text': question_prompt.format(Question=example['problem'])}
587
+ ]
588
+ }]
589
+ }
590
+ else:
591
+ return {
592
+ 'problem': example['problem'],
593
+ 'solution': f"<answer> {example['solution']} </answer>",
594
+ 'accu_reward_method': example['accu_reward_method'],
595
+ 'prompt': [{
596
+ 'role': 'user',
597
+ 'content': [
598
+ {'type': 'text', 'text': question_prompt.format(Question=example['problem'])}
599
+ ]
600
+ }]
601
+ }
602
+
603
+ # Map the conversations
604
+ dataset = dataset.map(make_conversation_from_jsonl, num_proc=8)
605
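Illustrative output of `make_conversation_from_jsonl` for a two-image sample (the text value is whatever `question_prompt.format(...)` produces):

```python
{
    "image_path": ["imgs/a.png", "imgs/b.png"],
    "problem": "Click the Run button.",
    "solution": "<answer> (112, 86) </answer>",
    "accu_reward_method": "default",
    "prompt": [{
        "role": "user",
        "content": [
            {"type": "image", "text": None},
            {"type": "image", "text": None},
            {"type": "text", "text": "<formatted question prompt>"},
        ],
    }],
}
```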
+
606
+ # Split dataset for validation if requested
607
+ splits = {'train': dataset}
608
+ if script_args.val_split_ratio > 0:
609
+ train_val_split = dataset.train_test_split(
610
+ test_size=script_args.val_split_ratio
611
+ )
612
+ splits['train'] = train_val_split['train']
613
+ splits['validation'] = train_val_split['test']
614
+
615
+ # Select trainer class based on vlm_trainer argument
616
+ trainer_cls = VLMGRPOTrainer
617
+ print("using trainer:", trainer_cls.__name__)
618
+
619
+ # Initialize the GRPO trainer
620
+ trainer = trainer_cls(
621
+ model=model_args.model_name_or_path,
622
+ reward_funcs=reward_funcs,
623
+ args=training_args,
624
+ vlm_module=vlm_module_cls(),
625
+ train_dataset=splits['train'],
626
+ eval_dataset=splits.get('validation') if training_args.eval_strategy != "no" else None,
627
+ peft_config=get_peft_config(model_args),
628
+ freeze_vision_modules=model_args.freeze_vision_modules,
629
+ attn_implementation=model_args.attn_implementation,
630
+ max_pixels=script_args.max_pixels,
631
+ min_pixels=script_args.min_pixels,
632
+ )
633
+
634
+ # Train and push the model to the Hub
635
+ if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
636
+ trainer.train(resume_from_checkpoint=True)
637
+ else:
638
+ trainer.train()
639
+
640
+ # Save and push to hub
641
+ trainer.save_model(training_args.output_dir)
642
+ if training_args.push_to_hub:
643
+ trainer.push_to_hub()
644
+
645
+
646
+ if __name__ == "__main__":
647
+ parser = TrlParser((GRPOScriptArguments, GRPOConfig, GRPOModelConfig))
648
+ script_args, training_args, model_args = parser.parse_args_and_config()
649
+ main(script_args, training_args, model_args)
open-r1-multimodal/src/open_r1/grpo_rec.py ADDED
@@ -0,0 +1,291 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # import debugpy
16
+ # try:
17
+ # # 5678 is the default attach port in the VS Code debug configurations. Unless a host and port are specified, host defaults to 127.0.0.1
18
+ # debugpy.listen(("localhost", 9501))
19
+ # print("Waiting for debugger attach")
20
+ # debugpy.wait_for_client()
21
+ # except Exception as e:
22
+ # pass
23
+
24
+ import os
25
+ import re
26
+ from datetime import datetime
27
+ from dataclasses import dataclass, field
28
+ from typing import Optional
29
+
30
+ from PIL import Image
31
+ from torch.utils.data import Dataset
32
+ from transformers import Qwen2VLForConditionalGeneration
33
+
34
+ from math_verify import parse, verify
35
+ from open_r1.trainer import VLMGRPOTrainer, GRPOConfig
36
+ from open_r1.vlm_modules import *
37
+ from trl import ModelConfig, ScriptArguments, TrlParser, get_peft_config
38
+ from transformers import TrainingArguments
39
+ import yaml
40
+ import json
41
+ import random
42
+ import math
43
+
44
+ # ----------------------- Fix the flash attention bug in the current version of transformers -----------------------
45
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLVisionFlashAttention2, apply_rotary_pos_emb_flashatt, flash_attn_varlen_func
46
+ import torch
47
+ from typing import Tuple
+ from transformers.utils import logging
+ logger = logging.get_logger(__name__)  # `logger` is used by the warning in custom_forward below
48
+ def custom_forward(
49
+ self,
50
+ hidden_states: torch.Tensor,
51
+ cu_seqlens: torch.Tensor,
52
+ rotary_pos_emb: Optional[torch.Tensor] = None,
53
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
54
+ ) -> torch.Tensor:
55
+ seq_length = hidden_states.shape[0]
56
+ q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
57
+ # print(111, 222, 333, 444, 555, 666, 777, 888, 999)
58
+ if position_embeddings is None:
59
+ logger.warning_once(
60
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
61
+ "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
62
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
63
+ "removed and `position_embeddings` will be mandatory."
64
+ )
65
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
66
+ cos = emb.cos().float()
67
+ sin = emb.sin().float()
68
+ else:
69
+ cos, sin = position_embeddings
70
+ # Add this
71
+ cos = cos.to(torch.float)
72
+ sin = sin.to(torch.float)
73
+ q, k = apply_rotary_pos_emb_flashatt(q.unsqueeze(0), k.unsqueeze(0), cos, sin)
74
+ q = q.squeeze(0)
75
+ k = k.squeeze(0)
76
+
77
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
78
+ attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
79
+ seq_length, -1
80
+ )
81
+ attn_output = self.proj(attn_output)
82
+ return attn_output
83
+
84
+ Qwen2_5_VLVisionFlashAttention2.forward = custom_forward
85
+
86
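A quick sanity check that the monkey-patch above took effect (safe to run right after the assignment):

```python
assert Qwen2_5_VLVisionFlashAttention2.forward is custom_forward
```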
+
87
+ # ----------------------- Main Script -----------------------
88
+ @dataclass
89
+ class GRPOScriptArguments(ScriptArguments):
90
+ """
91
+ Script arguments for the GRPO training script.
92
+
93
+ Args:
94
+ reward_funcs (`list[str]`):
95
+ List of reward functions. Possible values: 'accuracy', 'format'.
96
+ """
97
+
98
+ reward_funcs: list[str] = field(
99
+ default_factory=lambda: ["accuracy", "format"],
100
+ metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"},
101
+ )
102
+ max_pixels: Optional[int] = field(
103
+ default=3512320,
104
+ metadata={"help": "Maximum number of pixels for the image (for QwenVL)"},
105
+ )
106
+ min_pixels: Optional[int] = field(
107
+ default=3136,
108
+ metadata={"help": "Minimum number of pixels for the image (for QwenVL)"},
109
+ )
110
+ max_anyres_num: Optional[int] = field(
111
+ default=12,
112
+ metadata={"help": "Maximum number of anyres blocks for the image (for InternVL)"},
113
+ )
114
+ image_root: Optional[str] = field(
115
+ default=None,
116
+ metadata={"help": "Root directory of the image"},
117
+ )
118
+
119
+ @dataclass
120
+ class GRPOModelConfig(ModelConfig):
121
+ freeze_vision_modules: bool = False
122
+
123
+
124
+ SYSTEM_PROMPT = (
125
+ "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
126
+ "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
127
+ "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
128
+ "<think> reasoning process here </think><answer> answer here </answer>"
129
+ )
130
+
131
+ class LazySupervisedDataset(Dataset):
132
+ def __init__(self, data_path: str, script_args: GRPOScriptArguments, question_template: str):
133
+ super(LazySupervisedDataset, self).__init__()
134
+ self.script_args = script_args
135
+ self.list_data_dict = []
136
+ self.question_template = question_template
137
+
138
+ if data_path.endswith(".yaml"):
139
+ with open(data_path, "r") as file:
140
+ yaml_data = yaml.safe_load(file)
141
+ datasets = yaml_data.get("datasets")
142
+ # file should be in the format of:
143
+ # datasets:
144
+ # - json_path: xxxx1.json
145
+ # sampling_strategy: first:1000
146
+ # - json_path: xxxx2.json
147
+ # sampling_strategy: end:3000
148
+ # - json_path: xxxx3.json
149
+ # sampling_strategy: random:999
150
+
151
+ for data in datasets:
152
+ json_path = data.get("json_path")
153
+ sampling_strategy = data.get("sampling_strategy", "all")
154
+ sampling_number = None
155
+
156
+ if json_path.endswith(".jsonl"):
157
+ cur_data_dict = []
158
+ with open(json_path, "r") as json_file:
159
+ for line in json_file:
160
+ cur_data_dict.append(json.loads(line.strip()))
161
+ elif json_path.endswith(".json"):
162
+ with open(json_path, "r") as json_file:
163
+ cur_data_dict = json.load(json_file)
164
+ else:
165
+ raise ValueError(f"Unsupported file type: {json_path}")
166
+
167
+ if ":" in sampling_strategy:
168
+ sampling_strategy, sampling_number = sampling_strategy.split(":")
169
+ if "%" in sampling_number:
170
+ sampling_number = math.ceil(int(sampling_number.split("%")[0]) * len(cur_data_dict) / 100)
171
+ else:
172
+ sampling_number = int(sampling_number)
173
+
174
+ # Apply the sampling strategy
175
+ if sampling_strategy == "first" and sampling_number is not None:
176
+ cur_data_dict = cur_data_dict[:sampling_number]
177
+ elif sampling_strategy == "end" and sampling_number is not None:
178
+ cur_data_dict = cur_data_dict[-sampling_number:]
179
+ elif sampling_strategy == "random" and sampling_number is not None:
180
+ random.shuffle(cur_data_dict)
181
+ cur_data_dict = cur_data_dict[:sampling_number]
182
+ print(f"Loaded {len(cur_data_dict)} samples from {json_path}")
183
+ self.list_data_dict.extend(cur_data_dict)
184
+ else:
185
+ raise ValueError(f"Unsupported file type: {data_path}")
186
+
187
+ def __len__(self):
188
+ return len(self.list_data_dict)
189
+
190
+ def __getitem__(self, i):
191
+ # Format into conversation
192
+ def make_conversation(example):
193
+ return {
194
+ "prompt": [
195
+ {"role": "system", "content": SYSTEM_PROMPT},
196
+ {"role": "user", "content": example["problem"]},
197
+ ],
198
+ }
199
+ QUESTION_TEMPLATE = self.question_template
200
+ def make_conversation_image(example):
201
+ return {
202
+ "prompt": [
203
+ # {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
204
+ {
205
+ "role": "user",
206
+ "content": [
207
+ {"type": "image"},
208
+ {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
209
+ ],
210
+ },
211
+ ],
212
+ }
213
+
214
+ example = self.list_data_dict[i]
215
+ image_root = self.script_args.image_root
216
+ if 'image' in example:
217
+ image_path = os.path.join(image_root, example['image'])
218
+ # In case the image is not found
219
+ while not os.path.exists(image_path):
220
+ print(f"Warning: Image {image_path} not found, randomly selecting another image")
221
+ new_index = random.randint(0, len(self.list_data_dict)-1)
222
+ example = self.list_data_dict[new_index]
223
+ image_path = os.path.join(image_root, example['image'])
224
+ image = Image.open(image_path).convert("RGB")
225
+ else:
226
+ image = None
227
+
228
+
229
+ return {
230
+ 'image': image,
231
+ 'problem': example['problem'],
232
+ 'solution': example['solution'],
233
+ 'prompt': make_conversation_image(example)['prompt'] if 'image' in example else make_conversation(example)['prompt'],
234
+ }
235
+
236
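A small sketch of how the `sampling_strategy` strings above are parsed (the dataset size of 57 is an invented example):

```python
import math

strategy = "random:10%"
name, number = strategy.split(":")
n = math.ceil(int(number.split("%")[0]) * 57 / 100)
assert (name, n) == ("random", 6)  # keep 6 of 57 samples after shuffling
```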
+
237
+ def get_vlm_module(model_name_or_path):
238
+ if "qwen" in model_name_or_path.lower():
239
+ return Qwen2VLModule
240
+ elif "internvl" in model_name_or_path.lower():
241
+ return InvernVLModule
242
+ else:
243
+ raise ValueError(f"Unsupported model: {model_name_or_path}")
244
+
245
+ def main(script_args, training_args, model_args):
246
+ # Load the VLM module
247
+ vlm_module_cls = get_vlm_module(model_args.model_name_or_path)
248
+ print("using vlm module:", vlm_module_cls.__name__)
249
+
250
+ # Load the reward functions
251
+ reward_funcs_registry = {
252
+ "accuracy": vlm_module_cls.iou_reward,
253
+ "format": vlm_module_cls.format_reward_rec,
254
+ }
255
+ reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]
256
+ print("reward_funcs:", reward_funcs)
257
+
258
+ # Load the dataset
259
+ dataset = LazySupervisedDataset(script_args.dataset_name, script_args, question_template=vlm_module_cls.get_question_template(task_type="rec"))
260
+
261
+ trainer_cls = VLMGRPOTrainer
262
+ # Initialize the GRPO trainer
263
+ trainer = trainer_cls(
264
+ model=model_args.model_name_or_path,
265
+ reward_funcs=reward_funcs,
266
+ args=training_args,
267
+ vlm_module=vlm_module_cls(),
268
+ train_dataset=dataset,
269
+ eval_dataset=None,
270
+ peft_config=get_peft_config(model_args),
271
+ freeze_vision_modules=model_args.freeze_vision_modules,
272
+ attn_implementation=model_args.attn_implementation,
273
+ max_pixels=script_args.max_pixels,
274
+ min_pixels=script_args.min_pixels,
275
+ max_anyres_num=script_args.max_anyres_num,
276
+ torch_dtype=model_args.torch_dtype,
277
+ )
278
+
279
+ # Train and push the model to the Hub
280
+ trainer.train()
281
+
282
+ # Save and push to hub
283
+ trainer.save_model(training_args.output_dir)
284
+ if training_args.push_to_hub:
285
+ trainer.push_to_hub(dataset_name=script_args.dataset_name)
286
+
287
+
288
+ if __name__ == "__main__":
289
+ parser = TrlParser((GRPOScriptArguments, GRPOConfig, GRPOModelConfig))
290
+ script_args, training_args, model_args = parser.parse_args_and_config()
291
+ main(script_args, training_args, model_args)
open-r1-multimodal/src/open_r1/trainer/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (487 Bytes).
 
open-r1-multimodal/src/open_r1/trainer/__pycache__/grpo_config.cpython-310.pyc ADDED
Binary file (13 kB).
 
open-r1-multimodal/src/open_r1/trainer/__pycache__/grpo_trainer.cpython-310.pyc ADDED
Binary file (27.3 kB).
 
open-r1-multimodal/src/open_r1/trainer/__pycache__/qwen_grpo_trainer.cpython-310.pyc ADDED
Binary file (26.7 kB).
 
open-r1-multimodal/src/open_r1/trainer/grpo_config.py ADDED
@@ -0,0 +1,286 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from dataclasses import dataclass, field
16
+ from typing import Optional
17
+
18
+ from transformers import TrainingArguments
19
+
20
+
21
+ @dataclass
22
+ class GRPOConfig(TrainingArguments):
23
+ r"""
24
+ Configuration class for the [`GRPOTrainer`].
25
+
26
+ Only the parameters specific to GRPO training are listed here. For details on other parameters, refer to the
27
+ [`~transformers.TrainingArguments`] documentation.
28
+
29
+ Using [`~transformers.HfArgumentParser`] we can turn this class into
30
+ [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
31
+ command line.
32
+
33
+ Parameters:
34
+ > Parameters that control the model and reference model
35
+
36
+ model_init_kwargs (`dict[str, Any]` or `None`, *optional*, defaults to `None`):
37
+ Keyword arguments for [`~transformers.AutoModelForCausalLM.from_pretrained`], used when the `model`
38
+ argument of the [`GRPOTrainer`] is provided as a string.
39
+
40
+ > Parameters that control the data preprocessing
41
+
42
+ remove_unused_columns (`bool`, *optional*, defaults to `False`):
43
+ Whether to only keep the column `"prompt"` in the dataset. If you use a custom reward function that
44
+ requires any column other than `"prompts"` and `"completions"`, you should keep this to `False`.
45
+ max_prompt_length (`int` or `None`, *optional*, defaults to `512`):
46
+ Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left.
47
+ num_generations (`int` or `None`, *optional*, defaults to `8`):
48
+ Number of generations per prompt to sample. The global batch size (num_processes * per_device_batch_size)
49
+ must be divisible by this value.
50
+ temperature (`float`, *optional*, defaults to `0.9`):
51
+ Temperature for sampling. The higher the temperature, the more random the completions.
52
+ max_completion_length (`int` or `None`, *optional*, defaults to `256`):
53
+ Maximum length of the generated completion.
54
+ ds3_gather_for_generation (`bool`, *optional*, defaults to `True`):
55
+ This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for generation,
56
+ improving generation speed. However, disabling this option allows training models that exceed the VRAM
57
+ capacity of a single GPU, albeit at the cost of slower generation. Disabling this option is not compatible
58
+ with vLLM generation.
59
+
60
+ > Parameters that control generation acceleration powered by vLLM
61
+
62
+ use_vllm (`bool`, *optional*, defaults to `False`):
63
+ Whether to use vLLM for generating completions. If set to `True`, ensure that a GPU is kept unused for
64
+ training, as vLLM will require one for generation. vLLM must be installed (`pip install vllm`).
65
+ vllm_device (`str`, *optional*, defaults to `"auto"`):
66
+ Device where vLLM generation will run, e.g. `"cuda:1"`. If set to `"auto"` (default), the system will
67
+ automatically select the next available GPU after the last one used for training. This assumes that
68
+ training has not already occupied all available GPUs. If only one device is available, the device will be
69
+ shared between both training and vLLM.
70
+ vllm_gpu_memory_utilization (`float`, *optional*, defaults to `0.9`):
71
+ Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV cache on the
72
+ device dedicated to generation powered by vLLM. Higher values will increase the KV cache size and thus
73
+ improve the model's throughput. However, if the value is too high, it may cause out-of-memory (OOM) errors
74
+ during initialization.
75
+ vllm_dtype (`str`, *optional*, defaults to `"auto"`):
76
+ Data type to use for vLLM generation. If set to `"auto"`, the data type will be automatically determined
77
+ based on the model configuration. Find the supported values in the vLLM documentation.
78
+ vllm_max_model_len (`int` or `None`, *optional*, defaults to `None`):
79
+ If set, the `max_model_len` to use for vLLM. This could be useful when running with reduced
80
+ `vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model
81
+ context size, which might be much larger than the KV cache, leading to inefficiencies.
82
+ vllm_enable_prefix_caching (`bool`, *optional*, defaults to `True`):
83
+ Whether to enable prefix caching in vLLM. If set to `True` (default), ensure that the model and the hardware
84
+ support this feature.
85
+ vllm_guided_decoding_regex (`str` or `None`, *optional*, defaults to `None`):
86
+ Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled.
87
+
88
+ > Parameters that control the training
89
+
90
+ learning_rate (`float`, *optional*, defaults to `1e-6`):
91
+ Initial learning rate for [`AdamW`] optimizer. The default value replaces that of
92
+ [`~transformers.TrainingArguments`].
93
+ beta (`float`, *optional*, defaults to `0.004`):
94
+ KL coefficient. If `0.0`, the reference model is not loaded, reducing memory usage and improving training
95
+ speed.
96
+ num_iterations (`int`, *optional*, defaults to `1`):
97
+ Number of iterations per batch (denoted as μ in the algorithm).
98
+ epsilon (`float`, *optional*, defaults to `0.2`):
99
+ Epsilon value for clipping.
100
+ reward_weights (`list[float]` or `None`, *optional*, defaults to `None`):
101
+ Weights for each reward function. Must match the number of reward functions. If `None`, all rewards are
102
+ weighted equally with weight `1.0`.
103
+ sync_ref_model (`bool`, *optional*, defaults to `False`):
104
+ Whether to synchronize the reference model with the active model every `ref_model_sync_steps` steps, using
105
+ the `ref_model_mixup_alpha` parameter. This synchronization originates from the
106
+ [TR-DPO](https://huggingface.co/papers/2404.09656) paper.
107
+ ref_model_mixup_alpha (`float`, *optional*, defaults to `0.6`):
108
+ α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix
109
+ between the current policy and the previous reference policy during updates. The reference policy is
110
+ updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you
111
+ must set `sync_ref_model=True`.
112
+ ref_model_sync_steps (`int`, *optional*, defaults to `512`):
113
+ τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how
114
+ frequently the current policy is synchronized with the reference policy. To use this parameter, you must
115
+ set `sync_ref_model=True`.
116
+
117
+ > Parameters that control the logging
118
+ log_completions (`bool`, *optional*, defaults to `False`):
119
+ Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is
120
+ installed, it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`.
121
+ """
122
+
123
+ # Parameters that control the model and reference model
124
+ model_init_kwargs: Optional[dict] = field(
125
+ default=None,
126
+ metadata={
127
+ "help": "Keyword arguments for `transformers.AutoModelForCausalLM.from_pretrained`, used when the `model` "
128
+ "argument of the `GRPOTrainer` is provided as a string."
129
+ },
130
+ )
131
+
132
+ # Parameters that control the data preprocessing
133
+ # The default value remove_unused_columns is overwritten from the parent class, because in GRPO we usually rely on
134
+ # additional columns to compute the reward
135
+ remove_unused_columns: Optional[bool] = field(
136
+ default=False,
137
+ metadata={
138
+ "help": "Whether to only keep the column 'prompt' in the dataset. If you use a custom reward function "
139
+ "that requires any column other than 'prompts' and 'completions', you should keep this to `False`."
140
+ },
141
+ )
142
+ max_prompt_length: Optional[int] = field(
143
+ default=512,
144
+ metadata={
145
+ "help": "Maximum length of the prompt. If the prompt is longer than this value, it will be truncated left."
146
+ },
147
+ )
148
+ num_generations: Optional[int] = field(
149
+ default=8,
150
+ metadata={
151
+ "help": "Number of generations to sample. The global batch size (num_processes * per_device_batch_size) "
152
+ "must be divisible by this value."
153
+ },
154
+ )
155
+ temperature: Optional[float] = field(
156
+ default=0.9,
157
+ metadata={"help": "Temperature for sampling. The higher the temperature, the more random the completions."},
158
+ )
159
+ max_completion_length: Optional[int] = field(
160
+ default=256,
161
+ metadata={"help": "Maximum length of the generated completion."},
162
+ )
163
+ ds3_gather_for_generation: bool = field(
164
+ default=True,
165
+ metadata={
166
+ "help": "This setting applies to DeepSpeed ZeRO-3. If enabled, the policy model weights are gathered for "
167
+ "generation, improving generation speed. However, disabling this option allows training models that "
168
+ "exceed the VRAM capacity of a single GPU, albeit at the cost of slower generation. Disabling this option "
169
+ "is not compatible with vLLM generation."
170
+ },
171
+ )
172
+
173
+ # Parameters that control generation acceleration powered by vLLM
174
+ use_vllm: Optional[bool] = field(
175
+ default=False,
176
+ metadata={
177
+ "help": "Whether to use vLLM for generating completions. If set to `True`, ensure that a GPU is kept "
178
+ "unused for training, as vLLM will require one for generation. vLLM must be installed "
179
+ "(`pip install vllm`)."
180
+ },
181
+ )
182
+ vllm_device: Optional[str] = field(
183
+ default="auto",
184
+ metadata={
185
+ "help": "Device where vLLM generation will run, e.g. 'cuda:1'. If set to 'auto' (default), the system "
186
+ "will automatically select the next available GPU after the last one used for training. This assumes "
187
+ "that training has not already occupied all available GPUs."
188
+ },
189
+ )
190
+ vllm_gpu_memory_utilization: float = field(
191
+ default=0.9,
192
+ metadata={
193
+ "help": "Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV "
194
+ "cache on the device dedicated to generation powered by vLLM. Higher values will increase the KV cache "
195
+ "size and thus improve the model's throughput. However, if the value is too high, it may cause "
196
+ "out-of-memory (OOM) errors during initialization."
197
+ },
198
+ )
199
+ vllm_dtype: Optional[str] = field(
200
+ default="auto",
201
+ metadata={
202
+ "help": "Data type to use for vLLM generation. If set to 'auto', the data type will be automatically "
203
+ "determined based on the model configuration. Find the supported values in the vLLM documentation."
204
+ },
205
+ )
206
+ vllm_max_model_len: Optional[int] = field(
207
+ default=None,
208
+ metadata={
209
+ "help": "If set, the `max_model_len` to use for vLLM. This could be useful when running with reduced "
210
+ "`vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model "
211
+ "context size, which might be much larger than the KV cache, leading to inefficiencies."
212
+ },
213
+ )
214
+ vllm_enable_prefix_caching: Optional[bool] = field(
215
+ default=True,
216
+ metadata={
217
+ "help": "Whether to enable prefix caching in vLLM. If set to `True` (default), ensure that the model and "
218
+ "the hardware support this feature."
219
+ },
220
+ )
221
+ vllm_guided_decoding_regex: Optional[str] = field(
222
+ default=None,
223
+ metadata={"help": "Regex for vLLM guided decoding. If `None` (default), guided decoding is disabled."},
224
+ )
225
+
226
+ # Parameters that control the training
227
+ learning_rate: float = field(
228
+ default=1e-6,
229
+ metadata={
230
+ "help": "Initial learning rate for `AdamW` optimizer. The default value replaces that of "
231
+ "`transformers.TrainingArguments`."
232
+ },
233
+ )
234
+ beta: float = field(
235
+ default=0.004,
236
+ metadata={
237
+ "help": "KL coefficient. If `0.0`, the reference model is not loaded, reducing memory usage and improving "
238
+ "training speed."
239
+ },
240
+ )
241
+ num_iterations: int = field(
242
+ default=1,
243
+ metadata={"help": "Number of iterations per batch (denoted as μ in the algorithm)."},
244
+ )
245
+ epsilon: float = field(
246
+ default=0.2,
247
+ metadata={"help": "Epsilon value for clipping."},
248
+ )
249
+ reward_weights: Optional[list[float]] = field(
250
+ default=None,
251
+ metadata={
252
+ "help": "Weights for each reward function. Must match the number of reward functions. If `None`, all "
253
+ "rewards are weighted equally with weight `1.0`."
254
+ },
255
+ )
256
+ sync_ref_model: bool = field(
257
+ default=False,
258
+ metadata={
259
+ "help": "Whether to synchronize the reference model with the active model every `ref_model_sync_steps` "
260
+ "steps, using the `ref_model_mixup_alpha` parameter."
261
+ },
262
+ )
263
+ ref_model_mixup_alpha: float = field(
264
+ default=0.6,
265
+ metadata={
266
+ "help": "α parameter from the TR-DPO paper, which controls the mix between the current policy and the "
267
+ "previous reference policy during updates. The reference policy is updated according to the equation: "
268
+ "`π_ref = α * π_θ + (1 - α) * π_ref_prev`. To use this parameter, you must set `sync_ref_model=True`."
269
+ },
270
+ )
271
+ ref_model_sync_steps: int = field(
272
+ default=512,
273
+ metadata={
274
+ "help": "τ parameter from the TR-DPO paper, which determines how frequently the current policy is "
275
+ "synchronized with the reference policy. To use this parameter, you must set `sync_ref_model=True`."
276
+ },
277
+ )
278
+
279
+ # Parameters that control the logging
280
+ log_completions: bool = field(
281
+ default=False,
282
+ metadata={
283
+ "help": "Whether to log a sample of (prompt, completion) pairs every `logging_steps` steps. If `rich` is "
284
+ "installed, it prints the sample. If `wandb` logging is enabled, it logs it to `wandb`."
285
+ },
286
+ )
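The `num_generations` default interacts with the batch-size check in the trainer; a worked example (process counts are illustrative):

```python
num_processes, per_device_train_batch_size = 8, 4
global_batch_size = num_processes * per_device_train_batch_size  # 32
valid = [g for g in range(2, global_batch_size + 1) if global_batch_size % g == 0]
assert valid == [2, 4, 8, 16, 32]  # num_generations must be one of these
```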
open-r1-multimodal/src/open_r1/trainer/grpo_trainer.py ADDED
@@ -0,0 +1,849 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import textwrap
17
+ from collections import defaultdict
18
+ from typing import Any, Callable, Optional, Union, Sized
19
+
20
+ import torch
21
+ import torch.utils.data
22
+ import transformers
23
+ from datasets import Dataset, IterableDataset
24
+ from packaging import version
25
+ from transformers import (
26
+ AriaForConditionalGeneration,
27
+ AriaProcessor,
28
+ AutoModelForCausalLM,
29
+ AutoModelForSequenceClassification,
30
+ AutoProcessor,
31
+ AutoTokenizer,
32
+ GenerationConfig,
33
+ PreTrainedModel,
34
+ PreTrainedTokenizerBase,
35
+ Qwen2VLForConditionalGeneration,
36
+ Qwen2_5_VLForConditionalGeneration,
37
+ Trainer,
38
+ TrainerCallback,
39
+ is_wandb_available,
40
+ )
41
+ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
42
+ from transformers.utils import is_peft_available
43
+
44
+ from trl.data_utils import apply_chat_template, is_conversational, maybe_apply_chat_template
45
+ from trl.models import create_reference_model, prepare_deepspeed, unwrap_model_for_generation
46
+ from trl.trainer.grpo_config import GRPOConfig
47
+ from trl.trainer.utils import generate_model_card, get_comet_experiment_url
48
+ from trl import GRPOTrainer
49
+
50
+ from accelerate.utils import is_peft_model, set_seed
51
+ import PIL.Image
52
+
53
+ import copy
54
+ from torch.utils.data import Sampler
55
+ import warnings
56
+
57
+ if is_peft_available():
58
+ from peft import PeftConfig, get_peft_model
59
+
60
+ if is_wandb_available():
61
+ import wandb
62
+
63
+ from open_r1.vlm_modules.vlm_module import VLMBaseModule
64
+ # What we call a reward function is a callable that takes a list of prompts and completions and returns a list of
65
+ # rewards. When it's a string, it's a model ID, so it's loaded as a pretrained model.
66
+ RewardFunc = Union[str, PreTrainedModel, Callable[[list, list], list[float]]]
67
+
68
+
69
+ class RepeatRandomSampler(Sampler):
70
+ """
71
+ Sampler that repeats the indices of a dataset in a structured manner.
72
+
73
+ Args:
74
+ data_source (`Sized`):
75
+ Dataset to sample from.
76
+ mini_repeat_count (`int`):
77
+ Number of times to repeat each index per batch.
78
+ batch_size (`int`, *optional*, defaults to `1`):
79
+ Number of unique indices per batch.
80
+ repeat_count (`int`, *optional*, defaults to `1`):
81
+ Number of times to repeat the full sampling process.
82
+ seed (`int` or `None`, *optional*, defaults to `None`):
83
+ Random seed for reproducibility.
84
+ """
85
+
86
+ def __init__(
87
+ self,
88
+ data_source: Sized,
89
+ mini_repeat_count: int,
90
+ batch_size: int = 1,
91
+ repeat_count: int = 1,
92
+ seed: Optional[int] = None,
93
+ ):
94
+ self.data_source = data_source
95
+ self.mini_repeat_count = mini_repeat_count
96
+ self.batch_size = batch_size
97
+ self.repeat_count = repeat_count
98
+ self.num_samples = len(data_source)
99
+ self.seed = seed
100
+ self.generator = torch.Generator()
101
+ if seed is not None:
102
+ self.generator.manual_seed(seed)
103
+
104
+ def __iter__(self):
105
+ indexes = torch.randperm(self.num_samples, generator=self.generator).tolist()
106
+ indexes = [indexes[i : i + self.batch_size] for i in range(0, len(indexes), self.batch_size)]
107
+ indexes = [chunk for chunk in indexes if len(chunk) == self.batch_size]
108
+
109
+ for chunk in indexes:
110
+ for _ in range(self.repeat_count):
111
+ for index in chunk:
112
+ for _ in range(self.mini_repeat_count):
113
+ yield index
114
+
115
+ def __len__(self) -> int:
116
+ return self.num_samples * self.mini_repeat_count * self.repeat_count
117
+
118
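A sketch of the index stream the sampler yields (the seed and data are arbitrary; the shuffled order varies):

```python
sampler = RepeatRandomSampler(["a", "b", "c", "d"], mini_repeat_count=2,
                              batch_size=2, repeat_count=2, seed=42)
out = list(sampler)
assert len(out) == 4 * 2 * 2  # agrees with __len__
# Shape of the stream for one shuffled order [i, j, k, l]:
# [i, i, j, j,  i, i, j, j,  k, k, l, l,  k, k, l, l]
```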
+
119
+ class VLMGRPOTrainer(Trainer):
120
+ """
121
+ Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the
122
+ paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
123
+
124
+ Example:
125
+
126
+ ```python
127
+ from datasets import load_dataset
128
+ from trl import GRPOTrainer
129
+
130
+ dataset = load_dataset("trl-lib/tldr", split="train")
131
+
132
+ trainer = GRPOTrainer(
133
+ model="Qwen/Qwen2-0.5B-Instruct",
134
+ reward_funcs="weqweasdas/RM-Gemma-2B",
135
+ train_dataset=dataset,
136
+ )
137
+
138
+ trainer.train()
139
+ ```
140
+
141
+ Args:
142
+ model (`Union[str, PreTrainedModel]`):
143
+ Model to be trained. Can be either:
144
+
145
+ - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or
146
+ a path to a *directory* containing model weights saved using
147
+ [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is
148
+ loaded using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments
149
+ in `args.model_init_kwargs`.
150
+ - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
151
+ reward_funcs (`Union[RewardFunc, list[RewardFunc]]`):
152
+ Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
153
+ functions with the prompts and completions and sum the rewards. Can be either:
154
+
155
+ - A single reward function, such as:
156
+ - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
157
+ path to a *directory* containing model weights saved using
158
+ [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
159
+ using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the
160
+ keyword arguments in `args.model_init_kwargs`.
161
+ - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
162
+ - A custom reward function: The function is provided with the prompts and the generated completions,
163
+ plus any additional columns in the dataset. It should return a list of rewards. For more details, see
164
+ [Using a custom reward function](#using-a-custom-reward-function).
165
+ - A list of reward functions, where each item can independently be any of the above types. Mixing different
166
+ types within the list (e.g., a string model ID and a custom reward function) is allowed.
167
+ args ([`GRPOConfig`], *optional*, defaults to `None`):
168
+ Configuration for this trainer. If `None`, a default configuration is used.
169
+ train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
170
+ Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset are
171
+ ignored. The format of the samples can be either:
172
+
173
+ - [Standard](dataset_formats#standard): Each sample contains plain text.
174
+ - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
175
+ and content).
176
+ eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
177
+ Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
178
+ processing_class ([`~transformers.PreTrainedTokenizerBase`], *optional*, defaults to `None`):
179
+ Processing class used to process the data. The padding side must be set to "left". If `None`, the
180
+ processing class is loaded from the model's name with [`~transformers.AutoTokenizer.from_pretrained`].
181
+ reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
182
+ Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:
183
+
184
+ - A single processing class: Used when `reward_funcs` contains only one reward function.
185
+ - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
186
+ If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is
187
+ `None`, the tokenizer for the model is automatically loaded using [`~transformers.AutoTokenizer.from_pretrained`].
188
+ For elements in `reward_funcs` that are custom reward functions (not [`~transformers.PreTrainedModel`]),
189
+ the corresponding entries in `reward_processing_classes` are ignored.
190
+ callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`):
191
+ List of callbacks to customize the training loop. Will add those to the list of default callbacks
192
+ detailed in [here](https://huggingface.co/docs/transformers/main_classes/callback).
193
+
194
+ If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
195
+ method.
196
+ optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
197
+ A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
198
+ model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
199
+ peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
200
+ PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
201
+ """
202
+
203
+ def __init__(
204
+ self,
205
+ model: Union[str, PreTrainedModel],
206
+ reward_funcs: Union[RewardFunc, list[RewardFunc]],
207
+ args: GRPOConfig = None,
208
+ vlm_module: VLMBaseModule = None,
209
+ train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
210
+ eval_dataset: Optional[Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]] = None,
211
+ processing_class: Optional[PreTrainedTokenizerBase] = None,
212
+ reward_processing_classes: Optional[Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]] = None,
213
+ callbacks: Optional[list[TrainerCallback]] = None,
214
+ optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None),
215
+ peft_config: Optional["PeftConfig"] = None,
216
+ freeze_vision_modules: Optional[bool] = True,
217
+ attn_implementation: str = "flash_attention_2",
218
+ torch_dtype: str = "bfloat16",
219
+ **kwargs,
220
+ ):
221
+ # Args
222
+ if args is None:
223
+ model_name = model if isinstance(model, str) else model.config._name_or_path
224
+ model_name = model_name.split("/")[-1]
225
+ args = GRPOConfig(f"{model_name}-GRPO")
226
+
227
+ self.vlm_module = vlm_module
228
+
229
+ # Models
230
+ # Trained model
231
+ model_init_kwargs = args.model_init_kwargs or {}
232
+ # FIXME
233
+ # Remember to modify this in the InternVL module as well
234
+ model_init_kwargs["attn_implementation"] = attn_implementation
235
+ if model_init_kwargs.get("torch_dtype") is None:
236
+ model_init_kwargs["torch_dtype"] = torch_dtype
237
+
238
+ assert isinstance(model, str), "model must be a string in the current implementation"
239
+ model_id = model
240
+ torch_dtype = model_init_kwargs.get("torch_dtype")
241
+ if isinstance(torch_dtype, torch.dtype) or torch_dtype == "auto" or torch_dtype is None:
242
+ pass # torch_dtype is already a torch.dtype or "auto" or None
243
+ elif isinstance(torch_dtype, str): # it's a str, but not "auto"
244
+ torch_dtype = getattr(torch, torch_dtype)
245
+ else:
246
+ raise ValueError(
247
+ "Invalid `torch_dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing "
248
+ f"a `torch.dtype` (e.g., 'float32'), but got {torch_dtype}."
249
+ )
250
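The string-to-dtype normalization above relies on `getattr` over the `torch` module; a one-line illustration:

```python
assert getattr(torch, "bfloat16") is torch.bfloat16
```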
+ # Disable caching if gradient checkpointing is enabled (not supported)
254
+ model_init_kwargs["use_cache"] = (
255
+ False if args.gradient_checkpointing else model_init_kwargs.get("use_cache")
256
+ )
257
+ model_cls = self.vlm_module.get_model_class(model_id, model_init_kwargs)
258
+ model = model_cls.from_pretrained(model_id, **model_init_kwargs)
259
+
260
+ # LoRA
261
+ self.vision_modules_keywords = self.vlm_module.get_vision_modules_keywords()
262
+ if peft_config is not None:
263
+ def find_all_linear_names(model, multimodal_keywords):
264
+ cls = torch.nn.Linear
265
+ lora_module_names = set()
266
+ for name, module in model.named_modules():
267
+ # LoRA is not applied to the vision modules
268
+ if any(mm_keyword in name for mm_keyword in multimodal_keywords):
269
+ continue
270
+ if isinstance(module, cls):
271
+ lora_module_names.add(name)
272
+ for m in lora_module_names: # needed for 16-bit
273
+ if "embed_tokens" in m:
274
+ lora_module_names.remove(m)
275
+ return list(lora_module_names)
276
+ target_modules = find_all_linear_names(model, self.vision_modules_keywords)
277
+ peft_config.target_modules = target_modules
278
+ model = get_peft_model(model, peft_config)
279
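Illustrative effect of the keyword filter inside `find_all_linear_names` (module names are made up; the real function additionally keeps only `nn.Linear` layers and drops `embed_tokens`):

```python
multimodal_keywords = ["visual"]
names = ["visual.blocks.0.attn.qkv", "model.layers.0.mlp.up_proj", "lm_head"]
targets = [n for n in names if not any(k in n for k in multimodal_keywords)]
assert targets == ["model.layers.0.mlp.up_proj", "lm_head"]
```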
+
280
+ # Freeze vision modules
281
+ if freeze_vision_modules:
282
+ print("Freezing vision modules...")
283
+ for n, p in model.named_parameters():
284
+ if any(keyword in n for keyword in self.vision_modules_keywords):
285
+ p.requires_grad = False
286
+
287
+ # Enable gradient checkpointing if requested
288
+ if args.gradient_checkpointing:
289
+ model = self._enable_gradient_checkpointing(model, args)
290
+
291
+ # Reference model
292
+ if is_deepspeed_zero3_enabled():
293
+ self.ref_model = model_cls.from_pretrained(model_id, **model_init_kwargs)
294
+ elif peft_config is None:
295
+ # If PEFT configuration is not provided, create a reference model based on the initial model.
296
+ self.ref_model = create_reference_model(model)
297
+ else:
298
+ # If PEFT is used, the reference model is not needed since the adapter can be disabled
299
+ # to revert to the initial model.
300
+ self.ref_model = None
301
+
302
+ # Processing class
303
+ if processing_class is None:
304
+ processing_cls = self.vlm_module.get_processing_class()
305
+ processing_class = processing_cls.from_pretrained(model_id, trust_remote_code=model_init_kwargs.get("trust_remote_code", None))
306
+ for processing_keyword in self.vlm_module.get_custom_processing_keywords():
307
+ if processing_keyword in kwargs:
308
+ setattr(processing_class, processing_keyword, kwargs[processing_keyword])
309
+ if getattr(processing_class, "tokenizer", None) is not None:
310
+ pad_token_id = processing_class.tokenizer.pad_token_id
311
+ processing_class.pad_token_id = pad_token_id
312
+ processing_class.eos_token_id = processing_class.tokenizer.eos_token_id
313
+ else:
314
+ assert isinstance(processing_class, PreTrainedTokenizerBase), "processing_class must be an instance of PreTrainedTokenizerBase if it has no tokenizer attribute"
315
+ pad_token_id = processing_class.pad_token_id
316
+
317
+ self.vlm_module.post_model_init(model, processing_class)
318
+ self.vlm_module.post_model_init(self.ref_model, processing_class)
319
+
320
+ # Reward functions
321
+ if not isinstance(reward_funcs, list):
322
+ reward_funcs = [reward_funcs]
323
+ for i, reward_func in enumerate(reward_funcs):
324
+ if isinstance(reward_func, str):
325
+ reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained(
326
+ reward_func, num_labels=1, **model_init_kwargs
327
+ )
328
+ self.reward_funcs = reward_funcs
329
+
330
+ # Reward processing class
331
+ if reward_processing_classes is None:
332
+ reward_processing_classes = [None] * len(reward_funcs)
333
+ elif not isinstance(reward_processing_classes, list):
334
+ reward_processing_classes = [reward_processing_classes]
335
+ else:
336
+ if len(reward_processing_classes) != len(reward_funcs):
337
+ raise ValueError("The number of reward processing classes must match the number of reward functions.")
338
+
339
+ for i, (reward_processing_class, reward_func) in enumerate(zip(reward_processing_classes, reward_funcs)):
340
+ if isinstance(reward_func, PreTrainedModel):
341
+ if reward_processing_class is None:
342
+ reward_processing_class = AutoTokenizer.from_pretrained(reward_func.config._name_or_path)
343
+ if reward_processing_class.pad_token_id is None:
344
+ reward_processing_class.pad_token = reward_processing_class.eos_token
345
+ # The reward model computes the reward for the latest non-padded token in the input sequence.
346
+ # So it's important to set the pad token ID to the padding token ID of the processing class.
347
+ reward_func.config.pad_token_id = reward_processing_class.pad_token_id
348
+ reward_processing_classes[i] = reward_processing_class
349
+ self.reward_processing_classes = reward_processing_classes
350
+
351
+ # Data collator
352
+ def data_collator(features): # No data collation is needed in GRPO
353
+ return features
354
+
355
+ # Training arguments
356
+ self.max_prompt_length = args.max_prompt_length
357
+ self.max_prompt_length = None
358
+ if args.max_prompt_length is not None:
359
+ warnings.warn("Setting max_prompt_length is currently not supported, it has been set to None")
360
+
361
+ self.max_completion_length = args.max_completion_length # = |o_i| in the GRPO paper
362
+ self.num_generations = args.num_generations # = G in the GRPO paper
363
+ self.generation_config = GenerationConfig(
364
+ max_new_tokens=self.max_completion_length,
365
+ do_sample=True,
366
+ temperature=1,
367
+ pad_token_id=pad_token_id,
368
+ )
369
+ if hasattr(self.vlm_module, "get_eos_token_id"): # For InternVL
370
+ self.generation_config.eos_token_id = self.vlm_module.get_eos_token_id(processing_class)
371
+ print(222, self.vlm_module.get_eos_token_id(processing_class))
372
+ self.beta = args.beta
373
+ self.epsilon = args.epsilon
374
+
375
+ # Multi-step
376
+ self.num_iterations = args.num_iterations # = 𝜇 in the GRPO paper
377
+ # Tracks the number of iterations (forward + backward passes), including those within a gradient accumulation cycle
378
+ self._step = 0
379
+ # Buffer the batch to reuse generated outputs across multiple updates
380
+ self._buffered_inputs = [None] * args.gradient_accumulation_steps
381
+
382
+ # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
383
+ # input tensor associated with the key "input_ids". However, in GRPO, the sampled data does not include the
384
+ # "input_ids" key. Instead, the only available key is "prompt". As a result, the trainer issues the warning:
385
+ # "Could not estimate the number of tokens of the input, floating-point operations will not be computed." To
386
+ # suppress this warning, we set the "estimate_tokens" key in the model's "warnings_issued" dictionary to True.
387
+ # This acts as a flag to indicate that the warning has already been issued.
388
+ model.warnings_issued["estimate_tokens"] = True
389
+
390
+ # Initialize the metrics
391
+ self._metrics = defaultdict(list)
392
+
393
+ super().__init__(
394
+ model=model,
395
+ args=args,
396
+ data_collator=data_collator,
397
+ train_dataset=train_dataset,
398
+ eval_dataset=eval_dataset,
399
+ processing_class=processing_class,
400
+ callbacks=callbacks,
401
+ optimizers=optimizers,
402
+ )
403
+
404
+ # Check if the per_device_train/eval_batch_size * num processes can be divided by the number of generations
405
+ num_processes = self.accelerator.num_processes
406
+ global_batch_size = args.per_device_train_batch_size * num_processes
407
+ possible_values = [n_gen for n_gen in range(2, global_batch_size + 1) if (global_batch_size) % n_gen == 0]
408
+ if self.num_generations not in possible_values:
409
+ raise ValueError(
410
+ f"The global train batch size ({num_processes} x {args.per_device_train_batch_size}) must be evenly "
411
+ f"divisible by the number of generations per prompt ({self.num_generations}). Given the current train "
412
+ f"batch size, the valid values for the number of generations are: {possible_values}."
413
+ )
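+ # A worked example of this constraint (hypothetical numbers): with 2 processes and
+ # per_device_train_batch_size=4 the global batch is 8, so num_generations must be in {2, 4, 8}.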
414
+ if self.args.eval_strategy != "no":
415
+ global_batch_size = args.per_device_eval_batch_size * num_processes
416
+ possible_values = [n_gen for n_gen in range(2, global_batch_size + 1) if (global_batch_size) % n_gen == 0]
417
+ if self.num_generations not in possible_values:
418
+ raise ValueError(
419
+ f"The global eval batch size ({num_processes} x {args.per_device_eval_batch_size}) must be evenly "
420
+ f"divisible by the number of generations per prompt ({self.num_generations}). Given the current "
421
+ f"eval batch size, the valid values for the number of generations are: {possible_values}."
422
+ )
423
+
424
+ # Ensure each process receives a unique seed to prevent duplicate completions when generating with
425
+ # transformers if num_generations exceeds per_device_train_batch_size. We could skip it if we use vLLM, but
426
+ # it's safer to set it in all cases.
427
+ set_seed(args.seed, device_specific=True)
428
+
429
+ # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
430
+ # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
431
+ # self.model_accepts_loss_kwargs to False to enable scaling.
432
+ self.model_accepts_loss_kwargs = False
433
+
434
+ if self.ref_model is not None:
435
+ if self.is_deepspeed_enabled:
436
+ self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
437
+ else:
438
+ self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
439
+
440
+ for i, reward_func in enumerate(self.reward_funcs):
441
+ if isinstance(reward_func, PreTrainedModel):
442
+ self.reward_funcs[i] = self.accelerator.prepare_model(reward_func, evaluation_mode=True)
443
+
444
+ def _enable_gradient_checkpointing(self, model: PreTrainedModel, args: GRPOConfig) -> PreTrainedModel:
445
+ """Enables gradient checkpointing for the model."""
446
+ # Ensure use_cache is disabled
447
+ model.config.use_cache = False
448
+
449
+ # Enable gradient checkpointing on the base model for PEFT
450
+ if is_peft_model(model):
451
+ model.base_model.gradient_checkpointing_enable()
452
+ # Enable gradient checkpointing for non-PEFT models
453
+ else:
454
+ try:
455
+ model.gradient_checkpointing_enable()
456
+ except Exception:
457
+ # For InternVL; these operations are copied from the original training script of InternVL
458
+ model.language_model.config.use_cache = False
459
+ model.vision_model.gradient_checkpointing = True
460
+ model.vision_model.encoder.gradient_checkpointing = True
461
+ model.language_model._set_gradient_checkpointing()
462
+ # This line is necessary; otherwise `model.gradient_checkpointing_enable()` would be called again during training, which fails because InternVL does not support that operation.
463
+ args.gradient_checkpointing = False
464
+
465
+ gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs or {}
466
+ use_reentrant = (
467
+ "use_reentrant" not in gradient_checkpointing_kwargs or gradient_checkpointing_kwargs["use_reentrant"]
468
+ )
469
+
470
+ if use_reentrant:
471
+ model.enable_input_require_grads()
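+ # With reentrant checkpointing (the PyTorch default when "use_reentrant" is unset), at least one
+ # input must require grad for backward to reach the checkpointed blocks, hence the call above.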
472
+
473
+ return model
474
+
475
+ def _set_signature_columns_if_needed(self):
476
+ # If `self.args.remove_unused_columns` is True, non-signature columns are removed.
477
+ # By default, this method sets `self._signature_columns` to the model's expected inputs.
478
+ # In GRPOTrainer, we preprocess data, so using the model's signature columns doesn't work.
479
+ # Instead, we set them to the columns expected by the `training_step` method, hence the override.
480
+ if self._signature_columns is None:
481
+ self._signature_columns = ["prompt"]
482
+
483
+
484
+ # Get the per-token log probabilities for the completions for the model and the reference model
485
+ def _get_per_token_logps(self, model, input_ids, attention_mask, **custom_multimodal_inputs):
486
+ logits = model(input_ids=input_ids, attention_mask=attention_mask, **custom_multimodal_inputs).logits # (B, L, V)
487
+ logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred
488
+ input_ids = input_ids[:, 1:] # (B, L-1), exclude the first input ID since we don't have logits for it
489
+ # Compute the log probabilities for the input tokens. Use a loop to reduce memory peak.
490
+ per_token_logps = []
491
+ for logits_row, input_ids_row in zip(logits, input_ids):
492
+ log_probs = logits_row.log_softmax(dim=-1)
493
+ token_log_prob = torch.gather(log_probs, dim=1, index=input_ids_row.unsqueeze(1)).squeeze(1)
494
+ per_token_logps.append(token_log_prob)
495
+ return torch.stack(per_token_logps)
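+ # Reference note: the row-by-row loop above limits the peak memory of the log-softmax buffer.
+ # A vectorized equivalent (shown only as an illustrative sketch, not used here) would be:
+ #
+ #     per_token_logps = torch.gather(
+ #         logits.log_softmax(dim=-1), dim=2, index=input_ids.unsqueeze(-1)
+ #     ).squeeze(-1)  # (B, L-1): same result, but with a full (B, L-1, V) intermediate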
496
+
497
+
498
+ def _prepare_inputs(self, inputs):
499
+ # Simple pass-through, just like original
500
+ return inputs
501
+
502
+ def _get_key_from_inputs(self, x, key):
503
+ ele = x.get(key, None)
504
+ assert ele is not None, f"The key {key} is not found in the input"
505
+ if isinstance(ele, list):
506
+ return [e for e in ele]
507
+ else:
508
+ return [ele]
509
+
510
+ def _generate_and_score_completions(self, inputs: dict[str, Union[torch.Tensor, Any]], model) -> dict[str, Union[torch.Tensor, Any]]:
511
+ device = self.accelerator.device
512
+ prompts = [x["prompt"] for x in inputs]
513
+ prompts_text = self.vlm_module.prepare_prompt(self.processing_class, inputs)
514
+ # Handle both pre-loaded images and image paths
515
+ images = []
516
+ for x in inputs:
517
+ if "image" in x:
518
+ imgs = self._get_key_from_inputs(x, "image")
519
+ elif "image_path" in x and x["image_path"] is not None:
520
+ imgs = [PIL.Image.open(p) for p in self._get_key_from_inputs(x, "image_path")]
+ else:
+ raise ValueError("Each sample must provide either an 'image' or a non-empty 'image_path' entry")
521
+
522
+ for img in imgs:
523
+ try:
524
+ # Ensure minimum dimensions of 28 pixels
525
+ w, h = img.size
526
+ if w < 28 or h < 28:
527
+ # Calculate new dimensions maintaining aspect ratio
528
+ if w < h:
529
+ new_w = 28
530
+ new_h = int(h * (28/w))
531
+ else:
532
+ new_h = 28
533
+ new_w = int(w * (28/h))
534
+ img = img.resize((new_w, new_h), PIL.Image.Resampling.LANCZOS)
535
+ except Exception:
536
+ pass  # If resizing fails, keep the original image unchanged
537
+ images.append(img)
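+ # Illustration of the rescale above (hypothetical size): a 20x80 image becomes 28x112,
+ # preserving the aspect ratio. The 28-px floor is assumed to match Qwen2-VL's smallest
+ # processable side (14-px patches with 2x2 patch merging).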
538
+
539
+
540
+ prompt_inputs = self.vlm_module.prepare_model_inputs(
541
+ self.processing_class,
542
+ prompts_text,
543
+ images,
544
+ return_tensors="pt",
545
+ padding=True,
546
+ padding_side="left",
547
+ add_special_tokens=False,
548
+ )
549
+ prompt_inputs = super()._prepare_inputs(prompt_inputs)
550
+
551
+ prompt_ids, prompt_mask = prompt_inputs["input_ids"], prompt_inputs["attention_mask"]
552
+
553
+
554
+ # max_prompt_length is not supported yet
555
+ # if self.max_prompt_length is not None:
556
+ # prompt_ids = prompt_ids[:, -self.max_prompt_length :]
557
+ # prompt_inputs["input_ids"] = prompt_ids
558
+ # prompt_mask = prompt_mask[:, -self.max_prompt_length :]
559
+ # prompt_inputs["attention_mask"] = prompt_mask
560
+
561
+ # Generate completions
562
+ with unwrap_model_for_generation(model, self.accelerator) as unwrapped_model:
563
+ generate_returned_result = unwrapped_model.generate(
564
+ **{k: v for k, v in prompt_inputs.items() if k not in self.vlm_module.get_non_generate_params()},
565
+ generation_config=self.generation_config
566
+ )
567
+ prompt_length = prompt_ids.size(1)
568
+ if not self.vlm_module.is_embeds_input():
569
+ prompt_completion_ids = generate_returned_result
570
+ prompt_ids = prompt_completion_ids[:, :prompt_length]
571
+ completion_ids = prompt_completion_ids[:, prompt_length:]
572
+ else:
573
+ # In this case, the input of the LLM backbone is the embedding of the combination of the image and text prompt
574
+ # So the returned result of the `generate` method only contains the completion ids
575
+ completion_ids = generate_returned_result
576
+ prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
577
+
578
+ # Mask everything after the first EOS token
579
+ is_eos = completion_ids == self.processing_class.eos_token_id
580
+ eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device)
581
+ eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
582
+ sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1)
583
+ completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
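+ # Toy illustration of this masking (hypothetical ids, eos_token_id == 2):
+ #   completion_ids == [[5, 2, 7], [9, 9, 9]] -> eos_idx == [1, 3]
+ #   completion_mask == [[1, 1, 0], [1, 1, 1]]
+ # i.e. the EOS token itself stays unmasked, and rows without an EOS keep every token.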
584
+
585
+ # Concatenate prompt_mask with completion_mask for logit computation
586
+ attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) # (B, P+C)
587
+
588
+ # Get the multimodal inputs
589
+ multimodal_keywords = self.vlm_module.get_custom_multimodal_keywords()
590
+ multimodal_inputs = {k: prompt_inputs[k] if k in prompt_inputs else None for k in multimodal_keywords}
591
+ with torch.no_grad():
592
+ # When using num_iterations == 1, old_per_token_logps == per_token_logps, so we can skip its
593
+ # computation here, and use per_token_logps.detach() instead.
594
+ if self.num_iterations > 1:
595
+ old_per_token_logps = self._get_per_token_logps(
596
+ model, prompt_completion_ids, attention_mask, **multimodal_inputs
597
+ )
598
+ old_per_token_logps = old_per_token_logps[:, prompt_length - 1:]
599
+ else:
600
+ old_per_token_logps = None
601
+
602
+ if self.beta == 0.0:
603
+ ref_per_token_logps = None
604
+ elif self.ref_model is not None:
605
+ ref_per_token_logps = self._get_per_token_logps(
606
+ self.ref_model, prompt_completion_ids, attention_mask, **multimodal_inputs
607
+ )
608
+ else:
609
+ with self.accelerator.unwrap_model(model).disable_adapter():
610
+ ref_per_token_logps = self._get_per_token_logps(
611
+ model, prompt_completion_ids, attention_mask, **multimodal_inputs
612
+ )
613
+ ref_per_token_logps = ref_per_token_logps[:, prompt_length - 1:]
614
+
615
+ # Decode the generated completions
616
+ completions = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True)
617
+ if is_conversational(inputs[0]):
618
+ completions = [[{"role": "assistant", "content": completion}] for completion in completions]
619
+
620
+ # Compute the rewards
621
+ # No need to duplicate prompts as we're not generating multiple completions per prompt
622
+
623
+ rewards_per_func = torch.zeros(len(prompts), len(self.reward_funcs), device=device)
624
+ for i, (reward_func, reward_processing_class) in enumerate(
625
+ zip(self.reward_funcs, self.reward_processing_classes)
626
+ ):
627
+ if isinstance(reward_func, PreTrainedModel):
628
+ if is_conversational(inputs[0]):
629
+ messages = [{"messages": p + c} for p, c in zip(prompts, completions)]
630
+ texts = [apply_chat_template(x, reward_processing_class)["text"] for x in messages]
631
+ else:
632
+ texts = [p + c for p, c in zip(prompts, completions)]
633
+ reward_inputs = reward_processing_class(
634
+ texts, return_tensors="pt", padding=True, padding_side="right", add_special_tokens=False
635
+ )
636
+ reward_inputs = super()._prepare_inputs(reward_inputs)
637
+ with torch.inference_mode():
638
+ rewards_per_func[:, i] = reward_func(**reward_inputs).logits[:, 0] # Shape (B*G,)
639
+ else:
640
+ # Repeat all input columns (except "prompt" and "completion") to match the number of generations
641
+ reward_kwargs = {key: [] for key in inputs[0].keys() if key not in ["prompt", "completion"]}
642
+ for key in reward_kwargs:
643
+ for example in inputs:
644
+ # No need to duplicate prompts as we're not generating multiple completions per prompt
645
+ # reward_kwargs[key].extend([example[key]] * self.num_generations)
646
+ reward_kwargs[key].extend([example[key]])
647
+ output_reward_func = reward_func(prompts=prompts, completions=completions, **reward_kwargs)
648
+ rewards_per_func[:, i] = torch.tensor(output_reward_func, dtype=torch.float32, device=device)
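+ # A minimal sketch of a compatible custom reward function (hypothetical; extra dataset
+ # columns such as "solution" arrive via **kwargs, and each completion is a one-message
+ # conversation in the conversational case):
+ #
+ #     def format_reward(prompts, completions, **kwargs):
+ #         return [1.0 if "<answer>" in c[0]["content"] else 0.0 for c in completions]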
649
+
650
+ # Gather rewards across processes
651
+ rewards_per_func = self.accelerator.gather(rewards_per_func)
652
+
653
+ # Sum the rewards from all reward functions
654
+ rewards = rewards_per_func.sum(dim=1)
655
+
656
+ # Compute grouped-wise rewards
657
+ # Each group consists of num_generations completions for the same prompt
658
+ mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
659
+ std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1)
660
+
661
+ # Normalize the rewards to compute the advantages
662
+ mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
663
+ std_grouped_rewards = std_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
664
+ advantages = (rewards - mean_grouped_rewards) / (std_grouped_rewards + 1e-4)
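+ # Worked example (hypothetical rewards): with num_generations == 2 and rewards == [1.0, 0.0, 0.5, 0.5],
+ # the group means are [0.5, 0.5] and (sample) stds are about [0.707, 0.0], giving advantages of roughly
+ # [0.707, -0.707, 0.0, 0.0]; the 1e-4 keeps the division finite when a group's rewards are identical.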
665
+
666
+ # Get only the local slice of advantages
667
+ process_slice = slice(
668
+ self.accelerator.process_index * len(prompts),
669
+ (self.accelerator.process_index + 1) * len(prompts),
670
+ )
671
+ advantages = advantages[process_slice]
672
+
673
+ # Log the metrics
674
+ completion_length = self.accelerator.gather_for_metrics(completion_mask.sum(1)).float().mean().item()
675
+ self._metrics["completion_length"].append(completion_length)
676
+
677
+ reward_per_func = self.accelerator.gather_for_metrics(rewards_per_func).mean(0)
678
+ for i, reward_func in enumerate(self.reward_funcs):
679
+ if isinstance(reward_func, PreTrainedModel):
680
+ reward_func_name = reward_func.config._name_or_path.split("/")[-1]
681
+ else:
682
+ reward_func_name = reward_func.__name__
683
+ self._metrics[f"rewards/{reward_func_name}"].append(reward_per_func[i].item())
684
+
685
+ self._metrics["reward"].append(self.accelerator.gather_for_metrics(rewards).mean().item())
686
+
687
+ self._metrics["reward_std"].append(self.accelerator.gather_for_metrics(std_grouped_rewards).mean().item())
688
+
689
+ return {
690
+ "prompt_ids": prompt_ids,
691
+ "prompt_mask": prompt_mask,
692
+ "completion_ids": completion_ids,
693
+ "completion_mask": completion_mask,
694
+ "old_per_token_logps": old_per_token_logps,
695
+ "ref_per_token_logps": ref_per_token_logps,
696
+ "advantages": advantages,
697
+ "multimodal_inputs": multimodal_inputs
698
+ }
699
+
700
+ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
701
+ if return_outputs:
702
+ raise ValueError("The GRPOTrainer does not support returning outputs")
703
+
704
+ # Check if we need to generate new completions or use buffered ones
705
+ if self.state.global_step % self.num_iterations == 0:
706
+ inputs = self._generate_and_score_completions(inputs, model)
707
+ self._buffered_inputs[self._step % self.args.gradient_accumulation_steps] = inputs
708
+ else:
709
+ inputs = self._buffered_inputs[self._step % self.args.gradient_accumulation_steps]
710
+ self._step += 1
711
+
712
+ # Get the prepared inputs
713
+ prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
714
+ completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"]
715
+ multimodal_inputs = inputs["multimodal_inputs"]
716
+
717
+ # Concatenate for full sequence
718
+ input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
719
+ attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
720
+
721
+ # Get the current policy's log probabilities
722
+ per_token_logps = self._get_per_token_logps(model, input_ids, attention_mask, **multimodal_inputs)
723
+ # Get rid of the prompt (-1 because of the shift done in get_per_token_logps)
724
+ per_token_logps = per_token_logps[:, prompt_ids.size(1) - 1:]
725
+
726
+ # Get the advantages from inputs
727
+ advantages = inputs["advantages"]
728
+
729
+ # When using num_iterations == 1, old_per_token_logps == per_token_logps, so we can skip its computation
730
+ # and use per_token_logps.detach() instead
731
+ old_per_token_logps = inputs["old_per_token_logps"] if self.num_iterations > 1 else per_token_logps.detach()
732
+
733
+ # Compute the policy ratio and clipped version
734
+ coef_1 = torch.exp(per_token_logps - old_per_token_logps)
735
+ coef_2 = torch.clamp(coef_1, 1 - self.epsilon, 1 + self.epsilon)
736
+ per_token_loss1 = coef_1 * advantages.unsqueeze(1)
737
+ per_token_loss2 = coef_2 * advantages.unsqueeze(1)
738
+ per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
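+ # This is the PPO-style clipped surrogate used by GRPO:
+ #   loss_t = -min(r_t * A, clip(r_t, 1 - eps, 1 + eps) * A), with r_t = exp(logp_new - logp_old).
+ # Taking the min caps how far a single update can push the policy in the direction favoured
+ # by the advantage once the ratio leaves the [1 - eps, 1 + eps] band.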
739
+
740
+ # Add KL penalty if beta > 0
741
+ if self.beta > 0:
742
+ ref_per_token_logps = inputs["ref_per_token_logps"]
743
+ per_token_kl = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1
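+ # This is the k3 estimator of KL(pi || pi_ref): with d = logp_ref - logp, k3 = exp(d) - d - 1.
+ # It is non-negative (exp(d) >= 1 + d) and unbiased; for small |d| it behaves like d^2 / 2.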
744
+ per_token_loss = per_token_loss + self.beta * per_token_kl
745
+
746
+ # Log KL divergence
747
+ mean_kl = ((per_token_kl * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
748
+ self._metrics["kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item())
749
+
750
+ # Compute final loss
751
+ loss = ((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
752
+
753
+ # Log clip ratio
754
+ is_clipped = (per_token_loss1 < per_token_loss2).float()
755
+ clip_ratio = (is_clipped * completion_mask).sum() / completion_mask.sum()
756
+ self._metrics["clip_ratio"].append(self.accelerator.gather_for_metrics(clip_ratio).mean().item())
757
+
758
+ return loss
759
+
760
+ def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
761
+ metrics = {key: sum(val) / len(val) for key, val in self._metrics.items()} # average the metrics
762
+ logs = {**logs, **metrics}
763
+ if version.parse(transformers.__version__) >= version.parse("4.47.0.dev0"):
764
+ super().log(logs, start_time)
765
+ else: # transformers<=4.46
766
+ super().log(logs)
767
+ self._metrics.clear()
768
+
769
+ def create_model_card(
770
+ self,
771
+ model_name: Optional[str] = None,
772
+ dataset_name: Optional[str] = None,
773
+ tags: Union[str, list[str], None] = None,
774
+ ):
775
+ """
776
+ Creates a draft of a model card using the information available to the `Trainer`.
777
+
778
+ Args:
779
+ model_name (`str` or `None`, *optional*, defaults to `None`):
780
+ Name of the model.
781
+ dataset_name (`str` or `None`, *optional*, defaults to `None`):
782
+ Name of the dataset used for training.
783
+ tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
784
+ Tags to be associated with the model card.
785
+ """
786
+ if not self.is_world_process_zero():
787
+ return
788
+
789
+ if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path):
790
+ base_model = self.model.config._name_or_path
791
+ else:
792
+ base_model = None
793
+
794
+ tags = tags or []
795
+ if isinstance(tags, str):
796
+ tags = [tags]
797
+
798
+ if hasattr(self.model.config, "unsloth_version"):
799
+ tags.append("unsloth")
800
+
801
+ citation = textwrap.dedent(
802
+ """\
803
+ @article{zhihong2024deepseekmath,
804
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
805
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
806
+ year = 2024,
807
+ eprint = {arXiv:2402.03300},
808
+ """
809
+ )
810
+
811
+ model_card = generate_model_card(
812
+ base_model=base_model,
813
+ model_name=model_name,
814
+ hub_model_id=self.hub_model_id,
815
+ dataset_name=dataset_name,
816
+ tags=tags,
817
+ wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None,
818
+ comet_url=get_comet_experiment_url(),
819
+ trainer_name="GRPO",
820
+ trainer_citation=citation,
821
+ paper_title="DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
822
+ paper_id="2402.03300",
823
+ )
824
+
825
+ model_card.save(os.path.join(self.args.output_dir, "README.md"))
826
+
827
+ def _get_train_sampler(self) -> Sampler:
828
+ """Returns a sampler that ensures proper data sampling for GRPO training."""
829
+ effective_batch_size = (
830
+ self.args.per_device_train_batch_size
831
+ * self.accelerator.num_processes
832
+ * self.args.gradient_accumulation_steps
833
+ )
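+ # Worked example (hypothetical config): per_device_train_batch_size=4, 2 processes and
+ # gradient_accumulation_steps=2 give effective_batch_size=16; with num_generations=8 the
+ # sampler below then draws 16 // 8 = 2 unique prompts per cycle and repeats each one 8 times.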
834
+
835
+ return RepeatRandomSampler(
836
+ data_source=self.train_dataset,
837
+ mini_repeat_count=self.num_generations,
838
+ batch_size=effective_batch_size // self.num_generations,
839
+ repeat_count=self.num_iterations,
840
+ seed=self.args.seed,
841
+ )
842
+
843
+ def _get_eval_sampler(self, eval_dataset) -> Sampler:
844
+ """Returns a sampler for evaluation."""
845
+ return RepeatRandomSampler(
846
+ data_source=eval_dataset,
847
+ mini_repeat_count=self.num_generations,
848
+ seed=self.args.seed,
849
+ )
open-r1-multimodal/src/open_r1/trainer/qwen_grpo_trainer.py ADDED
@@ -0,0 +1,877 @@
1
+ # Copyright 2025 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import textwrap
17
+ from collections import defaultdict
18
+ from typing import Any, Callable, Optional, Union, Sized
19
+ from qwen_vl_utils import process_vision_info
20
+ import torch
21
+ import torch.utils.data
22
+ import transformers
23
+ from datasets import Dataset, IterableDataset
24
+ from packaging import version
25
+ from transformers import (
26
+ AriaForConditionalGeneration,
27
+ AriaProcessor,
28
+ AutoModelForCausalLM,
29
+ AutoModelForSequenceClassification,
30
+ AutoProcessor,
31
+ AutoTokenizer,
32
+ GenerationConfig,
33
+ PreTrainedModel,
34
+ PreTrainedTokenizerBase,
35
+ Qwen2VLForConditionalGeneration,
36
+ Qwen2_5_VLForConditionalGeneration,
37
+ Trainer,
38
+ TrainerCallback,
39
+ is_wandb_available,
40
+ )
41
+ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
42
+ from transformers.utils import is_peft_available
43
+
44
+ from trl.data_utils import apply_chat_template, is_conversational, maybe_apply_chat_template
45
+ from trl.models import create_reference_model, prepare_deepspeed, unwrap_model_for_generation
46
+ from trl.trainer.grpo_config import GRPOConfig
47
+ from trl.trainer.utils import generate_model_card, get_comet_experiment_url
48
+
49
+ from accelerate.utils import is_peft_model, set_seed
50
+ import PIL.Image
51
+
52
+ import copy
53
+ from torch.utils.data import Sampler
54
+ import warnings
55
+
56
+ if is_peft_available():
57
+ from peft import PeftConfig, get_peft_model
58
+
59
+ if is_wandb_available():
60
+ import wandb
61
+
62
+ # What we call a reward function is a callable that takes a list of prompts and completions and returns a list of
63
+ # rewards. When it's a string, it's a model ID, so it's loaded as a pretrained model.
64
+ RewardFunc = Union[str, PreTrainedModel, Callable[[list, list], list[float]]]
65
+
66
+
67
+ class RepeatRandomSampler(Sampler):
68
+ """
69
+ Sampler that repeats the indices of a dataset in a structured manner.
70
+
71
+ Args:
72
+ data_source (`Sized`):
73
+ Dataset to sample from.
74
+ mini_repeat_count (`int`):
75
+ Number of times to repeat each index per batch.
76
+ batch_size (`int`, *optional*, defaults to `1`):
77
+ Number of unique indices per batch.
78
+ repeat_count (`int`, *optional*, defaults to `1`):
79
+ Number of times to repeat the full sampling process.
80
+ seed (`int` or `None`, *optional*, defaults to `None`):
81
+ Random seed for reproducibility.
82
+ """
83
+
84
+ def __init__(
85
+ self,
86
+ data_source: Sized,
87
+ mini_repeat_count: int,
88
+ batch_size: int = 1,
89
+ repeat_count: int = 1,
90
+ seed: Optional[int] = None,
91
+ ):
92
+ self.data_source = data_source
93
+ self.mini_repeat_count = mini_repeat_count
94
+ self.batch_size = batch_size
95
+ self.repeat_count = repeat_count
96
+ self.num_samples = len(data_source)
97
+ self.seed = seed
98
+ self.generator = torch.Generator()
99
+ if seed is not None:
100
+ self.generator.manual_seed(seed)
101
+
102
+ def __iter__(self):
103
+ indexes = torch.randperm(self.num_samples, generator=self.generator).tolist()
104
+ indexes = [indexes[i : i + self.batch_size] for i in range(0, len(indexes), self.batch_size)]
105
+ indexes = [chunk for chunk in indexes if len(chunk) == self.batch_size]
106
+
107
+ for chunk in indexes:
108
+ for _ in range(self.repeat_count):
109
+ for index in chunk:
110
+ for _ in range(self.mini_repeat_count):
111
+ yield index
112
+
113
+ def __len__(self) -> int:
114
+ return self.num_samples * self.mini_repeat_count * self.repeat_count
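+ # Illustration of the iteration order (hypothetical permutation [2, 0, 3, 1]): with batch_size=2,
+ # mini_repeat_count=2 and repeat_count=2 the sampler yields
+ #   2, 2, 0, 0,  2, 2, 0, 0,  3, 3, 1, 1,  3, 3, 1, 1
+ # i.e. each batch of unique indices is replayed repeat_count times, with every index duplicated
+ # mini_repeat_count times inside each replay.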
115
+
116
+
117
+ class Qwen2VLGRPOTrainer(Trainer):
118
+ """
119
+ Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the
120
+ paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
121
+
122
+ Example:
123
+
124
+ ```python
125
+ from datasets import load_dataset
126
+ from trl import GRPOTrainer
127
+
128
+ dataset = load_dataset("trl-lib/tldr", split="train")
129
+
130
+ trainer = GRPOTrainer(
131
+ model="Qwen/Qwen2-0.5B-Instruct",
132
+ reward_funcs="weqweasdas/RM-Gemma-2B",
133
+ train_dataset=dataset,
134
+ )
135
+
136
+ trainer.train()
137
+ ```
138
+
139
+ Args:
140
+ model (`Union[str, PreTrainedModel]`):
141
+ Model to be trained. Can be either:
142
+
143
+ - A string, being the *model id* of a pretrained model hosted inside a model repo on huggingface.co, or
144
+ a path to a *directory* containing model weights saved using
145
+ [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is
146
+ loaded using [`~transformers.AutoModelForCausalLM.from_pretrained`] with the keyword arguments
147
+ in `args.model_init_kwargs`.
148
+ - A [`~transformers.PreTrainedModel`] object. Only causal language models are supported.
149
+ reward_funcs (`Union[RewardFunc, list[RewardFunc]]`):
150
+ Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
151
+ functions with the prompts and completions and sum the rewards. Can be either:
152
+
153
+ - A single reward function, such as:
154
+ - A string: The *model ID* of a pretrained model hosted inside a model repo on huggingface.co, or a
155
+ path to a *directory* containing model weights saved using
156
+ [`~transformers.PreTrainedModel.save_pretrained`], e.g., `'./my_model_directory/'`. The model is loaded
157
+ using [`~transformers.AutoModelForSequenceClassification.from_pretrained`] with `num_labels=1` and the
158
+ keyword arguments in `args.model_init_kwargs`.
159
+ - A [`~transformers.PreTrainedModel`] object: Only sequence classification models are supported.
160
+ - A custom reward function: The function is provided with the prompts and the generated completions,
161
+ plus any additional columns in the dataset. It should return a list of rewards. For more details, see
162
+ [Using a custom reward function](#using-a-custom-reward-function).
163
+ - A list of reward functions, where each item can independently be any of the above types. Mixing different
164
+ types within the list (e.g., a string model ID and a custom reward function) is allowed.
165
+ args ([`GRPOConfig`], *optional*, defaults to `None`):
166
+ Configuration for this trainer. If `None`, a default configuration is used.
167
+ train_dataset ([`~datasets.Dataset`] or [`~datasets.IterableDataset`]):
168
+ Dataset to use for training. It must include a column `"prompt"`. Any additional columns in the dataset are
169
+ ignored. The format of the samples can be either:
170
+
171
+ - [Standard](dataset_formats#standard): Each sample contains plain text.
172
+ - [Conversational](dataset_formats#conversational): Each sample contains structured messages (e.g., role
173
+ and content).
174
+ eval_dataset ([`~datasets.Dataset`], [`~datasets.IterableDataset`] or `dict[str, Union[Dataset, IterableDataset]]`):
175
+ Dataset to use for evaluation. It must meet the same requirements as `train_dataset`.
176
+ processing_class ([`~transformers.PreTrainedTokenizerBase`], *optional*, defaults to `None`):
177
+ Processing class used to process the data. The padding side must be set to "left". If `None`, the
178
+ processing class is loaded from the model's name with [`~transformers.AutoTokenizer.from_pretrained`].
179
+ reward_processing_classes (`Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]`, *optional*, defaults to `None`):
180
+ Processing classes corresponding to the reward functions specified in `reward_funcs`. Can be either:
181
+
182
+ - A single processing class: Used when `reward_funcs` contains only one reward function.
183
+ - A list of processing classes: Must match the order and length of the reward functions in `reward_funcs`.
184
+ If set to `None`, or if an element of the list corresponding to a [`~transformers.PreTrainedModel`] is
185
+ `None`, the tokenizer for the model is automatically loaded using [`~transformers.AutoTokenizer.from_pretrained`].
186
+ For elements in `reward_funcs` that are custom reward functions (not [`~transformers.PreTrainedModel`]),
187
+ the corresponding entries in `reward_processing_classes` are ignored.
188
+ callbacks (list of [`~transformers.TrainerCallback`], *optional*, defaults to `None`):
189
+ List of callbacks to customize the training loop. Will add those to the list of default callbacks
190
+ detailed in [here](https://huggingface.co/docs/transformers/main_classes/callback).
191
+
192
+ If you want to remove one of the default callbacks used, use the [`~transformers.Trainer.remove_callback`]
193
+ method.
194
+ optimizers (`tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`):
195
+ A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your
196
+ model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`.
197
+ peft_config ([`~peft.PeftConfig`], *optional*, defaults to `None`):
198
+ PEFT configuration used to wrap the model. If `None`, the model is not wrapped.
199
+ """
200
+
201
+ def __init__(
202
+ self,
203
+ model: Union[str, PreTrainedModel],
204
+ reward_funcs: Union[RewardFunc, list[RewardFunc]],
205
+ args: GRPOConfig = None,
206
+ train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
207
+ eval_dataset: Optional[Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]] = None,
208
+ processing_class: Optional[PreTrainedTokenizerBase] = None,
209
+ reward_processing_classes: Optional[Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]] = None,
210
+ callbacks: Optional[list[TrainerCallback]] = None,
211
+ optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None),
212
+ peft_config: Optional["PeftConfig"] = None,
213
+ freeze_vision_modules: Optional[bool] = False,
214
+ max_pixels: Optional[int] = 4028160,
215
+ min_pixels: Optional[int] = 3136,
216
+ attn_implementation: str = "flash_attention_2",
217
+ torch_dtype: str = "bfloat16",
218
+ ):
219
+ # Args
220
+ if args is None:
221
+ model_name = model if isinstance(model, str) else model.config._name_or_path
222
+ model_name = model_name.split("/")[-1]
223
+ args = GRPOConfig(f"{model_name}-GRPO")
224
+
225
+ # Models
226
+ # Trained model
227
+ model_init_kwargs = args.model_init_kwargs or {}
228
+ # print(model_init_kwargs)
229
+ # print("========================")
230
+ model_init_kwargs["attn_implementation"] = attn_implementation
231
+ if model_init_kwargs.get("torch_dtype") is None:
232
+ model_init_kwargs["torch_dtype"] = torch_dtype
233
+ if isinstance(model, str):
234
+ model_id = model
235
+ torch_dtype = model_init_kwargs.get("torch_dtype")
236
+ if isinstance(torch_dtype, torch.dtype) or torch_dtype == "auto" or torch_dtype is None:
237
+ pass # torch_dtype is already a torch.dtype or "auto" or None
238
+ elif isinstance(torch_dtype, str): # it's a str, but not "auto"
239
+ torch_dtype = getattr(torch, torch_dtype)
240
+ model_init_kwargs["torch_dtype"] = torch_dtype
241
+ else:
242
+ raise ValueError(
243
+ "Invalid `torch_dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing "
244
+ f"a `torch.dtype` (e.g., 'float32'), but got {torch_dtype}."
245
+ )
246
+ # Disable caching if gradient checkpointing is enabled (not supported)
247
+ # model_init_kwargs["use_cache"] = (
248
+ # False if args.gradient_checkpointing else model_init_kwargs.get("use_cache")
249
+ # )
250
+ model_init_kwargs["use_cache"] = True
251
+ if "Qwen2-VL" in model_id:
252
+ model = Qwen2VLForConditionalGeneration.from_pretrained(model, **model_init_kwargs)
253
+ elif "Qwen2.5-VL" in model_id or "qwen2.5" in model_id:
254
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model, **model_init_kwargs)
255
+ elif "Aria" in model_id:
256
+ model_init_kwargs.pop("use_cache")
257
+ model = AriaForConditionalGeneration.from_pretrained(model, **model_init_kwargs)
258
+ else:
259
+ model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)
260
+ else:
261
+ model_id = model.config._name_or_path
262
+ if args.model_init_kwargs is not None:
263
+ raise ValueError(
264
+ "You passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. "
265
+ "This argument can only be used when the `model` argument is a string."
266
+ )
267
+
268
+ self.vision_modules_keywords = ["visual"]
269
+ if peft_config is not None:
270
+ def find_all_linear_names(model, multimodal_keywords):
271
+ cls = torch.nn.Linear
272
+ lora_module_names = set()
273
+ for name, module in model.named_modules():
274
+ # LoRA is not applied to the vision modules
275
+ if any(mm_keyword in name for mm_keyword in multimodal_keywords):
276
+ continue
277
+ if isinstance(module, cls):
278
+ lora_module_names.add(name)
279
+ for m in lora_module_names: # needed for 16-bit
280
+ if "embed_tokens" in m:
281
+ lora_module_names.remove(m)
282
+ return list(lora_module_names)
283
+ target_modules = find_all_linear_names(model, self.vision_modules_keywords)
284
+ peft_config.target_modules = target_modules
285
+ model = get_peft_model(model, peft_config)
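+ # On a Qwen2-VL-style backbone this typically selects the language-model projections
+ # (illustrative names: "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj",
+ # "down_proj") while skipping everything under "visual", so LoRA never touches the vision tower.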
286
+
287
+ if freeze_vision_modules:
288
+ print("Freezing vision modules...")
289
+ for n, p in model.named_parameters():
290
+ if any(keyword in n for keyword in self.vision_modules_keywords):
291
+ p.requires_grad = False
292
+
293
+ # Enable gradient checkpointing if requested
294
+ if args.gradient_checkpointing:
295
+ model = self._enable_gradient_checkpointing(model, args)
296
+
297
+ # Reference model
298
+ if is_deepspeed_zero3_enabled():
299
+ if "Qwen2-VL" in model_id:
300
+ self.ref_model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, **model_init_kwargs)
301
+ elif "Qwen2.5-VL" in model_id or "qwen2.5" in model_id:
302
+ self.ref_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id, **model_init_kwargs)
303
+ elif "Aria" in model_id:
304
+ self.ref_model = AriaForConditionalGeneration.from_pretrained(model_id, **model_init_kwargs)
305
+ else:
306
+ self.ref_model = AutoModelForCausalLM.from_pretrained(model_id, **model_init_kwargs)
307
+ elif peft_config is None:
308
+ # If PEFT configuration is not provided, create a reference model based on the initial model.
309
+ self.ref_model = create_reference_model(model)
310
+ else:
311
+ # If PEFT is used, the reference model is not needed since the adapter can be disabled
312
+ # to revert to the initial model.
313
+ self.ref_model = None
314
+
315
+ # Processing class
316
+ if processing_class is None:
317
+ if "Qwen2-VL" in model_id or "Qwen2.5-VL" in model_id or "Aria" in model_id or "qwen2.5" in model_id:
318
+ processing_class = AutoProcessor.from_pretrained(model_id, max_pixels=max_pixels, min_pixels=min_pixels)
319
+ pad_token_id = processing_class.tokenizer.pad_token_id
320
+ processing_class.pad_token_id = pad_token_id
321
+ processing_class.eos_token_id = processing_class.tokenizer.eos_token_id
322
+ # if "Qwen" in model_id or "Qwen2.5-VL" in model_id:
323
+ # processing_class.image_processor.max_pixels = max_pixels
324
+ # # print(max_pixels)
325
+ # # print('-'*100)
326
+ # processing_class.image_processor.min_pixels = min_pixels
327
+ else:
328
+ processing_class = AutoTokenizer.from_pretrained(model.config._name_or_path, padding_side="left")
329
+ pad_token_id = processing_class.pad_token_id
330
+
331
+ # Reward functions
332
+ if not isinstance(reward_funcs, list):
333
+ reward_funcs = [reward_funcs]
334
+ for i, reward_func in enumerate(reward_funcs):
335
+ if isinstance(reward_func, str):
336
+ reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained(
337
+ reward_func, num_labels=1, **model_init_kwargs
338
+ )
339
+ self.reward_funcs = reward_funcs
340
+
341
+ # Reward processing class
342
+ if reward_processing_classes is None:
343
+ reward_processing_classes = [None] * len(reward_funcs)
344
+ elif not isinstance(reward_processing_classes, list):
345
+ reward_processing_classes = [reward_processing_classes]
346
+ else:
347
+ if len(reward_processing_classes) != len(reward_funcs):
348
+ raise ValueError("The number of reward processing classes must match the number of reward functions.")
349
+
350
+ for i, (reward_processing_class, reward_func) in enumerate(zip(reward_processing_classes, reward_funcs)):
351
+ if isinstance(reward_func, PreTrainedModel):
352
+ if reward_processing_class is None:
353
+ reward_processing_class = AutoTokenizer.from_pretrained(reward_func.config._name_or_path)
354
+ if reward_processing_class.pad_token_id is None:
355
+ reward_processing_class.pad_token = reward_processing_class.eos_token
356
+ # The reward model computes the reward for the latest non-padded token in the input sequence.
357
+ # So it's important to set the pad token ID to the padding token ID of the processing class.
358
+ reward_func.config.pad_token_id = reward_processing_class.pad_token_id
359
+ reward_processing_classes[i] = reward_processing_class
360
+ self.reward_processing_classes = reward_processing_classes
361
+
362
+ # self.reward_weight_schedule = [
363
+ # (0, [1.0, 0.0, 0.5]), # from step 0 through step 1999, use "accuracy" (1.0) and "format" (0.5)
365
+ # (2300, [0.0, 1.0, 0.5]), # from step 2000 onwards, use "another_reward" (1.0)
365
+ # ]
366
+
367
+ # Data collator
368
+ def data_collator(features): # No data collation is needed in GRPO
369
+ return features
370
+
371
+ # Training arguments
372
+ self.max_pixels = max_pixels
373
+ self.min_pixels = min_pixels
374
+ # max_prompt_length is not supported yet, so it is always forced to None
375
+ self.max_prompt_length = None
376
+ if args.max_prompt_length is not None:
377
+ warnings.warn("Setting max_prompt_length is currently not supported, it has been set to None")
378
+
379
+ self.max_completion_length = args.max_completion_length # = |o_i| in the GRPO paper
380
+ # print(self.max_completion_length)
381
+ # print("max_completion_length")
382
+ # print('-' * 100)
383
+ self.num_generations = args.num_generations # = G in the GRPO paper
384
+ self.generation_config = GenerationConfig(
385
+ max_new_tokens=self.max_completion_length,
386
+ do_sample=True,
387
+ temperature=1,
388
+ pad_token_id=pad_token_id,
389
+ )
390
+ self.beta = args.beta
391
+ self.epsilon = args.epsilon
392
+
393
+ # Multi-step
394
+ self.num_iterations = args.num_iterations # = 𝜇 in the GRPO paper
395
+ # Tracks the number of iterations (forward + backward passes), including those within a gradient accumulation cycle
396
+ self._step = 0
397
+ # Buffer the batch to reuse generated outputs across multiple updates
398
+ self._buffered_inputs = [None] * args.gradient_accumulation_steps
399
+
400
+ # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
401
+ # input tensor associated with the key "input_ids". However, in GRPO, the sampled data does not include the
402
+ # "input_ids" key. Instead, the available keys is "prompt". As a result, the trainer issues the warning:
403
+ # "Could not estimate the number of tokens of the input, floating-point operations will not be computed." To
404
+ # suppress this warning, we set the "estimate_tokens" key in the model's "warnings_issued" dictionary to True.
405
+ # This acts as a flag to indicate that the warning has already been issued.
406
+ model.warnings_issued["estimate_tokens"] = True
407
+
408
+ # Initialize the metrics
409
+ self._metrics = defaultdict(list)
410
+
411
+ super().__init__(
412
+ model=model,
413
+ args=args,
414
+ data_collator=data_collator,
415
+ train_dataset=train_dataset,
416
+ eval_dataset=eval_dataset,
417
+ processing_class=processing_class,
418
+ callbacks=callbacks,
419
+ optimizers=optimizers,
420
+ )
421
+
422
+ # Check if the per_device_train/eval_batch_size * num processes can be divided by the number of generations
423
+ num_processes = self.accelerator.num_processes
424
+ global_batch_size = args.per_device_train_batch_size * num_processes
425
+ possible_values = [n_gen for n_gen in range(2, global_batch_size + 1) if (global_batch_size) % n_gen == 0]
426
+ if self.num_generations not in possible_values:
427
+ raise ValueError(
428
+ f"The global train batch size ({num_processes} x {args.per_device_train_batch_size}) must be evenly "
429
+ f"divisible by the number of generations per prompt ({self.num_generations}). Given the current train "
430
+ f"batch size, the valid values for the number of generations are: {possible_values}."
431
+ )
432
+ if self.args.eval_strategy != "no":
433
+ global_batch_size = args.per_device_eval_batch_size * num_processes
434
+ possible_values = [n_gen for n_gen in range(2, global_batch_size + 1) if (global_batch_size) % n_gen == 0]
435
+ if self.num_generations not in possible_values:
436
+ raise ValueError(
437
+ f"The global eval batch size ({num_processes} x {args.per_device_eval_batch_size}) must be evenly "
438
+ f"divisible by the number of generations per prompt ({self.num_generations}). Given the current "
439
+ f"eval batch size, the valid values for the number of generations are: {possible_values}."
440
+ )
441
+
442
+ # Ensure each process receives a unique seed to prevent duplicate completions when generating with
443
+ # transformers if num_generations exceeds per_device_train_batch_size. We could skip it if we use vLLM, but
444
+ # it's safer to set it in all cases.
445
+ set_seed(args.seed, device_specific=True)
446
+
447
+ # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
448
+ # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
449
+ # self.model_accepts_loss_kwargs to False to enable scaling.
450
+ self.model_accepts_loss_kwargs = False
451
+
452
+ if self.ref_model is not None:
453
+ if self.is_deepspeed_enabled:
454
+ self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
455
+ else:
456
+ self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
457
+
458
+ for i, reward_func in enumerate(self.reward_funcs):
459
+ if isinstance(reward_func, PreTrainedModel):
460
+ self.reward_funcs[i] = self.accelerator.prepare_model(reward_func, evaluation_mode=True)
461
+
462
+
463
+ def _enable_gradient_checkpointing(self, model: PreTrainedModel, args: GRPOConfig) -> PreTrainedModel:
464
+ """Enables gradient checkpointing for the model."""
465
+ # Ensure use_cache is disabled
466
+ model.config.use_cache = False
467
+
468
+ # Enable gradient checkpointing on the base model for PEFT
469
+ if is_peft_model(model):
470
+ model.base_model.gradient_checkpointing_enable()
471
+ # Enable gradient checkpointing for non-PEFT models
472
+ else:
473
+ model.gradient_checkpointing_enable()
474
+
475
+ gradient_checkpointing_kwargs = args.gradient_checkpointing_kwargs or {}
476
+ use_reentrant = (
477
+ "use_reentrant" not in gradient_checkpointing_kwargs or gradient_checkpointing_kwargs["use_reentrant"]
478
+ )
479
+
480
+ if use_reentrant:
481
+ model.enable_input_require_grads()
482
+
483
+ return model
484
+
485
+ def _set_signature_columns_if_needed(self):
486
+ # If `self.args.remove_unused_columns` is True, non-signature columns are removed.
487
+ # By default, this method sets `self._signature_columns` to the model's expected inputs.
488
+ # In GRPOTrainer, we preprocess data, so using the model's signature columns doesn't work.
489
+ # Instead, we set them to the columns expected by the `training_step` method, hence the override.
490
+ if self._signature_columns is None:
491
+ self._signature_columns = ["prompt"]
492
+
493
+
494
+ # Get the per-token log probabilities for the completions for the model and the reference model
495
+ def _get_per_token_logps(self, model, input_ids, attention_mask, pixel_values, image_grid_thw):
496
+ logits = model(input_ids, attention_mask=attention_mask, pixel_values=pixel_values, image_grid_thw=image_grid_thw).logits # (B, L, V)
497
+ logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred
498
+ input_ids = input_ids[:, 1:] # (B, L-1), exclude the first input ID since we don't have logits for it
499
+ # Compute the log probabilities for the input tokens. Use a loop to reduce memory peak.
500
+ per_token_logps = []
501
+ for logits_row, input_ids_row in zip(logits, input_ids):
502
+ log_probs = logits_row.log_softmax(dim=-1)
503
+ token_log_prob = torch.gather(log_probs, dim=1, index=input_ids_row.unsqueeze(1)).squeeze(1)
504
+ per_token_logps.append(token_log_prob)
505
+ return torch.stack(per_token_logps)
506
+
507
+
508
+ def _prepare_inputs(self, inputs):
509
+ # Simple pass-through, just like original
510
+ return inputs
511
+
512
+ def _generate_and_score_completions(self, inputs: dict[str, Union[torch.Tensor, Any]], model) -> dict[str, Union[torch.Tensor, Any]]:
513
+ device = self.accelerator.device
514
+ prompts = [x["prompt"] for x in inputs]
515
+ prompts_text = [maybe_apply_chat_template(example, self.processing_class)["prompt"] for example in inputs]
516
+ # Handle both pre-loaded images and image paths
517
+ images = []
518
+ for x in inputs:
519
+ if "image" in x:
520
+ img = x["image"]
521
+ elif "image_path" in x and x["image_path"] is not None:
522
+ img = PIL.Image.open(x["image_path"])
+ else:
+ raise ValueError("Each sample must provide either an 'image' or a non-empty 'image_path' entry")
523
+
524
+ try:
525
+ # Ensure minimum dimensions of 28 pixels; the upper bound is enforced separately by the processor's max_pixels setting.
526
+ w, h = img.size
527
+ if w < 28 or h < 28:
528
+ # Calculate new dimensions maintaining aspect ratio
529
+ if w < h:
530
+ new_w = 28
531
+ new_h = int(h * (28/w))
532
+ else:
533
+ new_h = 28
534
+ new_w = int(w * (28/h))
535
+ img = img.resize((new_w, new_h), PIL.Image.Resampling.LANCZOS)
536
+ except Exception:
537
+ pass  # If resizing fails, keep the original image unchanged
538
+ images.append(img)  # Appended outside the try/except so a failed resize cannot silently drop an image and misalign prompts and images
539
+
540
+ if len(images) > 0:
541
+ # print(self.processing_class.image_processor.max_pixels)
542
+ prompt_inputs = self.processing_class(
543
+ text=prompts_text,
544
+ images=images,
545
+ return_tensors="pt",
546
+ padding=True,
547
+ padding_side="left",
548
+ add_special_tokens=False,
549
+ )
550
+ else:
551
+ prompt_inputs = self.processing_class(
552
+ text=prompts_text,
553
+ return_tensors="pt",
554
+ padding=True,
555
+ padding_side="left",
556
+ add_special_tokens=False,
557
+ )
558
+ prompt_inputs = super()._prepare_inputs(prompt_inputs)
559
+
560
+ prompt_ids, prompt_mask = prompt_inputs["input_ids"], prompt_inputs["attention_mask"]
561
+ if len(images) > 0:
562
+ pixel_values = prompt_inputs["pixel_values"]
563
+ image_grid_thw = prompt_inputs["image_grid_thw"]
564
+ else:
565
+ pixel_values = None
566
+ image_grid_thw = None
567
+
568
+
569
+ # if self.max_prompt_length is not None:
570
+ # prompt_ids = prompt_ids[:, -self.max_prompt_length :]
571
+ # prompt_inputs["input_ids"] = prompt_ids
572
+ # prompt_mask = prompt_mask[:, -self.max_prompt_length :]
573
+ # prompt_inputs["attention_mask"] = prompt_mask
574
+
575
+ # Generate completions
576
+ with unwrap_model_for_generation(model, self.accelerator) as unwrapped_model:
577
+ prompt_completion_ids = unwrapped_model.generate(
578
+ **prompt_inputs,
579
+ generation_config=self.generation_config
580
+ )
581
+
582
+ prompt_length = prompt_ids.size(1)
583
+ prompt_ids = prompt_completion_ids[:, :prompt_length]
584
+ completion_ids = prompt_completion_ids[:, prompt_length:]
585
+ # No need to repeat prompt_mask as we're not duplicating prompts during generation
586
+
587
+ # Mask everything after the first EOS token
588
+ is_eos = completion_ids == self.processing_class.eos_token_id
589
+ eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device)
590
+ eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
591
+ sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1)
592
+ completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
593
+
594
+ # Concatenate prompt_mask with completion_mask for logit computation
595
+ attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) # (B, P+C)
596
+ try:
597
+ pixel_values = prompt_inputs["pixel_values"]
598
+ image_grid_thw = prompt_inputs["image_grid_thw"]
599
+ except:
600
+
601
+ pixel_values = None
602
+ image_grid_thw = None
603
+ with torch.no_grad():
604
+ # When using num_iterations == 1, old_per_token_logps == per_token_logps, so we can skip its
605
+ # computation here, and use per_token_logps.detach() instead.
606
+ if self.num_iterations > 1:
607
+ old_per_token_logps = self._get_per_token_logps(
608
+ model, prompt_completion_ids, attention_mask, pixel_values, image_grid_thw
609
+ )
610
+ old_per_token_logps = old_per_token_logps[:, prompt_length - 1:]
611
+ else:
612
+ old_per_token_logps = None
613
+
614
+ if self.beta == 0.0:
615
+ ref_per_token_logps = None
616
+ elif self.ref_model is not None:
617
+ ref_per_token_logps = self._get_per_token_logps(
618
+ self.ref_model, prompt_completion_ids, attention_mask, pixel_values, image_grid_thw
619
+ )
620
+ else:
621
+ with self.accelerator.unwrap_model(model).disable_adapter():
622
+ ref_per_token_logps = self._get_per_token_logps(
623
+ model, prompt_completion_ids, attention_mask, pixel_values, image_grid_thw
624
+ )
625
+ ref_per_token_logps = ref_per_token_logps[:, prompt_length - 1:]
626
+
627
+ # Decode the generated completions
628
+ completions = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True)
629
+ if is_conversational(inputs[0]):
630
+ completions = [[{"role": "assistant", "content": completion}] for completion in completions]
631
+
632
+ # Compute the rewards
633
+ # No need to duplicate prompts as we're not generating multiple completions per prompt
634
+
635
+ rewards_per_func = torch.zeros(len(prompts), len(self.reward_funcs), device=device)
636
+ for i, (reward_func, reward_processing_class) in enumerate(
637
+ zip(self.reward_funcs, self.reward_processing_classes)
638
+ ):
639
+ if isinstance(reward_func, PreTrainedModel):
640
+ if is_conversational(inputs[0]):
641
+ messages = [{"messages": p + c} for p, c in zip(prompts, completions)]
642
+ texts = [apply_chat_template(x, reward_processing_class)["text"] for x in messages]
643
+ else:
644
+ texts = [p + c for p, c in zip(prompts, completions)]
645
+ reward_inputs = reward_processing_class(
646
+ texts, return_tensors="pt", padding=True, padding_side="right", add_special_tokens=False
647
+ )
648
+ reward_inputs = super()._prepare_inputs(reward_inputs)
649
+ with torch.inference_mode():
650
+ rewards_per_func[:, i] = reward_func(**reward_inputs).logits[:, 0] # Shape (B*G,)
651
+ else:
652
+ # Repeat all input columns (except "prompt" and "completion") to match the number of generations
653
+ reward_kwargs = {key: [] for key in inputs[0].keys() if key not in ["prompt", "completion"]}
654
+ for key in reward_kwargs:
655
+ for example in inputs:
656
+ # No need to duplicate prompts as we're not generating multiple completions per prompt
657
+ # reward_kwargs[key].extend([example[key]] * self.num_generations)
658
+ reward_kwargs[key].extend([example[key]])
659
+ output_reward_func = reward_func(prompts=prompts, completions=completions, **reward_kwargs)
660
+ rewards_per_func[:, i] = torch.tensor(output_reward_func, dtype=torch.float32, device=device)
661
+
662
+ # Gather rewards across processes
663
+ rewards_per_func = self.accelerator.gather(rewards_per_func)
664
+
665
+ # # Dynamically select the reward weights
667
+ # current_weights = self.reward_weight_schedule[0][1] # default to the first weight set
667
+ # for step, weights in self.reward_weight_schedule:
668
+ # if self.state.global_step >= step:
669
+ # current_weights = weights
670
+ # else:
671
+ # break
672
+
673
+ # Convert the (currently hard-coded) reward weights to a tensor
675
+ weights = torch.tensor([1.0, 0.5], dtype=torch.float32, device=device)
675
+
676
+ rewards = (rewards_per_func * weights).sum(dim=1)
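+ # Worked example (hypothetical values): a rewards_per_func row of [0.8, 1.0] with weights
+ # [1.0, 0.5] gives 0.8 * 1.0 + 1.0 * 0.5 = 1.3. Note the weights are hard-coded here (one entry
+ # per reward function) instead of coming from the commented-out schedule above.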
677
+ # print(rewards.shape)
678
+ # breakpoint()
679
+
680
+ # Compute grouped-wise rewards
681
+ # Each group consists of num_generations completions for the same prompt
682
+ mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
683
+ std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1)
684
+
685
+ # Normalize the rewards to compute the advantages
686
+ mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
687
+ std_grouped_rewards = std_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
688
+ advantages = (rewards - mean_grouped_rewards) / (std_grouped_rewards + 1e-4)
689
+
690
+ # Get only the local slice of advantages
691
+ process_slice = slice(
692
+ self.accelerator.process_index * len(prompts),
693
+ (self.accelerator.process_index + 1) * len(prompts),
694
+ )
695
+ advantages = advantages[process_slice]
696
+
697
+        # Log the metrics
+        completion_length = self.accelerator.gather_for_metrics(completion_mask.sum(1)).float().mean().item()
+        self._metrics["completion_length"].append(completion_length)
+
+        reward_per_func = self.accelerator.gather_for_metrics(rewards_per_func).mean(0)
+        for i, reward_func in enumerate(self.reward_funcs):
+            if isinstance(reward_func, PreTrainedModel):
+                reward_func_name = reward_func.config._name_or_path.split("/")[-1]
+            else:
+                reward_func_name = reward_func.__name__
+            self._metrics[f"rewards/{reward_func_name}"].append(reward_per_func[i].item())
+
+        self._metrics["reward"].append(self.accelerator.gather_for_metrics(rewards).mean().item())
+        self._metrics["reward_std"].append(self.accelerator.gather_for_metrics(std_grouped_rewards).mean().item())
+
+        return {
+            "prompt_ids": prompt_ids,
+            "prompt_mask": prompt_mask,
+            "completion_ids": completion_ids,
+            "completion_mask": completion_mask,
+            "old_per_token_logps": old_per_token_logps,
+            "ref_per_token_logps": ref_per_token_logps,
+            "advantages": advantages,
+            "pixel_values": pixel_values,
+            "image_grid_thw": image_grid_thw,
+        }
+
+    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
+        if return_outputs:
+            raise ValueError("The GRPOTrainer does not support returning outputs")
+
+        # Check if we need to generate new completions or use buffered ones
+        if self.state.global_step % self.num_iterations == 0:
+            inputs = self._generate_and_score_completions(inputs, model)
+            self._buffered_inputs[self._step % self.args.gradient_accumulation_steps] = inputs
+        else:
+            inputs = self._buffered_inputs[self._step % self.args.gradient_accumulation_steps]
+        self._step += 1
+
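With `num_iterations > 1`, each sampled batch is reused for several optimizer updates: generation only happens on steps where `global_step % num_iterations == 0`, and the buffer is indexed by the gradient-accumulation slot. A toy trace of that schedule (step values assumed for illustration):

num_iterations = 2  # mu in the GRPO paper
for global_step in range(4):
    action = "generate" if global_step % num_iterations == 0 else "reuse buffer"
    print(global_step, action)
# 0 generate
# 1 reuse buffer
# 2 generate
# 3 reuse buffer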
+        # Get the prepared inputs
+        prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
+        completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"]
+        pixel_values = inputs["pixel_values"]
+        image_grid_thw = inputs["image_grid_thw"]
+
+        # Concatenate for full sequence
+        input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
+        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
+
+        # Get the current policy's log probabilities
+        per_token_logps = self._get_per_token_logps(model, input_ids, attention_mask, pixel_values, image_grid_thw)
+        # Get rid of the prompt (-1 because of the shift done in get_per_token_logps)
+        per_token_logps = per_token_logps[:, prompt_ids.size(1) - 1:]
+
+        # Get the advantages from inputs
+        advantages = inputs["advantages"]
+
+        # When using num_iterations == 1, old_per_token_logps == per_token_logps, so we can skip its computation
+        # and use per_token_logps.detach() instead
+        old_per_token_logps = inputs["old_per_token_logps"] if self.num_iterations > 1 else per_token_logps.detach()
+
+        # Compute the policy ratio and clipped version
+        coef_1 = torch.exp(per_token_logps - old_per_token_logps)
+        coef_2 = torch.clamp(coef_1, 1 - self.epsilon, 1 + self.epsilon)
+        per_token_loss1 = coef_1 * advantages.unsqueeze(1)
+        per_token_loss2 = coef_2 * advantages.unsqueeze(1)
+        per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
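A standalone sketch of the clipped surrogate on two tokens with `epsilon = 0.2` (ratios and advantages invented). For a positive advantage the gain is capped once the ratio exceeds 1 + epsilon; for a negative advantage the minimum keeps the more pessimistic term:

import torch

eps = 0.2
ratio = torch.tensor([1.5, 0.5])        # exp(logp - old_logp), invented values
advantages = torch.tensor([1.0, -1.0])
clipped = torch.clamp(ratio, 1 - eps, 1 + eps)
loss = -torch.min(ratio * advantages, clipped * advantages)
print(loss)  # tensor([-1.2000,  0.8000])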
+
+        # Add KL penalty if beta > 0
+        if self.beta > 0:
+            ref_per_token_logps = inputs["ref_per_token_logps"]
+            per_token_kl = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1
+            per_token_loss = per_token_loss + self.beta * per_token_kl
+            # Debug prints left enabled: dump the raw per-token loss and KL tensors
+            print(f"per_token_loss:{per_token_loss}")
+            print('-' * 100)
+            print(f"per_token_kl:{per_token_kl}")
+            # Log KL divergence
+            mean_kl = ((per_token_kl * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
+            self._metrics["kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item())
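The `exp(q - p) - (q - p) - 1` term above is the k3 estimator of the KL from the policy to the reference: it is non-negative and zero exactly when the two per-token log-probs agree. A quick numeric check (log-prob values invented):

import torch

logp = torch.tensor([-1.0, -2.0, -3.0])      # policy log-probs (invented)
ref_logp = torch.tensor([-1.0, -1.5, -3.5])  # reference log-probs (invented)
diff = ref_logp - logp
k3 = torch.exp(diff) - diff - 1
print(k3)  # tensor([0.0000, 0.1487, 0.1065]) -- always >= 0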
+
+        # Compute final loss
+        loss = ((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
+
+        # Log clip ratio
+        is_clipped = (per_token_loss1 < per_token_loss2).float()
+        clip_ratio = (is_clipped * completion_mask).sum() / completion_mask.sum()
+        self._metrics["clip_ratio"].append(self.accelerator.gather_for_metrics(clip_ratio).mean().item())
+
+        return loss
+
+    def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
+        metrics = {key: sum(val) / len(val) for key, val in self._metrics.items()}  # average the metrics
+        logs = {**logs, **metrics}
+        if version.parse(transformers.__version__) >= version.parse("4.47.0.dev0"):
+            super().log(logs, start_time)
+        else:  # transformers<=4.46
+            super().log(logs)
+        self._metrics.clear()
+
+    def create_model_card(
+        self,
+        model_name: Optional[str] = None,
+        dataset_name: Optional[str] = None,
+        tags: Union[str, list[str], None] = None,
+    ):
+        """
+        Creates a draft of a model card using the information available to the `Trainer`.
+
+        Args:
+            model_name (`str` or `None`, *optional*, defaults to `None`):
+                Name of the model.
+            dataset_name (`str` or `None`, *optional*, defaults to `None`):
+                Name of the dataset used for training.
+            tags (`str`, `list[str]` or `None`, *optional*, defaults to `None`):
+                Tags to be associated with the model card.
+        """
+        if not self.is_world_process_zero():
+            return
+
+        if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path):
+            base_model = self.model.config._name_or_path
+        else:
+            base_model = None
+
+        tags = tags or []
+        if isinstance(tags, str):
+            tags = [tags]
+
+        if hasattr(self.model.config, "unsloth_version"):
+            tags.append("unsloth")
+
+        citation = textwrap.dedent(
+            """\
+            @article{zhihong2024deepseekmath,
+                title  = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
+                author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
+                year   = 2024,
+                eprint = {arXiv:2402.03300},
+            }
+            """
+        )
+
+        model_card = generate_model_card(
+            base_model=base_model,
+            model_name=model_name,
+            hub_model_id=self.hub_model_id,
+            dataset_name=dataset_name,
+            tags=tags,
+            wandb_url=wandb.run.get_url() if is_wandb_available() and wandb.run is not None else None,
+            comet_url=get_comet_experiment_url(),
+            trainer_name="GRPO",
+            trainer_citation=citation,
+            paper_title="DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
+            paper_id="2402.03300",
+        )
+
+        model_card.save(os.path.join(self.args.output_dir, "README.md"))
+
+    def _get_train_sampler(self) -> Sampler:
+        """Returns a sampler that ensures proper data sampling for GRPO training."""
+        effective_batch_size = (
+            self.args.per_device_train_batch_size
+            * self.accelerator.num_processes
+            * self.args.gradient_accumulation_steps
+        )
+
+        return RepeatRandomSampler(
+            data_source=self.train_dataset,
+            mini_repeat_count=self.num_generations,
+            batch_size=effective_batch_size // self.num_generations,
+            repeat_count=self.num_iterations,
+            seed=self.args.seed,
+        )
+
+    def _get_eval_sampler(self, eval_dataset) -> Sampler:
+        """Returns a sampler for evaluation."""
+        return RepeatRandomSampler(
+            data_source=eval_dataset,
+            mini_repeat_count=self.num_generations,
+            seed=self.args.seed,
+        )
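`_get_train_sampler` above sizes the sampler so that every group of `num_generations` rows in the effective batch belongs to one prompt, which is what `view(-1, num_generations)` in the advantage computation relies on. A back-of-the-envelope check under assumed settings:

per_device_train_batch_size = 4
num_processes = 2
gradient_accumulation_steps = 2
num_generations = 8

effective_batch_size = (
    per_device_train_batch_size * num_processes * gradient_accumulation_steps
)  # 16
assert effective_batch_size % num_generations == 0
print(effective_batch_size // num_generations)  # 2 unique prompts per effective batch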
open-r1-multimodal/src/open_r1/trainer/vllm_grpo_trainer.py ADDED
@@ -0,0 +1,828 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import os
+import textwrap
+import warnings
+from collections import defaultdict
+from typing import Any, Callable, Optional, Union
+from unittest.mock import patch
+
+import torch
+import torch.nn as nn
+import torch.utils.data
+import transformers
+from accelerate.utils import broadcast_object_list, gather, gather_object
+from accelerate.utils.other import is_compiled_module
+from datasets import Dataset, IterableDataset
+from packaging import version
+from torch.utils.data import Sampler
+from transformers import (
+    AriaForConditionalGeneration,
+    AriaProcessor,
+    AutoModelForCausalLM,
+    AutoModelForSequenceClassification,
+    AutoProcessor,
+    AutoTokenizer,
+    GenerationConfig,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
+    Qwen2VLForConditionalGeneration,
+    Qwen2_5_VLForConditionalGeneration,
+    Trainer,
+    TrainerCallback,
+    is_wandb_available,
+)
+from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
+from transformers.utils import is_peft_available
+
+from trl import GRPOTrainer
+from trl.data_utils import apply_chat_template, is_conversational, maybe_apply_chat_template
+from trl.import_utils import is_vllm_available
+from trl.models import create_reference_model, prepare_deepspeed, unwrap_model_for_generation
+from trl.trainer.grpo_config import GRPOConfig
+from trl.trainer.utils import generate_model_card, get_comet_experiment_url, pad
+
+if is_peft_available():
+    from peft import PeftConfig, get_peft_model
+
+if is_vllm_available():
+    from vllm import LLM, SamplingParams
+
+if is_wandb_available():
+    import wandb
+
+# What we call a reward function is a callable that takes a list of prompts and completions and returns a list of
+# rewards. When it's a string, it's a model ID, so it's loaded as a pretrained model.
+RewardFunc = Union[str, PreTrainedModel, Callable[[list, list], list[float]]]
+
+
+class RepeatRandomSampler(Sampler):
+    """
+    Sampler that repeats the indices of a dataset N times.
+
+    Args:
+        data_source (`Sized`):
+            Dataset to sample from.
+        repeat_count (`int`):
+            Number of times to repeat each index.
+
+    Example:
+    ```python
+    >>> sampler = RepeatRandomSampler(["a", "b", "c", "d"], repeat_count=2)
+    >>> list(sampler)
+    [2, 2, 0, 0, 3, 3, 1, 1]
+    ```
+    """
+
+    def __init__(self, data_source, repeat_count: int):
+        self.data_source = data_source
+        self.repeat_count = repeat_count
+        self.num_samples = len(data_source)
+
+    def __iter__(self):
+        indexes = [
+            idx
+            for idx in torch.randperm(self.num_samples).tolist()
+            for _ in range(self.repeat_count)
+        ]
+        return iter(indexes)
+
+    def __len__(self):
+        return self.num_samples * self.repeat_count
+
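Because the repeats are adjacent (each shuffled index is emitted `repeat_count` times in a row), the downstream reward code can recover the groups with a plain `view(-1, num_generations)` after gathering. A quick check of that invariant (the exact order is random; the pairing is not):

sampler = RepeatRandomSampler(["a", "b", "c"], repeat_count=2)
idx = list(sampler)  # e.g. [1, 1, 0, 0, 2, 2]
assert all(idx[i] == idx[i + 1] for i in range(0, len(idx), 2))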
+
+class Qwen2VLGRPOVLLMTrainer(Trainer):
+    def __init__(
+        self,
+        model: Union[str, PreTrainedModel],
+        reward_funcs: Union[RewardFunc, list[RewardFunc]],
+        args: GRPOConfig = None,
+        train_dataset: Optional[Union[Dataset, IterableDataset]] = None,
+        eval_dataset: Optional[
+            Union[Dataset, IterableDataset, dict[str, Union[Dataset, IterableDataset]]]
+        ] = None,
+        processing_class: Optional[PreTrainedTokenizerBase] = None,
+        reward_processing_classes: Optional[
+            Union[PreTrainedTokenizerBase, list[PreTrainedTokenizerBase]]
+        ] = None,
+        callbacks: Optional[list[TrainerCallback]] = None,
+        optimizers: tuple[
+            Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]
+        ] = (None, None),
+        peft_config: Optional["PeftConfig"] = None,
+        # qwen2-vl related params
+        max_pixels: Optional[int] = 1003520,
+        min_pixels: Optional[int] = 3136,
+        attn_implementation: str = "flash_attention_2",
+    ):
+        # Args
+        if args is None:
+            model_name = model if isinstance(model, str) else model.config._name_or_path
+            model_name = model_name.split("/")[-1]
+            args = GRPOConfig(f"{model_name}-GRPO")
+
+        # Models
+        # Trained model
+        model_init_kwargs = args.model_init_kwargs or {}
+        model_init_kwargs["attn_implementation"] = attn_implementation
+        if isinstance(model, str):
+            model_id = model
+            torch_dtype = model_init_kwargs.get("torch_dtype")
+            if isinstance(torch_dtype, torch.dtype) or torch_dtype == "auto" or torch_dtype is None:
+                pass  # torch_dtype is already a torch.dtype or "auto" or None
+            elif isinstance(torch_dtype, str):  # it's a str, but not "auto"
+                torch_dtype = getattr(torch, torch_dtype)
+                model_init_kwargs["torch_dtype"] = torch_dtype
+            else:
+                raise ValueError(
+                    "Invalid `torch_dtype` passed to `GRPOConfig`. Expected either 'auto' or a string representing "
+                    f"a `torch.dtype` (e.g., 'float32'), but got {torch_dtype}."
+                )
+            # Disable caching if gradient checkpointing is enabled (not supported)
+            model_init_kwargs["use_cache"] = (
+                False if args.gradient_checkpointing else model_init_kwargs.get("use_cache")
+            )
+            if "Qwen2-VL" in model_id:
+                model = Qwen2VLForConditionalGeneration.from_pretrained(model, **model_init_kwargs)
+            elif "Qwen2.5-VL" in model_id:
+                model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model, **model_init_kwargs)
+            elif "Aria" in model_id:
+                model_init_kwargs.pop("use_cache")
+                model = AriaForConditionalGeneration.from_pretrained(model, **model_init_kwargs)
+            else:
+                model = AutoModelForCausalLM.from_pretrained(model, **model_init_kwargs)
+        else:
+            model_id = model.config._name_or_path
+            if args.model_init_kwargs is not None:
+                raise ValueError(
+                    "You passed `model_init_kwargs` to the `GRPOConfig`, but your model is already instantiated. "
+                    "This argument can only be used when the `model` argument is a string."
+                )
+
+        if peft_config is not None:
+            model = get_peft_model(model, peft_config)
+
+        # Reference model
+        if is_deepspeed_zero3_enabled():
+            if "Qwen2-VL" in model_id:
+                self.ref_model = Qwen2VLForConditionalGeneration.from_pretrained(model_id, **model_init_kwargs)
+            elif "Qwen2.5-VL" in model_id:
+                self.ref_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_id, **model_init_kwargs)
+            elif "Aria" in model_id:
+                self.ref_model = AriaForConditionalGeneration.from_pretrained(model_id, **model_init_kwargs)
+            else:
+                self.ref_model = AutoModelForCausalLM.from_pretrained(model_id, **model_init_kwargs)
+        elif peft_config is None:
+            # If PEFT configuration is not provided, create a reference model based on the initial model.
+            self.ref_model = create_reference_model(model)
+        else:
+            # If PEFT is used, the reference model is not needed since the adapter can be disabled
+            # to revert to the initial model.
+            self.ref_model = None
+
+        # Processing class
+        if processing_class is None:
+            if "Qwen2-VL" in model_id or "Qwen2.5-VL" in model_id or "Aria" in model_id:
+                processing_class = AutoProcessor.from_pretrained(model_id)
+                pad_token_id = processing_class.tokenizer.pad_token_id
+                processing_class.pad_token_id = pad_token_id
+                processing_class.eos_token_id = processing_class.tokenizer.eos_token_id
+                if "Qwen" in model_id or "Qwen2.5-VL" in model_id:
+                    processing_class.image_processor.max_pixels = max_pixels
+                    processing_class.image_processor.min_pixels = min_pixels
+            else:
+                processing_class = AutoTokenizer.from_pretrained(
+                    model.config._name_or_path, padding_side="left"
+                )
+                pad_token_id = processing_class.pad_token_id
+
+        # Reward functions
+        if not isinstance(reward_funcs, list):
+            reward_funcs = [reward_funcs]
+        for i, reward_func in enumerate(reward_funcs):
+            if isinstance(reward_func, str):
+                reward_funcs[i] = AutoModelForSequenceClassification.from_pretrained(
+                    reward_func, num_labels=1, **model_init_kwargs
+                )
+        self.reward_funcs = reward_funcs
+
+        # Reward processing class
+        if reward_processing_classes is None:
+            reward_processing_classes = [None] * len(reward_funcs)
+        elif not isinstance(reward_processing_classes, list):
+            reward_processing_classes = [reward_processing_classes]
+        else:
+            if len(reward_processing_classes) != len(reward_funcs):
+                raise ValueError(
+                    "The number of reward processing classes must match the number of reward functions."
+                )
+
+        for i, (reward_processing_class, reward_func) in enumerate(
+            zip(reward_processing_classes, reward_funcs)
+        ):
+            if isinstance(reward_func, PreTrainedModel):
+                if reward_processing_class is None:
+                    reward_processing_class = AutoTokenizer.from_pretrained(reward_func.config._name_or_path)
+                if reward_processing_class.pad_token_id is None:
+                    reward_processing_class.pad_token = reward_processing_class.eos_token
+                # The reward model computes the reward for the latest non-padded token in the input sequence.
+                # So it's important to set the pad token ID to the padding token ID of the processing class.
+                reward_func.config.pad_token_id = reward_processing_class.pad_token_id
+                reward_processing_classes[i] = reward_processing_class
+        self.reward_processing_classes = reward_processing_classes
+
+        # Data collator
+        def data_collator(features):  # No data collation is needed in GRPO
+            return features
+
+        # Training arguments
+        self.max_prompt_length = args.max_prompt_length
+        self.max_completion_length = args.max_completion_length  # = |o_i| in the GRPO paper
+        self.num_generations = args.num_generations  # = G in the GRPO paper
+        self.generation_config = GenerationConfig(
+            max_new_tokens=self.max_completion_length,
+            do_sample=True,
+            temperature=1,  # HACK
+            num_return_sequences=self.num_generations,
+            pad_token_id=pad_token_id,
+        )
+        self.beta = args.beta
+
+        # The trainer estimates the number of FLOPs (floating-point operations) using the number of elements in the
+        # input tensor associated with the key "input_ids". However, in GRPO, the sampled data does not include the
+        # "input_ids" key. Instead, the available key is "prompt". As a result, the trainer issues the warning:
+        # "Could not estimate the number of tokens of the input, floating-point operations will not be computed." To
+        # suppress this warning, we set the "estimate_tokens" key in the model's "warnings_issued" dictionary to True.
+        # This acts as a flag to indicate that the warning has already been issued.
+        model.warnings_issued["estimate_tokens"] = True
+
+        # Initialize the metrics
+        self._metrics = defaultdict(list)
+        self.use_vllm = args.use_vllm
+
+        # # rewrite the processing AutoTokenizer -> AutoProcessor
+        # model_id = model if isinstance(model, str) else model.config._name_or_path
+        # if processing_class is None:
+        #     if "Qwen2-VL" in model_id or "Aria" in model_id:
+        #         processing_class = AutoProcessor.from_pretrained(model_id)
+        #         pad_token_id = processing_class.tokenizer.pad_token_id
+        #         processing_class.pad_token_id = pad_token_id
+        #         processing_class.eos_token_id = processing_class.tokenizer.eos_token_id
+        #         if "Qwen2-VL" in model_id:
+        #             processing_class.image_processor.max_pixels = max_pixels
+        #             processing_class.image_processor.min_pixels = min_pixels
+        #     else:
+        #         processing_class = AutoTokenizer.from_pretrained(
+        #             model.config._name_or_path, padding_side="left"
+        #         )
+        #         pad_token_id = processing_class.pad_token_id
+
+        super().__init__(
+            model=model,
+            args=args,
+            data_collator=data_collator,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            processing_class=processing_class,
+            callbacks=callbacks,
+            optimizers=optimizers,
+        )
+        # Gradient accumulation requires scaled loss. Normally, loss scaling in the parent class depends on whether the
+        # model accepts loss-related kwargs. Since we compute our own loss, this check is irrelevant. We set
+        # self.model_accepts_loss_kwargs to False to enable scaling.
+        self.model_accepts_loss_kwargs = False
+        # Check that per_device_train/eval_batch_size * num_processes is divisible by the number of generations
+        num_processes = self.accelerator.num_processes
+        global_batch_size = args.per_device_train_batch_size * num_processes
+        possible_values = [
+            n_gen for n_gen in range(2, global_batch_size + 1) if global_batch_size % n_gen == 0
+        ]
+
+        if self.num_generations not in possible_values:
+            raise ValueError(
+                f"The global train batch size ({num_processes} x {args.per_device_train_batch_size}) must be evenly "
+                f"divisible by the number of generations per prompt ({self.num_generations}). Given the current train "
+                f"batch size, the valid values for the number of generations are: {possible_values}."
+            )
+        if self.args.eval_strategy != "no":
+            global_batch_size = args.per_device_eval_batch_size * num_processes
+            possible_values = [
+                n_gen for n_gen in range(2, global_batch_size + 1) if global_batch_size % n_gen == 0
+            ]
+            if self.num_generations not in possible_values:
+                raise ValueError(
+                    f"The global eval batch size ({num_processes} x {args.per_device_eval_batch_size}) must be evenly "
+                    f"divisible by the number of generations per prompt ({self.num_generations}). Given the current "
+                    f"eval batch size, the valid values for the number of generations are: {possible_values}."
+                )
+
+        if self.use_vllm:
+            if not is_vllm_available():
+                raise ImportError(
+                    "vLLM is not available and `use_vllm` is set to True. Please install vLLM with "
+                    "`pip install vllm` to use it."
+                )
+
+            if self.accelerator.is_main_process:
+                vllm_device = self.args.vllm_device
+                if vllm_device == "auto":
+                    vllm_device = f"cuda:{self.accelerator.num_processes}"  # take the next GPU idx
+                # Check that the requested device is available
+                if vllm_device.split(":")[0] == "cuda" and int(vllm_device.split(":")[1]) >= torch.cuda.device_count():
+                    raise ValueError(
+                        f"The requested device for vllm ({vllm_device}) is not available. You are likely using vLLM "
+                        "without restricting the number of GPUs for training. Set the `--num_processes` argument to a "
+                        "value lower than the number of GPUs available on your machine—typically, reducing it by one "
+                        f"is sufficient. In your case: `--num_processes {torch.cuda.device_count() - 1}`."
+                    )
+                # Check that the requested device is not also used for training
+                if vllm_device in {f"cuda:{idx}" for idx in range(self.accelerator.num_processes)}:
+                    warnings.warn(
+                        f"The requested device {vllm_device} is also used for training. This may lead to unexpected "
+                        "behavior. It is recommended to use a dedicated device for vLLM."
+                    )
+                # vLLM is not compatible with accelerate. So we need to patch it to make sure we can (1) place the vLLM
+                # model on the desired device (world_size_patch) and (2) avoid a test that is not designed for our
+                # setting (profiling_patch).
+                world_size_patch = patch("torch.distributed.get_world_size", return_value=1)
+                profiling_patch = patch(
+                    "vllm.worker.worker.Worker._assert_memory_footprint_increased_during_profiling",
+                    return_value=None,
+                )
+                with world_size_patch, profiling_patch:
+                    print("vllm is running on: ", vllm_device)
+                    self.llm = LLM(
+                        model=model.name_or_path,
+                        device=vllm_device,
+                        gpu_memory_utilization=self.args.vllm_gpu_memory_utilization,
+                        dtype=torch.bfloat16,
+                        # Automatic Prefix Caching caches the KV cache of existing queries, so that a new query can
+                        # directly reuse the KV cache if it shares the same prefix with one of the existing queries.
+                        # This is particularly useful here because we generate completions from the same prompts.
+                        enable_prefix_caching=True,
+                        enforce_eager=True,
+                        max_model_len=args.max_completion_length,
+                    )
+                self.sampling_params = SamplingParams(
+                    temperature=args.temperature,
+                    max_tokens=self.max_completion_length,
+                )
+
+            self._last_loaded_step = 0  # tag to avoid useless loading during grad accumulation
+
+            # When using vLLM, the main process is responsible for loading the model weights. This can cause process
+            # desynchronization and seems to lead to DeepSpeed hanging during initialization. To prevent this, we
+            # synchronize all processes after vLLM has been fully initialized.
+            self.accelerator.wait_for_everyone()
+        else:
+            raise ValueError(
+                "Qwen2VLGRPOVLLMTrainer only supports vllm generation, please set --use_vllm True"
+            )
+
+        if self.ref_model is not None:
+            if self.is_deepspeed_enabled:
+                self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
+            else:
+                self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
+
+        for i, reward_func in enumerate(self.reward_funcs):
+            if isinstance(reward_func, PreTrainedModel):
+                self.reward_funcs[i] = self.accelerator.prepare_model(reward_func, evaluation_mode=True)
+
+    def _set_signature_columns_if_needed(self):
+        # If `self.args.remove_unused_columns` is True, non-signature columns are removed.
+        # By default, this method sets `self._signature_columns` to the model's expected inputs.
+        # In GRPOTrainer, we preprocess data, so using the model's signature columns doesn't work.
+        # Instead, we set them to the columns expected by the `training_step` method, hence the override.
+        if self._signature_columns is None:
+            self._signature_columns = ["prompt"]
+
+    # We need a custom sampler that samples the same prompt multiple times
+    def _get_train_sampler(self):
+        return RepeatRandomSampler(self.train_dataset, self.num_generations)
+
+    # Get the per-token log probabilities for the completions for the model and the reference model
+    def _get_per_token_logps(
+        self,
+        model,
+        input_ids,
+        attention_mask,
+        pixel_values,
+        image_grid_thw,
+        logits_to_keep,
+    ):
+        pixel_values = pixel_values.to(model.device)
+        image_grid_thw = image_grid_thw.to(device=model.device)
+        logits = model(
+            input_ids,
+            attention_mask=attention_mask,
+            pixel_values=pixel_values,
+            image_grid_thw=image_grid_thw,
+        ).logits  # (B, L, V)
+        logits = logits[:, :-1, :]  # (B, L-1, V), exclude the last logit: it predicts the token after the sequence
+        input_ids = input_ids[:, -logits_to_keep:]  # (B, C), keep only the completion token IDs
+        logits = logits[:, -logits_to_keep:]  # (B, C, V), the logits that predict the completion tokens
+        # Compute the log probabilities for the input tokens. Use a loop to reduce memory peak.
+        per_token_logps = []
+        for logits_row, input_ids_row in zip(logits, input_ids):
+            log_probs = logits_row.log_softmax(dim=-1)
+            token_log_prob = torch.gather(log_probs, dim=1, index=input_ids_row.unsqueeze(1)).squeeze(1)
+            per_token_logps.append(token_log_prob)
+        return torch.stack(per_token_logps)
+
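A standalone sketch of the gather step above: for each position, `torch.gather` pulls out the log-probability of the token that was actually generated (shapes and values assumed for illustration):

import torch

logits = torch.randn(1, 4, 10)            # (B, L, V): 4 positions, vocab of 10
token_ids = torch.tensor([[3, 7, 1, 0]])  # the tokens actually sampled
log_probs = logits.log_softmax(dim=-1)
per_token_logps = torch.gather(log_probs, dim=2, index=token_ids.unsqueeze(-1)).squeeze(-1)
print(per_token_logps.shape)  # torch.Size([1, 4])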
+    # Trainer "prepares" the inputs before calling `compute_loss`. It converts them to tensors and moves them to the
+    # device. Since we preprocess the data in `compute_loss`, we need to override this method to skip this step.
+    def _prepare_inputs(
+        self, inputs: dict[str, Union[torch.Tensor, Any]]
+    ) -> dict[str, Union[torch.Tensor, Any]]:
+        device = self.accelerator.device
+        prompts = [x["prompt"] for x in inputs]
+        images = [x["image"] for x in inputs]
+        prompts_text = [
+            maybe_apply_chat_template(example, self.processing_class)["prompt"] for example in inputs
+        ]
+        prompt_inputs = self.processing_class(
+            # prompts_text, return_tensors="pt", padding=True, padding_side="left", add_special_tokens=False
+            text=prompts_text,
+            images=images,
+            return_tensors="pt",
+            padding=True,
+            padding_side="left",
+            add_special_tokens=False,
+        )
+        prompt_ids, prompt_mask = (
+            prompt_inputs["input_ids"].to(device),
+            prompt_inputs["attention_mask"].to(device),
+        )
+        if self.max_prompt_length is not None:
+            prompt_ids = prompt_ids[:, -self.max_prompt_length :]
+            prompt_mask = prompt_mask[:, -self.max_prompt_length :]
+
+        if self.args.use_vllm:
+            # First, have main process load weights if needed
+            if self.state.global_step != self._last_loaded_step:
+                with unwrap_model_for_generation(
+                    self.model,
+                    self.accelerator,
+                    gather_deepspeed3_params=False,  # TODO: fix this, self.args.ds3_gather_for_generation,
+                ) as unwrapped_model:
+                    if is_compiled_module(unwrapped_model):
+                        state_dict = unwrapped_model._orig_mod.state_dict()
+                    else:
+                        state_dict = unwrapped_model.state_dict()
+                if self.accelerator.is_main_process:
+                    llm_model = self.llm.llm_engine.model_executor.driver_worker.model_runner.model
+                    llm_model.load_weights(state_dict.items())
+                self._last_loaded_step = self.state.global_step
+
+            # Generate completions using vLLM: gather all prompts and use them in a single call in the main process
+            all_prompts_text = gather_object(prompts_text)
+            all_images = gather_object(images)
+            # group into pairs
+            all_multimodal_inputs = [
+                {"prompt": p, "multi_modal_data": {"image": i}}
+                for p, i in zip(all_prompts_text, all_images)
+            ]
+
+            if self.accelerator.is_main_process:
+                outputs = self.llm.generate(
+                    all_multimodal_inputs,
+                    sampling_params=self.sampling_params,
+                    use_tqdm=False,
+                )
+                completion_ids = [
+                    out.token_ids for completions in outputs for out in completions.outputs
+                ]
+            else:
+                completion_ids = [None] * len(all_prompts_text)
+            completion_ids = broadcast_object_list(completion_ids, from_process=0)
+            process_slice = slice(
+                self.accelerator.process_index * len(prompts),
+                (self.accelerator.process_index + 1) * len(prompts),
+            )
+            completion_ids = completion_ids[process_slice]
+
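A toy trace of the gather-generate-broadcast-slice round trip above: rank 0 generates for everyone, the full list is broadcast, and each rank keeps only its own rows (world size and batch values invented):

world_size, local_batch = 4, 2
all_completion_ids = list(range(world_size * local_batch))  # as broadcast to every rank
for rank in range(world_size):
    local = all_completion_ids[rank * local_batch : (rank + 1) * local_batch]
    print(rank, local)
# 0 [0, 1]
# 1 [2, 3]
# 2 [4, 5]
# 3 [6, 7]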
+            # Pad the completions, and concatenate them with the prompts
+            completion_ids = [torch.tensor(ids, device=device) for ids in completion_ids]
+            completion_ids = pad(completion_ids, padding_value=self.processing_class.pad_token_id)
+            prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
+        else:
+            raise ValueError("Only vLLM generation is supported in this version")
+
+        # The code below is the same as in yifan's implementation
+        # Mask everything after the first EOS token
+        is_eos = completion_ids == self.processing_class.eos_token_id
+        device = self.accelerator.device
+        eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device)
+        eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
+        sequence_indices = torch.arange(is_eos.size(1), device=device).expand(is_eos.size(0), -1)
+        completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
+
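A standalone sketch of the EOS masking above, assuming `eos_token_id = 2`: tokens up to and including the first EOS stay in the mask, everything after is zeroed, and rows with no EOS keep their full length:

import torch

eos_id = 2
completion_ids = torch.tensor([[5, 2, 7, 7],   # EOS at position 1
                               [5, 5, 5, 5]])  # no EOS
is_eos = completion_ids == eos_id
eos_idx = torch.full((is_eos.size(0),), is_eos.size(1), dtype=torch.long)
eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
seq = torch.arange(is_eos.size(1)).expand(is_eos.size(0), -1)
print((seq <= eos_idx.unsqueeze(1)).int())
# tensor([[1, 1, 0, 0],
#         [1, 1, 1, 1]])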
+        # Concatenate prompt_mask with completion_mask for logit computation
+        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)  # (B*G, P+C)
+        # pixel_values = prompt_inputs["pixel_values"].repeat_interleave(
+        #     self.num_generations, dim=0
+        # )
+
+        pixel_values = prompt_inputs["pixel_values"]
+        # [None].repeat_interleave(self.num_generations, dim=0)
+        # pixel_values = pixel_values.view(-1, pixel_values.shape[-1])
+
+        image_grid_thw = prompt_inputs["image_grid_thw"]
+        # .repeat_interleave(self.num_generations, dim=0)
+        logits_to_keep = completion_ids.size(1)
+
+        with torch.inference_mode():
+            if self.ref_model is not None:
+                ref_per_token_logps = self._get_per_token_logps(
+                    self.ref_model,
+                    prompt_completion_ids,
+                    attention_mask,
+                    pixel_values,
+                    image_grid_thw,
+                    logits_to_keep,
+                )
+            else:
+                with self.accelerator.unwrap_model(self.model).disable_adapter():
+                    ref_per_token_logps = self._get_per_token_logps(
+                        self.model,
+                        prompt_completion_ids,
+                        attention_mask,
+                        pixel_values,
+                        image_grid_thw,
+                        logits_to_keep,
+                    )
+
+        # Decode the generated completions
+        completions = self.processing_class.batch_decode(completion_ids, skip_special_tokens=True)
+        if is_conversational(inputs[0]):
+            completions = [
+                [{"role": "assistant", "content": completion}] for completion in completions
+            ]
+
+        # Compute the rewards
+        rewards_per_func = torch.zeros(len(prompts), len(self.reward_funcs), device=device)
+        for i, (reward_func, reward_processing_class) in enumerate(
+            zip(self.reward_funcs, self.reward_processing_classes)
+        ):
+            if isinstance(reward_func, PreTrainedModel):
+                if is_conversational(inputs[0]):
+                    messages = [{"messages": p + c} for p, c in zip(prompts, completions)]
+                    texts = [apply_chat_template(x, reward_processing_class)["text"] for x in messages]
+                else:
+                    texts = [p + c for p, c in zip(prompts, completions)]
+                reward_inputs = reward_processing_class(
+                    texts,
+                    return_tensors="pt",
+                    padding=True,
+                    padding_side="right",
+                    add_special_tokens=False,
+                )
+                reward_inputs = super()._prepare_inputs(reward_inputs)
+                with torch.inference_mode():
+                    rewards_per_func[:, i] = reward_func(**reward_inputs).logits[:, 0]  # Shape (B*G,)
+            else:
+                # Repeat all input columns (but "prompt" and "completion") to match the number of generations
+                reward_kwargs = {
+                    key: [] for key in inputs[0].keys() if key not in ["prompt", "completion"]
+                }
+                for key in reward_kwargs:
+                    for example in inputs:
+                        # Repeat each value in the column `num_generations` times
+                        reward_kwargs[key].extend([example[key]] * self.num_generations)
+                output_reward_func = reward_func(prompts=prompts, completions=completions, **reward_kwargs)
+                rewards_per_func[:, i] = torch.tensor(output_reward_func, dtype=torch.float32, device=device)
+        rewards_per_func = gather(rewards_per_func)
+        # Sum the rewards from all reward functions
+        rewards = rewards_per_func.sum(dim=1)
+
+        # Compute grouped-wise rewards
+        mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
+        std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1)
+
+        # Normalize the rewards to compute the advantages
+        mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
+        std_grouped_rewards = std_grouped_rewards.repeat_interleave(self.num_generations, dim=0)
+        advantages = (rewards - mean_grouped_rewards) / (std_grouped_rewards + 1e-4)
+
+        # Slice to keep only the local part of the data
+        process_slice = slice(
+            self.accelerator.process_index * len(prompts),
+            (self.accelerator.process_index + 1) * len(prompts),
+        )
+        advantages = advantages[process_slice]
+
+        # Log the metrics
+        reward_per_func = rewards_per_func.mean(0)
+        for i, reward_func in enumerate(self.reward_funcs):
+            if isinstance(reward_func, nn.Module):  # Module instead of PretrainedModel for compat with compiled models
+                reward_func_name = reward_func.config._name_or_path.split("/")[-1]
+            else:
+                reward_func_name = reward_func.__name__
+            self._metrics[f"rewards/{reward_func_name}"].append(reward_per_func[i].item())
+
+        self._metrics["reward"].append(rewards.mean().item())
+        self._metrics["reward_std"].append(std_grouped_rewards.mean().item())
+
+        return {
+            "prompt_ids": prompt_ids,
+            "prompt_mask": prompt_mask,
+            "completion_ids": completion_ids,
+            "completion_mask": completion_mask,
+            "ref_per_token_logps": ref_per_token_logps,
+            "advantages": advantages,
+            "pixel_values": pixel_values,
+            "image_grid_thw": image_grid_thw,
+        }
+
+    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
+        if return_outputs:
+            raise ValueError("The GRPOTrainer does not support returning outputs")
+
+        # Compute the per-token log probabilities for the model
+        prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
+        completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"]
+        input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
+        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
+        pixel_values = inputs["pixel_values"]
+        image_grid_thw = inputs["image_grid_thw"]
+        logits_to_keep = completion_ids.size(1)  # we only need to compute the logits for the completion tokens
+
+        per_token_logps = self._get_per_token_logps(
+            model,
+            input_ids,
+            attention_mask,
+            pixel_values,
+            image_grid_thw,
+            logits_to_keep,
+        )
+
+        # Compute the KL divergence between the model and the reference model
+        ref_per_token_logps = inputs["ref_per_token_logps"]
+        per_token_kl = (
+            torch.exp(ref_per_token_logps - per_token_logps)
+            - (ref_per_token_logps - per_token_logps)
+            - 1
+        )
+
+        # x - x.detach() allows for preserving gradients from x
+        advantages = inputs["advantages"]
+        per_token_loss = torch.exp(per_token_logps - per_token_logps.detach()) * advantages.unsqueeze(1)
+        per_token_loss = -(per_token_loss - self.beta * per_token_kl)
+        loss = ((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
+
+        # Log the metrics
+        completion_length = self.accelerator.gather_for_metrics(completion_mask.sum(1)).float().mean().item()
+        self._metrics["completion_length"].append(completion_length)
+
+        mean_kl = ((per_token_kl * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
+        self._metrics["kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item())
+
+        return loss
+
+    def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None:
+        metrics = {key: sum(val) / len(val) for key, val in self._metrics.items()}  # average the metrics
+
+        # This method can be called both in training and evaluation. When called in evaluation, the keys in `logs`
+        # start with "eval_". We need to add the prefix "eval_" to the keys in `metrics` to match the format.
+        if next(iter(logs.keys())).startswith("eval_"):
+            metrics = {f"eval_{key}": val for key, val in metrics.items()}
+
+        logs = {**logs, **metrics}
+        if version.parse(transformers.__version__) >= version.parse("4.47.0.dev0"):
+            super().log(logs, start_time)
+        else:  # transformers<=4.46
+            super().log(logs)
+        self._metrics.clear()