XinBB committed on
Commit 22f6d8c · verified · 1 Parent(s): ee53828

Add files using upload-large-folder tool
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+open-r1-multimodal/data_jsonl/showui_desktop_qwen25vl_absolute_position.json filter=lfs diff=lfs merge=lfs -text
eval/test_grounding_r1_nothink_ss.py ADDED
@@ -0,0 +1,352 @@
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
+import torch
+import json
+from tqdm import tqdm
+import re
+import os
+from pprint import pprint
+import random
+from PIL import Image
+from transformers.models.qwen2_vl.image_processing_qwen2_vl_fast import smart_resize
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel as DDP
+import argparse
+
+import warnings
+
+warnings.filterwarnings("ignore", category=UserWarning, module="transformers")
+
+def setup_distributed():
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    torch.cuda.set_device(local_rank)
+
+    dist.init_process_group(backend="nccl")
+
+    world_size = dist.get_world_size()
+    rank = dist.get_rank()
+
+    return local_rank, world_size, rank
+
+local_rank, world_size, rank = setup_distributed()
+device = f"cuda:{local_rank}"
+print(f"Process {rank} using {device}")
+
+steps = 3800
+if rank == 0:
+    print("Steps: ", steps)
+# RUN_NAME = "base"
+RUN_NAME = "Qwen2.5-VL-7B-GRPO-GUI-Grounding_showui_desktop_high_quality_attention_filtered_only_one_continual_dense_reward_quadratic_decay_0.5_format_bs16_kl0.004_nothink_10e"
+# MODEL_PATH = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/Qwen2.5-VL-7B-Instruct"
+MODEL_PATH = f"/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/src/open-r1-multimodal/output/{RUN_NAME}/checkpoint-{steps}"
+OUTPUT_PATH = "./logs/rec_results_{DATASET}_{RUN_NAME}_{STEPS}.json"
+
+BSZ = 32
+DATA_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot/metadata"
+
+TEST_DATASETS = ['hf_test_full']
+IMAGE_ROOT = "/data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/ScreenSpot-Pro-GUI-Grounding/ScreenSpot/images"
+
+
+# TEST_DATASETS = ['lisa_test']
+# IMAGE_ROOT = "/data10/shz/dataset/lisa"
+
+
+# We recommend enabling flash_attention_2 for better acceleration and memory saving,
+# especially in multi-image and video scenarios.
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_PATH,
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+    device_map={"": local_rank},
+)
+# default processor
+processor = AutoProcessor.from_pretrained(MODEL_PATH, max_pixels=2007040, min_pixels=3136)
+# processor.image_processor.min_pixels = 3136
+# processor.image_processor.max_pixels = 2007040
+print(processor.image_processor.min_pixels)
+print(processor.image_processor.max_pixels)
+
+# def extract_point_answer(content):
+#     # Try to find the coordinate within <answer> tags; if it cannot be found, return [0, 0]
+#     answer_tag_pattern = r'<answer>(.*?)</answer>'
+#     content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
+#     if content_answer_match:
+#         content_answer = content_answer_match.group(1).strip()
+#         tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content_answer, re.DOTALL)
+#         if tool_call_match:
+#             tool_call_content = tool_call_match.group(1).strip()
+#             # Parse the JSON
+#             tool_call_json = json.loads(tool_call_content)
+#             arguments = tool_call_json.get("arguments", {})
+#             coordinate = arguments.get("coordinate", None)
+#             if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
+#                 x, y = coordinate
+#                 extracted_coordinate = [x, y]
+#                 return extracted_coordinate
+#     return [0, 0]
+
+
+# def extract_point_answer(content):
+#     # Look for content inside <tool_call> tags; return [0, 0] if nothing is found
+#     tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
+#     if tool_call_match:
+#         tool_call_content = tool_call_match.group(1).strip()
+#         # First try to parse tool_call_content as JSON
+#         try:
+#             tool_call_json = json.loads(tool_call_content)
+#             print(tool_call_json)
+#             arguments = tool_call_json.get("arguments", {})
+#             coordinate = arguments.get("coordinate", None)
+#             if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
+#                 try:
+#                     x = float(coordinate[0])
+#                     y = float(coordinate[1])
+#                     return [x, y]
+#                 except (ValueError, TypeError):
+#                     pass  # If the conversion fails, fall back to regex extraction
+#         except json.JSONDecodeError:
+#             pass  # If JSON parsing fails, fall back to regex extraction
+#         # Fall back to extracting two numbers with a regular expression
+#         numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
+#         if len(numbers) >= 2:
+#             x = float(numbers[-2])
+#             y = float(numbers[-1])
+#             return [x, y]
+#     return [0, 0]
+
+
+def extract_point_answer(content):
+    # Look for content inside <tool_call> tags; return [0, 0] if nothing is found
+    tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
+    if tool_call_match:
+        tool_call_content = tool_call_match.group(1).strip()
+        # Extract the last two numbers (the x/y coordinate) with a regular expression
+        numbers = re.findall(r'\d+(?:\.\d+)?', tool_call_content)
+        if len(numbers) >= 2:
+            x = float(numbers[-2])
+            y = float(numbers[-1])
+            return [x, y]
+    return [0, 0]
+
+def point_in_box(point, box):
+    x, y = point
+    if box[0] <= x < box[2] and box[1] <= y < box[3]:
+        return 1
+    else:
+        return 0
+
+num_samples = 2000
+num_all_sample = 0
+num_desktop_sample = 0
+num_mobile_sample = 0
+num_web_sample = 0
+num_correct_sample = 0
+for ds in TEST_DATASETS:
+    if rank == 0:
+        print(f"Processing {ds}...")
+    ds_path = os.path.join(DATA_ROOT, f"{ds}.json")
+    data = json.load(open(ds_path, "r"))
+    random.seed(42)
+    random.shuffle(data)
+    data = data[:num_samples]
+
+    # Split data for distributed evaluation
+    per_rank_data = len(data) // world_size
+    start_idx = rank * per_rank_data
+    end_idx = start_idx + per_rank_data if rank < world_size - 1 else len(data)
+    rank_data = data[start_idx:end_idx]
+
+    messages = []
+
+    for x in rank_data:
+        image_path = os.path.join(IMAGE_ROOT, x['img_url'])
+        width, height = x['img_size'][0], x['img_size'][1]
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=processor.image_processor.patch_size * processor.image_processor.merge_size,
+            min_pixels=processor.image_processor.min_pixels,
+            max_pixels=processor.image_processor.max_pixels,
+        )
+        system_content = """You are a helpful assistant.
+#Tools
+
+You may call one or more functions to assist with the user query.
+
+You are provided with function signatures within <tools></tools> XML tags:
+<tools>
+{"type": "function", "function": {"name_for_human": "computer_use", "name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n* The screen's resolution is {{screen_width}}x{{screen_height}}.\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", "parameters": {"properties": {"action": {"description": "The action to perform. The available actions are:\n* key: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\n* type: Type a string of text on the keyboard.\n* mouse_move: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* left_click: Click the left mouse button.\n* left_click_drag: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\n* right_click: Click the right mouse button.\n* middle_click: Click the middle mouse button.\n* double_click: Double-click the left mouse button.\n* scroll: Performs a scroll of the mouse scroll wheel.\n* wait: Wait specified seconds for the change to happen.\n* terminate: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], "type": "string"}, "keys": {"description": "Required only by action=key.", "type": "array"}, "text": {"description": "Required only by action=type.", "type": "string"}, "coordinate": {"description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by action=mouse_move and action=left_click_drag.", "type": "array"}, "pixels": {"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by action=scroll.", "type": "number"}, "time": {"description": "The seconds to wait. Required only by action=wait.", "type": "number"}, "status": {"description": "The status of the task. Required only by action=terminate.", "type": "string", "enum": ["success", "failure"]}}, "required": ["action"], "type": "object"}, "args_format": "Format the arguments as a JSON object."}}
+</tools>
+
+For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
+<tool_call>
+{"name": <function-name>, "arguments": <args-json-object>}
+</tool_call>""".replace("{{screen_width}}", str(resized_width)).replace("{{screen_height}}", str(resized_height))
+        message = [
+            {
+                "role": "system",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": system_content
+                    }
+                ]
+            },
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "image": f"file://{image_path}"
+                    },
+                    {
+                        "type": "text",
+                        "text": x['task']
+                    }
+                ]
+            },
+        ]
+        # print(message)
+        messages.append(message)
+
+    rank_outputs = []  # List to store answers for this rank
+    all_outputs = []  # List to store all answers
+
+    # Process data
+    for i in tqdm(range(0, len(messages), BSZ), disable=rank != 0):
+        batch_messages = messages[i:i + BSZ]
+
+        # Preparation for inference
+        text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in batch_messages]
+
+        image_inputs, video_inputs = process_vision_info(batch_messages)
+        inputs = processor(
+            text=text,
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            padding_side="left",
+            return_tensors="pt",
+        )
+        inputs = inputs.to(device)
+
+        # Inference: generation of the output
+        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=False)
+
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        batch_output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+        rank_outputs.extend(batch_output_text)
+
+    print(f"Rank {rank} has finished processing {len(rank_outputs)} examples")
+
+    # Gather all outputs from all ranks
+    all_outputs = [None] * len(data)
+    rank_results = [(start_idx + i, output) for i, output in enumerate(rank_outputs)]
+
+    gathered_results = [None] * world_size
+    dist.all_gather_object(gathered_results, rank_results)
+
+    assert gathered_results[-1][-1][0] == len(data) - 1
+
+    # The main process collects and scores all results
+    if rank == 0:
+        for results in gathered_results:
+            for idx, output in results:
+                assert idx < len(all_outputs)
+                all_outputs[idx] = output
+        assert all_outputs[-1] is not None
+
+        final_output = []
+        correct_number = 0
+        correct_number_desktop = 0
+        correct_number_mobile = 0
+        correct_number_web = 0
+
+        for input_example, model_output in zip(data, all_outputs):
+            original_output = model_output
+            ground_truth = input_example['bbox']
+            split_class = input_example['split']
+            # Convert the [x, y, w, h] pixel box to a normalized [x1, y1, x2, y2] box
+            ground_truth = [
+                ground_truth[0] / input_example['img_size'][0],
+                ground_truth[1] / input_example['img_size'][1],
+                (ground_truth[0] + ground_truth[2]) / input_example['img_size'][0],
+                (ground_truth[1] + ground_truth[3]) / input_example['img_size'][1],
+            ]
+            model_answer = extract_point_answer(original_output)
+            resized_height, resized_width = smart_resize(
+                input_example['img_size'][1],
+                input_example['img_size'][0],
+                factor=processor.image_processor.patch_size * processor.image_processor.merge_size,
+                min_pixels=processor.image_processor.min_pixels,
+                max_pixels=processor.image_processor.max_pixels,
+            )
+            model_answer = [model_answer[0] / resized_width, model_answer[1] / resized_height]
+            # Count correct answers
+            correct = 0
+            if model_answer is not None:
+                correct = point_in_box(model_answer, ground_truth)
+            correct_number += correct
+            num_all_sample += 1
+            num_correct_sample += correct
+            if split_class == "desktop":
+                correct_number_desktop += correct
+                num_desktop_sample += 1
+            if split_class == "mobile":
+                correct_number_mobile += correct
+                num_mobile_sample += 1
+            if split_class == "web":
+                correct_number_web += correct
+                num_web_sample += 1
+            # Create a result dictionary for this example
+            result = {
+                'image': input_example['img_url'],
+                'question': input_example['task'],
+                'resized_size': [resized_height, resized_width],
+                'ground_truth': ground_truth,
+                'model_output': original_output,
+                'extracted_answer': model_answer,
+                'correct': correct
+            }
+            final_output.append(result)
+
+        # Calculate and print accuracy
+        accuracy = correct_number / len(data) * 100
+        accuracy_desktop = correct_number_desktop / num_desktop_sample * 100
+        accuracy_mobile = correct_number_mobile / num_mobile_sample * 100
+        accuracy_web = correct_number_web / num_web_sample * 100
+        print(f"\nAccuracy of {ds}: {accuracy:.2f}%")
+        print(f"Accuracy of desktop: {accuracy_desktop:.2f}%")
+        print(f"Accuracy of mobile: {accuracy_mobile:.2f}%")
+        print(f"Accuracy of web: {accuracy_web:.2f}%")
+
+        # Save results to a JSON file
+        output_path = OUTPUT_PATH.format(DATASET=ds, RUN_NAME=RUN_NAME, STEPS=steps)
+        output_dir = os.path.dirname(output_path)
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir)
+        with open(output_path, "w") as f:
+            json.dump({
+                'accuracy': accuracy,
+                'results': final_output
+            }, f, indent=2)
+
+        print(f"Results saved to {output_path}")
+        print("-" * 100)
+
+# The final statistics and printing live in a rank == 0 block
+if rank == 0:
+    accuracy = num_correct_sample / num_all_sample * 100
+    print(f"\nnumber of correct samples: {num_correct_sample}")
+    print(f"number of all samples: {num_all_sample}")
+    print(f"Accuracy of all datasets: {accuracy:.2f}%")
+
+# Synchronize all processes
+dist.barrier()
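To illustrate how the parsing helpers above behave end to end, here is a minimal sketch; the completion string and the 1288x728 resized canvas are made-up values for illustration, not outputs from a real run:

    # Hypothetical model completion in the <tool_call> format the system prompt requests.
    sample = '<tool_call>{"name": "computer_use", "arguments": {"action": "left_click", "coordinate": [412, 230]}}</tool_call>'
    point = extract_point_answer(sample)            # [412.0, 230.0] -- the last two numbers
    # The script normalizes by the smart_resize canvas before the box test.
    norm_point = [point[0] / 1288, point[1] / 728]
    print(point_in_box(norm_point, [0.30, 0.28, 0.35, 0.35]))  # 1: the point falls inside the box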
open-r1-multimodal/configs/qwen2vl_sft_config.yaml ADDED
@@ -0,0 +1,42 @@
+# Model arguments
+model_name_or_path: /data/shz/ckpt/Qwen2.5-VL-3B-Instruct
+model_revision: main
+torch_dtype: bfloat16
+
+# Data training arguments
+dataset_name: /data/shz/project/vlm-r1/VLM-R1/src/open-r1-multimodal/data_script/rec.yaml
+image_root: /data/shz/dataset/coco
+dataset_configs:
+  - all
+preprocessing_num_workers: 8
+
+# SFT trainer config
+bf16: true
+do_eval: true
+eval_strategy: "no"
+gradient_accumulation_steps: 2
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+hub_model_id: Qwen2.5-VL-3B-Instruct
+hub_strategy: every_save
+learning_rate: 2.0e-05
+log_level: info
+logging_steps: 5
+logging_strategy: steps
+lr_scheduler_type: cosine
+packing: true
+max_seq_length: 4096
+max_steps: -1
+num_train_epochs: 3
+output_dir: /data/shz/project/vlm-r1/VLM-R1/output/Qwen2.5-VL-3B-Instruct-SFT
+overwrite_output_dir: true
+per_device_eval_batch_size: 1
+per_device_train_batch_size: 4
+push_to_hub: false
+report_to:
+  - wandb
+save_strategy: "no"
+seed: 42
+data_seed: 42
+warmup_ratio: 0.1
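A quick way to sanity-check this config before handing it to the trainer is to load it with plain PyYAML; a hedged sketch, assuming the repo-relative path of the file above:

    import yaml

    with open("open-r1-multimodal/configs/qwen2vl_sft_config.yaml") as f:
        cfg = yaml.safe_load(f)
    # Spot-check a few fields the SFT script will consume.
    assert cfg["torch_dtype"] == "bfloat16" and cfg["bf16"] is True
    print(cfg["model_name_or_path"], cfg["learning_rate"], cfg["num_train_epochs"])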
open-r1-multimodal/data_config/gui_grounding.yaml ADDED
@@ -0,0 +1,2 @@
+datasets:
+  - json_path: /data/vjuicefs_ai_camera_jgroup_research/public_data/11178625/LLaMA-Factory/VLM-R1/data/rec_jsons_processed/showui_desktop_no_position_high_quality_qwen25vl_4028160_attention_0.2_filtered_only_one.json
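This file follows the `datasets:` schema that LazySupervisedDataset in src/open_r1/sft.py (later in this commit) parses; a small sketch of how an entry is read, assuming the path of the file above:

    import yaml

    with open("open-r1-multimodal/data_config/gui_grounding.yaml") as f:
        for entry in yaml.safe_load(f)["datasets"]:
            # sampling_strategy defaults to "all" when omitted, mirroring LazySupervisedDataset.
            print(entry["json_path"], entry.get("sampling_strategy", "all"))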
open-r1-multimodal/data_config/rec_internvl.yaml ADDED
@@ -0,0 +1,4 @@
+datasets:
+  - json_path: /data10/shz/dataset/rec/rec_jsons_internvl/refcoco_train.json
+  - json_path: /data10/shz/dataset/rec/rec_jsons_internvl/refcocop_train.json
+  - json_path: /data10/shz/dataset/rec/rec_jsons_internvl/refcocog_train.json
open-r1-multimodal/data_jsonl/showui_desktop_qwen25vl_absolute_position.json ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19d1823752455bca732cc85c0f7c6327db602e8140044d946e690abc9bb3ad52
+size 30595146
open-r1-multimodal/local_scripts/create_vision_cot_data.py ADDED
@@ -0,0 +1,153 @@
+import argparse
+import base64
+import concurrent.futures
+import io
+import json
+import os
+import random
+import re
+import time
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from io import BytesIO
+from typing import Dict, List
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from datasets import Dataset, concatenate_datasets, load_dataset, load_from_disk
+from tqdm import tqdm
+
+import bytedtos
+import seaborn as sns
+import yaml
+from openai import AzureOpenAI
+from PIL import Image
+from pillow_avif import AvifImagePlugin
+# Assumption: the original file used load_image without importing it; the helper in
+# transformers.image_utils has a compatible signature for URL/path inputs.
+from transformers.image_utils import load_image
+
+
+PROMPT_FORMAT = """I will provide you with an image, an original question, and its answer related to the image. Your task is to rewrite the question in such a way that answering it requires step-by-step Chain-of-Thought (CoT) reasoning with numerical or mathematical expressions where applicable. The reasoning process can include expressions like "let me think," "oh, I see," or other natural language thought expressions.
+
+Please make sure your question asks for a certain answer with a certain value; do not ask for an open-ended answer, and make sure the answer is correct and easy to verify via a simple protocol, like "2" or "A".
+
+Please strictly do not include "Answer:" in the question part to avoid confusion and leakage.
+
+Input Format:
+Original Question: {original_question}
+Original Answer: {original_answer}
+
+Output Format:
+Question: [rewrite the question if necessary]
+Answer: [answer with reasoning steps, including calculations where applicable]
+<think>step-by-step reasoning process</think>
+<answer>easy to verify answer</answer>
+"""
+
+
+def get_image_data_url(image_input):
+    if isinstance(image_input, str) and image_input.startswith("data:"):
+        return image_input
+
+    if isinstance(image_input, str) and image_input.startswith("http"):
+        image_input = load_image(image_input)
+
+    if isinstance(image_input, str):
+        image_input = Image.open(image_input)
+
+    if not isinstance(image_input, Image.Image):
+        raise ValueError("Unsupported image input type")
+
+    if image_input.mode != "RGB":
+        image_input = image_input.convert("RGB")
+
+    buffer = BytesIO()
+    image_input.save(buffer, format="JPEG")
+    img_bytes = buffer.getvalue()
+    base64_data = base64.b64encode(img_bytes).decode("utf-8")
+    return f"data:image/jpeg;base64,{base64_data}"
+
+
+def gpt4o_query(image, prompt, max_retries=5, initial_delay=3):
+    if image is None:
+        return None
+
+    data_url_list = [get_image_data_url(image)]
+    client = AzureOpenAI(
+        azure_endpoint="YOUR_AZURE_ENDPOINT",
+        api_version="2023-07-01-preview",
+        api_key="YOUR_API_KEY",
+    )
+
+    for attempt in range(max_retries):
+        try:
+            messages = [
+                {
+                    "role": "system",
+                    "content": "You are an expert to analyze the image and provide useful information for users.",
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                    ],
+                },
+            ]
+
+            for data_url in data_url_list:
+                messages[1]["content"].insert(
+                    0, {"type": "image_url", "image_url": {"url": data_url}}
+                )
+
+            response = client.chat.completions.create(
+                model="gpt-4o-2024-08-06",
+                messages=messages,
+                temperature=0.2,
+                max_tokens=8192,
+            )
+            return response.choices[0].message.content
+
+        except Exception as e:
+            if attempt == max_retries - 1:
+                raise Exception(
+                    f"Failed after {max_retries} attempts. Last error: {str(e)}"
+                )
+            delay = initial_delay * (2**attempt) + random.uniform(
+                0, 0.1 * initial_delay * (2**attempt)
+            )
+            time.sleep(delay)
+
+
+def process_single_item(example):
+    try:
+        image_path = example["image_path"]
+        formatted_prompt = PROMPT_FORMAT.format(
+            original_question=example["question"], original_answer=example["answer"]
+        )
+
+        response = gpt4o_query(image_path, formatted_prompt)
+        example["gpt4o_response"] = response
+        return example
+    except Exception as e:
+        print(f"Error processing item: {str(e)}")
+        example["gpt4o_response"] = None
+        return example
+
+
+def main():
+    dataset_path = "path/to/your/dataset"
+    full_dataset = load_from_disk(dataset_path)
+
+    processed_dataset = full_dataset.map(
+        function=partial(process_single_item),
+        num_proc=256,
+        desc="Processing dataset with GPT-4o",
+        keep_in_memory=True,
+    )
+
+    output_path = f"{dataset_path}_processed"
+    processed_dataset.save_to_disk(output_path)
+    print(f"Processed dataset saved to: {output_path}")
+
+
+if __name__ == "__main__":
+    main()
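As a self-contained check of get_image_data_url above, a round trip on a synthetic image (no network access or Azure credentials needed):

    from PIL import Image

    img = Image.new("RGB", (64, 64), color=(200, 30, 30))
    url = get_image_data_url(img)
    assert url.startswith("data:image/jpeg;base64,")
    print(url[:48] + "...")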
open-r1-multimodal/local_scripts/zero3.yaml ADDED
@@ -0,0 +1,22 @@
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: true
+  zero3_save_16bit_model: true
+  zero_stage: 3
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
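This accelerate config assumes a single machine with 8 local GPUs (num_machines: 1, num_processes: 8); a small, hedged pre-launch sketch that flags a mismatch with the visible device count:

    import yaml
    import torch

    with open("open-r1-multimodal/local_scripts/zero3.yaml") as f:
        cfg = yaml.safe_load(f)
    if torch.cuda.is_available() and cfg["num_processes"] > torch.cuda.device_count():
        print("Warning: num_processes exceeds the visible GPU count on this machine.")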
open-r1-multimodal/setup.py ADDED
@@ -0,0 +1,137 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Adapted from huggingface/transformers: https://github.com/huggingface/transformers/blob/21a2d900eceeded7be9edc445b56877b95eda4ca/setup.py
+
+
+import re
+import shutil
+from pathlib import Path
+
+from setuptools import find_packages, setup
+
+
+# Remove stale open_r1.egg-info directory to avoid https://github.com/pypa/pip/issues/5466
+stale_egg_info = Path(__file__).parent / "open_r1.egg-info"
+if stale_egg_info.exists():
+    print(
+        (
+            "Warning: {} exists.\n\n"
+            "If you recently updated open_r1, this is expected,\n"
+            "but it may prevent open_r1 from installing in editable mode.\n\n"
+            "This directory is automatically generated by Python's packaging tools.\n"
+            "I will remove it now.\n\n"
+            "See https://github.com/pypa/pip/issues/5466 for details.\n"
+        ).format(stale_egg_info)
+    )
+    shutil.rmtree(stale_egg_info)
+
+
+# IMPORTANT: all dependencies should be listed here with their version requirements, if any.
+# * If a dependency is fast-moving (e.g. transformers), pin to the exact version
+_deps = [
+    "accelerate>=1.2.1",
+    "bitsandbytes>=0.43.0",
+    "black>=24.4.2",
+    "datasets>=3.2.0",
+    "deepspeed==0.15.4",
+    "distilabel[vllm,ray,openai]>=1.5.2",
+    "einops>=0.8.0",
+    "flake8>=6.0.0",
+    "hf_transfer>=0.1.4",
+    "huggingface-hub[cli]>=0.19.2,<1.0",
+    "isort>=5.12.0",
+    "liger_kernel==0.5.2",
+    # "lighteval @ git+https://github.com/huggingface/lighteval.git@4f381b352c0e467b5870a97d41cb66b487a2c503#egg=lighteval[math]",
+    "math-verify",  # Used for math verification in grpo
+    "packaging>=23.0",
+    "parameterized>=0.9.0",
+    "pytest",
+    "safetensors>=0.3.3",
+    "sentencepiece>=0.1.99",
+    "torch>=2.5.1",
+    "transformers>=4.49.0",
+    "trl @ git+https://github.com/huggingface/trl.git@main",
+    "vllm==0.6.6.post1",
+    "wandb>=0.19.1",
+    "pillow",
+]
+
+# this is a lookup table with items like:
+#
+# tokenizers: "tokenizers==0.9.4"
+# packaging: "packaging"
+#
+# some of the values are versioned whereas others aren't.
+deps = {b: a for a, b in (re.findall(r"^(([^!=<>~ \[\]]+)(?:\[[^\]]+\])?(?:[!=<>~ ].*)?$)", x)[0] for x in _deps)}
+
+
+def deps_list(*pkgs):
+    return [deps[pkg] for pkg in pkgs]
+
+
+extras = {}
+extras["tests"] = deps_list("pytest", "parameterized")
+extras["torch"] = deps_list("torch")
+extras["quality"] = deps_list("black", "isort", "flake8")
+# extras["eval"] = deps_list("lighteval", "math-verify")
+extras["eval"] = deps_list("math-verify")
+extras["dev"] = extras["quality"] + extras["tests"] + extras["eval"]
+
+# core dependencies shared across the whole project - keep this to a bare minimum :)
+install_requires = [
+    deps["accelerate"],
+    deps["bitsandbytes"],
+    deps["einops"],
+    deps["datasets"],
+    deps["deepspeed"],
+    deps["hf_transfer"],
+    deps["huggingface-hub"],
+    deps["liger_kernel"],
+    deps["packaging"],  # utilities from PyPA to e.g., compare versions
+    deps["safetensors"],
+    deps["sentencepiece"],
+    deps["transformers"],
+    deps["trl"],
+]
+
+setup(
+    name="open-r1",
+    version="0.1.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    author="The Hugging Face team (past and future)",
+    author_email="lewis@huggingface.co",
+    description="Open R1",
+    # long_description=open("README.md", "r", encoding="utf-8").read(),
+    long_description_content_type="text/markdown",
+    keywords="llm inference-time compute reasoning",
+    license="Apache",
+    url="https://github.com/huggingface/open-r1",
+    package_dir={"": "src"},
+    packages=find_packages("src"),
+    zip_safe=False,
+    extras_require=extras,
+    python_requires=">=3.10.9",
+    install_requires=install_requires,
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Intended Audience :: Developers",
+        "Intended Audience :: Education",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.10",
+        "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    ],
+)
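The deps lookup-table comprehension above is compact; a standalone demonstration of what the regex produces for a few representative pins from the list:

    import re

    _deps = ["deepspeed==0.15.4", "huggingface-hub[cli]>=0.19.2,<1.0", "pytest"]
    deps = {b: a for a, b in (re.findall(r"^(([^!=<>~ \[\]]+)(?:\[[^\]]+\])?(?:[!=<>~ ].*)?$)", x)[0] for x in _deps)}
    print(deps)
    # {'deepspeed': 'deepspeed==0.15.4', 'huggingface-hub': 'huggingface-hub[cli]>=0.19.2,<1.0', 'pytest': 'pytest'}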
open-r1-multimodal/src/open_r1.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,32 @@
+LICENSE
+setup.cfg
+setup.py
+src/open_r1/__init__.py
+src/open_r1/configs.py
+src/open_r1/evaluate.py
+src/open_r1/generate.py
+src/open_r1/grpo.py
+src/open_r1/grpo_gui_grounding.py
+src/open_r1/grpo_jsonl.py
+src/open_r1/grpo_rec.py
+src/open_r1/sft.py
+src/open_r1.egg-info/PKG-INFO
+src/open_r1.egg-info/SOURCES.txt
+src/open_r1.egg-info/dependency_links.txt
+src/open_r1.egg-info/not-zip-safe
+src/open_r1.egg-info/requires.txt
+src/open_r1.egg-info/top_level.txt
+src/open_r1/trainer/__init__.py
+src/open_r1/trainer/grpo_config.py
+src/open_r1/trainer/grpo_trainer.py
+src/open_r1/trainer/qwen_grpo_trainer.py
+src/open_r1/trainer/vllm_grpo_trainer.py
+src/open_r1/utils/__init__.py
+src/open_r1/utils/callbacks.py
+src/open_r1/utils/evaluation.py
+src/open_r1/utils/hub.py
+src/open_r1/utils/math.py
+src/open_r1/vlm_modules/__init__.py
+src/open_r1/vlm_modules/internvl_module.py
+src/open_r1/vlm_modules/qwen_module.py
+src/open_r1/vlm_modules/vlm_module.py
open-r1-multimodal/src/open_r1.egg-info/not-zip-safe ADDED
@@ -0,0 +1 @@
+
open-r1-multimodal/src/open_r1/grpo.py ADDED
@@ -0,0 +1,214 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# import debugpy
+# try:
+#     # 5678 is the default attach port in the VS Code debug configurations. Unless a host and port are specified, host defaults to 127.0.0.1
+#     debugpy.listen(("localhost", 9501))
+#     print("Waiting for debugger attach")
+#     debugpy.wait_for_client()
+# except Exception as e:
+#     pass
+
+import os
+import re
+from datetime import datetime
+from dataclasses import dataclass, field
+from typing import Optional
+
+from datasets import load_dataset, load_from_disk
+from transformers import Qwen2VLForConditionalGeneration
+
+from math_verify import parse, verify
+from open_r1.trainer import VLMGRPOTrainer
+from trl import GRPOConfig, GRPOTrainer, ModelConfig, ScriptArguments, TrlParser, get_peft_config
+
+
+@dataclass
+class GRPOScriptArguments(ScriptArguments):
+    """
+    Script arguments for the GRPO training script.
+
+    Args:
+        reward_funcs (`list[str]`):
+            List of reward functions. Possible values: 'accuracy', 'format'.
+    """
+
+    reward_funcs: list[str] = field(
+        default_factory=lambda: ["accuracy", "format"],
+        metadata={"help": "List of reward functions. Possible values: 'accuracy', 'format'"},
+    )
+    max_pixels: Optional[int] = field(
+        default=12845056,
+        metadata={"help": "Maximum number of pixels for the image"},
+    )
+    min_pixels: Optional[int] = field(
+        default=3136,
+        metadata={"help": "Minimum number of pixels for the image"},
+    )
+
+
+def accuracy_reward(completions, solution, **kwargs):
+    """Reward function that checks if the completion is correct using either symbolic verification or exact string matching."""
+    contents = [completion[0]["content"] for completion in completions]
+    rewards = []
+    current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
+    for content, sol in zip(contents, solution):
+        reward = 0.0
+        # Try symbolic verification first
+        try:
+            answer = parse(content)
+            if float(verify(answer, parse(sol))) > 0:
+                reward = 1.0
+        except Exception:
+            pass  # Continue to next verification method if this fails
+
+        # If symbolic verification failed, try string matching
+        if reward == 0.0:
+            try:
+                # Extract answer from solution if it has think/answer tags
+                sol_match = re.search(r'<answer>(.*?)</answer>', sol)
+                ground_truth = sol_match.group(1).strip() if sol_match else sol.strip()
+
+                # Extract answer from content if it has think/answer tags
+                content_match = re.search(r'<answer>(.*?)</answer>', content)
+                student_answer = content_match.group(1).strip() if content_match else content.strip()
+
+                # Compare the extracted answers
+                if student_answer == ground_truth:
+                    reward = 1.0
+            except Exception:
+                pass  # Keep reward as 0.0 if both methods fail
+
+        rewards.append(reward)
+        if os.getenv("DEBUG_MODE") == "true":
+            log_path = os.getenv("LOG_PATH")
+            # local_rank = int(os.getenv("LOCAL_RANK", 0))
+            with open(log_path, "a") as f:
+                f.write(f"------------- {current_time} Accuracy reward: {reward} -------------\n")
+                f.write(f"Content: {content}\n")
+                f.write(f"Solution: {sol}\n")
+    return rewards
+
+
+def format_reward(completions, **kwargs):
+    """Reward function that checks if the completion has a specific format."""
+    pattern = r"<think>.*?</think>\s*<answer>.*?</answer>"
+    completion_contents = [completion[0]["content"] for completion in completions]
+    matches = [re.match(pattern, content) for content in completion_contents]
+    return [1.0 if match else 0.0 for match in matches]
+
+
+reward_funcs_registry = {
+    "accuracy": accuracy_reward,
+    "format": format_reward,
+}
+
+SYSTEM_PROMPT = (
+    "A conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant "
+    "first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning "
+    "process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., "
+    "<think> reasoning process here </think><answer> answer here </answer>"
+)
+
+
+def main(script_args, training_args, model_args):
+    # Get reward functions
+    reward_funcs = [reward_funcs_registry[func] for func in script_args.reward_funcs]
+    print("reward_funcs:", reward_funcs)
+
+    # Load the dataset
+    dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
+
+    # Format into conversation
+    def make_conversation(example):
+        return {
+            "prompt": [
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user", "content": example["problem"]},
+            ],
+        }
+
+    # def make_conversation_image(example):
+    #     return {
+    #         "prompt": [
+    #             {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
+    #             {
+    #                 "role": "user",
+    #                 "content": [
+    #                     {"type": "image"},
+    #                     {"type": "text", "text": example["problem"]},
+    #                 ],
+    #             },
+    #         ],
+    #     }
+
+    QUESTION_TEMPLATE = "{Question} Output the thinking process in <think> </think> and final answer (number) in <answer> </answer> tags."
+
+    def make_conversation_image(example):
+        return {
+            "prompt": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image"},
+                        {"type": "text", "text": QUESTION_TEMPLATE.format(Question=example["problem"])},
+                    ],
+                },
+            ],
+        }
+
+    if "image" in dataset[script_args.dataset_train_split].features:
+        print("has image in dataset")
+        dataset = dataset.map(make_conversation_image)  # Utilize multiprocessing for faster mapping
+        # dataset = dataset.remove_columns(["original_question", "original_answer"])
+    else:
+        print("no image in dataset")
+        dataset = dataset.map(make_conversation)
+        dataset = dataset.remove_columns("messages")
+
+    trainer_cls = VLMGRPOTrainer
+
+    # Initialize the GRPO trainer
+    trainer = trainer_cls(
+        model=model_args.model_name_or_path,
+        reward_funcs=reward_funcs,
+        args=training_args,
+        train_dataset=dataset[script_args.dataset_train_split],
+        eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
+        peft_config=get_peft_config(model_args),
+        attn_implementation=model_args.attn_implementation,
+        max_pixels=script_args.max_pixels,
+        min_pixels=script_args.min_pixels,
+        torch_dtype=model_args.torch_dtype,
+    )
+
+    # Train the model
+    trainer.train()
+
+    # Save and push to hub
+    trainer.save_model(training_args.output_dir)
+    if training_args.push_to_hub:
+        trainer.push_to_hub(dataset_name=script_args.dataset_name)
+
+
+if __name__ == "__main__":
+    parser = TrlParser((GRPOScriptArguments, GRPOConfig, ModelConfig))
+    script_args, training_args, model_args = parser.parse_args_and_config()
+    main(script_args, training_args, model_args)
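A toy invocation of the two reward functions above (the completion string is fabricated; accuracy_reward falls back to exact string matching when math_verify's symbolic check does not apply):

    completions = [[{"content": "<think>6 times 7</think><answer>42</answer>"}]]
    print(format_reward(completions))                              # [1.0]
    print(accuracy_reward(completions, ["<answer>42</answer>"]))   # [1.0]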
open-r1-multimodal/src/open_r1/sft.py ADDED
@@ -0,0 +1,346 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Supervised fine-tuning script for decoder language models.
+
+Usage:
+
+# On 1 node of 8 x H100s
+accelerate launch --config_file=configs/zero3.yaml src/open_r1/sft.py \
+    --model_name_or_path Qwen/Qwen2.5-1.5B-Instruct \
+    --dataset_name HuggingFaceH4/Bespoke-Stratos-17k \
+    --learning_rate 2.0e-5 \
+    --num_train_epochs 1 \
+    --packing \
+    --max_seq_length 4096 \
+    --per_device_train_batch_size 4 \
+    --gradient_accumulation_steps 4 \
+    --gradient_checkpointing \
+    --bf16 \
+    --logging_steps 5 \
+    --eval_strategy steps \
+    --eval_steps 100 \
+    --output_dir data/Qwen2.5-1.5B-Open-R1-Distill
+"""
+
+import logging
+import os
+import sys
+
+import datasets
+import torch
+from torch.utils.data import Dataset
+import transformers
+from datasets import load_dataset
+from transformers import AutoTokenizer, set_seed, AutoProcessor
+from transformers.trainer_utils import get_last_checkpoint
+from open_r1.configs import SFTConfig
+from open_r1.utils.callbacks import get_callbacks
+import yaml
+import json
+import math
+import random
+from PIL import Image
+
+from trl import (
+    ModelConfig,
+    ScriptArguments,
+    SFTTrainer,
+    TrlParser,
+    get_kbit_device_map,
+    get_peft_config,
+    get_quantization_config,
+)
+from dataclasses import dataclass, field
+from qwen_vl_utils import process_vision_info
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SFTScriptArguments(ScriptArguments):
+    image_root: str = field(default=None, metadata={"help": "The root directory of the image."})
+
+
+processor = None
+
+
+class LazySupervisedDataset(Dataset):
+    def __init__(self, data_path: str, script_args: ScriptArguments):
+        super(LazySupervisedDataset, self).__init__()
+        self.script_args = script_args
+        self.list_data_dict = []
+
+        if data_path.endswith(".yaml"):
+            with open(data_path, "r") as file:
+                yaml_data = yaml.safe_load(file)
+                datasets = yaml_data.get("datasets")
+                # file should be in the format of:
+                # datasets:
+                #   - json_path: xxxx1.json
+                #     sampling_strategy: first:1000
+                #   - json_path: xxxx2.json
+                #     sampling_strategy: end:3000
+                #   - json_path: xxxx3.json
+                #     sampling_strategy: random:999
+
+                for data in datasets:
+                    json_path = data.get("json_path")
+                    sampling_strategy = data.get("sampling_strategy", "all")
+                    sampling_number = None
+
+                    if json_path.endswith(".jsonl"):
+                        cur_data_dict = []
+                        with open(json_path, "r") as json_file:
+                            for line in json_file:
+                                cur_data_dict.append(json.loads(line.strip()))
+                    elif json_path.endswith(".json"):
+                        with open(json_path, "r") as json_file:
+                            cur_data_dict = json.load(json_file)
+                    else:
+                        raise ValueError(f"Unsupported file type: {json_path}")
+
+                    if ":" in sampling_strategy:
+                        sampling_strategy, sampling_number = sampling_strategy.split(":")
+                        if "%" in sampling_number:
+                            sampling_number = math.ceil(int(sampling_number.split("%")[0]) * len(cur_data_dict) / 100)
+                        else:
+                            sampling_number = int(sampling_number)
+
+                    # Apply the sampling strategy
+                    if sampling_strategy == "first" and sampling_number is not None:
+                        cur_data_dict = cur_data_dict[:sampling_number]
+                    elif sampling_strategy == "end" and sampling_number is not None:
+                        cur_data_dict = cur_data_dict[-sampling_number:]
+                    elif sampling_strategy == "random" and sampling_number is not None:
+                        random.shuffle(cur_data_dict)
+                        cur_data_dict = cur_data_dict[:sampling_number]
+                    print(f"Loaded {len(cur_data_dict)} samples from {json_path}")
+                    self.list_data_dict.extend(cur_data_dict)
+        else:
+            raise ValueError(f"Unsupported file type: {data_path}")
+
+    def __len__(self):
+        return len(self.list_data_dict)
+
+    def __getitem__(self, i):
+        # Format into conversation
+        def make_conversation_image(example):
+            image_root = self.script_args.image_root
+            # print(111, image_root)
+            # print(222, example['image'])
+            image_path = os.path.join(image_root, example['image'])
+            x1, y1, x2, y2 = example["solution"]
+            normal_caption = example["normal_caption"]
+            return [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image", "image": f"file://{image_path}"},
+                        {"type": "text", "text": example["problem"]},
+                    ],
+                },
+                {
+                    "role": "assistant",
+                    "content": f'```json\n[\n\t{{"bbox_2d": [{int(x1)}, {int(y1)}, {int(x2)}, {int(y2)}], "label": "{normal_caption}"}}\n]\n```',
+                }
+            ]
+
+        example = self.list_data_dict[i]
+        example["messages"] = make_conversation_image(example)
+        return example
+
+
+def collate_fn(examples):
+    texts = [
+        processor.apply_chat_template(example["messages"], tokenize=False, add_generation_prompt=True)
+        for example in examples
+    ]
+    image_inputs = []
+    for example in examples:
+        imgs, vids = process_vision_info(example["messages"])
+        image_inputs.append(imgs)
+    batch = processor(
+        text=texts,
+        images=image_inputs,
+        return_tensors="pt",
+        padding=True,
+    )
+    labels = batch["input_ids"].clone()
+    labels[labels == processor.tokenizer.pad_token_id] = -100
+    image_token_id = processor.tokenizer.convert_tokens_to_ids(processor.image_token)
+    labels[labels == image_token_id] = -100
+    batch["labels"] = labels
+
+    return batch
+
+
+def main(script_args, training_args, model_args):
+    # Set seed for reproducibility
+    set_seed(training_args.seed)
+
+    ###############
+    # Setup logging
+    ###############
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()
+
+    # Log a small summary on each process
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+    )
+    logger.info(f"Model parameters {model_args}")
+    logger.info(f"Script parameters {script_args}")
+    logger.info(f"Data parameters {training_args}")
+
+    # Check for last checkpoint
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir):
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+    if last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+        logger.info(f"Checkpoint detected, resuming training at {last_checkpoint=}.")
+
+    ################
+    # Load datasets
+    ################
+    dataset = LazySupervisedDataset(script_args.dataset_name, script_args)
+
+    ################
+    # Load tokenizer
+    ################
+    global processor
+    if "vl" in model_args.model_name_or_path.lower():
+        processor = AutoProcessor.from_pretrained(
+            model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code
+        )
+        logger.info("Using AutoProcessor for vision-language model.")
+    else:
+        processor = AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code, use_fast=True
+        )
+        logger.info("Using AutoTokenizer for text-only model.")
+    if hasattr(processor, "pad_token") and processor.pad_token is None:
+        processor.pad_token = processor.eos_token
+    elif hasattr(processor.tokenizer, "pad_token") and processor.tokenizer.pad_token is None:
+        processor.tokenizer.pad_token = processor.tokenizer.eos_token
+
+    ###################
+    # Model init kwargs
+    ###################
+    logger.info("*** Initializing model kwargs ***")
+    torch_dtype = (
+        model_args.torch_dtype if model_args.torch_dtype in ["auto", None] else getattr(torch, model_args.torch_dtype)
+    )
+    quantization_config = get_quantization_config(model_args)
+    model_kwargs = dict(
+        revision=model_args.model_revision,
+        trust_remote_code=model_args.trust_remote_code,
+        attn_implementation=model_args.attn_implementation,
+        torch_dtype=torch_dtype,
+        use_cache=False if training_args.gradient_checkpointing else True,
+        device_map=get_kbit_device_map() if quantization_config is not None else None,
+        quantization_config=quantization_config,
+    )
+    # training_args.model_init_kwargs = model_kwargs
+    from transformers import Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration
+    if "Qwen2-VL" in model_args.model_name_or_path:
+        model = Qwen2VLForConditionalGeneration.from_pretrained(
+            model_args.model_name_or_path, **model_kwargs
+        )
+    elif "Qwen2.5-VL" in model_args.model_name_or_path:
+        model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+            model_args.model_name_or_path, **model_kwargs
+        )
+    else:
+        raise ValueError(f"Unsupported model: {model_args.model_name_or_path}")
+
+    ############################
+    # Initialize the SFT Trainer
+    ############################
+    training_args.dataset_kwargs = {
+        "skip_prepare_dataset": True,
+    }
+    training_args.remove_unused_columns = False
+    trainer = SFTTrainer(
+        model=model,
+        args=training_args,
+        train_dataset=dataset,
+        eval_dataset=None,
+        processing_class=processor.tokenizer,
+        data_collator=collate_fn,
+        peft_config=get_peft_config(model_args),
+        callbacks=get_callbacks(training_args, model_args),
+    )
+
+    ###############
+    # Training loop
+    ###############
+    logger.info("*** Train ***")
+    checkpoint = None
+    if training_args.resume_from_checkpoint is not None:
+        checkpoint = training_args.resume_from_checkpoint
+    elif last_checkpoint is not None:
+        checkpoint = last_checkpoint
+    train_result = trainer.train(resume_from_checkpoint=checkpoint)
+    metrics = train_result.metrics
+    metrics["train_samples"] = len(dataset)  # LazySupervisedDataset is a single train split
+    trainer.log_metrics("train", metrics)
+    trainer.save_metrics("train", metrics)
+    trainer.save_state()
+
+    ##################################
+    # Save model and create model card
+    ##################################
+    logger.info("*** Save model ***")
+    trainer.save_model(training_args.output_dir)
+    logger.info(f"Model saved to {training_args.output_dir}")
+
+    # Save everything else on main process
+    kwargs = {
+        "finetuned_from": model_args.model_name_or_path,
+        "dataset": list(script_args.dataset_name),
+        "dataset_tags": list(script_args.dataset_name),
+        "tags": ["open-r1"],
+    }
+    if trainer.accelerator.is_main_process:
+        trainer.create_model_card(**kwargs)
+        # Restore k,v cache for fast inference
+        trainer.model.config.use_cache = True
+        trainer.model.config.save_pretrained(training_args.output_dir)
+
+    #############
+    # push to hub
+    #############
+    if training_args.push_to_hub:
+        logger.info("Pushing to hub...")
+        trainer.push_to_hub(**kwargs)
+
+
+if __name__ == "__main__":
+    parser = TrlParser((SFTScriptArguments, SFTConfig, ModelConfig))
+    script_args, training_args, model_args = parser.parse_args_and_config()
+    print(script_args)
+    main(script_args, training_args, model_args)
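The sampling_strategy strings parsed by LazySupervisedDataset above ("first:N", "end:N", "random:N" or "random:P%", plus the "all" default) can be restated as a standalone helper for quick experimentation (resolve is a hypothetical name, not part of this commit):

    import math

    def resolve(strategy, n):
        # Mirrors LazySupervisedDataset: split "name:number" and expand percentages.
        if ":" not in strategy:
            return strategy, None
        name, number = strategy.split(":")
        if "%" in number:
            return name, math.ceil(int(number.split("%")[0]) * n / 100)
        return name, int(number)

    print(resolve("first:1000", 50000))  # ('first', 1000)
    print(resolve("random:10%", 50000))  # ('random', 5000)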
open-r1-multimodal/src/open_r1/trainer/__init__.py ADDED
@@ -0,0 +1,5 @@
+from .grpo_trainer import VLMGRPOTrainer
+from .grpo_config import GRPOConfig
+from .vllm_grpo_trainer import Qwen2VLGRPOVLLMTrainer
+from .qwen_grpo_trainer import Qwen2VLGRPOTrainer
+__all__ = ["VLMGRPOTrainer", "Qwen2VLGRPOVLLMTrainer", "Qwen2VLGRPOTrainer"]
open-r1-multimodal/src/open_r1/trainer/__pycache__/vllm_grpo_trainer.cpython-310.pyc ADDED
Binary file (18.4 kB)
open-r1-multimodal/src/open_r1/utils/__pycache__/math.cpython-310.pyc.139714633805856 ADDED
Binary file (3.88 kB)
open-r1-multimodal/src/open_r1/utils/__pycache__/math.cpython-310.pyc.140170314805280 ADDED
Binary file (3.88 kB)
open-r1-multimodal/src/open_r1/vlm_modules/__pycache__/internvl_module.cpython-310.pyc ADDED
Binary file (11.2 kB)
open-r1-multimodal/src/open_r1/vlm_modules/__pycache__/qwen_module.cpython-310.pyc ADDED
Binary file (9.36 kB)
open-r1-multimodal/src/open_r1/vlm_modules/qwen_module.py ADDED
@@ -0,0 +1,238 @@
+ from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2VLForConditionalGeneration, AutoProcessor
+ from typing import Dict, Any, Union
+ from trl.data_utils import maybe_apply_chat_template
+ import torch
+
+ from open_r1.vlm_modules.vlm_module import VLMBaseModule
+
+ class Qwen2VLModule(VLMBaseModule):
+     def __init__(self):
+         super().__init__()
+
+     def get_vlm_key(self):
+         return "qwen"
+
+     def get_model_class(self, model_id: str, model_init_kwargs: dict):
+         if "Qwen2-VL" in model_id:
+             model_cls = Qwen2VLForConditionalGeneration
+         elif "Qwen2.5-VL" in model_id:
+             model_cls = Qwen2_5_VLForConditionalGeneration
+         else:
+             raise ValueError(f"Unsupported model: {model_id}")
+         return model_cls
+
+     def post_model_init(self, model, processing_class):
+         pass
+
+     def get_processing_class(self):
+         return AutoProcessor
+
+     def get_vision_modules_keywords(self):
+         return ['visual']
+
+     def get_custom_multimodal_keywords(self):
+         return ['pixel_values', 'image_grid_thw']
+
+     def get_non_generate_params(self):
+         return []
+
+     def get_custom_processing_keywords(self):
+         return ['max_pixels', 'min_pixels']
+
+     def prepare_prompt(self, processing_class, inputs: dict[str, Union[torch.Tensor, Any]]):
+         prompts_text = [maybe_apply_chat_template(example, processing_class)["prompt"] for example in inputs]
+         return prompts_text
+
+     def prepare_model_inputs(self, processing_class, prompts_text, images, return_tensors="pt", padding=True, padding_side="left", add_special_tokens=False):
+         # FIXME: this handles only pure-multimodal or pure-text batches,
+         # not batches that mix text-only and image examples.
+         if len(images) > 0:
+             prompt_inputs = processing_class(
+                 text=prompts_text,
+                 images=images,
+                 return_tensors=return_tensors,
+                 padding=padding,
+                 padding_side=padding_side,
+                 add_special_tokens=add_special_tokens)
+         else:
+             prompt_inputs = processing_class(
+                 text=prompts_text,
+                 return_tensors=return_tensors,
+                 padding=padding,
+                 padding_side=padding_side,
+                 add_special_tokens=add_special_tokens)
+         return prompt_inputs
+
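+     # Usage sketch (illustrative; variable names are hypothetical):
+     #   prompt_inputs = module.prepare_model_inputs(processor, prompts_text, images)
+     # With images present, prompt_inputs carries input_ids and attention_mask plus
+     # the multimodal keys listed above (pixel_values, image_grid_thw).
+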
+     @staticmethod
+     def get_question_template(task_type: str):
+         match task_type:
+             case "rec":
+                 return "{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags. Output the final answer in JSON format."
+             case _:
+                 return "{Question} First output the thinking process in <think> </think> tags and then output the final answer in <answer> </answer> tags."
+
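+     # Usage sketch: the {Question} placeholder is filled via str.format, e.g.
+     #   Qwen2VLModule.get_question_template("rec").format(Question="Locate the search button.")
+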
+     @staticmethod
+     def format_reward_rec(completions, **kwargs):
+         """Check whether each completion contains a <tool_call> block with a 2-D coordinate."""
+         import re
+
+         # Earlier bbox-style pattern, kept for reference:
+         # pattern = r"<think>.*?</think>\s*<answer>.*?\{.*\[\d+,\s*\d+,\s*\d+,\s*\d+\].*\}.*?</answer>"
+         pattern = r"<tool_call>.*?\{.*\[\d+,\s*\d+\].*\}.*?</tool_call>"
+         completion_contents = [completion[0]["content"] for completion in completions]
+         print(completion_contents)  # debug: dump raw completions
+         print('-' * 100)
+         matches = [re.search(pattern, content, re.DOTALL) is not None for content in completion_contents]
+         return [1.0 if match else 0.0 for match in matches]
+
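+     # Example completion the pattern above accepts (illustrative; the "name" value
+     # is hypothetical, only the arguments/coordinate structure is checked downstream):
+     #   <tool_call>{"name": "computer_use", "arguments": {"coordinate": [640, 360]}}</tool_call>
+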
+     @staticmethod
+     def format_reward(completions, **kwargs):
+         """Check whether each completion is a <think>...</think><answer>[{"bbox_2d": ..., "label": ...}]</answer> block."""
+         import re
+
+         pattern = r"<think>.*?</think>\s*<answer>.*?\[.*?{\"bbox_2d\":\s*\[\s*\d+,\s*\d+,\s*\d+,\s*\d+\s*\]\s*,\s*\"label\":\s*\".*?\"\s*}.*?\].*?</answer>"
+         completion_contents = [completion[0]["content"] for completion in completions]
+         matches = [re.search(pattern, content, re.DOTALL) is not None for content in completion_contents]
+         return [1.0 if match else 0.0 for match in matches]
+
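+     # Example answer the pattern above accepts (illustrative values):
+     #   <think>reasoning...</think><answer>[{"bbox_2d": [10, 20, 110, 220], "label": "button"}]</answer>
+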
+     @staticmethod
+     def point_reward(completions, solution, **kwargs):
+         """Reward = 1 if the predicted point lies inside the ground-truth box, plus a quadratic decay term based on the normalized distance from the box center."""
+         import re
+         import json
+         import os
+         from datetime import datetime
+         import math
+
+         # Extract the content from each completion
+         contents = [completion[0]["content"] for completion in completions]
+         rewards = []
+         current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
+
+         # Iterate over each content and its corresponding solution
+         for content, sol in zip(contents, solution):
+             reward = 0.0
+             log_details = None
+             try:
+                 # Extract the body of the <tool_call> tag
+                 tool_call_match = re.search(r'<tool_call>(.*?)</tool_call>', content, re.DOTALL)
+                 if tool_call_match:
+                     tool_call_content = tool_call_match.group(1).strip()
+                     # Parse the JSON payload
+                     tool_call_json = json.loads(tool_call_content)
+                     arguments = tool_call_json.get("arguments", {})
+                     coordinate = arguments.get("coordinate", None)
+                     # The coordinate must be a list of length 2
+                     if coordinate and isinstance(coordinate, list) and len(coordinate) == 2:
+                         x, y = coordinate
+                         # Both x and y must be numeric
+                         if isinstance(x, (int, float)) and isinstance(y, (int, float)):
+                             # Unpack the bounding box and image size
+                             box = sol[:4]  # [x_min, y_min, x_max, y_max]
+                             img_width, img_height = sol[4], sol[5]
+
+                             # Binary reward: is the point inside the bounding box?
+                             if box[0] <= x <= box[2] and box[1] <= y <= box[3]:
+                                 base_reward = 1.0
+                             else:
+                                 base_reward = 0.0
+
+                             # Bounding-box center
+                             cx = (box[0] + box[2]) / 2
+                             cy = (box[1] + box[3]) / 2
+
+                             # Normalize coordinates to [0, 1]
+                             nx = x / img_width
+                             ny = y / img_height
+                             ncx = cx / img_width
+                             ncy = cy / img_height
+
+                             # Normalized distances from the box center to the four image corners
+                             d1 = math.sqrt((ncx - 0)**2 + (ncy - 0)**2)
+                             d2 = math.sqrt((ncx - 1)**2 + (ncy - 0)**2)
+                             d3 = math.sqrt((ncx - 0)**2 + (ncy - 1)**2)
+                             d4 = math.sqrt((ncx - 1)**2 + (ncy - 1)**2)
+                             max_d = max(d1, d2, d3, d4)
+
+                             # Normalized distance from the point to the center; the
+                             # guard is on d_normalized so out-of-image predictions
+                             # cannot produce a negative decay term
+                             d = math.sqrt((nx - ncx)**2 + (ny - ncy)**2)
+                             d_normalized = d / max_d if max_d > 0 else 0
+                             decay_term = 1 - d_normalized**2 if d_normalized <= 1 else 0
+
+                             # Total reward: in-box bonus plus quadratic decay
+                             reward = base_reward + decay_term
+
+                             # Data for debug logging
+                             log_details = {
+                                 "extracted_coordinate": [x, y],
+                                 "base_reward": base_reward,
+                                 "decay_term": decay_term
+                             }
+             except Exception:
+                 # On parse failure or any other error, the reward stays 0.0
+                 pass
+
+             rewards.append(reward)
+
+             # With DEBUG_MODE enabled, log the details
+             if os.getenv("DEBUG_MODE") == "true":
+                 log_path = os.getenv("LOG_PATH")
+                 with open(log_path, "a", encoding='utf-8') as f:
+                     f.write(f"------------- {current_time} Point-in-box reward: {reward} -------------\n")
+                     f.write(f"Content: {content}\n")
+                     f.write(f"Solution box: {sol[:4]}\n")
+                     f.write(f"Image size: {sol[4]}x{sol[5]}\n")
+                     if log_details:
+                         f.write(f"Extracted coordinate: {log_details['extracted_coordinate']}\n")
+                         f.write(f"Base reward: {log_details['base_reward']}\n")
+                         f.write(f"Decay term: {log_details['decay_term']}\n")
+                     else:
+                         f.write("Failed to extract coordinate\n")
+
+         return rewards
+
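+     # Worked example (illustrative numbers): box [100, 100, 200, 200] in a 1000x1000 image.
+     # Normalized center (0.15, 0.15); farthest image corner gives max_d = 0.85 * sqrt(2) ≈ 1.202.
+     #   Prediction (150, 150): inside the box, d_normalized = 0  -> reward = 1.0 + 1.0 = 2.0
+     #   Prediction (500, 500): outside, d ≈ 0.495, d_normalized ≈ 0.412 -> reward ≈ 0.0 + 0.830
+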
+     @staticmethod
+     def iou_reward(completions, solution, **kwargs):
+         """Calculate the IoU reward between the predicted bounding box from the Qwen model and the ground-truth bounding box."""
+         import re
+         import os
+         from datetime import datetime
+
+         def iou(box1, box2):
+             # The intersection uses an inclusive-pixel (+1) convention, while the
+             # box areas below use plain width * height
+             inter_x1 = max(box1[0], box2[0])
+             inter_y1 = max(box1[1], box2[1])
+             inter_x2 = min(box1[2] - 1, box2[2] - 1)
+             inter_y2 = min(box1[3] - 1, box2[3] - 1)
+             if inter_x1 < inter_x2 and inter_y1 < inter_y2:
+                 inter = (inter_x2 - inter_x1 + 1) * (inter_y2 - inter_y1 + 1)
+             else:
+                 inter = 0
+             union = (box1[2] - box1[0]) * (box1[3] - box1[1]) + (box2[2] - box2[0]) * (box2[3] - box2[1]) - inter
+             return float(inter) / union if union > 0 else 0.0
+
+         contents = [completion[0]["content"] for completion in completions]
+         rewards = []
+         current_time = datetime.now().strftime("%d-%H-%M-%S-%f")
+         answer_tag_pattern = r'<answer>(.*?)</answer>'
+         bbox_pattern = r'\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)]'
+         for content, sol in zip(contents, solution):
+             reward = 0.0
+             # Try symbolic verification first
+             try:
+                 content_answer_match = re.search(answer_tag_pattern, content, re.DOTALL)
+                 if content_answer_match:
+                     content_answer = content_answer_match.group(1).strip()
+                     bbox_match = re.search(bbox_pattern, content_answer)
+                     if bbox_match:
+                         bbox = [int(bbox_match.group(1)), int(bbox_match.group(2)), int(bbox_match.group(3)), int(bbox_match.group(4))]
+                         # Dense reward: use the IoU value itself rather than thresholding at 0.5
+                         reward = iou(bbox, sol)
+             except Exception:
+                 pass  # Continue to next verification method if this fails
+
+             rewards.append(reward)
+             if os.getenv("DEBUG_MODE") == "true":
+                 log_path = os.getenv("LOG_PATH")
+                 with open(log_path, "a", encoding='utf-8') as f:
+                     f.write(f"------------- {current_time} Accuracy reward: {reward} -------------\n")
+                     f.write(f"Content: {content}\n")
+                     f.write(f"Solution: {sol}\n")
+         return rewards
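+
+     # Worked example (illustrative): box1 = [0, 0, 10, 10], box2 = [5, 5, 15, 15]
+     #   intersection = (9 - 5 + 1) * (9 - 5 + 1) = 25, union = 100 + 100 - 25 = 175
+     #   reward = IoU = 25 / 175 ≈ 0.143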