1f committed
Commit ff53362 · verified · 1 Parent(s): 9cdf7a2

Add files using upload-large-folder tool
r1-a/final_dataset/preference_relative/dataset_info.json ADDED
@@ -0,0 +1,84 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "question_text": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "question_audio": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "source_dataset": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "metadata": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "model_1": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "prompt_name_1": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "prompt_text_1": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "response_text_1": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "response_audio_path_1": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "model_2": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "prompt_name_2": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "prompt_text_2": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "response_text_2": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "response_audio_path_2": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "model_3": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "prompt_name_3": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "prompt_text_3": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "response_text_3": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "response_audio_path_3": {
+       "dtype": "string",
+       "_type": "Value"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
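
This dataset_info.json and the state.json below are the metadata pair that datasets' save_to_disk() writes alongside the Arrow shard. A minimal sketch of reading the folder back, assuming the working directory is the repo root:

from datasets import load_from_disk

# Path is the repo-relative one above; adjust to wherever the repo is checked out.
ds = load_from_disk("r1-a/final_dataset/preference_relative")
print(ds.features)             # 19 string columns: question_*, metadata, model_*, prompt_*, response_*
print(ds[0]["question_text"])  # first row's question
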
r1-a/final_dataset/preference_relative/state.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "6ada4a1b690526f4",
+   "_format_columns": [
+     "question_text",
+     "question_audio",
+     "source_dataset",
+     "metadata",
+     "model_1",
+     "prompt_name_1",
+     "prompt_text_1",
+     "response_text_1",
+     "response_audio_path_1",
+     "model_2",
+     "prompt_name_2",
+     "prompt_text_2",
+     "response_text_2",
+     "response_audio_path_2",
+     "model_3",
+     "prompt_name_3",
+     "prompt_text_3",
+     "response_text_3",
+     "response_audio_path_3"
+   ],
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
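
The "_format_columns" list in state.json records a prior set_format()/with_format() call on the dataset, which load_from_disk() restores. A sketch of the kind of call that produces such an entry (the column subset here is illustrative):

from datasets import load_from_disk

ds = load_from_disk("r1-a/final_dataset/preference_relative")
# Restrict the output columns; datasets persists this choice into
# state.json's "_format_columns" on the next save_to_disk().
ds.set_format(columns=["question_text", "response_text_1", "response_audio_path_1"])
print(ds[0].keys())  # only the selected columns are returned
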
r1-a/final_dataset/preference_relative_processed_shards/logs/shard_0_gpu_0.log ADDED
@@ -0,0 +1,16 @@
+ 2025-05-04 14:41:37,640 - INFO - [Shard 0] - Process started for Shard 0 on GPU 0 (logical device cuda:0)
+ 2025-05-04 14:41:37,640 - INFO - [Shard 0] - Arguments: Namespace(shard_index=0, gpu_id=0, wer_threshold=0.4, pipeline_batch_size=16, map_batch_size=16, num_check_workers=4)
+ 2025-05-04 14:41:37,640 - INFO - [Shard 0] - Loading dataset from /home/chenyifu/audio-r1/r1-a/final_dataset/preference_relative
+ 2025-05-04 16:30:18,197 - ERROR - [Shard 0] - Failed to load dataset:
+ Traceback (most recent call last):
+   File "/home/chenyifu/audio-r1/r1-a/dataset/retts.py", line 380, in main
+     logger.info(f"Full dataset loaded with {full_ds.num_rows} rows.")
+   File "/home/chenyifu/audio-r1/r1-a/dataset/retts.py", line 380, in main
+     logger.info(f"Full dataset loaded with {full_ds.num_rows} rows.")
+   File "/home/chenyifu/miniconda3/envs/cosyvoice/lib/python3.10/bdb.py", line 90, in trace_dispatch
+     return self.dispatch_line(frame)
+   File "/home/chenyifu/miniconda3/envs/cosyvoice/lib/python3.10/bdb.py", line 115, in dispatch_line
+     if self.quitting: raise BdbQuit
+ bdb.BdbQuit
+ 2025-05-04 16:30:18,199 - WARNING - [Shard 0] - Processing did not complete or failed early. No statistics to log.
+ 2025-05-04 16:30:18,199 - INFO - [Shard 0] - Process for Shard 0 on GPU 0 finished.
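
The failure in this log is not a data problem: bdb.BdbQuit is what Python's debugger machinery raises when a pdb session is quit, so the retts.py run was evidently being traced under a debugger when the user typed `q`. A minimal sketch of how the exception surfaces and can be told apart from a real load error (the function name is hypothetical):

import bdb

def load_dataset_step():
    # Stand-in for the retts.py main() frame seen in the log. When a run is
    # traced under pdb and the user quits, the trace function raises BdbQuit here.
    raise bdb.BdbQuit

try:
    load_dataset_step()
except bdb.BdbQuit:
    # The shard script apparently logs this as "Failed to load dataset",
    # but it is really an aborted debugger session, not a corrupt dataset.
    print("Aborted via debugger quit (bdb.BdbQuit)")
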
r1-a/final_dataset/preference_relative_processed_shards/shard_0/dataset_info.json ADDED
@@ -0,0 +1,100 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "question_text": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "question_audio": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "source_dataset": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "metadata": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "model_1": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "prompt_name_1": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "prompt_text_1": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "response_text_1": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "response_audio_path_1": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "model_2": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "prompt_name_2": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "prompt_text_2": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "response_text_2": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "response_audio_path_2": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "model_3": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "prompt_name_3": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "prompt_text_3": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "response_text_3": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "response_audio_path_3": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "asr_transcription": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "wer": {
+       "dtype": "float32",
+       "_type": "Value"
+     },
+     "is_bad_tts": {
+       "dtype": "bool",
+       "_type": "Value"
+     },
+     "error_message": {
+       "dtype": "string",
+       "_type": "Value"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
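
The four extra columns here (asr_transcription, wer, is_bad_tts, error_message) come from the TTS-quality check pass; the shard log above shows it was invoked with wer_threshold=0.4. A hedged sketch of how such a shard would be filtered once processing succeeds (column semantics inferred from the names):

from datasets import load_from_disk

shard = load_from_disk("r1-a/final_dataset/preference_relative_processed_shards/shard_0")

# Keep rows whose resynthesized audio transcribes back well enough.
# 0.4 matches the wer_threshold in the shard log; is_bad_tts presumably
# encodes the same cut, so either predicate should agree.
clean = shard.filter(lambda row: not row["is_bad_tts"] and row["wer"] < 0.4)
print(f"kept {len(clean)}/{len(shard)} rows")
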
r1-a/final_dataset/preference_relative_processed_shards/shard_0/state.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "00775948802a271b",
+   "_format_columns": [
+     "asr_transcription",
+     "error_message",
+     "is_bad_tts",
+     "metadata",
+     "model_1",
+     "model_2",
+     "model_3",
+     "prompt_name_1",
+     "prompt_name_2",
+     "prompt_name_3",
+     "prompt_text_1",
+     "prompt_text_2",
+     "prompt_text_3",
+     "question_audio",
+     "question_text",
+     "response_audio_path_1",
+     "response_audio_path_2",
+     "response_audio_path_3",
+     "response_text_1",
+     "response_text_2",
+     "response_text_3",
+     "source_dataset",
+     "wer"
+   ],
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
r1-a/final_dataset/prompt_only_relative_paths/dataset_info.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "citation": "",
+   "description": "",
+   "features": {
+     "source_dataset": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "question_text": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "question_audio": {
+       "dtype": "string",
+       "_type": "Value"
+     },
+     "metadata": {
+       "dtype": "string",
+       "_type": "Value"
+     }
+   },
+   "homepage": "",
+   "license": ""
+ }
r1-a/final_dataset/prompt_only_relative_paths/state.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "_data_files": [
+     {
+       "filename": "data-00000-of-00001.arrow"
+     }
+   ],
+   "_fingerprint": "419ded2384418f0a",
+   "_format_columns": null,
+   "_format_kwargs": {},
+   "_format_type": null,
+   "_output_all_columns": false,
+   "_split": null
+ }
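
The folder name prompt_only_relative_paths suggests that question_audio holds repo-relative paths rather than absolute ones. A hedged sketch of resolving them before use (the repo-root path is an assumption):

import os
from datasets import load_from_disk

REPO_ROOT = "/path/to/repo"  # assumption: wherever this dataset repo is checked out

ds = load_from_disk("r1-a/final_dataset/prompt_only_relative_paths")

def absolutize(row):
    # question_audio is stored as a plain string path; join it onto the repo root
    row["question_audio"] = os.path.join(REPO_ROOT, row["question_audio"])
    return row

ds = ds.map(absolutize)
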
r1-a/response_generation/glm4voice.py ADDED
@@ -0,0 +1,579 @@
+ import os
+ import json
+ import base64
+ import uuid
+ import time
+ import re
+ from io import BytesIO
+ import concurrent.futures
+ from tqdm import tqdm
+ import threading
+ import itertools
+ import traceback  # For detailed error logging
+
+ import numpy as np
+ import soundfile as sf
+ from zhipuai import ZhipuAI
+ # Import specific error type if available and helpful
+ # Attempt to import specific error, handle if it doesn't exist
+ try:
+     from zhipuai.core._errors import APIStatusError
+ except ImportError:
+     # Define a dummy class if the specific error isn't available
+     # This allows the except block to still catch general exceptions
+     # that might represent API status issues if the SDK changes.
+     print("Warning: zhipuai.core._errors.APIStatusError not found. Using generic Exception for status errors.")
+     class APIStatusError(Exception):
+         def __init__(self, message, status_code=None, body=None):
+             super().__init__(message)
+             self.status_code = status_code
+             self.body = body
+             self.message = message  # Add message attribute for consistency
+
+ from datasets import load_from_disk, Dataset
+ from dotenv import load_dotenv
+
+ # --- Configuration (User's Original Settings) ---
+ load_dotenv()
+
+ # 1. API Client Setup
+ GLM_MODEL_NAME = "glm-4-voice"  # <<< User's original model name
+
+ # --- API Key Rotation Setup (User's Original Keys & Logic) ---
+ ZHIPUAI_API_KEYS = [
+     "14a67189b8bc4ee489e83b6247c36d0e.AIPUNrII50wREvsh",
+     "72120787822c4123a9654965ff90e4e6.JS1nuey9MncQscPa",
+     "d41b3b5bb49f4c8680b3836e7fc49bbf.u0jGxYc5sYPeRr5p",
+     "bc9bccd6ddd145fc844a014521c26868.JwsZXHzA3l32dDwz",
+     "0e5a05d709794737923ebd122e07d491.sL67ALh6BiLYaaGW",  # New key
+     "db87c1fda8af4eb8b505f36e791d700d.w5M0Q3ZssT55tvlW",  # New key
+     "1594ac60fbca4973809f4da425238e0c.ZMMfchqbok992Dmu",  # New key
+     "469c0fa3b14e4913b1d14bc5d6f0c858.0KdQjFqdi66VPMnb",
+     "b9b538bb0e134438bacaf922b023d1fd.sogFUUp57UJ8YSd6",
+     "50bb382993a345cfa35833fc89caaa52.oR921jSW8iwzCV22",
+     "44512bbede5940f7964db7694bfc04df.yhDEQyPOXQCqh1Mn",
+     "99aba409b55c432696b9d5f1ff565d30.GmfRNngBOo8qDUbf"
+ ]  # <<< User's original keys
+
+ if not ZHIPUAI_API_KEYS:
+     print("FATAL: No ZHIPUAI_API_KEYS provided in the list.")
+     exit(1)
+
+ # Make sure keys are unique if duplicates were accidental
+ unique_keys = list(dict.fromkeys(ZHIPUAI_API_KEYS))
+ if len(unique_keys) != len(ZHIPUAI_API_KEYS):
+     print(f"Warning: Duplicate API keys found and removed. Using {len(unique_keys)} unique keys.")
+ ZHIPUAI_API_KEYS = unique_keys
+
+ key_cycler = itertools.cycle(ZHIPUAI_API_KEYS)
+ key_lock = threading.RLock()  # Reentrant: the worker calls get_next_active_key() while already holding the lock
+ disabled_keys = set()  # Shared set to store disabled keys
+
+ class AllKeysDisabledError(Exception):
+     """Custom exception raised when all API keys are disabled."""
+     pass
+
+ def get_next_active_key():
+     """
+     Thread-safely gets the next API key from the cycle, skipping disabled keys.
+     Raises AllKeysDisabledError if all keys are disabled.
+     (User's Original Logic)
+     """
+     with key_lock:
+         initial_key_count = len(ZHIPUAI_API_KEYS)
+         checked_count = 0
+         while checked_count < initial_key_count:
+             potential_key = next(key_cycler)
+             if potential_key not in disabled_keys:
+                 return potential_key
+             checked_count += 1
+             # Prevent infinite loop if somehow cycle changes mid-operation (shouldn't happen)
+             if checked_count > initial_key_count * 2:
+                 print("Warning: Potential issue in get_next_active_key cycle detection.")
+                 break
+         # If we exit the loop, all keys have been checked and are disabled
+         if len(disabled_keys) == initial_key_count:
+             raise AllKeysDisabledError("All API keys have been disabled.")
+         else:
+             # This case should ideally not be reached if logic is sound
+             # but indicates a potential problem finding an active key
+             print(f"Warning: Could not find an active key after checking {checked_count}. Disabled: {len(disabled_keys)}/{initial_key_count}")
+             raise RuntimeError("Failed to find an active API key.")
+ # --- End API Key Rotation Setup ---
+
+ # 2. Dataset Paths (User's Original Paths)
+ INPUT_DATASET_DIR = "/root/autodl-tmp/audio-r1/r1-a/dataset/preference_sampling_tasks"  # <<< User's original path
+ OUTPUT_DATASET_DIR = "/root/autodl-tmp/audio-r1/r1-a/dataset/preference_tasks_with_glm"  # <<< User's original path
+
+ # 3. Output Audio Configuration (User's Original Settings)
+ OUTPUT_AUDIO_ROOT_DIR = "/root/autodl-tmp/audio-r1/r1-a/generated_audio/glm_voice"  # <<< User's original path
+ OUTPUT_AUDIO_FORMAT = "wav"  # <<< User's original setting
+ OUTPUT_AUDIO_SAMPLERATE = 44100  # <<< User's original setting
+
+ # 4. API Call Settings (User's Original Settings)
+ API_RETRY_DELAY = 5  # <<< User's original setting
+ API_MAX_RETRIES = 3  # <<< User's original setting
+ MAX_WORKERS = 10  # <<< User's original setting
+
+ # --- Helper Functions (User's Original Functions) ---
+ def encode_audio_base64(audio_path):
+     # ... (implementation unchanged from user's script) ...
+     if not audio_path or not os.path.exists(audio_path):
+         print(f"Warning: Input audio file not found or path is empty: {audio_path}")
+         return None
+     try:
+         with open(audio_path, "rb") as audio_file:
+             return base64.b64encode(audio_file.read()).decode("utf-8")
+     except Exception as e:
+         print(f"Error encoding audio file {audio_path}: {e}")
+         return None
+
+ def parse_ultra_history(history_str):
+     # ... (implementation unchanged from user's script) ...
+     messages = []
+     pattern = re.compile(r"\[(USER|ASSISTANT)\]\s*([\s\S]*?)(?=\s*\[(?:USER|ASSISTANT)\]|$)")
+     matches = pattern.findall(history_str)
+     if not matches:
+         return []  # Return empty list if no matches, as per user's original code
+     for role_tag, content in matches:
+         role = role_tag.lower()
+         cleaned_content = content.strip()
+         if cleaned_content:
+             messages.append({"role": role, "content": cleaned_content})
+     return messages
+
+ # --- Modified API Call Worker Function (Handles Key Disabling & History Flattening) ---
+ def call_glm_voice_api_worker(task_info):
+     """
+     Worker function to call GLM Voice API, handling key disabling for error 1113,
+     and flattening history into the user prompt with clear markers.
+     (Incorporates Method 2 flattening into user's worker structure)
+     """
+     row_idx = task_info["row_idx"]
+     slot_idx = task_info["slot_idx"]
+     current_api_key = task_info["api_key"]
+     history_messages = task_info["history_messages"]  # Original parsed history
+     prompt_text = task_info["prompt_text"]  # The user's current text request
+     question_audio_path = task_info["question_audio_path"]
+     output_audio_filepath = task_info["output_audio_filepath"]
+
+     retries = 0
+     local_glm_client = None
+
+     while retries < API_MAX_RETRIES:
+         # --- Initialize or Re-initialize client (User's Original Logic) ---
+         if local_glm_client is None or getattr(local_glm_client, 'api_key', None) != current_api_key:
+             try:
+                 with key_lock:
+                     if current_api_key in disabled_keys:
+                         print(f"Info (Row {row_idx}, Slot {slot_idx}): Assigned key ...{current_api_key[-6:]} was disabled before use, getting new key.")
+                         current_api_key = get_next_active_key()
+                         task_info["api_key"] = current_api_key  # Update task_info potentially for logging?
+                 print(f"  [Thread-{threading.get_ident()}] Initializing client for Row {row_idx}, Slot {slot_idx} (Key: ...{current_api_key[-6:]})")
+                 local_glm_client = ZhipuAI(api_key=current_api_key)
+             except AllKeysDisabledError:
+                 print(f"FATAL (Row {row_idx}, Slot {slot_idx}): All API keys are disabled. Cannot proceed with task.")
+                 return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[ERROR: All Keys Disabled]", "saved_audio_path": None}
+             except Exception as client_init_e:
+                 print(f"Error (Row {row_idx}, Slot {slot_idx}): Failed to initialize ZhipuAI client with key ...{current_api_key[-6:]}: {client_init_e}")
+                 retries += 1
+                 time.sleep(API_RETRY_DELAY)
+                 continue
+
+         # --- Attempt API Call ---
+         try:
+             # 1. Prepare Input Audio (User's Original Logic)
+             base64_audio_data = encode_audio_base64(question_audio_path)
+             if not base64_audio_data:
+                 # This is a data error, not an API error, fail the task immediately (User's Original Logic)
+                 print(f"Error (Row {row_idx}, Slot {slot_idx}): Skipping GLM API call - missing input audio.")
+                 return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[ERROR: Missing input audio]", "saved_audio_path": None}
+             input_audio_format = os.path.splitext(question_audio_path)[1].lstrip('.') or 'wav'
+
+
+             # 2. *** Flatten History and Construct Combined User Text Prompt (Method 2 Implementation) ***
+             text_parts = []
+             if history_messages:
+                 print(f"  (Row {row_idx}, Slot {slot_idx}) Flattening history ({len(history_messages)} turns) into prompt.")
+                 text_parts.append("--- Start of Conversation History ---")
+                 for msg in history_messages:
+                     role_tag = "[User]" if msg['role'] == 'user' else "[Assistant]"
+                     # Ensure content is string, handle potential non-string data defensively
+                     content_str = str(msg.get('content', '')).strip()
+                     if content_str:  # Avoid adding empty messages
+                         text_parts.append(f"{role_tag}: {content_str}")
+                 text_parts.append("--- End of Conversation History ---")
+                 text_parts.append("\n--- Current Task ---")  # Clear separator
+                 # Explicit instruction referencing history and audio
+                 text_parts.append("Based on the conversation history above and the accompanying audio input, please respond to the following request:")
+             else:
+                 # No history, just provide the current prompt directly
+                 print(f"  (Row {row_idx}, Slot {slot_idx}) No history found. Using prompt directly.")
+                 text_parts.append("--- Current Task ---")
+                 text_parts.append("Please respond to the following request based on the accompanying audio input:")
+
+             # Add the user's actual current request text
+             if prompt_text:  # Only add if not empty
+                 text_parts.append(prompt_text.strip())
+
+             combined_user_text = "\n".join(text_parts)
+             # --- End Flattening Logic ---
+
+
+             # 3. Construct User Message Content List (Text + Audio)
+             user_content_list = [
+                 {"type": "text", "text": combined_user_text},  # Use the combined text
+                 {"type": "input_audio", "input_audio": {"data": base64_audio_data, "format": input_audio_format}}
+             ]
+
+             # 4. Construct Final Messages List (Only the single combined user message)
+             # This replaces the user's original 'messages = history_messages + [{"role": "user", "content": user_content_list}]'
+             messages = [{"role": "user", "content": user_content_list}]
+
+
+             # 5. Make API Call (User's Original Logic)
+             # Optional: print(f"Debug (Row {row_idx}, Slot {slot_idx}): Sending messages structure:\n{json.dumps(messages, indent=2, ensure_ascii=False)}")
+             response = local_glm_client.chat.completions.create(
+                 model=GLM_MODEL_NAME,
+                 messages=messages,  # Send the single, combined user message
+                 stream=False
+                 # Add other parameters like temperature if the user had them originally (they didn't)
+             )
+
+             # 6. Process SUCCESSFUL Response (User's Original Logic -unchanged-)
+             if response and response.choices:
+                 message = response.choices[0].message
+                 collected_text = message.content
+                 audio_info = getattr(message, 'audio', None)  # Use getattr for safety as per user's original code
+                 if audio_info and 'data' in audio_info:
+                     audio_base64_string = audio_info['data']
+                     try:
+                         decoded_data = base64.b64decode(audio_base64_string)
+                         if len(decoded_data) == 0:  # Check after decode (User's Original Check)
+                             print(f"Warning (Row {row_idx}, Slot {slot_idx}, Key ...{current_api_key[-6:]}): GLM returned empty audio data.")
+                             return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text.strip(), "saved_audio_path": None}
+
+                         os.makedirs(os.path.dirname(output_audio_filepath), exist_ok=True)
+                         # Soundfile saving logic (User's Original Logic -unchanged-)
+                         with BytesIO(decoded_data) as bio:
+                             try:
+                                 audio_data, samplerate = sf.read(bio, dtype='int16')
+                             except Exception:
+                                 bio.seek(0)  # Rewind buffer before trying float
+                                 try:
+                                     audio_data_float, samplerate = sf.read(bio, dtype='float32')
+                                     # Convert float to int16
+                                     audio_data = (audio_data_float * 32767).astype(np.int16)
+                                 except Exception as sf_read_err_float:
+                                     print(f"Error (Row {row_idx}, Slot {slot_idx}, Key ...{current_api_key[-6:]}): Soundfile failed to read audio data: {sf_read_err_float}")
+                                     # Return text, audio failed
+                                     return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text.strip(), "saved_audio_path": None}
+
+                         # Use detected samplerate, fallback to configured rate if detection failed
+                         write_samplerate = samplerate if samplerate > 0 else OUTPUT_AUDIO_SAMPLERATE
+                         sf.write(output_audio_filepath, audio_data, write_samplerate)
+
+                         # TASK SUCCEEDED!
+                         return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text.strip(), "saved_audio_path": output_audio_filepath}
+
+                     except base64.binascii.Error as b64_e:
+                         print(f"Error (Row {row_idx}, Slot {slot_idx}, Key ...{current_api_key[-6:]}): GLM b64 decode failed: {b64_e}")
+                         # Return text, audio failed
+                         return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text.strip(), "saved_audio_path": None}
+                     except Exception as e:
+                         print(f"Error (Row {row_idx}, Slot {slot_idx}, Key ...{current_api_key[-6:]}): Saving GLM audio failed: {e}")
+                         # Return text, audio failed
+                         return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text.strip(), "saved_audio_path": None}
+                 else:  # No audio in successful text response (User's Original Logic)
+                     print(f"Warning (Row {row_idx}, Slot {slot_idx}, Key ...{current_api_key[-6:]}): No audio data in GLM response.")
+                     return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text.strip(), "saved_audio_path": None}
+             else:  # Invalid/empty successful response (User's Original Logic)
+                 print(f"Error (Row {row_idx}, Slot {slot_idx}, Key ...{current_api_key[-6:]}): Invalid/empty GLM API response. Response: {response}")
+                 # Treat as a retryable error for the task
+                 retries += 1
+                 time.sleep(API_RETRY_DELAY)
+                 continue  # Go to next iteration of while loop
+
+         # --- Handle API Errors (User's Original Logic -unchanged-) ---
+         except APIStatusError as e:
+             # --- Log the error details ---
+             print(f"Error (Row {row_idx}, Slot {slot_idx}, Key ...{current_api_key[-6:]}): APIStatusError Encountered")
+             print(f"  Status Code: {getattr(e, 'status_code', 'N/A')}")  # Use getattr for safety
+             error_details = getattr(e, 'body', getattr(e, 'message', str(e)))
+             print(f"  Error Details: {error_details}")
+             # --- End Logging ---
+
+             # Check for the specific "account overdue" error (User's Original Logic)
+             is_overdue_error = False
+             status_code = getattr(e, 'status_code', None)
+             # Adjust check to handle both 429 and potential 400 errors with code 1113 in body
+             if status_code == 429 or (status_code == 400 and '1113' in str(error_details)):
+                 try:
+                     error_body = {}
+                     # Try parsing if details look like JSON
+                     if isinstance(error_details, (str, bytes)) and error_details.strip().startswith('{'):
+                         error_body = json.loads(error_details)
+                     elif isinstance(error_details, dict):
+                         error_body = error_details  # If body is already a dict
+
+                     if isinstance(error_body, dict) and str(error_body.get("error", {}).get("code", "")) == "1113":
+                         is_overdue_error = True
+                 except (json.JSONDecodeError, AttributeError):
+                     # Can't parse body or access attributes, assume not the specific error for safety
+                     pass
+                 except Exception as parse_err:
+                     print(f"Warning: Error parsing API error body: {parse_err}")
+
+             if is_overdue_error:
+                 key_to_disable = current_api_key
+                 print(f"Error (Row {row_idx}, Slot {slot_idx}): Account overdue (1113) for Key ...{key_to_disable[-6:]}. Disabling key.")
+                 with key_lock:
+                     disabled_keys.add(key_to_disable)
+                     print(f"  Disabled keys count: {len(disabled_keys)}/{len(ZHIPUAI_API_KEYS)}")
+
+                 # Don't increment retries here, try getting a new key immediately
+                 try:
+                     current_api_key = get_next_active_key()  # Get a new key
+                     print(f"  (Row {row_idx}, Slot {slot_idx}) Switched to new key ...{current_api_key[-6:]} for next attempt.")
+                     local_glm_client = None  # Force re-initialization with new key
+                     continue  # Go immediately to the next iteration of the while loop with the new key
+                 except AllKeysDisabledError:
+                     print(f"FATAL (Row {row_idx}, Slot {slot_idx}): All API keys are disabled after key ...{key_to_disable[-6:]} failed. Cannot retry task.")
+                     # Return failure for this task as no keys are left
+                     return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[ERROR: All Keys Disabled]", "saved_audio_path": None}
+
+             else:
+                 # Other APIStatusError (rate limit, server error, etc.) - treat as retryable
+                 retries += 1
+                 print(f"Error (Row {row_idx}, Slot {slot_idx}, Key ...{current_api_key[-6:]}): GLM API Call Attempt {retries}/{API_MAX_RETRIES} failed: HTTP {status_code}, {error_details}")
+                 if retries < API_MAX_RETRIES:
+                     time.sleep(API_RETRY_DELAY)
+                     # Continue loop to retry with the *same* key (unless it was just disabled above)
+                     continue
+                 else:
+                     print(f"Error (Row {row_idx}, Slot {slot_idx}, Key ...{current_api_key[-6:]}): Max retries reached after API error.")
+                     # Return failure for the task
+                     return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[API ERROR: Max retries after status error]", "saved_audio_path": None}
+
+         except Exception as e:
+             # Handle other unexpected errors during API call or processing (User's Original Logic)
+             retries += 1
+             print(f"Error (Row {row_idx}, Slot {slot_idx}, Key ...{current_api_key[-6:]}): Unexpected Error Attempt {retries}/{API_MAX_RETRIES}: {type(e).__name__} - {e}")
+             print(traceback.format_exc())  # Print traceback for unexpected errors
+             if retries < API_MAX_RETRIES:
+                 time.sleep(API_RETRY_DELAY)
+                 continue  # Continue loop to retry
+             else:
+                 print(f"Error (Row {row_idx}, Slot {slot_idx}, Key ...{current_api_key[-6:]}): Max retries reached after unexpected error.")
+                 # Return failure for the task
+                 return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[API ERROR: Max retries after unexpected error]", "saved_audio_path": None}
+
+     # If loop finishes without returning, max retries were hit (User's Original Logic)
+     print(f"Error (Row {row_idx}, Slot {slot_idx}): Task failed after {API_MAX_RETRIES} attempts (may include key switches).")
+     return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[API ERROR: Max retries reached]", "saved_audio_path": None}
+
+
+ # --- Main Processing Logic (User's Original Logic -unchanged-) ---
+
+ print("Loading dataset...")
+ try:
+     dataset = load_from_disk(INPUT_DATASET_DIR)
+     print(f"Dataset loaded successfully with {len(dataset)} rows from {INPUT_DATASET_DIR}.")
+ except Exception as e:
+     print(f"FATAL: Error loading dataset from {INPUT_DATASET_DIR}: {e}")
+     exit(1)
+
+ os.makedirs(OUTPUT_AUDIO_ROOT_DIR, exist_ok=True)
+
+ # --- Pre-calculation Step for GLM (User's Original Logic -unchanged-) ---
+ print("Pre-calculating GLM tasks and assigning initial API keys...")
+ tasks_to_process = []
+ original_data = list(dataset)  # Convert to list for easier updates later
+ initial_keys_available = True
+
+ for idx, row in enumerate(tqdm(original_data, desc="Scanning dataset for GLM tasks")):
+     if not initial_keys_available:
+         # Stop scanning if we know no keys are left
+         print("Stopping task scanning as no active keys are available.")
+         break
+
+     for i in range(1, 4):
+         model_key = f"model_{i}"
+         response_text_key = f"response_text_{i}"
+         prompt_text_key = f"prompt_text_{i}"
+         model_assigned = row.get(model_key)
+         # Check if response exists and is not empty string (User's original check was just existence)
+         response_text_exists = row.get(response_text_key) is not None and str(row.get(response_text_key)).strip() != ""
+
+
+         if model_assigned == "glm_voice" and not response_text_exists:  # Check using configured model name
+             question_audio_path = row.get('question_audio')
+             # Add check if audio path exists on disk
+             if not question_audio_path or not os.path.exists(question_audio_path):
+                 print(f"Warning (Row {idx}, Slot {i}): Skipping GLM task - Missing or invalid 'question_audio' path: {question_audio_path}")
+                 continue  # Skip this slot if audio is missing
+
+             # --- Get initial active API key (User's Original Logic) ---
+             try:
+                 assigned_key = get_next_active_key()
+             except AllKeysDisabledError:
+                 print("FATAL: All API keys are disabled during initial task scanning. Cannot proceed.")
+                 initial_keys_available = False
+                 break  # Stop processing this row
+             except Exception as key_err:
+                 print(f"FATAL: Error getting initial API key: {key_err}. Stopping.")
+                 initial_keys_available = False
+                 break
+             # ---
+
+             metadata_str = row.get('metadata', "{}")
+             source_dataset = row.get('source_dataset')
+             metadata = {}
+             try:
+                 # Handle case where metadata might already be a dict or is a JSON string
+                 if metadata_str and isinstance(metadata_str, str): metadata = json.loads(metadata_str)
+                 elif isinstance(metadata_str, dict): metadata = metadata_str
+             except json.JSONDecodeError:
+                 print(f"Warning (Row {idx}): Could not parse metadata string: {metadata_str[:100]}...")
+                 pass  # Continue with empty metadata
+
+             # Parse history here - it will be flattened later in the worker
+             history_messages = []
+             if source_dataset == 'ultra':
+                 history_str = metadata.get('history', '')
+                 if history_str: history_messages = parse_ultra_history(history_str)
+
+             unique_id = str(uuid.uuid4()).replace("-", "")
+             output_audio_filename = f"glm_r{idx}_s{i}_{unique_id}.{OUTPUT_AUDIO_FORMAT}"
+             output_audio_filepath = os.path.join(OUTPUT_AUDIO_ROOT_DIR, output_audio_filename)
+
+             task_info = {
+                 "row_idx": idx,
+                 "slot_idx": i,
+                 "api_key": assigned_key,  # Initial key
+                 "history_messages": history_messages,  # Pass the original parsed history
+                 "prompt_text": row.get(prompt_text_key, ""),
+                 "question_audio_path": question_audio_path,
+                 "output_audio_filepath": output_audio_filepath,
+             }
+             tasks_to_process.append(task_info)
+             # Process only the first unfilled GLM slot found per row (User's Implicit Logic)
+             break  # Stop checking slots for this row
+
+     if not initial_keys_available: break  # Exit outer loop too
+
+ total_tasks = len(tasks_to_process)
+ if total_tasks == 0:
+     if not initial_keys_available:
+         print("No tasks processed because all initial keys were disabled.")
+     else:
+         print("No GLM Voice tasks found needing processing.")
+     exit(0)
+
+ print(f"Found {total_tasks} GLM Voice tasks to process using initially {len(ZHIPUAI_API_KEYS)} API keys.")
+ if len(disabled_keys) > 0:  # Should be 0 here, but for safety
+     print(f"Note: {len(disabled_keys)} keys already marked as disabled (should not happen at this stage).")
+
+
+ # --- Threaded Execution for GLM (User's Original Logic -unchanged-) ---
+ print(f"Starting GLM processing with up to {MAX_WORKERS} worker threads...")
+ start_total_time = time.time()
+ results = {}
+ tasks_completed = 0
+ tasks_failed = 0
+ executor_shutdown = False  # Flag to stop submitting new tasks if all keys die
+
+ # Use context manager for ThreadPoolExecutor
+ with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+     # Create futures mapping back to task info for easier result merging
+     future_to_task = {executor.submit(call_glm_voice_api_worker, task): task for task in tasks_to_process}
+
+     for future in tqdm(concurrent.futures.as_completed(future_to_task), total=total_tasks, desc="Processing GLM tasks"):
+         task = future_to_task[future]  # Get the original task info associated with this future
+         row_idx = task["row_idx"]
+         slot_idx = task["slot_idx"]
+         try:
+             result = future.result()  # Get the result from the worker
+             results[(row_idx, slot_idx)] = result  # Store result using (row, slot) tuple as key
+
+             # Check if the task failed because all keys got disabled during its execution
+             if result["response_text"] == "[ERROR: All Keys Disabled]" and not executor_shutdown:
+                 print("\n--- CRITICAL: All Keys Disabled detected during execution. Stopping submission of new tasks. ---")
+                 # Potentially cancel remaining futures if possible/desired
+                 # Note: Standard ThreadPoolExecutor doesn't easily support cancelling submitted tasks
+                 # We will just let running tasks finish but won't submit new ones if we had that logic.
+                 # For now, just set flag and log.
+                 executor_shutdown = True  # Prevent theoretical resubmission logic
+                 tasks_failed += 1  # Count this task as failed
+             # Check for other errors in the result text or missing audio path
+             elif result["saved_audio_path"] is None or "[ERROR" in result["response_text"]:
+                 tasks_failed += 1
+             tasks_completed += 1
+
+         except Exception as exc:  # Catch exceptions raised *by* the future (e.g., if worker itself crashes)
+             print(f"Error (Row {row_idx}, Slot {slot_idx}): GLM Task generated an unhandled exception: {exc}")
+             print(traceback.format_exc())
+             # Store an error result
+             results[(row_idx, slot_idx)] = {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": f"[ERROR: Worker Crash - {type(exc).__name__}]", "saved_audio_path": None}
+             tasks_failed += 1
+             tasks_completed += 1
+         # No finally block needed here unless cleaning up future_to_task is desired
+
+
+ end_total_time = time.time()
+ print("\n--- GLM Processing Complete ---")
+ print(f"Total GLM tasks attempted: {tasks_completed} (Succeeded: {tasks_completed - tasks_failed}, Failed: {tasks_failed})")
+ print(f"Final disabled key count: {len(disabled_keys)}/{len(ZHIPUAI_API_KEYS)}")
+ print(f"Total GLM processing time: {(end_total_time - start_total_time)/60:.2f} minutes")
+
+
+ # --- Merge Results back into the dataset structure (User's Original Logic -unchanged-) ---
+ print("Merging GLM results...")
+ updated_data = original_data  # Use the list created earlier
+ for (row_idx, slot_idx), result in tqdm(results.items(), desc="Merging GLM results"):
+     response_text_key = f"response_text_{slot_idx}"
+     response_audio_key = f"response_audio_path_{slot_idx}"
+     # Check index validity before updating
+     if 0 <= row_idx < len(updated_data):
+         # Ensure the item at the index is a dictionary (it should be if loaded from dataset)
+         if isinstance(updated_data[row_idx], dict):
+             updated_data[row_idx][response_text_key] = result["response_text"]
+             updated_data[row_idx][response_audio_key] = result["saved_audio_path"]
+         else:
+             print(f"Warning: Item at index {row_idx} is not a dictionary. Skipping merge for Slot {slot_idx}.")
+     else:
+         print(f"Warning: Invalid row index {row_idx} encountered during GLM result merge.")
+
+ # --- Save the final updated dataset (User's Original Logic -unchanged, including fallback) ---
+ if updated_data:
+     print(f"\nSaving updated dataset with GLM results to {OUTPUT_DATASET_DIR}...")
+     try:
+         # Use the features from the original loaded dataset if available
+         updated_dataset = Dataset.from_list(updated_data, features=dataset.features if dataset else None)
+         updated_dataset.save_to_disk(OUTPUT_DATASET_DIR)
+         print("Updated dataset saved successfully.")
+     except Exception as final_save_e:
+         print(f"Error saving final dataset using datasets lib: {final_save_e}")
+         print(f"Final disabled key count at save: {len(disabled_keys)}/{len(ZHIPUAI_API_KEYS)}")
+         print("Attempting to save as JSON lines as fallback...")
+         # Fallback to JSON Lines (User's original fallback logic)
+         output_jsonl_path = OUTPUT_DATASET_DIR.rstrip('/') + ".jsonl"  # Ensure no trailing slash before adding extension
+         try:
+             with open(output_jsonl_path, 'w', encoding='utf-8') as f:
+                 for item in updated_data:
+                     # Attempt to make item JSON serializable
+                     serializable_item = {}
+                     for k, v in item.items():
+                         if isinstance(v, (str, int, float, bool, list, dict)) or v is None:
+                             serializable_item[k] = v
+                         elif isinstance(v, np.ndarray):
+                             serializable_item[k] = v.tolist()  # Convert numpy arrays
+                         else:
+                             serializable_item[k] = str(v)  # Convert other types to string as fallback
+                     f.write(json.dumps(serializable_item, ensure_ascii=False) + '\n')
+             print(f"Fallback save successful to {output_jsonl_path}")
+         except Exception as json_save_e:
+             print(f"Error saving as JSON lines: {json_save_e}")
+
+ else:
+     print("No data was available to save (potentially all keys disabled early or no tasks processed).")
r1-a/response_generation/gpt4o.py ADDED
@@ -0,0 +1,464 @@
+ import os
+ import json
+ import base64
+ import uuid
+ import time
+ import re
+ import random
+ import concurrent.futures
+ from tqdm import tqdm
+ import threading
+ import traceback  # For detailed error logging
+
+ import requests  # Use requests library for HTTP calls
+ # Make sure numpy is imported if needed for potential fallback serialization
+ import numpy as np
+ from datasets import load_from_disk, Dataset
+ from dotenv import load_dotenv
+
+ # --- Configuration ---
+ load_dotenv()
+
+ # 1. API Client Setup
+ GPT4O_MODEL_NAME = "gpt4o"  # How it's identified in your dataset's model columns
+ API_MODEL_NAME = "gpt-4o-audio-preview"  # Actual model name for the API call
+ API_ENDPOINT = "https://api.vansai.cn/v1/chat/completions"
+ try:
+     # Assuming a single key for this service based on the original script
+     API_TOKEN = "sk-uOJ27X9jNsYh1PDx1e665b0f92434bEc9bD53bE6D3BaD29a"
+     if not API_TOKEN:
+         raise ValueError("AIGCBEST_API_KEY environment variable not set.")
+     print("AIGCBEST API Key loaded.")
+ except Exception as e:
+     print(f"FATAL: Error getting API Key: {e}")
+     exit(1)
+
+ # 2. Dataset Paths
+ INPUT_DATASET_DIR = "/root/autodl-tmp/audio-r1/r1-a/dataset/preference_sampling_tasks"
+ OUTPUT_DATASET_DIR = "/root/autodl-tmp/audio-r1/r1-a/dataset/preference_tasks_with_gpt4o"
+
+ # 3. Output Audio Configuration
+ OUTPUT_AUDIO_ROOT_DIR = "/root/autodl-tmp/audio-r1/r1-a/generated_audio/gpt4o_2"
+ OUTPUT_AUDIO_FORMAT = "wav"  # API will be requested to return wav
+ AVAILABLE_VOICES = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'sage', 'shimmer', 'verse']
+
+ # 4. API Call Settings
+ API_TIMEOUT = 120
+ API_RETRY_DELAY = 5
+ API_MAX_RETRIES = 3  # Max attempts *for the task*
+ MAX_WORKERS = 8  # Adjust based on API rate limits and system resources
+
+ # 5. Checkpoint Saving Configuration  # <-- NEW
+ CHECKPOINT_INTERVAL = 500  # Save every 500 completed tasks
+
+ # --- Helper Functions (encode_audio_base64 and parse_ultra_history remain the same) ---
+
+ def encode_audio_base64(audio_path):
+     if not audio_path or not os.path.exists(audio_path):
+         print(f"Warning: Input audio file not found or path is empty: {audio_path}")
+         return None
+     try:
+         with open(audio_path, "rb") as audio_file:
+             return base64.b64encode(audio_file.read()).decode("utf-8")
+     except Exception as e:
+         print(f"Error encoding audio file {audio_path}: {e}")
+         return None
+
+ def parse_ultra_history(history_str):
+     messages = []
+     pattern = re.compile(r"\[(USER|ASSISTANT)\]\s*([\s\S]*?)(?=\s*\[(?:USER|ASSISTANT)\]|$)")
+     matches = pattern.findall(history_str)
+     if not matches:
+         return []
+     for role_tag, content in matches:
+         role = role_tag.lower()
+         cleaned_content = content.strip()
+         if cleaned_content:
+             messages.append({"role": role, "content": cleaned_content})
+     return messages
+
+ # --- Modified API Call Worker Function for GPT-4o (Reduced Prints) ---
+ def call_gpt4o_api_worker(task_info):
+     """
+     Worker function to call the custom GPT-4o API for a single task.
+     """
+     row_idx = task_info["row_idx"]
+     slot_idx = task_info["slot_idx"]
+     history_messages = task_info["history_messages"]
+     prompt_text = task_info["prompt_text"]
+     question_text = task_info["question_text"]
+     question_audio_path = task_info["question_audio_path"]
+     output_audio_filepath = task_info["output_audio_filepath"]
+
+     retries = 0
+     headers = {
+         'Accept': 'application/json',
+         'Authorization': f'Bearer {API_TOKEN}',  # Use the single loaded token
+         'Content-Type': 'application/json'
+     }
+     selected_voice = random.choice(AVAILABLE_VOICES)
+     # print(f"  [Thread-{threading.get_ident()}] Processing Row {row_idx}, Slot {slot_idx} (GPT4o Voice: {selected_voice})")  # Optional log
+
+     while retries < API_MAX_RETRIES:
+         try:
+             # 1. Prepare Input Audio
+             base64_audio_data = encode_audio_base64(question_audio_path)
+             if not base64_audio_data:
+                 print(f"Error (Row {row_idx}, Slot {slot_idx}): Skipping GPT4o API call - missing input audio.")
+                 return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[ERROR: Missing input audio]", "saved_audio_path": None}
+
+             input_audio_format = os.path.splitext(question_audio_path)[1].lstrip('.') or 'wav'
+
+             # 2. Construct User Message Content
+             combined_text = f"{prompt_text}"
+             user_content_list = [
+                 {"type": "text", "text": combined_text},
+                 {"type": "input_audio", "input_audio": {"data": base64_audio_data, "format": input_audio_format}}
+             ]
+             messages = history_messages + [{"role": "user", "content": user_content_list}]
+
+             # 4. Construct Payload
+             payload = {
+                 "model": API_MODEL_NAME,
+                 "modalities": ["text", "audio"],
+                 "audio": {"voice": selected_voice, "format": OUTPUT_AUDIO_FORMAT},
+                 "messages": messages
+             }
+
+             # 5. Make API Call
+             response = requests.post(
+                 API_ENDPOINT,
+                 headers=headers,
+                 json=payload,
+                 timeout=API_TIMEOUT
+             )
+
+             # 6. Process Response
+             if response.status_code == 200:
+                 try:
+                     response_data = response.json()
+                     # Make parsing more robust
+                     choices = response_data.get('choices')
+                     if not choices or not isinstance(choices, list) or len(choices) == 0:
+                         raise ValueError("Invalid or empty 'choices' field in response.")
+
+                     message_content = choices[0].get('message', {})
+                     if not message_content:
+                         raise ValueError("Missing 'message' field in the first choice.")
+
+                     audio_info = message_content.get('audio', {})
+                     if not isinstance(audio_info, dict): audio_info = {}  # Handle case where audio might be null or not a dict
+
+                     audio_base64_string = audio_info.get('data', '')
+                     # Try getting text from 'content' if 'transcript' is missing/empty in 'audio'
+                     collected_text = audio_info.get('transcript', '').strip()
+                     if not collected_text:
+                         text_content_list = message_content.get('content', [])
+                         if isinstance(text_content_list, list):
+                             for item in text_content_list:
+                                 if isinstance(item, dict) and item.get("type") == "text":
+                                     collected_text = item.get("text", "").strip()
+                                     break  # Take the first text part found
+                         # Still no text? Try the top-level message content directly if it's a string
+                         elif isinstance(message_content.get('content'), str):
+                             collected_text = message_content['content'].strip()
+
+                     if not collected_text: print(f"Warning (Row {row_idx}, Slot {slot_idx}): No text content found after checking multiple fields.")
+                     if not audio_base64_string: print(f"Warning (Row {row_idx}, Slot {slot_idx}): No audio data found.")
+
+                     saved_audio_path = None
+                     if audio_base64_string:
+                         try:
+                             wav_bytes = base64.b64decode(audio_base64_string)
+                             if len(wav_bytes) == 0:
+                                 print(f"Warning (Row {row_idx}, Slot {slot_idx}): Decoded audio bytes are empty.")
+                             else:
+                                 os.makedirs(os.path.dirname(output_audio_filepath), exist_ok=True)
+                                 with open(output_audio_filepath, "wb") as f:
+                                     f.write(wav_bytes)
+                                 saved_audio_path = output_audio_filepath
+                                 # print(f"  Audio saved to: {output_audio_filepath}")  # Less verbose log
+                         except base64.binascii.Error as b64_err:
+                             print(f"Error (Row {row_idx}, Slot {slot_idx}): Decoding base64 audio data failed: {b64_err}")
+                         except Exception as e:
+                             print(f"Error (Row {row_idx}, Slot {slot_idx}): Saving audio file failed: {e}")
+
+                     # TASK SUCCEEDED (even if audio saving failed, text might be valid)
+                     return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text, "saved_audio_path": saved_audio_path}
+
+                 except (json.JSONDecodeError, IndexError, KeyError, TypeError, ValueError) as e:
+                     print(f"Error (Row {row_idx}, Slot {slot_idx}): Parsing successful API response failed: {type(e).__name__} - {e}")
+                     print(f"  Response Text (start): {response.text[:500]}...")
+                     retries += 1
+                     print(f"  Retrying task ({retries}/{API_MAX_RETRIES})...")
+                     time.sleep(API_RETRY_DELAY)
+                     continue
+                 except Exception as e:  # Catch-all for unexpected errors during processing
+                     print(f"Error (Row {row_idx}, Slot {slot_idx}): Unexpected error processing response: {e}")
+                     print(traceback.format_exc())
+                     retries += 1
+                     print(f"  Retrying task ({retries}/{API_MAX_RETRIES})...")
+                     time.sleep(API_RETRY_DELAY)
+                     continue
+
+             else:  # Handle non-200 status codes
+                 print(f"Error (Row {row_idx}, Slot {slot_idx}): API returned status {response.status_code}. Response: {response.text[:500]}...")
+                 retries += 1
+                 if retries < API_MAX_RETRIES:
+                     print(f"  Retrying task ({retries}/{API_MAX_RETRIES})...")
+                     time.sleep(API_RETRY_DELAY)
+                     continue  # Go to next iteration of while loop
+                 else:
+                     print(f"Error (Row {row_idx}, Slot {slot_idx}): Max retries reached after status {response.status_code}.")
+                     return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": f"[API ERROR: Status {response.status_code}]", "saved_audio_path": None}
+
+
+         except requests.exceptions.Timeout:
+             retries += 1
+             print(f"Error (Row {row_idx}, Slot {slot_idx}): API Call Attempt {retries}/{API_MAX_RETRIES} timed out after {API_TIMEOUT}s.")
+             if retries < API_MAX_RETRIES:
+                 time.sleep(API_RETRY_DELAY)
+                 continue
+             else:
+                 print(f"Error (Row {row_idx}, Slot {slot_idx}): Max retries reached after timeout.")
+                 return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[API ERROR: Timeout]", "saved_audio_path": None}
+
+         except requests.exceptions.RequestException as e:
+             retries += 1
+             print(f"Error (Row {row_idx}, Slot {slot_idx}): Network/Request Error Attempt {retries}/{API_MAX_RETRIES}: {e}")
+             if retries < API_MAX_RETRIES:
+                 time.sleep(API_RETRY_DELAY)
+                 continue
+             else:
+                 print(f"Error (Row {row_idx}, Slot {slot_idx}): Max retries reached after network error.")
+                 return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[API ERROR: Network Error]", "saved_audio_path": None}
+
+         except Exception as e:
+             retries += 1
+             print(f"Error (Row {row_idx}, Slot {slot_idx}): Unexpected Error in Worker Loop Attempt {retries}/{API_MAX_RETRIES}: {type(e).__name__} - {e}")
+             print(traceback.format_exc())
+             if retries < API_MAX_RETRIES:
+                 time.sleep(API_RETRY_DELAY)
+                 continue
+             else:
+                 print(f"Error (Row {row_idx}, Slot {slot_idx}): Max retries reached after unexpected error.")
+                 return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[API ERROR: Unexpected Worker Error]", "saved_audio_path": None}
+
+     # If loop finishes without returning, max retries were hit
+     print(f"Error (Row {row_idx}, Slot {slot_idx}): Task failed after {API_MAX_RETRIES} attempts.")
+     return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[API ERROR: Max retries reached]", "saved_audio_path": None}
+
+ # --- Checkpoint Saving Function ---  # <-- NEW (Copied from previous response)
+ def save_checkpoint(data_to_save, output_dir, dataset_features):
+     """Saves the current state of the data to disk."""
+     if not data_to_save:
+         print("Checkpoint: No data available to save.")
+         return
+
+     # Ensure output directory exists before saving
+     os.makedirs(output_dir, exist_ok=True)
+
+     print(f"\nCheckpoint: Saving {len(data_to_save)} rows to {output_dir}...")
+     try:
+         # Convert list of dicts back to Dataset object
+         checkpoint_dataset = Dataset.from_list(data_to_save, features=dataset_features)
+         checkpoint_dataset.save_to_disk(output_dir)
+         print(f"Checkpoint: Saved successfully to {output_dir}")
+     except Exception as ckpt_save_e:
+         print(f"Error saving checkpoint dataset using datasets lib: {ckpt_save_e}")
+         # Fallback to JSON Lines (optional, but good practice)
+         output_jsonl_path = os.path.join(output_dir, "checkpoint_data.jsonl")  # Save inside the dir
+         print(f"Attempting to save checkpoint as JSON lines to {output_jsonl_path}...")
+         try:
+             with open(output_jsonl_path, 'w', encoding='utf-8') as f:
+                 for item in data_to_save:
+                     # Basic serialization handling for common types like numpy arrays
+                     serializable_item = {k: (v.tolist() if isinstance(v, np.ndarray) else v) for k, v in item.items()}
+                     f.write(json.dumps(serializable_item, ensure_ascii=False) + '\n')
+             print(f"Checkpoint: Fallback save successful to {output_jsonl_path}")
+         except Exception as json_save_e:
+             print(f"Error saving checkpoint as JSON lines: {json_save_e}")
+
+
+ # --- Main Processing Logic ---
+
+ print("Checking for existing checkpoint/output dataset...")
+ dataset = None
+ original_features = None  # Initialize
+
+ try:
+     # Check whether the output directory exists and looks like a Hugging Face
+     # datasets directory (dataset_info.json or state.json are the usual indicator files)
+     potential_checkpoint_info = os.path.join(OUTPUT_DATASET_DIR, "dataset_info.json")
+     potential_checkpoint_state = os.path.join(OUTPUT_DATASET_DIR, "state.json")
+
+     if os.path.exists(OUTPUT_DATASET_DIR) and \
+        (os.path.exists(potential_checkpoint_info) or os.path.exists(potential_checkpoint_state)):
+
+         print(f"Attempting to load existing data from output directory: {OUTPUT_DATASET_DIR}")
+         try:
+             dataset = load_from_disk(OUTPUT_DATASET_DIR)
+             original_features = dataset.features  # Take the features of the already-saved dataset
+             print(f"Successfully resumed from {OUTPUT_DATASET_DIR}. Loaded {len(dataset)} rows.")
+         except Exception as load_ckpt_e:
+             print(f"Warning: Failed to load from {OUTPUT_DATASET_DIR}: {load_ckpt_e}")
+             print("Falling back to loading original input dataset.")
+             dataset = None  # Ensure we proceed to load original if checkpoint load failed
+     else:
+         print(f"No valid existing data found in {OUTPUT_DATASET_DIR}.")
+         # If no checkpoint, ensure dataset is None so original loading happens
+
+     # If dataset is still None (no checkpoint was found, or loading it failed)
+     if dataset is None:
+         print(f"Loading original dataset from {INPUT_DATASET_DIR}...")
+         dataset = load_from_disk(INPUT_DATASET_DIR)
+         original_features = dataset.features
+         print(f"Original dataset loaded successfully with {len(dataset)} rows.")
+
+ except Exception as initial_load_e:
+     print(f"FATAL: Error during initial dataset loading (original or checkpoint): {initial_load_e}")
+     print(traceback.format_exc())  # Print the detailed error
+     exit(1)
+
+ os.makedirs(OUTPUT_AUDIO_ROOT_DIR, exist_ok=True)
+
+ # --- Pre-calculation Step for GPT-4o ---
+ print("Pre-calculating GPT-4o tasks...")
+ tasks_to_process = []
+ # Use a list of dictionaries, which is mutable and easier for direct updates
+ updated_data = list(dataset)
+
+ for idx, row in enumerate(tqdm(updated_data, desc="Scanning dataset for GPT-4o tasks")):
+     for i in range(1, 4):
+         model_key = f"model_{i}"
+         response_text_key = f"response_text_{i}"
+         prompt_text_key = f"prompt_text_{i}"
+         response_audio_key = f"response_audio_path_{i}"  # Key for storing the *new* audio path
+
+         model_assigned = row.get(model_key)
+         response_text_exists = row.get(response_text_key) is not None
+
+         # Check for the specific model name used in the dataset
+         if model_assigned == GPT4O_MODEL_NAME and not response_text_exists:
+             question_audio_path = row.get('question_audio')
+             if not question_audio_path or not os.path.exists(question_audio_path):  # Check path validity here
+                 print(f"Warning (Row {idx}, Slot {i}): Skipping GPT-4o task - Missing or invalid 'question_audio' path: {question_audio_path}")
+                 # Pre-fill error? Let's just skip task creation for now.
+                 # If needed: updated_data[idx][response_text_key] = "[ERROR: Missing input audio]"
+                 # If needed: updated_data[idx][response_audio_key] = None
+                 continue  # Skip this task
+
+             metadata_str = row.get('metadata', "{}")
+             source_dataset = row.get('source_dataset')
+             metadata = {}
+             try:
+                 if metadata_str and isinstance(metadata_str, str): metadata = json.loads(metadata_str)
+                 elif isinstance(metadata_str, dict): metadata = metadata_str
+             except json.JSONDecodeError: pass
+
+             history_messages = []
+             if source_dataset == 'ultra':
+                 history_str = metadata.get('history', '')
+                 if history_str: history_messages = parse_ultra_history(history_str)
+
+             unique_id = str(uuid.uuid4()).replace("-", "")
+             output_audio_filename = f"gpt4o_r{idx}_s{i}_{unique_id}.{OUTPUT_AUDIO_FORMAT}"
+             output_audio_filepath = os.path.join(OUTPUT_AUDIO_ROOT_DIR, output_audio_filename)
+
+             task_info = {
+                 "row_idx": idx,
+                 "slot_idx": i,
+                 # No API key needed here as it's global/single
+                 "history_messages": history_messages,
+                 "prompt_text": row.get(prompt_text_key, ""),
+                 "question_text": row.get('question_text', ""),  # Pass question text
+                 "question_audio_path": question_audio_path,
+                 "output_audio_filepath": output_audio_filepath,
+             }
+             tasks_to_process.append(task_info)
+             # Decide if you process all slots or just the first unfilled one
+             # break  # Uncomment this line if you only want the *first* unfilled gpt4o slot per row processed
+
+ total_tasks = len(tasks_to_process)
+ if total_tasks == 0:
+     print("No GPT-4o tasks found needing processing.")
+     exit(0)
+
+ print(f"Found {total_tasks} GPT-4o tasks to process.")
+
+ # --- Threaded Execution with Checkpointing for GPT-4o ---  # <-- MODIFIED SECTION
+ print(f"Starting GPT-4o processing with up to {MAX_WORKERS} worker threads...")
+ start_total_time = time.time()
+ # results = {}  # No longer needed
+ tasks_completed = 0
+ tasks_failed = 0
+ completed_since_last_save = 0  # <-- Counter for checkpointing
+
+ # Use context manager for ThreadPoolExecutor
+ with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+     future_to_task = {executor.submit(call_gpt4o_api_worker, task): task for task in tasks_to_process}
+
+     for future in tqdm(concurrent.futures.as_completed(future_to_task), total=total_tasks, desc="Processing GPT-4o tasks"):
+         task_info = future_to_task[future]  # Get original task info
+         row_idx = task_info["row_idx"]
+         slot_idx = task_info["slot_idx"]
+         result = None  # Define result scope
+
+         try:
+             result = future.result()
+             # --- Direct Update and Checkpointing Logic ---
+             response_text_key = f"response_text_{slot_idx}"
+             response_audio_key = f"response_audio_path_{slot_idx}"
+
+             if 0 <= row_idx < len(updated_data):
+                 updated_data[row_idx][response_text_key] = result["response_text"]
+                 updated_data[row_idx][response_audio_key] = result["saved_audio_path"]
+                 if result["saved_audio_path"] is None or "[ERROR" in result["response_text"]:  # Check for error marker
+                     tasks_failed += 1
+             else:
+                 print(f"Warning: Invalid row index {row_idx} encountered during result merge. Skipping update.")
+                 tasks_failed += 1  # Count as failed if index is bad
+
+             tasks_completed += 1
+             completed_since_last_save += 1  # Increment checkpoint counter
424
+
425
+ # Check if it's time to save a checkpoint
426
+ if completed_since_last_save >= CHECKPOINT_INTERVAL:
427
+ save_checkpoint(updated_data, OUTPUT_DATASET_DIR, original_features)
428
+ completed_since_last_save = 0 # Reset counter
429
+
430
+ except Exception as exc: # Catch exceptions raised *by* the future/worker if not handled inside
431
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): GPT-4o Task generated an unhandled exception: {exc}")
432
+ print(traceback.format_exc())
433
+ # Attempt to record error in the main data structure
434
+ response_text_key = f"response_text_{slot_idx}"
435
+ response_audio_key = f"response_audio_path_{slot_idx}"
436
+ if 0 <= row_idx < len(updated_data):
437
+ updated_data[row_idx][response_text_key] = f"[ERROR: Worker Crash - {exc}]"
438
+ updated_data[row_idx][response_audio_key] = None
439
+ else:
440
+ print(f"Warning: Invalid row index {row_idx} encountered during exception handling merge.")
441
+
442
+ tasks_failed += 1
443
+ tasks_completed += 1 # Count as completed (though failed)
444
+ completed_since_last_save += 1 # Also increment for checkpointing
445
+
446
+ # Check if it's time to save a checkpoint even after an error
447
+ if completed_since_last_save >= CHECKPOINT_INTERVAL:
448
+ save_checkpoint(updated_data, OUTPUT_DATASET_DIR, original_features)
449
+ completed_since_last_save = 0 # Reset counter
450
+
451
+ end_total_time = time.time()
452
+ print("\n--- GPT-4o Processing Complete ---")
453
+ print(f"Total GPT-4o tasks processed: {tasks_completed} (Succeeded: {tasks_completed - tasks_failed}, Failed: {tasks_failed})")
454
+ print(f"Total GPT-4o processing time: {(end_total_time - start_total_time)/60:.2f} minutes")
455
+
456
+
457
+ # --- Final Save ---
458
+ # Save one last time to ensure any remaining processed items (< CHECKPOINT_INTERVAL) are saved
459
+ print("\nPerforming final save...")
460
+ save_checkpoint(updated_data, OUTPUT_DATASET_DIR, original_features)
461
+
462
+ print("\nScript finished.")
463
+
464
+ # --- (Removed the old merging and saving logic as it's now handled by save_checkpoint) ---
r1-a/response_generation/gpt4o_mini.py ADDED
@@ -0,0 +1,464 @@
1
+ import os
2
+ import json
3
+ import base64
4
+ import uuid
5
+ import time
6
+ import re
7
+ import random
8
+ import concurrent.futures
9
+ from tqdm import tqdm
10
+ import threading
11
+ import traceback # For detailed error logging
12
+
13
+ import requests # Use requests library for HTTP calls
14
+ # Make sure numpy is imported if needed for potential fallback serialization
15
+ import numpy as np
16
+ from datasets import load_from_disk, Dataset
17
+ from dotenv import load_dotenv
18
+
19
+ # --- Configuration ---
20
+ load_dotenv()
21
+
22
+ # 1. API Client Setup
23
+ GPT4O_MODEL_NAME = "freeze_omni" # How it's identified in your dataset's model columns
24
+ API_MODEL_NAME = "gpt-4o-mini-audio-preview" # Actual model name for the API call
25
+ API_ENDPOINT = "https://api2.aigcbest.top/v1/chat/completions"
26
+ try:
27
+ # Assuming a single key for this service; read it from the environment rather than hardcoding a secret
28
+ API_TOKEN = os.getenv("AIGCBEST_API_KEY")
29
+ if not API_TOKEN:
30
+ raise ValueError("AIGCBEST_API_KEY environment variable not set.")
31
+ print("AIGCBEST API Key loaded.")
32
+ except Exception as e:
33
+ print(f"FATAL: Error getting API Key: {e}")
34
+ exit(1)
35
+
36
+ # 2. Dataset Paths
37
+ INPUT_DATASET_DIR = "/root/autodl-tmp/audio-r1/r1-a/dataset/preference_sampling_tasks"
38
+ OUTPUT_DATASET_DIR = "/root/autodl-tmp/audio-r1/r1-a/dataset/preference_tasks_with_gpt4o_mini"
39
+
40
+ # 3. Output Audio Configuration
41
+ OUTPUT_AUDIO_ROOT_DIR = "/root/autodl-tmp/audio-r1/r1-a/generated_audio/gpt4o_mini"
42
+ OUTPUT_AUDIO_FORMAT = "wav" # API will be requested to return wav
43
+ AVAILABLE_VOICES = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'sage', 'shimmer', 'verse']
44
+
45
+ # 4. API Call Settings
46
+ API_TIMEOUT = 240
47
+ API_RETRY_DELAY = 5
48
+ API_MAX_RETRIES = 3 # Max attempts *for the task*
49
+ MAX_WORKERS = 8 # Adjust based on API rate limits and system resources
50
+
51
+ # 5. Checkpoint Saving Configuration # <-- NEW
52
+ CHECKPOINT_INTERVAL = 500 # Save every 500 completed tasks
53
+
54
+ # --- Helper Functions (encode_audio_base64 and parse_ultra_history remain the same) ---
55
+
56
+ def encode_audio_base64(audio_path):
57
+ if not audio_path or not os.path.exists(audio_path):
58
+ print(f"Warning: Input audio file not found or path is empty: {audio_path}")
59
+ return None
60
+ try:
61
+ with open(audio_path, "rb") as audio_file:
62
+ return base64.b64encode(audio_file.read()).decode("utf-8")
63
+ except Exception as e:
64
+ print(f"Error encoding audio file {audio_path}: {e}")
65
+ return None
66
+
67
+ def parse_ultra_history(history_str):
68
+ messages = []
69
+ pattern = re.compile(r"\[(USER|ASSISTANT)\]\s*([\s\S]*?)(?=\s*\[(?:USER|ASSISTANT)\]|$)")
70
+ matches = pattern.findall(history_str)
71
+ if not matches:
72
+ return []
73
+ for role_tag, content in matches:
74
+ role = role_tag.lower()
75
+ cleaned_content = content.strip()
76
+ if cleaned_content:
77
+ messages.append({"role": role, "content": cleaned_content})
78
+ return messages
79
+
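+ # Illustrative example (input format assumed from the regex above):
+ #   parse_ultra_history("[USER] Hi there [ASSISTANT] Hello!")
+ #   -> [{"role": "user", "content": "Hi there"},
+ #       {"role": "assistant", "content": "Hello!"}]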
80
+ # --- Modified API Call Worker Function for GPT-4o (Reduced Prints) ---
81
+ def call_gpt4o_api_worker(task_info):
82
+ """
83
+ Worker function to call the custom GPT-4o API for a single task.
84
+ """
85
+ row_idx = task_info["row_idx"]
86
+ slot_idx = task_info["slot_idx"]
87
+ history_messages = task_info["history_messages"]
88
+ prompt_text = task_info["prompt_text"]
89
+ question_text = task_info["question_text"]
90
+ question_audio_path = task_info["question_audio_path"]
91
+ output_audio_filepath = task_info["output_audio_filepath"]
92
+
93
+ retries = 0
94
+ headers = {
95
+ 'Accept': 'application/json',
96
+ 'Authorization': f'Bearer {API_TOKEN}', # Use the single loaded token
97
+ 'Content-Type': 'application/json'
98
+ }
99
+ selected_voice = random.choice(AVAILABLE_VOICES)
100
+ # print(f" [Thread-{threading.get_ident()}] Processing Row {row_idx}, Slot {slot_idx} (GPT4o Voice: {selected_voice})") # Optional log
101
+
102
+ while retries < API_MAX_RETRIES:
103
+ try:
104
+ # 1. Prepare Input Audio
105
+ base64_audio_data = encode_audio_base64(question_audio_path)
106
+ if not base64_audio_data:
107
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Skipping GPT4o API call - missing input audio.")
108
+ return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[ERROR: Missing input audio]", "saved_audio_path": None}
109
+
110
+ input_audio_format = os.path.splitext(question_audio_path)[1].lstrip('.') or 'wav'
111
+
112
+ # 2. Construct User Message Content
113
+ combined_text = f"{prompt_text}"
114
+ user_content_list = [
115
+ {"type": "text", "text": combined_text},
116
+ {"type": "input_audio", "input_audio": {"data": base64_audio_data, "format": input_audio_format}}
117
+ ]
118
+ messages = history_messages + [{"role": "user", "content": user_content_list}]
119
+
120
+ # 3. Construct Payload
121
+ payload = {
122
+ "model": API_MODEL_NAME,
123
+ "modalities": ["text", "audio"],
124
+ "audio": {"voice": selected_voice, "format": OUTPUT_AUDIO_FORMAT},
125
+ "messages": messages
126
+ }
127
+
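+ # Expected success-response shape, inferred from the parsing logic below
+ # (the exact provider schema is an assumption, not documented here):
+ #   {"choices": [{"message": {"content": ...,
+ #                             "audio": {"data": "<base64 wav>", "transcript": "<text>"}}}]}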
128
+ # 4. Make API Call
129
+ response = requests.post(
130
+ API_ENDPOINT,
131
+ headers=headers,
132
+ json=payload,
133
+ timeout=API_TIMEOUT
134
+ )
135
+
136
+ # 5. Process Response
137
+ if response.status_code == 200:
138
+ try:
139
+ response_data = response.json()
140
+ # Make parsing more robust
141
+ choices = response_data.get('choices')
142
+ if not choices or not isinstance(choices, list) or len(choices) == 0:
143
+ raise ValueError("Invalid or empty 'choices' field in response.")
144
+
145
+ message_content = choices[0].get('message', {})
146
+ if not message_content:
147
+ raise ValueError("Missing 'message' field in the first choice.")
148
+
149
+ audio_info = message_content.get('audio', {})
150
+ if not isinstance(audio_info, dict): audio_info = {} # Handle case where audio might be null or not a dict
151
+
152
+ audio_base64_string = audio_info.get('data', '')
153
+ # Try getting text from 'content' if 'transcript' is missing/empty in 'audio'
154
+ collected_text = audio_info.get('transcript', '').strip()
155
+ if not collected_text:
156
+ text_content_list = message_content.get('content', [])
157
+ if isinstance(text_content_list, list):
158
+ for item in text_content_list:
159
+ if isinstance(item, dict) and item.get("type") == "text":
160
+ collected_text = item.get("text", "").strip()
161
+ break # Take the first text part found
162
+ # Still no text? Try the top-level message content directly if it's a string
163
+ elif isinstance(message_content.get('content'), str):
164
+ collected_text = message_content['content'].strip()
165
+
166
+ if not collected_text: print(f"Warning (Row {row_idx}, Slot {slot_idx}): No text content found after checking multiple fields.")
167
+ if not audio_base64_string: print(f"Warning (Row {row_idx}, Slot {slot_idx}): No audio data found.")
168
+
169
+ saved_audio_path = None
170
+ if audio_base64_string:
171
+ try:
172
+ wav_bytes = base64.b64decode(audio_base64_string)
173
+ if len(wav_bytes) == 0:
174
+ print(f"Warning (Row {row_idx}, Slot {slot_idx}): Decoded audio bytes are empty.")
175
+ else:
176
+ os.makedirs(os.path.dirname(output_audio_filepath), exist_ok=True)
177
+ with open(output_audio_filepath, "wb") as f:
178
+ f.write(wav_bytes)
179
+ saved_audio_path = output_audio_filepath
180
+ # print(f" Audio saved to: {output_audio_filepath}") # Less verbose log
181
+ except base64.binascii.Error as b64_err:
182
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Decoding base64 audio data failed: {b64_err}")
183
+ except Exception as e:
184
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Saving audio file failed: {e}")
185
+
186
+ # TASK SUCCEEDED (even if audio saving failed, text might be valid)
187
+ return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text, "saved_audio_path": saved_audio_path}
188
+
189
+ except (json.JSONDecodeError, IndexError, KeyError, TypeError, ValueError) as e:
190
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Parsing successful API response failed: {type(e).__name__} - {e}")
191
+ print(f" Response Text (start): {response.text[:500]}...")
192
+ retries += 1
193
+ print(f" Retrying task ({retries}/{API_MAX_RETRIES})...")
194
+ time.sleep(API_RETRY_DELAY)
195
+ continue
196
+ except Exception as e: # Catch-all for unexpected errors during processing
197
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Unexpected error processing response: {e}")
198
+ print(traceback.format_exc())
199
+ retries += 1
200
+ print(f" Retrying task ({retries}/{API_MAX_RETRIES})...")
201
+ time.sleep(API_RETRY_DELAY)
202
+ continue
203
+
204
+ else: # Handle non-200 status codes
205
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): API returned status {response.status_code}. Response: {response.text[:500]}...")
206
+ retries += 1
207
+ if retries < API_MAX_RETRIES:
208
+ print(f" Retrying task ({retries}/{API_MAX_RETRIES})...")
209
+ time.sleep(API_RETRY_DELAY)
210
+ continue # Go to next iteration of while loop
211
+ else:
212
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Max retries reached after status {response.status_code}.")
213
+ return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": f"[API ERROR: Status {response.status_code}]", "saved_audio_path": None}
214
+
215
+
216
+ except requests.exceptions.Timeout:
217
+ retries += 1
218
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): API Call Attempt {retries}/{API_MAX_RETRIES} timed out after {API_TIMEOUT}s.")
219
+ if retries < API_MAX_RETRIES:
220
+ time.sleep(API_RETRY_DELAY)
221
+ continue
222
+ else:
223
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Max retries reached after timeout.")
224
+ return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[API ERROR: Timeout]", "saved_audio_path": None}
225
+
226
+ except requests.exceptions.RequestException as e:
227
+ retries += 1
228
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Network/Request Error Attempt {retries}/{API_MAX_RETRIES}: {e}")
229
+ if retries < API_MAX_RETRIES:
230
+ time.sleep(API_RETRY_DELAY)
231
+ continue
232
+ else:
233
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Max retries reached after network error.")
234
+ return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[API ERROR: Network Error]", "saved_audio_path": None}
235
+
236
+ except Exception as e:
237
+ retries += 1
238
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Unexpected Error in Worker Loop Attempt {retries}/{API_MAX_RETRIES}: {type(e).__name__} - {e}")
239
+ print(traceback.format_exc())
240
+ if retries < API_MAX_RETRIES:
241
+ time.sleep(API_RETRY_DELAY)
242
+ continue
243
+ else:
244
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Max retries reached after unexpected error.")
245
+ return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[API ERROR: Unexpected Worker Error]", "saved_audio_path": None}
246
+
247
+ # If loop finishes without returning, max retries were hit
248
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Task failed after {API_MAX_RETRIES} attempts.")
249
+ return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[API ERROR: Max retries reached]", "saved_audio_path": None}
250
+
251
+ # --- Checkpoint Saving Function --- # <-- NEW (copied from the previous script)
252
+ def save_checkpoint(data_to_save, output_dir, dataset_features):
253
+ """Saves the current state of the data to disk."""
254
+ if not data_to_save:
255
+ print("Checkpoint: No data available to save.")
256
+ return
257
+
258
+ # Ensure output directory exists before saving
259
+ os.makedirs(output_dir, exist_ok=True)
260
+
261
+ print(f"\nCheckpoint: Saving {len(data_to_save)} rows to {output_dir}...")
262
+ try:
263
+ # Convert list of dicts back to Dataset object
264
+ checkpoint_dataset = Dataset.from_list(data_to_save, features=dataset_features)
265
+ checkpoint_dataset.save_to_disk(output_dir)
266
+ print(f"Checkpoint: Saved successfully to {output_dir}")
267
+ except Exception as ckpt_save_e:
268
+ print(f"Error saving checkpoint dataset using datasets lib: {ckpt_save_e}")
269
+ # Fallback to JSON Lines (optional, but good practice)
270
+ output_jsonl_path = os.path.join(output_dir, "checkpoint_data.jsonl") # Save inside the dir
271
+ print(f"Attempting to save checkpoint as JSON lines to {output_jsonl_path}...")
272
+ try:
273
+ with open(output_jsonl_path, 'w', encoding='utf-8') as f:
274
+ for item in data_to_save:
275
+ # Basic serialization handling for common types like numpy arrays
276
+ serializable_item = {k: (v.tolist() if isinstance(v, np.ndarray) else v) for k, v in item.items()}
277
+ f.write(json.dumps(serializable_item, ensure_ascii=False) + '\n')
278
+ print(f"Checkpoint: Fallback save successful to {output_jsonl_path}")
279
+ except Exception as json_save_e:
280
+ print(f"Error saving checkpoint as JSON lines: {json_save_e}")
281
+
282
+
283
+ # --- Main Processing Logic ---
284
+
285
+ print("Checking for existing checkpoint/output dataset...")
286
+ dataset = None
287
+ original_features = None # Initialize
288
+
289
+ try:
290
+ # Check whether the output directory exists and looks like a Hugging Face datasets directory
291
+ # (dataset_info.json or state.json are the usual indicator files)
292
+ potential_checkpoint_info = os.path.join(OUTPUT_DATASET_DIR, "dataset_info.json")
293
+ potential_checkpoint_state = os.path.join(OUTPUT_DATASET_DIR, "state.json")
294
+
295
+ if os.path.exists(OUTPUT_DATASET_DIR) and \
296
+ (os.path.exists(potential_checkpoint_info) or os.path.exists(potential_checkpoint_state)):
297
+
298
+ print(f"Attempting to load existing data from output directory: {OUTPUT_DATASET_DIR}")
299
+ try:
300
+ dataset = load_from_disk(OUTPUT_DATASET_DIR)
301
+ original_features = dataset.features # Capture the features of the previously saved dataset
302
+ print(f"Successfully resumed from {OUTPUT_DATASET_DIR}. Loaded {len(dataset)} rows.")
303
+ except Exception as load_ckpt_e:
304
+ print(f"Warning: Failed to load from {OUTPUT_DATASET_DIR}: {load_ckpt_e}")
305
+ print("Falling back to loading original input dataset.")
306
+ dataset = None # Ensure we proceed to load original if checkpoint load failed
307
+ else:
308
+ print(f"No valid existing data found in {OUTPUT_DATASET_DIR}.")
309
+ # If no checkpoint, ensure dataset is None so original loading happens
310
+
311
+ # If dataset is still None (no checkpoint was found, or loading it failed)
312
+ if dataset is None:
313
+ print(f"Loading original dataset from {INPUT_DATASET_DIR}...")
314
+ dataset = load_from_disk(INPUT_DATASET_DIR)
315
+ original_features = dataset.features
316
+ print(f"Original dataset loaded successfully with {len(dataset)} rows.")
317
+
318
+ except Exception as initial_load_e:
319
+ print(f"FATAL: Error during initial dataset loading (original or checkpoint): {initial_load_e}")
320
+ print(traceback.format_exc()) # Print the detailed error traceback
321
+ exit(1)
322
+
323
+ os.makedirs(OUTPUT_AUDIO_ROOT_DIR, exist_ok=True)
324
+
325
+ # --- Pre-calculation Step for GPT-4o ---
326
+ print("Pre-calculating GPT-4o tasks...")
327
+ tasks_to_process = []
328
+ # Use a list of dictionaries, which is mutable and easier for direct updates
329
+ updated_data = list(dataset)
330
+
331
+ for idx, row in enumerate(tqdm(updated_data, desc="Scanning dataset for GPT-4o tasks")):
332
+ for i in range(1, 4):
333
+ model_key = f"model_{i}"
334
+ response_text_key = f"response_text_{i}"
335
+ prompt_text_key = f"prompt_text_{i}"
336
+ response_audio_key = f"response_audio_path_{i}" # Key for storing the *new* audio path
337
+
338
+ model_assigned = row.get(model_key)
339
+ response_text_exists = row.get(response_text_key) is not None
340
+
341
+ # Check for the specific model name used in the dataset
342
+ if model_assigned == GPT4O_MODEL_NAME and not response_text_exists:
343
+ question_audio_path = row.get('question_audio')
344
+ if not question_audio_path or not os.path.exists(question_audio_path): # Check path validity here
345
+ print(f"Warning (Row {idx}, Slot {i}): Skipping GPT-4o task - Missing or invalid 'question_audio' path: {question_audio_path}")
346
+ # Pre-fill error? Let's just skip task creation for now.
347
+ # If needed: updated_data[idx][response_text_key] = "[ERROR: Missing input audio]"
348
+ # If needed: updated_data[idx][response_audio_key] = None
349
+ continue # Skip this task
350
+
351
+ metadata_str = row.get('metadata', "{}")
352
+ source_dataset = row.get('source_dataset')
353
+ metadata = {}
354
+ try:
355
+ if metadata_str and isinstance(metadata_str, str): metadata = json.loads(metadata_str)
356
+ elif isinstance(metadata_str, dict): metadata = metadata_str
357
+ except json.JSONDecodeError: pass
358
+
359
+ history_messages = []
360
+ if source_dataset == 'ultra':
361
+ history_str = metadata.get('history', '')
362
+ if history_str: history_messages = parse_ultra_history(history_str)
363
+
364
+ unique_id = str(uuid.uuid4()).replace("-", "")
365
+ output_audio_filename = f"gpt4o_r{idx}_s{i}_{unique_id}.{OUTPUT_AUDIO_FORMAT}"
366
+ output_audio_filepath = os.path.join(OUTPUT_AUDIO_ROOT_DIR, output_audio_filename)
367
+
368
+ task_info = {
369
+ "row_idx": idx,
370
+ "slot_idx": i,
371
+ # No API key needed here as it's global/single
372
+ "history_messages": history_messages,
373
+ "prompt_text": row.get(prompt_text_key, ""),
374
+ "question_text": row.get('question_text', ""), # Pass question text
375
+ "question_audio_path": question_audio_path,
376
+ "output_audio_filepath": output_audio_filepath,
377
+ }
378
+ tasks_to_process.append(task_info)
379
+ # Decide if you process all slots or just the first unfilled one
380
+ # break # Uncomment this line if you only want the *first* unfilled gpt4o slot per row processed
381
+
382
+ total_tasks = len(tasks_to_process)
383
+ if total_tasks == 0:
384
+ print("No GPT-4o tasks found needing processing.")
385
+ exit(0)
386
+
387
+ print(f"Found {total_tasks} GPT-4o tasks to process.")
388
+
389
+ # --- Threaded Execution with Checkpointing for GPT-4o --- # <-- MODIFIED SECTION
390
+ print(f"Starting GPT-4o processing with up to {MAX_WORKERS} worker threads...")
391
+ start_total_time = time.time()
392
+ # results = {} # No longer needed
393
+ tasks_completed = 0
394
+ tasks_failed = 0
395
+ completed_since_last_save = 0 # <-- Counter for checkpointing
396
+
397
+ # Use context manager for ThreadPoolExecutor
398
+ with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
399
+ future_to_task = {executor.submit(call_gpt4o_api_worker, task): task for task in tasks_to_process}
400
+
401
+ for future in tqdm(concurrent.futures.as_completed(future_to_task), total=total_tasks, desc="Processing GPT-4o tasks"):
402
+ task_info = future_to_task[future] # Get original task info
403
+ row_idx = task_info["row_idx"]
404
+ slot_idx = task_info["slot_idx"]
405
+ result = None # Define result scope
406
+
407
+ try:
408
+ result = future.result()
409
+ # --- Direct Update and Checkpointing Logic ---
410
+ response_text_key = f"response_text_{slot_idx}"
411
+ response_audio_key = f"response_audio_path_{slot_idx}"
412
+
413
+ if 0 <= row_idx < len(updated_data):
414
+ updated_data[row_idx][response_text_key] = result["response_text"]
415
+ updated_data[row_idx][response_audio_key] = result["saved_audio_path"]
416
+ if result["saved_audio_path"] is None or "[ERROR" in result["response_text"]: # Check for error marker
417
+ tasks_failed += 1
418
+ else:
419
+ print(f"Warning: Invalid row index {row_idx} encountered during result merge. Skipping update.")
420
+ tasks_failed += 1 # Count as failed if index is bad
421
+
422
+ tasks_completed += 1
423
+ completed_since_last_save += 1 # Increment checkpoint counter
424
+
425
+ # Check if it's time to save a checkpoint
426
+ if completed_since_last_save >= CHECKPOINT_INTERVAL:
427
+ save_checkpoint(updated_data, OUTPUT_DATASET_DIR, original_features)
428
+ completed_since_last_save = 0 # Reset counter
429
+
430
+ except Exception as exc: # Catch exceptions raised *by* the future/worker if not handled inside
431
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): GPT-4o Task generated an unhandled exception: {exc}")
432
+ print(traceback.format_exc())
433
+ # Attempt to record error in the main data structure
434
+ response_text_key = f"response_text_{slot_idx}"
435
+ response_audio_key = f"response_audio_path_{slot_idx}"
436
+ if 0 <= row_idx < len(updated_data):
437
+ updated_data[row_idx][response_text_key] = f"[ERROR: Worker Crash - {exc}]"
438
+ updated_data[row_idx][response_audio_key] = None
439
+ else:
440
+ print(f"Warning: Invalid row index {row_idx} encountered during exception handling merge.")
441
+
442
+ tasks_failed += 1
443
+ tasks_completed += 1 # Count as completed (though failed)
444
+ completed_since_last_save += 1 # Also increment for checkpointing
445
+
446
+ # Check if it's time to save a checkpoint even after an error
447
+ if completed_since_last_save >= CHECKPOINT_INTERVAL:
448
+ save_checkpoint(updated_data, OUTPUT_DATASET_DIR, original_features)
449
+ completed_since_last_save = 0 # Reset counter
450
+
451
+ end_total_time = time.time()
452
+ print("\n--- GPT-4o Processing Complete ---")
453
+ print(f"Total GPT-4o tasks processed: {tasks_completed} (Succeeded: {tasks_completed - tasks_failed}, Failed: {tasks_failed})")
454
+ print(f"Total GPT-4o processing time: {(end_total_time - start_total_time)/60:.2f} minutes")
455
+
456
+
457
+ # --- Final Save ---
458
+ # Save one last time to ensure any remaining processed items (< CHECKPOINT_INTERVAL) are saved
459
+ print("\nPerforming final save...")
460
+ save_checkpoint(updated_data, OUTPUT_DATASET_DIR, original_features)
461
+
462
+ print("\nScript finished.")
463
+
464
+ # --- (Removed the old merging and saving logic as it's now handled by save_checkpoint) ---
r1-a/response_generation/gpt5o_retry.py ADDED
@@ -0,0 +1,461 @@
1
+ import os
2
+ import json
3
+ import base64
4
+ import uuid
5
+ import time
6
+ import re
7
+ import random
8
+ import concurrent.futures
9
+ from tqdm import tqdm
10
+ import threading
11
+ import traceback # For detailed error logging
12
+
13
+ import requests # Use requests library for HTTP calls
14
+ import numpy as np # Import numpy for potential fallback serialization
15
+ from datasets import load_from_disk, Dataset
16
+ from dotenv import load_dotenv
17
+
18
+ # --- Configuration ---
19
+ load_dotenv()
20
+
21
+ # --- !!! KEY CONFIGURATION FOR RETRY SCRIPT !!! ---
22
+
23
+ # 1. Identify the model you are retrying
24
+ TARGET_MODEL_NAME = "gpt4o" # Or "qwen_omni" if retrying Qwen
25
+
26
+ # 2. Set the INPUT/OUTPUT dataset directory to the PREVIOUS script's OUTPUT directory
27
+ # This is where the partially processed data (with errors) resides.
28
+ # The script will LOAD from here and SAVE back to here.
29
+ DATASET_DIR = "/root/autodl-tmp/audio-r1/r1-a/dataset/preference_tasks_with_gpt4o" # Adjust if needed
30
+
31
+ # 3. Set the audio output directory (can be the same as before)
32
+ OUTPUT_AUDIO_ROOT_DIR = "/root/autodl-tmp/audio-r1/r1-a/generated_audio/gpt4o_2" # Adjust if needed
33
+
34
+ # 4. API Configuration (Specific to the model being retried)
35
+ API_MODEL_NAME = "gpt-4o-audio-preview" # Actual model name for the API call
36
+ API_ENDPOINT = "https://api2.aigcbest.top/v1/chat/completions"
37
+ try:
38
+ API_TOKEN = "sk-D6jMssP7AZw3ZU6LEZaljdNMO1zif6wzef6XVh4kOgZAhQzI" # Use the correct key
39
+ if not API_TOKEN:
40
+ raise ValueError("API_TOKEN environment variable not set.")
41
+ print(f"{TARGET_MODEL_NAME} API Key loaded.")
42
+ except Exception as e:
43
+ print(f"FATAL: Error getting API Key: {e}")
44
+ exit(1)
45
+
46
+ # 5. Output Audio Configuration (Specific to the model being retried)
47
+ OUTPUT_AUDIO_FORMAT = "wav"
48
+ AVAILABLE_VOICES = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'sage', 'shimmer', 'verse'] # GPT-4o voices
49
+
50
+ # 6. API Call Settings
51
+ API_TIMEOUT = 120
52
+ API_RETRY_DELAY = 5
53
+ API_MAX_RETRIES = 3
54
+ MAX_WORKERS = 8
55
+
56
+ # 7. Checkpoint Saving Configuration
57
+ CHECKPOINT_INTERVAL = 50 # Save every 50 *retried* tasks completed
58
+
59
+ # --- Error Markers to Look For ---
60
+ # These prefixes indicate a failed task that needs retrying
61
+ ERROR_MARKERS = ("[API ERROR", "[ERROR")
62
+
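+ # str.startswith accepts a tuple, so a single check covers every marker prefix, e.g.:
+ #   "[API ERROR: Timeout]".startswith(ERROR_MARKERS)  -> True
+ #   "A normal answer".startswith(ERROR_MARKERS)       -> False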
63
+ # --- Helper Functions (encode_audio_base64, parse_ultra_history - unchanged) ---
64
+
65
+ def encode_audio_base64(audio_path):
66
+ if not audio_path or not os.path.exists(audio_path):
67
+ print(f"Warning: Input audio file not found or path is empty: {audio_path}")
68
+ return None
69
+ try:
70
+ with open(audio_path, "rb") as audio_file:
71
+ return base64.b64encode(audio_file.read()).decode("utf-8")
72
+ except Exception as e:
73
+ print(f"Error encoding audio file {audio_path}: {e}")
74
+ return None
75
+
76
+ def parse_ultra_history(history_str):
77
+ messages = []
78
+ pattern = re.compile(r"\[(USER|ASSISTANT)\]\s*([\s\S]*?)(?=\s*\[(?:USER|ASSISTANT)\]|$)")
79
+ matches = pattern.findall(history_str)
80
+ if not matches:
81
+ return []
82
+ for role_tag, content in matches:
83
+ role = role_tag.lower()
84
+ cleaned_content = content.strip()
85
+ if cleaned_content:
86
+ messages.append({"role": role, "content": cleaned_content})
87
+ return messages
88
+
89
+ # --- API Call Worker Function (Use the correct one for the target model - GPT-4o version shown) ---
90
+ # --- (This function call_gpt4o_api_worker is copied directly from the previous script) ---
91
+ def call_gpt4o_api_worker(task_info):
92
+ """
93
+ Worker function to call the custom GPT-4o API for a single task.
94
+ (Identical to the function in the previous script)
95
+ """
96
+ row_idx = task_info["row_idx"]
97
+ slot_idx = task_info["slot_idx"]
98
+ history_messages = task_info["history_messages"]
99
+ prompt_text = task_info["prompt_text"]
100
+ question_text = task_info["question_text"]
101
+ question_audio_path = task_info["question_audio_path"]
102
+ output_audio_filepath = task_info["output_audio_filepath"]
103
+
104
+ retries = 0
105
+ headers = {
106
+ 'Accept': 'application/json',
107
+ 'Authorization': f'Bearer {API_TOKEN}', # Use the single loaded token
108
+ 'Content-Type': 'application/json'
109
+ }
110
+ selected_voice = random.choice(AVAILABLE_VOICES)
111
+
112
+ while retries < API_MAX_RETRIES:
113
+ try:
114
+ # 1. Prepare Input Audio
115
+ base64_audio_data = encode_audio_base64(question_audio_path)
116
+ if not base64_audio_data:
117
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Skipping GPT4o API call - missing input audio.")
118
+ return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[ERROR: Missing input audio]", "saved_audio_path": None}
119
+
120
+ input_audio_format = os.path.splitext(question_audio_path)[1].lstrip('.') or 'wav'
121
+
122
+ # 2. Construct User Message Content
123
+ combined_text = f"{prompt_text}"
124
+ user_content_list = [
125
+ {"type": "text", "text": combined_text},
126
+ {"type": "input_audio", "input_audio": {"data": base64_audio_data, "format": input_audio_format}}
127
+ ]
128
+ messages = history_messages + [{"role": "user", "content": user_content_list}]
129
+
130
+ # 3. Construct Payload
131
+ payload = {
132
+ "model": API_MODEL_NAME,
133
+ "modalities": ["text", "audio"],
134
+ "audio": {"voice": selected_voice, "format": OUTPUT_AUDIO_FORMAT},
135
+ "messages": messages
136
+ }
137
+
138
+ # 4. Make API Call
139
+ response = requests.post(
140
+ API_ENDPOINT,
141
+ headers=headers,
142
+ json=payload,
143
+ timeout=API_TIMEOUT
144
+ )
145
+
146
+ # 5. Process Response
147
+ if response.status_code == 200:
148
+ try:
149
+ response_data = response.json()
150
+ choices = response_data.get('choices')
151
+ if not choices or not isinstance(choices, list) or len(choices) == 0:
152
+ raise ValueError("Invalid or empty 'choices' field in response.")
153
+ message_content = choices[0].get('message', {})
154
+ if not message_content:
155
+ raise ValueError("Missing 'message' field in the first choice.")
156
+ audio_info = message_content.get('audio', {})
157
+ if not isinstance(audio_info, dict): audio_info = {}
158
+
159
+ audio_base64_string = audio_info.get('data', '')
160
+ collected_text = audio_info.get('transcript', '').strip()
161
+ if not collected_text:
162
+ text_content_list = message_content.get('content', [])
163
+ if isinstance(text_content_list, list):
164
+ for item in text_content_list:
165
+ if isinstance(item, dict) and item.get("type") == "text":
166
+ collected_text = item.get("text", "").strip()
167
+ break
168
+ elif isinstance(message_content.get('content'), str):
169
+ collected_text = message_content['content'].strip()
170
+
171
+ if not collected_text: print(f"Warning (Row {row_idx}, Slot {slot_idx}): No text content found after checking multiple fields.")
172
+ if not audio_base64_string: print(f"Warning (Row {row_idx}, Slot {slot_idx}): No audio data found.")
173
+
174
+ saved_audio_path = None
175
+ if audio_base64_string:
176
+ try:
177
+ wav_bytes = base64.b64decode(audio_base64_string)
178
+ if len(wav_bytes) == 0:
179
+ print(f"Warning (Row {row_idx}, Slot {slot_idx}): Decoded audio bytes are empty.")
180
+ else:
181
+ os.makedirs(os.path.dirname(output_audio_filepath), exist_ok=True)
182
+ with open(output_audio_filepath, "wb") as f:
183
+ f.write(wav_bytes)
184
+ saved_audio_path = output_audio_filepath
185
+ except base64.binascii.Error as b64_err:
186
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Decoding base64 audio data failed: {b64_err}")
187
+ except Exception as e:
188
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Saving audio file failed: {e}")
189
+
190
+ return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text, "saved_audio_path": saved_audio_path}
191
+
192
+ except (json.JSONDecodeError, IndexError, KeyError, TypeError, ValueError) as e:
193
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Parsing successful API response failed: {type(e).__name__} - {e}")
194
+ print(f" Response Text (start): {response.text[:500]}...")
195
+ retries += 1
196
+ print(f" Retrying task ({retries}/{API_MAX_RETRIES})...")
197
+ time.sleep(API_RETRY_DELAY)
198
+ continue
199
+ except Exception as e:
200
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Unexpected error processing response: {e}")
201
+ print(traceback.format_exc())
202
+ retries += 1
203
+ print(f" Retrying task ({retries}/{API_MAX_RETRIES})...")
204
+ time.sleep(API_RETRY_DELAY)
205
+ continue
206
+
207
+ else: # Handle non-200 status codes
208
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): API returned status {response.status_code}. Response: {response.text[:500]}...")
209
+ retries += 1
210
+ if retries < API_MAX_RETRIES:
211
+ print(f" Retrying task ({retries}/{API_MAX_RETRIES})...")
212
+ time.sleep(API_RETRY_DELAY)
213
+ continue
214
+ else:
215
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Max retries reached after status {response.status_code}.")
216
+ return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": f"[API ERROR: Status {response.status_code}]", "saved_audio_path": None}
217
+
218
+ except requests.exceptions.Timeout:
219
+ retries += 1
220
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): API Call Attempt {retries}/{API_MAX_RETRIES} timed out after {API_TIMEOUT}s.")
221
+ if retries < API_MAX_RETRIES:
222
+ time.sleep(API_RETRY_DELAY)
223
+ continue
224
+ else:
225
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Max retries reached after timeout.")
226
+ return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[API ERROR: Timeout]", "saved_audio_path": None}
227
+ except requests.exceptions.RequestException as e:
228
+ retries += 1
229
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Network/Request Error Attempt {retries}/{API_MAX_RETRIES}: {e}")
230
+ if retries < API_MAX_RETRIES:
231
+ time.sleep(API_RETRY_DELAY)
232
+ continue
233
+ else:
234
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Max retries reached after network error.")
235
+ return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[API ERROR: Network Error]", "saved_audio_path": None}
236
+ except Exception as e:
237
+ retries += 1
238
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Unexpected Error in Worker Loop Attempt {retries}/{API_MAX_RETRIES}: {type(e).__name__} - {e}")
239
+ print(traceback.format_exc())
240
+ if retries < API_MAX_RETRIES:
241
+ time.sleep(API_RETRY_DELAY)
242
+ continue
243
+ else:
244
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Max retries reached after unexpected error.")
245
+ return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[API ERROR: Unexpected Worker Error]", "saved_audio_path": None}
246
+
247
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Task failed after {API_MAX_RETRIES} attempts (Worker Loop Exited).")
248
+ return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[API ERROR: Max retries reached]", "saved_audio_path": None}
249
+
250
+
251
+ # --- Checkpoint Saving Function (Unchanged) ---
252
+ def save_checkpoint(data_to_save, output_dir, dataset_features):
253
+ """Saves the current state of the data to disk."""
254
+ if not data_to_save:
255
+ print("Checkpoint: No data available to save.")
256
+ return
257
+ os.makedirs(output_dir, exist_ok=True)
258
+ print(f"\nCheckpoint: Saving {len(data_to_save)} rows to {output_dir}...")
259
+ try:
260
+ checkpoint_dataset = Dataset.from_list(data_to_save, features=dataset_features)
261
+ checkpoint_dataset.save_to_disk(output_dir)
262
+ print(f"Checkpoint: Saved successfully to {output_dir}")
263
+ except Exception as ckpt_save_e:
264
+ print(f"Error saving checkpoint dataset using datasets lib: {ckpt_save_e}")
265
+ output_jsonl_path = os.path.join(output_dir, "checkpoint_data.jsonl")
266
+ print(f"Attempting to save checkpoint as JSON lines to {output_jsonl_path}...")
267
+ try:
268
+ with open(output_jsonl_path, 'w', encoding='utf-8') as f:
269
+ for item in data_to_save:
270
+ serializable_item = {k: (v.tolist() if isinstance(v, np.ndarray) else v) for k, v in item.items()}
271
+ f.write(json.dumps(serializable_item, ensure_ascii=False) + '\n')
272
+ print(f"Checkpoint: Fallback save successful to {output_jsonl_path}")
273
+ except Exception as json_save_e:
274
+ print(f"Error saving checkpoint as JSON lines: {json_save_e}")
275
+
276
+ # --- Main Processing Logic (Retry Focus) ---
277
+
278
+ print(f"--- Starting Retry Script for {TARGET_MODEL_NAME} ---")
279
+ print(f"Loading dataset to retry from: {DATASET_DIR}")
280
+
281
+ try:
282
+ # Attempt to load the dataset from the specified directory
283
+ if not os.path.exists(DATASET_DIR) or \
284
+ not (os.path.exists(os.path.join(DATASET_DIR, "dataset_info.json")) or \
285
+ os.path.exists(os.path.join(DATASET_DIR, "state.json"))):
286
+ print(f"FATAL: Dataset directory not found or invalid: {DATASET_DIR}")
287
+ print("Please ensure this path points to the OUTPUT directory of the previous script run.")
288
+ exit(1)
289
+
290
+ dataset = load_from_disk(DATASET_DIR)
291
+ original_features = dataset.features # Store features for saving
292
+ print(f"Dataset loaded successfully with {len(dataset)} rows.")
293
+
294
+ except Exception as e:
295
+ print(f"FATAL: Error loading dataset from {DATASET_DIR}: {e}")
296
+ print(traceback.format_exc())
297
+ exit(1)
298
+
299
+ # Ensure audio output directory exists
300
+ os.makedirs(OUTPUT_AUDIO_ROOT_DIR, exist_ok=True)
301
+
302
+ # --- Pre-calculation Step for Retrying Failed Tasks ---
303
+ print(f"Scanning dataset for failed {TARGET_MODEL_NAME} tasks to retry...")
304
+ tasks_to_process = []
305
+ # Use a list of dictionaries, which is mutable and easier for direct updates
306
+ updated_data = list(dataset) # Load data into memory for modification
307
+
308
+ for idx, row in enumerate(tqdm(updated_data, desc=f"Scanning for failed {TARGET_MODEL_NAME} tasks")):
309
+ for i in range(1, 4): # Check slots 1, 2, 3
310
+ model_key = f"model_{i}"
311
+ response_text_key = f"response_text_{i}"
312
+ prompt_text_key = f"prompt_text_{i}"
313
+ response_audio_key = f"response_audio_path_{i}"
314
+
315
+ model_assigned = row.get(model_key)
316
+ response_text_value = row.get(response_text_key)
317
+
318
+ # --- Core Retry Logic ---
319
+ # Check if the model assigned matches the one we are retrying
320
+ if model_assigned == TARGET_MODEL_NAME:
321
+ # Check if the response text indicates an error
322
+ is_error = False
323
+ if isinstance(response_text_value, str):
324
+ cleaned_text = response_text_value.strip()
325
+ if cleaned_text.startswith(ERROR_MARKERS): # Check if it starts with any error prefix
326
+ is_error = True
327
+ # Optional: You might also want to retry if text is None or empty,
328
+ # but the primary goal is retrying explicit errors.
329
+ # elif response_text_value is None or response_text_value == "":
330
+ # is_error = True # Uncomment if needed
331
+
332
+ if is_error:
333
+ print(f"\nInfo (Row {idx}, Slot {i}): Found failed task to retry. Current text: '{str(response_text_value)[:100]}...'") # Log finding
334
+
335
+ # --- Gather info needed for the task (same as original script) ---
336
+ question_audio_path = row.get('question_audio')
337
+ if not question_audio_path or not os.path.exists(question_audio_path):
338
+ print(f"Warning (Row {idx}, Slot {i}): Skipping retry - Missing or invalid 'question_audio' path: {question_audio_path}")
339
+ # Keep the old error message in updated_data for this case
340
+ continue # Skip this specific task retry
341
+
342
+ metadata_str = row.get('metadata', "{}")
343
+ source_dataset = row.get('source_dataset')
344
+ metadata = {}
345
+ try:
346
+ if metadata_str and isinstance(metadata_str, str): metadata = json.loads(metadata_str)
347
+ elif isinstance(metadata_str, dict): metadata = metadata_str
348
+ except json.JSONDecodeError: pass
349
+
350
+ history_messages = []
351
+ if source_dataset == 'ultra':
352
+ history_str = metadata.get('history', '')
353
+ if history_str: history_messages = parse_ultra_history(history_str)
354
+
355
+ unique_id = str(uuid.uuid4()).replace("-", "")
356
+ # Generate a *new* filename for the potential audio output
357
+ output_audio_filename = f"{TARGET_MODEL_NAME}_retry_r{idx}_s{i}_{unique_id}.{OUTPUT_AUDIO_FORMAT}"
358
+ output_audio_filepath = os.path.join(OUTPUT_AUDIO_ROOT_DIR, output_audio_filename)
359
+
360
+ task_info = {
361
+ "row_idx": idx,
362
+ "slot_idx": i,
363
+ "history_messages": history_messages,
364
+ "prompt_text": row.get(prompt_text_key, ""),
365
+ "question_text": row.get('question_text', ""),
366
+ "question_audio_path": question_audio_path,
367
+ "output_audio_filepath": output_audio_filepath,
368
+ }
369
+ tasks_to_process.append(task_info)
370
+ # Decide if you want to retry all failed slots in a row or just the first one found
371
+ # break # Uncomment if you only want to retry the FIRST failed slot per row
372
+
373
+ total_tasks = len(tasks_to_process)
374
+ if total_tasks == 0:
375
+ print(f"No failed {TARGET_MODEL_NAME} tasks found needing reprocessing in {DATASET_DIR}.")
376
+ exit(0)
377
+
378
+ print(f"Found {total_tasks} failed {TARGET_MODEL_NAME} tasks to retry.")
379
+
380
+ # --- Threaded Execution with Checkpointing (Identical structure to previous script) ---
381
+ print(f"Starting reprocessing with up to {MAX_WORKERS} worker threads...")
382
+ start_total_time = time.time()
383
+ tasks_completed = 0
384
+ tasks_failed_retries = 0 # Count failures during the *retry* attempt
385
+ completed_since_last_save = 0
386
+
387
+ with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
388
+ # Ensure the correct worker function is called based on TARGET_MODEL_NAME
389
+ api_worker_function = call_gpt4o_api_worker # Default to GPT-4o
390
+ # Add logic here if TARGET_MODEL_NAME could be Qwen
391
+ # if TARGET_MODEL_NAME == "qwen_omni":
392
+ # api_worker_function = call_qwen_omni_api_worker # Assuming you have this function defined/imported
393
+
394
+ future_to_task = {executor.submit(api_worker_function, task): task for task in tasks_to_process}
395
+
396
+ for future in tqdm(concurrent.futures.as_completed(future_to_task), total=total_tasks, desc="Reprocessing tasks"):
397
+ task_info = future_to_task[future]
398
+ row_idx = task_info["row_idx"]
399
+ slot_idx = task_info["slot_idx"]
400
+ result = None
401
+
402
+ try:
403
+ result = future.result()
404
+ # --- Direct Update and Checkpointing Logic ---
405
+ response_text_key = f"response_text_{slot_idx}"
406
+ response_audio_key = f"response_audio_path_{slot_idx}"
407
+
408
+ if 0 <= row_idx < len(updated_data):
409
+ # Update the data in memory
410
+ updated_data[row_idx][response_text_key] = result["response_text"]
411
+ updated_data[row_idx][response_audio_key] = result["saved_audio_path"]
412
+ # Check if the *retry* attempt failed
413
+ if result["saved_audio_path"] is None or str(result["response_text"]).strip().startswith(ERROR_MARKERS):
414
+ tasks_failed_retries += 1
415
+ print(f"Warning (Row {row_idx}, Slot {i}): Retry attempt failed. Result: {str(result['response_text'])[:100]}...")
416
+ else:
417
+ print(f"Warning: Invalid row index {row_idx} encountered during result merge. Skipping update.")
418
+ tasks_failed_retries += 1
419
+
420
+ tasks_completed += 1
421
+ completed_since_last_save += 1
422
+
423
+ # Checkpoint saving
424
+ if completed_since_last_save >= CHECKPOINT_INTERVAL:
425
+ # Save the updated data back to the SAME directory
426
+ save_checkpoint(updated_data, DATASET_DIR, original_features)
427
+ completed_since_last_save = 0
428
+
429
+ except Exception as exc:
430
+ print(f"Error (Row {row_idx}, Slot {slot_idx}): Retry Task generated an unhandled exception: {exc}")
431
+ print(traceback.format_exc())
432
+ response_text_key = f"response_text_{slot_idx}"
433
+ response_audio_key = f"response_audio_path_{slot_idx}"
434
+ if 0 <= row_idx < len(updated_data):
435
+ updated_data[row_idx][response_text_key] = f"[ERROR: Retry Worker Crash - {exc}]" # Mark as worker crash during retry
436
+ updated_data[row_idx][response_audio_key] = None
437
+ else:
438
+ print(f"Warning: Invalid row index {row_idx} encountered during exception handling merge.")
439
+
440
+ tasks_failed_retries += 1
441
+ tasks_completed += 1
442
+ completed_since_last_save += 1
443
+
444
+ # Checkpoint saving after error
445
+ if completed_since_last_save >= CHECKPOINT_INTERVAL:
446
+ save_checkpoint(updated_data, DATASET_DIR, original_features)
447
+ completed_since_last_save = 0
448
+
449
+ end_total_time = time.time()
450
+ print("\n--- Reprocessing Complete ---")
451
+ print(f"Total tasks retried: {tasks_completed}")
452
+ print(f" Succeeded on retry: {tasks_completed - tasks_failed_retries}")
453
+ print(f" Failed on retry: {tasks_failed_retries}")
454
+ print(f"Total reprocessing time: {(end_total_time - start_total_time)/60:.2f} minutes")
455
+
456
+ # --- Final Save ---
457
+ # Save the final state of the updated data back to the original location
458
+ print("\nPerforming final save of the reprocessed dataset...")
459
+ save_checkpoint(updated_data, DATASET_DIR, original_features)
460
+
461
+ print(f"\nRetry script finished. Updated dataset saved in: {DATASET_DIR}")
r1-a/response_generation/kimi.py ADDED
@@ -0,0 +1,532 @@
1
+ import os
2
+ import json
3
+ import time
4
+ import re # For parsing history
5
+ import uuid # For generating unique filenames
6
+ import torch # Kimi might return tensors
7
+ import soundfile as sf # For saving Kimi audio output
+ import numpy as np # For dtype checks/conversion when saving audio
8
+ import sys
9
+ from datasets import load_from_disk, Dataset, Features, Audio, Value
10
+ from dotenv import load_dotenv
11
+ import datetime # For ETA formatting
12
+ from tqdm import tqdm # Import tqdm
13
+ import traceback # For detailed error printing
14
+
15
+ # --- Kimi-Audio Project Path Setup ---
16
+ # <--- *** IMPORTANT: Update this path to the PARENT directory containing the 'kimia_infer' folder *** --->
17
+ kimia_project_parent_dir = "/home/chenyifu/audio-r1/r1-a/response_generation/Kimi-Audio"
18
+
19
+ # Check if the path exists and add it to sys.path
20
+ if os.path.isdir(kimia_project_parent_dir):
21
+ if kimia_project_parent_dir not in sys.path:
22
+ sys.path.insert(0, kimia_project_parent_dir)
23
+ print(f"Added '{kimia_project_parent_dir}' to Python path.")
24
+ # Try importing KimiAudio only after potentially adding the path
25
+ try:
26
+ from kimia_infer.api.kimia import KimiAudio # Kimi model class
27
+ except ImportError as import_err:
28
+ print(f"Error: Could not import KimiAudio from '{kimia_project_parent_dir}'.")
29
+ print(f"ImportError: {import_err}")
30
+ print("Please ensure the 'kimia_infer' directory exists within the specified path and check dependencies.")
31
+ exit(1)
32
+ else:
33
+ print(f"Error: Kimi project parent directory not found: '{kimia_project_parent_dir}'")
34
+ print("Please update the 'kimia_project_parent_dir' variable in the script.")
35
+ exit(1)
36
+
37
+ # --- Configuration ---
38
+ load_dotenv() # Load environment variables if needed (e.g., API keys, though not typical for local Kimi)
39
+
40
+ # 1. Model & Tokenizer Setup (Kimi Specific)
41
+ KIMI_MODEL_NAME = "kimi_audio" # Identifier used in your dataset's model_N columns
42
+ KIMI_MODEL_PATH = "/home/chenyifu/audio-r1/r1-a/response_generation/Kimi-Audio/checkpoint/Kimi-Audio-7B-Instruct" # Path to your Kimi model checkpoint directory
43
+ # KIMI_DEVICE = 'cuda' # KimiAudio class likely handles device selection based on availability. Verify its internal logic if issues arise.
44
+ # KIMI_DTYPE = torch.bfloat16 # KimiAudio likely handles dtype internally.
45
+
46
+ # 2. Dataset Paths
47
+ INPUT_DATASET_DIR = "/home/chenyifu/audio-r1/r1-a/dataset/preference_sampling_tasks" # Original source
48
+ OUTPUT_DATASET_DIR = "/home/chenyifu/audio-r1/r1-a/dataset/preference_tasks_with_kimi" # Where Kimi processed data is saved/resumed from
49
+
50
+ # 3. Output Audio Configuration (Kimi Specific)
51
+ OUTPUT_AUDIO_ROOT_DIR = "/home/chenyifu/audio-r1/r1-a/generated_audio/kimi" # Where Kimi generated audio files are saved
52
+ OUTPUT_AUDIO_FORMAT = "wav"
53
+ OUTPUT_AUDIO_SAMPLERATE = 24000 # Kimi example uses 24kHz output. Confirm this matches your model's expected/native output SR.
54
+
55
+ # 4. Kimi Call Settings (Based on example, adjust as needed)
56
+ KIMI_SAMPLING_PARAMS = {
57
+ "audio_temperature": 0.8,
58
+ "audio_top_k": 10,
59
+ "text_temperature": 0.0, # 0.0 for deterministic text, increase for more variety
60
+ "text_top_k": 5, # Relevant if text_temperature > 0
61
+ "audio_repetition_penalty": 1.0,
62
+ "audio_repetition_window_size": 64,
63
+ "text_repetition_penalty": 1.0,
64
+ "text_repetition_window_size": 16,
65
+ # "max_new_tokens": 128 # Add if needed and supported by KimiAudio.generate
66
+ }
67
+ KIMI_OUTPUT_TYPE = "both" # Generate both audio and text
68
+
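+ # These settings are forwarded verbatim as keyword arguments to the model, e.g.
+ # (illustrative; mirrors the call in call_kimi_model below):
+ #   wav, text = model.generate(messages, **KIMI_SAMPLING_PARAMS, output_type=KIMI_OUTPUT_TYPE)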
69
+ # 5. Periodic Save Settings
70
+ SAVE_EVERY_N_SAMPLES = 50 # Save after processing this many samples
71
+
72
+ # --- Helper Functions ---
73
+
74
+ def format_time(seconds):
75
+ """Formats seconds into a human-readable string H:MM:SS"""
76
+ if seconds < 0:
77
+ return "N/A"
78
+ return str(datetime.timedelta(seconds=int(seconds)))
79
+
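+ # Example: format_time(3725) -> "1:02:05"; negative inputs return "N/A"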
80
+ # REMOVED load_audio_minicpm - Kimi takes the path directly
81
+
82
+ def parse_ultra_history(history_str):
83
+ """Parses the specific history string format from ultra metadata for Kimi."""
84
+ messages = []
85
+ # Relaxed pattern to capture content even if tags are slightly off or whitespace varies
86
+ pattern = re.compile(r"\[\s*(USER|ASSISTANT)\s*\]\s*([\s\S]*?)(?=\s*\[\s*(?:USER|ASSISTANT)\s*\]|$)")
87
+ matches = pattern.findall(history_str)
88
+ if not matches and history_str and history_str.strip():
89
+ # Simple fallback if standard pattern fails but there's content
90
+ if history_str.lower().startswith("user:") or history_str.lower().startswith("[user]"):
91
+ role = "user"
92
+ content = re.sub(r"^(user:|\[user\])\s*", "", history_str, flags=re.IGNORECASE).strip()
93
+ if content: messages.append({"role": role, "message_type": "text", "content": content}) # Add Kimi message_type
94
+ elif history_str.lower().startswith("assistant:") or history_str.lower().startswith("[assistant]"):
95
+ role = "assistant"
96
+ content = re.sub(r"^(assistant:|\[assistant\])\s*", "", history_str, flags=re.IGNORECASE).strip()
97
+ if content: messages.append({"role": role, "message_type": "text", "content": content}) # Add Kimi message_type
98
+ else:
99
+ print(f"Warning: Could not parse history string format: {history_str[:100]}...")
100
+ return messages # Return whatever was parsed, even if empty
101
+
102
+ for role_tag, content in matches:
103
+ role = role_tag.strip().lower()
104
+ cleaned_content = content.strip()
105
+ if cleaned_content:
106
+ # IMPORTANT: Add message_type='text' for Kimi history
107
+ messages.append({"role": role, "message_type": "text", "content": cleaned_content})
108
+ return messages
109
+
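+ # Illustrative parse (the sample string below is made up):
+ #   parse_ultra_history("[USER] How far is the Moon? [ASSISTANT] About 384,400 km.")
+ #   -> [{'role': 'user', 'message_type': 'text', 'content': 'How far is the Moon?'},
+ #      {'role': 'assistant', 'message_type': 'text', 'content': 'About 384,400 km.'}]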
110
+
111
+ # --- Kimi Model Interaction Function ---
112
+ def call_kimi_model(model, messages_input, sampling_params, output_audio_filepath, output_sample_rate):
113
+ """Calls the Kimi-Audio model, saves audio, returns text and audio path."""
114
+ try:
115
+ # 1. Ensure Output Directory Exists
116
+ os.makedirs(os.path.dirname(output_audio_filepath), exist_ok=True)
117
+
118
+ # 2. Call Kimi's Generate Function
119
+ wav_output, text_output = model.generate(
120
+ messages_input,
121
+ **sampling_params,
122
+ output_type=KIMI_OUTPUT_TYPE # Use 'both'
123
+ )
124
+
125
+ # 3. Process and Save Audio Output
126
+ saved_audio_path = None
127
+ if wav_output is not None and isinstance(wav_output, torch.Tensor) and wav_output.numel() > 0: # Check if tensor is not empty
128
+ try:
129
+ # Ensure tensor is on CPU, reshape (if needed, often view(-1)), convert to numpy
130
+ # Check KimiAudio output format - might already be 1D or need specific shape
131
+ audio_data = wav_output.detach().cpu().view(-1).numpy()
132
+
133
+ # Ensure data is float32 or int16 as supported by soundfile/WAV
134
+ if audio_data.dtype != 'float32':
135
+ # Attempt conversion, potentially scale if it's int
136
+ # print(f" Info: Converting Kimi audio output from {audio_data.dtype} to float32 for saving.")
137
+ if np.issubdtype(audio_data.dtype, np.integer):
+ # Scale integer PCM into the [-1.0, 1.0] float range using the dtype's own limits
+ # (e.g. int16 is divided by 32767), so the saved WAV is not clipped
+ max_val = float(np.iinfo(audio_data.dtype).max)
+ audio_data = audio_data.astype(np.float32) / max_val
+ else:
+ audio_data = audio_data.astype(np.float32)
144
+
145
+
146
+ sf.write(output_audio_filepath, audio_data, output_sample_rate)
147
+
148
+ # Check if file was actually created and has size
149
+ if os.path.exists(output_audio_filepath) and os.path.getsize(output_audio_filepath) > 100: # Check for a reasonable size threshold
150
+ saved_audio_path = output_audio_filepath
151
+ else:
152
+ print(f" Error: Kimi generate finished but output audio file seems empty or too small at {output_audio_filepath}")
153
+ if os.path.exists(output_audio_filepath):
154
+ try: os.remove(output_audio_filepath)
155
+ except OSError as rm_err: print(f" Warning: Could not remove empty/small file {output_audio_filepath}: {rm_err}")
156
+ except ImportError:
157
+ print("Error: NumPy library not found. Please install it (`pip install numpy`)")
158
+ return "[ERROR: NumPy Missing]", None # Indicate failure clearly
159
+ except Exception as sf_err:
160
+ print(f" Error saving Kimi audio output to {output_audio_filepath}: {sf_err}")
161
+ traceback.print_exc()
162
+ if os.path.exists(output_audio_filepath):
163
+ try: os.remove(output_audio_filepath)
164
+ except OSError as rm_err: print(f" Warning: Could not remove potentially corrupt file {output_audio_filepath}: {rm_err}")
165
+ elif wav_output is None:
166
+ print(" Warning: Kimi model did not return an audio tensor (wav_output is None).")
167
+ elif isinstance(wav_output, torch.Tensor) and wav_output.numel() == 0:
168
+ print(" Warning: Kimi model returned an empty audio tensor.")
169
+ else:
170
+ print(f" Warning: Kimi model returned unexpected audio output type: {type(wav_output)}. Expected torch.Tensor.")
171
+
172
+
173
+ # 4. Process Text Output
174
+ response_text_cleaned = ""
175
+ if isinstance(text_output, str):
176
+ response_text_cleaned = text_output.strip()
177
+ elif text_output is not None:
178
+ response_text_cleaned = str(text_output).strip() # Convert just in case
179
+ else:
180
+ # If text is None but audio might exist, use a specific marker
181
+ if saved_audio_path:
182
+ response_text_cleaned = "[Audio Generated, No Text Output]"
183
+ else:
184
+ response_text_cleaned = "[ERROR: No Text Output]"
185
+
186
+
187
+ # Return text (even if audio failed) and the path (or None)
188
+ return response_text_cleaned, saved_audio_path
189
+
190
+ except Exception as e:
191
+ print(f"\n --- Error during Kimi model call ---")
192
+ # Avoid printing potentially huge message list directly
193
+ first_message = messages_input[0] if messages_input else "N/A"
194
+ last_message_content = messages_input[-1]['content'] if messages_input else "N/A"
195
+ if isinstance(last_message_content, str) and len(last_message_content) > 100 :
196
+ last_message_preview = last_message_content[:100] + "..."
197
+ else:
198
+ last_message_preview = last_message_content
199
+
200
+ print(f" Input Messages Info: Count={len(messages_input)}, First={first_message}, Last Content Preview='{last_message_preview}'")
201
+ print(f" Exception Type: {type(e).__name__}")
202
+ print(f" Error Details: {e}")
203
+ print(" Traceback:")
204
+ traceback.print_exc()
205
+ print(" --- End Error Details ---")
206
+
207
+ # Attempt cleanup of potentially incomplete output file
208
+ if 'output_audio_filepath' in locals() and os.path.exists(output_audio_filepath):
209
+ try:
210
+ os.remove(output_audio_filepath)
211
+ except OSError as rm_err:
212
+ print(f" Warning: Could not remove file {output_audio_filepath} after error: {rm_err}")
213
+ # Return clear error markers
214
+ return "[ERROR: Kimi Model Call Failed]", None
215
+
216
+ # --- Dataset Saving Function (Modified for Kimi context) ---
217
+ def save_checkpoint(data_list, features, output_dir, fallback_dir=None):
218
+ """Saves the current state of the data list as a Hugging Face Dataset."""
219
+ if not data_list:
220
+ print("\nSkipping checkpoint save: data list is empty.")
221
+ return
222
+
223
+ print(f"\nSaving checkpoint with {len(data_list)} rows to {output_dir}...")
224
+ try:
225
+ # Ensure the list contains dictionaries
226
+ data_to_save = [dict(item) for item in data_list]
227
+
228
+ # --- Feature Check/Adaptation (Optional but recommended) ---
229
+ # Sometimes saving fails if data types changed unexpectedly (e.g., None -> str)
230
+ # It's safer to create the Dataset *without* features first, then cast
231
+ temp_dataset = Dataset.from_list(data_to_save)
232
+ # Now cast to the original features, allowing potential None/type mismatches
233
+ # This might raise warnings but is often more robust than direct from_list with features
234
+ updated_dataset = temp_dataset.cast(features)
235
+ # --- End Feature Check ---
236
+
237
+ # Ensure output directory exists before saving
238
+ os.makedirs(output_dir, exist_ok=True)
239
+ updated_dataset.save_to_disk(output_dir)
240
+ print("Checkpoint saved successfully.")
241
+
242
+ except Exception as e:
243
+ print(f"Error saving checkpoint dataset using save_to_disk to {output_dir}: {e}")
244
+ traceback.print_exc()
245
+ if fallback_dir:
246
+ # Use Kimi-specific name in fallback path
247
+ fallback_path = os.path.join(fallback_dir, f"updated_{KIMI_MODEL_NAME}_data_checkpoint_{int(time.time())}.jsonl")
248
+ print(f"Attempting to save data as JSON Lines fallback to: {fallback_path}")
249
+ try:
250
+ os.makedirs(fallback_dir, exist_ok=True)
251
+ with open(fallback_path, 'w', encoding='utf-8') as f:
252
+ # Rebuild the list of dicts here so the fallback still works even if the
+ # conversion above raised before data_to_save was assigned
+ data_to_save = [dict(item) for item in data_list]
+ for item in data_to_save:
254
+ # Ensure all values are serializable
255
+ serializable_item = {}
256
+ for k, v in item.items():
257
+ if isinstance(v, (datetime.datetime, datetime.date)):
258
+ serializable_item[k] = v.isoformat()
259
+ elif isinstance(v, bytes):
260
+ serializable_item[k] = v.decode('utf-8', errors='ignore')
261
+ elif isinstance(v, torch.Tensor): # Handle potential tensors if not caught earlier
262
+ print(f" Warning: Found unexpected Tensor for key '{k}' in fallback save. Converting to list.")
263
+ serializable_item[k] = v.tolist()
264
+ elif not isinstance(v, (str, int, float, bool, list, dict, type(None))):
265
+ print(f" Warning: Converting non-standard type {type(v)} for key '{k}' to string for JSON fallback.")
266
+ serializable_item[k] = str(v)
267
+ else:
268
+ serializable_item[k] = v
269
+ try:
270
+ f.write(json.dumps(serializable_item, ensure_ascii=False) + '\n')
271
+ except TypeError as json_type_err:
272
+ print(f" Skipping row due to JSON serialization error: {json_type_err} in item part: {k}={v}")
273
+ print("Fallback JSON Lines checkpoint saved successfully.")
274
+ except Exception as json_e:
275
+ print(f"Error saving fallback JSON Lines checkpoint: {json_e}")
276
+
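+ # Typical call, matching how the main loop below uses it:
+ #   save_checkpoint(updated_data, original_features, OUTPUT_DATASET_DIR, fallback_save_dir)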
277
+
278
+ # =============================================
279
+ # --- Main Processing Logic ---
280
+ # =============================================
281
+
282
+ # --- STEP 1: Dataset Loading (Modified for Resumption) ---
283
+ print("="*30)
284
+ print("STEP 1: Loading Dataset")
285
+ print("="*30)
286
+ dataset = None
287
+ original_features = None # Initialize
288
+
289
+ # Check if the Kimi-specific output directory exists
290
+ if os.path.exists(OUTPUT_DATASET_DIR):
291
+ print(f"Found existing Kimi processed dataset directory at: {OUTPUT_DATASET_DIR}")
292
+ print("Attempting to load it to resume processing...")
293
+ try:
294
+ dataset = load_from_disk(OUTPUT_DATASET_DIR)
295
+ original_features = dataset.features # Get features from the loaded dataset
296
+ print(f"Resumed Kimi dataset loaded successfully with {len(dataset)} rows.")
297
+ print(f"Features from resumed dataset: {original_features}")
298
+ except Exception as e:
299
+ print(f"Warning: Error loading existing Kimi dataset from {OUTPUT_DATASET_DIR}: {e}")
300
+ traceback.print_exc()
301
+ print("Will attempt to load the original input dataset instead.")
302
+ dataset = None # Reset dataset variable
303
+ else:
304
+ print(f"No existing Kimi processed dataset found at {OUTPUT_DATASET_DIR}.")
305
+ print("Will attempt to load the original input dataset.")
306
+
307
+
308
+ # If dataset is still None, load from the original input directory
309
+ if dataset is None:
310
+ print(f"\nLoading original input dataset from: {INPUT_DATASET_DIR}")
311
+ if not os.path.exists(INPUT_DATASET_DIR):
312
+ print(f"FATAL: Original input dataset directory not found at {INPUT_DATASET_DIR}")
313
+ exit(1)
314
+ try:
315
+ dataset = load_from_disk(INPUT_DATASET_DIR)
316
+ original_features = dataset.features # Get features from the input dataset
317
+ print(f"Original input dataset loaded successfully with {len(dataset)} rows.")
318
+ print(f"Features from input dataset: {original_features}")
319
+ except Exception as e:
320
+ print(f"FATAL: Error loading original input dataset from {INPUT_DATASET_DIR}: {e}")
321
+ traceback.print_exc()
322
+ exit(1)
323
+
324
+ # --- Ensure dataset and features were loaded ---
325
+ if dataset is None or original_features is None:
326
+ print("FATAL: Failed to load any dataset. Exiting.")
327
+ exit(1)
328
+ # --- End Dataset Loading ---
329
+
330
+
331
+ # --- STEP 2: Pre-computation - Identify Kimi Tasks ---
332
+ print("\n" + "="*30)
333
+ print(f"STEP 2: Identifying '{KIMI_MODEL_NAME}' Tasks to Process")
334
+ print("="*30)
335
+ pkusafe_tasks_indices = []
336
+ other_tasks_indices = []
337
+
338
+ # Iterate through the loaded dataset structure
339
+ for idx, row in enumerate(dataset):
340
+ source_dataset = row.get('source_dataset')
341
+ processed_in_row = False # Flag to ensure we only pick one Kimi slot per row
342
+ for i in range(1, 4): # Check slots 1, 2, 3
343
+ model_key = f"model_{i}"
344
+ response_text_key = f"response_text_{i}"
345
+ # Check if the slot is assigned to Kimi and is NOT yet filled (text response missing)
346
+ is_target_model_task = row.get(model_key) == KIMI_MODEL_NAME
347
+ is_unfilled = not row.get(response_text_key) # True if None or empty string
348
+
349
+ if is_target_model_task and is_unfilled and not processed_in_row:
350
+ task_info = (idx, i) # Store tuple of (original_row_index, slot_index)
351
+ if source_dataset == 'pkusafe':
352
+ pkusafe_tasks_indices.append(task_info)
353
+ else:
354
+ other_tasks_indices.append(task_info)
355
+ processed_in_row = True # Mark row as having a task identified
356
+
357
+ # Combine lists, prioritizing pkusafe
358
+ tasks_to_process_indices = pkusafe_tasks_indices + other_tasks_indices
359
+ total_tasks_to_process = len(tasks_to_process_indices)
360
+
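+ # Each entry is a (row_idx, slot_idx) pair, e.g. [(12, 2), (40, 1), ...] (illustrative values)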
361
+ print(f"Found {len(pkusafe_tasks_indices)} 'pkusafe' tasks and {len(other_tasks_indices)} other tasks requiring '{KIMI_MODEL_NAME}' processing in the loaded dataset.")
362
+ print(f"Total tasks remaining to process: {total_tasks_to_process}")
363
+
364
+ if total_tasks_to_process == 0:
365
+ print(f"\nNo remaining tasks to process for {KIMI_MODEL_NAME} based on the loaded dataset.")
366
+ # Optionally, perform a final save for consistency
367
+ # print("Performing a final save to ensure consistency...")
368
+ # final_data_list = [dict(row) for row in dataset]
369
+ # fallback_save_dir_final = os.path.join(os.path.dirname(OUTPUT_DATASET_DIR), f"{KIMI_MODEL_NAME}_checkpoints_fallback")
370
+ # save_checkpoint(final_data_list, original_features, OUTPUT_DATASET_DIR, fallback_save_dir_final)
371
+ print("Exiting.")
372
+ exit(0)
373
+ # --- End Pre-computation Step ---
374
+
375
+
376
+ # --- STEP 3: Load Kimi Model ---
377
+ print("\n" + "="*30)
378
+ print(f"STEP 3: Loading {KIMI_MODEL_NAME} Model")
379
+ print("="*30)
380
+ try:
381
+ # Load Kimi model using the class imported earlier
382
+ model = KimiAudio(model_path=KIMI_MODEL_PATH, load_detokenizer=True) # Assuming detokenizer is needed based on example
383
+ print(f"{KIMI_MODEL_NAME} model loaded successfully from {KIMI_MODEL_PATH}.")
384
+ except NameError:
385
+ print("FATAL: KimiAudio class not defined. Import likely failed earlier.")
386
+ exit(1)
387
+ except Exception as e:
388
+ print(f"Error loading {KIMI_MODEL_NAME} model from {KIMI_MODEL_PATH}: {e}")
389
+ traceback.print_exc()
390
+ exit(1)
391
+
392
+
393
+ # --- STEP 4: Prepare for Processing ---
394
+ print("\n" + "="*30)
395
+ print(f"STEP 4: Preparing for {KIMI_MODEL_NAME} Processing")
396
+ print("="*30)
397
+ # Create output directories if they don't exist
398
+ os.makedirs(OUTPUT_AUDIO_ROOT_DIR, exist_ok=True)
399
+ os.makedirs(OUTPUT_DATASET_DIR, exist_ok=True)
400
+ # Define and create fallback directory for Kimi
401
+ fallback_save_dir = os.path.join(os.path.dirname(OUTPUT_DATASET_DIR), f"{KIMI_MODEL_NAME}_checkpoints_fallback")
402
+ os.makedirs(fallback_save_dir, exist_ok=True)
403
+ print(f"Audio outputs will be saved in: {OUTPUT_AUDIO_ROOT_DIR}")
404
+ print(f"Dataset checkpoints will be saved in: {OUTPUT_DATASET_DIR}")
405
+ print(f"Fallback checkpoints (JSONL) in: {fallback_save_dir}")
406
+
407
+
408
+ # Create a mutable list of dictionaries from the loaded dataset for updates
409
+ updated_data = [dict(row) for row in dataset] # Convert each row to a dictionary
410
+
411
+ tasks_processed_count = 0 # Count successful completions for average time calculation
412
+ start_total_time = time.time()
413
+
414
+
415
+ # --- STEP 5: Start Processing Loop ---
416
+ print("\n" + "="*30)
417
+ print(f"STEP 5: Starting {KIMI_MODEL_NAME} Processing Loop ({total_tasks_to_process} Tasks)")
418
+ print("="*30)
419
+ # Use tqdm for the progress bar, iterating over the identified task indices
420
+ pbar = tqdm(enumerate(tasks_to_process_indices), total=total_tasks_to_process, desc=f"Processing {KIMI_MODEL_NAME} Tasks")
421
+ for loop_idx, (row_idx, slot_i) in pbar:
422
+ # Get the row data *from our mutable list* using the original index
423
+ row = updated_data[row_idx] # This is already a dictionary
424
+
425
+ # Set description in tqdm dynamically
426
+ pbar.set_description(f"Processing Row {row_idx}, Slot {slot_i}")
427
+
428
+ prompt_text_key = f"prompt_text_{slot_i}"
429
+ response_text_key = f"response_text_{slot_i}"
430
+ response_audio_key = f"response_audio_path_{slot_i}"
431
+ model_key = f"model_{slot_i}"
432
+
433
+ # --- Sanity Check ---
434
+ if row.get(model_key) != KIMI_MODEL_NAME:
435
+ tqdm.write(f" Skipping Row {row_idx}, Slot {slot_i}: Model is '{row.get(model_key)}', not '{KIMI_MODEL_NAME}'.")
436
+ continue
437
+ if row.get(response_text_key):
438
+ tqdm.write(f" Skipping Row {row_idx}, Slot {slot_i}: Already has response text '{str(row.get(response_text_key))[:50]}...'.")
439
+ continue
440
+
441
+ # --- Prepare Kimi Model Inputs ---
442
+ prompt_text = row.get(prompt_text_key, "")
443
+ question_audio_path = row.get('question_audio')
444
+ metadata_str = row.get('metadata', "{}")
445
+ source_dataset = row.get('source_dataset')
446
+
447
+ # Check for essential input audio path validity
448
+ if not question_audio_path or not os.path.exists(question_audio_path):
449
+ tqdm.write(f" Error: Input audio path missing or invalid for Row {row_idx}: '{question_audio_path}'. Skipping model call.")
450
+ updated_data[row_idx][response_text_key] = "[ERROR: Missing Input Audio]"
451
+ updated_data[row_idx][response_audio_key] = None
452
+ continue # Move to the next task in the loop
453
+
454
+ # --- Construct Kimi `messages` list ---
455
+ kimi_messages = []
456
+
457
+ # 1. Parse History (if any)
458
+ if source_dataset == 'ultra' and metadata_str:
459
+ try:
460
+ metadata = json.loads(metadata_str)
461
+ history_str = metadata.get('history', '')
462
+ if history_str:
463
+ # Ensure history messages have 'message_type': 'text'
464
+ history_messages_parsed = parse_ultra_history(history_str)
465
+ kimi_messages.extend(history_messages_parsed)
466
+ except json.JSONDecodeError:
467
+ tqdm.write(f" Warning: Could not parse metadata JSON for row {row_idx}")
468
+ except Exception as hist_e:
469
+ tqdm.write(f" Warning: Error processing history for row {row_idx}: {hist_e}")
470
+ # Add elif blocks here for history parsing from other datasets if needed
471
+
472
+ # 2. Add Current User Turn (Text Prompt + Audio Path)
473
+ # Add text prompt first, if it exists and is not empty
474
+ if prompt_text and prompt_text.strip():
475
+ kimi_messages.append({"role": "user", "message_type": "text", "content": prompt_text.strip()})
476
+ # Add the user audio query using its path
477
+ kimi_messages.append({"role": "user", "message_type": "audio", "content": question_audio_path})
478
+
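+ # At this point kimi_messages looks like (illustrative sketch; values are examples):
+ #   [ ...optional parsed history turns...,
+ #     {"role": "user", "message_type": "text",  "content": "Answer briefly."},
+ #     {"role": "user", "message_type": "audio", "content": "/path/to/question.wav"} ]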
479
+ # Generate unique output audio filename
480
+ unique_id = str(uuid.uuid4())
481
+ output_audio_filename = f"{KIMI_MODEL_NAME}_row{row_idx}_slot{slot_i}_{unique_id}.{OUTPUT_AUDIO_FORMAT}"
482
+ output_audio_filepath = os.path.join(OUTPUT_AUDIO_ROOT_DIR, output_audio_filename)
483
+
484
+ # --- Call Kimi Model ---
485
+ # tqdm.write(f" Calling {KIMI_MODEL_NAME} for Row {row_idx}, Slot {slot_i}...") # Less verbose log
486
+ call_start_time = time.time()
487
+ response_text, saved_audio_path = call_kimi_model(
488
+ model,
489
+ kimi_messages,
490
+ KIMI_SAMPLING_PARAMS,
491
+ output_audio_filepath,
492
+ OUTPUT_AUDIO_SAMPLERATE
493
+ )
494
+ call_end_time = time.time()
495
+ audio_basename = os.path.basename(str(saved_audio_path)) if saved_audio_path else "None"
496
+ tqdm.write(f" Row {row_idx}, Slot {slot_i}: Finished in {call_end_time - call_start_time:.2f}s. Text: '{str(response_text)[:50]}...', Audio: {audio_basename}")
497
+
498
+ # Store results back into the main data list (updated_data)
499
+ updated_data[row_idx][response_text_key] = response_text # Store text/error marker
500
+ updated_data[row_idx][response_audio_key] = saved_audio_path # Store path or None
501
+
502
+ # Increment success counter based on successful generation (e.g., text isn't an error marker)
503
+ # Consider if audio generation failure should also mark task as failed.
504
+ # Current logic counts success if text seems okay.
505
+ if response_text is not None and not response_text.startswith("[ERROR"):
506
+ tasks_processed_count += 1
507
+
508
+ # --- Periodic Saving ---
509
+ processed_count_in_loop = loop_idx + 1
510
+ if processed_count_in_loop % SAVE_EVERY_N_SAMPLES == 0 or processed_count_in_loop == total_tasks_to_process:
511
+ save_checkpoint(updated_data, original_features, OUTPUT_DATASET_DIR, fallback_save_dir)
512
+
513
+ # --- STEP 6: Final Summary and Save ---
514
+ end_total_time = time.time()
515
+ print("\n" + "="*30)
516
+ print(f"STEP 6: {KIMI_MODEL_NAME} Processing Complete - Summary")
517
+ print("="*30)
518
+ print(f"Total tasks identified for processing in this run: {total_tasks_to_process}")
519
+ print(f"Total tasks successfully processed (generated text): {tasks_processed_count}") # Update definition if needed
520
+ total_duration = end_total_time - start_total_time
521
+ print(f"Total processing time for this run: {format_time(total_duration)}")
522
+ if tasks_processed_count > 0:
523
+ avg_time = total_duration / tasks_processed_count
524
+ print(f"Average time per successfully processed task in this run: {avg_time:.2f} seconds")
525
+ else:
526
+ print("Average time per task: N/A (no tasks successfully processed in this run)")
527
+
528
+ # --- Final Save ---
529
+ print("\nPerforming final save of the dataset...")
530
+ save_checkpoint(updated_data, original_features, OUTPUT_DATASET_DIR, fallback_save_dir)
531
+
532
+ print("\nScript finished.")
r1-a/response_generation/minicpm.py ADDED
@@ -0,0 +1,519 @@
1
+ import os
2
+ import json
3
+ import time
4
+ import re # For parsing history
5
+ import uuid # For generating unique filenames
6
+ import random # For random voice selection
7
+ import torch # For MiniCPM-o
8
+ import librosa # For audio loading
9
+ from transformers import AutoModel, AutoTokenizer # For MiniCPM-o
10
+ from datasets import load_from_disk, Dataset, Features, Audio, Value # Import necessary types
11
+ from dotenv import load_dotenv
12
+ import datetime # For ETA formatting
13
+ from tqdm import tqdm # Import tqdm
14
+ import traceback # For detailed error printing
15
+
16
+ # --- Configuration ---
17
+ load_dotenv()
18
+
19
+ # 1. Model & Tokenizer Setup
20
+ MINICPMO_MODEL_NAME = "minicpm" # Name used in the dataset to identify tasks for this model
21
+ MINICPMO_HF_ID = 'openbmb/MiniCPM-o-2_6'
22
+ MINICPMO_DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
23
+ MINICPMO_DTYPE = torch.bfloat16
24
+ MINICPMO_ATTN_IMPL = 'sdpa'
25
+
26
+ # 2. Dataset Paths
27
+ INPUT_DATASET_DIR = "/root/autodl-tmp/audio-r1/r1-a/dataset/preference_sampling_tasks" # Original source
28
+ OUTPUT_DATASET_DIR = "/root/autodl-tmp/audio-r1/r1-a/dataset/preference_tasks_with_minicpmo" # Where processed data is saved/resumed from
29
+
30
+ # 3. Output Audio Configuration
31
+ OUTPUT_AUDIO_ROOT_DIR = "/root/autodl-tmp/audio-r1/r1-a/generated_audio/minicpmo" # Where generated audio files are saved
32
+ OUTPUT_AUDIO_FORMAT = "wav"
33
+ OUTPUT_AUDIO_SAMPLERATE = 16000
34
+
35
+ # --- !!! IMPORTANT: Update these paths to your actual reference voice files !!! ---
36
+ REF_VOICE_PATHS = {
37
+ "female": "/root/autodl-tmp/audio-r1/r1-a/response_generation/minicpm/MiniCPM-o/assets/input_examples/assistant_female_voice.wav",
38
+ "male": "/root/autodl-tmp/audio-r1/r1-a/response_generation/minicpm/MiniCPM-o/assets/input_examples/assistant_male_voice.wav",
39
+ "default_female": "/root/autodl-tmp/audio-r1/r1-a/response_generation/minicpm/MiniCPM-o/assets/input_examples/assistant_default_female_voice.wav"
40
+ }
41
+ # --- End Reference Voice Paths ---
42
+ # Check voice paths exist early
43
+ for key, path in REF_VOICE_PATHS.items():
44
+ if not os.path.exists(path):
45
+ print(f"FATAL ERROR: Reference voice file not found for '{key}': {path}")
46
+ print("Please ensure the reference voice files exist at the specified paths in REF_VOICE_PATHS.")
47
+ exit(1) # Exit early if critical files are missing
48
+
49
+ AVAILABLE_MINICPMO_VOICES = list(REF_VOICE_PATHS.keys())
50
+
51
+ # 4. MiniCPM-o Call Settings
52
+ MODEL_MAX_NEW_TOKENS = 128
53
+ MODEL_TEMPERATURE = 0.3
54
+ MODEL_SAMPLING = True
55
+
56
+ # 5. Periodic Save Settings
57
+ SAVE_EVERY_N_SAMPLES = 50 # Save after processing this many samples
58
+
59
+ # --- Helper Functions ---
60
+
61
+ def format_time(seconds):
62
+ """Formats seconds into a human-readable string H:MM:SS"""
63
+ if seconds < 0:
64
+ return "N/A"
65
+ return str(datetime.timedelta(seconds=int(seconds)))
66
+
67
+ def load_audio_minicpm(audio_path, target_sr=16000):
68
+ """Loads audio using librosa, handling potential errors."""
69
+ if not audio_path or not os.path.exists(audio_path):
70
+ # print(f"Warning: Audio file not found or path is empty: {audio_path}") # Less verbose
71
+ return None
72
+ try:
73
+ audio_array, sr = librosa.load(audio_path, sr=None, mono=True)
74
+ if sr != target_sr:
75
+ # print(f" Resampling audio from {sr} Hz to {target_sr} Hz...") # Less verbose
76
+ audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=target_sr)
77
+ return audio_array
78
+ except Exception as e:
79
+ print(f"\nWarning: Error loading/processing audio file {audio_path}: {e}")
80
+ return None
81
+
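+ # Usage sketch (the path is an example):
+ #   arr = load_audio_minicpm("/path/to/question.wav", target_sr=16000)
+ # Returns a mono float numpy array resampled to 16 kHz, or None if loading failed.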
82
+ def parse_ultra_history(history_str):
83
+ """Parses the specific history string format from ultra metadata."""
84
+ messages = []
85
+ # Relaxed pattern to capture content even if tags are slightly off or whitespace varies
86
+ pattern = re.compile(r"\[\s*(USER|ASSISTANT)\s*\]\s*([\s\S]*?)(?=\s*\[\s*(?:USER|ASSISTANT)\s*\]|$)")
87
+ matches = pattern.findall(history_str)
88
+ if not matches and history_str and history_str.strip():
89
+ # Simple fallback if standard pattern fails but there's content
90
+ if history_str.lower().startswith("user:") or history_str.lower().startswith("[user]"):
91
+ role = "user"
92
+ content = re.sub(r"^(user:|\[user\])\s*", "", history_str, flags=re.IGNORECASE).strip()
93
+ if content: messages.append({"role": role, "content": content})
94
+ elif history_str.lower().startswith("assistant:") or history_str.lower().startswith("[assistant]"):
95
+ role = "assistant"
96
+ content = re.sub(r"^(assistant:|\[assistant\])\s*", "", history_str, flags=re.IGNORECASE).strip()
97
+ if content: messages.append({"role": role, "content": content})
98
+ else:
99
+ print(f"Warning: Could not parse history string format: {history_str[:100]}...")
100
+ return messages # Return whatever was parsed, even if empty
101
+
102
+ for role_tag, content in matches:
103
+ role = role_tag.strip().lower()
104
+ cleaned_content = content.strip()
105
+ if cleaned_content:
106
+ messages.append({"role": role, "content": cleaned_content})
107
+ # else: # Removed warning for empty content for brevity
108
+ # print(f"Warning: Empty content found for role {role_tag} in history.")
109
+ return messages
110
+
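+ # Illustrative parse (the sample string is made up; note: no message_type here,
+ # unlike the Kimi variant of this helper):
+ #   parse_ultra_history("[USER] Hi there [ASSISTANT] Hello!")
+ #   -> [{'role': 'user', 'content': 'Hi there'}, {'role': 'assistant', 'content': 'Hello!'}]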
111
+
112
+ # --- MiniCPM-o Model Interaction Function ---
113
+ def call_minicpmo_model(model, tokenizer, history_messages, prompt_text, question_audio_path, output_audio_filepath):
114
+ """Calls the local MiniCPM-o model, saves audio, returns text and audio path."""
115
+ try:
116
+ # 1. Select and Load Random Reference Voice
117
+ selected_voice_key = random.choice(AVAILABLE_MINICPMO_VOICES)
118
+ ref_voice_path = REF_VOICE_PATHS[selected_voice_key]
119
+ ref_audio_array = load_audio_minicpm(ref_voice_path, target_sr=OUTPUT_AUDIO_SAMPLERATE)
120
+ if ref_audio_array is None:
121
+ print(f" Error: Failed to load reference voice: {ref_voice_path}")
122
+ return None, None # Signal failure
123
+
124
+ # 2. Generate System Prompt
125
+ sys_prompt = model.get_sys_prompt(ref_audio=ref_audio_array, mode='audio_assistant', language='en')
126
+
127
+ # 3. Load User Question Audio
128
+ user_audio_array = load_audio_minicpm(question_audio_path, target_sr=OUTPUT_AUDIO_SAMPLERATE)
129
+ if user_audio_array is None:
130
+ print(f" Error: Failed to load user question audio: {question_audio_path}")
131
+ return None, None # Signal failure
132
+
133
+ # 4. Construct User Message
134
+ user_message_content = []
135
+ if prompt_text and prompt_text.strip():
136
+ user_message_content.append(prompt_text.strip())
137
+ # user_audio_array is guaranteed non-None here: a load failure already
+ # returned (None, None) above, so the audio can be appended unconditionally
+ user_message_content.append(user_audio_array) # Add audio array
144
+
145
+ user_message = {'role': 'user', 'content': user_message_content}
146
+
147
+ # 5. Construct Full Message List
148
+ msgs = [sys_prompt] + history_messages + [user_message]
149
+
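+ # msgs now looks like (illustrative sketch; values are examples):
+ #   [ sys_prompt,
+ #     {'role': 'user', 'content': 'earlier text turn'}, ...,
+ #     {'role': 'user', 'content': ['optional prompt text', <audio numpy array>]} ]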
150
+ # 6. Ensure Output Directory Exists
151
+ os.makedirs(os.path.dirname(output_audio_filepath), exist_ok=True)
152
+
153
+ # 7. Call Model's Chat Function
154
+ response_obj = model.chat(
155
+ msgs=msgs,
156
+ tokenizer=tokenizer,
157
+ sampling=MODEL_SAMPLING,
158
+ max_new_tokens=MODEL_MAX_NEW_TOKENS,
159
+ use_tts_template=True,
160
+ generate_audio=True,
161
+ temperature=MODEL_TEMPERATURE,
162
+ output_audio_path=output_audio_filepath # Model saves the audio directly
163
+ )
164
+
165
+ # --- Extract text from the response object ---
166
+ response_text = None
167
+ if hasattr(response_obj, 'text'):
168
+ response_text = response_obj.text
169
+ elif hasattr(response_obj, 'content'):
170
+ response_text = response_obj.content
171
+ elif isinstance(response_obj, str):
172
+ response_text = response_obj
173
+ else:
174
+ print(f" Warning: Could not automatically extract text from model response object of type {type(response_obj)}. Response object dir: {dir(response_obj)}")
175
+ response_text = "[ERROR: Could not extract text]"
176
+
177
+ # Ensure response_text is a string before stripping
178
+ response_text_cleaned = ""
179
+ if isinstance(response_text, str):
180
+ response_text_cleaned = response_text.strip()
181
+ elif response_text is not None:
182
+ response_text_cleaned = str(response_text).strip()
183
+
184
+ # 8. Check if audio file was actually created by the model
185
+ if os.path.exists(output_audio_filepath) and os.path.getsize(output_audio_filepath) > 0: # Check size too
186
+ # Success: Return text and the path where the model saved the audio
187
+ return response_text_cleaned, output_audio_filepath
188
+ else:
189
+ print(f" Error: Model finished but output audio file not found or empty at {output_audio_filepath}")
190
+ # Attempt cleanup if file exists but is empty
191
+ if os.path.exists(output_audio_filepath):
192
+ try:
193
+ os.remove(output_audio_filepath)
194
+ except OSError as rm_err:
195
+ print(f" Warning: Could not remove empty file {output_audio_filepath}: {rm_err}")
196
+ return response_text_cleaned, None # Return text (if any) but signal audio failure
197
+
198
+ except Exception as e:
199
+ print(f"\n --- Error during MiniCPM-o model call for {os.path.basename(question_audio_path)} ---")
200
+ print(f" Exception Type: {type(e).__name__}")
201
+ print(f" Error Details: {e}")
202
+ print(" Traceback:")
203
+ traceback.print_exc()
204
+ print(" --- End Error Details ---")
205
+ # Attempt cleanup of potentially incomplete output file
206
+ if 'output_audio_filepath' in locals() and os.path.exists(output_audio_filepath):
207
+ try:
208
+ os.remove(output_audio_filepath)
209
+ except OSError as rm_err:
210
+ print(f" Warning: Could not remove file {output_audio_filepath} after error: {rm_err}")
211
+ return None, None # Indicate failure
212
+
213
+ # --- Dataset Saving Function ---
214
+ def save_checkpoint(data_list, features, output_dir, fallback_dir=None):
215
+ """Saves the current state of the data list as a Hugging Face Dataset."""
216
+ if not data_list:
217
+ print("\nSkipping checkpoint save: data list is empty.")
218
+ return
219
+
220
+ print(f"\nSaving checkpoint with {len(data_list)} rows to {output_dir}...")
221
+ try:
222
+ # Ensure the list contains dictionaries, not Dataset rows or other objects
223
+ data_to_save = [dict(item) for item in data_list]
224
+ # Create dataset from the current list of dictionaries using original features
225
+ updated_dataset = Dataset.from_list(data_to_save, features=features)
226
+ # Ensure output directory exists before saving
227
+ os.makedirs(output_dir, exist_ok=True)
228
+ updated_dataset.save_to_disk(output_dir)
229
+ print("Checkpoint saved successfully.")
230
+ except Exception as e:
231
+ print(f"Error saving checkpoint dataset using save_to_disk: {e}")
232
+ traceback.print_exc()
233
+ if fallback_dir:
234
+ fallback_path = os.path.join(fallback_dir, f"updated_minicpmo_data_checkpoint_{int(time.time())}.jsonl")
235
+ print(f"Attempting to save data as JSON Lines fallback to: {fallback_path}")
236
+ try:
237
+ os.makedirs(fallback_dir, exist_ok=True)
238
+ with open(fallback_path, 'w', encoding='utf-8') as f:
239
+ # Rebuild here so the fallback works even if the conversion above raised early
+ data_to_save = [dict(item) for item in data_list]
+ for item in data_to_save:
240
+ # Ensure all values are serializable
241
+ serializable_item = {}
242
+ for k, v in item.items():
243
+ if isinstance(v, (datetime.datetime, datetime.date)):
244
+ serializable_item[k] = v.isoformat()
245
+ elif isinstance(v, bytes):
246
+ serializable_item[k] = v.decode('utf-8', errors='ignore')
247
+ # Add handling for specific non-serializable types if they appear
248
+ elif not isinstance(v, (str, int, float, bool, list, dict, type(None))):
249
+ print(f" Warning: Converting non-standard type {type(v)} for key '{k}' to string for JSON fallback.")
250
+ serializable_item[k] = str(v)
251
+ else:
252
+ serializable_item[k] = v
253
+ try:
254
+ f.write(json.dumps(serializable_item, ensure_ascii=False) + '\n')
255
+ except TypeError as json_type_err:
256
+ print(f" Skipping row due to JSON serialization error: {json_type_err} in item part: {k}={v}")
257
+ print("Fallback JSON Lines checkpoint saved successfully.")
258
+ except Exception as json_e:
259
+ print(f"Error saving fallback JSON Lines checkpoint: {json_e}")
260
+
261
+
262
+ # =============================================
263
+ # --- Main Processing Logic ---
264
+ # =============================================
265
+
266
+ # --- Dataset Loading (Modified for Resumption) ---
267
+ print("="*30)
268
+ print("STEP 1: Loading Dataset")
269
+ print("="*30)
270
+ dataset = None
271
+ original_features = None # Initialize
272
+
273
+ if os.path.exists(OUTPUT_DATASET_DIR):
274
+ print(f"Found existing processed dataset directory at: {OUTPUT_DATASET_DIR}")
275
+ print("Attempting to load it to resume processing...")
276
+ try:
277
+ # Note: checkpoints are written back to this directory later, so it must be writable.
278
+ dataset = load_from_disk(OUTPUT_DATASET_DIR)
279
+ original_features = dataset.features # Get features from the loaded dataset
280
+ print(f"Resumed dataset loaded successfully with {len(dataset)} rows.")
281
+ print(f"Features from resumed dataset: {original_features}")
282
+ except Exception as e:
283
+ print(f"Warning: Error loading existing dataset from {OUTPUT_DATASET_DIR}: {e}")
284
+ traceback.print_exc()
285
+ print("Will attempt to load the original input dataset instead.")
286
+ dataset = None # Reset dataset variable
287
+ else:
288
+ print(f"No existing processed dataset found at {OUTPUT_DATASET_DIR}.")
289
+ print("Will attempt to load the original input dataset.")
290
+
291
+
292
+ # If dataset is still None (either output dir didn't exist or loading it failed), load from input
293
+ if dataset is None:
294
+ print(f"\nLoading original input dataset from: {INPUT_DATASET_DIR}")
295
+ if not os.path.exists(INPUT_DATASET_DIR):
296
+ print(f"FATAL: Original input dataset directory not found at {INPUT_DATASET_DIR}")
297
+ exit(1)
298
+ try:
299
+ dataset = load_from_disk(INPUT_DATASET_DIR)
300
+ original_features = dataset.features # Get features from the input dataset
301
+ print(f"Original input dataset loaded successfully with {len(dataset)} rows.")
302
+ print(f"Features from input dataset: {original_features}")
303
+ except Exception as e:
304
+ print(f"FATAL: Error loading original input dataset from {INPUT_DATASET_DIR}: {e}")
305
+ traceback.print_exc()
306
+ exit(1)
307
+
308
+ # --- Ensure dataset was loaded ---
309
+ if dataset is None or original_features is None:
310
+ print("FATAL: Failed to load any dataset. Exiting.")
311
+ exit(1)
312
+ # --- End Dataset Loading Modification ---
313
+
314
+
315
+ # --- Pre-computation Step: Identify and Prioritize Tasks ---
316
+ print("\n" + "="*30)
317
+ print("STEP 2: Identifying Tasks to Process")
318
+ print("="*30)
319
+ # This logic runs on whichever dataset was loaded above (the original input
+ # or the partially processed output) and identifies tasks where the model
+ # slot is 'minicpm' and the response text is still missing.
322
+ pkusafe_tasks_indices = []
323
+ other_tasks_indices = []
324
+
325
+ # Iterate through the loaded dataset structure
326
+ for idx, row in enumerate(dataset):
327
+ source_dataset = row.get('source_dataset')
328
+ processed_in_row = False # Flag to ensure we only pick one slot per row initially
329
+ for i in range(1, 4): # Check slots 1, 2, 3
330
+ model_key = f"model_{i}"
331
+ response_text_key = f"response_text_{i}"
332
+ # Check if the slot is assigned to minicpm and is NOT yet filled
333
+ is_minicpm_task = row.get(model_key) == MINICPMO_MODEL_NAME
334
+ # Crucially, check if the response text field is missing or empty in the loaded data
335
+ is_unfilled = not row.get(response_text_key) # True if None or empty string
336
+
337
+ if is_minicpm_task and is_unfilled and not processed_in_row:
338
+ task_info = (idx, i) # Store tuple of (original_row_index, slot_index)
339
+ if source_dataset == 'pkusafe':
340
+ pkusafe_tasks_indices.append(task_info)
341
+ else:
342
+ other_tasks_indices.append(task_info)
343
+ processed_in_row = True # Mark as processed for this row for task identification
344
+
345
+ # Combine lists, prioritizing pkusafe
346
+ tasks_to_process_indices = pkusafe_tasks_indices + other_tasks_indices
347
+ total_tasks_to_process = len(tasks_to_process_indices)
348
+
349
+ print(f"Found {len(pkusafe_tasks_indices)} 'pkusafe' tasks and {len(other_tasks_indices)} other tasks requiring '{MINICPMO_MODEL_NAME}' processing in the loaded dataset.")
350
+ print(f"Total tasks remaining to process: {total_tasks_to_process}")
351
+
352
+ if total_tasks_to_process == 0:
353
+ print("\nNo remaining tasks to process for MiniCPM-o based on the loaded dataset.")
354
+ # Optionally, perform a final save here if you want ensure the output dir reflects the 'completed' state
355
+ # print("Performing a final save to ensure consistency...")
356
+ # final_data_list = [dict(row) for row in dataset] # Convert dataset rows back to dicts
357
+ # fallback_save_dir_final = os.path.join(os.path.dirname(OUTPUT_DATASET_DIR), "minicpmo_checkpoints_fallback")
358
+ # save_checkpoint(final_data_list, original_features, OUTPUT_DATASET_DIR, fallback_save_dir_final)
359
+ print("Exiting.")
360
+ exit(0)
361
+ # --- End Pre-computation Step ---
362
+
363
+
364
+ # --- Load Model (Only if tasks exist) ---
365
+ print("\n" + "="*30)
366
+ print("STEP 3: Loading Model")
367
+ print("="*30)
368
+ print(f"Loading MiniCPM-o model ({MINICPMO_HF_ID}) and tokenizer...")
369
+ try:
370
+ model = AutoModel.from_pretrained(
371
+ MINICPMO_HF_ID,
372
+ trust_remote_code=True,
373
+ attn_implementation=MINICPMO_ATTN_IMPL,
374
+ torch_dtype=MINICPMO_DTYPE
375
+ )
376
+ model = model.eval().to(MINICPMO_DEVICE)
377
+ tokenizer = AutoTokenizer.from_pretrained(MINICPMO_HF_ID, trust_remote_code=True)
378
+
379
+ print("Initializing TTS...")
380
+ model.init_tts()
381
+ model.tts.float() # Use float32 for TTS stability
382
+ print(f"Model and TTS initialized successfully on {MINICPMO_DEVICE}.")
383
+ except Exception as e:
384
+ print(f"Error loading MiniCPM-o model or tokenizer: {e}")
385
+ traceback.print_exc()
386
+ exit(1)
387
+
388
+
389
+ # --- Prepare for Processing ---
390
+ # Create output directory for MiniCPM-o audio if it doesn't exist
391
+ os.makedirs(OUTPUT_AUDIO_ROOT_DIR, exist_ok=True)
392
+ # Ensure the main output dataset directory exists for saving checkpoints
393
+ os.makedirs(OUTPUT_DATASET_DIR, exist_ok=True)
394
+ # Define and create fallback directory
395
+ fallback_save_dir = os.path.join(os.path.dirname(OUTPUT_DATASET_DIR), "minicpmo_checkpoints_fallback")
396
+ os.makedirs(fallback_save_dir, exist_ok=True)
397
+
398
+
399
+ # Create a mutable list of dictionaries from the loaded dataset for updates
400
+ # This is crucial as Hugging Face datasets are typically immutable
401
+ updated_data = [dict(row) for row in dataset] # Convert each row to a dictionary
402
+
403
+ tasks_processed_count = 0 # Count successful completions for average time calculation
404
+ start_total_time = time.time()
405
+
406
+ print("\n" + "="*30)
407
+ print(f"STEP 4: Starting MiniCPM-o Processing for {total_tasks_to_process} Tasks")
408
+ print("="*30)
409
+ # Use tqdm for the progress bar, iterating over the identified task indices
410
+ pbar = tqdm(enumerate(tasks_to_process_indices), total=total_tasks_to_process, desc="Processing MiniCPM-o Tasks")
411
+ for loop_idx, (row_idx, slot_i) in pbar:
412
+ # Get the row data *from our mutable list* using the original index
413
+ row = updated_data[row_idx] # This is already a dictionary
414
+
415
+ # Set description in tqdm dynamically
416
+ pbar.set_description(f"Processing Row {row_idx}, Slot {slot_i}")
417
+
418
+ prompt_text_key = f"prompt_text_{slot_i}"
419
+ response_text_key = f"response_text_{slot_i}"
420
+ response_audio_key = f"response_audio_path_{slot_i}"
421
+ model_key = f"model_{slot_i}" # Get model key for verification
422
+
423
+ # --- Sanity Check: Ensure this is still a valid MiniCPM-o task ---
424
+ # (This might be redundant if identification was perfect, but good for safety)
425
+ if row.get(model_key) != MINICPMO_MODEL_NAME:
426
+ tqdm.write(f" Skipping Row {row_idx}, Slot {slot_i}: Model is no longer '{MINICPMO_MODEL_NAME}'.")
427
+ continue
428
+ if row.get(response_text_key): # Check again if it got filled somehow concurrently (unlikely here)
429
+ tqdm.write(f" Skipping Row {row_idx}, Slot {slot_i}: Already has response text '{str(row.get(response_text_key))[:50]}...'.")
430
+ continue
431
+
432
+ # --- Prepare Model Inputs ---
433
+ prompt_text = row.get(prompt_text_key, "")
434
+ question_audio_path = row.get('question_audio')
435
+ metadata_str = row.get('metadata', "{}")
436
+ source_dataset = row.get('source_dataset') # Used for history parsing
437
+
438
+ # Basic check for essential input audio
439
+ if not question_audio_path or not os.path.exists(question_audio_path):
440
+ tqdm.write(f" Error: Input audio path missing or invalid for Row {row_idx}: '{question_audio_path}'. Skipping model call.")
441
+ # Update the specific row in the list (mark as failed/skipped)
442
+ updated_data[row_idx][response_text_key] = "[ERROR: Missing Input Audio]"
443
+ updated_data[row_idx][response_audio_key] = None
444
+ continue # Move to the next task
445
+
446
+ # Parse History
447
+ history_messages = []
448
+ if source_dataset == 'ultra' and metadata_str:
449
+ try:
450
+ metadata = json.loads(metadata_str)
451
+ history_str = metadata.get('history', '')
452
+ if history_str:
453
+ history_messages = parse_ultra_history(history_str)
454
+ except json.JSONDecodeError:
455
+ tqdm.write(f" Warning: Could not parse metadata JSON for row {row_idx}")
456
+ except Exception as hist_e:
457
+ tqdm.write(f" Warning: Error processing history for row {row_idx}: {hist_e}")
458
+ # Add elif blocks here if other datasets have different history formats in metadata
459
+
460
+ # Generate unique output audio filename
461
+ unique_id = str(uuid.uuid4())
462
+ output_audio_filename = f"minicpmo_row{row_idx}_slot{slot_i}_{unique_id}.{OUTPUT_AUDIO_FORMAT}"
463
+ output_audio_filepath = os.path.join(OUTPUT_AUDIO_ROOT_DIR, output_audio_filename)
464
+
465
+ # --- Call Model ---
466
+ # tqdm.write(f" Calling model for Row {row_idx}, Slot {slot_i} (Source: {source_dataset}). Output: {output_audio_filepath}") # More verbose
467
+ call_start_time = time.time()
468
+ response_text, saved_audio_path = call_minicpmo_model(
469
+ model,
470
+ tokenizer,
471
+ history_messages,
472
+ prompt_text,
473
+ question_audio_path,
474
+ output_audio_filepath
475
+ )
476
+ call_end_time = time.time()
477
+ tqdm.write(f" Row {row_idx}, Slot {slot_i}: Finished in {call_end_time - call_start_time:.2f}s. Text: '{str(response_text)[:50]}...', Audio: {os.path.basename(str(saved_audio_path))}")
478
+
479
+
480
+ # Store results directly into the list item (updated_data)
481
+ updated_data[row_idx][response_text_key] = response_text if response_text is not None else "[ERROR: Model Call Failed]"
482
+ updated_data[row_idx][response_audio_key] = saved_audio_path # Will be None if audio saving/generation failed
483
+
484
+ if response_text is not None and saved_audio_path is not None: # Count as successfully processed only if both text and audio are generated
485
+ tasks_processed_count += 1
486
+
487
+ # --- Periodic Saving ---
488
+ # Save after processing N samples (using loop_idx + 1 because index is 0-based)
489
+ # Also save on the very last iteration
490
+ processed_count_in_loop = loop_idx + 1
491
+ if processed_count_in_loop % SAVE_EVERY_N_SAMPLES == 0 or processed_count_in_loop == total_tasks_to_process:
492
+ save_checkpoint(updated_data, original_features, OUTPUT_DATASET_DIR, fallback_save_dir)
493
+
494
+ # Optional small delay if needed for hardware cooling, etc.
495
+ # time.sleep(0.1)
496
+
497
+
498
+ # --- Final Summary ---
499
+ end_total_time = time.time()
500
+ print("\n" + "="*30)
501
+ print("STEP 5: Processing Complete - Summary")
502
+ print("="*30)
503
+ print(f"Total tasks identified for processing in this run: {total_tasks_to_process}")
504
+ print(f"Total tasks successfully processed (generated text & audio) in this run: {tasks_processed_count}")
505
+ total_duration = end_total_time - start_total_time
506
+ print(f"Total processing time for this run: {format_time(total_duration)}")
507
+ if tasks_processed_count > 0:
508
+ avg_time = total_duration / tasks_processed_count
509
+ print(f"Average time per successfully processed task in this run: {avg_time:.2f} seconds")
510
+ else:
511
+ print("Average time per task: N/A (no tasks successfully processed in this run)")
512
+
513
+ # --- Final Save ---
514
+ # This ensures the very last state is saved, even if the last iteration didn't trigger the periodic save exactly.
515
+ # It might be redundant if SAVE_EVERY_N_SAMPLES aligns perfectly, but it's safe to include.
516
+ print("\nPerforming final save of the dataset...")
517
+ save_checkpoint(updated_data, original_features, OUTPUT_DATASET_DIR, fallback_save_dir)
518
+
519
+ print("\nScript finished.")
r1-a/response_generation/minicpm/MiniCPM-o/.gitignore ADDED
@@ -0,0 +1,3 @@
1
+ *.bk
2
+ __pycache__
3
+ .DS_Store
r1-a/response_generation/minicpm/MiniCPM-o/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2024 OpenBMB
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
r1-a/response_generation/minicpm/MiniCPM-o/README.md ADDED
The diff for this file is too large to render. See raw diff
 
r1-a/response_generation/minicpm/MiniCPM-o/README_zh.md ADDED
@@ -0,0 +1,2524 @@
+ <div align="center">
+
+ <img src="./assets/MiniCPM-o.png" width="300em" ></img>
+
+ **A GPT-4o Level MLLM for Vision, Speech, and Multimodal Live Streaming on End-side Devices**
+
+ <strong>Chinese |
+ [English](./README.md)</strong>
+
+
+
+ <span style="display: inline-flex; align-items: center; margin-right: 2px;">
+ <a href="docs/wechat.md" target="_blank"> WeChat Community</a> &nbsp;|
+ </span>
+ <span style="display: inline-flex; align-items: center; margin-left: 2px;">
+ MiniCPM-V <a href="docs/best_practice_summary_zh.md" target="_blank">&nbsp; 📖 Best Practices</a>
+ </span>
+
+
+ <p align="center">
+ MiniCPM-o 2.6 <a href="https://huggingface.co/openbmb/MiniCPM-o-2_6">🤗</a> <a href="https://minicpm-omni-webdemo-us.modelbest.cn/"> 🤖</a> | MiniCPM-V 2.6 <a href="https://huggingface.co/openbmb/MiniCPM-V-2_6">🤗</a> <a href="http://120.92.209.146:8887/">🤖</a> |
+ 📄 Technical Report [<a href="https://openbmb.notion.site/MiniCPM-o-2-6-GPT-4o-188ede1b7a558084b3aedd669cb80730">Chinese</a>/<a href="https://openbmb.notion.site/MiniCPM-o-2-6-A-GPT-4o-Level-MLLM-for-Vision-Speech-and-Multimodal-Live-Streaming-on-Your-Phone-185ede1b7a558042b5d5e45e6b237da9">English</a>]
+ </p>
+
+ </div>
+
+
+ **MiniCPM-o** is the latest series of end-side multimodal LLMs upgraded from MiniCPM-V. The models in this series accept images, video, text, and audio as input and produce high-quality text and speech output in an end-to-end fashion. Aiming at strong performance and efficient deployment, we have released six versions of the model since February 2024. The most notable models in the series currently include:
+
+
+ - **MiniCPM-o 2.6**: 🔥🔥🔥 The latest and most capable model in the MiniCPM-o series. With a total of 8B parameters, it **reaches GPT-4o-202405-level vision, speech, and multimodal live-streaming capabilities**, making it one of the best-performing and most modality-rich models in the open-source community. In the new voice mode, MiniCPM-o 2.6 **supports bilingual (Chinese/English) speech conversation with configurable voices, along with advanced abilities such as emotion/speed/style control, end-to-end voice cloning, and role play**. It also further improves the **OCR, trustworthy behavior, multilingual support, and video-understanding capabilities** of MiniCPM-V 2.6. Thanks to its leading visual token density, MiniCPM-o 2.6 is **the first MLLM to support multimodal live streaming on end-side devices such as the iPad**.
+
+ - **MiniCPM-V 2.6**: The best-performing model in the MiniCPM-V series. With a total of 8B parameters, it **surpasses GPT-4V** on single-image, multi-image, and video understanding. It outperforms **GPT-4o mini, Gemini 1.5 Pro, and Claude 3.5 Sonnet** on single-image understanding, and was the first MLLM to support real-time video understanding on end-side devices such as the iPad.
+
+
+ ## Changelog <!-- omit in toc -->
+
+ #### 📌 Pinned
+
+ * [2025.03.01] 🚀🚀🚀 RLAIF-V, the alignment technique behind the MiniCPM-o series, was accepted by CVPR 2025! Its [code](https://github.com/RLHF-V/RLAIF-V), [data](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset), and [paper](https://arxiv.org/abs/2405.17220) are all open-sourced.
+
+ * [2025.01.24] 📢📢📢 The MiniCPM-o 2.6 technical report is released! Read it [here](https://openbmb.notion.site/MiniCPM-o-2-6-A-GPT-4o-Level-MLLM-for-Vision-Speech-and-Multimodal-Live-Streaming-on-Your-Phone-185ede1b7a558042b5d5e45e6b237da9).
+
+ * [2025.01.23] 💡💡💡 MiniCPM-o 2.6 is now integrated into [Align-Anything](https://github.com/PKU-Alignment/align-anything), a framework developed by the PKU team for aligning omni-modal LLMs, with support for DPO and SFT fine-tuning on the vision and audio modalities. Give it a try!
+
+ * [2025.01.19] 📢 **Attention!** We are working on merging MiniCPM-o 2.6 support into the official repositories of llama.cpp, ollama, and vLLM, but this is not finished yet. For now, please deploy with our forks: [llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-omni/examples/llava/README-minicpmo2.6.md), [ollama](https://github.com/OpenBMB/ollama/blob/minicpm-v2.6/examples/minicpm-v2.6/README.md), [vllm](https://github.com/OpenBMB/MiniCPM-o?tab=readme-ov-file#efficient-inference-with-llamacpp-ollama-vllm). **Using the official repositories before the merge is complete may lead to unexpected issues.**
+
+ * [2025.01.19] ⭐️⭐️⭐️ MiniCPM-o reached #1 on GitHub Trending and #2 on Hugging Face Trending!
+
+ * [2025.01.17] We updated the usage of the MiniCPM-o 2.6 int4 quantized version and fixed the model-initialization issue. Try it [here](https://huggingface.co/openbmb/MiniCPM-o-2_6-int4)!
+
+ * [2025.01.13] 🔥🔥🔥 We open-sourced MiniCPM-o 2.6, which reaches GPT-4o-202405-level vision, speech, and multimodal live-streaming capabilities, further improves the many highlight features of MiniCPM-V 2.6, and adds many fun new capabilities. Give it a try!
+
+ * [2024.08.17] 🚀🚀🚀 The llama.cpp [official repository](https://github.com/ggerganov/llama.cpp) now officially supports MiniCPM-V 2.6! See GGUF versions in various sizes [here](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf).
+
+ * [2024.08.06] 🔥🔥🔥 We open-sourced MiniCPM-V 2.6, which outperforms GPT-4V on single-image, multi-image, and video understanding. It further improves the highlight features of MiniCPM-Llama3-V 2.5 and, for the first time, supports real-time video understanding on the iPad. Give it a try!
+
+ * [2024.08.03] The MiniCPM-Llama3-V 2.5 technical report is released! Read it [here](https://arxiv.org/abs/2408.01800).
+
+ * [2024.05.23] 🔥🔥🔥 MiniCPM-V reached #1 on GitHub Trending and Hugging Face Trending! The MiniCPM-Llama3-V 2.5 demo was recommended by Hugging Face's official Gradio account. Try it [here](https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5)!
+
+
+ <br>
+
+ <details>
+ <summary>Click to view the full changelog.</summary>
+
+ * [2024.08.15] MiniCPM-V 2.6 now supports multi-image SFT. See the [fine-tuning documentation](https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune) for details.
+ * [2024.08.14] MiniCPM-V 2.6 can now be [fine-tuned](https://github.com/modelscope/ms-swift/issues/1613) with the SWIFT framework!
+ * [2024.08.10] 🚀🚀🚀 The llama.cpp [official repository](https://github.com/ggerganov/llama.cpp) now officially supports MiniCPM-Llama3-V 2.5! See GGUF versions in various sizes [here](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf/tree/main).
+ * [2024.07.19] MiniCPM-Llama3-V 2.5 now supports [vLLM](#vllm-部署-)!
+ * [2024.06.03] You can now run GPU serial inference across multiple low-VRAM GPUs (12G/16G). See this [documentation](https://github.com/OpenBMB/MiniCPM-V/blob/main/docs/inference_on_multiple_gpus.md) for configuration details.
+ * [2024.05.28] 💫 We now support LoRA fine-tuning for MiniCPM-Llama3-V 2.5; more memory-usage statistics can be found [here](https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune#model-fine-tuning-memory-usage-statistics).
+ * [2024.05.28] 💥 MiniCPM-Llama3-V 2.5 is now fully supported in llama.cpp and ollama! **Please pull our latest forks**: [llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-v2.5/examples/minicpmv/README.md) & [ollama](https://github.com/OpenBMB/ollama/tree/minicpm-v2.5/examples/minicpm-v2.5). We have also released GGUF versions in various sizes [here](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf/tree/main). Note that **the official repositories do not yet support MiniCPM-Llama3-V 2.5**; we are actively working on merging these features into the official llama.cpp & ollama repositories. Stay tuned!
+ * [2024.05.25] MiniCPM-Llama3-V 2.5 now [supports streaming output and custom system prompts](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5#usage). Give it a try!
+ * [2024.05.24] We released the MiniCPM-Llama3-V 2.5 [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf), which supports [llama.cpp](#llamacpp-部署) inference with smooth end-side decoding at 6-8 tokens/s. Give it a try!
+ * [2024.05.23] 🔍 We added a comprehensive comparison between Phi-3-vision-128k-instruct and MiniCPM-Llama3-V 2.5, covering benchmark evaluations, multilingual capability, and inference efficiency 🌟📊🌍🚀. See the details [here](./docs/compare_with_phi-3_vision.md).
+ * [2024.05.20] We open-sourced MiniCPM-Llama3-V 2.5 with stronger OCR capability and support for 30+ languages, bringing GPT-4V-level multimodal capability to end-side devices for the first time! We provide [efficient inference](#手机端部署) and [easy fine-tuning](./finetune/readme.md) support. Give it a try!
+ * [2024.04.23] MiniCPM-V 2.0 now supports [vLLM](#vllm-部署-). Give it a try!
+ * [2024.04.18] We added a MiniCPM-V 2.0 [demo](https://huggingface.co/spaces/openbmb/MiniCPM-V-2) on Hugging Face Space. Give it a try!
+ * [2024.04.17] MiniCPM-V 2.0 now supports deploying a local [WebUI demo](#本地webui-demo部署). Give it a try!
+ * [2024.04.15] MiniCPM-V 2.0 can now be [fine-tuned](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v-2最佳实践.md) with the SWIFT framework, with streaming output supported!
+ * [2024.04.12] We open-sourced MiniCPM-V 2.0, which sets a new state of the art for open-source models on OCRBench, matches Gemini Pro in scene-text recognition, and surpasses larger models such as Qwen-VL-Chat 10B, CogVLM-Chat 17B, and Yi-VL 34B on the <a href="https://rank.opencompass.org.cn/leaderboard-multimodal">OpenCompass</a> leaderboard, which aggregates 11 mainstream multimodal benchmarks! Read the MiniCPM-V 2.0 technical blog <a href="https://openbmb.vercel.app/minicpm-v-2">here</a>.
+ * [2024.03.14] MiniCPM-V now supports [fine-tuning](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v最佳实践.md) with the SWIFT framework. Thanks to [Jintao](https://github.com/Jintao-Huang) for the contribution!
+ * [2024.03.01] MiniCPM-V can now be deployed on Mac!
+ * [2024.02.01] We open-sourced MiniCPM-V and OmniLMM-12B, which support efficient end-side deployment and leading multimodal capability at comparable scale, respectively!
+ </details>
+
+
+ ## Table of Contents <!-- omit in toc -->
+
+ - [MiniCPM-o 2.6](#minicpm-o-26)
+ - [MiniCPM-V 2.6](#minicpm-v-26)
+ - [Chat with Our Demo on Gradio 🤗](#chat-with-our-demo-on-gradio-)
+ - [Inference](#推理)
+ - [Model Zoo](#模型库)
+ - [Multi-turn Chat](#多轮对话)
+ - [Multi-image Chat](#多图对话)
+ - [Few-shot In-context Chat](#少样本上下文对话)
+ - [Video Chat](#视频对话)
+ - [Speech Chat](#语音对话)
+ - [Mimick](#mimick)
+ - [Speech Chat with Configurable Voices](#可配置声音的语音对话)
+ - [More Speech Tasks](#更多语音任务)
+ - [Multimodal Live Streaming](#多模态流式交互)
+ - [Multi-GPU Inference](#多卡推理)
+ - [Inference on Mac](#mac-推理)
+ - [Efficient Inference with llama.cpp, ollama, vLLM](#基于-llamacppollamavllm-的高效推理)
+ - [Fine-tuning](#微调)
+ - [FAQs](#faqs)
+ - [Limitations](#模型局限性)
+
+ ## MiniCPM-o 2.6
+
+
+ MiniCPM-o 2.6 is the latest and most capable model in the MiniCPM-o series. Built on SigLip-400M, Whisper-medium-300M, ChatTTS-200M, and Qwen2.5-7B with a total of 8B parameters, it is trained and run in an end-to-end fashion. Compared with MiniCPM-V 2.6, it delivers a significant performance improvement and adds new capabilities for real-time speech conversation and multimodal live streaming. Notable features of MiniCPM-o 2.6 include:
+
+
+ - 🔥 **Leading visual capability.**
+ MiniCPM-o 2.6 achieves an average score of 70.2 on the OpenCompass leaderboard (a comprehensive evaluation over 8 mainstream multimodal benchmarks). **At the 8B scale, it surpasses mainstream proprietary models such as GPT-4o-202405, Gemini 1.5 Pro, and Claude 3.5 Sonnet on single-image understanding.** It also **outperforms GPT-4V and Claude 3.5 Sonnet** on multi-image and video understanding, and shows strong in-context learning capability.
+
+ - 🎙 **Outstanding speech capability.**
+ MiniCPM-o 2.6 **supports bilingual (Chinese/English) real-time speech conversation with configurable voices**. It **outperforms GPT-4o-realtime** on speech-understanding tasks (such as ASR and STT) and achieves the **best speech-generation performance among open-source models** in both the semantic and acoustic evaluations of speech conversation. It also supports advanced abilities such as emotion/speed/style control, voice cloning, and role play.
+
+ - 🎬 **Strong multimodal live-streaming capability.**
+ As a new feature, MiniCPM-o 2.6 can **accept continuous video and audio streams and interact with users through real-time speech**. On StreamingBench, a comprehensive benchmark for real-time video understanding, omni-source (video and audio) understanding, and multimodal contextual understanding, MiniCPM-o 2.6 achieves the best result in the open-source community and **surpasses GPT-4o-202408 and Claude 3.5 Sonnet**.
+
+ - 💪 **Strong OCR capability and more.**
+ MiniCPM-o 2.6 further improves the many visual-understanding capabilities of MiniCPM-V 2.6. It can process images with any aspect ratio and up to 1.8 million pixels (e.g., 1344x1344). It achieves the **best result among models under 25B on OCRBench, surpassing proprietary models such as GPT-4o-202405**. Based on the latest [RLHF-V](https://rlhf-v.github.io/), [RLAIF-V](https://github.com/RLHF-V/RLAIF-V/), and [VisCPM](https://github.com/OpenBMB/VisCPM) techniques, it exhibits **trustworthy multimodal behavior**, surpasses GPT-4o and Claude 3.5 on MMHal-Bench, and supports **30+ languages** including English, Chinese, German, French, Italian, and Korean.
+
+ - 🚀 **Superior efficiency.**
+ In addition to its user-friendly size, MiniCPM-o 2.6 exhibits **state-of-the-art visual token density** (i.e., the number of pixels encoded by each visual token). **It needs only 640 tokens to process a 1.8-megapixel image, 75% fewer than most models.** This directly improves inference speed, first-token latency, memory usage, and power consumption, so MiniCPM-o 2.6 supports efficient **multimodal live streaming** on end-side devices such as the iPad.
+
+
+ - 💫 **Easy to use.**
+ MiniCPM-o 2.6 can be used in many ways: (1) [llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpm-omni/examples/llava/README-minicpmo2.6.md) for efficient CPU inference on local devices, (2) quantized models in [int4](https://huggingface.co/openbmb/MiniCPM-V-2_6-int4) and [GGUF](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) formats in 16 sizes, (3) [vLLM](#基于-llamacppollamavllm-的高效推理) for high-throughput, memory-efficient inference, (4) fine-tuning for new domains and tasks with the [LLaMA-Factory](./docs/llamafactory_train_and_infer.md) framework, (5) a quick local WebUI demo with [Gradio](#本地-webui-demo-), and (6) an online server-hosted [demo](https://minicpm-omni-webdemo-us.modelbest.cn/). A minimal Python inference sketch follows this list.
+
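+ To make this concrete, here is a minimal single-image chat sketch. It assumes the `trust_remote_code` chat interface documented on the Hugging Face model card; the exact loading flags and the `model.chat` signature should be checked against the model card, so treat the call below as illustrative rather than authoritative.
+
+ ```python
+ import torch
+ from PIL import Image
+ from transformers import AutoModel, AutoTokenizer
+
+ # Load the model with its custom remote code (assumed interface; see the model card).
+ model = AutoModel.from_pretrained(
+     "openbmb/MiniCPM-o-2_6",
+     trust_remote_code=True,
+     torch_dtype=torch.bfloat16,
+ ).eval().cuda()
+ tokenizer = AutoTokenizer.from_pretrained("openbmb/MiniCPM-o-2_6", trust_remote_code=True)
+
+ # A single-turn, single-image question.
+ image = Image.open("example.png").convert("RGB")
+ msgs = [{"role": "user", "content": [image, "What is in this image?"]}]
+
+ # The chat() helper comes from the model's remote code (assumed here).
+ answer = model.chat(msgs=msgs, tokenizer=tokenizer)
+ print(answer)
+ ```
+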
+ **Model architecture.**
+
+ - **End-to-end omni-modal architecture.** The encoder/decoder modules of the different modalities are connected and trained in an **end-to-end** fashion to fully exploit rich multimodal knowledge. The whole model is trained end to end with a cross-entropy (CE) loss.
+ - **Omni-modal streaming mechanism.** (1) We turn the offline encoders/decoders of the different modalities into online modules suitable for **streaming input/output**. (2) We design a **time-division multiplexing mechanism** for omni-modal stream processing in the LLM backbone: the parallel information streams of the different modalities are split and recombined into a periodic sequence of time slices (see the toy sketch below the architecture figure).
+ - **Configurable voice scheme.** We design a new multimodal system prompt that combines a traditional text system prompt with a **speech system prompt that specifies the model's voice**. At inference time, the voice style can be flexibly controlled through text or a speech sample, enabling advanced abilities such as end-to-end voice cloning and voice creation.
+
+ <div align="center">
+ <img src="./assets/minicpm-o-26-framework-v2.png" width=80%>
+ </div>
+
+ <br>
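+
+ To illustrate the time-division multiplexing idea, here is a toy sketch. It is not the model's actual implementation: the one-second period, the `<unit>` boundary marker, and the token representation are all illustrative assumptions.
+
+ ```python
+ def time_division_multiplex(video_stream, audio_stream, period=1.0):
+     """Chop two parallel (timestamp, token) streams into fixed time slices
+     and interleave them into one sequence an LLM can consume online."""
+     horizon = max(video_stream[-1][0], audio_stream[-1][0])
+     sequence, t = [], 0.0
+     while t <= horizon:
+         # gather the tokens of each modality that fall inside this slice
+         v = [tok for ts, tok in video_stream if t <= ts < t + period]
+         a = [tok for ts, tok in audio_stream if t <= ts < t + period]
+         # one periodic slice: a boundary marker, then video, then audio tokens
+         sequence += ["<unit>"] + v + a
+         t += period
+     return sequence
+
+ # Toy usage: two parallel streams become one interleaved sequence.
+ video = [(0.2, "v0"), (0.9, "v1"), (1.4, "v2")]
+ audio = [(0.1, "a0"), (0.5, "a1"), (1.2, "a2")]
+ print(time_division_multiplex(video, audio))
+ # ['<unit>', 'v0', 'v1', 'a0', 'a1', '<unit>', 'v2', 'a2']
+ ```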
+
+
+
+ ### Evaluation <!-- omit in toc -->
+
+ <div align="center">
+ <img src="./assets/radar.jpg" width=80%>
+ </div>
+
+ <details>
+ <summary>Click to view detailed evaluation results of visual understanding.</summary>
+
+ **Image understanding**
+
+ <div align="center">
+ <table style="margin: 0px auto;">
+ <thead>
+ <tr>
+ <th align="left">Model</th>
+ <th>Size</th>
+ <th>Token Density<sup>+</sup></th>
+ <th>OpenCompass</th>
+ <th>OCRBench</th>
+ <th>MathVista mini</th>
+ <th>ChartQA</th>
+ <th>MMVet</th>
+ <th>MMStar</th>
+ <th>MME</th>
+ <th>MMB1.1 test</th>
+ <th>AI2D</th>
+ <th>MMMU val</th>
+ <th>HallusionBench</th>
+ <th>TextVQA val</th>
+ <th>DocVQA test</th>
+ <th>MathVerse mini</th>
+ <th>MathVision</th>
+ <th>MMHal Score</th>
+ </tr>
+ </thead>
+ <tbody align="center">
+ <tr>
+ <td colspan="19" align="left"><strong>Proprietary</strong></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">GPT-4o-20240513</td>
+ <td>-</td>
+ <td>1088</td>
+ <td><u>69.9</u></td>
+ <td>736</td>
+ <td>61.3</td>
+ <td>85.7</td>
+ <td><strong>69.1</strong></td>
+ <td>63.9</td>
+ <td>2328.7</td>
+ <td>82.2</td>
+ <td>84.6</td>
+ <td><strong>69.2</strong></td>
+ <td><strong>55.0</strong></td>
+ <td>-</td>
+ <td>92.8</td>
+ <td><strong>50.2</strong></td>
+ <td><strong>30.4</strong></td>
+ <td><u>3.6</u></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">Claude3.5-Sonnet</td>
+ <td>-</td>
+ <td>750</td>
+ <td>67.9</td>
+ <td>788</td>
+ <td>61.6</td>
+ <td><strong>90.8</strong></td>
+ <td>66.0</td>
+ <td>62.2</td>
+ <td>1920.0</td>
+ <td>78.5</td>
+ <td>80.2</td>
+ <td><u>65.9</u></td>
+ <td>49.9</td>
+ <td>-</td>
+ <td><strong>95.2</strong></td>
+ <td>-</td>
+ <td>-</td>
+ <td>3.4</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">Gemini 1.5 Pro</td>
+ <td>-</td>
+ <td>-</td>
+ <td>64.4</td>
+ <td>754</td>
+ <td>57.7</td>
+ <td>81.3</td>
+ <td>64.0</td>
+ <td>59.1</td>
+ <td>2110.6</td>
+ <td>73.9</td>
+ <td>79.1</td>
+ <td>60.6</td>
+ <td>45.6</td>
+ <td>73.5</td>
+ <td>86.5</td>
+ <td>-</td>
+ <td>19.2</td>
+ <td>-</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">GPT-4o-mini-20240718</td>
+ <td>-</td>
+ <td>1088</td>
+ <td>64.1</td>
+ <td>785</td>
+ <td>52.4</td>
+ <td>-</td>
+ <td>66.9</td>
+ <td>54.8</td>
+ <td>2003.4</td>
+ <td>76.0</td>
+ <td>77.8</td>
+ <td>60.0</td>
+ <td>46.1</td>
+ <td>-</td>
+ <td>-</td>
+ <td>-</td>
+ <td>-</td>
+ <td>3.3</td>
+ </tr>
+ <tr>
+ <td colspan="19" align="left"><strong>Open Source</strong></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">Cambrian-34B</td>
+ <td>34B</td>
+ <td><u>1820</u></td>
+ <td>58.3</td>
+ <td>591</td>
+ <td>50.3</td>
+ <td>75.6</td>
+ <td>53.2</td>
+ <td>54.2</td>
+ <td>2049.9</td>
+ <td>77.8</td>
+ <td>79.5</td>
+ <td>50.4</td>
+ <td>41.6</td>
+ <td>76.7</td>
+ <td>75.5</td>
+ <td>-</td>
+ <td>-</td>
+ <td>-</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">GLM-4V-9B</td>
+ <td>13B</td>
+ <td>784</td>
+ <td>59.1</td>
+ <td>776</td>
+ <td>51.1</td>
+ <td>-</td>
+ <td>58.0</td>
+ <td>54.8</td>
+ <td>2018.8</td>
+ <td>67.9</td>
+ <td>71.2</td>
+ <td>46.9</td>
+ <td>45.0</td>
+ <td>-</td>
+ <td>-</td>
+ <td>-</td>
+ <td>-</td>
+ <td>-</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">Pixtral-12B</td>
+ <td>12B</td>
+ <td>256</td>
+ <td>61.0</td>
+ <td>685</td>
+ <td>56.9</td>
+ <td>81.8</td>
+ <td>58.5</td>
+ <td>54.5</td>
+ <td>-</td>
+ <td>72.7</td>
+ <td>79.0</td>
+ <td>51.1</td>
+ <td>47.0</td>
+ <td>75.7</td>
+ <td>90.7</td>
+ <td>-</td>
+ <td>-</td>
+ <td>-</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">DeepSeek-VL2-27B (4B)</td>
+ <td>27B</td>
+ <td>672</td>
+ <td>66.4</td>
+ <td>809</td>
+ <td>63.9</td>
+ <td>86.0</td>
+ <td>60.0</td>
+ <td>61.9</td>
+ <td>2253.0</td>
+ <td>81.2</td>
+ <td>83.8</td>
+ <td>54.0</td>
+ <td>45.3</td>
+ <td><u>84.2</u></td>
+ <td>93.3</td>
+ <td>-</td>
+ <td>-</td>
+ <td>3.0</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">Qwen2-VL-7B</td>
+ <td>8B</td>
+ <td>784</td>
+ <td>67.1</td>
+ <td><u>866</u></td>
+ <td>58.2</td>
+ <td>83.0</td>
+ <td>62.0</td>
+ <td>60.7</td>
+ <td>2326.0</td>
+ <td>81.8</td>
+ <td>83.0</td>
+ <td>54.1</td>
+ <td>50.6</td>
+ <td><strong>84.3</strong></td>
+ <td><u>94.5</u></td>
+ <td>31.9</td>
+ <td>16.3</td>
+ <td>3.2</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">LLaVA-OneVision-72B</td>
+ <td>72B</td>
+ <td>182</td>
+ <td>68.1</td>
+ <td>741</td>
+ <td>67.5</td>
+ <td>83.7</td>
+ <td>60.6</td>
+ <td><strong>65.8</strong></td>
+ <td>2261.0</td>
+ <td><strong>85.0</strong></td>
+ <td><u>85.6</u></td>
+ <td>56.8</td>
+ <td>49.0</td>
+ <td>80.5</td>
+ <td>91.3</td>
+ <td>39.1</td>
+ <td>-</td>
+ <td>3.5</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">InternVL2.5-8B</td>
+ <td>8B</td>
+ <td>706</td>
+ <td>68.3</td>
+ <td>822</td>
+ <td><u>64.4</u></td>
+ <td>84.8</td>
+ <td>62.8</td>
+ <td>62.8</td>
+ <td>2344.0</td>
+ <td><u>83.6</u></td>
+ <td>84.5</td>
+ <td>56.0</td>
+ <td>50.1</td>
+ <td>79.1</td>
+ <td>93.0</td>
+ <td>39.5</td>
+ <td>19.7</td>
+ <td>3.4</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
+ <td>8B</td>
+ <td><strong>2822</strong></td>
+ <td>65.2</td>
+ <td>852*</td>
+ <td>60.6</td>
+ <td>79.4</td>
+ <td>60.0</td>
+ <td>57.5</td>
+ <td><u>2348.4*</u></td>
+ <td>78.0</td>
+ <td>82.1</td>
+ <td>49.8*</td>
+ <td>48.1*</td>
+ <td>80.1</td>
+ <td>90.8</td>
+ <td>25.7</td>
+ <td>18.3</td>
+ <td>3.6</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
+ <td>8B</td>
+ <td><strong>2822</strong></td>
+ <td><strong>70.2</strong></td>
+ <td><strong>897*</strong></td>
+ <td><strong>71.9*</strong></td>
+ <td><u>86.9*</u></td>
+ <td><u>67.5</u></td>
+ <td><u>64.0</u></td>
+ <td><strong>2372.0*</strong></td>
+ <td>80.5</td>
+ <td><strong>85.8</strong></td>
+ <td>50.4*</td>
+ <td><u>51.9</u></td>
+ <td>82.0</td>
+ <td>93.5</td>
+ <td><u>41.4*</u></td>
+ <td><u>23.1*</u></td>
+ <td><strong>3.8</strong></td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ * We evaluate these benchmarks with chain-of-thought prompting; for MME, chain of thought is used only on the Cognition task.
+ + Token Density: the number of pixels encoded by each visual token at the maximum resolution, i.e., pixels at the maximum resolution / number of visual tokens.
+
+ Note: the Token Density of proprietary models is estimated from their API pricing.
+
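+ As a quick sanity check of the Token Density definition, the figures quoted in the feature list above reproduce the 2822 reported for MiniCPM-V 2.6 and MiniCPM-o 2.6:
+
+ ```python
+ # Token density = pixels at maximum resolution / number of visual tokens.
+ max_pixels = 1344 * 1344   # ~1.8 megapixels, the maximum resolution cited above
+ visual_tokens = 640        # tokens used for a 1.8-megapixel image, per the text
+ print(round(max_pixels / visual_tokens))  # 2822, matching the Token Density column
+ ```
+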
+ **Multi-image and video understanding**
+
+ <div align="center">
+
+ <table style="margin: 0px auto;">
+ <thead>
+ <tr>
+ <th align="left">Model</th>
+ <th>Size</th>
+ <th>BLINK val</th>
+ <th>Mantis Eval</th>
+ <th>MIRB</th>
+ <th>Video-MME (wo / w subs)</th>
+ </tr>
+ </thead>
+ <tbody align="center">
+ <tr>
+ <td colspan="6" align="left"><strong>Proprietary</strong></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">GPT-4o-20240513</td>
+ <td>-</td>
+ <td><strong>68</strong></td>
+ <td>-</td>
+ <td>-</td>
+ <td><strong>71.9/77.2</strong></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">GPT4V</td>
+ <td>-</td>
+ <td>54.6</td>
+ <td>62.7</td>
+ <td>53.1</td>
+ <td>59.9/63.3</td>
+ </tr>
+ <tr>
+ <td colspan="6" align="left"><strong>Open-source</strong></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">LLaVA-NeXT-Interleave 14B</td>
+ <td>14B</td>
+ <td>52.6</td>
+ <td>66.4</td>
+ <td>30.2</td>
+ <td>-</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">LLaVA-OneVision-72B</td>
+ <td>72B</td>
+ <td>55.4</td>
+ <td><strong>77.6</strong></td>
+ <td>-</td>
+ <td><u>66.2/69.5</u></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">MANTIS 8B</td>
+ <td>8B</td>
+ <td>49.1</td>
+ <td>59.5</td>
+ <td>34.8</td>
+ <td>-</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">Qwen2-VL-7B</td>
+ <td>8B</td>
+ <td>53.2</td>
+ <td>69.6*</td>
+ <td><strong>67.6*</strong></td>
+ <td>63.3/69.0</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">InternVL2.5-8B</td>
+ <td>8B</td>
+ <td>54.8</td>
+ <td>67.7</td>
+ <td>52.5</td>
+ <td>64.2/66.9</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
+ <td>8B</td>
+ <td>53</td>
+ <td>69.1</td>
+ <td>53.8</td>
+ <td>60.9/63.6</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
+ <td>8B</td>
+ <td><u>56.7</u></td>
+ <td><u>71.9</u></td>
+ <td><u>58.6</u></td>
+ <td>63.9/67.9</td>
+ </tr>
+ </tbody>
+ </table>
+
+ </div>
+ * Evaluation results of the officially released open-source model weights.
+
+ </details>
+
+
+ <details>
+ <summary>Click to view detailed evaluation results of speech understanding and generation.</summary>
+
+ **Speech understanding**
+
+ <div align="center">
+ <table style="margin: 0px auto;">
+ <thead>
+ <tr>
+ <th align="left">Task</th>
+ <th>Size</th>
+ <th colspan="3">ASR (zh)</th>
+ <th colspan="3">ASR (en)</th>
+ <th colspan="2">AST</th>
+ <th>Emotion</th>
+ </tr>
+ <tr>
+ <th align="left">Metric</th>
+ <td></td>
+ <th colspan="3">CER↓</th>
+ <th colspan="3">WER↓</th>
+ <th colspan="2">BLEU↑</th>
+ <th>ACC↑</th>
+ </tr>
+ <tr>
+ <th align="left">Dataset</th>
+ <td></td>
+ <th>AISHELL-1</th>
+ <th>Fleurs zh</th>
+ <th>WenetSpeech test-net</th>
+ <th>LibriSpeech test-clean</th>
+ <th>GigaSpeech</th>
+ <th>TED-LIUM</th>
+ <th>CoVoST en2zh</th>
+ <th>CoVoST zh2en</th>
+ <th>MELD emotion</th>
+ </tr>
+ </thead>
+ <tbody align="center">
+ <tr>
+ <td colspan="11" align="left"><strong>Proprietary</strong></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">GPT-4o-Realtime</td>
+ <td>-</td>
+ <td>7.3*</td>
+ <td><u>5.4*</u></td>
+ <td>28.9*</td>
+ <td>2.6*</td>
+ <td>12.9*</td>
+ <td>4.8*</td>
+ <td>37.1*</td>
+ <td>15.7*</td>
+ <td>33.2*</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">Gemini 1.5 Pro</td>
+ <td>-</td>
+ <td>4.5*</td>
+ <td>5.9*</td>
+ <td>14.3*</td>
+ <td>2.9*</td>
+ <td>10.6*</td>
+ <td><strong>3.0*</strong></td>
+ <td><u>47.3*</u></td>
+ <td>22.6*</td>
+ <td>48.4*</td>
+ </tr>
+ <tr>
+ <td colspan="11" align="left"><strong>Open-Source</strong></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">Qwen2-Audio-7B</td>
+ <td>8B</td>
+ <td>-</td>
+ <td>7.5</td>
+ <td>-</td>
+ <td><strong>1.6</strong></td>
+ <td>-</td>
+ <td>-</td>
+ <td>45.2</td>
+ <td><u>24.4</u></td>
+ <td><strong>55.3</strong></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">Qwen2-Audio-7B-Instruct</td>
+ <td>8B</td>
+ <td>2.6*</td>
+ <td>6.9*</td>
+ <td><u>10.3*</u></td>
+ <td>3.1*</td>
+ <td><u>9.7</u>*</td>
+ <td>5.9*</td>
+ <td>39.5*</td>
+ <td>22.9*</td>
+ <td>17.4*</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">GLM-4-Voice-Base</td>
+ <td>9B</td>
+ <td><u>2.5</u></td>
+ <td>-</td>
+ <td>-</td>
+ <td>2.8</td>
+ <td>-</td>
+ <td>-</td>
+ <td>-</td>
+ <td>-</td>
+ <td>-</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
+ <td>8B</td>
+ <td><strong>1.6</strong></td>
+ <td><strong>4.4</strong></td>
+ <td><strong>6.9</strong></td>
+ <td><u>1.7</u></td>
+ <td><strong>8.7</strong></td>
+ <td><strong>3.0</strong></td>
+ <td><strong>48.2</strong></td>
+ <td><strong>27.2</strong></td>
+ <td><u>52.4</u></td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ * Evaluation results of the officially released open-source model weights.<br><br>
+
+ **Speech generation**
+
+ <div align="center">
+ <table style="margin: 0px auto;">
+ <thead>
+ <tr>
+ <th align="left">Task</th>
+ <th>Size</th>
+ <th colspan="9">SpeechQA</th>
+ </tr>
+ <tr>
+ <th align="left">Metric</th>
+ <th></th>
+ <th colspan="3">ACC↑</th>
+ <th>G-Eval (10 point)↑</th>
+ <th>Semantic ELO score↑</th>
+ <th>Acoustic ELO score↑</th>
+ <th>Overall ELO score↑</th>
+ <th>UTMOS↑</th>
+ <th>ASR-WER↓</th>
+ </tr>
+ <tr>
+ <th align="left">Dataset</th>
+ <th></th>
+ <th>Speech Llama Q.</th>
+ <th>Speech Web Q.</th>
+ <th>Speech Trivia QA</th>
+ <th>Speech AlpacaEval</th>
+ <th colspan="5">AudioArena</th>
+ </tr>
+ </thead>
+ <tbody align="center">
+ <tr>
+ <td colspan="11" align="left"><strong>Proprietary</strong></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">GPT-4o-Realtime</td>
+ <td></td>
+ <td><strong>71.7</strong></td>
+ <td><strong>51.6</strong></td>
+ <td><strong>69.7</strong></td>
+ <td><strong>7.4</strong></td>
+ <td><strong>1157</strong></td>
+ <td><strong>1203</strong></td>
+ <td><strong>1200</strong></td>
+ <td><strong>4.2</strong></td>
+ <td><strong>2.3</strong></td>
+ </tr>
+ <tr>
+ <td colspan="11" align="left"><strong>Open-Source</strong></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">GLM-4-Voice</td>
+ <td>9B</td>
+ <td>50.0</td>
+ <td>32.0</td>
+ <td>36.4</td>
+ <td><u>5.1</u></td>
+ <td>999</td>
+ <td>1147</td>
+ <td>1035</td>
+ <td><u>4.1</u></td>
+ <td><u>11.7</u></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">Llama-Omni</td>
+ <td>8B</td>
+ <td>45.3</td>
+ <td>22.9</td>
+ <td>10.7</td>
+ <td>3.9</td>
+ <td>960</td>
+ <td>878</td>
+ <td>897</td>
+ <td>3.2</td>
+ <td>24.3</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">VITA-1.5</td>
+ <td>8B</td>
+ <td>46.7</td>
+ <td>28.1</td>
+ <td>23.3</td>
+ <td>2.0</td>
+ <td>-</td>
+ <td>-</td>
+ <td>-</td>
+ <td>-</td>
+ <td>-</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">Moshi</td>
+ <td>7B</td>
+ <td>43.7</td>
+ <td>23.8</td>
+ <td>16.7</td>
+ <td>2.4</td>
+ <td>871</td>
+ <td>808</td>
+ <td>875</td>
+ <td>2.8</td>
+ <td>8.2</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">Mini-Omni</td>
+ <td>1B</td>
+ <td>22.0</td>
+ <td>12.8</td>
+ <td>6.9</td>
+ <td>2.5</td>
+ <td>926</td>
+ <td>803</td>
+ <td>865</td>
+ <td>3.4</td>
+ <td>10.0</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
+ <td>8B</td>
+ <td><u>61.0</u></td>
+ <td><u>40.0</u></td>
+ <td><u>40.2</u></td>
+ <td><u>5.1</u></td>
+ <td><u>1088</u></td>
+ <td><u>1163</u></td>
+ <td><u>1131</u></td>
+ <td><strong>4.2</strong></td>
+ <td>9.8</td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+ All results are from <a href="https://github.com/OpenBMB/UltraEval-Audio" target="_blank">AudioEvals</a>.<br><br>
+
+ **End-to-end voice cloning**
+
+ <div align="center">
+ <table style="margin: 0px auto;">
+ <thead>
+ <tr>
+ <th align="left">Task</th>
+ <th colspan="2">TTS</th>
+ </tr>
+ <tr>
+ <th align="left">Metric</th>
+ <th>SIMO↑</th>
+ <th>SIMO↑</th>
+ </tr>
+ <tr>
+ <th align="left">Dataset</th>
+ <th>Seed-TTS test-zh</th>
+ <th>Seed-TTS test-en</th>
+ </tr>
+ </thead>
+ <tbody align="center">
+ <tr>
+ <td nowrap="nowrap" align="left">F5-TTS</td>
+ <td><strong>76</strong></td>
+ <td><strong>67</strong></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">CosyVoice</td>
+ <td><u>75</u></td>
+ <td><u>64</u></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">FireRedTTS</td>
+ <td>63</td>
+ <td>46</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
+ <td>57</td>
+ <td>47</td>
+ </tr>
+ </tbody>
+ </table>
+ </div>
+
+ </details>
+
+ <details>
+ <summary>Click to view detailed evaluation results of multimodal live streaming.</summary>
+
+ **Multimodal live streaming**: results on StreamingBench
+
+ <table style="margin: 0px auto;">
+ <thead>
+ <tr>
+ <th align="left">Model</th>
+ <th>Size</th>
+ <th>Real-Time Video Understanding</th>
+ <th>Omni-Source Understanding</th>
+ <th>Contextual Understanding</th>
+ <th>Overall</th>
+ </tr>
+ </thead>
+ <tbody align="center">
+ <tr>
+ <td colspan="6" align="left"><strong>Proprietary</strong></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">Gemini 1.5 Pro</td>
+ <td>-</td>
+ <td><u>77.4</u></td>
+ <td><strong>67.8</strong></td>
+ <td><strong>51.1</strong></td>
+ <td><strong>70.3</strong></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">GPT-4o-202408</td>
+ <td>-</td>
+ <td>74.5</td>
+ <td>51.0</td>
+ <td><u>48.0</u></td>
+ <td>64.1</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">Claude-3.5-Sonnet</td>
+ <td>-</td>
+ <td>74.0</td>
+ <td>41.4</td>
+ <td>37.8</td>
+ <td>59.7</td>
+ </tr>
+ <tr>
+ <td colspan="6" align="left"><strong>Open-source</strong></td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">VILA-1.5</td>
+ <td>8B</td>
+ <td>61.5</td>
+ <td>37.5</td>
+ <td>26.7</td>
+ <td>49.5</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">LongVA</td>
+ <td>7B</td>
+ <td>63.1</td>
+ <td>35.9</td>
+ <td>30.2</td>
+ <td>50.7</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">LLaVA-Next-Video-34B</td>
+ <td>34B</td>
+ <td>69.8</td>
+ <td>41.7</td>
+ <td>34.3</td>
+ <td>56.7</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">Qwen2-VL-7B</td>
+ <td>8B</td>
+ <td>71.2</td>
+ <td>40.7</td>
+ <td>33.1</td>
+ <td>57.0</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">InternVL2-8B</td>
+ <td>8B</td>
+ <td>70.1</td>
+ <td>42.7</td>
+ <td>34.1</td>
+ <td>57.0</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">VITA-1.5</td>
+ <td>8B</td>
+ <td>70.9</td>
+ <td>40.8</td>
+ <td>35.8</td>
+ <td>57.4</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">LLaVA-OneVision-7B</td>
+ <td>8B</td>
+ <td>74.3</td>
+ <td>40.8</td>
+ <td>31.0</td>
+ <td>58.4</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">InternLM-XC2.5-OL-7B</td>
+ <td>8B</td>
+ <td>75.4</td>
+ <td>46.2</td>
+ <td>33.6</td>
+ <td>60.8</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
+ <td>8B</td>
+ <td>72.4</td>
+ <td>40.2</td>
+ <td>33.4</td>
+ <td>57.7</td>
+ </tr>
+ <tr>
+ <td nowrap="nowrap" align="left">MiniCPM-o 2.6</td>
+ <td>8B</td>
+ <td><strong>79.9</strong></td>
+ <td><u>53.4</u></td>
+ <td>38.5</td>
+ <td><u>66.0</u></td>
+ </tr>
+ </tbody>
+ </table>
+
+ </details>
+
+
+ ### 典型示例 <!-- omit in toc -->
1021
+
1022
+ 以下为 MiniCPM-o 2.6 的 iPad Pro 实机演示和 web demo 演示样例:
1023
+
1024
+
1025
+ <div align="center">
1026
+ <a href="https://www.youtube.com/watch?v=vRIMbxJzStY&t=2s"><img src="./assets/minicpmo2_6/2dot6_o_demo_video_img.png", width=70%></a>
1027
+ </div>
1028
+ <br>
1029
+
1030
+
1031
+
1032
+ <div style="display: flex; flex-direction: column; align-items: center;">
1033
+ <img src="assets/minicpmo2_6/minicpmo2_6_math_intersect.png" alt="math" style="margin-bottom: 5px;">
1034
+ <img src="assets/minicpmo2_6/minicpmo2_6_diagram_train_NN.png" alt="diagram" style="margin-bottom: 5px;">
1035
+ <img src="assets/minicpmo2_6/minicpmo2_6_multi-image_bike.png" alt="bike" style="margin-bottom: 5px;">
1036
+ </div>
1037
+
1038
+
1039
+ <details>
1040
+ <summary>Click to view more details of MiniCPM-V 2.6</summary>
1041
+
1042
+
1043
+ ## MiniCPM-V 2.6
1044
+
1045
+ **MiniCPM-V 2.6** 是 MiniCPM-V 系列中最新、性能最佳的模型。该模型基于 SigLip-400M 和 Qwen2-7B 构建,共 8B 参数。与 MiniCPM-Llama3-V 2.5 相比,MiniCPM-V 2.6 性能提升显著,并引入了多图和视频理解的新功能。MiniCPM-V 2.6 的主要特点包括:
1046
+
1047
+
1048
+ - 🔥 **领先的性能。**
1049
+ MiniCPM-V 2.6 在最新版本 OpenCompass 榜单上(综合 8 个主流多模态评测基准)平均得分 65.2,**以8B量级的大小在单图理解方面超越了 GPT-4o mini、GPT-4V、Gemini 1.5 Pro 和 Claude 3.5 Sonnet 等主流商用闭源多模态大模型**。
1050
+
1051
+ - 🖼️ **多图理解和上下文学习。**
1052
+ MiniCPM-V 2.6 还支持**多图对话和推理**。它在 Mantis-Eval、BLINK、Mathverse mv 和 Sciverse mv 等主流多图评测基准中取得了**最佳水平**,并展现出了优秀的上下文学习能力。
1053
+
1054
+ - 🎬 **视频理解。**
1055
+ MiniCPM-V 2.6 还可以**接受视频输入**,进行对话和提供涵盖时序和空间信息的详细视频描述。模型在 有/无字幕 评测场景下的 Video-MME 表现均超过了 **GPT-4V、Claude 3.5 Sonnet 和 LLaVA-NeXT-Video-34B**等商用闭源模型。
1056
+
1057
+ - 💪 **强大的 OCR 能力及其他功能。**
1058
+ MiniCPM-V 2.6 可以处理任意长宽比的图像,像素数可达 180 万(如 1344x1344)。在 OCRBench 上取得**最佳水平,超过 GPT-4o、GPT-4V 和 Gemini 1.5 Pro 等商用闭源模型**。基于最新的 [RLAIF-V](https://github.com/RLHF-V/RLAIF-V/) 和 [VisCPM](https://github.com/OpenBMB/VisCPM) 技术,其具备了**可信的多模态行为**,在 Object HalBench 上的幻觉率显著低于 GPT-4o 和 GPT-4V,并支持英语、中文、德语、法语、意大利语、韩语等**多种语言**。
1059
+
1060
+ - 🚀 **卓越的效率。**
1061
+ 除了对个人用户友好的模型大小,MiniCPM-V 2.6 还表现出**最先进的视觉 token 密度**(即每个视觉 token 编码的像素数量)。它**仅需 640 个 token 即可处理 180 万像素图像,比大多数模型少 75%**。这一特性优化了模型的推理速度、首 token 延迟、内存占用和功耗。因此,MiniCPM-V 2.6 可以支持 iPad 等终端设备上的高效**实时视频理解**。
1062
+
1063
+ - 💫 **易于使用。**
1064
+ MiniCPM-V 2.6 可以通过多种方式轻松使用:(1) [llama.cpp](https://github.com/OpenBMB/llama.cpp/blob/minicpmv-main/examples/llava/README-minicpmv2.6.md) 和 [ollama](https://github.com/OpenBMB/ollama/blob/minicpm-v2.6/examples/minicpm-v2.6/README.md) 支持在本地设备上进行高效的 CPU 推理,(2) [int4](https://huggingface.co/openbmb/MiniCPM-V-2_6-int4) 和 [GGUF](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) 格式的量化模型,有 16 种尺寸,(3) [vLLM](#vllm-部署-) 支持高吞吐量和内存高效的推理,(4) 针对新领域和任务进行微调,(5) 使用 [Gradio](#本地-webui-demo-) 快速设置本地 WebUI 演示,(6) 在线[demo](http://120.92.209.146:8887/)即可体验。
1065
+
1066
+ ### 性能评估 <!-- omit in toc -->
1067
+ <div align="center">
1068
+ <img src=assets/radar_final.png width=90% />
1069
+ </div>
1070
+
1071
+ <details>
1072
+ <summary>点击查看 OpenCompass, MME, MMVet, OCRBench, MMMU, MathVista, MMB, AI2D, TextVQA, DocVQA, HallusionBench, Object HalBench 上的单图评测结果详情。 </summary>
1073
+ <div align="center">
1074
+
1075
+ <table style="margin: 0px auto;">
1076
+ <thead>
1077
+ <tr>
1078
+ <th align="left">Model</th>
1079
+ <th>Size</th>
1080
+ <th>Token Density<sup>+</sup></th>
1081
+ <th>OpenCompass</th>
1082
+ <th>MME</th>
1083
+ <th>MMVet</th>
1084
+ <th>OCRBench</th>
1085
+ <th>MMMU val</th>
1086
+ <th>MathVista mini</th>
1087
+ <th>MMB1.1 test</th>
1088
+ <th>AI2D</th>
1089
+ <th>TextVQA val</th>
1090
+ <th>DocVQA test</th>
1091
+ <th>HallusionBench</th>
1092
+ <th>Object HalBench</th>
1093
+ </tr>
1094
+ </thead>
1095
+ <tbody align="center">
1096
+ <tr>
1097
+ <td colspan="15" align="left"><strong>Proprietary</strong></td>
1098
+ </tr>
1099
+ <tr>
1100
+ <td nowrap="nowrap" align="left">GPT-4o</td>
1101
+ <td>-</td>
1102
+ <td>1088</td>
1103
+ <td>69.9</td>
1104
+ <td>2328.7</td>
1105
+ <td>69.1</td>
1106
+ <td>736</td>
1107
+ <td>69.2</td>
1108
+ <td>61.3</td>
1109
+ <td>82.2</td>
1110
+ <td>84.6</td>
1111
+ <td>-</td>
1112
+ <td>92.8</td>
1113
+ <td>55.0</td>
1114
+ <td>17.6</td>
1115
+ </tr>
1116
+ <tr>
1117
+ <td nowrap="nowrap" align="left">Claude 3.5 Sonnet</td>
1118
+ <td>-</td>
1119
+ <td>750</td>
1120
+ <td>67.9</td>
1121
+ <td>1920.0</td>
1122
+ <td>66.0</td>
1123
+ <td>788</td>
1124
+ <td>65.9</td>
1125
+ <td>61.6</td>
1126
+ <td>78.5</td>
1127
+ <td>80.2</td>
1128
+ <td>-</td>
1129
+ <td>95.2</td>
1130
+ <td>49.9</td>
1131
+ <td>13.8</td>
1132
+ </tr>
1133
+ <tr>
1134
+ <td nowrap="nowrap" align="left">Gemini 1.5 Pro</td>
1135
+ <td>-</td>
1136
+ <td>-</td>
1137
+ <td>64.4</td>
1138
+ <td>2110.6</td>
1139
+ <td>64.0</td>
1140
+ <td>754</td>
1141
+ <td>60.6</td>
1142
+ <td>57.7</td>
1143
+ <td>73.9</td>
1144
+ <td>79.1</td>
1145
+ <td>73.5</td>
1146
+ <td>86.5</td>
1147
+ <td>45.6</td>
1148
+ <td>-</td>
1149
+ </tr>
1150
+ <tr>
1151
+ <td nowrap="nowrap" align="left">GPT-4o mini</td>
1152
+ <td>-</td>
1153
+ <td>1088</td>
1154
+ <td>64.1</td>
1155
+ <td>2003.4</td>
1156
+ <td>66.9</td>
1157
+ <td>785</td>
1158
+ <td>60.0</td>
1159
+ <td>52.4</td>
1160
+ <td>76.0</td>
1161
+ <td>77.8</td>
1162
+ <td>-</td>
1163
+ <td>-</td>
1164
+ <td>46.1</td>
1165
+ <td>12.4</td>
1166
+ </tr>
1167
+ <tr>
1168
+ <td nowrap="nowrap" align="left">GPT-4V</td>
1169
+ <td>-</td>
1170
+ <td>1088</td>
1171
+ <td>63.5</td>
1172
+ <td>2070.2</td>
1173
+ <td>67.5</td>
1174
+ <td>656</td>
1175
+ <td>61.7</td>
1176
+ <td>54.7</td>
1177
+ <td>79.8</td>
1178
+ <td>78.6</td>
1179
+ <td>78.0</td>
1180
+ <td>87.2</td>
1181
+ <td>43.9</td>
1182
+ <td>14.2</td>
1183
+ </tr>
1184
+ <tr>
1185
+ <td nowrap="nowrap" align="left">Step-1V</td>
1186
+ <td>-</td>
1187
+ <td>-</td>
1188
+ <td>59.5</td>
1189
+ <td>2206.4</td>
1190
+ <td>63.3</td>
1191
+ <td>625</td>
1192
+ <td>49.9</td>
1193
+ <td>44.8</td>
1194
+ <td>78.0</td>
1195
+ <td>79.2</td>
1196
+ <td>71.6</td>
1197
+ <td>-</td>
1198
+ <td>48.4</td>
1199
+ <td>-</td>
1200
+ </tr>
1201
+ <tr>
1202
+ <td nowrap="nowrap" align="left">Qwen-VL-Max</td>
1203
+ <td>-</td>
1204
+ <td>784</td>
1205
+ <td>58.3</td>
1206
+ <td>2281.7</td>
1207
+ <td>61.8</td>
1208
+ <td>684</td>
1209
+ <td>52.0</td>
1210
+ <td>43.4</td>
1211
+ <td>74.6</td>
1212
+ <td>75.7</td>
1213
+ <td>79.5</td>
1214
+ <td>93.1</td>
1215
+ <td>41.2</td>
1216
+ <td>13.4</td>
1217
+ </tr>
1218
+ <tr>
1219
+ <td colspan="15" align="left"><strong>Open-source</strong></td>
1220
+ </tr>
1221
+ <tr>
1222
+ <td nowrap="nowrap" align="left">LLaVA-NeXT-Yi-34B</td>
1223
+ <td>34B</td>
1224
+ <td>157</td>
1225
+ <td>55.0</td>
1226
+ <td>2006.5</td>
1227
+ <td>50.7</td>
1228
+ <td>574</td>
1229
+ <td>48.8</td>
1230
+ <td>40.4</td>
1231
+ <td>77.8</td>
1232
+ <td>78.9</td>
1233
+ <td>69.3</td>
1234
+ <td>-</td>
1235
+ <td>34.8</td>
1236
+ <td>12.6</td>
1237
+ </tr>
1238
+ <tr>
1239
+ <td nowrap="nowrap" align="left">Mini-Gemini-HD-34B</td>
1240
+ <td>34B</td>
1241
+ <td>157</td>
1242
+ <td>-</td>
1243
+ <td>2141</td>
1244
+ <td>59.3</td>
1245
+ <td>518</td>
1246
+ <td>48.0</td>
1247
+ <td>43.3</td>
1248
+ <td>-</td>
1249
+ <td>80.5</td>
1250
+ <td>74.1</td>
1251
+ <td>78.9</td>
1252
+ <td>-</td>
1253
+ <td>-</td>
1254
+ </tr>
1255
+ <tr>
1256
+ <td nowrap="nowrap" align="left">Cambrian-34B</td>
1257
+ <td>34B</td>
1258
+ <td>1820</td>
1259
+ <td>58.3</td>
1260
+ <td>2049.9</td>
1261
+ <td>53.2</td>
1262
+ <td>591</td>
1263
+ <td>50.4</td>
1264
+ <td>50.3</td>
1265
+ <td>77.8</td>
1266
+ <td>79.5</td>
1267
+ <td>76.7</td>
1268
+ <td>75.5</td>
1269
+ <td>41.6</td>
1270
+ <td>14.7</td>
1271
+ </tr>
1272
+ <tr>
1273
+ <td nowrap="nowrap" align="left">GLM-4V-9B</td>
1274
+ <td>13B</td>
1275
+ <td>784</td>
1276
+ <td>59.1</td>
1277
+ <td>2018.8</td>
1278
+ <td>58.0</td>
1279
+ <td>776</td>
1280
+ <td>46.9</td>
1281
+ <td>51.1</td>
1282
+ <td>67.9</td>
1283
+ <td>71.2</td>
1284
+ <td>-</td>
1285
+ <td>-</td>
1286
+ <td>45.0</td>
1287
+ <td>-</td>
1288
+ </tr>
1289
+ <tr>
1290
+ <td nowrap="nowrap" align="left">InternVL2-8B</td>
1291
+ <td>8B</td>
1292
+ <td>706</td>
1293
+ <td>64.1</td>
1294
+ <td>2215.1</td>
1295
+ <td>54.3</td>
1296
+ <td>794</td>
1297
+ <td><strong>51.2</strong></td>
1298
+ <td>58.3</td>
1299
+ <td><strong>79.4</strong></td>
1300
+ <td><strong>83.6</strong></td>
1301
+ <td>77.4</td>
1302
+ <td><strong>91.6</strong></td>
1303
+ <td>45.0</td>
1304
+ <td>21.3</td>
1305
+ </tr>
1306
+ <tr>
1307
+ <td nowrap="nowrap" align="left">MiniCPM-Llama-V 2.5</td>
1308
+ <td>8B</td>
1309
+ <td>1882</td>
1310
+ <td>58.8</td>
1311
+ <td>2024.6</td>
1312
+ <td>52.8</td>
1313
+ <td>725</td>
1314
+ <td>45.8</td>
1315
+ <td>54.3</td>
1316
+ <td>72.0</td>
1317
+ <td>78.4</td>
1318
+ <td>76.6</td>
1319
+ <td>84.8</td>
1320
+ <td>42.4</td>
1321
+ <td>10.3</td>
1322
+ </tr>
1323
+ <tr>
1324
+ <td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
1325
+ <td>8B</td>
1326
+ <td><strong>2822</strong></td>
1327
+ <td><strong>65.2</strong></td>
1328
+ <td><strong>2348.4</strong>*</td>
1329
+ <td><strong>60.0</strong></td>
1330
+ <td><strong>852</strong>*</td>
1331
+ <td>49.8*</td>
1332
+ <td><strong>60.6</strong></td>
1333
+ <td>78.0</td>
1334
+ <td>82.1</td>
1335
+ <td><strong>80.1<strong></td>
1336
+ <td>90.8</td>
1337
+ <td><strong>48.1</strong>*</td>
1338
+ <td><strong>8.2</strong></td>
1339
+ </tr>
1340
+ </tbody>
1341
+ </table>
1342
+
1343
+ </div>
1344
+ * 我们使用思维链提示词来评估这些基准。
1345
+
1346
+ <sup>+</sup> Token Density:每个视觉 token 在最大分辨率下编码的像素数,即最大分辨率下的像素数 / 视觉 token 数。
1347
+
1348
+ 注意:闭源模型的 Token Density 由 API 收费方式估算得到。
1349
+ </details>
1350
+
1351
+
1352
+ <details>
1353
+ <summary>点击查看 Mantis Eval, BLINK, Mathverse mv, Sciverse mv, MIRB 上的多图评测结果详情。</summary>
1354
+ <div align="center">
1355
+
1356
+ <table style="margin: 0px auto;">
1357
+ <thead>
1358
+ <tr>
1359
+ <th align="left">Model</th>
1360
+ <th>Size</th>
1361
+ <th>Mantis Eval</th>
1362
+ <th>BLINK val</th>
1363
+ <th>Mathverse mv</th>
1364
+ <th>Sciverse mv</th>
1365
+ <th>MIRB</th>
1366
+ </tr>
1367
+ </thead>
1368
+ <tbody align="center">
1369
+ <tr>
1370
+ <td colspan="7" align="left"><strong>Proprietary</strong></td>
1371
+ </tr>
1372
+ <tr>
1373
+ <td nowrap="nowrap" align="left">GPT-4V</td>
1374
+ <td>-</td>
1375
+ <td>62.7</td>
1376
+ <td>54.6</td>
1377
+ <td>60.3</td>
1378
+ <td>66.9</td>
1379
+ <td>53.1</td>
1380
+ </tr>
1381
+ <tr>
1382
+ <td nowrap="nowrap" align="left">LLaVA-NeXT-Interleave-14B</td>
1383
+ <td>14B</td>
1384
+ <td>66.4</td>
1385
+ <td>52.6</td>
1386
+ <td>32.7</td>
1387
+ <td>30.2</td>
1388
+ <td>-</td>
1389
+ </tr>
1390
+ <tr>
1391
+ <td colspan="7" align="left"><strong>Open-source</strong></td>
1392
+ </tr>
1393
+ <tr>
1394
+ <td nowrap="nowrap" align="left">Emu2-Chat</td>
1395
+ <td>37B</td>
1396
+ <td>37.8</td>
1397
+ <td>36.2</td>
1398
+ <td>-</td>
1399
+ <td>27.2</td>
1400
+ <td>-</td>
1401
+ </tr>
1402
+ <tr>
1403
+ <td nowrap="nowrap" align="left">CogVLM</td>
1404
+ <td>17B</td>
1405
+ <td>45.2</td>
1406
+ <td>41.1</td>
1407
+ <td>-</td>
1408
+ <td>-</td>
1409
+ <td>-</td>
1410
+ </tr>
1411
+ <tr>
1412
+ <td nowrap="nowrap" align="left">VPG-C</td>
1413
+ <td>7B</td>
1414
+ <td>52.4</td>
1415
+ <td>43.1</td>
1416
+ <td>24.3</td>
1417
+ <td>23.1</td>
1418
+ <td>-</td>
1419
+ </tr>
1420
+ <tr>
1421
+ <td nowrap="nowrap" align="left">VILA 8B</td>
1422
+ <td>8B</td>
1423
+ <td>51.2</td>
1424
+ <td>39.3</td>
1425
+ <td>-</td>
1426
+ <td>36.5</td>
1427
+ <td>-</td>
1428
+ </tr>
1429
+ <tr>
1430
+ <td nowrap="nowrap" align="left">InternLM-XComposer-2.5</td>
1431
+ <td>8B</td>
1432
+ <td>53.1*</td>
1433
+ <td>48.9</td>
1434
+ <td>32.1*</td>
1435
+ <td>-</td>
1436
+ <td>42.5</td>
1437
+ </tr>
1438
+ <tr>
1439
+ <td nowrap="nowrap" align="left">InternVL2-8B</td>
1440
+ <td>8B</td>
1441
+ <td>59.0*</td>
1442
+ <td>50.9</td>
1443
+ <td>30.5*</td>
1444
+ <td>34.4*</td>
1445
+ <td><strong>56.9*</strong></td>
1446
+ </tr>
1447
+ <tr>
1448
+ <td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
1449
+ <td>8B</td>
1450
+ <td><strong>69.1</strong></td>
1451
+ <td><strong>53.0</strong></td>
1452
+ <td><strong>84.9</strong></td>
1453
+ <td><strong>74.9</strong></td>
1454
+ <td>53.8</td>
1455
+ </tr>
1456
+ </tbody>
1457
+ </table>
1458
+
1459
+
1460
+ </div>
1461
+ * 正式开源模型权重的评测结果。
1462
+ </details>
1463
+
1464
+ <details>
1465
+ <summary>点击查看 Video-MME 和 Video-ChatGPT 上的视频评测结果详情。</summary>
1466
+ <div align="center">
1467
+
1468
+ <table style="margin: 0px auto;">
1469
+ <thead>
1470
+ <tr>
1471
+ <th align="left">Model</th>
1472
+ <th>Size</th>
1473
+ <th colspan="2">Video-MME</th>
1474
+ <th colspan="5">Video-ChatGPT</th>
1475
+ </tr>
1476
+ <tr>
1477
+ <th align="left"></th>
1478
+ <th></th>
1479
+ <th>w/o subs</th>
1480
+ <th>w subs</th>
1481
+ <th>Correctness</th>
1482
+ <th>Detail</th>
1483
+ <th>Context</th>
1484
+ <th>Temporal</th>
1485
+ <th>Consistency</th>
1486
+ </tr>
1487
+ </thead>
1488
+ <tbody align="center">
1489
+ <tr>
1490
+ <td colspan="9" align="left"><strong>Proprietary</strong></td>
1491
+ </tr>
1492
+ <tr>
1493
+ <td nowrap="nowrap" align="left">Claude 3.5 Sonnet</td>
1494
+ <td>-</td>
1495
+ <td>60.0</td>
1496
+ <td>62.9</td>
1497
+ <td>-</td>
1498
+ <td>-</td>
1499
+ <td>-</td>
1500
+ <td>-</td>
1501
+ <td>-</td>
1502
+ </tr>
1503
+ <tr>
1504
+ <td nowrap="nowrap" align="left">GPT-4V</td>
1505
+ <td>-</td>
1506
+ <td>59.9</td>
1507
+ <td>63.3</td>
1508
+ <td>-</td>
1509
+ <td>-</td>
1510
+ <td>-</td>
1511
+ <td>-</td>
1512
+ <td>-</td>
1513
+ </tr>
1514
+ <tr>
1515
+ <td colspan="9" align="left"><strong>Open-source</strong></td>
1516
+ </tr>
1517
+ <tr>
1518
+ <td nowrap="nowrap" align="left">LLaVA-NeXT-7B</td>
1519
+ <td>7B</td>
1520
+ <td>-</td>
1521
+ <td>-</td>
1522
+ <td>3.39</td>
1523
+ <td>3.29</td>
1524
+ <td>3.92</td>
1525
+ <td>2.60</td>
1526
+ <td>3.12</td>
1527
+ </tr>
1528
+ <tr>
1529
+ <td nowrap="nowrap" align="left">LLaVA-NeXT-34B</td>
1530
+ <td>34B</td>
1531
+ <td>-</td>
1532
+ <td>-</td>
1533
+ <td>3.29</td>
1534
+ <td>3.23</td>
1535
+ <td>3.83</td>
1536
+ <td>2.51</td>
1537
+ <td>3.47</td>
1538
+ </tr>
1539
+ <tr>
1540
+ <td nowrap="nowrap" align="left">CogVLM2-Video</td>
1541
+ <td>12B</td>
1542
+ <td>-</td>
1543
+ <td>-</td>
1544
+ <td>3.49</td>
1545
+ <td><strong>3.46</strong></td>
1546
+ <td>3.23</td>
1547
+ <td><strong>2.98</strong></td>
1548
+ <td><strong>3.64</strong></td>
1549
+ </tr>
1550
+ <tr>
1551
+ <td nowrap="nowrap" align="left">LongVA</td>
1552
+ <td>7B</td>
1553
+ <td>52.4</td>
1554
+ <td>54.3</td>
1555
+ <td>3.05</td>
1556
+ <td>3.09</td>
1557
+ <td>3.77</td>
1558
+ <td>2.44</td>
1559
+ <td><strong>3.64</strong></td>
1560
+ </tr>
1561
+ <tr>
1562
+ <td nowrap="nowrap" align="left">InternVL2-8B</td>
1563
+ <td>8B</td>
1564
+ <td>54.0</td>
1565
+ <td>56.9</td>
1566
+ <td>-</td>
1567
+ <td>-</td>
1568
+ <td>-</td>
1569
+ <td>-</td>
1570
+ <td>-</td>
1571
+ </tr>
1572
+ <tr>
1573
+ <td nowrap="nowrap" align="left">InternLM-XComposer-2.5</td>
1574
+ <td>8B</td>
1575
+ <td>55.8</td>
1576
+ <td>-</td>
1577
+ <td>-</td>
1578
+ <td>-</td>
1579
+ <td>-</td>
1580
+ <td>-</td>
1581
+ <td>-</td>
1582
+ </tr>
1583
+ <tr>
1584
+ <td nowrap="nowrap" align="left">LLaVA-NeXT-Video</td>
1585
+ <td>32B</td>
1586
+ <td>60.2</td>
1587
+ <td>63.0</td>
1588
+ <td>3.48</td>
1589
+ <td>3.37</td>
1590
+ <td><strong>3.95</strong></td>
1591
+ <td>2.64</td>
1592
+ <td>3.28</td>
1593
+ </tr>
1594
+ <tr>
1595
+ <td nowrap="nowrap" align="left">MiniCPM-V 2.6</td>
1596
+ <td>8B</td>
1597
+ <td><strong>60.9</strong></td>
1598
+ <td><strong>63.6</strong></td>
1599
+ <td><strong>3.59</strong></td>
1600
+ <td>3.28</td>
1601
+ <td>3.93</td>
1602
+ <td>2.73</td>
1603
+ <td>3.62</td>
1604
+ </tr>
1605
+ </tbody>
1606
+ </table>
1607
+ </div>
1608
+ </details>
1609
+
1610
+
1611
+ <details>
1612
+ <summary>点击查看 TextVQA, VizWiz, VQAv2, OK-VQA上的少样本评测结果详情。</summary>
1613
+ <div align="center">
1614
+
1615
+ <table style="margin: 0px auto;">
1616
+ <thead>
1617
+ <tr>
1618
+ <th align="left">Model</th>
1619
+ <th>Size</th>
1620
+ <th>Shot</th>
1621
+ <th>TextVQA val</th>
1622
+ <th>VizWiz test-dev</th>
1623
+ <th>VQAv2 test-dev</th>
1624
+ <th>OK-VQA val</th>
1625
+ </tr>
1626
+ </thead>
1627
+ <tbody align="center">
1628
+ <tr>
1629
+ <td align="left" nowrap="nowrap" rowspan="3">Flamingo</td>
1630
+ <td rowspan="3">80B</td>
1631
+ <td>0*</td>
1632
+ <td>35.0</td>
1633
+ <td>31.6</td>
1634
+ <td>56.3</td>
1635
+ <td>40.6</td>
1636
+ </tr>
1637
+ <tr>
1638
+ <td>4</td>
1639
+ <td>36.5</td>
1640
+ <td>39.6</td>
1641
+ <td>63.1</td>
1642
+ <td><strong>57.4</strong></td>
1643
+ </tr>
1644
+ <tr>
1645
+ <td>8</td>
1646
+ <td>37.3</td>
1647
+ <td>44.8</td>
1648
+ <td>65.6</td>
1649
+ <td>57.5</td>
1650
+ </tr>
1651
+ <tr>
1652
+ <td align="left" nowrap="nowrap" rowspan="3">IDEFICS</td>
1653
+ <td rowspan="3">80B</td>
1654
+ <td>0*</td>
1655
+ <td>30.9</td>
1656
+ <td>36.0</td>
1657
+ <td>60.0</td>
1658
+ <td>45.2</td>
1659
+ </tr>
1660
+ <tr>
1661
+ <td>4</td>
1662
+ <td>34.3</td>
1663
+ <td>40.4</td>
1664
+ <td>63.6</td>
1665
+ <td>52.4</td>
1666
+ </tr>
1667
+ <tr>
1668
+ <td>8</td>
1669
+ <td>35.7</td>
1670
+ <td>46.1</td>
1671
+ <td>64.8</td>
1672
+ <td>55.1</td>
1673
+ </tr>
1674
+ <tr>
1675
+ <td align="left" nowrap="nowrap" rowspan="3">OmniCorpus</td>
1676
+ <td rowspan="3">7B</td>
1677
+ <td>0*</td>
1678
+ <td>43.0</td>
1679
+ <td>49.8</td>
1680
+ <td>63.2</td>
1681
+ <td>45.5</td>
1682
+ </tr>
1683
+ <tr>
1684
+ <td>4</td>
1685
+ <td>45.4</td>
1686
+ <td>51.3</td>
1687
+ <td>64.5</td>
1688
+ <td>46.5</td>
1689
+ </tr>
1690
+ <tr>
1691
+ <td>8</td>
1692
+ <td>45.6</td>
1693
+ <td>52.2</td>
1694
+ <td>64.7</td>
1695
+ <td>46.6</td>
1696
+ </tr>
1697
+ <tr>
1698
+ <td align="left" nowrap="nowrap" rowspan="3">Emu2</td>
1699
+ <td rowspan="3">37B</td>
1700
+ <td>0</td>
1701
+ <td>26.4</td>
1702
+ <td>40.4</td>
1703
+ <td>33.5</td>
1704
+ <td>26.7</td>
1705
+ </tr>
1706
+ <tr>
1707
+ <td>4</td>
1708
+ <td>48.2</td>
1709
+ <td>54.6</td>
1710
+ <td>67.0</td>
1711
+ <td>53.2</td>
1712
+ </tr>
1713
+ <tr>
1714
+ <td>8</td>
1715
+ <td>49.3</td>
1716
+ <td>54.7</td>
1717
+ <td>67.8</td>
1718
+ <td>54.1</td>
1719
+ </tr>
1720
+ <tr>
1721
+ <td align="left" nowrap="nowrap" rowspan="2">MM1</td>
1722
+ <td rowspan="2">30B</td>
1723
+ <td>0</td>
1724
+ <td>26.2</td>
1725
+ <td>40.4</td>
1726
+ <td>48.9</td>
1727
+ <td>26.7</td>
1728
+ </tr>
1729
+ <tr>
1730
+ <td>8</td>
1731
+ <td>49.3</td>
1732
+ <td>54.7</td>
1733
+ <td><strong>70.9</strong></td>
1734
+ <td>54.1</td>
1735
+ </tr>
1736
+ <tr>
1737
+ <td align="left" nowrap="nowrap" rowspan="3">MiniCPM-V 2.6<sup>+</sup></td>
1738
+ <td rowspan="3">8B</td>
1739
+ <td>0</td>
1740
+ <td>43.9</td>
1741
+ <td>33.8</td>
1742
+ <td>45.4</td>
1743
+ <td>23.9</td>
1744
+ </tr>
1745
+ <tr>
1746
+ <td>4</td>
1747
+ <td>63.6</td>
1748
+ <td>60.5</td>
1749
+ <td>65.5</td>
1750
+ <td>50.1</td>
1751
+ </tr>
1752
+ <tr>
1753
+ <td>8</td>
1754
+ <td><strong>64.6</strong></td>
1755
+ <td><strong>63.4</strong></td>
1756
+ <td>68.2</td>
1757
+ <td>51.4</td>
1758
+ </tr>
1759
+ </tbody>
1760
+ </table>
1761
+
1762
+
1763
+ </div>
1764
+ * 零样本性能按照 Flamingo 的设置评估,即 zero image shot 加 two additional text shots。
1765
+
1766
+ <sup>+</sup> 我们在没有进行监督微调 (SFT) 的情况下评估预训练的模型权重 (ckpt)。
1767
+ </details>
1768
+
1769
+ ### 典型示例 <!-- omit in toc -->
1770
+
1771
+ <div style="display: flex; flex-direction: column; align-items: center;">
1772
+ <img src="assets/minicpmv2_6/multi_img-bike.png" alt="Bike" style="margin-bottom: 5px;">
1773
+ <img src="assets/minicpmv2_6/multi_img-menu.png" alt="Menu" style="margin-bottom: 5px;">
1774
+ <img src="assets/minicpmv2_6/multi_img-code.png" alt="Code" style="margin-bottom: 5px;">
1775
+ <img src="assets/minicpmv2_6/ICL-Mem.png" alt="Mem" style="margin-bottom: 5px;">
1776
+ <img src="assets/minicpmv2_6/multiling-medal.png" alt="medal" style="margin-bottom: 10px;">
1777
+ </div>
1778
+ <details>
1779
+ <summary>点击查看更多示例。</summary>
1780
+ <div style="display: flex; flex-direction: column; align-items: center;">
1781
+ <img src="assets/minicpmv2_6/ICL-elec.png" alt="elec" style="margin-bottom: 5px;">
1782
+ <img src="assets/minicpmv2_6/multiling-olympic.png" alt="Menu" style="margin-bottom: 10px;">
1783
+ </div>
1784
+ </details>
1785
+
1786
+ 我们将 MiniCPM-V 2.6 部署在iPad Pro上,并录制了以下演示视频。
1787
+
1788
+ <table align="center">
1789
+ <p align="center">
1790
+ <img src="assets/gif_cases/ai.gif" width=32%/>
1791
+ &nbsp;&nbsp;&nbsp;&nbsp;
1792
+ <img src="assets/gif_cases/beer.gif" width=32%/>
1793
+ </p>
1794
+ </table>
1795
+
1796
+ <table align="center">
1797
+ <p align="center">
1798
+ <video src="https://github.com/user-attachments/assets/21f4b818-ede1-4822-920e-91281725c830" width="360" /> </video>
1799
+ <!-- <video src="https://github.com/user-attachments/assets/c835f757-206b-4d9c-8e36-70d67b453628" width="360" /> </video> -->
1800
+ </p>
1801
+ </table>
1802
+
1803
+ </details>
1804
+
1805
+ ## 历史版本模型 <!-- omit in toc -->
1806
+
1807
+
1808
+ | 模型 | 介绍信息和使用教程 |
1809
+ |:----------------------|:-------------------:|
1810
+ | MiniCPM-Llama3-V 2.5 | [文档](./docs/minicpm_llama3_v2dot5.md) |
1811
+ | MiniCPM-V 2.0 | [文档](./docs/minicpm_v2.md) |
1812
+ | MiniCPM-V 1.0 | [文档](./docs/minicpm_v1.md) |
1813
+ | OmniLMM-12B | [文档](./omnilmm.md) |
1814
+
1815
+
1816
+ ## Chat with Our Demo on Gradio 🤗
1817
+
1818
+ 我们提供由 Hugging Face Gradio <a href='https://github.com/gradio-app/gradio'><img src='https://img.shields.io/github/stars/gradio-app/gradio'></a> 支持的在线和本地 Demo。Gradio 是目前最流行的模型部署框架,支持流式输出、进度条等常用功能。
1819
+
1820
+ ### Online Demo <!-- omit in toc -->
1821
+
1822
+ 欢迎试用 Online Demo: [MiniCPM-V 2.6](http://120.92.209.146:8887/) | [MiniCPM-Llama3-V 2.5](https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5) | [MiniCPM-V 2.0](https://huggingface.co/spaces/openbmb/MiniCPM-V-2) 。
1823
+
1824
+ ### 本地 WebUI Demo <!-- omit in toc -->
1825
+
1826
+ 您可以使用以下命令轻松构建自己的本地 WebUI Demo。更详细的部署教程请参考[文档](https://modelbest.feishu.cn/wiki/RnjjwnUT7idMSdklQcacd2ktnyN)。
1827
+
1828
+ **实时流式视频/语音通话demo:**
1829
+ 1. 启动model server:
1830
+ ```shell
1831
+ pip install -r requirements_o2.6.txt
1832
+
1833
+ python web_demos/minicpm-o_2.6/model_server.py
1834
+ ```
1835
+ 请确保 `transformers==4.44.2`,其他版本目前可能会有兼容性问题,我们正在解决。
1836
+ 如果你使用的是较低版本的 PyTorch,可能会遇到错误 `"weight_norm_fwd_first_dim_kernel" not implemented for 'BFloat16'`,请在模型初始化后添加 `self.minicpmo_model.tts.float()`。
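+
+ 下面给出一个最小示意(假设:在你的 model server 代码中以变量 `minicpmo_model` 持有模型,变量名仅为说明用的假设),展示 `tts.float()` 应放在模型初始化之后:
+ ```python
+ import torch
+ from transformers import AutoModel
+
+ minicpmo_model = AutoModel.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True,
+     attn_implementation='sdpa', torch_dtype=torch.bfloat16)
+ minicpmo_model = minicpmo_model.eval().cuda()
+ minicpmo_model.init_tts()
+ minicpmo_model.tts.float()  # 仅将 TTS 子模块转为 float32,规避低版本 PyTorch 的 BFloat16 weight_norm 报错
+ ```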
1837
+
1838
+ 2. 启动web server:
1839
+ ```shell
1840
+ # Make sure Node.js and PNPM are installed.
1841
+ sudo apt-get update
1842
+ sudo apt-get install nodejs npm
1843
+ npm install -g pnpm
1844
+
1845
+
1846
+ cd web_demos/minicpm-o_2.6/web_server
1847
+ # 为 https 创建自签名证书;浏览器申请摄像头和麦克风权限须使用 https。
1848
+ bash ./make_ssl_cert.sh # output key.pem and cert.pem
1849
+
1850
+ pnpm install # install requirements
1851
+ pnpm run dev # start server
1852
+ ```
1853
+ 浏览器打开 `https://localhost:8088/`,开始体验实时流式视频/语音通话。
1854
+
1855
+ **Chatbot图文对话demo:**
1856
+ ```shell
1857
+ pip install -r requirements_o2.6.txt
1858
+
1859
+ python web_demos/minicpm-o_2.6/chatbot_web_demo_o2.6.py
1860
+ ```
1861
+ 浏览器打开 `http://localhost:8000/`,开始体验图文对话 Chatbot。
1862
+
1863
+
1864
+ ## 推理
1865
+
1866
+ ### 模型库
1867
+
1868
+ | 模型 | 设备 | 资源 | &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; 简介 | 下载链接 |
1869
+ |:--------------|:-:|:----------:|:-------------------|:---------------:|
1870
+ | MiniCPM-o 2.6| GPU | 18 GB | 最新版本,提供端侧 GPT-4o 级的视觉、语音、多模态流式交互能力。 | [🤗](https://huggingface.co/openbmb/MiniCPM-o-2_6) &nbsp;&nbsp; [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-o-2_6) |
1871
+ | MiniCPM-o 2.6 gguf | CPU | 8 GB | gguf 版本,更低的内存占用和更高的推理效率。 | [🤗](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) &nbsp;&nbsp; [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-o-2_6-gguf) |
1872
+ | MiniCPM-o 2.6 int4 | GPU | 9 GB | int4量化版,更低显存占用。 | [🤗](https://huggingface.co/openbmb/MiniCPM-o-2_6-int4) &nbsp;&nbsp; [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-o-2_6-int4) |
1873
+ | MiniCPM-V 2.6| GPU | 17 GB | 提供出色的端侧单图、多图、视频理解能力。 | [🤗](https://huggingface.co/openbmb/MiniCPM-V-2_6) &nbsp;&nbsp; [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6) |
1874
+ | MiniCPM-V 2.6 gguf | CPU | 6 GB | gguf 版本,更低的内存占用和更高的推理效率。 | [🤗](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) &nbsp;&nbsp; [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6-gguf) |
1875
+ | MiniCPM-V 2.6 int4 | GPU | 7 GB | int4量化版,更低显存占用。 | [🤗](https://huggingface.co/openbmb/MiniCPM-V-2_6-int4) &nbsp;&nbsp; [<img src="./assets/modelscope_logo.png" width="20px"></img>](https://modelscope.cn/models/OpenBMB/MiniCPM-V-2_6-int4) |
1876
+
1877
+ 更多[历史版本模型](#legacy-models)
1878
+
1879
+
1880
+ ### 多轮对话
1881
+ 请确保 `transformers==4.44.2`,其他版本目前可能会有兼容性问题。
1882
+
1883
+ ```shell
1884
+ pip install -r requirements_o2.6.txt
1885
+ ```
1886
+
1887
+ <div align="center">
1888
+ <img src="assets/minicpmo2_6/show_demo.jpg" width="500px">
1889
+ </div>
1890
+
1891
+
1892
+ ```python
1893
+ import torch
1894
+ from PIL import Image
1895
+ from transformers import AutoModel, AutoTokenizer
1896
+
1897
+ torch.manual_seed(100)
1898
+
1899
+ model = AutoModel.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True,
1900
+ attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager
1901
+ model = model.eval().cuda()
1902
+ tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
1903
+
1904
+ image = Image.open('./assets/minicpmo2_6/show_demo.jpg').convert('RGB')
1905
+
1906
+ # First round chat
1907
+ question = "What is the landform in the picture?"
1908
+ msgs = [{'role': 'user', 'content': [image, question]}]
1909
+
1910
+ answer = model.chat(
1911
+ msgs=msgs,
1912
+ tokenizer=tokenizer
1913
+ )
1914
+ print(answer)
1915
+
1916
+ # Second round chat, pass history context of multi-turn conversation
1917
+ msgs.append({"role": "assistant", "content": [answer]})
1918
+ msgs.append({"role": "user", "content": ["What should I pay attention to when traveling here?"]})
1919
+
1920
+ answer = model.chat(
1921
+ msgs=msgs,
1922
+ tokenizer=tokenizer
1923
+ )
1924
+ print(answer)
1925
+ ```
1926
+
1927
+ 你可以得到如下推理结果:
1928
+
1929
+ ```
1930
+ "The landform in the picture is a mountain range. The mountains appear to be karst formations, characterized by their steep, rugged peaks and smooth, rounded shapes. These types of mountains are often found in regions with limestone bedrock and are shaped by processes such as erosion and weathering. The reflection of the mountains in the water adds to the scenic beauty of the landscape."
1931
+
1932
+ "When traveling to this scenic location, it's important to pay attention to the weather conditions, as the area appears to be prone to fog and mist, especially during sunrise or sunset. Additionally, ensure you have proper footwear for navigating the potentially slippery terrain around the water. Lastly, respect the natural environment by not disturbing the local flora and fauna."
1933
+ ```
1934
+
1935
+ #### 多图对话
1936
+ <details>
1937
+ <summary> 点击查看 MiniCPM-o 2.6 多图输入的 Python 代码。 </summary>
1938
+
1939
+ ```python
1940
+ import torch
1941
+ from PIL import Image
1942
+ from transformers import AutoModel, AutoTokenizer
1943
+
1944
+ model = AutoModel.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True,
1945
+ attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager
1946
+ model = model.eval().cuda()
1947
+ tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
1948
+
1949
+ image1 = Image.open('image1.jpg').convert('RGB')
1950
+ image2 = Image.open('image2.jpg').convert('RGB')
1951
+ question = 'Compare image 1 and image 2, tell me about the differences between image 1 and image 2.'
1952
+
1953
+ msgs = [{'role': 'user', 'content': [image1, image2, question]}]
1954
+
1955
+ answer = model.chat(
1956
+ msgs=msgs,
1957
+ tokenizer=tokenizer
1958
+ )
1959
+ print(answer)
1960
+ ```
1961
+ </details>
1962
+
1963
+ #### 少样本上下文对话
1964
+ <details>
1965
+ <summary> 点击查看 MiniCPM-o 2.6 少样本上下文对话的 Python 代码。 </summary>
1966
+
1967
+ ```python
1968
+ import torch
1969
+ from PIL import Image
1970
+ from transformers import AutoModel, AutoTokenizer
1971
+
1972
+ model = AutoModel.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True,
1973
+ attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager
1974
+ model = model.eval().cuda()
1975
+ tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
1976
+
1977
+ question = "production date"
1978
+ image1 = Image.open('example1.jpg').convert('RGB')
1979
+ answer1 = "2023.08.04"
1980
+ image2 = Image.open('example2.jpg').convert('RGB')
1981
+ answer2 = "2007.04.24"
1982
+ image_test = Image.open('test.jpg').convert('RGB')
1983
+
1984
+ msgs = [
1985
+ {'role': 'user', 'content': [image1, question]}, {'role': 'assistant', 'content': [answer1]},
1986
+ {'role': 'user', 'content': [image2, question]}, {'role': 'assistant', 'content': [answer2]},
1987
+ {'role': 'user', 'content': [image_test, question]}
1988
+ ]
1989
+
1990
+ answer = model.chat(
1991
+ msgs=msgs,
1992
+ tokenizer=tokenizer
1993
+ )
1994
+ print(answer)
1995
+ ```
1996
+ </details>
1997
+
1998
+ #### 视频对话
1999
+ <details>
2000
+ <summary> 点击查看 MiniCPM-o 2.6 视频输入的 Python 代码。 </summary>
2001
+
2002
+ ```python
2003
+ import torch
2004
+ from PIL import Image
2005
+ from transformers import AutoModel, AutoTokenizer
2006
+ from decord import VideoReader, cpu # pip install decord
2007
+
2008
+ model = AutoModel.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True,
2009
+ attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager
2010
+ model = model.eval().cuda()
2011
+ tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
2012
+
2013
+ MAX_NUM_FRAMES=64 # if cuda OOM set a smaller number
2014
+
2015
+ def encode_video(video_path):
2016
+ def uniform_sample(l, n):
2017
+ gap = len(l) / n
2018
+ idxs = [int(i * gap + gap / 2) for i in range(n)]
2019
+ return [l[i] for i in idxs]
2020
+
2021
+ vr = VideoReader(video_path, ctx=cpu(0))
2022
+ sample_fps = round(vr.get_avg_fps() / 1)  # sample 1 frame per second (the divisor is the target sampling FPS)
2023
+ frame_idx = [i for i in range(0, len(vr), sample_fps)]
2024
+ if len(frame_idx) > MAX_NUM_FRAMES:
2025
+ frame_idx = uniform_sample(frame_idx, MAX_NUM_FRAMES)
2026
+ frames = vr.get_batch(frame_idx).asnumpy()
2027
+ frames = [Image.fromarray(v.astype('uint8')) for v in frames]
2028
+ print('num frames:', len(frames))
2029
+ return frames
2030
+
2031
+ video_path="video_test.mp4"
2032
+ frames = encode_video(video_path)
2033
+ question = "Describe the video"
2034
+ msgs = [
2035
+ {'role': 'user', 'content': frames + [question]},
2036
+ ]
2037
+
2038
+ # Set decode params for video
2039
+ params = {}
2040
+ params["use_image_id"] = False
2041
+ params["max_slice_nums"] = 2 # use 1 if cuda OOM and video resolution > 448*448
2042
+
2043
+ answer = model.chat(
2044
+ msgs=msgs,
2045
+ tokenizer=tokenizer,
2046
+ **params
2047
+ )
2048
+ print(answer)
2049
+ ```
2050
+ </details>
2051
+
2052
+
2053
+ #### 语音对话
2054
+ <details> <summary> 初始化模型 </summary>
2055
+
2056
+ ```python
2057
+ import torch
2058
+ import librosa
2059
+ from transformers import AutoModel, AutoTokenizer
2060
+
2061
+ model = AutoModel.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True,
2062
+ attn_implementation='sdpa', torch_dtype=torch.bfloat16) # sdpa or flash_attention_2, no eager
2063
+ model = model.eval().cuda()
2064
+ tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
2065
+
2066
+ model.init_tts()
2067
+ model.tts.float()
2068
+ ```
2069
+
2070
+ </details>
2071
+
2072
+ ##### Mimick
2073
+
2074
+ <details> <summary> 点击查看 MiniCPM-o 2.6 端到端语音理解生成的 Python 代码。 </summary>
2075
+
2076
+ - `Mimick` 任务反映了模型的端到端语音建模能力。模型接受音频输入,输出语音识别(ASR)转录结果,并随后以高相似度重建原始音频。重建音频与原始音频的相似度越高,表明模型具备越强的语音端到端建模基础能力。
2077
+ ```python
2078
+ mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
2079
+ audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
2080
+ msgs = [{'role': 'user', 'content': [mimick_prompt,audio_input]}]
2081
+ res = model.chat(
2082
+ msgs=msgs,
2083
+ tokenizer=tokenizer,
2084
+ sampling=True,
2085
+ max_new_tokens=128,
2086
+ use_tts_template=True,
2087
+ temperature=0.3,
2088
+ generate_audio=True,
2089
+ output_audio_path='output.wav', # save the tts result to output_audio_path
2090
+ )
2091
+ ```
2092
+
2093
+ </details>
2094
+
2095
+ ##### 可配置声音的语音对话
2096
+ <details> <summary> 点击查看个性化配置 MiniCPM-o 2.6 对话声音的 Python 代码。</summary>
2097
+
2098
+ ```python
2099
+ ref_audio, _ = librosa.load('./assets/voice_01.wav', sr=16000, mono=True) # load the reference audio
2100
+
2101
+ # Audio RolePlay: with this mode, the model will role-play the character based on the audio prompt.
2102
+ sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_roleplay', language='en')
2103
+ user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]}
2104
+
2105
+ # Audio Assistant: with this mode, the model will speak with the voice in ref_audio as an AI assistant.
2106
+ # sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_assistant', language='en')
2107
+ # user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # Try to ask something!
2108
+ ```
2109
+ ```python
2110
+ msgs = [sys_prompt, user_question]
2111
+ res = model.chat(
2112
+ msgs=msgs,
2113
+ tokenizer=tokenizer,
2114
+ sampling=True,
2115
+ max_new_tokens=128,
2116
+ use_tts_template=True,
2117
+ generate_audio=True,
2118
+ temperature=0.3,
2119
+ output_audio_path='result.wav',
2120
+ )
2121
+
2122
+ # round two
2123
+ msgs.append({'role': 'assistant', 'content': res})  # append in place; list.append returns None, so do not reassign
2124
+ user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]}
2125
+ msgs.append(user_question)
2126
+ res = model.chat(
2127
+ msgs=msgs,
2128
+ tokenizer=tokenizer,
2129
+ sampling=True,
2130
+ max_new_tokens=128,
2131
+ use_tts_template=True,
2132
+ generate_audio=True,
2133
+ temperature=0.3,
2134
+ output_audio_path='result_round_2.wav',
2135
+ )
2136
+ print(res)
2137
+ ```
2138
+
2139
+ </details>
2140
+
2141
+ ##### 更多语音任务
2142
+ <details>
2143
+ <summary> 点击查看 MiniCPM-o 2.6 完成更多语音任务的 Python 代码。 </summary>
2144
+
2145
+ ```python
2146
+ '''
2147
+ Audio Understanding Task Prompt:
2148
+ Speech:
2149
+ ASR with ZH(same as AST en2zh): 请仔细听这段音频片段,并将其内容逐字记录。
2150
+ ASR with EN(same as AST zh2en): Please listen to the audio snippet carefully and transcribe the content.
2151
+ Speaker Analysis: Based on the speaker's content, speculate on their gender, condition, age range, and health status.
2152
+ General Audio:
2153
+ Audio Caption: Summarize the main content of the audio.
2154
+ Sound Scene Tagging: Utilize one keyword to convey the audio's content or the associated scene.
2155
+ '''
2156
+ task_prompt = "\n"
2157
+ audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
2158
+
2159
+ msgs = [{'role': 'user', 'content': [task_prompt,audio_input]}]
2160
+
2161
+ res = model.chat(
2162
+ msgs=msgs,
2163
+ tokenizer=tokenizer,
2164
+ sampling=True,
2165
+ max_new_tokens=128,
2166
+ use_tts_template=True,
2167
+ generate_audio=True,
2168
+ temperature=0.3,
2169
+ output_audio_path='result.wav',
2170
+ )
2171
+ print(res)
2172
+ ```
2173
+ ```python
2174
+ '''
2175
+ Speech Generation Task Prompt:
2176
+ Human Instruction-to-Speech: see https://voxinstruct.github.io/VoxInstruct/
2177
+ Example:
2178
+ # 在新闻中,一个年轻男性兴致勃勃地说:“祝福亲爱的祖国母亲美丽富强!”他用低音调和低音量,慢慢地说出了这句话。
2179
+ # Delighting in a surprised tone, an adult male with low pitch and low volume comments:"One even gave my little dog a biscuit" This dialogue takes place at a leisurely pace, delivering a sense of excitement and surprise in the context.
2180
+
2181
+ Voice Cloning or Voice Creation: with this mode, the model will act as a TTS model.
2182
+ '''
2183
+ # Human Instruction-to-Speech:
2184
+ task_prompt = '' #Try to make some Human Instruction-to-Speech prompt
2185
+ msgs = [{'role': 'user', 'content': [task_prompt]}] # you can try to use the same audio question
2186
+
2187
+ # Voice Cloning mode: with this mode, the model acts as a TTS model.
2188
+ # sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='voice_cloning', language='en')
2189
+ # text_prompt = f"Please read the text below."
2190
+ # user_question = {'role': 'user', 'content': [text_prompt, "content that you want to read"]} # using same voice in sys_prompt to read the text. (Voice Cloning)
2191
+ # user_question = {'role': 'user', 'content': [text_prompt, librosa.load('xxx.wav', sr=16000, mono=True)[0]]} # using same voice in sys_prompt to read 'xxx.wav'. (Voice Creation)
2192
+
2193
+ # msgs = [sys_prompt, user_question]  # uncomment together with the Voice Cloning / Voice Creation lines above; otherwise the Human Instruction-to-Speech msgs defined earlier are used
2194
+ res = model.chat(
2195
+ msgs=msgs,
2196
+ tokenizer=tokenizer,
2197
+ sampling=True,
2198
+ max_new_tokens=128,
2199
+ use_tts_template=True,
2200
+ generate_audio=True,
2201
+ temperature=0.3,
2202
+ output_audio_path='result.wav',
2203
+ )
2204
+
2205
+
2206
+ ```
2207
+
2208
+ </details>
2209
+
2210
+ #### 多模态流式交互
2211
+ <details>
2212
+ <summary> 点击查看 MiniCPM-o 2.6 多模态流式交互的 Python 代码。 </summary>
2213
+
2214
+ ```python
2215
+ import math
2216
+ import numpy as np
2217
+ from PIL import Image
2218
+ from moviepy.editor import VideoFileClip
2219
+ import tempfile
2220
+ import librosa
2221
+ import soundfile as sf
2222
+ import torch
2223
+ from transformers import AutoModel, AutoTokenizer
2224
+
2225
+ def get_video_chunk_content(video_path, flatten=True):
2226
+ video = VideoFileClip(video_path)
2227
+ print('video_duration:', video.duration)
2228
+
2229
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_audio_file:
2230
+ temp_audio_file_path = temp_audio_file.name
2231
+ video.audio.write_audiofile(temp_audio_file_path, codec="pcm_s16le", fps=16000)
2232
+ audio_np, sr = librosa.load(temp_audio_file_path, sr=16000, mono=True)
2233
+ num_units = math.ceil(video.duration)
2234
+
2235
+ # 1 frame + 1s audio chunk
2236
+ contents= []
2237
+ for i in range(num_units):
2238
+ frame = video.get_frame(i+1)
2239
+ image = Image.fromarray((frame).astype(np.uint8))
2240
+ audio = audio_np[sr*i:sr*(i+1)]
2241
+ if flatten:
2242
+ contents.extend(["<unit>", image, audio])
2243
+ else:
2244
+ contents.append(["<unit>", image, audio])
2245
+
2246
+ return contents
2247
+
2248
+
2249
+ model = AutoModel.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True,
2250
+ attn_implementation='sdpa', torch_dtype=torch.bfloat16)
2251
+ model = model.eval().cuda()
2252
+ tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)
2253
+
2254
+ model.init_tts()
2255
+
2256
+ # If you are using an older version of PyTorch, you might encounter the error '"weight_norm_fwd_first_dim_kernel" not implemented for BFloat16'. In that case, convert the TTS module to float32:
2257
+ # model.tts.float()
2258
+
2259
+ # https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/assets/Skiing.mp4
2260
+ video_path="assets/Skiing.mp4"
2261
+ sys_msg = model.get_sys_prompt(mode='omni', language='en')
2262
+ # if use voice clone prompt, please set ref_audio
2263
+ # ref_audio_path = '/path/to/ref_audio'
2264
+ # ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
2265
+ # sys_msg = model.get_sys_prompt(ref_audio=ref_audio, mode='omni', language='en')
2266
+
2267
+ contents = get_video_chunk_content(video_path)
2268
+ msg = {"role":"user", "content": contents}
2269
+ msgs = [sys_msg, msg]
2270
+
2271
+ # please set generate_audio=True and output_audio_path to save the tts result
2272
+ generate_audio = True
2273
+ output_audio_path = 'output.wav'
2274
+
2275
+ res = model.chat(
2276
+ msgs=msgs,
2277
+ tokenizer=tokenizer,
2278
+ sampling=True,
2279
+ temperature=0.5,
2280
+ max_new_tokens=4096,
2281
+ omni_input=True, # please set omni_input=True when omni inference
2282
+ use_tts_template=True,
2283
+ generate_audio=generate_audio,
2284
+ output_audio_path=output_audio_path,
2285
+ max_slice_nums=1,
2286
+ use_image_id=False,
2287
+ return_dict=True
2288
+ )
2289
+ print(res)
2290
+ ```
2291
+ </details>
2292
+
2293
+ <details>
2294
+ <summary> 点击查看多模态流式推理设置。 </summary>
2295
+
2296
+ 注意:流式推理存在轻微的性能下降,因为音频编码并非全局的。
2297
+ ```python
2298
+ # A new conversation needs reset_session() first; this clears the kv-cache.
2299
+ model.reset_session()
2300
+
2301
+ contents = get_video_chunk_content(video_path, flatten=False)
2302
+ session_id = '123'
2303
+ generate_audio = True
2304
+
2305
+ # 1. prefill system prompt
2306
+ res = model.streaming_prefill(
2307
+ session_id=session_id,
2308
+ msgs=[sys_msg],
2309
+ tokenizer=tokenizer
2310
+ )
2311
+
2312
+ # 2. prefill video/audio chunks
2313
+ for content in contents:
2314
+ msgs = [{"role":"user", "content": content}]
2315
+ res = model.streaming_prefill(
2316
+ session_id=session_id,
2317
+ msgs=msgs,
2318
+ tokenizer=tokenizer
2319
+ )
2320
+
2321
+ # 3. generate
2322
+ res = model.streaming_generate(
2323
+ session_id=session_id,
2324
+ tokenizer=tokenizer,
2325
+ temperature=0.5,
2326
+ generate_audio=generate_audio
2327
+ )
2328
+
2329
+ audios = []
2330
+ text = ""
2331
+
2332
+ if generate_audio:
2333
+ for r in res:
2334
+ audio_wav = r.audio_wav
2335
+ sampling_rate = r.sampling_rate
2336
+ txt = r.text
2337
+
2338
+ audios.append(audio_wav)
2339
+ text += txt
2340
+
2341
+ res = np.concatenate(audios)
2342
+ sf.write("output.wav", res, samplerate=sampling_rate)
2343
+ print("text:", text)
2344
+ print("audio saved to output.wav")
2345
+ else:
2346
+ for r in res:
2347
+ text += r['text']
2348
+ print("text:", text)
2349
+ ```
2350
+
2351
+ </details>
2352
+
2353
+
2354
+ ### 多卡推理
2355
+ 您可以通过将模型的层分布在多个低显存显卡(12 GB 或 16 GB)上来运行 MiniCPM-Llama3-V 2.5。请查看该[教程](https://github.com/OpenBMB/MiniCPM-V/blob/main/docs/inference_on_multiple_gpus.md),详细了解如何使用多张低显存显卡载入模型并进行推理。
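+
+ 下面是一个基于 `device_map` 自动切分的最小示意(假设:已安装 accelerate,使用两张 12 GB 显卡,`max_memory` 的取值仅为示例;实际切分与载入方式请以上述教程为准):
+ ```python
+ import torch
+ from transformers import AutoModel, AutoTokenizer
+
+ model = AutoModel.from_pretrained(
+     'openbmb/MiniCPM-Llama3-V-2_5',
+     trust_remote_code=True,
+     torch_dtype=torch.float16,
+     device_map='auto',  # 由 accelerate 按各卡剩余显存自动放置模型各层
+     max_memory={0: '11GiB', 1: '11GiB'},  # 示例值:为每张 12 GB 显卡预留少量余量
+ )
+ tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True)
+ model.eval()
+ ```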
2356
+
2357
+
2358
+ ### Mac 推理
2359
+ <details>
2360
+ <summary>点击查看 MiniCPM-Llama3-V 2.5 / MiniCPM-V 2.0 基于Mac MPS运行 (Apple silicon 或 AMD GPUs)的示例。 </summary>
2361
+
2362
+ ```python
2363
+ # test.py Need more than 16GB memory to run.
2364
+ import torch
2365
+ from PIL import Image
2366
+ from transformers import AutoModel, AutoTokenizer
2367
+
2368
+ model = AutoModel.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True, low_cpu_mem_usage=True)
2369
+ model = model.to(device='mps')
2370
+
2371
+ tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True)
2372
+ model.eval()
2373
+
2374
+ image = Image.open('./assets/hk_OCR.jpg').convert('RGB')
2375
+ question = 'Where is this photo taken?'
2376
+ msgs = [{'role': 'user', 'content': question}]
2377
+
2378
+ answer, context, _ = model.chat(
2379
+ image=image,
2380
+ msgs=msgs,
2381
+ context=None,
2382
+ tokenizer=tokenizer,
2383
+ sampling=True
2384
+ )
2385
+ print(answer)
2386
+ ```
2387
+ 运行:
2388
+ ```shell
2389
+ PYTORCH_ENABLE_MPS_FALLBACK=1 python test.py
2390
+ ```
2391
+ </details>
2392
+
2393
+
2394
+ ### 基于 llama.cpp、ollama、vLLM 的高效推理
2395
+
2396
+ llama.cpp 用法请参考[我们的fork llama.cpp](https://github.com/OpenBMB/llama.cpp/tree/minicpmv-main/examples/llava/README-minicpmv2.6.md), 在iPad上可以支持 16~18 token/s 的流畅推理(测试环境:iPad Pro + M4)。
2397
+
2398
+ ollama 用法请参考[我们的fork ollama](https://github.com/OpenBMB/ollama/blob/minicpm-v2.6/examples/minicpm-v2.6/README.md), 在iPad上可以支持 16~18 token/s 的流畅推理(测试环境:iPad Pro + M4)。
2399
+
2400
+ <details>
2401
+ <summary>点击查看, vLLM 现已官方支持MiniCPM-o 2.6、MiniCPM-V 2.6、MiniCPM-Llama3-V 2.5 和 MiniCPM-V 2.0。 </summary>
2402
+ 1. 安装 vLLM(>=0.7.1):
2403
+
2404
+ ```shell
2405
+ pip install vllm
2406
+ ```
2407
+
2408
+ 2. 运行示例代码(注意:如果使用本地路径的模型,请确保模型代码已更新到 Hugging Face 上的最新版;列表后附有一个最小调用示意):
2409
+
2410
+ * [图文示例](https://docs.vllm.ai/en/latest/getting_started/examples/vision_language.html)
2411
+ * [音频示例](https://docs.vllm.ai/en/latest/getting_started/examples/audio_language.html)
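+
+ 下面是一个离线图文推理的最小示意(假设:`example.jpg` 与图像占位符写法仅为示例,实际 prompt 模板与参数请以上方官方图文示例为准):
+ ```python
+ from PIL import Image
+ from vllm import LLM, SamplingParams
+
+ llm = LLM(model="openbmb/MiniCPM-V-2_6", trust_remote_code=True, max_model_len=4096)
+ image = Image.open("example.jpg").convert("RGB")  # 假设的本地图片
+ prompt = "(<image>./</image>)\n请描述这张图片。"  # 假设的占位符写法
+ outputs = llm.generate(
+     {"prompt": prompt, "multi_modal_data": {"image": image}},
+     SamplingParams(temperature=0.7, max_tokens=128),
+ )
+ print(outputs[0].outputs[0].text)
+ ```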
2412
+
2413
+ </details>
2414
+
2415
+
2416
+ ## 微调
2417
+
2418
+ ### 简易微调 <!-- omit in toc -->
2419
+
2420
+ 我们支持使用 Huggingface Transformers 库简易地微调 MiniCPM-o 2.6、MiniCPM-V 2.6、MiniCPM-Llama3-V 2.5 和 MiniCPM-V 2.0 模型。
2421
+
2422
+ [参考文档](./finetune/readme.md)
2423
+
2424
+
2425
+ ### 使用 Align-Anything <!-- omit in toc -->
2426
+
2427
+ 我们支持使用北大团队开发的 [Align-Anything](https://github.com/PKU-Alignment/align-anything) 框架微调 MiniCPM-o 系列模型,同时支持 DPO 和 SFT 在视觉和音频模态上的微调。Align-Anything 是一个用于对齐全模态大模型的高度可扩展框架,开源了[数据集、模型和评测](https://huggingface.co/datasets/PKU-Alignment/align-anything)。它支持了 30+ 开源基准,40+ 模型,以及包含SFT、SimPO、RLHF在内的多种算法,并提供了 30+ 直接可运行的脚本,适合初学者快速上手。
2428
+
2429
+ 最佳实践: [MiniCPM-o 2.6](https://github.com/PKU-Alignment/align-anything/tree/main/scripts).
2430
+
2431
+
2432
+ ### 使用 LLaMA-Factory <!-- omit in toc -->
2433
+
2434
+ 我们支持使用 LLaMA-Factory 微调 MiniCPM-o 2.6 和 MiniCPM-V 2.6。LLaMA-Factory 提供了一套灵活的微调方案,支持对 200 多个大型语言模型(LLM)进行 LoRA/Full/QLoRA 微调,无需编写代码,通过内置的 Web 用户界面 LLaMABoard 即可完成训练/推理/评估。它支持 sft/ppo/dpo/kto 等多种训练方法,还支持 Galore/BAdam/LLaMA-Pro/Pissa/LongLoRA 等高级算法。
2435
+
2436
+ 最佳实践: [MiniCPM-o 2.6 | MiniCPM-V 2.6](./docs/llamafactory_train_and_infer.md).
2437
+
2438
+
2439
+ ### 使用 SWIFT 框架 <!-- omit in toc -->
2440
+
2441
+ 我们支持使用 SWIFT 框架微调 MiniCPM-V 系列模型。SWIFT 支持近 200 种大语言模型和多模态大模型的训练、推理、评测和部署,既支持 PEFT 提供的轻量级训练方案,也支持完整 Adapters 库中的最新训练技术,如 NEFTune、LoRA+、LLaMA-PRO 等。
2442
+
2443
+ 参考文档:[MiniCPM-V 1.0](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v最佳实践.md),[MiniCPM-V 2.0](https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v-2最佳实践.md),[MiniCPM-V 2.6](https://github.com/modelscope/ms-swift/issues/1613)。
2444
+
2445
+ ## FAQs
2446
+ 点击查看 [FAQs](./docs/faqs.md)
2447
+
2448
+
2449
+ ## 模型局限性
2450
+
2451
+ 我们实验发现 MiniCPM-o 2.6 存在一些显著的局限性,需要进一步研究和改进:
2452
+ - **不稳定的语音输出。** 语音生成可能会受到背景噪音和无意义声音的影响,表现不稳定。
2453
+ - **重复响应。** 当遇到连续相似的用户请求时,模型往往会重复相同的回答。
2454
+ - **Web Demo 延迟较高。** 用户在使用远程服务器上部署的 web demo 时可能会产生较高延迟。我们推荐用户在本地部署来获得更低延迟的体验。
2455
+
2456
+
2457
+ ## 模型协议 <!-- omit in toc -->
2458
+
2459
+ * 本仓库中代码依照 [Apache-2.0](https://github.com/OpenBMB/MiniCPM/blob/main/LICENSE) 协议开源
2460
+ * MiniCPM-o/V 模型权重的使用则需要遵循 [“MiniCPM模型商用许可协议.md”](https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%E6%A8%A1%E5%9E%8B%E5%95%86%E7%94%A8%E8%AE%B8%E5%8F%AF%E5%8D%8F%E8%AE%AE.md)。
2461
+ * MiniCPM 模型权重对学术研究完全开放,在填写[“问卷”](https://modelbest.feishu.cn/share/base/form/shrcnpV5ZT9EJ6xYjh3Kx0J6v8g)进行登记后亦允许免费商业使用。
2462
+
2463
+ ## 声明 <!-- omit in toc -->
2464
+
2465
+ 作为多模态大模型,MiniCPM-o/V 系列模型(包括 OmniLMM)通过学习大量的多模态数据来生成内容,但它无法理解、表达个人观点或价值判断,它所输出的任何内容都不代表模型开发者的观点和立场。
2466
+
2467
+ 因此用户在使用本项目的系列模型生成的内容时,应自行负责对其进行评估和验证。如果由于使用本项目的系列开源模型而导致的任何问题,包括但不限于数据安全问题、公共舆论风险,或模型被误导、滥用、传播或不当利用所带来的任何风险和问题,我们将不承担任何责任。
2468
+
2469
+
2470
+ ## 机构 <!-- omit in toc -->
2471
+
2472
+ 本项目由以下机构共同开发:
2473
+
2474
+ - <img src="assets/thunlp.png" width="28px"> [清华大学自然语言处理实验室](https://nlp.csai.tsinghua.edu.cn/)
2475
+ - <img src="assets/modelbest.png" width="28px"> [面壁智能](https://modelbest.cn/)
2476
+
2477
+ ## 🌟 Star History <!-- omit in toc -->
2478
+
2479
+
2480
+ <!-- <table align="center">
2481
+ <p align="center">
2482
+ <img src="assets/star_history.svg"/>
2483
+ </p>
2484
+ </table> -->
2485
+
2486
+ <picture>
2487
+ <source
2488
+ media="(prefers-color-scheme: dark)"
2489
+ srcset="
2490
+ https://api.star-history.com/svg?repos=OpenBMB/MiniCPM-o&type=Date&theme=dark
2491
+ "
2492
+ />
2493
+ <source
2494
+ media="(prefers-color-scheme: light)"
2495
+ srcset="
2496
+ https://api.star-history.com/svg?repos=OpenBMB/MiniCPM-o&type=Date
2497
+ "
2498
+ />
2499
+ <img
2500
+ alt="Star History Chart"
2501
+ src="https://api.star-history.com/svg?repos=OpenBMB/MiniCPM-o&type=Date"
2502
+ />
2503
+ </picture>
2504
+
2505
+ ## 支持技术和其他多模态项目 <!-- omit in toc -->
2506
+
2507
+ 👏 欢迎了解 MiniCPM-o/V 背后的支持技术和更多我们的多模态项目!
2508
+
2509
+ [VisCPM](https://github.com/OpenBMB/VisCPM/tree/main) | [RLHF-V](https://github.com/RLHF-V/RLHF-V) | [LLaVA-UHD](https://github.com/thunlp/LLaVA-UHD) | [RLAIF-V](https://github.com/RLHF-V/RLAIF-V)
2510
+
2511
+
2512
+
2513
+ ## 引用 <!-- omit in toc -->
2514
+
2515
+ 如果您觉得我们模型/代码/论文有帮助,请给我们 ⭐ 和 引用 📝,感谢!
2516
+
2517
+ ```bib
2518
+ @article{yao2024minicpm,
2519
+ title={MiniCPM-V: A GPT-4V Level MLLM on Your Phone},
2520
+ author={Yao, Yuan and Yu, Tianyu and Zhang, Ao and Wang, Chongyi and Cui, Junbo and Zhu, Hongji and Cai, Tianchi and Li, Haoyu and Zhao, Weilin and He, Zhihui and others},
2521
+ journal={arXiv preprint arXiv:2408.01800},
2522
+ year={2024}
2523
+ }
2524
+ ```
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/cgbench.py ADDED
@@ -0,0 +1,1760 @@
1
+ from huggingface_hub import snapshot_download
2
+ from ..smp import *
3
+ from .video_base import VideoBaseDataset
4
+ from .utils import build_judge, DEBUG_MESSAGE
5
+ from .utils.cgbench import *
6
+ from ..utils import track_progress_rich
7
+
8
+
9
+ class CGBench_MCQ_Grounding_Mini(VideoBaseDataset):
10
+
11
+ dataset = "CG-Bench_MCQ_Grounding_Mini"
12
+
13
+ TYPE = "Video-MCQ-Grounding"
14
+
15
+ MD5 = "54ed3e90a51a6fb375c92b319a715f72"
16
+
17
+ SYS = {
18
+ "long_acc": (
19
+ "You will be provided with sampled frames from a video, along with a "
20
+ "multiple-choice question that includes a question and several answer options.\n"
21
+ "Your task is to analyze the provided frames, infer the most plausible "
22
+ "answer based on the visual information.\n"
23
+ "If the video does not provide enough information, infer the answer based "
24
+ "on the options available and still provide a result. "
25
+ "Therefore, In all cases, an answer must be given.\n"
26
+ "Only output the answer in the following format:\n\n"
27
+ '```json\n{"result": "option"}\n```\n\n'
28
+ 'The "option" is the uppercase letter corresponding to your answer.\n\n'
29
+ ),
30
+ "clue_acc": (
31
+ "You will be provided with sampled frames from a video, along with a "
32
+ "multiple-choice question that includes a question and several answer options.\n"
33
+ "Your task is to analyze the provided frames, infer the most plausible "
34
+ "answer based on the visual information.\n"
35
+ "If the video does not provide enough information, infer the answer based "
36
+ "on the options available and still provide a result. "
37
+ "Therefore, In all cases, an answer must be given.\n"
38
+ "Only output the answer in the following format:\n\n"
39
+ '```json\n{"result": "option"}\n```\n\n'
40
+ "The 'option' is the uppercase letter corresponding to your answer.\n\n"
41
+ ),
42
+ "miou": (
43
+ "You will be provided with uniformly sampled frames from a video and their "
44
+ "timestamps, along with a multiple-choice question that includes a question "
45
+ "and several answer options.\n"
46
+ "Your task is to determine in which intervals the 'clue intervals' exist "
47
+ "that contain visual information needed to answer the question.\n"
48
+ "Only output the answer in the following format:\n\n"
49
+ '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n'
50
+ "In this output format, each 'start' and 'end' represents the beginning and "
51
+ "end of an interval in seconds where relevant clues can be found.\n"
52
+ "You must provide at least one interval and at most five intervals. "
53
+ "Intervals exceeding five will NOT be considered valid.\n"
54
+ ),
55
+ "miou_wo_frame_time": (
56
+ "You will be provided with uniformly sampled frames from a video, along "
57
+ "with a multiple-choice question that includes a question and several "
58
+ "answer options.\n"
59
+ "Your task is to determine in which intervals the 'clue intervals' exist "
60
+ "that contain visual information needed to answer the question.\n"
61
+ "Only output the answer in the following format:\n\n"
62
+ '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n'
63
+ 'In this output format, each "start" and "end" represents the start and '
64
+ "end of the video where the relevant clue can be found in the form of a "
65
+ "floating point number between 0 and 1, where 0 represents the start time "
66
+ "of the video and 1 represents the end time of the video.\n"
67
+ "You must provide at least one interval and at most five intervals. "
68
+ "Intervals exceeding five will NOT be considered valid.\n"
69
+ ),
70
+ }
71
+
72
+ def __init__(
73
+ self,
74
+ dataset="CG-Bench_MCQ_Grounding_Mini",
75
+ use_subtitle=False,
76
+ use_subtitle_time=False,
77
+ use_frame_time=False,
78
+ nframe=0,
79
+ fps=-1,
80
+ ):
81
+ super().__init__(dataset=dataset, nframe=nframe, fps=fps)
82
+ self.use_subtitle = use_subtitle
83
+ self.use_subtitle_time = use_subtitle_time
84
+ self.use_frame_time = use_frame_time
85
+ self.dataset_name = dataset
86
+ lmu_root = LMUDataRoot()
87
+ self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset)
88
+
89
+ @classmethod
90
+ def supported_datasets(cls):
91
+ return ["CG-Bench_MCQ_Grounding_Mini"]
92
+
93
+ def clue_frame_paths(self, qid, num_frames=8):
94
+ frame_root = osp.join(self.clue_frame_root, qid)
95
+ os.makedirs(frame_root, exist_ok=True)
96
+ return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
97
+
98
+ def clue_frame_paths_fps(self, qid, num_frames=8, fps=-1):
99
+ frame_root = osp.join(self.clue_frame_root, qid)
100
+ os.makedirs(frame_root, exist_ok=True)
101
+ return [osp.join(frame_root, self.frame_tmpl_fps.format(i, num_frames, fps)) for i in range(1, num_frames + 1)]
102
+
103
+ def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False):
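+ # Collect deduplicated subtitle lines (optionally prefixed with [start, end] in seconds); when frame_indices is given, keep only subtitles that overlap the sampled frames.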
104
+
105
+ subtitles = []
106
+
107
+ srt_path = osp.join(self.data_root, subtitle_path)
108
+ assert osp.exists(srt_path)
109
+ import pysubs2
110
+
111
+ subs = pysubs2.load(srt_path, encoding="utf-8")
112
+ if not frame_indices:
113
+ for sub in subs:
114
+ sub_text = sub.text.replace("\\N", " ")
115
+ if sub_time:
116
+ start_time = milliseconds_to_seconds(sub.start)
117
+ end_time = milliseconds_to_seconds(sub.end)
118
+ sub_text = f"[{start_time}, {end_time}] {sub_text}"
119
+ if sub_text.strip() and sub_text not in subtitles:
120
+ subtitles.append(sub_text)
121
+ else:
122
+ for selected_frame_id in frame_indices:
123
+ cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id)
124
+ for sub in subs:
125
+ if sub.start < cur_time and sub.end > cur_time:
126
+ sub_text = sub.text.replace("\\N", " ")
127
+ if sub_time:
128
+ start_time = milliseconds_to_seconds(sub.start)
129
+ end_time = milliseconds_to_seconds(sub.end)
130
+ sub_text = f"[{start_time}, {end_time}] {sub_text}"
131
+ if sub_text.strip() and sub_text not in subtitles:
132
+ subtitles.append(sub_text)
133
+
134
+ if subtitles:
135
+ subtitles_str = '\n'.join(subtitles)
136
+ return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n"
137
+ else:
138
+ return ""
139
+
140
+ def prepare_dataset(self, dataset_name="CG-Bench_MCQ_Grounding_Mini", repo_id="CG-Bench/CG-Bench"):
141
+
142
+ def check_integrity(pth):
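+ # The cached dataset is reused only if the TSV exists, its MD5 matches, and every referenced video file is present.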
143
+ data_file = osp.join(pth, f"{dataset_name}.tsv")
144
+
145
+ if not os.path.exists(data_file):
146
+ return False
147
+
148
+ if md5(data_file) != self.MD5:
149
+ return False
150
+ data = load(data_file)
151
+ for video_pth in data["video"]:
152
+ if not osp.exists(osp.join(pth, video_pth)):
153
+ return False
154
+
155
+ return True
156
+
157
+ cache_path = get_cache_path(repo_id)
158
+
159
+ if cache_path is not None and check_integrity(cache_path):
160
+ dataset_path = cache_path
161
+ else:
162
+
163
+ def generate_tsv(pth):
164
+
165
+ tsv_file = osp.join(pth, f"{dataset_name}.tsv")
166
+
167
+ task_modes = ["long_acc", "clue_acc", "miou"]
168
+ all_data = []
169
+ for task_mode in task_modes:
170
+ with open(osp.join(pth, "cgbench_mini.json"), "r") as f:
171
+ data_file = pd.DataFrame(json.load(f))
172
+
173
+ data_file = data_file.assign(index=range(len(data_file)))
174
+ data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4")
175
+ data_file["subtitle_path"] = data_file["video_uid"].apply(
176
+ lambda x: (
177
+ f"cg_subtitles/{x}.srt"
178
+ if osp.exists(osp.join(dataset_path, f"cg_subtitles/{x}.srt"))
179
+ else ""
180
+ )
181
+ )
182
+
183
+ data_file["clue_video_path"] = ""
184
+
185
+ if task_mode in ["clue_acc"]:
186
+ data_file["clue_video_path"] = data_file["clue_video_path"] = data_file.apply(
187
+ lambda row: f"cg_clue_videos/{row['qid']}.mp4", axis=1
188
+ )
189
+
190
+ data_file["task_mode"] = task_mode
191
+
192
+ if task_mode in ["clue_acc", "long_acc"]:
193
+ data_file["answer"] = data_file["right_answer"]
194
+
195
+ if task_mode == "miou":
196
+ data_file["answer"] = data_file["clue_intervals"]
197
+
198
+ if task_mode in ["long_acc", "miou"]:
199
+ data_file["clue_intervals"] = ""
200
+
201
+ data_file = data_file[
202
+ [
203
+ "index",
204
+ "video_uid",
205
+ "video",
206
+ "duration",
207
+ "domain",
208
+ "choices",
209
+ "sub_category",
210
+ "subtitle_path",
211
+ "question",
212
+ "answer",
213
+ "task_mode",
214
+ "clue_intervals",
215
+ "qid",
216
+ "clue_video_path",
217
+ ]
218
+ ]
219
+
220
+ all_data.append(data_file)
221
+
222
+ final_data = pd.concat(all_data, ignore_index=True)
223
+ final_data["index"] = range(len(final_data))
224
+ final_data.to_csv(tsv_file, sep="\t", index=False)
225
+
226
+ if modelscope_flag_set():
227
+ from modelscope import dataset_snapshot_download
228
+
229
+ dataset_path = dataset_snapshot_download(dataset_id=repo_id)
230
+ else:
231
+ dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
232
+
233
+ unzip_hf_zip(dataset_path)
234
+ generate_tsv(dataset_path)
235
+
236
+ tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv")
237
+
238
+ return dict(data_file=tsv_file, root=dataset_path)
239
+
240
+ def build_prompt(self, line, video_llm):
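+ # Assemble the multimodal message for one sample: pick video/frames according to task_mode, then append optional timestamps, subtitles, the question and its options.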
241
+
242
+ if isinstance(line, int):
243
+ assert line < len(self)
244
+ line = self.data.iloc[line]
245
+
246
+ task_mode = line["task_mode"]
247
+
248
+ message = []
249
+
250
+ origin_use_subtitle_time = self.use_subtitle_time
251
+
252
+ try:
253
+ if task_mode in ["long_acc", "clue_acc"]:
254
+ system_prompt = self.SYS[task_mode]
255
+ elif task_mode == "miou":
256
+ if self.use_frame_time and not video_llm:
257
+ system_prompt = self.SYS[task_mode]
258
+ else:
259
+ system_prompt = self.SYS["miou_wo_frame_time"]
260
+ if self.use_subtitle_time is True:
261
+ self.use_subtitle_time = False
262
+
263
+ user_prompt = ""
264
+
265
+ if task_mode in ["long_acc", "miou"]:
266
+ video_path = line["video"]
267
+
268
+ if video_llm:
269
+ message.append(dict(type="video", value=osp.join(self.data_root, video_path)))
270
+
271
+ if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
272
+ if self.nframe:
273
+ image_paths, frame_indices, vid_fps = self.save_video_frames(
274
+ video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
275
+ )
276
+ user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices,
277
+ fps=vid_fps, sub_time=self.use_subtitle_time)
278
+ else:
279
+ user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time)
280
+ else:
281
+ image_paths, frame_indices, vid_fps = self.save_video_frames(
282
+ video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
283
+ )
284
+ message.extend(dict(type="image", value=im) for im in image_paths)
285
+
286
+ if self.use_frame_time:
287
+ user_prompt += get_timestampes(frame_indices, vid_fps)
288
+
289
+ if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
290
+ user_prompt += self.get_subtitles(
291
+ line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps,
292
+ sub_time=self.use_subtitle_time
293
+ )
294
+
295
+ elif task_mode == "clue_acc":
296
+ clue_video_path = line["clue_video_path"]
297
+ video_path = line["video"]
298
+
299
+ if video_llm:
300
+ message.append(dict(type="video", value=osp.join(self.data_root, clue_video_path)))
301
+ print(message)
302
+
303
+ if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
304
+ if self.nframe:
305
+ image_paths, frame_indices, vid_fps = self.save_video_frames(
306
+ video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
307
+ )
308
+ user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices,
309
+ fps=vid_fps, sub_time=self.use_subtitle_time)
310
+ else:
311
+ user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time)
312
+ else:
313
+ if self.nframe > 32:
314
+ self.nframe = 32
315
+ print("The maximum number of frames is 32 when evaluating clue-based mcq in CG-Bench !")
316
+
317
+ clue_intervals = eval(line["clue_intervals"])
318
+
319
+ image_paths, frame_indices, vid_fps = self.save_video_frames(
320
+ video_path, uid=line["qid"], clue_intervals=clue_intervals, num_frames=self.nframe, fps=self.fps
321
+ )
322
+
323
+ message.extend(dict(type="image", value=im) for im in image_paths)
324
+
325
+ if self.use_frame_time:
326
+ user_prompt += get_timestampes(frame_indices, vid_fps)
327
+
328
+ if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
329
+ user_prompt += self.get_subtitles(
330
+ line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps,
331
+ sub_time=self.use_subtitle_time
332
+ )
333
+
334
+ question = line["question"]
335
+ user_prompt += f"Question: {question}\n\n"
336
+
337
+ choices = eval(line["choices"])
338
+ labels = [chr(ord("A") + i) for i in range(len(choices))]
339
+ user_prompt += "\n".join([f"{label}:{value}" for label, value in zip(labels, choices)]) + "\n\n"
340
+
341
+ message.append(dict(type="text", value=system_prompt + user_prompt))
342
+
343
+ return message
344
+
345
+ finally:
346
+ # Ensure that `use_subtitle_time` is always restored to its original value
347
+ self.use_subtitle_time = origin_use_subtitle_time
348
+
349
+ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1):
350
+
351
+ if type(uid) is not str:
352
+ uid = str(uid)
353
+
354
+ vid_path = osp.join(self.data_root, video)
355
+ vid = decord.VideoReader(vid_path)
356
+ vid_fps = vid.get_avg_fps()
357
+ n_frames = len(vid)
358
+
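+ # With clue intervals: restrict sampling to the merged clue spans (fixed count or fixed fps, with a 32-frame floor in the fps mode).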
359
+ if clue_intervals is not None:
360
+ merged_intervals = merge_intervals(clue_intervals)
361
+
362
+ if num_frames > 0 and fps < 0:
363
+ indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps)
364
+ frame_paths = self.clue_frame_paths(uid, len(indices))
365
+
366
+ elif fps > 0:
367
+ frame_indices = []
368
+ for start, end in merged_intervals:
369
+ start_frame = int(start * vid_fps)
370
+ end_frame = int(end * vid_fps)
371
+ step = vid_fps / fps
372
+ interval_indices = [
373
+ int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step))
374
+ ]
375
+ frame_indices.extend(interval_indices)
376
+
377
+ if len(frame_indices) < 32:
378
+ indices = sample_frames_clue_average(merged_intervals, 32, vid_fps)
379
+ else:
380
+ indices = frame_indices
381
+ frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps)
382
+
383
+ else:
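+ # No clue intervals: sample uniformly over the whole video, either a fixed number of frames (num_frames) or at a fixed rate (fps).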
384
+ if num_frames > 0 and fps < 0:
385
+ step_size = len(vid) / (num_frames + 1)
386
+ indices = [int(i * step_size) for i in range(1, num_frames + 1)]
387
+
388
+ frame_paths = self.frame_paths(uid)
389
+ elif fps > 0:
390
+ total_duration = n_frames / vid_fps
391
+ required_frames = int(total_duration * fps)
392
+ step_size = vid_fps / fps
393
+ indices = [int(i * step_size) for i in range(required_frames)]
394
+ frame_paths = self.frame_paths_fps(uid, len(indices))
395
+
396
+ # Save and validate frames
397
+ valid_paths = []
398
+ valid_indices = []
399
+
400
+ if not np.all([osp.exists(p) for p in frame_paths]):
401
+ images = [vid[i].asnumpy() for i in indices]
402
+ for i, (img_array, path) in enumerate(zip(images, frame_paths)):
403
+ if osp.exists(path):
404
+ try:
405
+ with Image.open(path) as img:
406
+ img.verify()
407
+ valid_paths.append(path)
408
+ valid_indices.append(indices[i])
409
+ except Exception:
410
+ continue
411
+ else:
412
+ try:
413
+ img = Image.fromarray(img_array)
414
+ img.save(path)
415
+ img.verify()
416
+ valid_paths.append(path)
417
+ valid_indices.append(indices[i])
418
+ except Exception:
419
+ continue
420
+ else:
421
+ for i, path in enumerate(frame_paths):
422
+ try:
423
+ with Image.open(path) as img:
424
+ img.verify()
425
+ valid_paths.append(path)
426
+ valid_indices.append(indices[i])
427
+ except Exception:
428
+ continue
429
+
430
+ return valid_paths, valid_indices, vid_fps
431
+
432
+ def evaluate(self, eval_file, **judge_kwargs):
433
+
434
+ assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
435
+
436
+ tgt_file = eval_file.replace(".xlsx", "_rating.json")
437
+ score_file = eval_file.replace(".xlsx", "_score.xlsx")
438
+
439
+ data = load(eval_file)
440
+
441
+ data_un = data[~pd.isna(data["prediction"])]
442
+ data_pred_na = data[pd.isna(data["prediction"])]
443
+
444
+ data_pred_na["score"] = -1
445
+
446
+ data_un["score"] = data_un.apply(
447
+ lambda row: post_process(
448
+ response=row["prediction"],
449
+ right_answer=row["answer"],
450
+ task_mode=row["task_mode"],
451
+ duration=row["duration"],
452
+ ),
453
+ axis=1,
454
+ )
455
+
456
+ data = pd.concat([data_pred_na, data_un])
457
+
458
+ rejected_count = (data["score"] == -1).sum()
459
+
460
+ print(
461
+ f"Among {len(data)} questions, "
462
+ f"failed to obtain prediction for {len(data_pred_na)} questions, "
463
+ f"failed to obtain the score for {rejected_count - len(data_pred_na)} questions. "
464
+ f"Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating."
465
+ )
466
+
467
+ dump(data, score_file)
468
+
469
+ rating = get_dimention_rating_mcq_grouding(score_file)
470
+
471
+ dump(rating, tgt_file)
472
+
473
+ return rating
474
+
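+ # Usage sketch (assumption: environment prepared per VLMEvalKit conventions; file names are illustrative):
+ # dataset = CGBench_MCQ_Grounding_Mini(nframe=32, use_frame_time=True)
+ # message = dataset.build_prompt(0, video_llm=False)  # multimodal message for sample 0
+ # rating = dataset.evaluate("your_model_CG-Bench_MCQ_Grounding_Mini.xlsx")  # score saved predictions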
475
+
476
+ # For step-2 evaluation, passing [prompt] + image_paths is sufficient.
477
+ class CGBench_OpenEnded_Mini(VideoBaseDataset):
478
+
479
+ TYPE = "Video-OpenEnded"
480
+
481
+ dataset = "CG-Bench_OpenEnded_Mini"
482
+
483
+ MD5 = "9175791b11afdfa305fdb3e525b7a4ee"
484
+
485
+ SYS = (
486
+ "You will be provided with sampled frames from a video, along with a "
487
+ "question.\n"
488
+ "Your task is to analyze the provided frames and infer the most plausible "
489
+ "answer based on the visual information.\n"
490
+ "If the visual information is ambiguous or insufficient, use the available "
491
+ "context to reason your answer.\n"
492
+ "Only output the answer in the following format:\n\n"
493
+ '```json\n{"result": "answer"}\n```\n\n'
494
+ 'The "answer" can be a word, phrase, or sentence that directly responds to '
495
+ "the question.\n\n"
496
+ )
497
+
498
+ def __init__(
499
+ self,
500
+ dataset="CG-Bench_OpenEnded_Mini",
501
+ use_subtitle=False,
502
+ use_subtitle_time=False,
503
+ use_frame_time=False,
504
+ nframe=0,
505
+ fps=-1,
506
+ ):
507
+ super().__init__(dataset=dataset, nframe=nframe, fps=fps)
508
+ self.use_subtitle = use_subtitle
509
+ self.use_subtitle_time = use_subtitle_time
510
+ self.use_frame_time = use_frame_time
511
+ self.dataset_name = dataset
512
+ lmu_root = LMUDataRoot()
513
+ self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset)
514
+
515
+ @classmethod
516
+ def supported_datasets(cls):
517
+ return ["CG-Bench_OpenEnded_Mini"]
518
+
519
+ def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False):
520
+
521
+ subtitles = []
522
+
523
+ srt_path = osp.join(self.data_root, subtitle_path)
524
+ assert osp.exists(srt_path)
525
+ import pysubs2
526
+
527
+ subs = pysubs2.load(srt_path, encoding="utf-8")
528
+ if not frame_indices:
529
+ for sub in subs:
530
+ sub_text = sub.text.replace("\\N", " ")
531
+ if sub_time:
532
+ start_time = milliseconds_to_seconds(sub.start)
533
+ end_time = milliseconds_to_seconds(sub.end)
534
+ sub_text = f"[{start_time}, {end_time}] {sub_text}"
535
+ if sub_text.strip() and sub_text not in subtitles:
536
+ subtitles.append(sub_text)
537
+ else:
538
+ for selected_frame_id in frame_indices:
539
+ cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id)
540
+ for sub in subs:
541
+ if sub.start < cur_time and sub.end > cur_time:
542
+ sub_text = sub.text.replace("\\N", " ")
543
+ if sub_time:
544
+ start_time = milliseconds_to_seconds(sub.start)
545
+ end_time = milliseconds_to_seconds(sub.end)
546
+ sub_text = f"[{start_time}, {end_time}] {sub_text}"
547
+ if sub_text.strip() and sub_text not in subtitles:
548
+ subtitles.append(sub_text)
549
+
550
+ if subtitles:
551
+ subtitles_str = '\n'.join(subtitles)
552
+ return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n"
553
+ else:
554
+ return ""
555
+
556
+ def prepare_dataset(self, dataset_name="CG-Bench_OpenEnded_Mini", repo_id="CG-Bench/CG-Bench"):
557
+
558
+ def check_integrity(pth):
559
+ data_file = osp.join(pth, f"{dataset_name}.tsv")
560
+
561
+ if not os.path.exists(data_file):
562
+ return False
563
+
564
+ if md5(data_file) != self.MD5:
565
+ return False
566
+ data = load(data_file)
567
+ for video_pth in data["video"]:
568
+ if not osp.exists(osp.join(pth, video_pth)):
569
+ return False
570
+
571
+ return True
572
+
573
+ cache_path = get_cache_path(repo_id)
574
+
575
+ if cache_path is not None and check_integrity(cache_path):
576
+ dataset_path = cache_path
577
+ else:
578
+
579
+ def generate_tsv(pth):
580
+
581
+ tsv_file = osp.join(pth, f"{dataset_name}.tsv")
582
+
583
+ with open(osp.join(pth, "cgbench_mini.json"), "r") as f:
584
+ data_file = pd.DataFrame(json.load(f))
585
+
586
+ data_file = data_file.assign(index=range(len(data_file)))
587
+ data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4")
588
+ data_file["subtitle_path"] = data_file["video_uid"].apply(
589
+ lambda x: f"cg_subtitles/{x}.srt" if osp.exists(osp.join(pth, f"cg_subtitles/{x}.srt")) else ""
590
+ )
591
+
592
+ data_file = data_file[
593
+ [
594
+ "index",
595
+ "video_uid",
596
+ "video",
597
+ "duration",
598
+ "domain",
599
+ "sub_category",
600
+ "subtitle_path",
601
+ "question",
602
+ "answer",
603
+ "clue_intervals",
604
+ "qid",
605
+ ]
606
+ ]
607
+
608
+ data_file.to_csv(tsv_file, sep="\t", index=False)
609
+
610
+ if modelscope_flag_set():
611
+ from modelscope import dataset_snapshot_download
612
+
613
+ dataset_path = dataset_snapshot_download(dataset_id=repo_id)
614
+ else:
615
+ dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
616
+
617
+ unzip_hf_zip(dataset_path)
618
+ generate_tsv(dataset_path)
619
+
620
+ tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv")
621
+
622
+ return dict(data_file=tsv_file, root=dataset_path)
623
+
624
+ def build_prompt(self, line, video_llm):
625
+
626
+ if isinstance(line, int):
627
+ assert line < len(self)
628
+ line = self.data.iloc[line]
629
+
630
+ message = []
631
+
632
+ sys_prompt = self.SYS
633
+
634
+ user_prompt = ""
635
+
636
+ video_path = line["video"]
637
+
638
+ if video_llm:
639
+ message.append(dict(type="video", value=osp.join(self.data_root, video_path)))
640
+ if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
641
+ if self.nframe:
642
+ image_paths, frame_indices, vid_fps = self.save_video_frames(
643
+ video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
644
+ )
645
+ user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices,
646
+ fps=vid_fps, sub_time=self.use_subtitle_time)
647
+ else:
648
+ user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time)
649
+ else:
650
+ image_paths, frame_indices, vid_fps = self.save_video_frames(
651
+ video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
652
+ )
653
+ message.extend(dict(type="image", value=im) for im in image_paths)
654
+
655
+ if self.use_frame_time:
656
+ user_prompt += get_timestampes(frame_indices, vid_fps)
657
+
658
+ if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
659
+ user_prompt += self.get_subtitles(
660
+ line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps,
661
+ sub_time=self.use_subtitle_time
662
+ )
663
+
664
+ question = line["question"]
665
+ user_prompt += f"Question: {question}\n\n"
666
+
667
+ message.append(dict(type="text", value=sys_prompt + user_prompt))
668
+
669
+ return message
670
+
671
+ def clue_frame_paths(self, qid, num_frames=8):
672
+ frame_root = osp.join(self.clue_frame_root, qid)
673
+ os.makedirs(frame_root, exist_ok=True)
674
+ return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
675
+
676
+ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1):
677
+
678
+ if type(uid) is not str:
679
+ uid = str(uid)
680
+
681
+ vid_path = osp.join(self.data_root, video)
682
+ vid = decord.VideoReader(vid_path)
683
+ vid_fps = vid.get_avg_fps()
684
+ n_frames = len(vid)
685
+
686
+ if clue_intervals is not None:
687
+ merged_intervals = merge_intervals(clue_intervals)
688
+
689
+ if num_frames > 0 and fps < 0:
690
+ indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps)
691
+ frame_paths = self.clue_frame_paths(uid, len(indices))
692
+
693
+ elif fps > 0:
694
+ frame_indices = []
695
+ for start, end in merged_intervals:
696
+ start_frame = int(start * vid_fps)
697
+ end_frame = int(end * vid_fps)
698
+ step = vid_fps / fps
699
+ interval_indices = [
700
+ int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step))
701
+ ]
702
+ frame_indices.extend(interval_indices)
703
+
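+ # fps-based sampling over short clue intervals can yield very few frames, so
+ # fall back to evenly sampling 32 frames across the merged intervals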
704
+ if len(frame_indices) < 32:
705
+ indices = sample_frames_clue_average(merged_intervals, 32, vid_fps)
706
+ else:
707
+ indices = frame_indices
708
+ frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps)
709
+
710
+ else:
711
+ if num_frames > 0 and fps < 0:
712
+ step_size = len(vid) / (num_frames + 1)
713
+ indices = [int(i * step_size) for i in range(1, num_frames + 1)]
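+ # e.g. an 8-frame request on a 900-frame video gives step_size = 100 and
+ # indices [100, 200, ..., 800]: evenly spaced, skipping both endpoints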
714
+ frame_paths = self.frame_paths(uid)
715
+ elif fps > 0:
716
+ total_duration = n_frames / vid_fps
717
+ required_frames = int(total_duration * fps)
718
+ step_size = vid_fps / fps
719
+ indices = [int(i * step_size) for i in range(required_frames)]
720
+ frame_paths = self.frame_paths_fps(uid, len(indices))
721
+
722
+ valid_paths = []
723
+ valid_indices = []
724
+
725
+ if not np.all([osp.exists(p) for p in frame_paths]):
726
+ images = [vid[i].asnumpy() for i in indices]
727
+ for i, (img_array, path) in enumerate(zip(images, frame_paths)):
728
+ if osp.exists(path):
729
+ try:
730
+ with Image.open(path) as img:
731
+ img.verify()
732
+ valid_paths.append(path)
733
+ valid_indices.append(indices[i])
734
+ except Exception:
735
+ continue
736
+ else:
737
+ try:
738
+ img = Image.fromarray(img_array)
739
+ img.save(path)
740
+ with Image.open(path) as saved: # verify() must run on a freshly opened file; on an in-memory image it is a no-op
+ saved.verify()
741
+ valid_paths.append(path)
742
+ valid_indices.append(indices[i])
743
+ except Exception:
744
+ continue
745
+ else:
746
+ for i, path in enumerate(frame_paths):
747
+ try:
748
+ with Image.open(path) as img:
749
+ img.verify()
750
+ valid_paths.append(path)
751
+ valid_indices.append(indices[i])
752
+ except Exception:
753
+ continue
754
+
755
+ return valid_paths, valid_indices, vid_fps
756
+
757
+ def evaluate(self, eval_file, **judge_kwargs):
758
+
759
+ from .utils.cgbench import get_dimention_rating_open_ended, post_process_open
760
+
761
+ assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
762
+
763
+ tgt_file = eval_file.replace(".xlsx", "_rating.json")
764
+ score_file = eval_file.replace(".xlsx", "_score.xlsx")
765
+ step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl")
766
+ step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl")
767
+
768
+ data = load(eval_file)
769
+
770
+ data_pred_no_na = data[~pd.isna(data["prediction"])].copy()  # .copy() so the column assignments below do not modify a view of `data`
771
+ data_pred_na = data[pd.isna(data["prediction"])].copy()
772
+
773
+ data_pred_na["model_result"] = -1
774
+ data_pred_na["step_1_result"] = -1
775
+ data_pred_na["step_2_result"] = -1
776
+ data_pred_na["score"] = -1
777
+
778
+ data_pred_no_na["model_result"] = data_pred_no_na.apply(
779
+ lambda row: post_process_open(
780
+ response=row["prediction"],
781
+ ),
782
+ axis=1,
783
+ )
784
+
785
+ data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1]
786
+ data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]
787
+
788
+ if judge_kwargs.get("model", None) != "gpt-4o-0806":
789
+ judge_kwargs["model"] = "gpt-4o-0806"
790
+ print("The judge model in cg-bench is gpt-4o-0806!")
791
+
792
+ model_step_1 = build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs)
793
+ nproc = judge_kwargs.pop("nproc", 32)
794
+
795
+ lines_step_1 = data_step_1.to_dict("records")
796
+ tups_step_1 = [(model_step_1, line) for line in lines_step_1]
797
+
798
+ keys_step_1 = [line["qid"] for line in lines_step_1]  # a list (not a set) keeps the qids aligned with tups_step_1 for the zip below
799
+
800
+ ans = {}
801
+ if osp.exists(step_1_tmp_file):
802
+ ans = load(step_1_tmp_file)
803
+ tups_step_1 = [x for x, i in zip(tups_step_1, keys_step_1) if i not in ans]
804
+ keys_step_1 = [i for i in keys_step_1 if i not in ans]
805
+
806
+ _ = track_progress_rich(
807
+ eval_open_first,
808
+ tups_step_1,
809
+ nproc=nproc,
810
+ keys=keys_step_1,
811
+ save=step_1_tmp_file,
812
+ )
813
+
814
+ step_1_results = load(step_1_tmp_file)
815
+ data_step_1 = save_step_1_steps(data_step_1, step_1_results)  # step_1_result: -1 = parse failure, 0/1 = scored directly, 2 = needs step-2 judging
816
+
817
+ data_no_step_1_results = data_step_1[data_step_1["step_1_result"] == -1]
818
+ data_step_1_over = data_step_1[data_step_1["step_1_result"].isin([0, 1])]
819
+ data_step_2 = data_step_1[data_step_1["step_1_result"] == 2]
820
+
821
+ print(judge_kwargs)
822
+
823
+ model_step_2 = build_judge(system_prompt=sys_prompt_open_eval_step_2, **judge_kwargs)
824
+
825
+ lines_step_2 = data_step_2.to_dict("records")
826
+
827
+ tups_step_2 = []
828
+
829
+ for line in tqdm(lines_step_2):
830
+ clue_intervals = eval(line["clue_intervals"])
831
+ lmu_root = LMUDataRoot()
832
+ clue_frame_root = osp.join(lmu_root, "clue_images", self.dataset)
833
+ data_root = self.data_root
834
+ frame_paths, _, _ = save_clue_video_frames(
835
+ data_root,
836
+ clue_frame_root,
837
+ video=line["video"],
838
+ uid=line["qid"],
839
+ clue_intervals=clue_intervals,
840
+ num_frames=32,
841
+ )
842
+ tups_step_2.append((model_step_2, line, frame_paths))
843
+
844
+ keys_step_2 = [line["qid"] for line in lines_step_2]  # a list (not a set) keeps the qids aligned with tups_step_2 for the zip below
845
+
846
+ ans = {}
847
+ if osp.exists(step_2_tmp_file):
848
+ ans = load(step_2_tmp_file)
849
+ tups_step_2 = [x for x, i in zip(tups_step_2, keys_step_2) if i not in ans]
850
+ keys_step_2 = [i for i in keys_step_2 if i not in ans]
851
+
852
+ _ = track_progress_rich(
853
+ eval_open_second,
854
+ tups_step_2,
855
+ nproc=nproc,
856
+ keys=keys_step_2,
857
+ save=step_2_tmp_file,
858
+ )
859
+
860
+ step_2_results = load(step_2_tmp_file)
861
+ data_step_2 = save_step_2_steps(data_step_2, step_2_results)
862
+
863
+ data_no_step_2_results = data_step_2[data_step_2["score"] == -1]
864
+ data_step_2_over = data_step_2[data_step_2["score"].isin([0, 1])]
865
+
866
+ data = pd.concat(
867
+ [
868
+ data_pred_na,
869
+ data_no_model_result,
870
+ data_no_step_1_results,
871
+ data_step_1_over,
872
+ data_no_step_2_results,
873
+ data_step_2_over,
874
+ ]
875
+ )
876
+
877
+ dump(data, score_file)
878
+
879
+ rating = get_dimention_rating_open_ended(score_file)
880
+
881
+ dump(rating, tgt_file)
882
+
883
+ return rating
884
+
885
+
886
+ class CGBench_MCQ_Grounding(VideoBaseDataset):
887
+
888
+ TYPE = "Video-MCQ-Grounding"
889
+
890
+ MD5 = "eaead3d978a689269fefce4ae29c86df"
891
+
892
+ SYS = {
893
+ "long_acc": (
894
+ "You will be provided with sampled frames from a video, along with a "
895
+ "multiple-choice question that includes a question and several answer options.\n"
896
+ "Your task is to analyze the provided frames, infer the most plausible "
897
+ "answer based on the visual information.\n"
898
+ "If the video does not provide enough information, infer the answer based "
899
+ "on the options available and still provide a result. "
900
+ "Therefore, In all cases, an answer must be given.\n"
901
+ "Only output the answer in the following format:\n\n"
902
+ '```json\n{"result": "option"}\n```\n\n'
903
+ 'The "option" is the uppercase letter corresponding to your answer.\n\n'
904
+ ),
905
+ "clue_acc": (
906
+ "You will be provided with sampled frames from a video, along with a "
907
+ "multiple-choice question that includes a question and several answer options.\n"
908
+ "Your task is to analyze the provided frames, infer the most plausible "
909
+ "answer based on the visual information.\n"
910
+ "If the video does not provide enough information, infer the answer based "
911
+ "on the options available and still provide a result. "
912
+ "Therefore, In all cases, an answer must be given.\n"
913
+ "Only output the answer in the following format:\n\n"
914
+ '```json\n{"result": "option"}\n```\n\n'
915
+ "The 'option' is the uppercase letter corresponding to your answer.\n\n"
916
+ ),
917
+ "miou": (
918
+ "You will be provided with uniformly sampled frames from a video and their "
919
+ "timestamps, along with a multiple-choice question that includes a question "
920
+ "and several answer options.\n"
921
+ "Your task is to determine in which intervals the 'clue intervals' exist "
922
+ "that contain visual information needed to answer the question.\n"
923
+ "Only output the answer in the following format:\n\n"
924
+ '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n'
925
+ "In this output format, each 'start' and 'end' represents the beginning and "
926
+ "end of an interval in seconds where relevant clues can be found.\n"
927
+ "You must provide at least one interval and at most five intervals. "
928
+ "Intervals exceeding five will NOT be considered valid.\n"
929
+ ),
930
+ "miou_wo_frame_time": (
931
+ "You will be provided with uniformly sampled frames from a video, along "
932
+ "with a multiple-choice question that includes a question and several "
933
+ "answer options.\n"
934
+ "Your task is to determine in which intervals the 'clue intervals' exist "
935
+ "that contain visual information needed to answer the question.\n"
936
+ "Only output the answer in the following format:\n\n"
937
+ '```json\n{"result": [[start1, end1], [start2, end2], ...]}\n```\n\n'
938
+ 'In this output format, each "start" and "end" represents the start and '
939
+ "end of the video where the relevant clue can be found in the form of a "
940
+ "floating point number between 0 and 1, where 0 represents the start time "
941
+ "of the video and 1 represents the end time of the video.\n"
942
+ "You must provide at least one interval and at most five intervals. "
943
+ "Intervals exceeding five will NOT be considered valid.\n"
944
+ ),
945
+ }
946
+
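+ # The prompts above constrain answers to a fenced ```json {"result": ...}``` block.
+ # A minimal parsing sketch of that convention (the real extraction presumably
+ # lives in the cgbench utils' `post_process`, which is not shown here):
+ #
+ # import json, re
+ # def parse_fenced_result(response):
+ #     m = re.search(r"```json\s*(\{.*?\})\s*```", response, re.DOTALL)
+ #     if m is None:
+ #         return -1  # the same failure sentinel the evaluate() methods use
+ #     try:
+ #         return json.loads(m.group(1))["result"]
+ #     except (json.JSONDecodeError, KeyError):
+ #         return -1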
947
+ def __init__(
948
+ self,
949
+ dataset="CG-Bench_MCQ_Grounding",
950
+ use_subtitle=False,
951
+ use_subtitle_time=False,
952
+ use_frame_time=False,
953
+ nframe=0,
954
+ fps=-1,
955
+ ):
956
+ super().__init__(dataset=dataset, nframe=nframe, fps=fps)
957
+ self.use_subtitle = use_subtitle
958
+ self.use_subtitle_time = use_subtitle_time
959
+ self.use_frame_time = use_frame_time
960
+ self.dataset_name = dataset
961
+ lmu_root = LMUDataRoot()
962
+ self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset)
963
+
964
+ @classmethod
965
+ def supported_datasets(cls):
966
+ return ["CG-Bench_MCQ_Grounding"]
967
+
968
+ def clue_frame_paths(self, qid, num_frames=8):
969
+ frame_root = osp.join(self.clue_frame_root, qid)
970
+ os.makedirs(frame_root, exist_ok=True)
971
+ return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
972
+
973
+ def clue_frame_paths_fps(self, qid, num_frames=8, fps=-1):
974
+ frame_root = osp.join(self.clue_frame_root, qid)
975
+ os.makedirs(frame_root, exist_ok=True)
976
+ return [osp.join(frame_root, self.frame_tmpl_fps.format(i, num_frames, fps)) for i in range(1, num_frames + 1)]
977
+
978
+ def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False):
979
+
980
+ subtitles = []
981
+
982
+ srt_path = osp.join(self.data_root, subtitle_path)
983
+ assert osp.exists(srt_path)
984
+ import pysubs2
985
+
986
+ subs = pysubs2.load(srt_path, encoding="utf-8")
987
+ if not frame_indices:
988
+ for sub in subs:
989
+ sub_text = sub.text.replace("\\N", " ")
990
+ if sub_time:
991
+ start_time = milliseconds_to_seconds(sub.start)
992
+ end_time = milliseconds_to_seconds(sub.end)
993
+ sub_text = f"[{start_time}, {end_time}] {sub_text}"
994
+ if sub_text.strip() and sub_text not in subtitles:
995
+ subtitles.append(sub_text)
996
+ else:
997
+ for selected_frame_id in frame_indices:
998
+ cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id)
999
+ for sub in subs:
1000
+ if sub.start < cur_time and sub.end > cur_time:
1001
+ sub_text = sub.text.replace("\\N", " ")
1002
+ if sub_time:
1003
+ start_time = milliseconds_to_seconds(sub.start)
1004
+ end_time = milliseconds_to_seconds(sub.end)
1005
+ sub_text = f"[{start_time}, {end_time}] {sub_text}"
1006
+ if sub_text.strip() and sub_text not in subtitles:
1007
+ subtitles.append(sub_text)
1008
+
1009
+ if subtitles:
1010
+ subtitles_str = '\n'.join(subtitles)
1011
+ return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n"
1012
+ else:
1013
+ return ""
1014
+
1015
+ def prepare_dataset(self, dataset_name="CG-Bench_MCQ_Grounding", repo_id="CG-Bench/CG-Bench"):
1016
+
1017
+ def check_integrity(pth):
1018
+ data_file = osp.join(pth, f"{dataset_name}.tsv")
1019
+
1020
+ if not os.path.exists(data_file):
1021
+ return False
1022
+
1023
+ if md5(data_file) != self.MD5:
1024
+ return False
1025
+ data = load(data_file)
1026
+ for video_pth in data["video"]:
1027
+ if not osp.exists(osp.join(pth, video_pth)):
1028
+ return False
1029
+
1030
+ for clue_video_pth in data["clue_video_path"]:
1031
+ if clue_video_pth and not (isinstance(clue_video_pth, float) and np.isnan(clue_video_pth)):
1032
+ if not osp.exists(osp.join(pth, clue_video_pth)):
1033
+ return False
1034
+
1035
+ return True
1036
+
1037
+ cache_path = get_cache_path(repo_id)
1038
+
1039
+ if cache_path is not None and check_integrity(cache_path):
1040
+ dataset_path = cache_path
1041
+ else:
1042
+
1043
+ def generate_tsv(pth):
1044
+
1045
+ tsv_file = osp.join(pth, f"{dataset_name}.tsv")
1046
+
1047
+ task_modes = ["long_acc", "clue_acc", "miou"]
1048
+ all_data = []
1049
+ for task_mode in task_modes:
1050
+ with open(osp.join(pth, "cgbench.json"), "r") as f:
1051
+ data_file = pd.DataFrame(json.load(f))
1052
+
1053
+ data_file = data_file.assign(index=range(len(data_file)))
1054
+ data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4")
1055
+ data_file["subtitle_path"] = data_file["video_uid"].apply(
1056
+ lambda x: (
1057
+ f"cg_subtitles/{x}.srt"
1058
+ if osp.exists(osp.join(pth, f"cg_subtitles/{x}.srt"))
1059
+ else ""
1060
+ )
1061
+ )
1062
+
1063
+ data_file["clue_video_path"] = ""
1064
+
1065
+ if task_mode in ["clue_acc"]:
1066
+ data_file["clue_video_path"] = data_file["clue_video_path"] = data_file.apply(
1067
+ lambda row: f"cg_clue_videos/{row['qid']}.mp4", axis=1
1068
+ )
1069
+
1070
+ data_file["task_mode"] = task_mode
1071
+
1072
+ if task_mode in ["clue_acc", "long_acc"]:
1073
+ data_file["answer"] = data_file["right_answer"]
1074
+
1075
+ if task_mode == "miou":
1076
+ data_file["answer"] = data_file["clue_intervals"]
1077
+
1078
+ if task_mode in ["long_acc", "miou"]:
1079
+ data_file["clue_intervals"] = ""
1080
+
1081
+ data_file = data_file[
1082
+ [
1083
+ "index",
1084
+ "video_uid",
1085
+ "video",
1086
+ "duration",
1087
+ "domain",
1088
+ "choices",
1089
+ "sub_category",
1090
+ "subtitle_path",
1091
+ "question",
1092
+ "answer",
1093
+ "task_mode",
1094
+ "clue_intervals",
1095
+ "qid",
1096
+ "clue_video_path",
1097
+ ]
1098
+ ]
1099
+
1100
+ all_data.append(data_file)
1101
+
1102
+ final_data = pd.concat(all_data, ignore_index=True)
1103
+ final_data["index"] = range(len(final_data))
1104
+ final_data.to_csv(tsv_file, sep="\t", index=False)
1105
+
1106
+ if modelscope_flag_set():
1107
+ from modelscope import dataset_snapshot_download
1108
+
1109
+ dataset_path = dataset_snapshot_download(dataset_id=repo_id)
1110
+ else:
1111
+ dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
1112
+
1113
+ unzip_hf_zip(dataset_path)
1114
+ generate_tsv(dataset_path)
1115
+
1116
+ tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv")
1117
+
1118
+ return dict(data_file=tsv_file, root=dataset_path)
1119
+
1120
+ def build_prompt(self, line, video_llm):
1121
+
1122
+ if isinstance(line, int):
1123
+ assert line < len(self)
1124
+ line = self.data.iloc[line]
1125
+
1126
+ task_mode = line["task_mode"]
1127
+
1128
+ message = []
1129
+
1130
+ origin_use_subtitle_time = self.use_subtitle_time
1131
+
1132
+ try:
1133
+ if task_mode in ["long_acc", "clue_acc"]:
1134
+ system_prompt = self.SYS[task_mode]
1135
+ elif task_mode == "miou":
1136
+ if self.use_frame_time and not video_llm:
1137
+ system_prompt = self.SYS[task_mode]
1138
+ else:
1139
+ system_prompt = self.SYS["miou_wo_frame_time"]
1140
+ if self.use_subtitle_time is True:
1141
+ self.use_subtitle_time = False
1142
+
1143
+ user_prompt = ""
1144
+
1145
+ if task_mode in ["long_acc", "miou"]:
1146
+ video_path = line["video"]
1147
+
1148
+ if video_llm:
1149
+ message.append(dict(type="video", value=osp.join(self.data_root, video_path)))
1150
+
1151
+ if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
1152
+ if self.nframe:
1153
+ image_paths, frame_indices, vid_fps = self.save_video_frames(
1154
+ video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
1155
+ )
1156
+ user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices,
1157
+ fps=vid_fps, sub_time=self.use_subtitle_time)
1158
+ else:
1159
+ user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time)
1160
+ else:
1161
+ image_paths, frame_indices, vid_fps = self.save_video_frames(
1162
+ video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
1163
+ )
1164
+ message.extend(dict(type="image", value=im) for im in image_paths)
1165
+
1166
+ if self.use_frame_time:
1167
+ user_prompt += get_timestampes(frame_indices, vid_fps)
1168
+
1169
+ if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
1170
+ user_prompt += self.get_subtitles(
1171
+ line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps,
1172
+ sub_time=self.use_subtitle_time
1173
+ )
1174
+
1175
+ elif task_mode == "clue_acc":
1176
+ clue_video_path = line["clue_video_path"]
1177
+ video_path = line["video"]
1178
+
1179
+ if video_llm:
1180
+ message.append(dict(type="video", value=osp.join(self.data_root, clue_video_path)))
1181
+ print(message)
1182
+
1183
+ if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
1184
+ if self.nframe:
1185
+ image_paths, frame_indices, vid_fps = self.save_video_frames(
1186
+ video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
1187
+ )
1188
+ user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices,
1189
+ fps=vid_fps, sub_time=self.use_subtitle_time)
1190
+ else:
1191
+ user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time)
1192
+ else:
1193
+ if self.nframe > 32:
1194
+ self.nframe = 32
1195
+ print("The maximum number of frames is 32 when evaluating clue-based mcq in CG-Bench !")
1196
+
1197
+ clue_intervals = eval(line["clue_intervals"])
1198
+
1199
+ image_paths, frame_indices, vid_fps = self.save_video_frames(
1200
+ video_path, uid=line["qid"], clue_intervals=clue_intervals, num_frames=self.nframe, fps=self.fps
1201
+ )
1202
+
1203
+ message.extend(dict(type="image", value=im) for im in image_paths)
1204
+
1205
+ if self.use_frame_time:
1206
+ user_prompt += get_timestampes(frame_indices, vid_fps)
1207
+
1208
+ if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
1209
+ user_prompt += self.get_subtitles(
1210
+ line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps,
1211
+ sub_time=self.use_subtitle_time
1212
+ )
1213
+
1214
+ question = line["question"]
1215
+ user_prompt += f"Question: {question}\n\n"
1216
+
1217
+ choices = eval(line["choices"])
1218
+ labels = [chr(ord("A") + i) for i in range(len(choices))]
1219
+ user_prompt += "\n".join([f"{label}:{value}" for label, value in zip(labels, choices)]) + "\n\n"
1220
+
1221
+ message.append(dict(type="text", value=system_prompt + user_prompt))
1222
+
1223
+ return message
1224
+
1225
+ finally:
1226
+ # Ensure that `use_subtitle_time` is always restored to its original value
1227
+ self.use_subtitle_time = origin_use_subtitle_time
1228
+
1229
+ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1):
1230
+
1231
+ if type(uid) is not str:
1232
+ uid = str(uid)
1233
+
1234
+ vid_path = osp.join(self.data_root, video)
1235
+ vid = decord.VideoReader(vid_path)
1236
+ vid_fps = vid.get_avg_fps()
1237
+ n_frames = len(vid)
1238
+
1239
+ if clue_intervals is not None:
1240
+ merged_intervals = merge_intervals(clue_intervals)
1241
+
1242
+ if num_frames > 0 and fps < 0:
1243
+ indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps)
1244
+ frame_paths = self.clue_frame_paths(uid, len(indices))
1245
+
1246
+ elif fps > 0:
1247
+ frame_indices = []
1248
+ for start, end in merged_intervals:
1249
+ start_frame = int(start * vid_fps)
1250
+ end_frame = int(end * vid_fps)
1251
+ step = vid_fps / fps
1252
+ interval_indices = [
1253
+ int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step))
1254
+ ]
1255
+ frame_indices.extend(interval_indices)
1256
+
1257
+ if len(frame_indices) < 32:
1258
+ indices = sample_frames_clue_average(merged_intervals, 32, vid_fps)
1259
+ else:
1260
+ indices = frame_indices
1261
+ frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps)
1262
+
1263
+ else:
1264
+ if num_frames > 0 and fps < 0:
1265
+ step_size = len(vid) / (num_frames + 1)
1266
+ indices = [int(i * step_size) for i in range(1, num_frames + 1)]
1267
+
1268
+ frame_paths = self.frame_paths(uid)
1269
+ elif fps > 0:
1270
+ total_duration = n_frames / vid_fps
1271
+ required_frames = int(total_duration * fps)
1272
+ step_size = vid_fps / fps
1273
+ indices = [int(i * step_size) for i in range(required_frames)]
1274
+ frame_paths = self.frame_paths_fps(uid, len(indices))
1275
+
1276
+ # Save and validate frames
1277
+ valid_paths = []
1278
+ valid_indices = []
1279
+
1280
+ if not np.all([osp.exists(p) for p in frame_paths]):
1281
+ images = [vid[i].asnumpy() for i in indices]
1282
+ for i, (img_array, path) in enumerate(zip(images, frame_paths)):
1283
+ if osp.exists(path):
1284
+ try:
1285
+ with Image.open(path) as img:
1286
+ img.verify()
1287
+ valid_paths.append(path)
1288
+ valid_indices.append(indices[i])
1289
+ except Exception:
1290
+ continue
1291
+ else:
1292
+ try:
1293
+ img = Image.fromarray(img_array)
1294
+ img.save(path)
1295
+ with Image.open(path) as saved: # verify() must run on a freshly opened file; on an in-memory image it is a no-op
+ saved.verify()
1296
+ valid_paths.append(path)
1297
+ valid_indices.append(indices[i])
1298
+ except Exception:
1299
+ continue
1300
+ else:
1301
+ for i, path in enumerate(frame_paths):
1302
+ try:
1303
+ with Image.open(path) as img:
1304
+ img.verify()
1305
+ valid_paths.append(path)
1306
+ valid_indices.append(indices[i])
1307
+ except Exception:
1308
+ continue
1309
+
1310
+ return valid_paths, valid_indices, vid_fps
1311
+
1312
+ def evaluate(self, eval_file, **judge_kwargs):
1313
+
1314
+ assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
1315
+
1316
+ tgt_file = eval_file.replace(".xlsx", "_rating.json")
1317
+ score_file = eval_file.replace(".xlsx", "_score.xlsx")
1318
+
1319
+ data = load(eval_file)
1320
+
1321
+ data_un = data[~pd.isna(data["prediction"])].copy()  # .copy() so the column assignments below do not modify a view of `data`
1322
+ data_pred_na = data[pd.isna(data["prediction"])].copy()
1323
+
1324
+ data_pred_na["score"] = -1
1325
+
1326
+ data_un["score"] = data_un.apply(
1327
+ lambda row: post_process(
1328
+ response=row["prediction"],
1329
+ right_answer=row["answer"],
1330
+ task_mode=row["task_mode"],
1331
+ duration=row["duration"],
1332
+ ),
1333
+ axis=1,
1334
+ )
1335
+
1336
+ data = pd.concat([data_pred_na, data_un])
1337
+
1338
+ rejected_count = (data["score"] == -1).sum()
1339
+
1340
+ print(
1341
+ f"Among {len(data)} questions, "
1342
+ f"failed to obtain prediction for {len(data_pred_na)} questions, "
1343
+ f"failed to obtain the score for {rejected_count - len(data_pred_na)} questions. "
1344
+ f"Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating."
1345
+ )
1346
+
1347
+ dump(data, score_file)
1348
+
1349
+ rating = get_dimention_rating_mcq_grouding(score_file)
1350
+
1351
+ dump(rating, tgt_file)
1352
+
1353
+ return rating
1354
+
1355
+
1356
+ # At evaluation time, for step 2 it is enough to supply [prompt] + image_paths
1357
+ class CGBench_OpenEnded(VideoBaseDataset):
1358
+
1359
+ TYPE = "Video-OpenEnded"
1360
+
1361
+ dataset = "CG-Bench_OpenEnded"
1362
+
1363
+ MD5 = "796035eda0b1e916c517cdc1bc145cfc"
1364
+
1365
+ SYS = (
1366
+ "You will be provided with sampled frames from a video, along with a "
1367
+ "question.\n"
1368
+ "Your task is to analyze the provided frames and infer the most plausible "
1369
+ "answer based on the visual information.\n"
1370
+ "If the visual information is ambiguous or insufficient, use the available "
1371
+ "context to reason your answer.\n"
1372
+ "Only output the answer in the following format:\n\n"
1373
+ '```json\n{"result": "answer"}\n```\n\n'
1374
+ 'The "answer" can be a word, phrase, or sentence that directly responds to '
1375
+ "the question.\n\n"
1376
+ )
1377
+
1378
+ def __init__(
1379
+ self,
1380
+ dataset="CG-Bench_OpenEnded",
1381
+ use_subtitle=False,
1382
+ use_subtitle_time=False,
1383
+ use_frame_time=False,
1384
+ nframe=0,
1385
+ fps=-1,
1386
+ ):
1387
+ super().__init__(dataset=dataset, nframe=nframe, fps=fps)
1388
+ self.use_subtitle = use_subtitle
1389
+ self.use_subtitle_time = use_subtitle_time
1390
+ self.use_frame_time = use_frame_time
1391
+ self.dataset_name = dataset
1392
+ lmu_root = LMUDataRoot()
1393
+ self.clue_frame_root = osp.join(lmu_root, "clue_images", dataset)
1394
+
1395
+ @classmethod
1396
+ def supported_datasets(cls):
1397
+ return ["CG-Bench_OpenEnded"]
1398
+
1399
+ def get_subtitles(self, subtitle_path, frame_indices=None, fps=None, sub_time=False):
1400
+
1401
+ subtitles = []
1402
+
1403
+ srt_path = osp.join(self.data_root, subtitle_path)
1404
+ assert osp.exists(srt_path)
1405
+ import pysubs2
1406
+
1407
+ subs = pysubs2.load(srt_path, encoding="utf-8")
1408
+ if not frame_indices:
1409
+ for sub in subs:
1410
+ sub_text = sub.text.replace("\\N", " ")
1411
+ if sub_time:
1412
+ start_time = milliseconds_to_seconds(sub.start)
1413
+ end_time = milliseconds_to_seconds(sub.end)
1414
+ sub_text = f"[{start_time}, {end_time}] {sub_text}"
1415
+ if sub_text.strip() and sub_text not in subtitles:
1416
+ subtitles.append(sub_text)
1417
+ else:
1418
+ for selected_frame_id in frame_indices:
1419
+ cur_time = pysubs2.make_time(fps=fps, frames=selected_frame_id)
1420
+ for sub in subs:
1421
+ if sub.start < cur_time and sub.end > cur_time:
1422
+ sub_text = sub.text.replace("\\N", " ")
1423
+ if sub_time:
1424
+ start_time = milliseconds_to_seconds(sub.start)
1425
+ end_time = milliseconds_to_seconds(sub.end)
1426
+ sub_text = f"[{start_time}, {end_time}] {sub_text}"
1427
+ if sub_text.strip() and sub_text not in subtitles:
1428
+ subtitles.append(sub_text)
1429
+
1430
+ if subtitles:
1431
+ subtitles_str = '\n'.join(subtitles)
1432
+ return f"The subtitles of the video are as follows:\n\n{subtitles_str}\n\n"
1433
+ else:
1434
+ return ""
1435
+
1436
+ def prepare_dataset(self, dataset_name="CG-Bench_OpenEnded", repo_id="CG-Bench/CG-Bench"):
1437
+
1438
+ def check_integrity(pth):
1439
+ data_file = osp.join(pth, f"{dataset_name}.tsv")
1440
+
1441
+ if not os.path.exists(data_file):
1442
+ return False
1443
+
1444
+ if md5(data_file) != self.MD5:
1445
+ return False
1446
+ data = load(data_file)
1447
+ for video_pth in data["video"]:
1448
+ if not osp.exists(osp.join(pth, video_pth)):
1449
+ return False
1450
+
1451
+ return True
1452
+
1453
+ cache_path = get_cache_path(repo_id)
1454
+
1455
+ if cache_path is not None and check_integrity(cache_path):
1456
+ dataset_path = cache_path
1457
+ else:
1458
+
1459
+ def generate_tsv(pth):
1460
+
1461
+ tsv_file = osp.join(pth, f"{dataset_name}.tsv")
1462
+
1463
+ with open(osp.join(pth, "cgbench.json"), "r") as f:
1464
+ data_file = pd.DataFrame(json.load(f))
1465
+
1466
+ data_file = data_file.assign(index=range(len(data_file)))
1467
+ data_file["video"] = data_file["video_uid"].apply(lambda x: f"cg_videos_720p/{x}.mp4")
1468
+ data_file["subtitle_path"] = data_file["video_uid"].apply(
1469
+ lambda x: f"cg_subtitles/{x}.srt" if osp.exists(osp.join(pth, f"cg_subtitles/{x}.srt")) else ""
1470
+ )
1471
+
1472
+ data_file = data_file[
1473
+ [
1474
+ "index",
1475
+ "video_uid",
1476
+ "video",
1477
+ "duration",
1478
+ "domain",
1479
+ "sub_category",
1480
+ "subtitle_path",
1481
+ "question",
1482
+ "answer",
1483
+ "clue_intervals",
1484
+ "qid",
1485
+ ]
1486
+ ]
1487
+
1488
+ data_file.to_csv(tsv_file, sep="\t", index=False)
1489
+
1490
+ if modelscope_flag_set():
1491
+ from modelscope import dataset_snapshot_download
1492
+ dataset_path = dataset_snapshot_download(dataset_id=repo_id)
1493
+ else:
1494
+ dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset")
1495
+
1496
+ unzip_hf_zip(dataset_path)
1497
+ generate_tsv(dataset_path)
1498
+
1499
+ tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv")
1500
+
1501
+ return dict(data_file=tsv_file, root=dataset_path)
1502
+
1503
+ def build_prompt(self, line, video_llm):
1504
+
1505
+ if isinstance(line, int):
1506
+ assert line < len(self)
1507
+ line = self.data.iloc[line]
1508
+
1509
+ message = []
1510
+
1511
+ sys_prompt = self.SYS
1512
+
1513
+ user_prompt = ""
1514
+
1515
+ video_path = line["video"]
1516
+
1517
+ if video_llm:
1518
+ message.append(dict(type="video", value=osp.join(self.data_root, video_path)))
1519
+ if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
1520
+ if self.nframe:
1521
+ image_paths, frame_indices, vid_fps = self.save_video_frames(
1522
+ video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
1523
+ )
1524
+ user_prompt += self.get_subtitles(line["subtitle_path"], frame_indices=frame_indices,
1525
+ fps=vid_fps, sub_time=self.use_subtitle_time)
1526
+ else:
1527
+ user_prompt += self.get_subtitles(line["subtitle_path"], sub_time=self.use_subtitle_time)
1528
+ else:
1529
+ image_paths, frame_indices, vid_fps = self.save_video_frames(
1530
+ video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps
1531
+ )
1532
+ message.extend(dict(type="image", value=im) for im in image_paths)
1533
+
1534
+ if self.use_frame_time:
1535
+ user_prompt += get_timestampes(frame_indices, vid_fps)
1536
+
1537
+ if self.use_subtitle and line["subtitle_path"] and not pd.isna(line["subtitle_path"]):
1538
+ user_prompt += self.get_subtitles(
1539
+ line["subtitle_path"], frame_indices=frame_indices, fps=vid_fps,
1540
+ sub_time=self.use_subtitle_time
1541
+ )
1542
+
1543
+ question = line["question"]
1544
+ user_prompt += f"Question: {question}\n\n"
1545
+
1546
+ message.append(dict(type="text", value=sys_prompt + user_prompt))
1547
+
1548
+ return message
1549
+
1550
+ def clue_frame_paths(self, qid, num_frames=8):
1551
+ frame_root = osp.join(self.clue_frame_root, qid)
1552
+ os.makedirs(frame_root, exist_ok=True)
1553
+ return [osp.join(frame_root, self.frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
1554
+
1555
+ def save_video_frames(self, video, uid, clue_intervals=None, num_frames=8, fps=-1):
1556
+
1557
+ if type(uid) is not str:
1558
+ uid = str(uid)
1559
+
1560
+ vid_path = osp.join(self.data_root, video)
1561
+ vid = decord.VideoReader(vid_path)
1562
+ vid_fps = vid.get_avg_fps()
1563
+ n_frames = len(vid)
1564
+
1565
+ if clue_intervals is not None:
1566
+ merged_intervals = merge_intervals(clue_intervals)
1567
+
1568
+ if num_frames > 0 and fps < 0:
1569
+ indices = sample_frames_clue_average(merged_intervals, num_frames, vid_fps)
1570
+ frame_paths = self.clue_frame_paths(uid, len(indices))
1571
+
1572
+ elif fps > 0:
1573
+ frame_indices = []
1574
+ for start, end in merged_intervals:
1575
+ start_frame = int(start * vid_fps)
1576
+ end_frame = int(end * vid_fps)
1577
+ step = vid_fps / fps
1578
+ interval_indices = [
1579
+ int(start_frame + i * step) for i in range(int((end_frame - start_frame) / step))
1580
+ ]
1581
+ frame_indices.extend(interval_indices)
1582
+
1583
+ if len(frame_indices) < 32:
1584
+ indices = sample_frames_clue_average(merged_intervals, 32, vid_fps)
1585
+ else:
1586
+ indices = frame_indices
1587
+ frame_paths = self.clue_frame_paths_fps(uid, len(indices), fps)
1588
+
1589
+ else:
1590
+ if num_frames > 0 and fps < 0:
1591
+ step_size = len(vid) / (num_frames + 1)
1592
+ indices = [int(i * step_size) for i in range(1, num_frames + 1)]
1593
+ frame_paths = self.frame_paths(uid)
1594
+ elif fps > 0:
1595
+ total_duration = n_frames / vid_fps
1596
+ required_frames = int(total_duration * fps)
1597
+ step_size = vid_fps / fps
1598
+ indices = [int(i * step_size) for i in range(required_frames)]
1599
+ frame_paths = self.frame_paths_fps(uid, len(indices))
1600
+
1601
+ valid_paths = []
1602
+ valid_indices = []
1603
+
1604
+ if not np.all([osp.exists(p) for p in frame_paths]):
1605
+ images = [vid[i].asnumpy() for i in indices]
1606
+ for i, (img_array, path) in enumerate(zip(images, frame_paths)):
1607
+ if osp.exists(path):
1608
+ try:
1609
+ with Image.open(path) as img:
1610
+ img.verify()
1611
+ valid_paths.append(path)
1612
+ valid_indices.append(indices[i])
1613
+ except Exception:
1614
+ continue
1615
+ else:
1616
+ try:
1617
+ img = Image.fromarray(img_array)
1618
+ img.save(path)
1619
+ with Image.open(path) as saved: # verify() must run on a freshly opened file; on an in-memory image it is a no-op
+ saved.verify()
1620
+ valid_paths.append(path)
1621
+ valid_indices.append(indices[i])
1622
+ except Exception:
1623
+ continue
1624
+ else:
1625
+ for i, path in enumerate(frame_paths):
1626
+ try:
1627
+ with Image.open(path) as img:
1628
+ img.verify()
1629
+ valid_paths.append(path)
1630
+ valid_indices.append(indices[i])
1631
+ except Exception:
1632
+ continue
1633
+
1634
+ return valid_paths, valid_indices, vid_fps
1635
+
1636
+ def evaluate(self, eval_file, **judge_kwargs):
1637
+
1638
+ from .utils.cgbench import get_dimention_rating_open_ended, post_process_open
1639
+
1640
+ assert eval_file.endswith(".xlsx"), "data file should be an xlsx file"
1641
+
1642
+ tgt_file = eval_file.replace(".xlsx", "_rating.json")
1643
+ score_file = eval_file.replace(".xlsx", "_score.xlsx")
1644
+ step_1_tmp_file = eval_file.replace(".xlsx", "_step_1.pkl")
1645
+ step_2_tmp_file = eval_file.replace(".xlsx", "_step_2.pkl")
1646
+
1647
+ data = load(eval_file)
1648
+
1649
+ data_pred_no_na = data[~pd.isna(data["prediction"])].copy()  # .copy() so the column assignments below do not modify a view of `data`
1650
+ data_pred_na = data[pd.isna(data["prediction"])].copy()
1651
+
1652
+ data_pred_na["model_result"] = -1
1653
+ data_pred_na["step_1_result"] = -1
1654
+ data_pred_na["step_2_result"] = -1
1655
+ data_pred_na["score"] = -1
1656
+
1657
+ data_pred_no_na["model_result"] = data_pred_no_na.apply(
1658
+ lambda row: post_process_open(
1659
+ response=row["prediction"],
1660
+ ),
1661
+ axis=1,
1662
+ )
1663
+
1664
+ if judge_kwargs.get("model", None) != "gpt-4o-0806":
1665
+ judge_kwargs["model"] = "gpt-4o-0806"
1666
+ print("The judge model in cg-bench is gpt-4o-0806!")
1667
+
1668
+ data_no_model_result = data_pred_no_na[data_pred_no_na["model_result"] == -1]
1669
+ data_step_1 = data_pred_no_na[data_pred_no_na["model_result"] != -1]
1670
+
1671
+ model_step_1 = build_judge(system_prompt=sys_prompt_open_eval_step_1, **judge_kwargs)
1672
+ nproc = judge_kwargs.pop('nproc', 32)
1673
+
1674
+ lines_step_1 = data_step_1.to_dict("records")
1675
+ tups_step_1 = [(model_step_1, line) for line in lines_step_1]
1676
+
1677
+ keys_step_1 = [line["qid"] for line in lines_step_1]  # a list (not a set) keeps the qids aligned with tups_step_1 for the zip below
1678
+
1679
+ ans = {}
1680
+ if osp.exists(step_1_tmp_file):
1681
+ ans = load(step_1_tmp_file)
1682
+ tups_step_1 = [x for x, i in zip(tups_step_1, keys_step_1) if i not in ans]
1683
+ keys_step_1 = [i for i in keys_step_1 if i not in ans]
1684
+
1685
+ _ = track_progress_rich(
1686
+ eval_open_first,
1687
+ tups_step_1,
1688
+ nproc=nproc,
1689
+ keys=keys_step_1,
1690
+ save=step_1_tmp_file,
1691
+ )
1692
+
1693
+ step_1_results = load(step_1_tmp_file)
1694
+ data_step_1 = save_step_1_steps(data_step_1, step_1_results)  # step_1_result: -1 = parse failure, 0/1 = scored directly, 2 = needs step-2 judging
1695
+
1696
+ data_no_step_1_results = data_step_1[data_step_1["step_1_result"] == -1]
1697
+ data_step_1_over = data_step_1[data_step_1["step_1_result"].isin([0, 1])]
1698
+ data_step_2 = data_step_1[data_step_1["step_1_result"] == 2]
1699
+
1700
+ model_step_2 = build_judge(system_prompt=sys_prompt_open_eval_step_2, **judge_kwargs)
1701
+
1702
+ lines_step_2 = data_step_2.to_dict("records")
1703
+
1704
+ tups_step_2 = []
1705
+
1706
+ for line in tqdm(lines_step_2):
1707
+ clue_intervals = eval(line["clue_intervals"])
1708
+ lmu_root = LMUDataRoot()
1709
+ clue_frame_root = osp.join(lmu_root, "clue_images", self.dataset)
1710
+ data_root = self.data_root
1711
+ frame_paths, _, _ = save_clue_video_frames(
1712
+ data_root,
1713
+ clue_frame_root,
1714
+ video=line["video"],
1715
+ uid=line["qid"],
1716
+ clue_intervals=clue_intervals,
1717
+ num_frames=32,
1718
+ )
1719
+ tups_step_2.append((model_step_2, line, frame_paths))
1720
+
1721
+ keys_step_2 = [line["qid"] for line in lines_step_2]  # a list (not a set) keeps the qids aligned with tups_step_2 for the zip below
1722
+
1723
+ ans = {}
1724
+ if osp.exists(step_2_tmp_file):
1725
+ ans = load(step_2_tmp_file)
1726
+ tups_step_2 = [x for x, i in zip(tups_step_2, keys_step_2) if i not in ans]
1727
+ keys_step_2 = [i for i in keys_step_2 if i not in ans]
1728
+
1729
+ _ = track_progress_rich(
1730
+ eval_open_second,
1731
+ tups_step_2,
1732
+ nproc=nproc,
1733
+ keys=keys_step_2,
1734
+ save=step_2_tmp_file,
1735
+ )
1736
+
1737
+ step_2_results = load(step_2_tmp_file)
1738
+ data_step_2 = save_step_2_steps(data_step_2, step_2_results)
1739
+
1740
+ data_no_step_2_results = data_step_2[data_step_2["score"] == -1]
1741
+ data_step_2_over = data_step_2[data_step_2["score"].isin([0, 1])]
1742
+
1743
+ data = pd.concat(
1744
+ [
1745
+ data_pred_na,
1746
+ data_no_model_result,
1747
+ data_no_step_1_results,
1748
+ data_step_1_over,
1749
+ data_no_step_2_results,
1750
+ data_step_2_over,
1751
+ ]
1752
+ )
1753
+
1754
+ dump(data, score_file)
1755
+
1756
+ rating = get_dimention_rating_open_ended(score_file)
1757
+
1758
+ dump(rating, tgt_file)
1759
+
1760
+ return rating
r1-a/response_generation/minicpm/MiniCPM-o/eval_mm/vlmevalkit/vlmeval/dataset/cmmmu.py ADDED
@@ -0,0 +1,354 @@
1
+ from .image_base import ImageBaseDataset
2
+ import random
3
+ from collections import Counter
4
+ import os
5
+ import re
6
+ import tempfile
7
+ from ..smp import *
8
+
9
+
10
+ def get_multi_choice_prediction(response, all_choices, index2ans):
11
+ for char in [',', '.', '!', '?', ';', ':', "'"]:
12
+ response = response.strip(char)
13
+ response = " " + response + " " # add space to avoid partial match
14
+
15
+ candidates = []
16
+
17
+ for choice in all_choices: # (A) (B) (C) (D)
18
+ # Add the choice to candidates each time it appears in the response
19
+ candidates.extend([choice for _ in range(response.count(f'({choice})'))])
20
+
21
+ if len(candidates) == 0:
22
+ for choice in all_choices: # A B C D
23
+ # Similarly, add the choice for each occurrence
24
+ candidates.extend([choice for _ in range(response.count(f'{choice}'))])
25
+
26
+ if len(candidates) == 0 and len(response.split()) >= 1:
27
+ for index, ans in index2ans.items():
28
+ # Add index for each occurrence of ans in response
29
+ candidates.extend([index for _ in range(response.count(ans))])
30
+
31
+ # if all of the above yields no candidates, fall back to substring matching: accept any option whose answer text appears anywhere in the response
32
+ if len(candidates) == 0 and len(response.split()) >= 1:
33
+ for index, ans in index2ans.items():
34
+ if ans in response:
35
+ candidates.append(index)
36
+ # index_ans = False # it's content ans.
37
+
38
+ if len(candidates) == 0: # still no answer found, randomly choose one.
39
+ return random.choice(all_choices)
40
+ # return ''
41
+ else:
42
+ # Count the occurrence of each candidate
43
+ candidate_counts = Counter(candidates)
44
+
45
+ # Select the most frequent candidates
46
+ max_count = max(candidate_counts.values())
47
+ most_frequent_candidates = [c for c in all_choices if candidate_counts.get(c, 0) == max_count]
48
+
49
+ # Combine the most frequent candidates in ABCD order
50
+ return ''.join(most_frequent_candidates)
51
+
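+ # Illustrative (hypothetical response): for response "我认为答案是(B)", all_choices
+ # ['A', 'B', 'C', 'D'] and any index2ans, the "(B)" occurrence makes candidates
+ # ['B'] and the function returns 'B'; when nothing matches at all, a random
+ # choice is returned so every question still receives an answer.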
52
+
53
+ def extract_numbers(string):
54
+ # Pattern for numbers with Chinese commas
55
+ pattern_commas = r'-?\d{1,3}(?:,\d{3})+'
56
+ # Pattern for scientific notation
57
+ pattern_scientific = r'-?\d+(?:\.\d+)?[eE][+-]?\d+'
58
+ # Pattern for simple numbers without Chinese commas
59
+ pattern_simple = r'-?(?:\d+\.\d+|\.\d+|\d+)(?![eE][+-]?\d+)(?!,\d)'
60
+
61
+ # Extract numbers with Chinese commas
62
+ numbers_with_commas = re.findall(pattern_commas, string)
63
+ # Extract numbers in scientific notation
64
+ numbers_scientific = re.findall(pattern_scientific, string)
65
+ # Extract simple numbers without Chinese commas
66
+ numbers_simple = re.findall(pattern_simple, string)
67
+
68
+ # Combine all extracted numbers
69
+ all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
70
+ return all_numbers
71
+
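+ # Illustrative: extract_numbers("答案是42和-3.14") returns ['42', '-3.14'];
+ # the first two patterns additionally catch comma-grouped ("1,234") and
+ # scientific ("1.2e3") forms.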
72
+
73
+ def check_is_number(string):
74
+ try:
75
+ float(string.replace(',', ''))
76
+ return True
77
+ except ValueError:
78
+ # check if there's comma inside
79
+ return False
80
+
81
+
82
+ def count_letters(string):
83
+ return sum(('a' <= c <= 'z') or ('A' <= c <= 'Z') for c in string)  # count ASCII letters; parentheses make the intended precedence explicit
84
+
85
+
86
+ def normalize_str(string, answer):
87
+ # normalize a candidate answer string before comparing it with the gold answer
88
+
89
+ # if it is a number, parse it numerically.
90
+ if string is None:
91
+ return [string]
92
+ string = string.strip()
93
+
94
+ is_number = check_is_number(string)
95
+
96
+ if is_number:
97
+ string = string.replace(',', '')
98
+ string = float(string)
99
+ # leave 2 decimal
100
+ string = round(string, 2)
101
+ return [string]
102
+ else: # it's likely to be a string
103
+ if len(string) > len(answer) + 20 or count_letters(string) > count_letters(answer) + 2:
104
+ return []
105
+ return [string]
106
+
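+ # e.g. normalize_str("1,234.5", "1234.5") -> [1234.5] (parsed and rounded to two
+ # decimals), while a string far longer than the reference answer is dropped as []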
107
+
108
+ def get_fill_blank_prediction(response, answer):
109
+ """get the prediction from the generated response,
110
+ return a list of predicted strings or numbers"""
111
+
112
+ def get_key_subresponses(response):
113
+ response = response.strip("。").strip()
114
+ sub_responses = re.split(r'。|\n', response)
115
+ indicators_of_keys = ['是', '为', '所以', '等于', '方案', '选择',
116
+ '正确答案', '因此', '最后', '答案', '结果']
117
+ key_responses = []
118
+ for index, resp in enumerate(sub_responses):
119
+ # if this is the last sub-response, also accept '=' as an indicator (the entire response may be a single equation)
120
+ if index == len(sub_responses) - 1:
121
+ indicators_of_keys.extend(['='])
122
+ shortest_key_response = None
123
+ # the shortest response that may contain the answer (tail part of the response)
124
+ for indicator in indicators_of_keys:
125
+ if indicator in resp:
126
+ if not shortest_key_response:
127
+ shortest_key_response = resp.split(indicator)[-1].strip()
128
+ else:
129
+ if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
130
+ shortest_key_response = resp.split(indicator)[-1].strip()
131
+
132
+ if shortest_key_response:
133
+ # and it's not trivial
134
+ if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
135
+ key_responses.append(shortest_key_response)
136
+ if len(key_responses) == 0: # did not found any
137
+ return [response]
138
+ return key_responses
139
+
140
+ key_responses = get_key_subresponses(response)
141
+
142
+ pred_list = key_responses.copy() # keep the original string response
143
+ for resp in key_responses:
144
+ pred_list.extend(extract_numbers(resp))
145
+
146
+ tmp_pred_list = []
147
+ for i in range(len(pred_list)):
148
+ tmp_pred_list.extend(normalize_str(pred_list[i], answer))
149
+ pred_list = tmp_pred_list
150
+
151
+ # remove duplicates
152
+ pred_list = list(set(pred_list))
153
+
154
+ return pred_list
155
+
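+ # Illustrative: for the response "所以答案是42", the indicator split isolates the
+ # tail "42", extract_numbers re-adds the numeric form, and normalize_str parses
+ # it, so the deduplicated prediction list comes back as [42.0]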
156
+
157
+ def get_TF_prediction(response):
158
+ """get the prediction from the generated response,
159
+ return a list of predicted strings or numbers"""
160
+
161
+ def get_key_subresponses(response):
162
+ response = response.strip("。").strip()
163
+ sub_responses = re.split(r'。|\n', response)
164
+ indicators_of_keys = ['是', '为', '所以', '判断',
165
+ '陈述', '说法', '表达', '答案', '结果']
166
+ key_responses = []
167
+ for index, resp in enumerate(sub_responses):
168
+ shortest_key_response = None
169
+ # the shortest response that may contain the answer (tail part of the response)
170
+ for indicator in indicators_of_keys:
171
+ if indicator in resp:
172
+ if not shortest_key_response:
173
+ shortest_key_response = resp.split(indicator)[-1].strip()
174
+ else:
175
+ if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
176
+ shortest_key_response = resp.split(indicator)[-1].strip()
177
+
178
+ if shortest_key_response:
179
+ # and it's not trivial
180
+ if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
181
+ key_responses.append(shortest_key_response)
182
+ if len(key_responses) == 0: # did not find any
183
+ return [response]
184
+ return key_responses
185
+
186
+ key_responses = get_key_subresponses(response)
187
+
188
+ pred_list = key_responses.copy() # keep the original string response
189
+ # remove duplicates
190
+ pred_list = list(set(pred_list))
191
+
192
+ return pred_list
193
+
194
+
195
+ class CMMMU(ImageBaseDataset):
196
+ TYPE = 'VQA'
197
+
198
+ DATASET_URL = {
199
+ 'CMMMU_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/CMMMU_VAL.tsv'
200
+ }
201
+
202
+ DATASET_MD5 = {
203
+ 'CMMMU_VAL': 'b4727e2fce2415bf646379e60c11a726'
204
+ }
205
+
206
+ def dump_image(self, line):
207
+ os.makedirs(self.img_root, exist_ok=True)
208
+
209
+ tgt_path_z = []
210
+ if isinstance(line['image'], list):
211
+ for i in range(len(line['image'])):
212
+ tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg")
213
+ if not read_ok(tgt_path):
214
+ decode_base64_to_image_file(line['image'][i], tgt_path)
215
+ tgt_path_z.append(tgt_path)
216
+ else:
217
+ tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
218
+ if not read_ok(tgt_path):
219
+ decode_base64_to_image_file(line['image'], tgt_path)
220
+ tgt_path_z.append(tgt_path)
221
+ return tgt_path_z
222
+
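+ # Naming convention implied above: multi-image rows are decoded to
+ # "<index>--1.jpg", "<index>--2.jpg", ..., single-image rows to "<index>.jpg"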
223
+ @classmethod
224
+ def evaluate(self, eval_file, **judge_kwargs):
225
+
226
+ suffix = eval_file.split('.')[-1]
227
+ result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
228
+
229
+ if not osp.exists(result_file):
230
+ data = load(eval_file)
231
+ assert 'answer' in data and 'prediction' in data
232
+ data['prediction'] = [str(x) for x in data['prediction']]
233
+ data['answer'] = [str(x) for x in data['answer']]
234
+
235
+ correct_count = 0
236
+ correct_category = {
237
+ '技术与工程': [0, 0],
238
+ '科学': [0, 0],
239
+ '健康与医学': [0, 0],
240
+ '商业': [0, 0],
241
+ '艺术与设计': [0, 0],
242
+ '人文社会科学': [0, 0],
243
+ }
244
+
245
+ for i in tqdm(data.iterrows()):
246
+ line = i[1]
247
+ correct_category[line['category']][0] += 1
248
+
249
+ # Options
250
+ if line['type'] == '选择':
251
+ index2ans = {
252
+ 'A': line['option1'],
253
+ 'B': line['option2'],
254
+ 'C': line['option3'],
255
+ 'D': line['option4']
256
+ }
257
+ fact_option = get_multi_choice_prediction(line['prediction'], ['A', 'B', 'C', 'D'], index2ans)
258
+ if fact_option == line['answer']:
259
+ correct_count += 1
260
+ correct_category[line['category']][1] += 1
261
+
262
+ # Binary
263
+ elif line['type'] == '判断':
264
+ positive_keywords = ['正确', '对', '准确', '肯定', '对的']
265
+ negative_keywords = ['不对', '错误', '不正确', '不准确', '不合适', '否定', '错的', '错']
266
+ ambiguous_keywords = ['对错', '是否正确', '否正确', '或者', '是否', '正确性', '对不']
267
+
268
+ def judge_similarity(pred_list, positive_keywords, negative_keywords):
269
+ positive_count = 0
270
+ negative_count = 0
271
+
272
+ for pred in pred_list:
273
+ if any(pos_word in pred for pos_word in positive_keywords):
274
+ positive_count += 1
275
+ elif any(neg_word in pred for neg_word in negative_keywords):
276
+ negative_count += 1
277
+
278
+ if positive_count > negative_count:
279
+ return "对"
280
+ elif negative_count > positive_count:
281
+ return "错"
282
+ else:
283
+ return random.choice(['对', '错'])
284
+
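+ # e.g. a prediction list like ['这个说法是正确的'] hits one positive keyword and
+ # yields "对"; ties between positive and negative hits fall back to a random
+ # guess so a verdict is always produced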
285
+ answer = get_TF_prediction(line['prediction'])
286
+ answer = [word for word in answer if not any(ambiguous in word for ambiguous in ambiguous_keywords)]
287
+ fact_answer = judge_similarity(answer, positive_keywords, negative_keywords)
288
+ if fact_answer == line['answer']:
289
+ correct_count += 1
290
+ correct_category[line['category']][1] += 1
291
+
292
+ # Fill the Blank
293
+ else:
294
+ norm_answers = normalize_str(line['answer'], line['answer'])
295
+ predicted_answer = get_fill_blank_prediction(line['prediction'], line['answer'])
296
+
297
+ for pred in predicted_answer:
298
+ # already normalized
299
+ if isinstance(pred, str): # if it's a string, then find if ans in the pred_i
300
+ for norm_ans in norm_answers:
301
+ # only see if the string answer in the string pred
302
+ # print(norm_ans, pred)
303
+ if isinstance(norm_ans, str) and norm_ans in pred:
304
+ correct_count += 1
305
+ correct_category[line['category']][1] += 1
306
+ else: # it's a number
307
+ if pred in norm_answers:
308
+ correct_count += 1
309
+ correct_category[line['category']][1] += 1
310
+
311
+ accuracyz = {}
312
+ accuracyz['总准确率'] = correct_count / len(data)
313
+ for i in correct_category.keys():
314
+ accuracyz[i] = correct_category[i][1] / correct_category[i][0]
315
+
316
+ accuracyz = d2df(accuracyz)
317
+ accuracyz = accuracyz.round(10)  # round() returns a new DataFrame, so the result must be reassigned
318
+ dump(accuracyz, result_file)
319
+
320
+ result = pd.read_csv(result_file)
321
+ return result
322
+
323
+ def build_prompt(self, line):
324
+ if line['type'] == '选择':
325
+ tgt_path = self.dump_image(line)
326
+ question = line['question']
327
+ options_prompt = 'Options:\n'
328
+
329
+ for i in [['A', '1'], ['B', '2'], ['C', '3'], ['D', '4']]:
330
+ options_prompt += i[0] + '. ' + line['option' + i[1]] + '\n'
331
+
332
+ prompt = (f'问题: {question}\n' + options_prompt
333
+ + '请回答上述多项选择题,并选出正确选项。这些题目可能包括单选和多选题型。如果所提供的信息不足以确定一个明确的答案,那么请根据可用的数据和你的判断来选择最可能正确的选项。')
334
+
335
+ msgs = []
336
+ if isinstance(tgt_path, list):
337
+ msgs.extend([dict(type='image', value=p) for p in tgt_path])
338
+ else:
339
+ msgs = [dict(type='image', value=tgt_path)]
340
+ msgs.append(dict(type='text', value=prompt))
341
+
342
+ return msgs
343
+
344
+ elif line['type'] == '判断':
345
+ msgs = super().build_prompt(line)
346
+ assert msgs[-1]['type'] == 'text'
347
+ msgs[-1]['value'] += '\n请回答上述判断题,并根据题目描述和所给的信息来判断问题中陈述的对错。如果信息不完整或不足以作出绝对判断,请运用你的逻辑推理和现有信息来做出最可能的判断。'
348
+ return msgs
349
+
350
+ else:
351
+ msgs = super().build_prompt(line)
352
+ assert msgs[-1]['type'] == 'text'
353
+ msgs[-1]['value'] += '\n请回答上述填空题,并根据题目的要求和所提供的信息来给出最恰当的答案。如果信息不足以确切回答,那么请依据现有的数据和你的推理能力来填写最合理的答案。'
354
+ return msgs
r1-a/response_generation/qwenomni.py ADDED
@@ -0,0 +1,451 @@
1
+ import os
2
+ import json
3
+ import base64
4
+ import uuid # For generating unique filenames
5
+ import time
6
+ import re # For parsing history
7
+ from io import BytesIO
8
+ import random
9
+ import concurrent.futures # <-- For ThreadPoolExecutor
10
+ from tqdm import tqdm # <-- For progress bar
11
+ import threading # <-- For potential thread-local data or locks if needed later
12
+ import traceback # <-- For detailed error printing
13
+
14
+ import numpy as np
15
+ import soundfile as sf
16
+ from openai import OpenAI
17
+ from datasets import load_from_disk, Dataset, Features, Value # Ensure Features is imported
18
+ from dotenv import load_dotenv
19
+
20
+ # --- Configuration ---
21
+ load_dotenv()
22
+
23
+ # 1. API Client Setup & Model Rotation Setup
24
+ QWEN_MODEL_LIST = [
25
+ "qwen-omni-turbo",
26
+ "qwen-omni-turbo-latest",
27
+ "qwen-omni-turbo-2025-03-26",
28
+ "qwen-omni-turbo-2025-01-19",
29
+ ]
30
+ NUM_MODELS = len(QWEN_MODEL_LIST)
31
+ print(f"Using Qwen models in rotation: {QWEN_MODEL_LIST}")
32
+
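+ # Assumed rotation convention (a sketch; the actual task dispatch lives further
+ # below in this file): each task picks its model round-robin by index, e.g.
+ # model_name = QWEN_MODEL_LIST[task_idx % NUM_MODELS]  # task_idx is illustrative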
33
+ client = OpenAI(
34
+ api_key=os.getenv("DASHSCOPE_API_KEY"),  # read the key from the environment (.env is loaded above); avoid committing a real credential
36
+ base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
37
+ )
38
+
39
+ # 2. Dataset Paths
40
+ INPUT_DATASET_DIR = "/home/chenyifu/audio-r1/r1-a/dataset/preference_tasks_fully_merged_with_audio/train/final_dataset"
41
+ OUTPUT_DATASET_DIR = "/root/autodl-tmp/audio-r1/r1-a/dataset/preference_tasks_with_qwen_rotated" # <-- Adjusted name
42
+
43
+ # 3. Output Audio Configuration
44
+ OUTPUT_AUDIO_ROOT_DIR = "/root/autodl-tmp/audio-r1/r1-a/generated_audio/qwen_omni_rotated" # <-- Adjusted name
45
+ OUTPUT_AUDIO_FORMAT = "wav"
46
+ AVAILABLE_QWEN_VOICES = ["Cherry", "Serena", "Ethan", "Chelsie"]
47
+ OUTPUT_AUDIO_SAMPLERATE = 24000
48
+
49
+ # 4. API Call Settings
50
+ API_RETRY_DELAY = 5
51
+ API_MAX_RETRIES = 3
52
+ MAX_WORKERS = 10 # <-- Set desired number of threads (Be mindful of rate limits!)
53
+
54
+ # 5. Checkpoint Saving Configuration
55
+ CHECKPOINT_INTERVAL = 50 # Save every 50 completed tasks
56
+
57
+ # --- Helper Functions ---
58
+
+ def encode_audio_base64(audio_path):
+     if not audio_path or not os.path.exists(audio_path):
+         print(f"Warning: Input audio file not found or path is empty: {audio_path}")
+         return None
+     try:
+         with open(audio_path, "rb") as audio_file:
+             return base64.b64encode(audio_file.read()).decode("utf-8")
+     except Exception as e:
+         print(f"Error encoding audio file {audio_path}: {e}")
+         return None
+
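+ # Usage sketch (hypothetical paths): a readable file yields its bytes as a base64
+ # string; a missing or empty path prints a warning and returns None.
+ # >>> encode_audio_base64("/tmp/example.wav")
+ # 'UklGRiQAAABXQVZF...'
+ # >>> encode_audio_base64("/tmp/missing.wav")
+ # (warning printed, returns None)
+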
+ def parse_ultra_history(history_str):
+     messages = []
+     pattern = re.compile(r"\[(USER|ASSISTANT)\]\s*([\s\S]*?)(?=\s*\[(?:USER|ASSISTANT)\]|$)")
+     matches = pattern.findall(history_str)
+     if not matches and history_str and history_str.strip():
+         if history_str.lower().startswith("user:") or history_str.lower().startswith("[user]"):
+             role = "user"
+             content = re.sub(r"^(user:|\[user\])\s*", "", history_str, flags=re.IGNORECASE).strip()
+             if content: messages.append({"role": role, "content": content})
+         elif history_str.lower().startswith("assistant:") or history_str.lower().startswith("[assistant]"):
+             role = "assistant"
+             content = re.sub(r"^(assistant:|\[assistant\])\s*", "", history_str, flags=re.IGNORECASE).strip()
+             if content: messages.append({"role": role, "content": content})
+         else:
+             return []
+     else:
+         for role_tag, content in matches:
+             role = role_tag.lower()
+             cleaned_content = content.strip()
+             if cleaned_content:
+                 messages.append({"role": role, "content": cleaned_content})
+     return messages
+
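+ # Usage sketch (hypothetical history string): the regex splits on [USER]/[ASSISTANT]
+ # tags, lowercases the roles, and strips whitespace from each turn, e.g.
+ # >>> parse_ultra_history("[USER] What is 2+2? [ASSISTANT] 4.")
+ # [{'role': 'user', 'content': 'What is 2+2?'}, {'role': 'assistant', 'content': '4.'}]
+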
+ # --- API Call Worker Function (Takes model_name) ---
+ def call_qwen_omni_api_worker(task_info):
+     """
+     Worker function to call the Qwen API for a single task using a specific model.
+     Returns results, including the model used.
+     """
+     row_idx = task_info["row_idx"]
+     slot_idx = task_info["slot_idx"]
+     model_to_use = task_info["model_name"]
+     history_messages = task_info["history_messages"]
+     prompt_text = task_info["prompt_text"]
+     question_audio_path = task_info["question_audio_path"]
+     output_audio_filepath = task_info["output_audio_filepath"]
+
+     retries = 0
+     selected_voice = random.choice(AVAILABLE_QWEN_VOICES)
+
+     while retries < API_MAX_RETRIES:
+         try:
+             base64_audio_data = encode_audio_base64(question_audio_path)
+             if not base64_audio_data:
+                 print(f"Error (Row {row_idx}, Slot {slot_idx}): Skipping API call due to missing input audio: {question_audio_path}")
+                 # Return the model name even on error for potential logging
+                 return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": "[ERROR: Missing input audio]", "saved_audio_path": None, "model_used": model_to_use}
+
+             input_audio_format = os.path.splitext(question_audio_path)[1].lstrip('.') or 'wav'
+
+             user_content = []
+             user_content.append({
+                 "type": "input_audio",
+                 "input_audio": {
+                     "data": f"data:audio/{input_audio_format};base64,{base64_audio_data}",
+                     "format": input_audio_format,
+                 },
+             })
+             user_content.append({"type": "text", "text": prompt_text})
+             messages = history_messages + [{"role": "user", "content": user_content}]
+
+             completion = client.chat.completions.create(
+                 model=model_to_use,
+                 messages=messages,
+                 modalities=["text", "audio"],
+                 audio={"voice": selected_voice, "format": OUTPUT_AUDIO_FORMAT},
+                 stream=True,
+                 stream_options={"include_usage": True},
+             )
+
+             collected_text = ""
+             audio_base64_string = ""
+             usage_info = None
+
+             for chunk in completion:
+                 if chunk.choices and len(chunk.choices) > 0:
+                     delta = chunk.choices[0].delta
+                     if hasattr(delta, 'content') and delta.content:
+                         collected_text += delta.content
+                     if hasattr(delta, "audio") and delta.audio:
+                         if "data" in delta.audio and delta.audio["data"]:
+                             audio_base64_string += delta.audio["data"]
+                         if "transcript" in delta.audio and delta.audio["transcript"]:
+                             collected_text += delta.audio["transcript"]
+                 elif hasattr(chunk, "usage") and chunk.usage:
+                     usage_info = chunk.usage
+
+             if audio_base64_string:
+                 try:
+                     wav_bytes = base64.b64decode(audio_base64_string)
+                     if len(wav_bytes) == 0:
+                         print(f"Warning (Row {row_idx}, Slot {slot_idx}, Model: {model_to_use}): Decoded audio bytes are empty.")
+                         return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text.strip(), "saved_audio_path": None, "model_used": model_to_use}
+                     if len(wav_bytes) % 2 != 0:
+                         wav_bytes = wav_bytes[:-1]  # Truncate to an even byte count for int16
+                         if len(wav_bytes) == 0:
+                             print(f"Warning (Row {row_idx}, Slot {slot_idx}, Model: {model_to_use}): Audio bytes became empty after truncation.")
+                             return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text.strip(), "saved_audio_path": None, "model_used": model_to_use}
+
+                     audio_np = np.frombuffer(wav_bytes, dtype=np.int16)
+                     os.makedirs(os.path.dirname(output_audio_filepath), exist_ok=True)
+                     sf.write(output_audio_filepath, audio_np, OUTPUT_AUDIO_SAMPLERATE, format=OUTPUT_AUDIO_FORMAT.upper())
+                     return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text.strip(), "saved_audio_path": output_audio_filepath, "model_used": model_to_use}
+
+                 except base64.binascii.Error as b64_e:
+                     print(f"Error (Row {row_idx}, Slot {slot_idx}, Model: {model_to_use}): Decoding base64 failed: {b64_e}")
+                     return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text.strip(), "saved_audio_path": None, "model_used": model_to_use}
+                 except ValueError as val_e:
+                     print(f"Error (Row {row_idx}, Slot {slot_idx}, Model: {model_to_use}): Interpreting buffer as int16 failed: {val_e} (Bytes: {len(wav_bytes)})")
+                     return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text.strip(), "saved_audio_path": None, "model_used": model_to_use}
+                 except Exception as e:
+                     print(f"Error (Row {row_idx}, Slot {slot_idx}, Model: {model_to_use}): Processing/saving audio bytes failed: {e}")
+                     traceback.print_exc()
+                     return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text.strip(), "saved_audio_path": None, "model_used": model_to_use}
+             else:
+                 print(f"Warning (Row {row_idx}, Slot {slot_idx}, Model: {model_to_use}): No audio data received in the stream.")
+                 return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": collected_text.strip(), "saved_audio_path": None, "model_used": model_to_use}
+
+         except Exception as e:
+             retries += 1
+             print(f"Error (Row {row_idx}, Slot {slot_idx}, Model: {model_to_use}): API Call Attempt {retries}/{API_MAX_RETRIES} failed: {e}")
+             if "rate limit" in str(e).lower() or "too many requests" in str(e).lower():
+                 print("Rate limit likely hit. Consider reducing MAX_WORKERS or increasing delays.")
+                 time.sleep(API_RETRY_DELAY * 2)
+             elif retries < API_MAX_RETRIES:
+                 time.sleep(API_RETRY_DELAY)
+             else:
+                 print(f"Error (Row {row_idx}, Slot {slot_idx}, Model: {model_to_use}): Max retries reached. Giving up.")
+                 return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": f"[API ERROR: Max retries on {model_to_use}]", "saved_audio_path": None, "model_used": model_to_use}
+
+     return {"row_idx": row_idx, "slot_idx": slot_idx, "response_text": f"[UNEXPECTED ERROR on {model_to_use}]", "saved_audio_path": None, "model_used": model_to_use}
+
+
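+ # Sketch of the audio handling above (assumption: the compatible-mode stream returns
+ # base64-encoded raw 16-bit PCM at 24 kHz rather than a WAV container), e.g.:
+ #     pcm = np.frombuffer(base64.b64decode(audio_base64_string), dtype=np.int16)
+ #     sf.write("out.wav", pcm, OUTPUT_AUDIO_SAMPLERATE, format="WAV")
+ # soundfile adds the WAV header, which is why the raw bytes are not written directly.
+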
+ # --- Checkpoint Saving Function (strictly using original_features) ---
+ def save_checkpoint(data_to_save, output_dir, dataset_features):
+     """Saves the current state of the data list as a Hugging Face Dataset,
+     strictly adhering to the provided dataset_features."""
+     if not data_to_save:
+         print("Checkpoint: No data available to save.")
+         return
+
+     print(f"\nCheckpoint: Saving {len(data_to_save)} rows to {output_dir}...")
+     try:
+         # Create the dataset using the original features passed to the function.
+         # This raises an error if data_to_save contains keys not in dataset_features
+         # or if data types are incompatible, so first filter each row down to the
+         # keys present in the original schema.
+         feature_keys = set(dataset_features.keys())
+         filtered_data_to_save = []
+         for item in data_to_save:
+             filtered_item = {k: v for k, v in item.items() if k in feature_keys}
+             # Optional: fill missing keys with None if required by the schema,
+             # though Dataset.from_list handles this.
+             filtered_data_to_save.append(filtered_item)
+
+         checkpoint_dataset = Dataset.from_list(filtered_data_to_save, features=dataset_features)
+
+         os.makedirs(output_dir, exist_ok=True)
+         checkpoint_dataset.save_to_disk(output_dir)
+         print(f"Checkpoint: Saved successfully to {output_dir}")
+
+     except Exception as ckpt_save_e:
+         print(f"Error saving checkpoint dataset using the datasets lib: {ckpt_save_e}")
+         print("Detailed error:", traceback.format_exc())  # Full traceback for save errors
+         # Fallback to JSON Lines (does not strictly enforce the schema)
+         output_jsonl_path = output_dir + "_checkpoint.jsonl"
+         print(f"Attempting to save checkpoint as JSON lines to {output_jsonl_path}...")
+         try:
+             # Save the original unfiltered data to JSONL for debugging if needed
+             with open(output_jsonl_path, 'w', encoding='utf-8') as f:
+                 for item in data_to_save:  # Use the original data for the JSON fallback
+                     serializable_item = {k: (v.tolist() if isinstance(v, np.ndarray) else v) for k, v in item.items()}
+                     f.write(json.dumps(serializable_item, ensure_ascii=False) + '\n')
+             print(f"Checkpoint: Fallback save successful to {output_jsonl_path}")
+         except Exception as json_save_e:
+             print(f"Error saving checkpoint as JSON lines: {json_save_e}")
+
+
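+ # Minimal sketch of the schema-strict save (hypothetical two-column schema):
+ #     feats = Features({"question_text": Value("string"), "response_text_1": Value("string")})
+ #     rows = [{"question_text": "hi", "response_text_1": None, "extra": 1}]
+ #     ds = Dataset.from_list([{k: v for k, v in r.items() if k in feats} for r in rows], features=feats)
+ # The dict comprehension drops "extra", mirroring the filtering above.
+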
+ # --- Main Processing Logic ---
+
+ print("Checking for existing checkpoint/output dataset...")
+ dataset = None
+ original_features = None
+
+ try:
+     potential_checkpoint_info = os.path.join(OUTPUT_DATASET_DIR, "dataset_info.json")
+     potential_checkpoint_state = os.path.join(OUTPUT_DATASET_DIR, "state.json")
+
+     if os.path.exists(OUTPUT_DATASET_DIR) and \
+        (os.path.exists(potential_checkpoint_info) or os.path.exists(potential_checkpoint_state)):
+         print(f"Attempting to load existing data from output directory: {OUTPUT_DATASET_DIR}")
+         try:
+             dataset = load_from_disk(OUTPUT_DATASET_DIR)
+             original_features = dataset.features
+             print(f"Successfully resumed from {OUTPUT_DATASET_DIR}. Loaded {len(dataset)} rows.")
+             print(f"Resumed features: {original_features}")  # Log the features
+         except Exception as load_ckpt_e:
+             print(f"Warning: Failed to load from {OUTPUT_DATASET_DIR}: {load_ckpt_e}")
+             dataset = None
+     else:
+         print(f"No valid existing data found in {OUTPUT_DATASET_DIR}.")
+
+     if dataset is None:
+         print(f"Loading original dataset from {INPUT_DATASET_DIR}...")
+         if not os.path.exists(INPUT_DATASET_DIR):
+             print(f"FATAL: Original input dataset directory not found at {INPUT_DATASET_DIR}")
+             exit(1)
+         dataset = load_from_disk(INPUT_DATASET_DIR)
+         original_features = dataset.features
+         print(f"Original dataset loaded successfully with {len(dataset)} rows.")
+         print(f"Original features: {original_features}")  # Log the features
+
+ except Exception as initial_load_e:
+     print(f"FATAL: Error during initial dataset loading: {initial_load_e}")
+     traceback.print_exc()
+     exit(1)
+
+ # Ensure original_features is loaded
+ if original_features is None:
+     print("FATAL: Failed to load dataset features. Exiting.")
+     exit(1)
+
+ os.makedirs(OUTPUT_AUDIO_ROOT_DIR, exist_ok=True)
+
+ # --- Pre-calculation Step (Assign Models Round-Robin) ---
+ print("Pre-calculating tasks and assigning models...")
+ tasks_to_process = []
+ updated_data = list(dataset)  # Use a mutable list of dicts
+ task_creation_counter = 0
+
+ for idx, row in enumerate(tqdm(updated_data, desc="Scanning dataset")):
+     needs_processing_in_row = False
+     qwen_tasks_in_row = []
+     for i in range(1, 4):
+         model_key = f"model_{i}"
+         response_text_key = f"response_text_{i}"
+         model_assigned = row.get(model_key)
+         response_text_exists = row.get(response_text_key) is not None
+         if model_assigned == "qwen_omni" and not response_text_exists:
+             needs_processing_in_row = True
+             qwen_tasks_in_row.append(i)
+
+     if needs_processing_in_row:
+         slot_to_process = qwen_tasks_in_row[0]
+         i = slot_to_process
+         prompt_text_key = f"prompt_text_{i}"
+         response_audio_key = f"response_audio_path_{i}"  # Define the key for clarity
+
+         question_audio_path = row.get('question_audio')
+         if not question_audio_path or not os.path.exists(question_audio_path):
+             print(f"Warning (Row {idx}, Slot {i}): Skipping task creation - Missing or non-existent 'question_audio': {question_audio_path}")
+             # Mark the error state in updated_data when skipping task creation
+             response_text_key_for_error = f"response_text_{i}"
+             response_audio_key_for_error = f"response_audio_path_{i}"
+             if 0 <= idx < len(updated_data):
+                 updated_data[idx][response_text_key_for_error] = "[SKIPPED: Missing input audio]"
+                 updated_data[idx][response_audio_key_for_error] = None
+             continue
+
+         metadata_str = row.get('metadata', "{}")
+         source_dataset = row.get('source_dataset')
+         metadata = {}
+         try:
+             if metadata_str and isinstance(metadata_str, str):
+                 metadata = json.loads(metadata_str)
+             elif isinstance(metadata_str, dict):
+                 metadata = metadata_str
+         except (json.JSONDecodeError, TypeError):
+             pass
+
+         history_messages = []
+         if source_dataset == 'ultra':
+             history_str = metadata.get('history', '')
+             if history_str:
+                 history_messages = parse_ultra_history(history_str)
+
+         model_to_use_for_this_task = QWEN_MODEL_LIST[task_creation_counter % NUM_MODELS]
+         task_creation_counter += 1
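+         # Round-robin sketch: with the 4-model list above, tasks 0,1,2,3,4,5,...
+         # receive models at indices 0,1,2,3,0,1,... so load spreads evenly across
+         # QWEN_MODEL_LIST regardless of how many tasks are found.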
+
+         unique_id = str(uuid.uuid4()).replace("-", "")
+         output_audio_filename = f"qwen_r{idx}_s{i}_{unique_id}.{OUTPUT_AUDIO_FORMAT}"
+         output_audio_filepath = os.path.join(OUTPUT_AUDIO_ROOT_DIR, output_audio_filename)
+
+         task_info = {
+             "row_idx": idx,
+             "slot_idx": i,
+             "model_name": model_to_use_for_this_task,
+             "history_messages": history_messages,
+             "prompt_text": row.get(prompt_text_key, ""),
+             "question_audio_path": question_audio_path,
+             "output_audio_filepath": output_audio_filepath,
+         }
+         tasks_to_process.append(task_info)
+
+ total_tasks = len(tasks_to_process)
+ if total_tasks == 0:
+     print("No Qwen tasks found needing processing in the loaded dataset.")
+     exit(0)
+
+ print(f"Found {total_tasks} Qwen tasks to process.")
+ model_counts = {model: 0 for model in QWEN_MODEL_LIST}
+ for task in tasks_to_process:
+     model_counts[task['model_name']] += 1
+ print("Task distribution per model:", model_counts)
+
+ # --- Threaded Execution with Checkpointing ---
+ print(f"Starting processing with up to {MAX_WORKERS} worker threads...")
+ start_total_time = time.time()
+ tasks_completed = 0
+ tasks_failed = 0
+ completed_since_last_save = 0
+
+ # Note: workers return a 'model_used' field, but it is intentionally not added to
+ # the dataset schema; save_checkpoint filters rows down to the original features.
+
+ with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+     future_to_task = {executor.submit(call_qwen_omni_api_worker, task): task for task in tasks_to_process}
+
+     for future in tqdm(concurrent.futures.as_completed(future_to_task), total=total_tasks, desc="Processing tasks"):
+         task_info = future_to_task[future]
+         row_idx = task_info["row_idx"]
+         slot_idx = task_info["slot_idx"]
+         result = None
+
+         try:
+             result = future.result()
+             response_text_key = f"response_text_{slot_idx}"
+             response_audio_key = f"response_audio_path_{slot_idx}"
+
+             if 0 <= row_idx < len(updated_data):
+                 updated_data[row_idx][response_text_key] = result["response_text"]
+                 updated_data[row_idx][response_audio_key] = result["saved_audio_path"]
+
+                 if result["saved_audio_path"] is None or "ERROR" in result["response_text"]:
+                     tasks_failed += 1
+             else:
+                 print(f"Warning: Invalid row index {row_idx} encountered during result merge. Skipping update.")
+                 tasks_failed += 1
+
+             tasks_completed += 1
+             completed_since_last_save += 1
+
+             if completed_since_last_save >= CHECKPOINT_INTERVAL:
+                 # Pass the unmodified original_features
+                 save_checkpoint(updated_data, OUTPUT_DATASET_DIR, original_features)
+                 completed_since_last_save = 0
+
+         except Exception as exc:
+             print(f"Error (Row {row_idx}, Slot {slot_idx}): Task generated an exception: {exc}")
+             traceback.print_exc()
+             response_text_key = f"response_text_{slot_idx}"
+             response_audio_key = f"response_audio_path_{slot_idx}"
+
+             if 0 <= row_idx < len(updated_data):
+                 updated_data[row_idx][response_text_key] = f"[ERROR: Task Exception {type(exc).__name__}]"
+                 updated_data[row_idx][response_audio_key] = None
+
+             tasks_failed += 1
+             tasks_completed += 1
+             completed_since_last_save += 1
+
+             if completed_since_last_save >= CHECKPOINT_INTERVAL:
+                 # Pass the unmodified original_features
+                 save_checkpoint(updated_data, OUTPUT_DATASET_DIR, original_features)
+                 completed_since_last_save = 0
+
+
+ end_total_time = time.time()
+ print("\n--- Processing Complete ---")
+ print(f"Total tasks submitted: {total_tasks}")
+ print(f"Total tasks processed: {tasks_completed} (Succeeded: {tasks_completed - tasks_failed}, Failed: {tasks_failed})")
+ print(f"Total processing time: {(end_total_time - start_total_time)/60:.2f} minutes")
+
+ # --- Final Save ---
+ print("\nPerforming final save...")
+ # Pass the unmodified original_features
+ save_checkpoint(updated_data, OUTPUT_DATASET_DIR, original_features)
+
+ print("\nScript finished.")