Adinosaur committed on
Commit
1c980b1
·
verified ·
1 Parent(s): e877058

Upload folder using huggingface_hub

Browse files
Files changed (49) hide show
  1. utils/cost/Embb.txt +5 -0
  2. utils/cost/Embs.txt +5 -0
  3. utils/cost/Lisa.txt +5 -0
  4. utils/cost/MMRt.txt +5 -0
  5. utils/cost/MMRv.txt +5 -0
  6. utils/cost/cost_text.py +87 -0
  7. utils/cost/count_q.py +24 -0
  8. utils/cost/p_cost.py +53 -0
  9. utils/cost/token_results.csv +0 -0
  10. utils/csv/4D-BA.py +61 -0
  11. utils/csv/4D-BO.py +55 -0
  12. utils/json/Lisa.py +168 -0
  13. utils/json/Lisa_jtj.py +60 -0
  14. utils/json/MMR.py +128 -0
  15. utils/json/MedQA_jtj.py +126 -0
  16. utils/json/RS_jtj.py +87 -0
  17. utils/json/RS_merge.py +51 -0
  18. utils/json/correct.py +50 -0
  19. utils/json/display_j.py +35 -0
  20. utils/json/emb_ai_jtp.py +215 -0
  21. utils/json/emb_jtj.py +140 -0
  22. utils/json/ems_jtj.py +209 -0
  23. utils/json/jsonl.py +30 -0
  24. utils/json/mask.py +91 -0
  25. utils/json/merge_json.py +57 -0
  26. utils/oss/oss_batch_upload.py +75 -0
  27. utils/oss/oss_upload.py +63 -0
  28. utils/oss/testis.py +4 -0
  29. utils/parquet/ChemQA_ptj.py +159 -0
  30. utils/parquet/MathVerse_ptj.py +209 -0
  31. utils/parquet/MathVision_ptj.py +178 -0
  32. utils/parquet/MathVista_ptj.py +199 -0
  33. utils/parquet/merge_jp.py +90 -0
  34. utils/parquet/pa_to_p.py +214 -0
  35. utils/parquet/pathQA_ptj.py +136 -0
  36. utils/upload/batch_download.py +58 -0
  37. utils/upload/batch_search.py +46 -0
  38. utils/upload/batch_upload.py +45 -0
  39. utils/upload/compare.py +75 -0
  40. utils/upload/download.py +10 -0
  41. utils/upload/jsonl_otest.py +32 -0
  42. utils/upload/jsonl_split.py +44 -0
  43. utils/upload/load_ll.py +151 -0
  44. utils/upload/load_vl.py +198 -0
  45. utils/upload/request_create.py +143 -0
  46. utils/upload_test/batch_create.py +16 -0
  47. utils/upload_test/batch_download.py +12 -0
  48. utils/upload_test/batch_search.py +10 -0
  49. utils/upload_test/finderror.jsonl +94 -0
utils/cost/Embb.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 图片文件夹路径: /mnt/data/users/zys/proj/vlm_reasoning/dataset/data/EmbSpatial_bench
2
+ 总Token数量: 2121770
3
+ 计算规则:
4
+ - 高质量模式:基础 85 Tokens + 每区块 170 Tokens
5
+ - 低质量模式:固定 85 Tokens
utils/cost/Embs.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 图片文件夹路径: /mnt/data/users/zys/proj/vlm_reasoning/dataset/data/EmbSpatial_sft
2
+ 总Token数量: 10827725
3
+ 计算规则:
4
+ - 高质量模式:基础 85 Tokens + 每区块 170 Tokens
5
+ - 低质量模式:固定 85 Tokens
utils/cost/Lisa.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 图片文件夹路径: /mnt/data/users/zys/proj/vlm_reasoning/dataset/data/Lisa
2
+ 总Token数量: 944350
3
+ 计算规则:
4
+ - 高质量模式:基础 85 Tokens + 每区块 170 Tokens
5
+ - 低质量模式:固定 85 Tokens
utils/cost/MMRt.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 图片文件夹路径: /mnt/data/users/zys/proj/vlm_reasoning/dataset/data/MMR/train2017
2
+ 总Token数量: 50084125
3
+ 计算规则:
4
+ - 高质量模式:基础 85 Tokens + 每区块 170 Tokens
5
+ - 低质量模式:固定 85 Tokens
utils/cost/MMRv.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ 图片文件夹路径: /mnt/data/users/zys/proj/vlm_reasoning/dataset/data/MMR/val2017
2
+ 总Token数量: 2101370
3
+ 计算规则:
4
+ - 高质量模式:基础 85 Tokens + 每区块 170 Tokens
5
+ - 低质量模式:固定 85 Tokens
utils/cost/cost_text.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import tiktoken
3
+ from tqdm import tqdm
4
+ from multiprocessing import Pool
5
+ import pandas as pd
6
+
7
# Per-worker encoder initialisation: each multiprocessing child builds its own
# tiktoken encoder once, since the encoder object cannot be shared across processes.
def init_process():
    global encoder
    encoder = tiktoken.get_encoding("cl100k_base")
11
+
12
def calculate_tokens(obj):
    """Count tokens in the text portions of one batch-request object.

    Walks body.messages, collecting system-message content plus every
    text-typed item in user messages, then encodes the joined text with the
    worker's global tiktoken encoder. Returns 0 on any processing error.
    """
    global encoder
    collected = []

    try:
        for msg in obj.get("body", {}).get("messages", []):
            role = msg.get("role")
            if role == "system":
                content = msg.get("content", "")
                if content:  # skip empty content
                    collected.append(content)
            elif role == "user":
                content = msg.get("content", [])
                if isinstance(content, list):
                    for part in content:
                        if not (isinstance(part, dict) and part.get("type") == "text"):
                            continue
                        text = part.get("text", "")
                        if text:
                            collected.append(text)
                elif isinstance(content, dict) and content.get("type") == "text":
                    text = content.get("text", "")
                    if text:
                        collected.append(text)

        # Join everything once and count tokens in a single encode call.
        return len(encoder.encode("\n".join(collected)))

    except Exception as e:
        print(f"处理错误: {e} | 数据: {obj.get('custom_id')}")
        return 0
46
+
47
def process_line(line):
    """Parse one JSONL line and return {custom_id, tokens}, or None on failure."""
    try:
        data = json.loads(line)
    except json.JSONDecodeError:
        print(f"无效JSON: {line[:100]}...")  # show a prefix to help locate the bad line
        return None

    try:
        return {
            "custom_id": data.get("custom_id"),
            "tokens": calculate_tokens(data),
        }
    except Exception as e:
        print(f"全局错误: {e}")
        return None
61
+
62
if __name__ == "__main__":
    # Load the whole batch-request JSONL into memory up front.
    with open("/mnt/data/users/zys/proj/vlm_reasoning/request/vqa_batch_requests.jsonl", "r") as f:
        lines = f.readlines()

    # Count tokens in parallel; each worker initialises its own encoder.
    with Pool(processes=8, initializer=init_process) as pool:
        results = []
        with tqdm(total=len(lines), desc="处理进度") as pbar:
            for result in pool.imap(process_line, lines):
                if result is not None:  # drop lines that failed to parse/process
                    results.append(result)
                pbar.update()

    # Persist the per-request token counts.
    df = pd.DataFrame(results)
    df.to_csv("token_results.csv", index=False)

    # Print summary statistics.
    total_tokens = df["tokens"].sum()
    avg_tokens = df["tokens"].mean()
    print(f"统计报告:\n"
          f"- 总Token数: {total_tokens:,}\n"
          f"- 平均每条: {avg_tokens:.1f}\n"
          f"- 最大单条: {df['tokens'].max()}\n"
          f"- 有效数据: {len(df)}/{len(lines)}")
utils/cost/count_q.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
def count_questions(json_file):
    """Return the total number of questions across all objects in a JSON array file.

    A non-list ``question`` value counts as a single question; a missing key
    counts as zero.
    """
    with open(json_file, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    return sum(
        len(value) if isinstance(value, list) else 1
        for value in (record.get('question', []) for record in records)
    )
16
+
17
+ if __name__ == "__main__":
18
+ import sys
19
+ if len(sys.argv) != 2:
20
+ print("Usage: python count_questions.py <json_file>")
21
+ sys.exit(1)
22
+ file_path = sys.argv[1]
23
+ count = count_questions(file_path)
24
+ print(f"Total number of questions: {count}")
utils/cost/p_cost.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from PIL import Image
3
+
4
def calculate_image_tokens(image_path, detail="high"):
    """Estimate the vision-token cost of a single image.

    Low detail is a flat 85 tokens. High detail costs a base 85 tokens plus
    170 per 512x512 tile, after (optionally) scaling the short side to 768px.
    """
    if detail == "low":
        return 85  # low-quality mode: fixed cost, image never opened

    with Image.open(image_path) as img:
        width, height = img.size
    short_side = min(width, height)

    # Images whose short side is already under 768px are used as-is;
    # otherwise the image is scaled so the short side is exactly 768px.
    if short_side < 768:
        scaled_w, scaled_h = width, height
    else:
        factor = 768 / short_side
        scaled_w = int(width * factor)
        scaled_h = int(height * factor)

    # Number of 512px tiles needed to cover the scaled image (ceiling division).
    tile_count = ((scaled_w + 511) // 512) * ((scaled_h + 511) // 512)

    # Total = base 85 tokens + 170 per tile.
    return 85 + tile_count * 170
30
+
31
def calculate_folder_tokens(folder_path, detail="high"):
    """Sum the estimated token cost of every image directly in *folder_path*."""
    image_exts = ('.png', '.jpg', '.jpeg', '.gif', '.webp')
    return sum(
        calculate_image_tokens(os.path.join(folder_path, name), detail)
        for name in os.listdir(folder_path)
        if name.lower().endswith(image_exts)
    )
38
+
39
# Example run: compute the high-detail token cost for one dataset folder
# and write a small report file.
folder_path = "/mnt/data/users/zys/proj/vlm_reasoning/dataset/data/EmbSpatial_sft"
output_file = "Embs.txt"

total_tokens = calculate_folder_tokens(folder_path, detail="high")

# Write the total plus a reminder of the pricing rules that were applied.
with open(output_file, "w", encoding="utf-8") as f:
    f.write(f"图片文件夹路径: {folder_path}\n")
    f.write(f"总Token数量: {total_tokens}\n")
    f.write("计算规则:\n")
    f.write("- 高质量模式:基础 85 Tokens + 每区块 170 Tokens\n")
    f.write("- 低质量模式:固定 85 Tokens\n")

print(f"结果已保存至 {output_file}")
utils/cost/token_results.csv ADDED
The diff for this file is too large to render. See raw diff
 
utils/csv/4D-BA.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+
4
def process_json(input_path, output_path, file_stem):
    """Convert raw 4D-Bench QA JSON into the unified dataset schema.

    Args:
        input_path: path to the raw JSON file (a dict keyed by "<videoid>_<n>").
        output_path: where the converted JSON list is written.
        file_stem: dataset folder name used when building media paths.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        original_data = json.load(f)

    processed = []

    for index, (key, item) in enumerate(original_data.items()):
        # Media path: ./data/<file_stem>/<video_id>
        video_id = key.split('_')[0]
        media_path = "./" + (Path("data") / file_stem / video_id).as_posix()

        # Collect the non-empty options, keeping their original letter ids.
        options = []
        for opt_id in ['A', 'B', 'C', 'D']:
            if text := item.get(f'({opt_id})', ''):
                options.append({"id": opt_id, "text": text.strip()})

        # Bug fix: "Answer index" is positional over A-D. Map it to the letter
        # directly instead of indexing into the (possibly filtered) options
        # list, so a missing earlier option cannot shift the answer to the
        # wrong letter.
        try:
            answer_num = int(item['Answer index'])
            if 0 <= answer_num < 4:
                letter = 'ABCD'[answer_num]
                answer_ids = [letter] if any(opt['id'] == letter for opt in options) else []
            else:
                answer_ids = []
        except (ValueError, KeyError, TypeError):
            answer_ids = []

        # Assemble the unified-schema entry.
        processed.append({
            "index": index,
            "media_type": "Video",
            "media_paths": media_path,
            "description": item.get("Category", ""),
            "task_type": "Vision-Question-Answer",
            "question": [item.get("Question", "")],
            "question_type": "multi-choice",
            "annotations": {},
            "options": options,
            "answer": answer_ids,
            "source": "4D-Bench",
            "domain": "Embodied_ai"
        })

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(processed, f, indent=2, ensure_ascii=False)
49
+
50
if __name__ == "__main__":
    # Static parameter configuration
    input_path = "/mnt/data/users/zys/proj/vlm_reasoning/unprocessed_data/emb_ai/4d/4D_Object_Question_Answering/data/4d_qa.json"  # default input file
    output_path = "/mnt/data/users/zys/proj/vlm_reasoning/dataset/4D_Object_Question_Answering.json"  # default output file
    file_stem = "4D_Object_Question_Answering"  # dataset-specific identifier

    # Run the conversion pipeline
    process_json(
        input_path=input_path,
        output_path=output_path,
        file_stem=file_stem
    )
utils/csv/4D-BO.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import json
3
+ from pathlib import Path
4
+
5
def csv_to_json(csv_path, json_path, file_stem):
    """Convert a 4D-Bench captioning CSV into the unified dataset JSON schema.

    Each CSV row contributes one entry whose answer is the list of its five
    human-written captions.
    """
    entries = []

    with open(csv_path, 'r', encoding='utf-8') as csv_file:
        for index, row in enumerate(csv.DictReader(csv_file)):
            # Media path: ./data/<file_stem>/<folder_name>
            media_path = "./" + (Path("data") / file_stem / row['folder_name']).as_posix()

            # The five human captions become the free-form answer list.
            captions = [row[f'caption_{i}'] for i in range(1, 6)]

            entries.append({
                "index": index,
                "media_type": "Video",
                "media_paths": media_path,
                "description": "",
                "task_type": "Vision-Question-Answer",
                "question": ["Please generate descriptive captions for this multi-view video."],
                "question_type": "free-form",
                "annotations": {},
                "options": [],
                "answer": captions,
                "source": "4D-Bench",
                "domain": "Embodied_ai"
            })

    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(entries, json_file, indent=2)
47
+
48
# Example usage
if __name__ == "__main__":
    # Adjust the following parameters for your environment.
    INPUT_CSV = "/mnt/data/users/zys/proj/vlm_reasoning/unprocessed_data/emb_ai/4d/4D_Object_Captioning/data/human_annotations.csv"  # input CSV path
    OUTPUT_JSON = "/mnt/data/users/zys/proj/vlm_reasoning/dataset/4D_Object_Captioning.json"  # output JSON path
    FILE_STEM = "4D_Object_Captioning"  # file_stem segment of media_path

    csv_to_json(INPUT_CSV, OUTPUT_JSON, FILE_STEM)
utils/json/Lisa.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import math
3
+ from pathlib import Path
4
+ import numpy as np
5
+ import cv2
6
+ from pycocotools import mask as mask_utils
7
+ from PIL import Image, ExifTags
8
+
9
def get_image_dimensions(image_path):
    """Return (width, height) for an image, honouring EXIF orientation.

    Tries PIL first (so EXIF can be read); falls back to OpenCV if PIL fails.
    Returns (0, 0) when neither backend can read the file — callers must treat
    that as an error.
    """
    try:
        # Prefer PIL: it exposes EXIF metadata.
        with Image.open(image_path) as img:
            width, height = img.size
            orientation = 1

            try:
                exif = img._getexif() or {}
                for tag, name in ExifTags.TAGS.items():
                    if name == 'Orientation':
                        orientation = exif.get(tag, 1)
                        break
            except Exception as e:
                print(f"EXIF读取警告 [{image_path.name}]: {str(e)}")

            # Orientations 5-8 involve a 90° rotation, so the stored width and
            # height are swapped relative to the displayed image.
            if orientation in [5, 6, 7, 8]:
                return height, width  # swapped so the result is displayed (width, height)
            else:
                return width, height

    except Exception as pil_error:
        print(f"PIL读取失败 [{image_path.name}], 尝试OpenCV: {str(pil_error)}")
        try:
            # Fall back to OpenCV (note: no EXIF handling on this path).
            img = cv2.imread(str(image_path))
            if img is not None:
                h, w = img.shape[:2]
                return w, h
            raise ValueError("OpenCV返回空图像")
        except Exception as cv_error:
            print(f"严重错误: 无法获取尺寸 [{image_path.name}]: {str(cv_error)}")
            return (0, 0)  # invalid size — downstream validation rejects it instead of crashing here
44
+
45
def points_to_rle(points, img_dimensions):
    """Rasterise a polygon into a COCO RLE mask, clamping vertices to the image.

    :param points: iterable of (x, y) vertex coordinates
    :param img_dimensions: (width, height) of the target image
    :raises ValueError: if fewer than 3 vertices are supplied
    """
    width, height = img_dimensions
    mask = np.zeros((height, width), dtype=np.uint8)

    polygon = []
    for x, y in points:
        # Round first, then clamp into bounds (strategy chosen to match the
        # annotation tool's coordinate convention).
        safe_x = min(max(0, int(round(x))), width - 1)
        safe_y = min(max(0, int(round(y))), height - 1)

        # More conservative alternative (truncate the fractional part):
        # safe_x = min(max(0, int(math.floor(x))), width - 1)
        # safe_y = min(max(0, int(math.floor(y))), height - 1)

        polygon.append((safe_x, safe_y))

    # A polygon needs at least three vertices to enclose any area.
    if len(polygon) < 3:
        raise ValueError(f"无效多边形,点数不足3个")

    # Rasterise the polygon and encode the binary mask as COCO RLE.
    cv2.fillPoly(mask, [np.array(polygon, dtype=np.int32)], color=1)
    rle = mask_utils.encode(np.asfortranarray(mask))

    return {
        "size": [height, width],
        "counts": rle['counts'].decode('utf-8')
    }
74
+
75
def convert_medical_json(input_file, config=None):
    """Convert one LISA annotation JSON into a unified-schema entry list.

    Expects a sibling .jpg image next to the JSON file. Only shapes labelled
    'target' are converted, each into an RLE segmentation annotation.
    Returns a single-element list (index fixed to 0; the batch driver
    reassigns it), or None on any failure.
    """
    cfg = {
        "task_type": "Image-Segmentation",
        "source": "Lisa",
        "domain": "General",
        **(config or {})
    }

    try:
        input_path = Path(input_file)
        image_path = input_path.with_suffix('.jpg')

        # The associated image must exist — fail early otherwise.
        if not image_path.exists():
            raise FileNotFoundError(f"关联图片不存在: {image_path.name}")

        media_paths=(Path(".") / "data" / cfg['source'] / image_path.name).as_posix()
        media_paths = f"./{media_paths}"
        # Real dimensions (EXIF orientation already handled by the helper).
        width, height = get_image_dimensions(image_path)
        if width == 0 or height == 0:
            raise ValueError("获取图片尺寸失败")

        # Load the annotation payload.
        with open(input_file, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)

        annotations = []
        for shape in raw_data.get('shapes', []):
            if shape.get('label') != 'target':
                continue

            points = shape.get('points', [])
            try:
                rle = points_to_rle(points, (width, height))
                annotations.append({
                    "bbox": [],
                    "segmentation": rle,
                    "category_name": ""
                })
            except ValueError as e:
                # Degenerate polygons are skipped, not fatal.
                print(f"标注跳过 [{input_path.name}]: {str(e)}")

        return [{
            "index": 0,
            "media_type": "image",
            "media_paths": media_paths,
            "description": "",
            "task_type": cfg['task_type'],
            "question": raw_data.get('text', []),
            "question_type": "detection-form",
            "options": [],
            "annotations": [annotations],
            "answer": [],
            "source": cfg['source'],
            "domain": cfg['domain']
        }]

    except Exception as e:
        print(f"转换失败 [{input_path.name}]: {str(e)}")
        return None
137
+
138
def batch_convert(input_dir, output_file):
    """Convert every JSON annotation in a directory into one merged list file.

    Entries get a globally increasing index; files whose conversion returns a
    falsy result are reported as failures.
    """
    source_dir = Path(input_dir)
    merged = []
    converted_total = 0
    failures = []
    next_index = 0  # running index across every converted entry

    for annotation_file in source_dir.glob('*.json'):
        entries = convert_medical_json(annotation_file)
        if not entries:
            failures.append(annotation_file.name)
            continue
        for entry in entries:
            entry["index"] = next_index
            next_index += 1
        merged.extend(entries)
        converted_total += len(entries)

    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(merged, out, indent=2, ensure_ascii=False)

    print(f"转换完成: 成功 {converted_total} 个文件,失败 {len(failures)} 个")
    if failures:
        print("失败文件列表:\n" + "\n".join(failures))
163
+
164
if __name__ == "__main__":
    # Hard-coded paths for converting the LISA validation split.
    source_directory = "/mnt/data/users/zys/proj/vlm_reasoning/unprocessed_data/general/lisa/image/val"
    destination_file = "/mnt/data/users/zys/proj/vlm_reasoning/utils/json/converted_dataset3.json"

    batch_convert(input_dir=source_directory, output_file=destination_file)
utils/json/Lisa_jtj.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import argparse
4
+
5
def delete_json_files(folder_path):
    """Remove every .json file directly inside *folder_path* (non-recursive)."""
    for name in os.listdir(folder_path):
        if not name.endswith('.json'):
            continue
        target = os.path.join(folder_path, name)
        try:
            os.remove(target)
            print(f"已删除JSON文件: {target}")
        except Exception as e:
            print(f"删除 {target} 时出错: {str(e)}")
15
+
16
def merge_folders(input_path, output_dir):
    """Merge the train/test/val subfolders of *input_path* into *output_dir*.

    JSON files are deleted from each subfolder first; the remaining files are
    copied flat into *output_dir*. When names collide, the file copied last
    (val overrides test overrides train) wins.

    :raises ValueError: if any of the expected subfolders is missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    subfolders = ['train', 'test', 'val']

    # Validate the expected layout before mutating anything.
    for sub in subfolders:
        if not os.path.exists(os.path.join(input_path, sub)):
            raise ValueError(f"输入路径中缺少 {sub} 文件夹")

    for sub in subfolders:
        sub_path = os.path.join(input_path, sub)

        # Drop annotation JSON files; only media files should be merged.
        delete_json_files(sub_path)

        for filename in os.listdir(sub_path):
            src = os.path.join(sub_path, filename)
            dst = os.path.join(output_dir, filename)

            # Bug fix: the warning previously printed a literal placeholder
            # instead of the colliding destination path.
            if os.path.exists(dst):
                print(f"警告: {dst} 已存在,将被覆盖")

            shutil.copy2(src, dst)
            print(f"已复制: {src} -> {dst}")
47
+
48
+ if __name__ == "__main__":
49
+ parser = argparse.ArgumentParser(description="合并训练/测试/验证集并删除JSON文件")
50
+ parser.add_argument('-i','--input_path', required=True, help='包含train/test/val文件夹的输入路径')
51
+ parser.add_argument('-o','--output_dir', required=True, help='合并后的输出目录路径')
52
+
53
+ args = parser.parse_args()
54
+
55
+ try:
56
+ print(f"开始处理,输入路径: {args.input_path}")
57
+ merge_folders(args.input_path, args.output_dir)
58
+ print("\n操作完成!合并后的文件位于:", os.path.abspath(args.output_dir))
59
+ except Exception as e:
60
+ print(f"\n发生错误: {str(e)}")
utils/json/MMR.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+ from concurrent.futures import ThreadPoolExecutor
5
+
6
def convert_medical_json(input_file, output_file, config=None):
    """Convert raw MMR-style annotation JSON into the unified dataset schema.

    Returns True on success, False (with a printed message) on any failure.
    """
    cfg = {
        "task_type": "Visual_Question_Answering",
        "source": "Embspatial",
        "domain": "Embodied_ai",
        **(config or {}),
    }

    try:
        with open(input_file, 'r', encoding='utf-8') as src:
            raw = json.load(src)
        records = raw if isinstance(raw, list) else [raw]

        output = []
        for idx, record in enumerate(records):
            # Media path: ./data/<source>/<original file name>
            rel_path = (Path("data") / cfg['source'] / record['file_name']).as_posix()

            # Keep the nested answers structure (list of answer groups) intact,
            # copying only the fields the unified schema knows about.
            annotation_groups = [
                [
                    {
                        "bbox": ans.get("bbox", []),
                        "segmentation": ans.get("segmentation", {}),
                        "category_name": ans.get("category_name", ""),
                    }
                    for ans in group
                ]
                for group in record.get("answers", [])
            ]

            output.append({
                "index": idx,
                "media_type": "image",
                "media_paths": f"./{rel_path}",
                "description": "",
                "task_type": cfg['task_type'],
                "question": record.get('questions', []),
                "question_type": "detection-form",
                "options": [],
                "annotations": annotation_groups,
                "answer": record.get('raw_answers', []),
                "source": cfg['source'],
                "domain": cfg['domain'],
            })

        with open(output_file, 'w', encoding='utf-8') as dst:
            json.dump(output, dst, indent=2, ensure_ascii=False)
        return True

    except Exception as e:
        print(f"转换失败: {input_file} → {str(e)}")
        return False
59
+
60
def process_single_file(input_path, output_dir, config):
    """Convert one JSON file, writing the result flat into *output_dir*."""
    try:
        # Output path keeps only the original file name (flat layout).
        destination = output_dir / input_path.name
        return convert_medical_json(
            input_file=str(input_path),
            output_file=str(destination),
            config=config,
        )
    except Exception as e:
        print(f"文件处理异常: {input_path} → {str(e)}")
        return False
74
+
75
def batch_convert_json(input_dir, output_dir, config=None, max_workers=8):
    """Convert every top-level JSON file in *input_dir* concurrently (flat output)."""
    source = Path(input_dir)
    destination = Path(output_dir)

    # Output directory is created once, up front.
    destination.mkdir(parents=True, exist_ok=True)

    if not source.exists():
        raise FileNotFoundError(f"输入目录不存在: {input_dir}")

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Only this directory's *.json files are converted (no recursion).
        tasks = [
            pool.submit(
                process_single_file,
                input_path=candidate,
                output_dir=destination,
                config=config,
            )
            for candidate in source.glob('*.json')
            if candidate.is_file()
        ]

        # Tally conversion outcomes.
        outcomes = [task.result() for task in tasks]

    ok = sum(1 for outcome in outcomes if outcome)
    bad = len(outcomes) - ok
    print(f"\n处理完成: 成功 {ok} 个文件,失败 {bad} 个文件")
    print(f"输出目录: {destination.resolve()}")
112
+
113
+ if __name__ == "__main__":
114
+ custom_config = {
115
+ "source": "MMR",
116
+ "task_type": "Multi-Format-Task",
117
+ "domain": "General"
118
+ }
119
+
120
+ try:
121
+ batch_convert_json(
122
+ input_dir="/mnt/data/users/zys/proj/vlm_reasoning/unprocessed_data/general/mmr",
123
+ output_dir="/mnt/data/users/zys/proj/vlm_reasoning/dataset",
124
+ config=custom_config,
125
+ max_workers=os.cpu_count() * 2
126
+ )
127
+ except Exception as e:
128
+ print(f"批量处理异常终止: {str(e)}")
utils/json/MedQA_jtj.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+ from concurrent.futures import ThreadPoolExecutor
5
+
6
def convert_medical_json(input_file, output_file, config=None):
    """Convert OmniMedVQA-style records into the unified dataset schema.

    Returns True on success, False (with a printed message) otherwise.
    """
    cfg = {
        "task_type": "Visual_Question_Answering",
        "source": "OmniMedVQA",
        **(config or {}),
    }

    try:
        with open(input_file, 'r', encoding='utf-8') as src:
            raw = json.load(src)
        records = raw if isinstance(raw, list) else [raw]

        converted = []
        for idx, record in enumerate(records):
            # Strip everything up to and including the first 'Images/' segment.
            relative = record['image_path'].split('Images/', 1)[-1]

            # Keep only the non-empty options, labelled A-D.
            options = [
                {"id": letter, "text": record[f'option_{letter}'].strip()}
                for letter in ('A', 'B', 'C', 'D')
                if record.get(f'option_{letter}', '')
            ]

            # The ground truth is given as option text; recover its letter id(s).
            gt_text = str(record['gt_answer']).strip()
            answer_ids = [opt['id'] for opt in options if opt['text'] == gt_text]

            converted.append({
                "index": idx,
                "media_type": "image",
                "media_paths": f"./{cfg['source']}/{relative}",
                "description": "",
                "task_type": cfg['task_type'],
                "question": [record['question']],
                "question_type": "multi-choice",
                "options": options,
                "annotations": [],
                "answer": answer_ids,
                "source": cfg['source'],
                "domain": "Biomedical",
            })

        with open(output_file, 'w', encoding='utf-8') as dst:
            json.dump(converted, dst, indent=2, ensure_ascii=False)
        return True

    except Exception as e:
        print(f"转换失败: {input_file} → {str(e)}")
        return False
56
+
57
def process_single_file(input_path, output_dir, config):
    """Convert one JSON file, writing the result flat into *output_dir*."""
    try:
        # Output path: output directory + original file name (hierarchy dropped).
        output_file = output_dir / input_path.name

        return convert_medical_json(
            input_file=str(input_path),
            output_file=str(output_file),
            config=config
        )
    except Exception as e:
        print(f"文件处理异常: {input_path} → {str(e)}")
        return False
+
72
def batch_convert_json(input_dir, output_dir, config=None, max_workers=8):
    """Recursively convert every JSON file under *input_dir*, writing all
    results flat into *output_dir* using a thread pool.

    :raises FileNotFoundError: if *input_dir* does not exist.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)

    # Create the output directory once, up front.
    output_path.mkdir(parents=True, exist_ok=True)

    if not input_path.exists():
        raise FileNotFoundError(f"输入目录不存在: {input_dir}")

    success_count = 0
    failure_count = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        # Walk the whole tree; directory structure is NOT preserved on output,
        # so same-named files in different subdirectories will overwrite.
        for root, _, files in os.walk(input_dir):
            for filename in files:
                if filename.lower().endswith('.json'):
                    input_file = Path(root) / filename
                    futures.append(
                        executor.submit(
                            process_single_file,
                            input_path=input_file,
                            output_dir=output_path,
                            config=config
                        )
                    )

        # Tally successes and failures.
        for future in futures:
            if future.result():
                success_count += 1
            else:
                failure_count += 1

    print(f"\n处理完成: 成功 {success_count} 个文件,失败 {failure_count} 个文件")
    print(f"输出目录: {output_path.resolve()}")
111
+
112
if __name__ == "__main__":
    # Dataset-specific configuration overrides.
    custom_config = {
        "source": "OmniMedVQA",
        "task_type": "Visual_Question_Answering"
    }

    try:
        batch_convert_json(
            input_dir="/mnt/data/users/zys/proj/vlm_reasoning/unprocessed_data/biomedical/medvqa/OmniMedVQA/QA_information",
            output_dir="/mnt/data/users/zys/proj/vlm_reasoning/unprocessed_data/biomedical/medvqa",
            config=custom_config,
            max_workers=min(os.cpu_count() * 2, 32)
        )
    except Exception as e:
        print(f"批量处理异常终止: {str(e)}")
utils/json/RS_jtj.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from collections import defaultdict
3
+
4
def transform_json_structure(input_path, output_path):
    """
    Regroup flat RSVQA question/answer records by image.

    Each output entry collects up to 15 QA pairs that share the same integer
    ``img_id``; records without a usable integer id are dropped.

    :param input_path: source JSON file (expects a top-level "merged_data" list)
    :param output_path: destination JSON file
    :raises RuntimeError: when reading or writing fails
    """
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            question_list = json.load(f).get('merged_data', [])
    except Exception as e:
        raise RuntimeError(f"读取输入文件失败: {str(e)}")

    # Group QA pairs by image id (missing keys are auto-created).
    grouped = defaultdict(list)
    for record in question_list:
        try:
            image_id = record['img_id']
        except KeyError as ke:
            print(f"跳过缺少关键字段的数据: {str(ke)}")
            continue
        if not isinstance(image_id, int) or image_id < 0:
            continue  # drop records without a usable integer image id
        if len(grouped[image_id]) >= 15:
            continue  # cap at 15 QA pairs per image
        grouped[image_id].append({
            'question': record.get('question', ''),
            'answer': record.get('answer', ''),
        })

    # Build the unified-schema entries, one per image.
    transformed = [
        {
            "index": image_id,
            "media_type": "image",
            "media_paths": f"./data/RSVQA/{image_id}.png",
            "description": "",
            "task_type": "Vision-Question-Answer",
            "question": [pair['question'] for pair in pairs],
            "question_type": "free-form",
            "annotations": [],
            "options": [],
            "answer": [pair['answer'] for pair in pairs],
            "source": "RSVQA",
            "domain": "Satellite-Remote-Sensing"
        }
        for image_id, pairs in grouped.items()
    ]

    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(transformed, f, indent=2, ensure_ascii=False)
    except Exception as e:
        raise RuntimeError(f"写入输出文件失败: {str(e)}")
71
+
72
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='RSVQA数据格式转换工具')
    parser.add_argument('-i','--input', type=str, required=True, help='输入JSON文件路径')
    parser.add_argument('-o','--output', type=str, default='transformed.json',
                        help='输出JSON文件路径 (默认: transformed.json)')

    args = parser.parse_args()

    try:
        transform_json_structure(args.input, args.output)
        print(f"转换成功!输出文件已保存至: {args.output}")
    except Exception as e:
        print(f"处理过程中发生错误: {str(e)}")
        # Non-zero exit code signals failure to calling scripts.
        exit(1)
utils/json/RS_merge.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
'''
Pre-processing step: merge the RSVQA answers JSON and questions JSON into a
single file so the combined data can be converted downstream.

Usage is simple — see the example call at the bottom of this file: supply the
paths of the two input files and the desired output path.
'''

import json
from collections import defaultdict
8
+
9
def deep_merge(base_dict, merge_dict):
    """Recursively merge *merge_dict* into *base_dict* in place.

    Nested dicts are merged recursively, lists are concatenated, and scalar
    conflicts keep the value already in *base_dict*. Returns *base_dict*.
    """
    for key, incoming in merge_dict.items():
        if key not in base_dict:
            # New field: adopt it wholesale.
            base_dict[key] = incoming
            continue
        existing = base_dict[key]
        if isinstance(existing, dict) and isinstance(incoming, dict):
            deep_merge(existing, incoming)
        elif isinstance(existing, list) and isinstance(incoming, list):
            base_dict[key] = existing + incoming
        # Any other conflict: keep the existing value.
    return base_dict
26
+
27
def merge_json_files(answers_file, questions_file, output_file):
    """Join the RSVQA answers file into the questions file via answers_ids."""
    # Index answers by id for O(1) lookup.
    with open(answers_file) as f:
        answers_by_id = {entry['id']: entry for entry in json.load(f)['answers']}

    with open(questions_file) as f:
        question_entries = json.load(f)['questions']

    # Merge each referenced answer into a copy of its question record.
    merged = []
    for question in question_entries:
        combined = question.copy()
        for answer_id in question.get('answers_ids', []):
            matched = answers_by_id.get(answer_id)
            if matched is not None:
                combined = deep_merge(combined, matched)
        merged.append(combined)

    with open(output_file, 'w') as f:
        json.dump({"merged_data": merged}, f, indent=2, ensure_ascii=False)
49
+
50
# Example usage: merge the USGS answers/questions pair into one file.
merge_json_files("/mnt/data/users/zys/proj/vlm_reasoning/unprocessed_data/Satellite/USGSanswers.json", "/mnt/data/users/zys/proj/vlm_reasoning/unprocessed_data/Satellite/USGSquestions.json", '/mnt/data/users/zys/proj/vlm_reasoning/unprocessed_data/Satellite/merged_output.json')
utils/json/correct.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+
4
def replace_underscores_in_task_type(data):
    """Normalise every string ``task_type`` field in a list of dicts.

    Underscores are replaced with hyphens; non-dict entries and
    non-string values are left untouched.  Mutates and returns ``data``.
    """
    for entry in data:
        if not isinstance(entry, dict):
            continue
        value = entry.get('task_type')
        if isinstance(value, str):
            entry['task_type'] = value.replace('_', '-')
    return data
12
+
13
def process_json_file(file_path):
    """Rewrite one JSON file in place with normalised ``task_type`` values.

    Non-array files are skipped; any failure is reported rather than
    propagated.
    """
    try:
        with open(file_path, 'r+', encoding='utf-8') as fh:
            payload = json.load(fh)
            if not isinstance(payload, list):
                print(f" 已跳过非数组文件:{file_path}")
                return

            updated = replace_underscores_in_task_type(payload)
            # Rewind, rewrite, then truncate any leftover bytes from the
            # previous (possibly longer) serialisation.
            fh.seek(0)
            json.dump(updated, fh, indent=4, ensure_ascii=False)
            fh.truncate()

            print(f" 成功处理:{os.path.basename(file_path)}")

    except Exception as exc:
        print(f" 处理失败 [{os.path.basename(file_path)}]:{str(exc)}")
33
+
34
def process_directory(target_dir):
    """Process every ``*.json`` file directly inside ``target_dir`` (no recursion)."""
    for entry in os.listdir(target_dir):
        if not entry.lower().endswith('.json'):
            continue
        candidate = os.path.join(target_dir, entry)
        if os.path.isfile(candidate):
            process_json_file(candidate)
41
+
42
if __name__ == '__main__':
    input_directory = "/mnt/data/users/zys/proj/vlm_reasoning/utils/json"  # adjust to your directory

    if not os.path.isdir(input_directory):
        print(f" 目录不存在:{input_directory}")
    else:
        print(f" 正在处理目录:{input_directory}")
        process_directory(input_directory)
        print(" 处理完成")
utils/json/display_j.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sys
3
+
4
def show_jsonl_object(file_path, index=0):
    """
    Extract and pretty-print the object at position ``index`` of a JSONL file.

    :param file_path: path to the JSONL file
    :param index: 0-based index of the object to show
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            line_count = 0
            for i, line in enumerate(f):
                line_count = i + 1
                if i == index:
                    try:
                        data = json.loads(line)
                        print(f"第 {index} 个对象的JSON内容:")
                        print(json.dumps(data, indent=4, ensure_ascii=False))
                    except json.JSONDecodeError:
                        print(f"错误:第 {index} 行不是有效的JSON格式")
                    return
            # FIX: the original printed f"...{i+1}..." here, which raises
            # NameError when the file is empty (the loop never binds i).
            # Track the line count explicitly instead.
            print(f"警告:文件只有 {line_count} 行,无法读取第 {index} 行")

    except FileNotFoundError:
        print(f"错误:文件 {file_path} 不存在")
26
+
27
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("使用方法:python show_jsonl.py <文件路径> [行号]")
        sys.exit(1)

    target_file = sys.argv[1]
    row = int(sys.argv[2]) if len(sys.argv) > 2 else 0

    show_jsonl_object(target_file, row)
utils/json/emb_ai_jtp.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import base64
4
+ import sys
5
+ import time
6
+ import logging
7
+ from pathlib import Path
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+ from typing import Tuple, List
10
+
11
+ from PIL import Image
12
+ import io
13
+
14
# Configure root logging: timestamped INFO-level messages written to stdout
# (stdout rather than the default stderr so log lines interleave with prints).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    stream=sys.stdout
)
20
+
21
def process_json_element(element: dict,
                         index: int,
                         output_dir: Path,
                         overwrite: bool,
                         output_format: str) -> Tuple[int, str]:
    """Decode one base64 image entry and save it as ``<index>.<ext>``.

    Returns ``(index, status)`` where status is ``"success"``,
    ``"skipped"`` (file exists and overwrite is off), or
    ``"error: <message>"``; exceptions never propagate to the caller.
    """
    try:
        # Normalise the requested format so all later checks use lowercase.
        output_format = output_format.lower()
        if output_format not in ['jpg', 'png']:
            raise ValueError(f"不支持的格式: {output_format}")

        # File extension stays lowercase.
        file_ext = output_format
        # PIL's format name is "JPEG", not "JPG".
        img_format = 'JPEG' if output_format == 'jpg' else output_format.upper()
        save_args = {'quality': 95} if output_format == 'jpg' else {'compress_level': 6}

        output_path = output_dir / f"{index}.{file_ext}"

        # Skip files that already exist unless overwrite was requested.
        if not overwrite and output_path.exists():
            return (index, "skipped")

        # Input validation.
        if not isinstance(element, dict):
            raise ValueError("数组元素不是字典类型")
        if "image" not in element:
            raise KeyError("缺少'image'字段")

        # Decode the base64 payload into raw image bytes.
        image_bytes = base64.b64decode(element["image"])

        with Image.open(io.BytesIO(image_bytes)) as img:
            # Common handling: CMYK images are converted to RGB first.
            if img.mode == 'CMYK':
                img = img.convert('RGB')

            # JPEG cannot store alpha, so flatten transparency onto white.
            if output_format == 'jpg':
                if img.mode == 'RGBA':
                    background = Image.new('RGB', img.size, (255, 255, 255))
                    background.paste(img, mask=img.split()[-1])
                    img = background
                elif img.mode in ['P', 'PA']:  # palette modes: expand, then flatten
                    img = img.convert('RGBA')
                    background = Image.new('RGB', img.size, (255, 255, 255))
                    background.paste(img, mask=img.split()[-1])
                    img = background
                elif img.mode == 'LA':  # greyscale + alpha → plain greyscale
                    img = img.convert('L')

                # Final safety net: JPEG only accepts RGB or L here.
                if img.mode not in ['RGB', 'L']:
                    img = img.convert('RGB')

            # Save using the PIL-compatible format name.
            img.save(output_path, img_format, **save_args)
            return (index, "success")

    except Exception as e:
        return (index, f"error: {str(e)}")
84
+
85
def process_single_json(json_path: Path,
                        output_root: Path,
                        threads: int = 4,
                        overwrite: bool = False,
                        output_format: str = 'jpg') -> Tuple[str, int, int]:
    """Extract all images from one JSON array file, concurrently.

    Images are written to ``output_root/<json stem>/``; per-element errors
    are collected into ``process_errors.log`` inside that directory.
    Returns ``(file name, success count, error count)``.
    """
    start_time = time.time()
    file_stem = json_path.stem
    output_dir = output_root / file_stem
    output_dir.mkdir(parents=True, exist_ok=True)

    error_log = []
    success_count = 0
    skipped_count = 0

    try:
        with open(json_path, "r") as f:
            json_data = json.load(f)

        if not isinstance(json_data, list):
            raise ValueError("JSON根元素不是数组类型")

        # Fan out one task per array element.
        with ThreadPoolExecutor(max_workers=threads) as executor:
            futures = [
                executor.submit(
                    process_json_element,
                    element,
                    idx,
                    output_dir,
                    overwrite,
                    output_format
                )
                for idx, element in enumerate(json_data)
            ]

            # Tally statuses as tasks complete (order is not deterministic).
            for future in as_completed(futures):
                idx, status = future.result()
                if status == "success":
                    success_count += 1
                elif status == "skipped":
                    skipped_count += 1
                elif status.startswith("error"):
                    # status is "error: <msg>"; slice off the "error:" prefix.
                    error_log.append(f"元素{idx}错误: {status[6:]}")

        process_time = time.time() - start_time
        logging.info(
            f"文件 {file_stem} 处理完成 | "
            f"成功: {success_count} | "
            f"跳过: {skipped_count} | "
            f"错误: {len(error_log)} | "
            f"耗时: {process_time:.2f}s"
        )

        if error_log:
            (output_dir / "process_errors.log").write_text("\n".join(error_log))

        return (json_path.name, success_count, len(error_log))

    except Exception as e:
        # File-level failure (unreadable JSON, wrong root type, ...):
        # report one error and no successes.
        logging.error(f"文件处理失败: {str(e)}")
        return (json_path.name, 0, 1)
146
+
147
def batch_process_jsons(input_dir: Path,
                        output_root: Path,
                        threads: int = 4,
                        overwrite: bool = False,
                        output_format: str = 'jpg'):
    """Process every ``*.json`` file in ``input_dir`` concurrently.

    Each file is handled by :func:`process_single_json`; aggregate
    success/error counts are logged at the end.
    """
    source_dir = Path(input_dir)
    target_root = Path(output_root)

    if not source_dir.exists():
        raise FileNotFoundError(f"输入目录不存在: {source_dir}")

    candidates = list(source_dir.glob("*.json"))
    if not candidates:
        logging.warning("未找到JSON文件")
        return

    totals = {"success": 0, "errors": 0}

    with ThreadPoolExecutor(max_workers=threads) as pool:
        pending = {
            pool.submit(
                process_single_json,
                path,
                target_root,
                threads,
                overwrite,
                output_format,
            ): path
            for path in candidates
        }

        for done in as_completed(pending):
            try:
                _, ok, bad = done.result()
                totals["success"] += ok
                totals["errors"] += bad
            except Exception as exc:
                totals["errors"] += 1
                logging.error(f"处理异常: {str(exc)}")

    logging.info(f"\n{'='*40}")
    logging.info(f"处理完成文件总数: {len(candidates)}")
    logging.info(f"总成功图片数: {totals['success']}")
    logging.info(f"总错误数: {totals['errors']}")
191
+
192
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="处理JSON文件中的Base64图片(支持格式选择)")
    parser.add_argument("-i", "--input", required=True, help="输入目录路径")
    parser.add_argument("-o", "--output", required=True, help="输出目录路径")
    parser.add_argument("--threads", type=int, default=4, help="并发线程数(默认4)")
    parser.add_argument("--overwrite", action="store_true", help="覆盖已存在的文件")
    parser.add_argument("--format", choices=['png', 'jpg'], default='jpg',
                        help="输出图片格式(png/jpg,默认jpg)")

    cli = parser.parse_args()

    try:
        t0 = time.time()
        batch_process_jsons(
            input_dir=cli.input,
            output_root=cli.output,
            threads=cli.threads,
            overwrite=cli.overwrite,
            output_format=cli.format,
        )
        logging.info(f"\n总耗时: {time.time()-t0:.2f}秒")
    except Exception as exc:
        logging.error(f"程序异常终止: {str(exc)}")
        sys.exit(1)
utils/json/emb_jtj.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+ from concurrent.futures import ThreadPoolExecutor
5
+
6
def convert_medical_json(input_file, output_file, config=None):
    """Convert one EmbSpatial-style JSON file into the unified VQA schema.

    Returns ``True`` on success; on any failure the error is printed and
    ``False`` is returned.  ``config`` may override ``task_type``,
    ``source`` and ``domain``.
    """
    base_cfg = {
        "task_type": "Visual_Question_Answering",
        "source": "Embspatial",
        "domain": "Embodied_ai"
    }
    cfg = {**base_cfg, **(config or {})}
    stem = Path(input_file).stem

    try:
        with open(input_file, 'r', encoding='utf-8') as fh:
            payload = json.load(fh)
        records = payload if isinstance(payload, list) else [payload]

        results = []
        for pos, entry in enumerate(records):
            # Relative image path mirroring the dataset layout: data/<stem>/<n>.jpg
            image_path = "./" + (Path("data") / stem / f"{pos}.jpg").as_posix()

            # Letter-labelled options: A, B, C, ... (65 is ASCII 'A').
            choices = [
                {"id": chr(65 + n), "text": text.strip()}
                for n, text in enumerate(entry['answer_options'])
            ]

            # The answer is a numeric index into the option list.
            try:
                picked = int(entry['answer'])
                answer_ids = [choices[picked]['id']] if 0 <= picked < len(choices) else []
            except (ValueError, IndexError, KeyError):
                answer_ids = []

            # One annotation per detected object; segmentation is unavailable.
            boxes = [
                {
                    "bbox": obj.get("bbox", []),
                    "segmentation": {
                        "size": [],
                        "counts": ""
                    },
                    "category_name": obj.get("name", "")
                }
                for obj in entry.get("objects", [])
            ]

            results.append({
                "index": pos,
                "media_type": "image",
                "media_paths": image_path,
                "description": str(entry.get('relation', "")),
                "task_type": cfg['task_type'],
                "question": [entry.get('question', "")],
                "question_type": "multi-choice",
                "options": choices,
                "annotations": boxes,
                "answer": answer_ids,
                "source": cfg['source'],
                "domain": cfg['domain']
            })

        with open(output_file, 'w', encoding='utf-8') as fh:
            json.dump(results, fh, indent=2, ensure_ascii=False)
        return True

    except Exception as exc:
        print(f"转换失败: {input_file} → {str(exc)}")
        return False
71
+
72
def process_single_file(input_path, output_dir, config):
    """Convert one file, writing the result under ``output_dir`` with the same name."""
    try:
        destination = output_dir / input_path.name  # flat output layout
        return convert_medical_json(
            input_file=str(input_path),
            output_file=str(destination),
            config=config,
        )
    except Exception as exc:
        print(f"文件处理异常: {input_path} → {str(exc)}")
        return False
86
+
87
def batch_convert_json(input_dir, output_dir, config=None, max_workers=8):
    """Flat (non-recursive) batch converter backed by a thread pool."""
    src = Path(input_dir)
    dst = Path(output_dir)

    # Create the output directory once up front.
    dst.mkdir(parents=True, exist_ok=True)

    if not src.exists():
        raise FileNotFoundError(f"输入目录不存在: {input_dir}")

    ok = 0
    bad = 0

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Only JSON files directly inside the input directory.
        jobs = [
            pool.submit(
                process_single_file,
                input_path=candidate,
                output_dir=dst,
                config=config,
            )
            for candidate in src.glob('*.json')
            if candidate.is_file()
        ]

        # Tally outcomes in submission order.
        for job in jobs:
            if job.result():
                ok += 1
            else:
                bad += 1

    print(f"\n处理完成: 成功 {ok} 个文件,失败 {bad} 个文件")
    print(f"输出目录: {dst.resolve()}")
+
125
if __name__ == "__main__":
    run_config = {
        "source": "EmbSpatial",
        "task_type": "Object_Detection",
        "domain": "Embodied_ai"
    }

    try:
        batch_convert_json(
            input_dir="/mnt/data/users/zys/proj/vlm_reasoning/unprocessed_data/emb_ai",
            output_dir="/mnt/data/users/zys/proj/vlm_reasoning/dataset",
            config=run_config,
            # Cap the pool so oversubscription stays bounded on big hosts.
            max_workers=min(os.cpu_count() * 2, 32),
        )
    except Exception as exc:
        print(f"批量处理异常终止: {str(exc)}")
utils/json/ems_jtj.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from pathlib import Path
5
+ from concurrent.futures import ThreadPoolExecutor
6
+
7
def convert_medical_json(input_file, output_file, config=None):
    """Medical-data format converter (parses several option layouts).

    Reads one JSON file (a list of items, or a single item), extracts the
    question, any embedded "Options:" block and the answer, and writes the
    unified VQA schema to ``output_file``.  Returns True on success,
    False (with a printed message) on any failure.
    """
    default_config = {
        "task_type": "Visual_Question_Answering",
        "source": "Embspatial",
        "domain": "Embodied_ai"
    }
    # Caller-supplied config overrides the defaults key by key.
    cfg = {**default_config, **(config or {})}
    input_path = Path(input_file)
    file_stem = input_path.stem

    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)

        converted = []
        for index, item in enumerate(raw_data if isinstance(raw_data, list) else [raw_data]):
            # Relative image path mirroring the dataset layout: data/<stem>/<n>.jpg
            media_path = "./" + (Path("data") / file_stem / f"{index}.jpg").as_posix()

            # Object annotations.  NOTE: annotations ends up as a
            # single-element list wrapping the per-object list (the sibling
            # emb_jtj converter keeps it flat) — presumably intentional for
            # this dataset's schema; verify against downstream consumers.
            annotations = []
            objects_list = []
            for obj in item.get("objects", []):
                annotation = {
                    "bbox": obj.get("bbox", []),
                    "segmentation": {},
                    "category_name": obj.get("name", "")
                }
                objects_list.append(annotation)
            annotations.append(objects_list)

            # Only the first question of each item is used.
            questions_list = item.get('questions', [])
            question_for_eval = str(questions_list[0]) if questions_list else ""
            options = []
            question_text = ""
            question_type = "free-form"

            # Multi-format option parsing: anything after "Options:" is the
            # choice block; its presence switches the question to multi-choice.
            if "Options:" in question_for_eval:
                question_type = "multi-choice"
                q_parts = question_for_eval.split("Options:", 1)
                question_part = q_parts[0].strip()
                choices_part = q_parts[1].strip() if len(q_parts) > 1 else ""

                # Collapse all whitespace in the question text.
                question_text = re.sub(r'\s+', ' ', question_part.replace("\n", " ")).strip()

                # Counter for auto-generated option IDs (65 is ASCII 'A').
                option_id_counter = 65

                # Choices may be separated by newlines or semicolons.
                for line in re.split(r'[\n;]', choices_part):
                    line = line.strip()
                    if not line:
                        continue

                    # Lines with no ':' or '.' carry no explicit ID:
                    # split on semicolons and assign IDs automatically.
                    if re.match(r'^[^:\.]+$', line):
                        for sub_opt in re.split(r';\s*', line):
                            sub_opt = sub_opt.strip()
                            if sub_opt:
                                options.append({
                                    "id": chr(option_id_counter),
                                    "text": re.sub(r'\s+', ' ', sub_opt)
                                })
                                option_id_counter += 1
                    else:
                        # Standard "A:" / "A." prefixed option.
                        match = re.match(r'^([A-Za-z])[\.:]\s*(.+)$', line)
                        if match:
                            opt_id, opt_text = match.groups()
                            options.append({
                                "id": opt_id.upper(),
                                "text": re.sub(r'\s+', ' ', opt_text.strip())
                            })
                        else:
                            # Fallback: keep the text and auto-assign an ID.
                            options.append({
                                "id": chr(option_id_counter),
                                "text": re.sub(r'\s+', ' ', line.strip())
                            })
                            option_id_counter += 1

            else:
                # Free-form question: just normalise whitespace.
                question_text = re.sub(r'\s+', ' ', question_for_eval.replace("\n", " ")).strip()

            def match_answer(raw_answer, options_list):
                """Four-tier answer matcher: ID, exact text, containment, initial."""
                raw_answer = str(raw_answer).strip()
                if not raw_answer:
                    return ""

                # 1. Direct option-ID match (case-insensitive).
                id_map = {opt['id'].upper(): opt['id'] for opt in options_list}
                if raw_answer.upper() in id_map:
                    return id_map[raw_answer.upper()]

                # 2. Exact option-text match (case-insensitive).
                text_to_id = {opt['text'].lower(): opt['id'] for opt in options_list}
                if raw_answer.lower() in text_to_id:
                    return text_to_id[raw_answer.lower()]

                # 3. Containment match with punctuation stripped.
                clean_answer = re.sub(r'[^\w\s]', '', raw_answer).lower()
                for opt in options_list:
                    clean_text = re.sub(r'[^\w\s]', '', opt['text']).lower()
                    if clean_answer in clean_text:
                        return opt['id']

                # 4. A lone letter is treated as an option ID.
                if len(raw_answer) == 1 and raw_answer.isalpha():
                    return raw_answer.upper()

                return raw_answer  # last resort: return the raw value

            # Resolve the answer: matched-and-uppercased for multi-choice,
            # otherwise the raw answer text.
            raw_answer = item.get('answer', '')
            processed_answer = match_answer(raw_answer, options) if question_type == "multi-choice" else str(raw_answer)
            answer = [processed_answer.strip().upper() if question_type == "multi-choice" else processed_answer.strip()]

            converted.append({
                "index": index,
                "media_type": "image",
                "media_paths": media_path,
                "description": str(item.get('relation', "")),
                "task_type": cfg['task_type'],
                "question": [question_text],
                "question_type": question_type,
                "options": options,
                "annotations": annotations,
                "answer": answer,
                "source": cfg['source'],
                "domain": cfg['domain']
            })

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(converted, f, indent=2, ensure_ascii=False)
        return True

    except Exception as e:
        print(f"转换失败: {input_file} → {str(e)}")
        return False
153
+
154
def process_single_file(input_path, output_dir, config):
    """Convert one file into ``output_dir`` under the same file name."""
    try:
        target = output_dir / input_path.name
        return convert_medical_json(
            input_file=str(input_path),
            output_file=str(target),
            config=config,
        )
    except Exception as exc:
        print(f"文件处理异常: {input_path} → {str(exc)}")
        return False
166
+
167
def batch_convert_json(input_dir, output_dir, config=None, max_workers=8):
    """Parallel batch converter: processes every ``*.json`` in ``input_dir``.

    Results are written into ``output_dir`` (created if missing) under the
    same file names; a success/failure summary is printed at the end.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    success_count = 0
    failure_count = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for input_file in input_path.glob('*.json'):
            if input_file.is_file():
                futures.append(executor.submit(
                    process_single_file,
                    input_path=input_file,
                    output_dir=output_path,
                    config=config
                ))

        for future in futures:
            # FIX: the original called future.result() twice per future
            # (once for each counter); call it once and branch on it.
            if future.result():
                success_count += 1
            else:
                failure_count += 1

    print(f"\n处理完成: 成功 {success_count} 个,失败 {failure_count} 个")
    print(f"输出目录: {output_path.resolve()}")
+
194
if __name__ == "__main__":
    run_config = {
        "source": "EmbSpatial",
        "task_type": "Object-Detection",
        "domain": "Embodied_ai"
    }

    try:
        batch_convert_json(
            input_dir="/mnt/data/users/zys/proj/vlm_reasoning/unprocessed_data/emb_ai/EmbSpatial",
            output_dir="/mnt/data/users/zys/proj/vlm_reasoning/dataset",
            config=run_config,
            max_workers=os.cpu_count() * 2,
        )
    except Exception as exc:
        print(f"批处理异常: {str(exc)}")
utils/json/jsonl.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import json
3
+
4
def count_jsonl_objects(file_path):
    """Count valid JSON objects in a JSONL file, reporting invalid lines."""
    valid = 0
    with open(file_path, 'r', encoding='utf-8') as handle:
        for lineno, raw in enumerate(handle, 1):
            candidate = raw.strip()
            if not candidate:
                continue  # skip blank lines
            try:
                json.loads(candidate)
            except json.JSONDecodeError as err:
                print(f"解析第 {lineno} 行时发现无效JSON: {err}")
            else:
                valid += 1
    return valid
17
+
18
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("使用方法: python count_jsonl.py <文件路径>")
        sys.exit(1)

    target = sys.argv[1]
    try:
        total = count_jsonl_objects(target)
    except FileNotFoundError:
        print(f"错误:文件 {target} 未找到")
    except Exception as exc:
        print(f"发生未知错误: {exc}")
    else:
        print(f"文件 {target} 中共包含 {total} 个有效的JSON对象")
utils/json/mask.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import cv2
3
+ from matplotlib import pyplot as plt
4
+
5
def visualize_mask(points, size, save_path="mask.png"):
    """
    Input:
        points    : polygon coordinates [[x1,y1], [x2,y2], ...]
        size      : image size [height, width]
        save_path : where to save the output image (default: ./mask.png)

    Output:
        Displays and saves an image with the polygon vertices (red),
        numbered labels, the boundary (blue) and a semi-transparent
        green mask of the filled region.
    """
    # Validate inputs.
    assert len(size) == 2, "size must be [height, width]"
    height, width = size
    assert height > 0 and width > 0, "invalid image size"

    # 1. RGBA canvas so the mask can be drawn semi-transparently.
    canvas = np.zeros((height, width, 4), dtype=np.uint8)

    # 2. Round each vertex and clamp it into the image bounds.
    polygon = []
    for x, y in points:
        x = int(np.clip(round(x), 0, width-1))
        y = int(np.clip(round(y), 0, height-1))
        polygon.append([x, y])
    # fillPoly expects an array of polygons; this yields shape (1, N, 2).
    polygon = np.array([polygon], dtype=np.int32)

    # 3. Fill the polygon region (green channel + alpha for translucency).
    mask = np.zeros((height, width), dtype=np.uint8)
    cv2.fillPoly(mask, polygon, color=1)
    canvas[..., 1] = 80 * mask   # green channel
    canvas[..., 3] = 200 * mask  # alpha channel (mask area semi-transparent)

    # 4. Mark each vertex with a red dot and its 1-based index.
    for idx, (x, y) in enumerate(polygon[0]):
        color = (0, 0, 255, 255)  # pure red in BGRA
        cv2.circle(canvas, (x, y), radius=5, color=color, thickness=-1)
        cv2.putText(canvas, str(idx+1), (x+7, y+3),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255,255,255,255), 1)

    # 5. Draw the closed polygon outline in blue (BGRA).
    cv2.polylines(canvas, polygon, isClosed=True,
                  color=(255, 0, 0, 255), thickness=2)

    # 6. Save, then display via matplotlib (converting BGRA → RGBA so the
    #    transparency is handled correctly on screen).
    cv2.imwrite(save_path, canvas)

    plt.figure(figsize=(10, 6))
    plt.title(f"Mask Visualization (Saved to {save_path})")
    plt.imshow(cv2.cvtColor(canvas, cv2.COLOR_BGRA2RGBA))
    plt.axis('off')
    plt.show()
58
+
59
# ------------ Usage example ------------
if __name__ == "__main__":
    # Polygon vertices in (x, y) pixel coordinates.
    points = [
        [602.245, 290.396],
        [585.264, 289.452],
        [568.283, 301.716],
        [557.905, 309.264],
        [546.584, 324.358],
        [543.754, 342.283],
        [544.698, 365.867],
        [552.245, 381.905],
        [571.113, 398.886],
        [583.377, 407.377],
        [597.528, 409.264],
        [607.905, 398.886],
        [626.773, 387.566],
        [636.208, 384.735],
        [646.584, 381.905],
        [660.735, 370.584],
        [661.679, 335.679],
        [654.132, 313.037],
        [639.981, 302.660],
        [618.283, 293.226]
    ]
    # height=960, width=1280 (the original comment claimed 488x640 — stale).
    size = [960, 1280]

    # Render and save the visualisation.
    visualize_mask(
        points=points,
        size=size,
        save_path="custom_mask_visualization.png"
    )
utils/json/merge_json.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ from pathlib import Path
4
+
5
def merge_json_dataset(dataset_dir: str, output_name: str = "ChemQA") -> None:
    """
    Merge the data of every JSON file in ``dataset_dir`` into one dataset.

    Parameters:
        dataset_dir : directory containing the ``*.json`` parts
        output_name : name of the merged dataset (default ChemQA)
    """
    root = Path(dataset_dir)
    target = root / f"{output_name}.json"

    # All parts in sorted order, excluding a pre-existing output file.
    parts = [p for p in sorted(root.glob("*.json")) if p != target]
    if not parts:
        raise FileNotFoundError("未找到任何JSON文件")

    combined = []
    offset = 0  # running global index offset

    for part in parts:
        with open(part, 'r', encoding='utf-8') as fh:
            records = json.load(fh)

        # Shift per-file indices into the global index space.
        for record in records:
            record["index"] = offset + record["index"]

        combined.extend(records)
        offset += len(records)

    with open(target, 'w', encoding='utf-8') as fh:
        json.dump(combined, fh, indent=2, ensure_ascii=False)

    print(f"\n合并完成!共处理 {len(parts)} 个JSON文件")
    print(f"生成数据集: {output_name}.json")
    print(f"- 总条目数: {len(combined)} 条")
46
+
47
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="合并JSON格式的问答数据集")
    parser.add_argument('-i', dest="dataset_dir", required=True, help="JSON文件所在的目录路径")
    parser.add_argument('-o', "--output_name", default="ChemQA", help="输出数据集名称(默认为ChemQA)")

    cli = parser.parse_args()

    merge_json_dataset(dataset_dir=cli.dataset_dir,
                       output_name=cli.output_name)
utils/oss/oss_batch_upload.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import alibabacloud_oss_v2 as oss
4
+
5
def main():
    """Upload every ``*.jsonl`` file in a local directory to an OSS bucket.

    Credentials are taken from the environment
    (ALIBABA_CLOUD_ACCESS_KEY_ID / _SECRET).  Each file is uploaded to
    ``<key>/<filename>``; per-file failures are reported and do not stop
    the batch.
    """
    parser = argparse.ArgumentParser(description="批量上传JSONL文件到OSS")

    # Required arguments.
    parser.add_argument('--region', required=True, help='OSS存储空间所在区域')
    parser.add_argument('--bucket', required=True, help='目标存储空间名称')
    parser.add_argument('--key', required=True, help='OSS目标文件夹路径(如:/data)')
    parser.add_argument('--file_path', required=True, help='本地包含JSONL文件的文件夹路径')

    # Optional arguments.
    parser.add_argument('--endpoint', help='自定义访问端点')

    args = parser.parse_args()

    # Validate the local directory.
    if not os.path.isdir(args.file_path):
        raise ValueError(f"无效的目录路径: {args.file_path}")

    # Collect all JSONL files (top level only).
    jsonl_files = []
    for filename in os.listdir(args.file_path):
        if filename.endswith('.jsonl'):
            full_path = os.path.join(args.file_path, filename)
            if os.path.isfile(full_path):
                jsonl_files.append((full_path, filename))

    if not jsonl_files:
        print("未找到任何JSONL文件")
        return

    # Initialise the OSS client from environment credentials.
    credentials_provider = oss.credentials.EnvironmentVariableCredentialsProvider()
    cfg = oss.config.load_default()
    cfg.credentials_provider = credentials_provider
    cfg.region = args.region
    if args.endpoint:
        cfg.endpoint = args.endpoint

    client = oss.Client(cfg)
    uploader = client.uploader()

    # Normalise the destination prefix (no trailing slash).
    base_key = args.key.rstrip('/')

    # Upload each file in turn.
    for local_path, filename in jsonl_files:
        # FIX: the object key and log messages contained the literal
        # "(unknown)" instead of the actual file name (a template
        # substitution artifact); interpolate the real filename.
        oss_key = f"{base_key}/{filename}" if base_key else filename

        try:
            result = uploader.upload_file(
                oss.PutObjectRequest(
                    bucket=args.bucket,
                    key=oss_key,
                ),
                filepath=local_path
            )

            # Report the upload result.
            print(f" 成功上传 {filename}")
            print(f" OSS路径: {oss_key}")
            print(f" 状态码: {result.status_code}")
            print(f" 请求ID: {result.request_id}")
            print(f" ETag: {result.etag}\n")

        except Exception as e:
            print(f" 上传失败 {filename}")
            print(f" 错误信息: {str(e)}\n")

if __name__ == "__main__":
    main()
utils/oss/oss_upload.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import argparse
import alibabacloud_oss_v2 as oss

# Command-line parser for the "upload a single file to OSS" sample.
parser = argparse.ArgumentParser(description="upload file sample")

# --region: region in which the bucket is located (required).
parser.add_argument('--region', help='The region in which the bucket is located.', required=True)
# --bucket: name of the destination bucket (required).
parser.add_argument('--bucket', help='The name of the bucket.', required=True)
# --endpoint: optional custom domain for reaching OSS.
parser.add_argument('--endpoint', help='The domain names that other services can use to access OSS')
# --key: object key (name) inside OSS (required).
parser.add_argument('--key', help='The name of the object.', required=True)
# --file_path: local path of the file to upload, e.g. /Users/yourLocalPath/yourFileName (required).
parser.add_argument('--file_path', help='The path of Upload file.', required=True)

def main():
    """Upload one local file to OSS using environment-variable credentials."""
    # Parse the command-line arguments.
    args = parser.parse_args()

    # Load OSS credentials from environment variables
    # (ALIBABA_CLOUD_ACCESS_KEY_ID / ALIBABA_CLOUD_ACCESS_KEY_SECRET).
    credentials_provider = oss.credentials.EnvironmentVariableCredentialsProvider()

    # Start from the SDK's default configuration and attach the provider.
    cfg = oss.config.load_default()
    cfg.credentials_provider = credentials_provider

    # Region comes from the command line.
    cfg.region = args.region

    # Honour a custom endpoint when one was supplied.
    if args.endpoint is not None:
        cfg.endpoint = args.endpoint

    # Build the OSS client from the configuration.
    client = oss.Client(cfg)

    # Create the uploader helper.
    uploader = client.uploader()

    # Perform the upload.
    result = uploader.upload_file(
        oss.PutObjectRequest(
            bucket=args.bucket,  # destination bucket
            key=args.key,        # object name inside OSS
        ),
        filepath=args.file_path  # local file location
    )

    # Report the outcome: status code, request id, MD5, ETag, CRC64, etc.
    print(f'status code: {result.status_code},'
          f' request id: {result.request_id},'
          f' content md5: {result.headers.get("Content-MD5")},'
          f' etag: {result.etag},'
          f' hash crc64: {result.hash_crc64},'
          f' version id: {result.version_id},'
          f' server time: {result.headers.get("x-oss-server-time")},'
          )

# Script entry point.
if __name__ == "__main__":
    main()
utils/oss/testis.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
import os

# Quick sanity check that the Aliyun credentials are visible to this process.
for label, env_name in (
    ("ACCESS_KEY_ID", 'ALIBABA_CLOUD_ACCESS_KEY_ID'),
    ("ACCESS_KEY_SECRET", 'ALIBABA_CLOUD_ACCESS_KEY_SECRET'),
):
    print(f"{label}:", os.getenv(env_name))
utils/parquet/ChemQA_ptj.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import logging
5
+ import sys
6
+ from pathlib import Path
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ from ast import literal_eval
9
+ import time
10
+ from typing import Tuple
11
+ import pandas as pd
12
+
13
# Configure the root logger: INFO level, timestamped records, written to
# stdout so progress is visible when the script is piped or containerised.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    stream=sys.stdout
)
19
+
20
def process_row(args):
    """Convert one ChemQA dataframe row into a normalized JSON record (thread-safe).

    Parameters
    ----------
    args : tuple
        ``(index, row, file_stem)`` — ``row`` is a mapping exposing
        ``choices`` / ``label`` / ``description`` / ``question``;
        ``file_stem`` names the image sub-directory under ``data``.

    Returns
    -------
    dict or None
        The normalized record, or ``None`` when the row cannot be processed.
    """
    index, row, file_stem = args
    try:
        # Image path mirrors the parquet file name: ./data/<stem>/<index>.jpg
        media_path = "./" + (Path("data") / file_stem / f"{index}.jpg").as_posix()

        # 'choices' may arrive as a stringified Python list; parse defensively.
        choices = row.get('choices', [])
        if isinstance(choices, str):
            try:
                choices = literal_eval(choices)
            except (ValueError, SyntaxError):
                # BUGFIX: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit; only parse errors are expected.
                choices = []

        # Label the options A, B, C, ... in order.
        options = [
            {"id": chr(65 + i), "text": str(text)}
            for i, text in enumerate(choices)
        ]

        # 'label' holds the integer index of the correct option.
        label = row.get('label', '')
        answer = []
        try:
            label_idx = int(label)
            if 0 <= label_idx < len(options):
                answer = [options[label_idx]['id']]
        except (ValueError, TypeError):
            pass  # non-numeric / missing label -> no answer recorded

        return {
            "index": index,
            "media_type": "image",
            "media_paths": media_path,
            "description": row.get('description', ''),
            "task_type": "Vision-Question-Answer",
            "question": [row.get('question', '')],
            "question_type": "multi-choice",
            "options": options,
            "annotations": [],
            "answer": answer,
            "source": "ChemQA",
            "domain": "Chemistry"
        }
    except Exception as e:
        logging.error(f"处理行 {index} 时出错: {str(e)}")
        return None
68
+
69
def process_single_parquet(parquet_path: Path, output_root: Path) -> Tuple[int, int]:
    """Convert one parquet file into ``<output_root>/<stem>.json``.

    Rows are converted concurrently via :func:`process_row`; successful
    records are collected and dumped as a single JSON array.

    Returns
    -------
    Tuple[int, int]
        ``(success_count, error_count)``. On a file-level failure the error
        is logged and ``(0, total_rows)`` is returned.
    """
    start_time = time.time()
    file_stem = parquet_path.stem
    output_dir = output_root
    output_json = output_dir / f"{file_stem}.json"

    success_count = 0
    error_count = 0
    # BUGFIX: initialise before the try block — previously a failure inside
    # pd.read_parquet() made `return 0, total_rows` raise UnboundLocalError.
    total_rows = 0
    results = []

    try:
        df = pd.read_parquet(parquet_path)
        total_rows = len(df)

        logging.info(f"\n{'='*40}\nProcessing: {parquet_path.name}")
        logging.info(f"Output directory: {output_dir}")

        # Fan the rows out to a thread pool; conversion is light-weight.
        with ThreadPoolExecutor(max_workers=os.cpu_count() * 2) as executor:
            task_args = [(idx, row, file_stem) for idx, row in df.iterrows()]
            futures = [executor.submit(process_row, args) for args in task_args]

            for future in futures:
                result = future.result()
                if result:
                    results.append(result)
                    success_count += 1
                else:
                    error_count += 1

        # Persist all successfully converted rows.
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        # Summary report.
        process_time = time.time() - start_time
        logging.info(
            f"Processed: {success_count}/{total_rows} | "
            f"Errors: {error_count} | "
            f"Time: {process_time:.2f}s"
        )

        return success_count, error_count

    except Exception as e:
        logging.error(f"处理文件失败: {str(e)}")
        return 0, total_rows
117
+
118
def batch_process_parquets(input_dir: Path, output_root: Path):
    """Batch-convert every ``*.parquet`` file in *input_dir* to JSON under *output_root*.

    Raises ``FileNotFoundError`` when *input_dir* does not exist; returns
    silently (after a warning) when it contains no parquet files.
    """
    input_path = Path(input_dir)
    output_root = Path(output_root)

    if not input_path.exists():
        raise FileNotFoundError(f"输入目录不存在: {input_path}")

    parquet_files = list(input_path.glob("*.parquet"))
    if not parquet_files:
        logging.warning("未找到Parquet文件")
        return

    # Aggregate per-file results into overall counters.
    total_success, total_errors = 0, 0
    for parquet_file in parquet_files:
        ok, bad = process_single_parquet(parquet_file, output_root)
        total_success += ok
        total_errors += bad

    # Final summary.
    logging.info(f"\n{'='*40}\n批量处理完成")
    logging.info(f"处理文件总数: {len(parquet_files)}")
    logging.info(f"总成功条目: {total_success}")
    logging.info(f"总失败条目: {total_errors}")
142
+
143
+ if __name__ == "__main__":
144
+ parser = argparse.ArgumentParser(description='批量处理Parquet文件转JSON')
145
+ parser.add_argument('-i', '--input', required=True, help='输入目录路径')
146
+ parser.add_argument('-o', '--output', required=True, help='输出根目录路径')
147
+
148
+ args = parser.parse_args()
149
+
150
+ try:
151
+ start_time = time.time()
152
+ batch_process_parquets(
153
+ input_dir=args.input,
154
+ output_root=args.output
155
+ )
156
+ logging.info(f"\n总耗时: {time.time()-start_time:.2f}s")
157
+ except Exception as e:
158
+ logging.error(f"程序异常终止: {str(e)}")
159
+ sys.exit(1)
utils/parquet/MathVerse_ptj.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import logging
5
+ import sys
6
+ from pathlib import Path
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ from ast import literal_eval
9
+ import time
10
+ from typing import Tuple
11
+ import pandas as pd
12
+
13
# Configure the root logger: INFO level, timestamped records, written to
# stdout so progress is visible when the script is piped or containerised.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    stream=sys.stdout
)
19
+
20
def process_row(args):
    """Convert one MathVerse dataframe row into a normalized JSON record (thread-safe).

    ``args`` is ``(index, row, file_stem)``. For multi-choice rows the
    question text and the lettered options are recovered from the
    ``question_for_eval`` column; free-form rows take the question from
    ``query``. Returns the record dict, or ``None`` on error.
    """
    index, row, file_stem = args
    try:
        # ================== basic fields ==================
        media_path = "./" + (Path("data") / file_stem / f"{index}.jpg").as_posix()
        raw_question_type = row.get("question_type", "").strip().lower()

        # ================== question-type normalisation ==================
        # Only the two known labels are kept verbatim; anything else is
        # logged and passed through unchanged.
        if raw_question_type == "multi-choice":
            formatted_question_type = "multi-choice"
        elif raw_question_type == "free-form":
            formatted_question_type = "free-form"
        else:
            logging.warning(f"未知的题目类型: {raw_question_type}")
            formatted_question_type = raw_question_type

        # ================== shared fields ==================
        description = str(row.get("problem_version", "")).strip()

        # ================== key-field defaults ==================
        options = []
        answer = []
        question_text = ""

        # ================== per-type handling ==================
        if formatted_question_type == "multi-choice":
            # Question text and options both live in 'question_for_eval'.
            question_for_eval = str(row.get("question_for_eval", ""))

            # Split the prompt from the option list on the "Choices:" marker.
            if "Choices:" in question_for_eval:
                q_parts = question_for_eval.split("Choices:", 1)
                question_part = q_parts[0].strip()
                choices_part = q_parts[1].strip()
            else:
                question_part = question_for_eval.strip()
                choices_part = ""

            # Collapse newlines and repeated spaces in the question text.
            question_text = " ".join(question_part.replace("\n", " ").split())

            # Parse one option per line from the choices section.
            for line in choices_part.split('\n'):
                line = line.strip()
                if not line:
                    continue

                # Accept both "A: text" and "A. text" separators.
                if ':' in line:
                    id_text = line.split(':', 1)
                elif '.' in line:
                    id_text = line.split('.', 1)
                else:
                    continue  # skip lines that match neither format

                if len(id_text) == 2:
                    option_id = id_text[0].strip().upper()
                    option_text = " ".join(id_text[1].replace("\n", " ").split())
                    options.append({"id": option_id, "text": option_text})

            # The answer column already holds the option letter.
            raw_answer = str(row.get("answer", "")).strip().upper()
            if raw_answer:
                answer = [raw_answer]

        elif formatted_question_type == "free-form":
            # Question text comes from 'query'; whitespace is normalised.
            question_text = " ".join(str(row.get("query", "")).replace("\n", " ").split())

            # Answers are normalised to a single cleaned string.
            raw_answer = row.get("answer", "")
            if pd.isna(raw_answer):
                answer = [""]
            else:
                cleaned_answer = " ".join(str(raw_answer).strip().split())
                answer = [cleaned_answer]

        # ================== result object ==================
        return {
            "index": index,
            "media_type": "image",
            "media_paths": media_path,
            "description": description,
            "task_type": "Vision-Question-Answer",
            "question": [question_text],
            "question_type": formatted_question_type,
            "options": options,
            "annotations": [],
            "answer": answer,
            "source": "MathVerse",
            "domain": "Math"
        }

    except Exception as e:
        logging.error(f"处理行 {index} 时出错: {str(e)}")
        return None
118
+
119
def process_single_parquet(parquet_path: Path, output_root: Path) -> Tuple[int, int]:
    """Convert one parquet file into ``<output_root>/<stem>.json``.

    Rows are converted concurrently via :func:`process_row`; successful
    records are collected and dumped as a single JSON array.

    Returns
    -------
    Tuple[int, int]
        ``(success_count, error_count)``. On a file-level failure the error
        is logged and ``(0, total_rows)`` is returned.
    """
    start_time = time.time()
    file_stem = parquet_path.stem
    output_dir = output_root
    output_json = output_dir / f"{file_stem}.json"

    success_count = 0
    error_count = 0
    # BUGFIX: initialise before the try block — previously a failure inside
    # pd.read_parquet() made `return 0, total_rows` raise UnboundLocalError.
    total_rows = 0
    results = []

    try:
        df = pd.read_parquet(parquet_path)
        total_rows = len(df)

        logging.info(f"\n{'='*40}\nProcessing: {parquet_path.name}")
        logging.info(f"Output directory: {output_dir}")

        # Fan the rows out to a thread pool; conversion is light-weight.
        with ThreadPoolExecutor(max_workers=os.cpu_count() * 2) as executor:
            task_args = [(idx, row, file_stem) for idx, row in df.iterrows()]
            futures = [executor.submit(process_row, args) for args in task_args]

            for future in futures:
                result = future.result()
                if result:
                    results.append(result)
                    success_count += 1
                else:
                    error_count += 1

        # Persist all successfully converted rows.
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        # Summary report.
        process_time = time.time() - start_time
        logging.info(
            f"Processed: {success_count}/{total_rows} | "
            f"Errors: {error_count} | "
            f"Time: {process_time:.2f}s"
        )

        return success_count, error_count

    except Exception as e:
        logging.error(f"处理文件失败: {str(e)}")
        return 0, total_rows
167
+
168
def batch_process_parquets(input_dir: Path, output_root: Path):
    """Batch-convert every ``*.parquet`` file in *input_dir* to JSON under *output_root*.

    Raises ``FileNotFoundError`` when *input_dir* does not exist; returns
    silently (after a warning) when it contains no parquet files.
    """
    input_path = Path(input_dir)
    output_root = Path(output_root)

    if not input_path.exists():
        raise FileNotFoundError(f"输入目录不存在: {input_path}")

    parquet_files = list(input_path.glob("*.parquet"))
    if not parquet_files:
        logging.warning("未找到Parquet文件")
        return

    # Aggregate per-file results into overall counters.
    total_success, total_errors = 0, 0
    for parquet_file in parquet_files:
        ok, bad = process_single_parquet(parquet_file, output_root)
        total_success += ok
        total_errors += bad

    # Final summary.
    logging.info(f"\n{'='*40}\n批量处理完成")
    logging.info(f"处理文件总数: {len(parquet_files)}")
    logging.info(f"总成功条目: {total_success}")
    logging.info(f"总失败条目: {total_errors}")
192
+
193
+ if __name__ == "__main__":
194
+ parser = argparse.ArgumentParser(description='批量处理Parquet文件转JSON')
195
+ parser.add_argument('-i', '--input', required=True, help='输入目录路径')
196
+ parser.add_argument('-o', '--output', required=True, help='输出根目录路径')
197
+
198
+ args = parser.parse_args()
199
+
200
+ try:
201
+ start_time = time.time()
202
+ batch_process_parquets(
203
+ input_dir=args.input,
204
+ output_root=args.output
205
+ )
206
+ logging.info(f"\n总耗时: {time.time()-start_time:.2f}s")
207
+ except Exception as e:
208
+ logging.error(f"程序异常终止: {str(e)}")
209
+ sys.exit(1)
utils/parquet/MathVision_ptj.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import logging
5
+ import sys
6
+ from pathlib import Path
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ from ast import literal_eval
9
+ import time
10
+ from typing import Tuple
11
+ import pandas as pd
12
+
13
# Configure the root logger: INFO level, timestamped records, written to
# stdout so progress is visible when the script is piped or containerised.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    stream=sys.stdout
)
19
+
20
def process_row(args):
    """Convert one MathVision dataframe row into a normalized JSON record (thread-safe).

    ``args`` is ``(index, row, file_stem)``. The question type is inferred
    from the ``options`` column: rows without options become free-form.
    Returns the record dict, or ``None`` on error.
    """
    index, row, file_stem = args
    try:
        # ================== basic fields ==================
        media_path = "./" + (Path("data") / file_stem / f"{index}.jpg").as_posix()
        description = row.get("subject", "")  # description taken from 'subject'

        # ========= infer the question type from the options column =========
        # 'options' may be a real list, a stringified list, or an array-like.
        options_data = row.get("options", [])
        if isinstance(options_data, str):
            try:
                options_data = literal_eval(options_data)
            except (ValueError, SyntaxError):
                # BUGFIX: was a bare `except:` which also swallowed
                # KeyboardInterrupt/SystemExit; only parse errors are expected.
                options_data = []
        elif not isinstance(options_data, list):
            options_data = list(options_data)  # e.g. numpy array from parquet
        formatted_question_type = "free-form" if len(options_data) == 0 else "multi-choice"

        # ================== per-type handling ==================
        options = []
        answer = []

        if formatted_question_type == "multi-choice":
            # Label the options A, B, C, ... in order.
            options = [
                {"id": chr(65 + i), "text": str(text).strip()}
                for i, text in enumerate(options_data)
            ]

            # Map the answer text back onto its option letter.
            answer_text = str(row.get("answer", "")).strip()
            for option in options:
                if option["text"] == answer_text:
                    answer = [option["id"]]
                    break

        else:  # free-form
            raw_answer = row.get("answer", "")
            # Treat NaN and stringified null markers as "no answer".
            if pd.isna(raw_answer) or raw_answer in ["nan", "None"]:
                answer = [""]
            else:
                # Normalise to a single whitespace-cleaned string.
                cleaned_answer = " ".join(str(raw_answer).strip().split())
                answer = [cleaned_answer]

        # ================== result object ==================
        return {
            "index": index,
            "media_type": "image",
            "media_paths": media_path,
            "description": description,
            "task_type": "Vision-Question-Answer",
            "question": [row.get('question', '')],
            "question_type": formatted_question_type,
            "options": options,
            "annotations": [],
            "answer": answer,
            "source": "MathVision",
            "domain": "Math"
        }

    except Exception as e:
        logging.error(f"处理行 {index} 时出错: {str(e)}")
        return None
87
+
88
def process_single_parquet(parquet_path: Path, output_root: Path) -> Tuple[int, int]:
    """Convert one parquet file into ``<output_root>/<stem>.json``.

    Rows are converted concurrently via :func:`process_row`; successful
    records are collected and dumped as a single JSON array.

    Returns
    -------
    Tuple[int, int]
        ``(success_count, error_count)``. On a file-level failure the error
        is logged and ``(0, total_rows)`` is returned.
    """
    start_time = time.time()
    file_stem = parquet_path.stem
    output_dir = output_root
    output_json = output_dir / f"{file_stem}.json"

    success_count = 0
    error_count = 0
    # BUGFIX: initialise before the try block — previously a failure inside
    # pd.read_parquet() made `return 0, total_rows` raise UnboundLocalError.
    total_rows = 0
    results = []

    try:
        df = pd.read_parquet(parquet_path)
        total_rows = len(df)

        logging.info(f"\n{'='*40}\nProcessing: {parquet_path.name}")
        logging.info(f"Output Directory: {output_dir.name}")

        # Fan the rows out to a thread pool (capped at 32 workers).
        with ThreadPoolExecutor(max_workers = min(os.cpu_count() * 2, 32)) as executor:
            task_args = [(idx, row, file_stem) for idx, row in df.iterrows()]
            futures = [executor.submit(process_row, args) for args in task_args]

            for future in futures:
                result = future.result()
                if result:
                    results.append(result)
                    success_count += 1
                else:
                    error_count += 1

        # Persist all successfully converted rows.
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        # Summary report.
        process_time = time.time() - start_time
        logging.info(
            f"Processed: {success_count}/{total_rows} | "
            f"Errors: {error_count} | "
            f"Time: {process_time:.2f}s"
        )

        return success_count, error_count

    except Exception as e:
        logging.error(f"处理文件失败: {str(e)}")
        return 0, total_rows
136
+
137
def batch_process_parquets(input_dir: Path, output_root: Path):
    """Batch-convert every ``*.parquet`` file in *input_dir* to JSON under *output_root*.

    Raises ``FileNotFoundError`` when *input_dir* does not exist; returns
    silently (after a warning) when it contains no parquet files.
    """
    input_path = Path(input_dir)
    output_root = Path(output_root)

    if not input_path.exists():
        raise FileNotFoundError(f"输入目录不存在: {input_path}")

    parquet_files = list(input_path.glob("*.parquet"))
    if not parquet_files:
        logging.warning("未找到Parquet文件")
        return

    # Aggregate per-file results into overall counters.
    total_success, total_errors = 0, 0
    for parquet_file in parquet_files:
        ok, bad = process_single_parquet(parquet_file, output_root)
        total_success += ok
        total_errors += bad

    # Final summary.
    logging.info(f"\n{'='*40}\n批量处理完成")
    logging.info(f"处理文件总数: {len(parquet_files)}")
    logging.info(f"总成功条目: {total_success}")
    logging.info(f"总失败条目: {total_errors}")
161
+
162
+ if __name__ == "__main__":
163
+ parser = argparse.ArgumentParser(description='批量处理Parquet文件转JSON')
164
+ parser.add_argument('-i', '--input', required=True, help='输入目录路径')
165
+ parser.add_argument('-o', '--output', required=True, help='输出根目录路径')
166
+
167
+ args = parser.parse_args()
168
+
169
+ try:
170
+ start_time = time.time()
171
+ batch_process_parquets(
172
+ input_dir=args.input,
173
+ output_root=args.output
174
+ )
175
+ logging.info(f"\n总耗时: {time.time()-start_time:.2f}s")
176
+ except Exception as e:
177
+ logging.error(f"程序异常终止: {str(e)}")
178
+ sys.exit(1)
utils/parquet/MathVista_ptj.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import logging
5
+ import sys
6
+ from pathlib import Path
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ from ast import literal_eval
9
+ import time
10
+ from typing import Tuple
11
+ import pandas as pd
12
+
13
# Configure the root logger: INFO level, timestamped records, written to
# stdout so progress is visible when the script is piped or containerised.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    stream=sys.stdout
)
19
+
20
def process_row(args):
    """Convert one MathVista dataframe row into a normalized JSON record (thread-safe).

    ``args`` is ``(index, row, file_stem)``. ``question_type`` values
    ``multi_choice`` / ``free_form`` are normalized to hyphenated form;
    the optional "Hint:" prefix of ``query`` becomes the description.
    Returns the record dict, or ``None`` on error.
    """
    index, row, file_stem = args
    try:
        # ================== basic fields ==================
        media_path = "./" + (Path("data") / file_stem / f"{index}.jpg").as_posix()
        question_type = row.get("question_type", "")

        # ================== question-type normalisation ==================
        if question_type == "multi_choice":
            formatted_question_type = "multi-choice"
        elif question_type == "free_form":
            formatted_question_type = "free-form"
        else:
            logging.warning(f"未知的题目类型: {question_type}")
            formatted_question_type = question_type.replace("_", "-")

        # ================== parse the 'query' column ==================
        query = row.get("query", "")
        description = ""

        # Only the Hint part (text before the first "Question:") is kept.
        if "Hint:" in query:
            hint_part = query.split("Question:")[0].replace("Hint:", "").strip()
            # Collapse newlines and repeated spaces.
            description = " ".join(hint_part.replace("\n", " ").split())

        # ================== per-type handling ==================
        options = []
        answer = []

        if formatted_question_type == "multi-choice":
            # 'choices' may arrive as a stringified Python list.
            choices = row.get("choices", [])
            if isinstance(choices, str):
                try:
                    choices = literal_eval(choices)
                except (ValueError, SyntaxError):
                    # BUGFIX: was a bare `except:` which also swallowed
                    # KeyboardInterrupt/SystemExit; only parse errors expected.
                    choices = []

            # Label the options A, B, C, ... in order.
            options = [
                {"id": chr(65 + i), "text": str(text).strip()}
                for i, text in enumerate(choices)
            ]

            # Map the answer text back onto its option letter.
            label = row.get("answer", "")
            answer = []
            if label is not None:
                label_text = str(label).strip()
                for option in options:
                    if option["text"] == label_text:
                        answer.append(option["id"])
                        break

        elif formatted_question_type == "free-form":
            raw_answer = row.get("answer", "")
            # Handle NaN / missing answers.
            if pd.isna(raw_answer):
                answer = [""]
            else:
                # Normalise to a single whitespace-cleaned string.
                cleaned_answer = " ".join(str(raw_answer).strip().split())
                answer = [cleaned_answer]

        # ================== result object ==================
        return {
            "index": index,
            "media_type": "image",
            "media_paths": media_path,
            "description": description,
            "task_type": "Vision-Question-Answer",
            "question": [row.get('question', '')],
            "question_type": formatted_question_type,
            "options": options,
            "annotations": [],
            "answer": answer,
            "source": "MathVista",
            "domain": "Math"
        }

    except Exception as e:
        logging.error(f"处理行 {index} 时出错: {str(e)}\n原始数据: {row.to_dict()}")
        return None
108
+
109
def process_single_parquet(parquet_path: Path, output_root: Path) -> Tuple[int, int]:
    """Convert one parquet file into ``<output_root>/<stem>.json``.

    Rows are converted concurrently via :func:`process_row`; successful
    records are collected and dumped as a single JSON array.

    Returns
    -------
    Tuple[int, int]
        ``(success_count, error_count)``. On a file-level failure the error
        is logged and ``(0, total_rows)`` is returned.
    """
    start_time = time.time()
    file_stem = parquet_path.stem
    output_dir = output_root
    output_json = output_dir / f"{file_stem}.json"

    success_count = 0
    error_count = 0
    # BUGFIX: initialise before the try block — previously a failure inside
    # pd.read_parquet() made `return 0, total_rows` raise UnboundLocalError.
    total_rows = 0
    results = []

    try:
        df = pd.read_parquet(parquet_path)
        total_rows = len(df)

        logging.info(f"\n{'='*40}\nProcessing: {parquet_path.name}")
        logging.info(f"Output directory: {output_dir}")

        # Fan the rows out to a thread pool; conversion is light-weight.
        with ThreadPoolExecutor(max_workers=os.cpu_count() * 2) as executor:
            task_args = [(idx, row, file_stem) for idx, row in df.iterrows()]
            futures = [executor.submit(process_row, args) for args in task_args]

            for future in futures:
                result = future.result()
                if result:
                    results.append(result)
                    success_count += 1
                else:
                    error_count += 1

        # Persist all successfully converted rows.
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        # Summary report.
        process_time = time.time() - start_time
        logging.info(
            f"Processed: {success_count}/{total_rows} | "
            f"Errors: {error_count} | "
            f"Time: {process_time:.2f}s"
        )

        return success_count, error_count

    except Exception as e:
        logging.error(f"处理文件失败: {str(e)}")
        return 0, total_rows
157
+
158
def batch_process_parquets(input_dir: Path, output_root: Path):
    """Batch-convert every ``*.parquet`` file in *input_dir* to JSON under *output_root*.

    Raises ``FileNotFoundError`` when *input_dir* does not exist; returns
    silently (after a warning) when it contains no parquet files.
    """
    input_path = Path(input_dir)
    output_root = Path(output_root)

    if not input_path.exists():
        raise FileNotFoundError(f"输入目录不存在: {input_path}")

    parquet_files = list(input_path.glob("*.parquet"))
    if not parquet_files:
        logging.warning("未找到Parquet文件")
        return

    # Aggregate per-file results into overall counters.
    total_success, total_errors = 0, 0
    for parquet_file in parquet_files:
        ok, bad = process_single_parquet(parquet_file, output_root)
        total_success += ok
        total_errors += bad

    # Final summary.
    logging.info(f"\n{'='*40}\n批量处理完成")
    logging.info(f"处理文件总数: {len(parquet_files)}")
    logging.info(f"总成功条目: {total_success}")
    logging.info(f"总失败条目: {total_errors}")
182
+
183
+ if __name__ == "__main__":
184
+ parser = argparse.ArgumentParser(description='批量处理Parquet文件转JSON')
185
+ parser.add_argument('-i', '--input', required=True, help='输入目录路径')
186
+ parser.add_argument('-o', '--output', required=True, help='输出根目录路径')
187
+
188
+ args = parser.parse_args()
189
+
190
+ try:
191
+ start_time = time.time()
192
+ batch_process_parquets(
193
+ input_dir=args.input,
194
+ output_root=args.output
195
+ )
196
+ logging.info(f"\n总耗时: {time.time()-start_time:.2f}s")
197
+ except Exception as e:
198
+ logging.error(f"程序异常终止: {str(e)}")
199
+ sys.exit(1)
utils/parquet/merge_jp.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import shutil
4
+ from pathlib import Path
5
+ import pyarrow.parquet as pq
6
+
7
def merge_chemqa_dataset(dataset_dir: str, data_dir: str, output_name: str = "ChemQA") -> None:
    """Merge the per-parquet image folders and JSON files into one dataset.

    Parameters
    ----------
    dataset_dir : directory containing the ``.parquet`` source files.
    data_dir : root directory holding one image folder per parquet part
        (the matching ``<part>.json`` files live in its parent directory).
    output_name : name of the merged dataset (image folder and JSON file).

    Raises ``FileNotFoundError`` when no parquet files, an expected image
    directory, or an expected JSON file is missing.

    NOTE(review): assumes all images are ``.jpg`` and named ``<idx>.jpg``
    starting at 0 in each part — confirm against the extraction step.
    """
    # Normalise the inputs to Path objects.
    dataset_path = Path(dataset_dir)
    data_path = Path(data_dir)

    # Collect all parquet parts to merge, sorted by file name so the
    # global re-indexing is deterministic.
    parquet_files = sorted(dataset_path.glob("*.parquet"))
    if not parquet_files:
        raise FileNotFoundError("未找到任何Parquet文件")

    # Target image directory for the merged dataset.
    chemqa_img_dir = data_path / output_name
    chemqa_img_dir.mkdir(exist_ok=True)

    merged_data = []
    global_offset = 0  # running index offset across all parts

    for pq_file in parquet_files:
        part_name = pq_file.stem  # file name without extension, e.g. test-00000-of-00001
        part_img_dir = data_path / part_name

        # The part's image folder must already exist.
        if not part_img_dir.exists():
            raise FileNotFoundError(f"图片目录不存在: {part_img_dir}")

        # Row count read from the parquet metadata (no data load needed).
        with pq.ParquetFile(pq_file) as pf:
            num_rows = pf.metadata.num_rows

        # Copy the part's images into the merged folder, re-numbered by the
        # global offset so indices stay unique across parts.
        print(f"合并 {part_name} 的 {num_rows} 张图片...")
        for idx in range(num_rows):
            src = part_img_dir / f"{idx}.jpg"  # images assumed to be jpg
            dst = chemqa_img_dir / f"{global_offset + idx}.jpg"
            shutil.copy2(src, dst)

        # The part's JSON lives in the parent of data_dir.
        json_file = data_path.parent / f"{part_name}.json"
        if not json_file.exists():
            raise FileNotFoundError(f"JSON文件不存在: {json_file}")

        with open(json_file, 'r', encoding='utf-8') as f:
            part_data = json.load(f)

        # Shift each record's index and rewrite its image path to the
        # merged location.
        for item in part_data:
            original_idx = item["index"]
            item["index"] = global_offset + original_idx
            item["media_paths"] = f"./data/{output_name}/{global_offset + original_idx}.jpg"

        merged_data.extend(part_data)
        global_offset += num_rows

    # Write the merged JSON next to the per-part JSON files.
    output_json = data_path.parent / f"{output_name}.json"
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(merged_data, f, indent=2, ensure_ascii=False)

    print(f"\n合并完成!共处理 {len(parquet_files)} 个部分")
    print(f"生成数据集: {output_name}")
    print(f"- 图片总数: {global_offset} 张(位于 {chemqa_img_dir})")
    print(f"- JSON条目: {len(merged_data)} 条(位于 {output_json})")
77
+
78
+ if __name__ == "__main__":
79
+ parser = argparse.ArgumentParser(description="合并问答数据集")
80
+ parser.add_argument('-i', dest="dataset_dir", required=True, help="Parquet文件所在的目录路径")
81
+ parser.add_argument('-ip', dest="data_dir", required=True, help="图片文件夹根目录路径(JSON在其父目录)")
82
+ parser.add_argument('-o', "--output_name", default="ChemQA", help="输出数据集名称(默认为ChemQA)")
83
+
84
+ args = parser.parse_args()
85
+
86
+ merge_chemqa_dataset(
87
+ dataset_dir=args.dataset_dir,
88
+ data_dir=args.data_dir,
89
+ output_name=args.output_name
90
+ )
utils/parquet/pa_to_p.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import pandas as pd
3
+ from PIL import Image
4
+ from pathlib import Path
5
+ import io
6
+ import sys
7
+ import time
8
+ import logging
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+ from typing import Tuple, List
11
+
12
# Configure the root logger: INFO level, timestamped records, written to
# stdout so progress is visible when the script is piped or containerised.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    stream=sys.stdout
)
18
+
19
def process_image_row(row_tuple: Tuple[int, pd.Series],
                      output_dir: Path,
                      overwrite: bool = False,
                      output_format: str = 'png') -> Tuple[int, str]:
    """Decode one dataframe row's image bytes and save them to disk (thread-safe).

    Parameters
    ----------
    row_tuple : ``(index, row)``; ``row['image']`` holds the encoded image,
        either as raw ``bytes`` or as a dict with a ``'bytes'`` entry.
    output_dir : directory receiving ``<index>.<ext>``.
    overwrite : when False, an existing target file is left untouched.
    output_format : ``'png'`` or ``'jpg'`` (case-insensitive).

    Returns
    -------
    Tuple[int, str]
        ``(index, status)`` where status is ``'success'``, ``'skipped'``
        or ``'error: ...'``.
    """
    idx, row = row_tuple

    # BUGFIX: normalise the format once. The original lower-cased it when
    # picking the extension but later compared the *raw* string
    # (`output_format == 'jpg'`), so an input like 'JPG' produced a .jpg
    # file name while skipping the JPEG mode conversions below, which can
    # make Pillow fail on RGBA/palette images.
    fmt = output_format.lower()
    if fmt == 'jpg':
        file_ext = 'jpg'
        img_format = 'JPEG'
        save_args = {'quality': 95}       # JPEG quality
    elif fmt == 'png':
        file_ext = 'png'
        img_format = 'PNG'
        save_args = {'compress_level': 6}  # PNG compression level
    else:
        raise ValueError(f"Unsupported format: {output_format}")

    output_path = output_dir / f"{idx}.{file_ext}"

    try:
        # Skip files that already exist unless overwriting was requested.
        if not overwrite and output_path.exists():
            return (idx, "skipped")

        # The parquet column stores either raw bytes or a {'bytes': ...} dict.
        img_data = row['image']
        if isinstance(img_data, dict):
            img_bytes = img_data['bytes']
        elif isinstance(img_data, bytes):
            img_bytes = img_data
        else:
            raise ValueError("Unknown image storage format")

        with Image.open(io.BytesIO(img_bytes)) as img:
            # CMYK is supported by neither PNG nor JPEG output here.
            if img.mode == 'CMYK':
                img = img.convert('RGB')

            if fmt == 'jpg':
                # JPEG cannot store alpha/palette modes: flatten onto white.
                if img.mode in ['P', 'PA']:       # palette modes
                    img = img.convert('RGBA')
                    background = Image.new('RGB', img.size, (255, 255, 255))
                    background.paste(img, mask=img.split()[-1])
                    img = background
                elif img.mode == 'LA':            # grayscale with alpha
                    img = img.convert('L')
                elif img.mode in ['RGBA', 'RGBa']:  # RGB with alpha
                    background = Image.new('RGB', img.size, (255, 255, 255))
                    background.paste(img, mask=img.split()[-1])
                    img = background
                # Last resort: force any remaining non-standard mode to RGB.
                if img.mode not in ['RGB', 'L']:
                    img = img.convert('RGB')
            # PNG keeps the original mode (alpha and palettes are legal).

            img.save(output_path, img_format, **save_args)
        return (idx, "success")

    except Exception as e:
        return (idx, f"error: {str(e)}")
86
+
87
def process_single_parquet(parquet_path: Path,
                           output_root: Path,
                           threads: int = 4,
                           overwrite: bool = False,
                           output_format: str = 'png') -> Tuple[int, int]:
    """Extract all images of one parquet file concurrently.

    Images are written to ``<output_root>/<parquet stem>/`` via
    :func:`process_image_row`; per-row failures are collected into a
    ``process_errors.log`` inside that directory.

    Returns ``(success_count, error_count)``; ``(0, 0)`` on a file-level
    failure (the error is logged).
    """
    start_time = time.time()
    file_name = parquet_path.stem
    output_dir = output_root / f"{file_name}"
    output_dir.mkdir(parents=True, exist_ok=True)

    error_log = []
    success_count = 0
    skipped_count = 0

    logging.info(f"\n{'='*40}\nProcessing: {parquet_path.name}")
    logging.info(f"Output format: {output_format.upper()}")
    logging.info(f"Output directory: {output_dir}")

    try:
        df = pd.read_parquet(parquet_path)
        total_rows = len(df)

        # Fan the rows out to a thread pool; each future saves one image.
        with ThreadPoolExecutor(max_workers=threads) as executor:
            futures = [
                executor.submit(
                    process_image_row,
                    (idx, row),
                    output_dir,
                    overwrite,
                    output_format  # forward the chosen output format
                )
                for idx, row in df.iterrows()
            ]

            # Collect results as they complete.
            for future in as_completed(futures):
                idx, status = future.result()
                if status == "success":
                    success_count += 1
                elif status.startswith("error"):
                    error_log.append(f"line {idx} error: {status[6:]}")
                elif status == "skipped":
                    skipped_count += 1

        # Build the processing report.
        process_time = time.time() - start_time
        report = (
            f"Results: {success_count}/{total_rows} succeed | "
            f"Skipped: {skipped_count} | "
            f"Errors: {len(error_log)}\n"
            f"Time: {process_time:.2f}s | "
            f"Speed: {total_rows/process_time:.2f} rows/s"
        )
        logging.info(report)

        # Persist the error log next to the images, if anything failed.
        if error_log:
            (output_dir / "process_errors.log").write_text("\n".join(error_log))

        return success_count, len(error_log)

    except Exception as e:
        logging.error(f"File processing failed: {str(e)}")
        return 0, 0
153
+
154
def batch_process_parquets(input_dir: Path,
                           output_root: Path,
                           threads: int = 4,
                           overwrite: bool = False,
                           output_format: str = 'png'):
    """Run the image-extraction pipeline over every parquet file in *input_dir*.

    Raises ``FileNotFoundError`` when *input_dir* does not exist; returns
    silently (after a warning) when no parquet files are found.
    """
    input_path = Path(input_dir)
    output_root = Path(output_root)

    if not input_path.exists():
        raise FileNotFoundError(f"Input directory not found: {input_path}")

    parquet_files = list(input_path.glob("*.parquet"))
    if not parquet_files:
        logging.warning("No Parquet files found")
        return

    # Aggregate per-file results into overall counters.
    ok_total, err_total = 0, 0
    for parquet_file in parquet_files:
        ok, err = process_single_parquet(
            parquet_file,
            output_root,
            threads=threads,
            overwrite=overwrite,
            output_format=output_format
        )
        ok_total += ok
        err_total += err

    # Final statistics report.
    logging.info(f"\n{'='*40}\nBatch processing completed")
    logging.info(f"Total processed files: {len(parquet_files)}")
    logging.info(f"Total successful images: {ok_total}")
    logging.info(f"Total errors: {err_total}")
189
+
190
+ if __name__ == "__main__":
191
+ parser = argparse.ArgumentParser(description="Parallel processing of Parquet files to generate images")
192
+ parser.add_argument("-i", "--input", required=True, help="Input directory path")
193
+ parser.add_argument("-o", "--output", required=True, help="Output directory path")
194
+ parser.add_argument("--threads", type=int, default=4, help="Number of concurrent threads (default 4)")
195
+ parser.add_argument("--overwrite", action="store_true", help="Overwrite existing files")
196
+ # 新增格式选择参数
197
+ parser.add_argument("--format", choices=['png', 'jpg'], default='jpg',
198
+ help="Output image format (png/jpg, default jpg)")
199
+
200
+ args = parser.parse_args()
201
+
202
+ try:
203
+ start = time.time()
204
+ batch_process_parquets(
205
+ input_dir=args.input,
206
+ output_root=args.output,
207
+ threads=args.threads,
208
+ overwrite=args.overwrite,
209
+ output_format=args.format # 传递格式参数
210
+ )
211
+ logging.info(f"\nTotal processing time: {time.time()-start:.2f}s")
212
+ except Exception as e:
213
+ logging.error(f"Fatal error: {str(e)}")
214
+ sys.exit(1)
utils/parquet/pathQA_ptj.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import logging
5
+ import sys
6
+ from pathlib import Path
7
+ from concurrent.futures import ThreadPoolExecutor
8
+ from ast import literal_eval
9
+ import time
10
+ from typing import Tuple
11
+ import pandas as pd
12
+
13
# Logging setup: INFO-level messages with timestamps, written to stdout.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    stream=sys.stdout
)
19
+
20
def process_row(args):
    """Build one output record for a single DataFrame row (thread-safe).

    ``args`` is an ``(index, row, file_stem)`` tuple. Returns the record
    dict on success, or ``None`` when the row cannot be processed.
    """
    index, row, file_stem = args
    try:
        # Relative path where this row's image is expected to live.
        rel_image = Path("data") / file_stem / f"{index}.png"
        media_path = "./" + rel_image.as_posix()

        record = {
            "index": index,
            "media_type": "image",
            "media_paths": media_path,
            "description": "",
            "task_type": "Vision-Question-Answer",
            "question": [row.get('question', '')],
            "question_type": "free-form",
            "annotations": [],
            "options": [],
            "answer": [row.get('answer', '')],
            "source": "PathQA",
            "domain": "Biomedical",
        }
        return record
    except Exception as e:
        logging.error(f"处理行 {index} 时出错: {str(e)}")
        return None
45
+
46
def process_single_parquet(parquet_path: Path, output_root: Path) -> Tuple[int, int]:
    """Convert one Parquet file into a JSON list of QA records.

    Each row is mapped through ``process_row`` on a thread pool and the
    collected records are written to ``<output_root>/<stem>.json``.

    Returns:
        (success_count, error_count) for the rows of this file.
    """
    start_time = time.time()
    file_stem = parquet_path.stem
    output_dir = output_root
    output_json = output_dir / f"{file_stem}.json"

    success_count = 0
    error_count = 0
    total_rows = 0  # bug fix: initialized up front so the except path can't hit NameError
    results = []

    try:
        df = pd.read_parquet(parquet_path)
        total_rows = len(df)

        logging.info(f"\n{'='*40}\nProcessing: {parquet_path.name}")
        logging.info(f"Output directory: {output_dir}")

        # Fan rows out over a thread pool; os.cpu_count() may return None.
        max_workers = (os.cpu_count() or 4) * 2
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            task_args = [(idx, row, file_stem) for idx, row in df.iterrows()]
            futures = [executor.submit(process_row, args) for args in task_args]

            for future in futures:
                result = future.result()
                if result:
                    results.append(result)
                    success_count += 1
                else:
                    error_count += 1

        # Ensure the destination directory exists before writing.
        output_dir.mkdir(parents=True, exist_ok=True)
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        process_time = time.time() - start_time
        logging.info(
            f"Processed: {success_count}/{total_rows} | "
            f"Errors: {error_count} | "
            f"Time: {process_time:.2f}s"
        )

        return success_count, error_count

    except Exception as e:
        # Previously `total_rows` was referenced before assignment when
        # pd.read_parquet itself failed; now it safely defaults to 0.
        logging.error(f"处理文件失败: {str(e)}")
        return 0, total_rows
94
+
95
def batch_process_parquets(input_dir: Path, output_root: Path):
    """Process every ``*.parquet`` file under ``input_dir`` sequentially."""
    input_path = Path(input_dir)
    output_root = Path(output_root)

    if not input_path.exists():
        raise FileNotFoundError(f"输入目录不存在: {input_path}")

    parquet_files = list(input_path.glob("*.parquet"))
    if not parquet_files:
        logging.warning("未找到Parquet文件")
        return

    # Aggregate per-file counters into a run-wide summary.
    total_stats = {'success': 0, 'errors': 0}
    for parquet_file in parquet_files:
        ok_count, bad_count = process_single_parquet(parquet_file, output_root)
        total_stats['success'] += ok_count
        total_stats['errors'] += bad_count

    # Final summary report.
    logging.info(f"\n{'='*40}\n批量处理完成")
    logging.info(f"处理文件总数: {len(parquet_files)}")
    logging.info(f"总成功条目: {total_stats['success']}")
    logging.info(f"总失败条目: {total_stats['errors']}")
119
+
120
if __name__ == "__main__":
    # CLI entry point: convert every Parquet file in --input to JSON under --output.
    parser = argparse.ArgumentParser(description='批量处理Parquet文件转JSON')
    parser.add_argument('-i', '--input', required=True, help='输入目录路径')
    parser.add_argument('-o', '--output', required=True, help='输出根目录路径')
    args = parser.parse_args()

    try:
        t0 = time.time()
        batch_process_parquets(input_dir=args.input, output_root=args.output)
        logging.info(f"\n总耗时: {time.time()-t0:.2f}s")
    except Exception as e:
        logging.error(f"程序异常终止: {str(e)}")
        sys.exit(1)
utils/upload/batch_download.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from openai import OpenAI
4
+
5
def batch_download_files():
    """Download the output file of every completed batch listed in a status JSONL."""
    # Client for the DashScope OpenAI-compatible endpoint.
    client = OpenAI(
        api_key=os.getenv("DASHSCOPE_API_KEY"),
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )

    try:
        # Prompt for the status file and the destination directory.
        input_jsonl = input("请输入包含batch任务状态的jsonl文件路径:")
        output_dir = input("请输入保存文件的目录路径:")

        os.makedirs(output_dir, exist_ok=True)

        with open(input_jsonl, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                try:
                    entry = json.loads(line.strip())

                    # Skip (and report) anything that is not finished yet.
                    if entry.get('status') != 'completed':
                        print(f"第 {line_num} 行: 状态未完成(当前状态:{entry.get('status')})")
                        continue

                    file_id = entry.get('output_file_id')
                    if not file_id:
                        print(f"第 {line_num} 行: 缺少output_file_id")
                        continue

                    # Fetch the batch result and persist it under its file id.
                    content = client.files.content(file_id=file_id)
                    save_path = os.path.join(output_dir, f"{file_id}.jsonl")
                    content.write_to_file(save_path)
                    print(f"成功保存: (unknown) -> {save_path}")

                except json.JSONDecodeError:
                    print(f"第 {line_num} 行: JSON解析失败")
                except Exception as e:
                    print(f"第 {line_num} 行: 发生错误 - {str(e)}")

    except FileNotFoundError:
        print("错误:输入文件不存在")
    except Exception as e:
        print(f"发生未预期的错误: {str(e)}")

if __name__ == "__main__":
    batch_download_files()
utils/upload/batch_search.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from openai import OpenAI
4
+
5
# OpenAI-compatible client, actually backed by Alibaba Cloud DashScope
# (the API key comes from the DASHSCOPE_API_KEY environment variable).
client = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)
10
+
11
def process_batches(input_file):
    """Query the status of each batch ID listed in ``input_file`` (JSONL).

    Writes one JSON line per batch to ``batch_status_output.jsonl``
    containing the batch status plus its input/output file IDs.
    """
    output_file = "batch_status_output.jsonl"

    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            # Bug fix: defined before the try so the generic except handler
            # cannot raise NameError when json.loads fails on the first line.
            batch_id = None
            try:
                entry = json.loads(line.strip())
                batch_id = entry['id']

                # Retrieve the batch details from the service.
                batch = client.batches.retrieve(batch_id)

                result = {
                    "status": batch.status,
                    "input_file_id": batch.input_file_id,
                    "output_file_id": batch.output_file_id
                }

                outfile.write(json.dumps(result) + '\n')
                print(f"Processed batch: {batch_id}")

            except KeyError:
                print(f"Invalid entry format: {line.strip()}")
            except Exception as e:
                # batch_id may still be None here (e.g. JSON decode failure).
                print(f"Error processing batch {batch_id}: {str(e)}")
39
+
40
if __name__ == "__main__":
    # Entry point: ask for the JSONL of batch IDs, then query each batch.
    input_path = input("请输入包含Batch IDs的JSONL文件路径: ")
    if not os.path.exists(input_path):
        print("错误:输入文件不存在")
    else:
        process_batches(input_path)
        print(f"处理完成,结果已保存到 batch_status_output.jsonl")
utils/upload/batch_upload.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from openai import OpenAI
4
+
5
# Initialize the OpenAI-compatible client.
# NOTE(review): avoid hard-coding API keys in source; prefer
# os.getenv("DASHSCOPE_API_KEY") as the sibling scripts do.
client = OpenAI(
    api_key="sk-xxx",
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

# Ask the user for the folder that holds the batch request files.
folder_path = input("请输入包含批量文件的文件夹路径:")
if not os.path.isdir(folder_path):
    print("错误:输入的路径不存在或不是文件夹")
    exit()

# Collect all regular files in the folder.
file_list = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]

# Create one batch job per file and record its metadata as JSONL.
with open("batch_results.jsonl", "w", encoding="utf-8") as jsonl_file:
    for filename in file_list:
        # Build the OSS file identifier (assumes the OSS layout mirrors the local one).
        # Fixed: the filename placeholder was missing from the f-string.
        input_file_id = f"oss:me-east-1:acm-mm/test/{filename}"

        try:
            # Create the batch task for this request file.
            batch = client.batches.create(
                input_file_id=input_file_id,
                endpoint="/v1/chat/completions",
                completion_window="1h"
            )

            # Keep only the fields needed to track the task later.
            batch_info = {
                "id": batch.id,
                "input_file_id": batch.input_file_id
            }

            jsonl_file.write(json.dumps(batch_info, ensure_ascii=False) + "\n")
            print(f"文件 {filename} 已成功创建任务,ID: {batch.id}")

        except Exception as e:
            print(f"为文件 {filename} 创建任务失败,错误信息: {str(e)}")
utils/upload/compare.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
def read_original_file(file_path):
    """Parse the batch-request JSONL.

    Returns:
        (custom_id_order, original_data): the custom_ids in file order and a
        mapping custom_id -> first user-message content. Entries without a
        custom_id or without a user message are skipped.
    """
    original_data = {}
    custom_id_order = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            entry = json.loads(line.strip())
            custom_id = entry.get('custom_id')
            if not custom_id:
                continue

            # The question is the content of the first message with role "user".
            messages = entry.get('body', {}).get('messages', [])
            user_content = next(
                (msg.get('content') for msg in messages if msg.get('role') == 'user'),
                None,
            )

            if user_content is not None:
                original_data[custom_id] = user_content
                custom_id_order.append(custom_id)

    return custom_id_order, original_data
27
+
28
def read_output_file(file_path):
    """Parse the model-output JSONL; return ``{custom_id: model answer text}``.

    Entries without a custom_id are skipped; entries without choices map to
    an empty string.
    """
    output_data = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            entry = json.loads(line.strip())
            custom_id = entry.get('custom_id')
            if not custom_id:
                continue

            # Pull the assistant text out of the first choice, if any.
            choices = entry.get('response', {}).get('body', {}).get('choices', [])
            if choices:
                model_output = choices[0].get('message', {}).get('content', '')
            else:
                model_output = ''

            output_data[custom_id] = model_output

    return output_data
49
+
50
def main():
    """Join original questions with model outputs by custom_id and write a readable report."""
    # Collect the three paths from the user.
    original_path = input("请输入原始请求文件路径:").strip()
    output_path = input("请输入大模型输出文件路径:").strip()
    save_path = input("请输入结果保存路径:").strip()

    # Load both sides of the comparison.
    custom_id_order, original_data = read_original_file(original_path)
    output_data = read_output_file(output_path)

    last_index = len(custom_id_order) - 1
    with open(save_path, 'w', encoding='utf-8') as f:
        for i, cid in enumerate(custom_id_order):
            f.write(f"custom_id: {cid}\n")
            f.write(f"原问题: {original_data.get(cid, '')}\n")
            f.write(f"大模型输出: {output_data.get(cid, '')}\n")
            # Blank line between groups, but not after the final one.
            if i != last_index:
                f.write('\n')

if __name__ == "__main__":
    main()
utils/upload/download.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
import os
from openai import OpenAI

# OpenAI-compatible client pointed at the DashScope endpoint.
client = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)

# Fetch the batch output file by its id and persist it locally.
content = client.files.content(file_id="file-batch_output-xxx")
content.write_to_file("result.jsonl")
utils/upload/jsonl_otest.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def analyze_jsonl_size(input_path):
    """Report how many lines of a JSONL file exceed the 1 MB per-object limit."""
    one_mb = 1 * 1024 * 1024
    over_count = 0
    total_over_size = 0   # cumulative bytes of oversized lines
    max_size = 0          # largest single line, in bytes
    line_counter = 0      # running line number (informational)

    with open(input_path, 'r', encoding='utf-8') as file:
        for line in file:
            line_counter += 1
            current_size = len(line.encode('utf-8'))

            # Track statistics only for lines over the limit.
            if current_size > one_mb:
                over_count += 1
                total_over_size += current_size
                max_size = max(max_size, current_size)

    print(f"Number of objects exceeding 1MB: {over_count}")
    if over_count > 0:
        avg_size_mb = (total_over_size / over_count) / (1024 * 1024)
        max_size_mb = max_size / (1024 * 1024)
        print(f"Average size of oversized objects: {avg_size_mb:.2f} MB")
        print(f"Largest object size: {max_size_mb:.2f} MB")
    else:
        print("No objects exceed the 1MB limit.")
30
+
31
# Example invocation (hard-coded path; runs at import time — adjust before use).
analyze_jsonl_size('/mnt/data/users/zys/proj/vlm_reasoning/request/vlm_batch_requests.jsonl')
utils/upload/jsonl_split.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
def split_jsonl_by_size(input_path, output_dir, target_size_mb):
    """Split a JSONL file into parts no larger than ``target_size_mb`` MB.

    Parts are named ``vlm_requests_<n>.jsonl`` under ``output_dir``.

    Raises:
        ValueError: if any single line exceeds the 1 MB per-object limit.
    """
    target_size = target_size_mb * 1024 * 1024  # bytes
    os.makedirs(output_dir, exist_ok=True)

    part_num = 1
    current_size = 0
    current_file = None

    with open(input_path, 'r', encoding='utf-8') as infile:
        for line_counter, line in enumerate(infile, start=1):
            line_bytes = line.encode('utf-8')
            line_size = len(line_bytes)

            # Reject any single object larger than the 1 MB service limit.
            if line_size > 1 * 1024 * 1024:
                raise ValueError(
                    f"JSON object at line {line_counter} exceeds 1MB limit "
                    f"(actual size: {line_size / 1024 / 1024:.2f}MB)"
                )

            # Roll over to the next part when this line would overflow it.
            if current_file and (current_size + line_size > target_size):
                current_file.close()
                current_file = None
                current_size = 0
                part_num += 1

            # Lazily open the current part on first write.
            if current_file is None:
                output_path = os.path.join(output_dir, f'vlm_requests_{part_num}.jsonl')
                current_file = open(output_path, 'wb')
                current_size = 0

            current_file.write(line_bytes)
            current_size += line_size

    if current_file:
        current_file.close()
43
# Example invocation: split the request file into ~450 MB parts (runs at import time).
split_jsonl_by_size('/mnt/data/users/zys/proj/vlm_reasoning/request/vlm_batch_requests.jsonl', '/mnt/data/users/zys/proj/vlm_reasoning/upload/vlm', 450)
utils/upload/load_ll.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import copy
4
+ import concurrent.futures
5
+ import traceback
6
+
7
+ system_prompt = """You are a professional textual question-answering analyst. Your expertise lies in transforming explicit textual queries, which may have originally been associated with images, into sophisticated, implicit instructions using only the provided text. Your goal is to reframe direct questions into nuanced directives that guide a user or another AI to the same answer through logical deduction, contextual understanding, or knowledge application based *solely on the textual information available*, rather than by a direct ask.\n\n**Important Context:** You must be aware that the `Original Question` you receive likely originated from a dataset where it was paired with an image. However, for your task, **you will NOT receive the image**. The explicit question might therefore contain implicit references to visual elements (e.g., \"the object on the left\", \"the color of the car\") that are underspecified in the text alone.\n\nYour Task:\nGiven an explicit textual question (and potentially its answer or relevant context), originating from an image-question pair but provided to you without the image, your task is to:\n1. Analyze the `Original Question` and any provided `Answer/Context` to understand the core informational intent, acknowledging potential reliance on missing visual details.\n2. Convert the explicit question into a compelling implicit instruction. This instruction should:\n * Focus on aspects of the query solvable using the **provided text** and general knowledge.\n * Leverage **all details** available in the `Answer/Context` field, as this might contain crucial information originally derived from the image.\n * Reformulate or abstract parts of the question that heavily relied on specific visual cues into more general or conceptual descriptions, if possible, using the analytical dimensions.\n * **Avoid** creating instructions that fundamentally require visual inspection of an image that is not present. 
If a question is purely visual and cannot be meaningfully reformulated textually (e.g., \"What specific shade of blue is the sky in the top right corner?\"), aim for the most reasonable textual abstraction or focus on other extractable information.\n3. Classify the ​​reframed instruction​​ into one of five analytical dimensions (Structural Property Enhancement, Spatial-Logical Relationship Modeling, Domain Knowledge Integration, Multimodal Reasoning Pathways, Semantic Context Reconstruction) based on the primary analytical approach used in the **textual reformulation**.\n\nMethodology: Reconstructing the Query Using Analytical Dimensions (Text-Only Adaptation)\nTo create the implicit instruction from the text-only input:\n1. Actively draw insights *only* from the provided `Original Question` and `Answer/Context`.\n2. Utilize one or more of the five analytical dimensions below to adjust the question, add textual constraints or domain knowledge, model relationships described textually, or reconstruct semantic meaning based on the available words.\n3. Assign a ​​classification​​ based on how the *textual* information was primarily restructured or analyzed.\n4. The implicit instruction should make the intended answer feel like a natural consequence of analyzing the **provided text** and applying relevant knowledge.\n\nAnalytical Dimensions & Examples (Applied to Textual Input, potentially abstracting original visual cues):\n\n1. 
​​Structural Property Enhancement​​ - Add descriptive structural or quantitative attributes *mentioned or implied* in the text.\n 1.1 ​​Physical Properties​​ (Text-based)\n Original question (from image context, maybe): \"Describe the benzene ring shown.\" -> (Text input only): \"Describe a benzene ring structure.\"\n Implicit instruction: \"Describe the key structural features of an aromatic hydrocarbon molecule known for its planar regular hexagonal symmetry, focusing on its carbon-carbon bonding and associated hydrogen atoms based on standard chemical representation.\" (Focus shifts to general knowledge)\n 1.2 ​​Quantitative Features​​ (Text-based)\n Original question (from image context): \"Count the runways.\" -> (Text input only): \"Count the runways.\" (Potentially with context: \"Context: The image depicts a large international airport.\")\n Implicit instruction: \"Based on the context of a large international airport, enumerate the typical number of parallel, elongated structures designed for aircraft takeoff/landing that meet high-capacity standards (e.g., length > 3000m), assuming standard configurations if specifics aren't provided.\" (Uses context and general knowledge)\n\n2. 
​​Spatial-Logical Relationship Modeling​​ - Model relationships *described or implied* textually.\n 2.1 ​​Hierarchical Structures​​ (Text-based)\n Original question (from image/text): \"What skin disease based on these microscopic findings?\" -> (Text input only): \"What skin disease?\" (Context: \"Hyperkeratosis in epidermis, lymphocyte infiltration in dermis.\")\n Implicit instruction: \"Given the pathological findings described as 'hyperkeratosis in the epidermal layer' and 'lymphocyte infiltration in the dermal layer', determine the most probable dermatological diagnosis by correlating these layer-specific abnormalities.\" (Uses provided textual context directly)\n 2.2 ​​Spatial Topology​​ (Conceptual/Mathematical)\n Original question: \"Number of common tangents for these two circles.\" -> (Text input only): \"Number of common tangents when two circles are tangent.\"\n Implicit instruction: \"In a conceptual geometric scenario where two circles are defined as being externally tangent, determine the total count of lines that can be drawn tangent to both circles simultaneously, based on the properties of this specific topological arrangement.\" (Focuses on the defined geometric condition)\n\n3. 
​​Domain Knowledge Integration​​ - Infuse domain knowledge relevant to the *textual topic*.\n 3.1 ​​Domain-Specific Characteristics​​ (Text-based)\n Original question (from image): \"Identify the oil storage tanks.\" -> (Text input only): \"Identify the oil storage tanks.\" (Context: \"Area contains large, circular metal structures.\")\n Implicit instruction: \"Based on the description of 'large, circular metal structures' often found in industrial areas, and applying typical characteristics known from domains like remote sensing or industrial engineering (e.g., large diameter, specific roof types, association with pipelines), infer the likely function of these structures as potential storage units, possibly for petroleum products.\" (Connects text description to domain knowledge)\n 3.2 ​​Mathematical Constraints​​\n Original question: \"Prove the triangle angle sum theorem.\" (Text only)\n Implicit instruction: \"Utilizing the axioms and postulates of Euclidean geometry, particularly the properties of parallel lines and transversal intersections, construct a logical argument demonstrating that the sum of the interior angles of any planar triangle invariably equals 180 degrees.\" (Purely conceptual/textual)\n\n4. ​​Multimodal Reasoning Pathways​​ (Text-based: Combining textual info/logic)\n 4.1 ​​Exclusion Logic​​ (Text-based)\n Original question (from image/symptoms): \"Which vitamin deficiency?\" -> (Text input only): \"Which vitamin deficiency?\" (Context: \"Symptoms: follicular hyperkeratosis, nyctalopia. 
Patient gets ample sunlight.\")\n Implicit instruction: \"Considering the presented symptoms 'follicular hyperkeratosis' and 'nyctalopia', and given the contextual information ruling out insufficient light exposure (a common factor for Vitamin D issues), deduce the most likely fat-soluble vitamin deficiency responsible for this specific combination of clinical signs.\" (Uses text symptoms + exclusion context)\n 4.2 ​​Data Association​​ (Text-based)\n Original question: \"What is the molecular weight of sodium chloride?\" (Text only)\n Implicit instruction: \"Accessing standard atomic weight data, calculate the sum corresponding to the chemical formula NaCl, reflecting the one-to-one ionic ratio between sodium (Na, element 11) and chlorine (Cl, element 17).\" (Associates name/formula with data lookup and calculation rule)\n\n5. ​​Semantic Context Reconstruction​​ - Leverage functional descriptions or context *provided in text*.\n 5.1 ​​Functional Descriptions​​ (Text-based)\n Original question (from image): \"Name this apparatus.\" -> (Text input only): \"Name the apparatus.\" (Context: \"Used for collecting fractions during distillation.\")\n Implicit instruction: \"Identify the standard laboratory glassware term for a conical-shaped vessel specifically employed during atmospheric distillation processes to receive liquids condensing at different boiling points.\" (Uses functional context provided textually)\n 5.2 ​​Anomaly Detection​​ (Based on *described* features)\n Original question (from image): \"Is this illegal mining?\" -> (Text input only): \"Is this illegal mining?\" (Context: \"Vegetated area shows regular bare patches and new roads.\")\n Implicit instruction: \"Evaluate the provided description of a land area – 'geometrically regular bare areas appearing within a vegetation zone, accompanied by traces of transport roads' – against typical indicators of unauthorized resource extraction activities to determine if it signifies potential illegal mining.\" 
(Focuses on interpreting the textual description)\n\nInput Format You Will Receive:\n* ​​Original Question​​ (required, text - potentially referencing unseen visual context)\n* ​​Answer/Context​​ (optional, text - crucial for potentially bridging the visual gap)\n* *Note: Although the original data source included images, your input for this task consists solely of the text components.*\n\nOutput Format You Should Generate:\nParse the reframed ​​instruction​​ and its ​​classification​​ into JSON format. Follow this structure:\n\n{\n \"question\": \"[Implicit instruction focused on textual analysis, abstracting visual reliance]\",\n \"classification\": \"[One of: Structural Property Enhancement, Spatial-Logical Relationship Modeling, Domain Knowledge Integration, Multimodal Reasoning Pathways, Semantic Context Reconstruction]\"\n}\n\n​​Example Output (using the distillation apparatus example):​​\n{\n \"question\": \"Identify the standard laboratory glassware term for a conical-shaped vessel specifically employed during atmospheric distillation processes to receive liquids condensing at different boiling points.\",\n \"classification\": \"Semantic Context Reconstruction\"\n}"""
8
+
9
# Skeleton request for the DashScope /v1/chat/completions batch API.
# It is deep-copied per QA pair; custom_id and the user message content
# are filled in by process_single_file.
base_template = {
    "custom_id": None,
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "qwen-plus-latest",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": None}
        ],
        "temperature": 0.5,
        # Force the model to emit a JSON object, per the system prompt contract.
        "response_format":{"type": "json_object"},
        "max_tokens": 200
    }
}
24
+
25
def process_single_file(json_file, dataset_dir):
    """Turn one dataset JSON file into a list of batch chat-completion requests.

    Each item in the file may hold several question/answer pairs; every pair
    becomes one request based on ``base_template`` with a custom_id of the
    form ``<file stem>-<item index>-<qa index>``.

    Returns:
        list[dict]: the request payloads (empty on load/format errors).
    """
    print(f"\nProcessing JSON file: {json_file}")
    json_path = os.path.join(dataset_dir, json_file)
    file_requests = []

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (IOError, json.JSONDecodeError) as e:
        print(f"Failed to load {json_path}: {str(e)}")
        return file_requests

    if not isinstance(data, list):
        print(f"Invalid data format in {json_file}, expected list")
        return file_requests

    for item_idx, item in enumerate(data):
        try:
            if not isinstance(item, dict):
                print(f"Invalid item format at index {item_idx} in {json_file}")
                continue

            questions = item.get('question', [])
            answers = item.get('answer', [])
            options = item.get('options', [])
            description = item.get('description', '')

            # Only prepend the description when it looks meaningful
            # (a string of at least three words).
            valid_description = ""
            if (isinstance(description, str) and
                len(description.split()) >= 3 and
                len(description) > 0):
                valid_description = description.strip() + " "

            if not isinstance(questions, list) or len(questions) == 0:
                print(f"Invalid questions in item {item_idx} of {json_file}")
                continue

            # Map option ids to their display text, e.g. {"A": "..."}.
            option_map = {}
            if options and isinstance(options, list):
                try:
                    option_map = {opt['id']: opt['text'] for opt in options
                                if 'id' in opt and 'text' in opt}
                except KeyError as e:
                    print(f"Invalid option format in {json_file} item {item_idx}: {str(e)}")

            for qa_idx in range(len(questions)):
                try:
                    original_q = questions[qa_idx]
                    if not isinstance(original_q, str):
                        print(f"Invalid question format at index {qa_idx} in {json_file} item {item_idx}")
                        continue

                    q_text = valid_description + original_q

                    # Resolve the answer: for multiple-choice items, translate
                    # the option id into its text; otherwise use the raw string.
                    # NOTE(review): original_answer[0] assumes the answer is a
                    # list (or takes a string's first character) — confirm the
                    # dataset's answer shape.
                    answer_text = ""
                    if qa_idx < len(answers):
                        original_answer = answers[qa_idx]
                        try:
                            if len(option_map) >= 1:
                                answer_text = option_map.get(original_answer[0], original_answer[0])
                            else:
                                answer_text = original_answer if isinstance(original_answer, str) else ""
                        except (TypeError, IndexError) as e:
                            print(f"Answer processing error: {str(e)}")

                    # Build the plain-text user content.
                    text_content = f"question:{q_text}"
                    if answer_text:
                        text_content += f"\nanswer:{answer_text}"

                    # Fill a fresh copy of the shared request template
                    # (json_file[:-5] strips the ".json" suffix).
                    request = copy.deepcopy(base_template)
                    request['custom_id'] = f"{json_file[:-5]}-{item_idx}-{qa_idx}"
                    request['body']['messages'][1]['content'] = text_content

                    file_requests.append(request)
                except Exception as e:
                    print(f"Error processing QA pair {qa_idx} in {json_file} item {item_idx}: {str(e)}")
                    traceback.print_exc()

        except Exception as e:
            print(f"Error processing item {item_idx} in {json_file}: {str(e)}")
            continue

    return file_requests
109
+
110
def process_dataset(dataset_dir):
    """Build batch requests for every ``*.json`` file under ``dataset_dir``."""
    batch_requests = []

    if not os.path.exists(dataset_dir):
        print(f"Error: Dataset directory {dataset_dir} does not exist")
        return batch_requests

    json_files = [f for f in os.listdir(dataset_dir) if f.endswith('.json')]

    # Process files concurrently; each worker handles one JSON file.
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        future_to_name = {
            executor.submit(process_single_file, name, dataset_dir): name
            for name in json_files
        }

        for future in concurrent.futures.as_completed(future_to_name):
            source_name = future_to_name[future]
            try:
                batch_requests.extend(future.result())
            except Exception as e:
                print(f"Error processing file {source_name}: {str(e)}")
                traceback.print_exc()

    return batch_requests
134
+
135
if __name__ == "__main__":
    # Build all requests from the dataset directory, then dump them as JSONL.
    dataset_directory = "/mnt/data/users/zys/proj/vlm_reasoning/load"
    try:
        batch_requests = process_dataset(dataset_directory)

        out_path = "/mnt/data/users/zys/proj/vlm_reasoning/request/llm_batch_requests.jsonl"
        with open(out_path, 'w') as f:
            for req in batch_requests:
                try:
                    f.write(json.dumps(req, ensure_ascii=False) + '\n')
                except (TypeError, IOError) as e:
                    # Skip unserializable/unwritable entries but keep going.
                    print(f"Failed to write request: {str(e)}")
                    continue

        print(f"Successfully generated {len(batch_requests)} QA requests")
    except Exception as e:
        print(f"Critical error occurred: {str(e)}")
        traceback.print_exc()
utils/upload/load_vl.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import base64
5
+ import copy
6
+ from io import BytesIO
7
+ from PIL import Image
8
+ import traceback
9
+ import concurrent.futures # 新增多线程支持
10
+
11
def encode_image_from_pil(pil_image, max_size=256, quality=85):
    """Downscale a PIL image so its longest side is at most *max_size*, then
    JPEG-encode it and return the bytes as a Base64 string.

    Args:
        pil_image: PIL.Image.Image instance (any mode; non-RGB modes are
            converted, since JPEG cannot store RGBA/palette images).
        max_size (int): maximum length in pixels of the longer edge (default 256).
        quality (int): JPEG compression quality, 1-95 (default 85).

    Returns:
        str: Base64-encoded JPEG data, or None if processing failed.
    """
    try:
        # JPEG has no alpha/palette support; convert anything that is not
        # already plain RGB or grayscale. (Bug fix: previously RGBA/P inputs
        # made `save` raise and the function always returned None for them.)
        if pil_image.mode not in ("RGB", "L"):
            pil_image = pil_image.convert("RGB")

        # 1. Resize proportionally; never upscale images already small enough.
        width, height = pil_image.width, pil_image.height
        if max(width, height) > max_size:
            ratio = min(max_size / width, max_size / height)
            new_size = (int(width * ratio), int(height * ratio))
            resized_img = pil_image.resize(new_size)
        else:
            resized_img = pil_image.copy()  # small images are kept as-is

        # 2. Encode to Base64 via an in-memory buffer.
        buffered = BytesIO()
        resized_img.save(buffered, format="JPEG", quality=quality)
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

    except Exception as e:
        print(f"Image processing failed: {str(e)}")
        return None
41
+
42
+
43
+ system_prompt = """You are an expert Vision-Language Model assistant specializing in transforming explicit questions about images into sophisticated, implicit instructions. Your goal is to reframe direct queries into nuanced directives that guide a user or another AI to the same answer through observation and inference, rather than by a direct ask.\n\nYour Task:\nGiven an image and a corresponding explicit question-answer (QA) pair, your task is to:\n1. Convert the explicit question into a compelling implicit instruction that leverages the image content and guides the user to deduce the information sought by the original question.\n2. Classify the ​​reframed question​​ into one of five analytical dimensions (Spatial Relationships, Visual Attributes, Functional Context, Logical Reasoning, Semantic Connections) based on its core intent.\n\nMethodology: Reconstructing the Query Using Analytical Dimensions\nTo create the implicit instruction, you will:\n1. Actively draw insights from the provided image.\n2. Utilize one or more of the five analytical dimensions to adjust the original question, add contextual information, and reconstruct its intent into an implicit directive.\n3. Assign a ​​classification​​ to the reframed question based on which dimension(s) it primarily engages (e.g., questions about object positioning vs. symbolic meaning).\n4. The implicit instruction should make the answer feel like a natural consequence of observation or analysis tied to the classification.\n\n1. Spatial Relationships – Used to add spatial descriptive attributes\n1.1. 
Positional Arrangements\nOriginal Question Type (Location/Identification): What is the fountain in the middle of the photo like?\nImplicit Instruction (using positional arrangement description): Among the series of landscape structures in the square, please describe in detail the facility that is neither located at the very edge nor immediately adjacent to buildings, but is roughly in the geometric center area and jets water upwards.\n1.2. Directional Orientation\nOriginal Question Type (Location/Identification): Find the person in the picture who has their back to us.\nImplicit Instruction (using directional orientation description): Among the multiple individuals in the scene, please identify and describe the person whose main body part (especially the face) is oriented roughly opposite to the observer's line of sight.\n\n2. Visual Attributes – Used to add visual descriptive attributes\n2.1. Color\nOriginal Question Type (Location/Identification): How is the green apple in the basket? (Assuming there are red and green apples)\nImplicit Instruction (using color attribute description): In the container holding various fruits, please focus on the spherical fruits whose skin presents a hue similar to leaves or unripe bananas, and describe the condition of one of them.\n2.2. Shape\nOriginal Question Type (Location/Identification): Find the square cushion.\nImplicit Instruction (using shape attribute description): Among the multiple cushions on the sofa, please point out the fabric item used for comfortable leaning that has an outer contour with four roughly equal sides and internal angles close to right angles.\n2.3. Size\nOriginal Question Type (Location/Identification): What is the tallest book in that pile of books?\nImplicit Instruction (using size attribute description): Among the several books stacked together, please identify the printed material that significantly surpasses all other books in the vertical dimension, and describe its cover.\n2.4. 
Material Properties\nOriginal Question Type (Location/Identification): Which sculpture is made of stone?\nImplicit Instruction (using material attribute description): Among the multiple displayed artistic forms, please identify the work whose surface presents natural rock texture, feels cold to the touch, has a hard texture, and typically possesses a certain sense of weight.\n\n3. Functional Context – Used to add functional or behavioral descriptive attributes\n3.1. Item Purposes\nOriginal Question Type (Location/Identification): Find the knife used for cutting vegetables in the kitchen.\nImplicit Instruction (using item purpose description): Among the various utensils in the kitchen, please identify the hand-held metal tool that typically has a sharp single or double-edged blade and is designed for segmenting or slicing food ingredients.\n3.2. Human Actions\nOriginal Question Type (Location/Identification): Which waiter is wiping the table?\nImplicit Instruction (using human action description): Among the staff in the restaurant, please locate the employee who is currently holding a cloth or cleaning supplies and whose upper body and arms are repeatedly performing a wiping motion on a flat surface.\n3.3. Environmental Conditions\nOriginal Question Type (Location/Identification): Point out the rabbit in the snow. (Assuming a complex background where the environment helps)\nImplicit Instruction (using environmental condition description): On the vast ground covered with white ice crystals, carefully search for and point out the small mammal that contrasts with the surrounding snowy white environment, which it might be using for camouflage.\n\n4. Logical Reasoning – Used to add logical judgment attributes\n4.1. Quantity Comparisons\nOriginal Question Type (Location/Identification): Which team has the fewest people? 
(Assuming there are three teams)\nImplicit Instruction (using quantity comparison description for differentiation/location): Separately count the members of each clearly distinguishable group of people in the frame, and then indicate the group whose total number of members is at the lowest level in comparison.\n4.2. Conditional Evaluations\nOriginal Question Type (Location/Identification): If I were going out, which umbrella should I take? (Assuming one good umbrella and one broken one)\nImplicit Instruction (using conditional evaluation description): Examine all available rain gear in the image, assess their respective conditions and functionality, and select the umbrella that would provide reliable shelter in the event of wet weather and currently has no obvious damage.\n4.3. Causal Relationships\nOriginal Question Type (Location/Identification): Which child knocked over the milk? (Assuming spilled milk and several children nearby)\nImplicit Instruction (using causal relationship description): Observe the location where the milky white liquid was spilled and the position of its overturned container, and considering the positions of nearby children, their expressions, or any traces on their hands, infer the child most likely responsible for this accident.\n\n5. Semantic Connections – Used to add symbolic or emotional descriptive attributes\n5.1. Cultural Metaphors\nOriginal Question Type (Location/Identification): Find the traditional Chinese painting that features bamboo.\nImplicit Instruction (using cultural metaphor description): Among several traditional Eastern paintings, please identify the artwork whose main subject is a plant characterized by its hollow, segmented stems, often used in a specific culture as a symbol of gentlemanly qualities (such as integrity, modesty, and resilience).\n5.2. 
Emotional Expressions\nOriginal Question Type (Location/Identification): Which member of the audience looks the most disappointed?\nImplicit Instruction (using emotional expression description): Among the people watching the game or performance, find the individual whose facial expression (e.g., downturned mouth, dull eyes, furrowed brow) and body posture most clearly convey negative emotions, such as unmet expectations or dissatisfaction.\n5.3. Symbolic Meanings\nOriginal Question Type (Location/Identification): Identify the decoration on top of the wedding cake.\nImplicit Instruction (using symbolic meaning description): Observe the multi-tiered celebration cake and locate the small decorative object placed at its very top, which typically carries auspicious meanings (such as figures of the newlyweds, symbols of love, a shared future, etc.) and serves as a finishing touch for the overall ceremony.\n\nInput Format You Will Receive:\n​​Image​​ (required)\n​​Original Question​​ (required)\n​​Answer/Context​​ (optional): May include partial or no contextual constraints\n\nOutput Format You Should Generate:\nParse the reframed ​​question​​ and its ​​classification​​ into JSON format. Follow this structure:\n\n{\n \"question\": \"[Implicit instruction guiding observation/inference]\",\n \"classification\": \"[One of: Spatial Relationships, Visual Attributes, Functional Context, Logical Reasoning, Semantic Connections]\"\n}\n​​Example Output:​​\n{\n \"question\": \"Observe the multi-tiered celebration cake and locate the small decorative object placed at its very top, which typically carries auspicious meanings (such as figures of the newlyweds, symbols of love, a shared future, etc.) and serves as a finishing touch for the overall ceremony.\",\n \"classification\": \"Semantic Connections\"\n}"""
44
+
45
# Skeleton of one DashScope/OpenAI-style batch request. Each QA pair gets a
# deepcopy of this template with `custom_id` and the user message filled in;
# `response_format` forces the model to emit a JSON object.
base_template = {
    "custom_id": None,
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "qwen-vl-max-latest",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": None}
        ],
        "temperature": 0.1,
        "response_format":{"type": "json_object"},
        "max_tokens": 200
    }
}
60
+
61
def process_single_file(json_file, dataset_dir):
    """Build one batch chat-completion request per QA pair in a dataset JSON file.

    Args:
        json_file: name of a '.json' file inside *dataset_dir*; the file must
            contain a list of items, each carrying 'media_paths', 'question'
            (list of strings) and optionally 'answer' and 'options'.
        dataset_dir: directory holding the JSON file and its referenced images.

    Returns:
        list of request dicts (deep copies of ``base_template`` with
        ``custom_id`` and the user message set); empty on file-level errors.
    """
    print(f"\nProcessing JSON file: {json_file}")
    json_path = os.path.join(dataset_dir, json_file)
    file_requests = []

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (IOError, json.JSONDecodeError) as e:
        print(f"Failed to load {json_path}: {str(e)}")
        return file_requests

    if not isinstance(data, list):
        print(f"Invalid data format in {json_file}, expected list")
        return file_requests

    for item_idx, item in enumerate(data):
        try:
            if not isinstance(item, dict):
                print(f"Invalid item format at index {item_idx} in {json_file}")
                continue

            img_path = os.path.join(dataset_dir, item['media_paths'])
            img_path = os.path.normpath(img_path)
            if not os.path.exists(img_path):
                print(f"Image file not found: {img_path}")
                continue

            try:
                # Context manager guarantees the underlying image file handle
                # is released (bug fix: it previously leaked on every item).
                with Image.open(img_path) as image:
                    base64_img = encode_image_from_pil(image)
                if not base64_img:
                    continue
            except (IOError, OSError) as e:
                print(f"Failed to process image {img_path}: {str(e)}")
                continue

            questions = item.get('question', [])
            answers = item.get('answer', [])
            options = item.get('options', [])

            if not isinstance(questions, list) or len(questions) == 0:
                print(f"Invalid questions in item {item_idx} of {json_file}")
                continue

            # Map option ids to their display text so answers given as option
            # ids can be expanded into full answer strings.
            option_map = {}
            if options and isinstance(options, list):
                try:
                    option_map = {opt['id']: opt['text'] for opt in options
                                  if 'id' in opt and 'text' in opt}
                except KeyError as e:
                    print(f"Invalid option format in {json_file} item {item_idx}: {str(e)}")

            for qa_idx in range(len(questions)):
                try:
                    q_text = questions[qa_idx]
                    if not isinstance(q_text, str):
                        print(f"Invalid question format at index {qa_idx} in {json_file} item {item_idx}")
                        continue

                    answer_text = ""
                    if qa_idx < len(answers):
                        original_answer = answers[qa_idx]
                        try:
                            if len(option_map) >= 1:
                                # Multiple-choice: resolve the first answer id to
                                # its option text, falling back to the id itself.
                                answer_text = option_map.get(original_answer[0], original_answer[0])
                            else:
                                # Free-form: keep string answers and drop any
                                # bracketed "[...]" annotations.
                                answer_text = original_answer if isinstance(original_answer, str) else ""
                                answer_text = re.sub(r'\s*\[.*?\]', '', answer_text).strip()
                        except (TypeError, IndexError) as e:
                            print(f"Answer processing error: {str(e)}")

                    user_content = []
                    text_content = f"question: {q_text} answer: {answer_text}" if answer_text else f"question: {q_text}"

                    user_content.append({"type": "text", "text": text_content})
                    user_content.append({
                        "type": "image_url",
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"}
                    })

                    request = copy.deepcopy(base_template)
                    # custom_id encodes source file (minus '.json'), item and QA index.
                    request['custom_id'] = f"{json_file[:-5]}-{item_idx}-{qa_idx}"
                    request['body']['messages'][1]['content'] = user_content

                    file_requests.append(request)
                except Exception as e:
                    print(f"Error processing QA pair {qa_idx} in {json_file} item {item_idx}: {str(e)}")
                    traceback.print_exc()

        except Exception as e:
            print(f"Error processing item {item_idx} in {json_file}: {str(e)}")
            continue

    return file_requests
156
+
157
def process_dataset(dataset_dir):
    """Aggregate VLM batch requests from every .json file in *dataset_dir*.

    Each file is handled by ``process_single_file`` on a worker thread;
    per-file failures are logged and do not stop the remaining files.

    Returns:
        list of request dicts (empty when the directory is missing).
    """
    batch_requests = []

    if not os.path.exists(dataset_dir):
        print(f"Error: Dataset directory {dataset_dir} does not exist")
        return batch_requests

    json_names = [entry for entry in os.listdir(dataset_dir) if entry.endswith('.json')]

    # Process files in parallel with a bounded thread pool.
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        future_to_name = {
            executor.submit(process_single_file, name, dataset_dir): name
            for name in json_names
        }

        for finished in concurrent.futures.as_completed(future_to_name):
            json_file = future_to_name[finished]
            try:
                batch_requests.extend(finished.result())
            except Exception as e:
                print(f"Error processing file {json_file}: {str(e)}")
                traceback.print_exc()

    return batch_requests
181
+
182
# Entry point: build all VLM batch requests and dump them as one-object-per-line JSONL.
if __name__ == "__main__":
    dataset_directory = "/mnt/data/users/zys/proj/vlm_reasoning/dataset"
    try:
        batch_requests = process_dataset(dataset_directory)

        output_path = "/mnt/data/users/zys/proj/vlm_reasoning/request/vlm_batch_requests.jsonl"
        # Unserializable requests are logged and skipped rather than aborting the dump.
        with open(output_path, 'w') as sink:
            for req in batch_requests:
                try:
                    sink.write(json.dumps(req, ensure_ascii=False) + '\n')
                except (TypeError, IOError) as e:
                    print(f"Failed to write request: {str(e)}")
                    continue

        print(f"Successfully generated {len(batch_requests)} requests")
    except Exception as e:
        print(f"Critical error occurred: {str(e)}")
        traceback.print_exc()
utils/upload/request_create.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import sys
3
+ import random
4
+ from collections import defaultdict
5
+
6
def collect_dataset_info(file_path):
    """Scan a JSONL file and map each dataset to its 1-based line numbers.

    The dataset name is the portion of each record's ``custom_id`` before the
    first '-'. Malformed lines are reported on stderr and skipped.

    Returns:
        (dataset_lines, order): a ``defaultdict(list)`` of line numbers keyed
        by dataset, and the dataset names in order of first appearance.
    """
    dataset_lines = defaultdict(list)
    order = []

    with open(file_path, 'r') as handle:
        for line_num, raw in enumerate(handle, 1):
            try:
                record = json.loads(raw.strip())
                dataset = record['custom_id'].split('-')[0]
            except json.JSONDecodeError:
                print(f"Error: Invalid JSON at line {line_num}", file=sys.stderr)
                continue
            except KeyError:
                print(f"Error: Missing 'custom_id' at line {line_num}", file=sys.stderr)
                continue
            except IndexError:
                print(f"Error: Invalid custom_id format at line {line_num}", file=sys.stderr)
                continue

            # First sighting of a dataset fixes its position in `order`.
            if dataset not in dataset_lines:
                order.append(dataset)
            dataset_lines[dataset].append(line_num)

    return dataset_lines, order
32
+
33
def main():
    """Stratified-sample N records from a JSONL file into an output file.

    CLI: ``python sample_datasets.py <input.jsonl> <output.jsonl> <N>``.
    Every dataset (prefix of custom_id) receives at least 5 samples; the
    remaining N - 5k samples are split proportionally to each dataset's
    surplus, with largest-remainder rounding. Exits with status 1 on any
    validation failure.
    """
    if len(sys.argv) != 4:
        print("Usage: python sample_datasets.py <input.jsonl> <output.jsonl> <N>")
        sys.exit(1)

    input_file = sys.argv[1]
    output_file = sys.argv[2]
    try:
        N = int(sys.argv[3])
    except ValueError:
        print("Error: N must be an integer.")
        sys.exit(1)

    # Gather per-dataset line numbers and first-appearance order.
    dataset_info, dataset_order = collect_dataset_info(input_file)
    k = len(dataset_info)

    if k == 0:
        print("Error: No datasets found in the input file.")
        sys.exit(1)

    # Every dataset must be able to contribute the 5-sample minimum.
    for dataset, lines in dataset_info.items():
        if len(lines) < 5:
            print(f"Error: Dataset '{dataset}' has fewer than 5 samples.")
            sys.exit(1)

    total_samples = sum(len(lines) for lines in dataset_info.values())
    min_samples = 5 * k

    if N < min_samples or N > total_samples:
        print(f"Error: N must be between {min_samples} and {total_samples}.")
        sys.exit(1)

    # Surplus capacity per dataset after reserving the 5-sample minimum,
    # and R = how many extra samples still need to be distributed.
    available = {dataset: len(lines) - 5 for dataset, lines in dataset_info.items()}
    total_available = sum(available.values())
    R = N - 5 * k

    if R > total_available:
        print(f"Error: Cannot allocate {R} samples from available {total_available}.")
        sys.exit(1)

    # Fractional proportional allocation of the R extra samples.
    allocations = []
    sum_avail = total_available if total_available != 0 else 1  # avoid division by zero

    for dataset in dataset_order:
        avail = available[dataset]
        alloc_float = R * avail / sum_avail
        allocations.append(alloc_float)

    integer_part = [int(alloc) for alloc in allocations]
    remainders = [alloc - int_part for alloc, int_part in zip(allocations, integer_part)]
    remainder_total = R - sum(integer_part)

    # Largest-remainder method: give the leftover units to the datasets with
    # the biggest fractional parts (ties broken by first appearance).
    remainder_indices = sorted(enumerate(remainders), key=lambda x: (-x[1], x[0]))
    for i in range(remainder_total):
        idx = remainder_indices[i][0]
        integer_part[idx] += 1

    # Final per-dataset sample counts = 5-sample floor + allocated extras.
    sample_counts = {}
    for i, dataset in enumerate(dataset_order):
        alloc = integer_part[i]
        if alloc > available[dataset]:
            print(f"Error: Allocation for dataset '{dataset}' exceeds available samples.")
            sys.exit(1)
        sample_counts[dataset] = 5 + alloc

    # Report the planned sampling distribution.
    print("\nSampling Distribution:")
    total_sampled = 0
    for dataset in dataset_order:
        count = sample_counts[dataset]
        total_sampled += count
        print(f" - {dataset}: {count} samples")
    print(f"Total samples: {total_sampled} (target: {N})")

    # Sanity check that the allocation adds up to exactly N.
    if total_sampled != N:
        print(f"Error: Total sampled count mismatch ({total_sampled} vs {N})")
        sys.exit(1)

    # Randomly pick the concrete line numbers for each dataset.
    selected_lines = []
    for dataset in dataset_order:
        lines = dataset_info[dataset]
        count = sample_counts[dataset]
        selected = random.sample(lines, count)
        selected_lines.extend(selected)

    selected_lines.sort()

    # Copy the selected lines to the output in a single ordered pass.
    current_idx = 0
    total_selected = len(selected_lines)

    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line_num, line in enumerate(infile, 1):
            if current_idx >= total_selected:
                break
            if line_num == selected_lines[current_idx]:
                outfile.write(line)
                current_idx += 1

    print(f"\nSuccessfully sampled {N} records to {output_file}.")
141
+
142
# Script entry point.
if __name__ == "__main__":
    main()
utils/upload_test/batch_create.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from openai import OpenAI

client = OpenAI(
    # If the environment variable is not configured, you may replace the next
    # line with api_key="sk-xxx" using an Alibaba Cloud Bailian API key. Hard-
    # coding keys in production code is discouraged as it risks leaking them.
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",  # base_url of the Bailian service
    )

batch = client.batches.create(
    input_file_id="oss:cn-beijing:acm-mm-reason/test/vlm_test100.jsonl",  # file id returned by upload, OSS file URL, or OSS resource identifier
    endpoint="/v1/chat/completions",  # use /v1/embeddings for embedding models, /v1/chat/ds-test for batch-test-model, /v1/chat/completions otherwise
    completion_window="10h",
    metadata={'ds_name':"vlm_test100",'ds_description':'finally test'}
    )
print(batch)
utils/upload_test/batch_download.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from openai import OpenAI

client = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    )
content = client.files.content(file_id="file-batch_output-kJ3eejYgBU2JI3m9D3nTs6Jp")
# Print the contents of the batch result file.
print(content.text)
# Save the result file locally.
content.write_to_file("finderror.jsonl")
utils/upload_test/batch_search.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
import os
from openai import OpenAI

client = OpenAI(
    # If the environment variable is not configured, you may replace the next
    # line with api_key="sk-xxx" using an Alibaba Cloud Bailian API key. Hard-
    # coding keys in production code is discouraged as it risks leaking them.
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",  # base_url of the Bailian service
    )
batch = client.batches.retrieve("batch_d76f011f-e7bb-4686-bc6f-de7a202fa9fa")  # replace with the id of your Batch job
print(batch)
utils/upload_test/finderror.jsonl ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"id":"f112028f-6663-9701-b2d8-6b43ed717e99","custom_id":"Lisa-242-3","response":{"status_code":500,"request_id":"f112028f-6663-9701-b2d8-6b43ed717e99","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
2
+ {"id":"8ffae164-6447-94f4-af7a-f161ad4248ad","custom_id":"Lisa-295-0","response":{"status_code":500,"request_id":"8ffae164-6447-94f4-af7a-f161ad4248ad","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
3
+ {"id":"c455f77b-4285-9d67-8eb4-1913df6c2ccb","custom_id":"Lisa-610-5","response":{"status_code":500,"request_id":"c455f77b-4285-9d67-8eb4-1913df6c2ccb","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
4
+ {"id":"9431003e-9a74-945f-a175-127355b27d76","custom_id":"Lisa-632-5","response":{"status_code":500,"request_id":"9431003e-9a74-945f-a175-127355b27d76","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
5
+ {"id":"850bb280-8d40-9712-81bc-e4b756c09ebe","custom_id":"Lisa-756-0","response":{"status_code":500,"request_id":"850bb280-8d40-9712-81bc-e4b756c09ebe","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
6
+ {"id":"0b06a5c5-311e-935b-8f6c-ecac974900d3","custom_id":"Lisa-1173-0","response":{"status_code":500,"request_id":"0b06a5c5-311e-935b-8f6c-ecac974900d3","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
7
+ {"id":"0a4a3c74-df68-983f-ad91-0e83ded9e082","custom_id":"EmbSpatial-1612-0","response":{"status_code":500,"request_id":"0a4a3c74-df68-983f-ad91-0e83ded9e082","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
8
+ {"id":"91be2da1-efda-9ffe-98f6-31c4e27a3153","custom_id":"EmbSpatial-7398-0","response":{"status_code":500,"request_id":"91be2da1-efda-9ffe-98f6-31c4e27a3153","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
9
+ {"id":"7075f06d-5123-9aed-9927-172df5b27a00","custom_id":"EmbSpatial-8495-0","response":{"status_code":500,"request_id":"7075f06d-5123-9aed-9927-172df5b27a00","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
10
+ {"id":"6c62ced8-1104-93be-abc3-d61cd230ce56","custom_id":"EmbSpatial-11682-0","response":{"status_code":500,"request_id":"6c62ced8-1104-93be-abc3-d61cd230ce56","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
11
+ {"id":"6b2bf3e4-6abf-9722-ae22-bb681a522290","custom_id":"EmbSpatial-12955-0","response":{"status_code":500,"request_id":"6b2bf3e4-6abf-9722-ae22-bb681a522290","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
12
+ {"id":"2fcd6b4c-cfab-9214-92e2-ffc94a0ddee7","custom_id":"EmbSpatial-15924-0","response":{"status_code":500,"request_id":"2fcd6b4c-cfab-9214-92e2-ffc94a0ddee7","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
13
+ {"id":"a8ea0e75-4ac4-97fe-a6d3-1c47ed87390c","custom_id":"EmbSpatial-17091-0","response":{"status_code":500,"request_id":"a8ea0e75-4ac4-97fe-a6d3-1c47ed87390c","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
14
+ {"id":"2f91dc53-2ec0-9780-8855-dc4c63f2a16c","custom_id":"EmbSpatial-17249-0","response":{"status_code":500,"request_id":"2f91dc53-2ec0-9780-8855-dc4c63f2a16c","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
15
+ {"id":"ffe4335a-138c-9a84-a40f-b91c9d58aa65","custom_id":"EmbSpatial-17382-0","response":{"status_code":500,"request_id":"ffe4335a-138c-9a84-a40f-b91c9d58aa65","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
16
+ {"id":"5c753db1-2557-9618-abe3-4e381baa21f4","custom_id":"EmbSpatial-20852-0","response":{"status_code":500,"request_id":"5c753db1-2557-9618-abe3-4e381baa21f4","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
17
+ {"id":"7fb1206e-fd3c-9abb-8bec-89d7d1ca4e3a","custom_id":"EmbSpatial-21270-0","response":{"status_code":500,"request_id":"7fb1206e-fd3c-9abb-8bec-89d7d1ca4e3a","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
18
+ {"id":"1ad37634-ef3d-95fa-823f-3781bc1bb762","custom_id":"EmbSpatial-21954-0","response":{"status_code":500,"request_id":"1ad37634-ef3d-95fa-823f-3781bc1bb762","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
19
+ {"id":"e05b95eb-59a3-9ec7-b059-1c1f9a9e4a96","custom_id":"EmbSpatial-22416-0","response":{"status_code":500,"request_id":"e05b95eb-59a3-9ec7-b059-1c1f9a9e4a96","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
20
+ {"id":"f6b9688f-4204-9fb4-8fd2-86386d78c9d1","custom_id":"EmbSpatial-24631-0","response":{"status_code":500,"request_id":"f6b9688f-4204-9fb4-8fd2-86386d78c9d1","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
21
+ {"id":"f9e7938a-05bf-9760-9f7f-61105c07ffc9","custom_id":"EmbSpatial-26898-0","response":{"status_code":500,"request_id":"f9e7938a-05bf-9760-9f7f-61105c07ffc9","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
22
+ {"id":"b8d7738e-771d-9e41-bb46-9eb99eda7a5c","custom_id":"EmbSpatial-28983-0","response":{"status_code":500,"request_id":"b8d7738e-771d-9e41-bb46-9eb99eda7a5c","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
23
+ {"id":"0493d089-e505-9d15-b0bb-49339a3e9b91","custom_id":"MMR-134-0","response":{"status_code":500,"request_id":"0493d089-e505-9d15-b0bb-49339a3e9b91","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
24
+ {"id":"9008de99-fcdb-95b5-9049-5e8852f7be3d","custom_id":"MMR-171-3","response":{"status_code":500,"request_id":"9008de99-fcdb-95b5-9049-5e8852f7be3d","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
25
+ {"id":"7cb673b1-3351-919b-85d5-d52db8ceed7b","custom_id":"MMR-300-0","response":{"status_code":500,"request_id":"7cb673b1-3351-919b-85d5-d52db8ceed7b","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
26
+ {"id":"44af9316-adf4-9de8-9b45-c0b2b83afb89","custom_id":"MMR-305-0","response":{"status_code":500,"request_id":"44af9316-adf4-9de8-9b45-c0b2b83afb89","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
27
+ {"id":"cfa4c691-c810-904a-b4c4-659cd9b62b66","custom_id":"MMR-331-2","response":{"status_code":500,"request_id":"cfa4c691-c810-904a-b4c4-659cd9b62b66","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
28
+ {"id":"0481df84-9d4e-9fa7-b6ac-bbaec8effd99","custom_id":"MMR-725-2","response":{"status_code":500,"request_id":"0481df84-9d4e-9fa7-b6ac-bbaec8effd99","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
29
+ {"id":"e7a559af-9bda-902c-9ae9-6ac8f0613f80","custom_id":"MMR-2275-0","response":{"status_code":500,"request_id":"e7a559af-9bda-902c-9ae9-6ac8f0613f80","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
30
+ {"id":"0e155d01-d5ff-952c-acb3-9570c8d7e807","custom_id":"MMR-2468-1","response":{"status_code":500,"request_id":"0e155d01-d5ff-952c-acb3-9570c8d7e807","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
31
+ {"id":"b61ca940-e684-9b3a-8130-c69843bf77ac","custom_id":"MMR-2935-2","response":{"status_code":500,"request_id":"b61ca940-e684-9b3a-8130-c69843bf77ac","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
32
+ {"id":"8728de73-ddd0-9af0-b009-1d7aaad20139","custom_id":"MMR-3490-2","response":{"status_code":500,"request_id":"8728de73-ddd0-9af0-b009-1d7aaad20139","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
33
+ {"id":"364deec3-8f67-9cd1-91c8-2ab76d5be7d5","custom_id":"MMR-3919-1","response":{"status_code":500,"request_id":"364deec3-8f67-9cd1-91c8-2ab76d5be7d5","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
34
+ {"id":"ed04af32-3d22-93ca-8bbf-c629184d8451","custom_id":"MMR-4067-2","response":{"status_code":500,"request_id":"ed04af32-3d22-93ca-8bbf-c629184d8451","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
35
+ {"id":"c69e6583-90f1-9a47-ac7d-776130e36cf6","custom_id":"MMR-4297-2","response":{"status_code":500,"request_id":"c69e6583-90f1-9a47-ac7d-776130e36cf6","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
36
+ {"id":"1d16d552-70ae-916e-aa15-75936f932364","custom_id":"MMR-5152-2","response":{"status_code":500,"request_id":"1d16d552-70ae-916e-aa15-75936f932364","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
37
+ {"id":"69f9beb7-8769-9c45-904a-150e0a25d1ef","custom_id":"MMR-5319-0","response":{"status_code":500,"request_id":"69f9beb7-8769-9c45-904a-150e0a25d1ef","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
38
+ {"id":"ed27aa37-cb20-9659-9191-7103b4632c03","custom_id":"MMR-6139-3","response":{"status_code":500,"request_id":"ed27aa37-cb20-9659-9191-7103b4632c03","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
39
+ {"id":"759a6605-f409-9f2c-9b75-58f4e36a46e5","custom_id":"MMR-6793-1","response":{"status_code":500,"request_id":"759a6605-f409-9f2c-9b75-58f4e36a46e5","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
40
+ {"id":"0c0692be-2591-98f6-aa34-11fa21e1d7f1","custom_id":"MMR-9581-2","response":{"status_code":500,"request_id":"0c0692be-2591-98f6-aa34-11fa21e1d7f1","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
41
+ {"id":"7edf6a92-fa60-9195-844c-f7f9a4ab6432","custom_id":"MMR-10764-2","response":{"status_code":500,"request_id":"7edf6a92-fa60-9195-844c-f7f9a4ab6432","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
42
+ {"id":"266f796f-e2bb-9925-93d8-ddcbeecd2e14","custom_id":"MMR-10966-1","response":{"status_code":500,"request_id":"266f796f-e2bb-9925-93d8-ddcbeecd2e14","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
43
+ {"id":"5fcc4302-8845-974d-a377-e80a401d7821","custom_id":"MMR-11897-2","response":{"status_code":500,"request_id":"5fcc4302-8845-974d-a377-e80a401d7821","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
44
+ {"id":"df749e88-6d5e-98a2-b6af-b9f90bc99aad","custom_id":"MMR-12199-2","response":{"status_code":500,"request_id":"df749e88-6d5e-98a2-b6af-b9f90bc99aad","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
45
+ {"id":"19d8e5a9-0991-9d18-83d9-3e9e7b3436d0","custom_id":"MMR-12378-1","response":{"status_code":500,"request_id":"19d8e5a9-0991-9d18-83d9-3e9e7b3436d0","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
46
+ {"id":"41baf8eb-cc41-90c4-bff5-cdaeca7d9a9f","custom_id":"MMR-13621-2","response":{"status_code":500,"request_id":"41baf8eb-cc41-90c4-bff5-cdaeca7d9a9f","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
47
+ {"id":"b0326514-4dad-9852-aea2-bb62a49a6cba","custom_id":"MMR-14582-1","response":{"status_code":500,"request_id":"b0326514-4dad-9852-aea2-bb62a49a6cba","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
48
+ {"id":"dae759d3-8319-9d5c-bf76-a7f9637f8149","custom_id":"MMR-14908-1","response":{"status_code":500,"request_id":"dae759d3-8319-9d5c-bf76-a7f9637f8149","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
49
+ {"id":"1b4b87b0-19d9-9720-87ae-8890dca85514","custom_id":"MMR-15221-0","response":{"status_code":500,"request_id":"1b4b87b0-19d9-9720-87ae-8890dca85514","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
50
+ {"id":"8fbda12c-f543-909f-90cd-ab5a43c33402","custom_id":"MMR-16040-2","response":{"status_code":500,"request_id":"8fbda12c-f543-909f-90cd-ab5a43c33402","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
51
+ {"id":"de5b38ae-5205-9692-b40a-50bf0b9f2e56","custom_id":"MMR-18168-0","response":{"status_code":500,"request_id":"de5b38ae-5205-9692-b40a-50bf0b9f2e56","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
52
+ {"id":"a8dc2936-82d9-99d6-b837-8961e5abfeec","custom_id":"MMR-20306-1","response":{"status_code":500,"request_id":"a8dc2936-82d9-99d6-b837-8961e5abfeec","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
53
+ {"id":"0e58d5cf-fdb9-9d19-ac70-aeea67e126fa","custom_id":"MMR-21718-0","response":{"status_code":500,"request_id":"0e58d5cf-fdb9-9d19-ac70-aeea67e126fa","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
54
+ {"id":"b171708f-b8ab-904b-b5c4-1168a3b4d834","custom_id":"MMR-22581-1","response":{"status_code":500,"request_id":"b171708f-b8ab-904b-b5c4-1168a3b4d834","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
55
+ {"id":"99bca0c8-e7b6-93ee-a328-bd190b467738","custom_id":"MMR-23390-1","response":{"status_code":500,"request_id":"99bca0c8-e7b6-93ee-a328-bd190b467738","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
56
+ {"id":"69508cdd-7560-93f1-9bee-4bc8dbc0ef44","custom_id":"MMR-23603-2","response":{"status_code":500,"request_id":"69508cdd-7560-93f1-9bee-4bc8dbc0ef44","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
57
+ {"id":"f475ba96-b312-9452-bb12-166259ea959e","custom_id":"MMR-24700-2","response":{"status_code":500,"request_id":"f475ba96-b312-9452-bb12-166259ea959e","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
58
+ {"id":"afb8e374-850e-9b2a-a899-f49b18c67620","custom_id":"MMR-27112-0","response":{"status_code":500,"request_id":"afb8e374-850e-9b2a-a899-f49b18c67620","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
59
+ {"id":"b65215bd-6b0f-960b-8507-79cf0d6c3923","custom_id":"MMR-27435-2","response":{"status_code":500,"request_id":"b65215bd-6b0f-960b-8507-79cf0d6c3923","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
60
+ {"id":"cd9b192e-621c-952a-9686-63b310528e52","custom_id":"MMR-27983-4","response":{"status_code":500,"request_id":"cd9b192e-621c-952a-9686-63b310528e52","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
61
+ {"id":"cd26cd84-e0c3-973e-8425-a84682693406","custom_id":"MMR-29200-1","response":{"status_code":500,"request_id":"cd26cd84-e0c3-973e-8425-a84682693406","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
62
+ {"id":"c086beee-ac4a-9fc0-87b8-b3ffdf27f2d0","custom_id":"MMR-29279-1","response":{"status_code":500,"request_id":"c086beee-ac4a-9fc0-87b8-b3ffdf27f2d0","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
63
+ {"id":"55bc024f-de7a-949f-8080-2d2b3a9cd28d","custom_id":"MMR-30032-1","response":{"status_code":500,"request_id":"55bc024f-de7a-949f-8080-2d2b3a9cd28d","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
64
+ {"id":"9fe5df9b-5be5-9a55-a47a-b0fcfad916c6","custom_id":"MMR-32992-2","response":{"status_code":500,"request_id":"9fe5df9b-5be5-9a55-a47a-b0fcfad916c6","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
65
+ {"id":"310d5d06-8e2f-936c-addc-d2e7793a9ff3","custom_id":"MMR-33740-0","response":{"status_code":500,"request_id":"310d5d06-8e2f-936c-addc-d2e7793a9ff3","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
66
+ {"id":"2ef5cfbe-a8c3-90e9-b606-3ef40b006f86","custom_id":"MMR-33998-3","response":{"status_code":500,"request_id":"2ef5cfbe-a8c3-90e9-b606-3ef40b006f86","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
67
+ {"id":"43aba159-dc9a-96a3-8939-cc0147d7985c","custom_id":"MMR-34992-2","response":{"status_code":500,"request_id":"43aba159-dc9a-96a3-8939-cc0147d7985c","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
68
+ {"id":"5b7c27ab-6322-93cc-8bea-34e0cdcec92e","custom_id":"MMR-37312-1","response":{"status_code":500,"request_id":"5b7c27ab-6322-93cc-8bea-34e0cdcec92e","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
69
+ {"id":"89eea748-43c8-9a28-aa7a-140dfd40b2d8","custom_id":"MMR-37405-0","response":{"status_code":500,"request_id":"89eea748-43c8-9a28-aa7a-140dfd40b2d8","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
70
+ {"id":"6f08f754-005d-9773-8d61-cd22a5be7535","custom_id":"MMR-37799-1","response":{"status_code":500,"request_id":"6f08f754-005d-9773-8d61-cd22a5be7535","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
71
+ {"id":"0e4cfcb7-ead6-9c03-8f1a-81546cbcff69","custom_id":"MMR-38631-1","response":{"status_code":500,"request_id":"0e4cfcb7-ead6-9c03-8f1a-81546cbcff69","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
72
+ {"id":"7350938b-9728-944b-9081-7fe40444b1ff","custom_id":"MMR-39225-0","response":{"status_code":500,"request_id":"7350938b-9728-944b-9081-7fe40444b1ff","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
73
+ {"id":"c2205a9a-6908-98a9-ad0b-ab2530fb3018","custom_id":"MMR-39575-1","response":{"status_code":500,"request_id":"c2205a9a-6908-98a9-ad0b-ab2530fb3018","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
74
+ {"id":"97f6fd35-9361-9395-a59c-b81899d5fe17","custom_id":"MMR-39635-0","response":{"status_code":500,"request_id":"97f6fd35-9361-9395-a59c-b81899d5fe17","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
75
+ {"id":"fb8709ec-c35f-97e0-96f3-93187c9bb24d","custom_id":"MMR-41248-2","response":{"status_code":500,"request_id":"fb8709ec-c35f-97e0-96f3-93187c9bb24d","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
76
+ {"id":"3e40b136-f3a5-92ce-9b1e-61a786324e7f","custom_id":"MMR-41383-1","response":{"status_code":500,"request_id":"3e40b136-f3a5-92ce-9b1e-61a786324e7f","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
77
+ {"id":"75fcfb45-cf9f-9208-b296-e3a66c7dfddc","custom_id":"MMR-41541-1","response":{"status_code":500,"request_id":"75fcfb45-cf9f-9208-b296-e3a66c7dfddc","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
78
+ {"id":"4fb406a4-6489-92a0-8fee-23e39ec6374a","custom_id":"MMR-42068-1","response":{"status_code":500,"request_id":"4fb406a4-6489-92a0-8fee-23e39ec6374a","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
79
+ {"id":"04b3516c-233c-9da0-aba4-76ef1c9730ec","custom_id":"MMR-43502-1","response":{"status_code":500,"request_id":"04b3516c-233c-9da0-aba4-76ef1c9730ec","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
80
+ {"id":"7e65fa6d-af65-9d65-b29c-eb14ed0d3c04","custom_id":"MMR-43504-2","response":{"status_code":500,"request_id":"7e65fa6d-af65-9d65-b29c-eb14ed0d3c04","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
81
+ {"id":"65c04f66-673c-95f7-ae40-55095b8a649d","custom_id":"MMR-43758-0","response":{"status_code":500,"request_id":"65c04f66-673c-95f7-ae40-55095b8a649d","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
82
+ {"id":"4c2353ce-8ffd-9050-bcd4-7227e1b9c88c","custom_id":"MMR-43992-3","response":{"status_code":500,"request_id":"4c2353ce-8ffd-9050-bcd4-7227e1b9c88c","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
83
+ {"id":"bffd08ff-1d4b-9534-a060-f4e3b284f561","custom_id":"MMR-44141-0","response":{"status_code":500,"request_id":"bffd08ff-1d4b-9534-a060-f4e3b284f561","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
84
+ {"id":"e9e4361d-a1fe-9a02-b737-0c0d9be54a88","custom_id":"MMR-45554-1","response":{"status_code":500,"request_id":"e9e4361d-a1fe-9a02-b737-0c0d9be54a88","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
85
+ {"id":"5f5abc51-0bfb-9535-86e1-24e62ccf7b14","custom_id":"MMR-48256-0","response":{"status_code":500,"request_id":"5f5abc51-0bfb-9535-86e1-24e62ccf7b14","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
86
+ {"id":"aa572146-1c97-97bc-a6a5-b014ddb3944e","custom_id":"MMR-48592-1","response":{"status_code":500,"request_id":"aa572146-1c97-97bc-a6a5-b014ddb3944e","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
87
+ {"id":"987ea0a7-80e3-9480-b35f-b8a2274c3f0b","custom_id":"MMR-49126-2","response":{"status_code":500,"request_id":"987ea0a7-80e3-9480-b35f-b8a2274c3f0b","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
88
+ {"id":"ae4aed04-b35d-93de-9749-03072a4fc5fb","custom_id":"MMR-49269-0","response":{"status_code":500,"request_id":"ae4aed04-b35d-93de-9749-03072a4fc5fb","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
89
+ {"id":"f30dcb02-9cac-9aaf-b614-f2cbd4f36287","custom_id":"MMR-49653-1","response":{"status_code":500,"request_id":"f30dcb02-9cac-9aaf-b614-f2cbd4f36287","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
90
+ {"id":"481cf69b-81cf-921d-906d-36a443cdad2f","custom_id":"MMR-50614-1","response":{"status_code":500,"request_id":"481cf69b-81cf-921d-906d-36a443cdad2f","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
91
+ {"id":"d87f29c5-ae32-9b98-a498-e2c5ce218c81","custom_id":"MMR-52000-2","response":{"status_code":500,"request_id":"d87f29c5-ae32-9b98-a498-e2c5ce218c81","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
92
+ {"id":"9d2bec62-161a-9baf-9110-8ce8bc1aac2c","custom_id":"MMR-52537-0","response":{"status_code":500,"request_id":"9d2bec62-161a-9baf-9110-8ce8bc1aac2c","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
93
+ {"id":"56ccda42-f2ba-9415-afce-a149cf945fd3","custom_id":"MMR-54912-1","response":{"status_code":500,"request_id":"56ccda42-f2ba-9415-afce-a149cf945fd3","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}
94
+ {"id":"b693bc74-18f8-9858-bba3-3f5e153bacce","custom_id":"MMR-57095-2","response":{"status_code":500,"request_id":"b693bc74-18f8-9858-bba3-3f5e153bacce","body":{"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":"internal_error"}}},"error":{"code":"internal_error","param":null,"message":"An internal error has occured, please try again later or contact service support.","type":null}}