pengcc1
/

V2

Model card Files Files and versions

xet

Community

pengcc1 committed on Sep 26, 2024

Commit

a7d4c7b

verified ·

1 Parent(s): 0ef318d

Upload extract_questions.py with huggingface_hub

Browse files

Files changed (1) hide show

extract_questions.py +172 -0

extract_questions.py ADDED Viewed

	@@ -0,0 +1,172 @@

+import os
+import json
+import shutil
+# 读取关键词文件并构建关键词映射字典
+keyword_file = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT/output_multi_column.txt'
+keyword_dict = {}
+with open(keyword_file, 'r', encoding='utf-8') as f:
+    for line in f:
+        line = line.strip()
+        if not line:
+            continue  # 跳过空行
+        parts = line.split(',')
+        if len(parts) != 4:
+            print(f"格式错误，跳过此行：{line}")
+            continue
+        keyword, department, task, modality = [p.strip() for p in parts]
+        keyword_dict[keyword] = {
+            'department': department,
+            'task': task,
+            'modality': modality
+        }
+print(f"总共加载了 {len(keyword_dict)} 个关键词。")
+# Departments to process. Only questions whose keyword maps to one of these
+# names are copied. Note that one entry contains a '/', which
+# get_department_dir_name() replaces with a plain directory name.
+departments = [
+    'Cardiovascular Surgery',
+    'Dermatology',
+    'Endocrinology',
+    'Gastroenterology and Hepatology',
+    'General Surgery',
+    'Hematology',
+    'Infectious Diseases',
+    'Laboratory Medicine and Pathology',
+    'Nephrology and Hypertension',
+    'Neurosurgery',
+    'Obstetrics and Gynecology',
+    'Oncology (Medical)',
+    'Ophthalmology',
+    'Orthopedic Surgery',
+    'Otolaryngology (ENT)/Head and Neck Surgery',
+    'Pulmonary Medicine',
+    'Sports Medicine',
+    'Urology'
+]
+# 创建科室到目录名称的映射，处理特殊情况
+def get_department_dir_name(department):
+    if department == 'Otolaryngology (ENT)/Head and Neck Surgery':
+        return 'Otolaryngology (ENT)'
+    else:
+        return department
+# 将科室列表转换为集合，方便查找
+departments_set = set(departments)
+# 定义源目录列表
+source_dirs = [
+    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/cls_2d',
+    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/det_2d',
+    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_2d',
+    '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI/semantic_seg_3d'
+]
+# 定义目标基础目录
+destination_root = '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI-MMbench-CoT'
+# 用于统计和调试
+total_files_processed = 0
+files_matched = 0
+images_copied = 0
+# 用于统计每个科室的匹配文件数
+department_file_counts = {dept: 0 for dept in departments}
+# 要处理的图片键列表
+image_keys = ['img_mask_path', 'img_contour_path', 'img_bbox_path', 'img_path']
+# 遍历每个源目录
+for source_dir in source_dirs:
+    print(f"正在遍历目录：{source_dir}")
+    for root, dirs, files in os.walk(source_dir):
+        for file in files:
+            if file.endswith('.json'):
+                total_files_processed += 1
+                source_file_path = os.path.join(root, file)
+                try:
+                    with open(source_file_path, 'r', encoding='utf-8') as f:
+                        data = json.load(f)
+                    answer_letter = data.get('answer', '').strip()
+                    options = data.get('options', [])
+                    if not answer_letter or not options:
+                        print(f"文件缺少 'answer' 或 'options' 字段，跳过：{source_file_path}")
+                        continue
+                    # 创建选项字典，映射字母到选项文本
+                    option_dict = {}
+                    for opt in options:
+                        if len(opt) > 2 and opt[1] == '.':
+                            opt_letter = opt[0]
+                            opt_text = opt[3:].strip()
+                            option_dict[opt_letter] = opt_text
+                        else:
+                            print(f"选项格式错误，文件：{source_file_path}，选项：{opt}")
+                    # 获取关键词
+                    keyword = option_dict.get(answer_letter)
+                    if not keyword:
+                        print(f"答案字母 '{answer_letter}' 在选项中未找到，文件：{source_file_path}")
+                        continue
+                    print(f"处理文件：{source_file_path}")
+                    print(f"关键词：'{keyword}'")
+                    # 检查关键词是否在关键词字典中
+                    if keyword in keyword_dict:
+                        department_info = keyword_dict[keyword]
+                        department = department_info['department']
+                        print(f"关键词 '{keyword}' 的科室为：'{department}'")
+                        if department in departments_set:
+                            files_matched += 1
+                            department_dir_name = get_department_dir_name(department)
+                            destination_base = os.path.join(destination_root, department_dir_name)
+                            # 构造目标文件路径
+                            relative_path = os.path.relpath(source_file_path, '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI')
+                            destination_file_path = os.path.join(destination_base, relative_path)
+                            # 创建目标目录（如果不存在）
+                            destination_dir = os.path.dirname(destination_file_path)
+                            if not os.path.exists(destination_dir):
+                                os.makedirs(destination_dir)
+                                print(f"创建目录：{destination_dir}")
+                            # 复制JSON文件
+                            shutil.copy2(source_file_path, destination_file_path)
+                            print(f"已复制文件到：{destination_file_path}")
+                            # 处理并复制图片
+                            for image_key in image_keys:
+                                if image_key in data:
+                                    image_path = data[image_key]
+                                    # 图片路径是相对于 source_dir + '/images' 的
+                                    source_image_path = os.path.join(source_dir, 'images', image_path)
+                                    if not os.path.exists(source_image_path):
+                                        print(f"源图片不存在，跳过：{source_image_path}")
+                                        continue
+                                    # 构造相对路径，从 GMAI 之后开始，包括 'images' 目录
+                                    relative_image_path = os.path.relpath(source_image_path, '/mnt/petrelfs/chenpengcheng/benchmark_preprocess/GMAI')
+                                    # 构造目标图片路径
+                                    destination_image_path = os.path.join(destination_base, relative_image_path)
+                                    destination_image_dir = os.path.dirname(destination_image_path)
+                                    if not os.path.exists(destination_image_dir):
+                                        os.makedirs(destination_image_dir)
+                                        print(f"创建图片目录：{destination_image_dir}")
+                                    # 复制图片文件
+                                    shutil.copy2(source_image_path, destination_image_path)
+                                    images_copied += 1
+                                    print(f"已复制图片到：{destination_image_path}")
+                            # 增加对应科室的文件计数
+                            department_file_counts[department] += 1
+                        else:
+                            print(f"科室 '{department}' 不在处理列表中，不复制文件。")
+                    else:
+                        print(f"关键词 '{keyword}' 不在关键词列表中。")
+                except Exception as e:
+                    print(f"处理文件 {source_file_path} 时发生错误：{e}")
+print(f"总共处理了 {total_files_processed} 个 JSON 文件。")
+print(f"总共匹配并复制了 {files_matched} 个 JSON 文件。")
+print(f"总共复制了 {images_copied} 张图片。")
+# 打印每个科室的文件计数
+print("每个科室匹配并复制的文件数量：")
+for dept in departments:
+    count = department_file_counts[dept]
+    dept_dir_name = get_department_dir_name(dept)
+    print(f"{dept_dir_name}: {count} 个文件")