import json
import os
import re
from pathlib import Path
# Matches a trailing sample-count suffix such as "_80.0k.json" or "_20K.json".
# Compiled once here so the per-file loop does not rebuild it.
_COUNT_SUFFIX_RE = re.compile(r'(_[0-9.]+[kK]\.json)$')


def _is_text_or_single_image(item):
    """Return True when `item` has no 'image' key or its 'image' is a str."""
    return 'image' not in item or isinstance(item['image'], str)


def _build_filtered_name(base_name, kept_count):
    """Return the output file name for `base_name` holding `kept_count` samples.

    The count is rendered in thousands with one decimal (e.g. 55300 -> "55.3k").
    A trailing "_<count>k.json" suffix is replaced; otherwise the new suffix is
    inserted before the existing extension.
    """
    count_str = f"{kept_count / 1000.0:.1f}k"
    if _COUNT_SUFFIX_RE.search(base_name):
        # e.g. 'file_80.0k.json' -> 'file_without_multi_image_55.3k.json'
        return _COUNT_SUFFIX_RE.sub(
            f'_without_multi_image_{count_str}.json', base_name
        )
    # e.g. 'file.json' -> 'file_without_multi_image_55.3k.json'
    stem, ext = os.path.splitext(base_name)
    return f"{stem}_without_multi_image_{count_str}{ext}"


def filter_multi_image_samples(file_path_list: list) -> list:
    """
    Write a multi-image-free copy of every JSON file in `file_path_list`.

    Each file is expected to contain a JSON list of samples. A sample is kept
    when it has no 'image' key (text-only) or when its 'image' value is a
    string (single image); every other sample is dropped. The kept samples are
    saved next to the input file under a name that carries the
    '_without_multi_image_<count>k' suffix. Input files are never modified.

    Problems are reported on stdout and the affected file is skipped: missing
    files, content that is not a JSON list, undecodable JSON, and any other
    exception raised while processing one file.

    Args:
        file_path_list: Paths of the JSON files to process.

    Returns:
        The paths of the output files that were successfully written, in
        processing order.
    """
    print("--- Starting multi-image filter process ---")
    written_paths = []

    for file_path in file_path_list:
        if not os.path.exists(file_path):
            print(f"⚠️ Warning: File not found, skipping: {file_path}")
            continue
        print(f"\n🔄 Processing: {file_path}")

        # Batch boundary: a failure in one file must not stop the others.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data_list = json.load(f)
            if not isinstance(data_list, list):
                print(f" ⚠️ Warning: Content of {file_path} is not a list. Skipping.")
                continue

            # Keep text-only and single-image samples; drop everything else.
            filtered_data_list = [
                item for item in data_list if _is_text_or_single_image(item)
            ]
            original_count = len(data_list)
            new_count = len(filtered_data_list)
            print(f" ℹ️ Filtered: Kept {new_count} samples (out of {original_count}).")

            new_file_path = os.path.join(
                os.path.dirname(file_path),
                _build_filtered_name(os.path.basename(file_path), new_count),
            )
            with open(new_file_path, 'w', encoding='utf-8') as f:
                json.dump(filtered_data_list, f, indent=2, ensure_ascii=False)
            print(f" ✅ Successfully saved filtered data to: {new_file_path}")
            written_paths.append(new_file_path)
        except json.JSONDecodeError:
            print(f" ❌ Error: Could not decode JSON from {file_path}.")
        except Exception as e:
            print(f" ❌ An unknown error occurred ({file_path}): {e}")

    return written_paths
# --- Example usage of the script ---
if __name__ == "__main__":
    # Directory prefix shared by every dataset file below (ends with "/").
    base_path = "/data/shared/Qwen/Fine-tuning-data/"

    # Disabled alternative input set, kept for reference:
    # json_files_to_process = [
    #     # 20k individual datasets
    #     base_path + "single_PRISM_20k.json",
    #     base_path + "single_RefSpatial_20.0k.json",
    #     base_path + "single_RoboSpatial_20.0k.json",
    #     base_path + "single_SAT_20.0k.json",
    #     base_path + "single_Spatial457_20k.json",
    #     base_path + "single_SPAR-7M_20.0k.json",
    #     # 80k individual datasets
    #     base_path + "single_PRISM_80k.json",
    #     base_path + "single_RefSpatial_80.0k.json",
    #     base_path + "single_RoboSpatial_80.0k.json",
    #     base_path + "single_SAT_80.0k.json",
    #     base_path + "single_Spatial457_23.8k.json",
    #     base_path + "single_SPAR-7M_80.0k.json",
    #     # 20k Top3 datasets
    #     base_path + "top3_action_reasoning_PRISM_SAT_RefSpatial_20k.json",
    #     base_path + "top3_multi-view_reasoning_RefSpatial_20.0k.json",
    #     base_path + "top3_other_RefSpatial_SAT_20k.json",
    #     base_path + "top3_pointing_RefSpatial_20.0k.json",
    #     base_path + "top3_spatial_reasoning_RefSpatial_20.0k.json",
    #     base_path + "top3_state_estimation_RefSpatial_20.0k.json",
    #     base_path + "top3_task_reasoning_RefSpatial_20.0k.json",
    #     base_path + "top3_trajectory_reasoning_SAT_RefSpatial_20k.json",
    #     # 80k Top3 datasets
    #     base_path + "top3_action_reasoning_PRISM_SAT_RefSpatial_80k.json",
    #     base_path + "top3_multi-view_reasoning_RefSpatial_80.0k.json",
    #     base_path + "top3_other_RefSpatial_SAT_80k.json",
    #     base_path + "top3_pointing_spatial_reasoning_state_estimation_task_reasoning_RefSpatial_80.0k.json",
    #     base_path + "top3_trajectory_reasoning_SAT_RefSpatial_80k.json",
    #     # The user's example file from the prompt
    #     base_path + "data_scale_exp_SAT_RefSpatial_SPAR-7M_RoboSpatial_PRISM_80.0k.json"
    # ]

    # Active input set: the data-scale experiment files.
    dataset_file_names = (
        "data_scale_exp_SAT_RefSpatial_SPAR-7M_RoboSpatial_PRISM_80.0k.json",
        "data_scale_exp_SAT_RefSpatial_SPAR-7M_RoboSpatial_PRISM_400.0k.json",
        "data_scale_exp_SAT_RefSpatial_SPAR-7M_RoboSpatial_PRISM_800.0k.json",
    )
    json_files_to_process = [base_path + name for name in dataset_file_names]

    # Filter every listed file and write the results beside the inputs.
    filter_multi_image_samples(json_files_to_process)