| import json | |
| import os | |
| import re | |
| from pathlib import Path | |
# Matches a trailing sample-count suffix such as "_80.0k.json" or "_20k.json".
# Compiled once at import time instead of once per processed file.
_COUNT_SUFFIX_RE = re.compile(r'(_[0-9.]+[kK]\.json)$')


def _is_not_multi_image(item):
    """Return True for text-only samples (no 'image' key) or single-image samples ('image' is a str)."""
    if 'image' not in item:
        return True
    return isinstance(item['image'], str)


def _build_filtered_name(base_name, kept_count):
    """Derive the output file name for ``base_name`` given the number of kept samples.

    A trailing count suffix (e.g. ``_80.0k.json``) is replaced by
    ``_without_multi_image_<count>k.json``; when no such suffix exists the
    marker is inserted before the existing extension instead.
    """
    # Count is expressed in thousands with one decimal, e.g. 55300 -> "55.3k".
    count_str = f"{kept_count / 1000.0:.1f}k"
    match = _COUNT_SUFFIX_RE.search(base_name)
    if match:
        # e.g. 'file_80.0k.json' -> 'file_without_multi_image_55.3k.json'
        return (
            base_name[:match.start()]
            + f'_without_multi_image_{count_str}.json'
            + base_name[match.end():]
        )
    # e.g. 'file.json' -> 'file_without_multi_image_55.3k.json'
    base_wo_ext, ext = os.path.splitext(base_name)
    return f"{base_wo_ext}_without_multi_image_{count_str}{ext}"


def filter_multi_image_samples(file_path_list: list) -> list:
    """
    Write a multi-image-free copy of each JSON file in ``file_path_list``.

    Each file is expected to contain a JSON list of samples. A sample is kept
    when it has no 'image' key (text-only) or when its 'image' value is a
    string (single image); every other sample is dropped. The kept samples
    are saved next to the input file under a name that carries the
    ``_without_multi_image_`` marker and the new sample count.

    Progress, warnings and errors are reported with ``print``. Missing files,
    files whose top-level JSON value is not a list, and files that fail to
    load or save are skipped without aborting the remaining files.

    Args:
        file_path_list: Paths of the JSON files to process.

    Returns:
        The paths of the files that were successfully written, in processing
        order.
    """
    print("--- Starting multi-image filter process ---")
    written_paths = []

    # 1. Iterate through the input file list.
    for file_path in file_path_list:
        if not os.path.exists(file_path):
            print(f"⚠️ Warning: File not found, skipping: {file_path}")
            continue
        print(f"\n🔄 Processing: {file_path}")
        try:
            # 2. Open and load the JSON file.
            with open(file_path, 'r', encoding='utf-8') as f:
                data_list = json.load(f)
            if not isinstance(data_list, list):
                print(f" ⚠️ Warning: Content of {file_path} is not a list. Skipping.")
                continue

            # 3. Keep only text-only and single-image samples.
            filtered_data_list = [
                item for item in data_list if _is_not_multi_image(item)
            ]
            original_count = len(data_list)
            new_count = len(filtered_data_list)
            print(f" ℹ️ Filtered: Kept {new_count} samples (out of {original_count}).")

            # 4. Build the output path in the same directory as the input.
            new_file_path = os.path.join(
                os.path.dirname(file_path),
                _build_filtered_name(os.path.basename(file_path), new_count),
            )

            # 5. Save the filtered list to the new file.
            with open(new_file_path, 'w', encoding='utf-8') as f:
                json.dump(filtered_data_list, f, indent=2, ensure_ascii=False)
            print(f" ✅ Successfully saved filtered data to: {new_file_path}")
            written_paths.append(new_file_path)
        except json.JSONDecodeError:
            print(f" ❌ Error: Could not decode JSON from {file_path}.")
        except Exception as e:
            # Batch boundary: report the failure and move on to the next file.
            print(f" ❌ An unknown error occurred ({file_path}): {e}")

    return written_paths
# --- Script entry point: run the filter over a fixed set of dataset files ---
if __name__ == "__main__":
    # Directory that holds the fine-tuning JSON files.
    base_path = "/data/shared/Qwen/Fine-tuning-data/"

    # Alternative input set, currently disabled:
    # json_files_to_process = [
    #     # 20k individual datasets
    #     base_path + "single_PRISM_20k.json",
    #     base_path + "single_RefSpatial_20.0k.json",
    #     base_path + "single_RoboSpatial_20.0k.json",
    #     base_path + "single_SAT_20.0k.json",
    #     base_path + "single_Spatial457_20k.json",
    #     base_path + "single_SPAR-7M_20.0k.json",
    #     # 80k individual datasets
    #     base_path + "single_PRISM_80k.json",
    #     base_path + "single_RefSpatial_80.0k.json",
    #     base_path + "single_RoboSpatial_80.0k.json",
    #     base_path + "single_SAT_80.0k.json",
    #     base_path + "single_Spatial457_23.8k.json",
    #     base_path + "single_SPAR-7M_80.0k.json",
    #     # 20k Top3 datasets
    #     base_path + "top3_action_reasoning_PRISM_SAT_RefSpatial_20k.json",
    #     base_path + "top3_multi-view_reasoning_RefSpatial_20.0k.json",
    #     base_path + "top3_other_RefSpatial_SAT_20k.json",
    #     base_path + "top3_pointing_RefSpatial_20.0k.json",
    #     base_path + "top3_spatial_reasoning_RefSpatial_20.0k.json",
    #     base_path + "top3_state_estimation_RefSpatial_20.0k.json",
    #     base_path + "top3_task_reasoning_RefSpatial_20.0k.json",
    #     base_path + "top3_trajectory_reasoning_SAT_RefSpatial_20k.json",
    #     # 80k Top3 datasets
    #     base_path + "top3_action_reasoning_PRISM_SAT_RefSpatial_80k.json",
    #     base_path + "top3_multi-view_reasoning_RefSpatial_80.0k.json",
    #     base_path + "top3_other_RefSpatial_SAT_80k.json",
    #     base_path + "top3_pointing_spatial_reasoning_state_estimation_task_reasoning_RefSpatial_80.0k.json",
    #     base_path + "top3_trajectory_reasoning_SAT_RefSpatial_80k.json",
    #     # Data-scale experiment file
    #     base_path + "data_scale_exp_SAT_RefSpatial_SPAR-7M_RoboSpatial_PRISM_80.0k.json"
    # ]

    # Active input set: the data-scale experiment files (80k / 400k / 800k).
    dataset_file_names = (
        "data_scale_exp_SAT_RefSpatial_SPAR-7M_RoboSpatial_PRISM_80.0k.json",
        "data_scale_exp_SAT_RefSpatial_SPAR-7M_RoboSpatial_PRISM_400.0k.json",
        "data_scale_exp_SAT_RefSpatial_SPAR-7M_RoboSpatial_PRISM_800.0k.json",
    )
    # base_path already ends with a separator, so joining yields the same
    # strings as plain concatenation.
    json_files_to_process = [
        os.path.join(base_path, name) for name in dataset_file_names
    ]

    # Filter every listed file and write the results alongside the inputs.
    filter_multi_image_samples(json_files_to_process)