# File size: 6,048 Bytes
#
# 849ca03

import json
import os
import re
from pathlib import Path

# Matches a trailing sample-count suffix such as '_80.0k.json' or '_20K.json'.
# Compiled once at import time instead of once per processed file.
_COUNT_SUFFIX_PATTERN = re.compile(r'(_[0-9.]+[kK]\.json)$')


def _is_kept_sample(item) -> bool:
    """Return True when *item* should survive the multi-image filter."""
    # Entries that are not JSON objects have no 'image' field at all, so they
    # cannot be multi-image samples. Passing them through (instead of probing
    # them with `in` / `[]`) avoids a TypeError that would abort the whole file.
    if not isinstance(item, dict):
        return True
    # Text-only samples have no 'image' key; single-image samples store a str.
    # Any other 'image' value (typically a list of paths) is dropped.
    return 'image' not in item or isinstance(item['image'], str)


def _filtered_file_name(base_name: str, sample_count: int) -> str:
    """Build the output file name carrying the filter marker and new count."""
    # Counts are expressed in thousands with one decimal, e.g. 55300 -> '55.3k'.
    count_str = f"{sample_count / 1000.0:.1f}k"

    if _COUNT_SUFFIX_PATTERN.search(base_name):
        # 'file_80.0k.json' -> 'file_without_multi_image_55.3k.json'
        return _COUNT_SUFFIX_PATTERN.sub(
            f'_without_multi_image_{count_str}.json', base_name
        )

    # No count suffix: insert the marker before the extension.
    # 'file.json' -> 'file_without_multi_image_55.3k.json'
    stem, ext = os.path.splitext(base_name)
    return f"{stem}_without_multi_image_{count_str}{ext}"


def filter_multi_image_samples(file_path_list: list):
    """
    Write a copy of each JSON dataset with its multi-image samples removed.

    Every path is expected to point at a JSON file whose top-level value is a
    list of samples. A sample is kept when it has no 'image' key (text-only)
    or when its 'image' value is a string (single image); a sample with any
    other 'image' value (e.g. a list) is dropped. List entries that are not
    JSON objects are passed through unchanged and reported in a warning.

    The filtered list is saved in the same directory as the input. A trailing
    count suffix such as '_80.0k.json' is replaced by
    '_without_multi_image_<count>k.json'; if the name has no such suffix, the
    marker is inserted before the file extension.

    Problems are handled per file and reported with print(): missing files,
    files whose content is not a list, invalid JSON, and any other error
    raised while processing cause that file to be skipped, after which the
    remaining files are still processed.

    Args:
        file_path_list: Paths of the JSON files to process.

    Returns:
        None. Results are written to disk and progress is printed.
    """

    print("--- Starting multi-image filter process ---")

    # 1. Iterate through the input file list.
    for file_path in file_path_list:
        if not os.path.exists(file_path):
            print(f"⚠️  Warning: File not found, skipping: {file_path}")
            continue

        print(f"\n🔄  Processing: {file_path}")

        try:
            # 2. Open and load the JSON file.
            with open(file_path, 'r', encoding='utf-8') as f:
                data_list = json.load(f)

            if not isinstance(data_list, list):
                print(f"  ⚠️  Warning: Content of {file_path} is not a list. Skipping.")
                continue

            # 3. Keep text-only and single-image samples; drop the rest.
            filtered_data_list = [item for item in data_list if _is_kept_sample(item)]

            # Surface malformed entries rather than passing them on silently.
            non_object_count = sum(
                1 for item in filtered_data_list if not isinstance(item, dict)
            )
            if non_object_count:
                print(
                    f"  ⚠️  Warning: {non_object_count} entries are not JSON objects; "
                    "kept unchanged."
                )

            original_count = len(data_list)
            new_count = len(filtered_data_list)
            print(f"  ℹ️  Filtered: Kept {new_count} samples (out of {original_count}).")

            # 4. Create the new filename next to the input file.
            new_file_path = os.path.join(
                os.path.dirname(file_path),
                _filtered_file_name(os.path.basename(file_path), new_count),
            )

            # 5. Save the filtered list to the new file.
            with open(new_file_path, 'w', encoding='utf-8') as f:
                json.dump(filtered_data_list, f, indent=2, ensure_ascii=False)

            print(f"  ✅  Successfully saved filtered data to: {new_file_path}")

        except json.JSONDecodeError:
            print(f"  ❌  Error: Could not decode JSON from {file_path}.")
        except Exception as e:
            # Per-file boundary: report the failure and continue with the
            # remaining files instead of aborting the whole batch.
            print(f"  ❌  An unknown error occurred ({file_path}): {e}")

# --- Example usage of the script ---
if __name__ == "__main__":

    # Directory that holds the fine-tuning JSON datasets. It ends with a
    # slash so file names can be appended directly.
    base_path = "/data/shared/Qwen/Fine-tuning-data/"

    # Reference only (not processed by this run) -- other dataset files
    # found under base_path:
    #
    #   20k individual datasets
    #     single_PRISM_20k.json
    #     single_RefSpatial_20.0k.json
    #     single_RoboSpatial_20.0k.json
    #     single_SAT_20.0k.json
    #     single_Spatial457_20k.json
    #     single_SPAR-7M_20.0k.json
    #
    #   80k individual datasets
    #     single_PRISM_80k.json
    #     single_RefSpatial_80.0k.json
    #     single_RoboSpatial_80.0k.json
    #     single_SAT_80.0k.json
    #     single_Spatial457_23.8k.json
    #     single_SPAR-7M_80.0k.json
    #
    #   20k Top3 datasets
    #     top3_action_reasoning_PRISM_SAT_RefSpatial_20k.json
    #     top3_multi-view_reasoning_RefSpatial_20.0k.json
    #     top3_other_RefSpatial_SAT_20k.json
    #     top3_pointing_RefSpatial_20.0k.json
    #     top3_spatial_reasoning_RefSpatial_20.0k.json
    #     top3_state_estimation_RefSpatial_20.0k.json
    #     top3_task_reasoning_RefSpatial_20.0k.json
    #     top3_trajectory_reasoning_SAT_RefSpatial_20k.json
    #
    #   80k Top3 datasets
    #     top3_action_reasoning_PRISM_SAT_RefSpatial_80k.json
    #     top3_multi-view_reasoning_RefSpatial_80.0k.json
    #     top3_other_RefSpatial_SAT_80k.json
    #     top3_pointing_spatial_reasoning_state_estimation_task_reasoning_RefSpatial_80.0k.json
    #     top3_trajectory_reasoning_SAT_RefSpatial_80k.json

    # File names handled by this run: the data-scale experiment mixes.
    target_file_names = (
        "data_scale_exp_SAT_RefSpatial_SPAR-7M_RoboSpatial_PRISM_80.0k.json",
        "data_scale_exp_SAT_RefSpatial_SPAR-7M_RoboSpatial_PRISM_400.0k.json",
        "data_scale_exp_SAT_RefSpatial_SPAR-7M_RoboSpatial_PRISM_800.0k.json",
    )

    # Prefix every name with the dataset directory to get the full paths.
    json_files_to_process = [base_path + file_name for file_name in target_file_names]

    # Filter each file and write the result next to its input.
    filter_multi_image_samples(json_files_to_process)