File size: 5,264 Bytes
849ca03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import json
import os

def clean_image_paths_in_json_files(file_path_list: list):
    """
    Strip the prefix "/data/shared/Qwen/" from the 'image' field of every
    record in each JSON file listed, rewriting each file in place.

    Each file is expected to contain a JSON array of records. A record's
    'image' field may be a single string or a list of strings; entries
    without the prefix are left untouched. Files that are missing, not
    valid JSON, or not a top-level list are reported and skipped.

    Args:
        file_path_list: Paths of the JSON files to process.
    """

    prefix_to_remove = "/data/shared/Qwen/"
    prefix_len = len(prefix_to_remove)

    # 1. Iterate through the input file list.
    for file_path in file_path_list:
        if not os.path.exists(file_path):
            print(f"⚠️  Warning: File not found, skipping: {file_path}")
            continue

        print(f"🔄  Processing: {file_path}")

        try:
            # 2. Open and parse the JSON file.
            with open(file_path, 'r', encoding='utf-8') as f:
                data_list = json.load(f)

            if not isinstance(data_list, list):
                print(f"  ⚠️  Warning: Content of {file_path} is not a list. Skipping.")
                continue

            modified_count = 0

            # 3. Iterate through the records in the JSON array.
            for item in data_list:
                # Skip non-dict entries: without this guard, a stray
                # non-mapping element (e.g. an int) raises TypeError on the
                # membership test, which the broad except below would turn
                # into skipping the ENTIRE file (save never runs).
                if not isinstance(item, dict) or 'image' not in item:
                    continue

                image_field = item['image']

                # 4. Case: 'image' is a single string.
                if isinstance(image_field, str):
                    if image_field.startswith(prefix_to_remove):
                        item['image'] = image_field[prefix_len:]
                        modified_count += 1

                # 5. Case: 'image' is a list of paths.
                elif isinstance(image_field, list):
                    new_image_list = []
                    for path in image_field:
                        # Only rewrite string elements that carry the prefix.
                        if isinstance(path, str) and path.startswith(prefix_to_remove):
                            new_image_list.append(path[prefix_len:])
                            modified_count += 1
                        else:
                            new_image_list.append(path)  # Keep the original value.
                    item['image'] = new_image_list

            # 6. Save back to the same file (even when unchanged, matching the
            #    reported "saved (original content)" behavior below).
            with open(file_path, 'w', encoding='utf-8') as f:
                # indent=2 for readability; ensure_ascii=False preserves unicode.
                json.dump(data_list, f, indent=2, ensure_ascii=False)

            if modified_count > 0:
                print(f"  ✅  Done: Modified {modified_count} paths and saved to {file_path}.")
            else:
                print(f"  ℹ️  Done: No paths to modify. {file_path} saved (original content).")

        except json.JSONDecodeError:
            print(f"  ❌  Error: Could not decode JSON from {file_path}.")
        except Exception as e:
            print(f"  ❌  An unknown error occurred ({file_path}): {e}")

# --- Example usage of the script ---
if __name__ == "__main__":

    # Root directory that holds all fine-tuning dataset JSON files.
    base_path = "/data/shared/Qwen/Fine-tuning-data/"

    # 1. Dataset filenames to process, grouped by split size / mixture type.
    dataset_filenames = [
        # 20k individual datasets
        "single_PRISM_20k.json",
        "single_RefSpatial_20.0k.json",
        "single_RoboSpatial_20.0k.json",
        "single_SAT_20.0k.json",
        "single_Spatial457_20k.json",
        "single_SPAR-7M_20.0k.json",

        # 80k individual datasets
        "single_PRISM_80k.json",
        "single_RefSpatial_80.0k.json",
        "single_RoboSpatial_80.0k.json",
        "single_SAT_80.0k.json",
        "single_Spatial457_23.8k.json",
        "single_SPAR-7M_80.0k.json",

        # 20k Top3 datasets
        "top3_action_reasoning_PRISM_SAT_RefSpatial_20k.json",
        "top3_other_RefSpatial_SAT_20k.json",
        "top3_trajectory_reasoning_SAT_RefSpatial_20k.json",
        "top3_multi-view_reasoning_pointing_spatial_reasoning_state_estimation_task_reasoning_RefSpatial_20.0k.json",

        # 80k Top3 datasets
        "top3_action_reasoning_PRISM_SAT_RefSpatial_80k.json",
        "top3_other_RefSpatial_SAT_80k.json",
        "top3_multi-view_reasoning_pointing_spatial_reasoning_state_estimation_task_reasoning_RefSpatial_80.0k.json",
        "top3_trajectory_reasoning_SAT_RefSpatial_80k.json",
    ]

    # Build the full paths once, then run the cleanup.
    json_files_to_process = [base_path + name for name in dataset_filenames]

    # 2. Run the function.
    clean_image_paths_in_json_files(json_files_to_process)