import json
import os

# Prefix stripped from 'image' paths when the caller does not supply one.
DEFAULT_IMAGE_PREFIX = "/data/shared/Qwen/"


def _strip_prefix(value, prefix: str):
    """Return ``(cleaned, changed)`` for a single 'image' entry.

    Non-string values, and strings that do not start with *prefix*, are
    returned untouched with ``changed`` set to False. An empty *prefix*
    never counts as a match, so it cannot inflate the modification count.
    """
    if prefix and isinstance(value, str) and value.startswith(prefix):
        return value[len(prefix):], True
    return value, False


def _clean_records(records: list, prefix: str) -> int:
    """Strip *prefix* from the 'image' field of each dict in *records*, in place.

    Returns the number of individual paths that were rewritten.
    """
    modified = 0
    for record in records:
        # Only JSON objects can carry an 'image' key. Testing `in` against a
        # number/None raises TypeError, and against a string it is a substring
        # test, so non-dict entries are skipped instead of aborting the file.
        if not isinstance(record, dict) or "image" not in record:
            continue

        image_field = record["image"]
        if isinstance(image_field, list):
            cleaned = [_strip_prefix(entry, prefix) for entry in image_field]
            record["image"] = [entry for entry, _ in cleaned]
            modified += sum(changed for _, changed in cleaned)
        else:
            # Strings are cleaned; any other type comes back unchanged.
            record["image"], changed = _strip_prefix(image_field, prefix)
            modified += changed
    return modified


def clean_image_paths_in_json_files(
    file_path_list: list,
    prefix_to_remove: str = DEFAULT_IMAGE_PREFIX,
):
    """Remove a path prefix from the 'image' field of JSON dataset files.

    Each file is expected to contain a JSON list of objects. For every object
    that has an 'image' key, *prefix_to_remove* is stripped from the value when
    it is a string, or from each string element when it is a list. The file is
    then rewritten in place (indent=2, unicode preserved), even when no path
    needed changing.

    Args:
        file_path_list: Paths of the JSON files to process.
        prefix_to_remove: Leading path segment to strip. Defaults to
            "/data/shared/Qwen/".

    Returns:
        None. Progress and problems are reported with ``print``.

    Missing files, files whose top-level value is not a list, invalid JSON and
    any other per-file error are reported and skipped; they never stop the
    processing of the remaining files. List entries that are not JSON objects
    are left untouched.
    """
    for file_path in file_path_list:
        if not os.path.exists(file_path):
            print(f"⚠️ Warning: File not found, skipping: {file_path}")
            continue

        print(f"🔄 Processing: {file_path}")

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data_list = json.load(f)

            if not isinstance(data_list, list):
                print(f" ⚠️ Warning: Content of {file_path} is not a list. Skipping.")
                continue

            modified_count = _clean_records(data_list, prefix_to_remove)

            # Serialize and verify the text is encodable BEFORE opening the
            # file for writing: mode 'w' truncates immediately, so a failure
            # after that point (e.g. a lone surrogate that cannot be encoded
            # as UTF-8) would otherwise destroy the dataset.
            serialized = json.dumps(data_list, indent=2, ensure_ascii=False)
            serialized.encode('utf-8')

            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(serialized)

            if modified_count > 0:
                print(f" ✅ Done: Modified {modified_count} paths and saved to {file_path}.")
            else:
                print(f" ℹ️ Done: No paths to modify. {file_path} saved (original content).")

        except json.JSONDecodeError:
            print(f" ❌ Error: Could not decode JSON from {file_path}.")
        except Exception as e:
            # Per-file boundary: report and carry on with the next file.
            print(f" ❌ An unknown error occurred ({file_path}): {e}")


# --- Example usage of the script ---
if __name__ == "__main__":
    # Directory that holds the fine-tuning JSON files.
    base_path = "/data/shared/Qwen/Fine-tuning-data/"

    # 1. Names of the JSON files to process (add further files here).
    json_file_names = [
        # 20k individual datasets
        "single_PRISM_20k.json",
        "single_RefSpatial_20.0k.json",
        "single_RoboSpatial_20.0k.json",
        "single_SAT_20.0k.json",
        "single_Spatial457_20k.json",
        "single_SPAR-7M_20.0k.json",
        # 80k individual datasets
        "single_PRISM_80k.json",
        "single_RefSpatial_80.0k.json",
        "single_RoboSpatial_80.0k.json",
        "single_SAT_80.0k.json",
        "single_Spatial457_23.8k.json",
        "single_SPAR-7M_80.0k.json",
        # 20k Top3 datasets
        "top3_action_reasoning_PRISM_SAT_RefSpatial_20k.json",
        # "top3_multi-view_reasoning_RefSpatial_20.0k.json",
        "top3_other_RefSpatial_SAT_20k.json",
        # "top3_pointing_RefSpatial_20.0k.json",
        # "top3_spatial_reasoning_RefSpatial_20.0k.json",
        # "top3_state_estimation_RefSpatial_20.0k.json",
        # "top3_task_reasoning_RefSpatial_20.0k.json",
        "top3_trajectory_reasoning_SAT_RefSpatial_20k.json",
        "top3_multi-view_reasoning_pointing_spatial_reasoning_state_estimation_task_reasoning_RefSpatial_20.0k.json",
        # 80k Top3 datasets
        "top3_action_reasoning_PRISM_SAT_RefSpatial_80k.json",
        "top3_other_RefSpatial_SAT_80k.json",
        # (Combined the duplicated filename from the 80k Top3 list)
        "top3_multi-view_reasoning_pointing_spatial_reasoning_state_estimation_task_reasoning_RefSpatial_80.0k.json",
        "top3_trajectory_reasoning_SAT_RefSpatial_80k.json",
    ]
    json_files_to_process = [base_path + name for name in json_file_names]

    # 2. Run the function.
    clean_image_paths_in_json_files(json_files_to_process)