| import json |
| import os |
|
|
def _strip_prefix(value, prefix):
    """Return ``(new_value, changed)`` for *value* with a leading *prefix* removed.

    Non-string values, strings that do not start with *prefix*, and an empty
    *prefix* all yield the value untouched with ``changed`` set to ``False``.
    """
    if prefix and isinstance(value, str) and value.startswith(prefix):
        return value[len(prefix):], True
    return value, False


def _clean_records(records, prefix):
    """Strip *prefix* from the 'image' field of each dict in *records*.

    The dicts are mutated in place. Returns the number of paths rewritten.
    """
    modified_count = 0
    for item in records:
        # Only JSON objects can carry an 'image' key. Skipping anything else
        # keeps one stray entry (string, number, null) from raising and
        # aborting the clean-up of the whole file.
        if not isinstance(item, dict) or 'image' not in item:
            continue

        image_field = item['image']
        if isinstance(image_field, list):
            stripped = [_strip_prefix(path, prefix) for path in image_field]
            item['image'] = [path for path, _ in stripped]
            modified_count += sum(1 for _, changed in stripped if changed)
        else:
            new_value, changed = _strip_prefix(image_field, prefix)
            if changed:
                item['image'] = new_value
                modified_count += 1
    return modified_count


def clean_image_paths_in_json_files(
    file_path_list: list,
    prefix_to_remove: str = "/data/shared/Qwen/",
):
    """Remove a path prefix from the 'image' field of JSON files, in place.

    Each file is expected to contain a JSON list of objects. For every object
    that has an 'image' key, the prefix is stripped from the value when it is
    a string, or from every string element when it is a list. Other values
    and list entries that are not JSON objects are left untouched.

    Args:
        file_path_list: Paths of the JSON files to process.
        prefix_to_remove: Leading text to strip from image paths. Defaults to
            "/data/shared/Qwen/". An empty prefix modifies nothing.

    Returns:
        None. Progress, warnings and errors are reported with ``print``.

    Notes:
        * Missing files and files whose top-level value is not a list are
          skipped and left as they are.
        * Every successfully processed file is rewritten with ``indent=2``
          and ``ensure_ascii=False``, even when no path was modified.
        * Any error while handling one file is reported and processing
          continues with the next file.
    """
    for file_path in file_path_list:
        if not os.path.exists(file_path):
            print(f"⚠️ Warning: File not found, skipping: {file_path}")
            continue

        print(f"🔄 Processing: {file_path}")

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data_list = json.load(f)

            if not isinstance(data_list, list):
                print(f" ⚠️ Warning: Content of {file_path} is not a list. Skipping.")
                continue

            modified_count = _clean_records(data_list, prefix_to_remove)

            with open(file_path, 'w', encoding='utf-8') as f:
                json.dump(data_list, f, indent=2, ensure_ascii=False)

            if modified_count > 0:
                print(f" ✅ Done: Modified {modified_count} paths and saved to {file_path}.")
            else:
                print(f" ℹ️ Done: No paths to modify. {file_path} saved (original content).")

        except json.JSONDecodeError:
            print(f" ❌ Error: Could not decode JSON from {file_path}.")
        except Exception as e:
            # Per-file boundary: report the problem and move on to the next
            # file instead of aborting the whole batch.
            print(f" ❌ An unknown error occurred ({file_path}): {e}")
|
|
| |
if __name__ == "__main__":
    # Directory containing the JSON files; the trailing slash lets each file
    # name be appended directly.
    base_path = "/data/shared/Qwen/Fine-tuning-data/"

    # Full paths are built in the order the names are listed below.
    json_files_to_process = [
        base_path + file_name
        for file_name in (
            # single_* files, 20k group
            "single_PRISM_20k.json",
            "single_RefSpatial_20.0k.json",
            "single_RoboSpatial_20.0k.json",
            "single_SAT_20.0k.json",
            "single_Spatial457_20k.json",
            "single_SPAR-7M_20.0k.json",
            # single_* files, 80k group (the Spatial457 file is 23.8k)
            "single_PRISM_80k.json",
            "single_RefSpatial_80.0k.json",
            "single_RoboSpatial_80.0k.json",
            "single_SAT_80.0k.json",
            "single_Spatial457_23.8k.json",
            "single_SPAR-7M_80.0k.json",
            # top3_* files, 20k group
            "top3_action_reasoning_PRISM_SAT_RefSpatial_20k.json",
            "top3_other_RefSpatial_SAT_20k.json",
            "top3_trajectory_reasoning_SAT_RefSpatial_20k.json",
            "top3_multi-view_reasoning_pointing_spatial_reasoning_state_estimation_task_reasoning_RefSpatial_20.0k.json",
            # top3_* files, 80k group
            "top3_action_reasoning_PRISM_SAT_RefSpatial_80k.json",
            "top3_other_RefSpatial_SAT_80k.json",
            "top3_multi-view_reasoning_pointing_spatial_reasoning_state_estimation_task_reasoning_RefSpatial_80.0k.json",
            "top3_trajectory_reasoning_SAT_RefSpatial_80k.json",
        )
    ]

    clean_image_paths_in_json_files(json_files_to_process)