File size: 6,048 Bytes
849ca03
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import json
import os
import re
from pathlib import Path

# Matches a trailing sample-count suffix such as "_80.0k.json" or "_20K.json".
# Compiled once at import time instead of once per processed file.
_COUNT_SUFFIX_RE = re.compile(r'(_[0-9.]+[kK]\.json)$')


def _is_single_or_no_image(item: dict) -> bool:
    """Return True for text-only samples (no 'image' key) or single-image samples ('image' is a str)."""
    return 'image' not in item or isinstance(item['image'], str)


def _build_output_name(base_name: str, kept_count: int) -> str:
    """Return the output file name for ``base_name`` given the number of kept samples.

    A trailing count suffix (e.g. ``_80.0k.json``) is replaced by
    ``_without_multi_image_<count>k.json``; when no such suffix exists the
    marker is inserted in front of the file extension instead.
    """
    # The count is expressed in thousands with one decimal (e.g. 55300 -> "55.3k").
    count_label = f"{kept_count / 1000.0:.1f}k"

    if _COUNT_SUFFIX_RE.search(base_name):
        # e.g., 'file_80.0k.json' -> 'file_without_multi_image_55.3k.json'
        return _COUNT_SUFFIX_RE.sub(f'_without_multi_image_{count_label}.json', base_name)

    # e.g., 'file.json' -> 'file_without_multi_image_55.3k.json'
    stem, ext = os.path.splitext(base_name)
    return f"{stem}_without_multi_image_{count_label}{ext}"


def filter_multi_image_samples(file_path_list: list):
    """
    Filter multi-image samples out of each JSON file in ``file_path_list``.

    Every input file must contain a JSON list of sample dicts. A sample is
    kept when it has no 'image' key (text-only) or when 'image' is a string
    (single image); every other sample (e.g. 'image' is a list) is dropped.
    The kept samples are written next to the input file under a new name
    that carries the marker ``_without_multi_image_`` and the new count.

    Args:
        file_path_list: Paths of the JSON files to process. A single
            ``str`` / ``os.PathLike`` is also accepted and treated as a
            one-element list.

    Returns:
        None. Progress, warnings and errors are reported via ``print``;
        problems with one file never stop the processing of the others.
    """
    print("--- Starting multi-image filter process ---")

    # A lone path would otherwise be iterated character by character.
    if isinstance(file_path_list, (str, os.PathLike)):
        file_path_list = [file_path_list]

    # 1. Iterate through the input file list.
    for file_path in file_path_list:
        if not os.path.exists(file_path):
            print(f"⚠️  Warning: File not found, skipping: {file_path}")
            continue

        print(f"\n🔄  Processing: {file_path}")

        try:
            # 2. Open and load the JSON file.
            with open(file_path, 'r', encoding='utf-8') as f:
                data_list = json.load(f)

            if not isinstance(data_list, list):
                print(f"  ⚠️  Warning: Content of {file_path} is not a list. Skipping.")
                continue

            # 3. Filter the data.
            # Entries that are not dicts cannot be samples; drop them explicitly
            # rather than letting one bad entry abort the whole file.
            samples = [item for item in data_list if isinstance(item, dict)]
            malformed_count = len(data_list) - len(samples)
            if malformed_count:
                print(f"  ⚠️  Warning: Dropped {malformed_count} non-dict entries from {file_path}.")

            filtered_data_list = [item for item in samples if _is_single_or_no_image(item)]

            original_count = len(data_list)
            new_count = len(filtered_data_list)
            print(f"  ℹ️  Filtered: Kept {new_count} samples (out of {original_count}).")

            # 4. Create the new filename in the same directory as the input.
            new_file_path = os.path.join(
                os.path.dirname(file_path),
                _build_output_name(os.path.basename(file_path), new_count),
            )

            # 5. Save the filtered list to the new file.
            with open(new_file_path, 'w', encoding='utf-8') as f:
                json.dump(filtered_data_list, f, indent=2, ensure_ascii=False)

            print(f"  ✅  Successfully saved filtered data to: {new_file_path}")

        except json.JSONDecodeError:
            print(f"  ❌  Error: Could not decode JSON from {file_path}.")
        except Exception as e:
            # Top-level boundary of the batch: report and move on to the next file.
            print(f"  ❌  An unknown error occurred ({file_path}): {e}")

# --- Script entry point: filter the configured dataset files ---
if __name__ == "__main__":

    # Directory holding the fine-tuning JSON files. The trailing slash matters:
    # file names are appended to it by plain string concatenation.
    base_path = "/data/shared/Qwen/Fine-tuning-data/"

    # Previously processed file sets, kept for reference. To re-run them,
    # restore this list in place of the active one below.
    # json_files_to_process = [
    #     # 20k individual datasets
    #     base_path + "single_PRISM_20k.json",
    #     base_path + "single_RefSpatial_20.0k.json",
    #     base_path + "single_RoboSpatial_20.0k.json",
    #     base_path + "single_SAT_20.0k.json",
    #     base_path + "single_Spatial457_20k.json",
    #     base_path + "single_SPAR-7M_20.0k.json",

    #     # 80k individual datasets
    #     base_path + "single_PRISM_80k.json",
    #     base_path + "single_RefSpatial_80.0k.json",
    #     base_path + "single_RoboSpatial_80.0k.json",
    #     base_path + "single_SAT_80.0k.json",
    #     base_path + "single_Spatial457_23.8k.json",
    #     base_path + "single_SPAR-7M_80.0k.json",

    #     # 20k Top3 datasets
    #     base_path + "top3_action_reasoning_PRISM_SAT_RefSpatial_20k.json",
    #     base_path + "top3_multi-view_reasoning_RefSpatial_20.0k.json",
    #     base_path + "top3_other_RefSpatial_SAT_20k.json",
    #     base_path + "top3_pointing_RefSpatial_20.0k.json",
    #     base_path + "top3_spatial_reasoning_RefSpatial_20.0k.json",
    #     base_path + "top3_state_estimation_RefSpatial_20.0k.json",
    #     base_path + "top3_task_reasoning_RefSpatial_20.0k.json",
    #     base_path + "top3_trajectory_reasoning_SAT_RefSpatial_20k.json",

    #     # 80k Top3 datasets
    #     base_path + "top3_action_reasoning_PRISM_SAT_RefSpatial_80k.json",
    #     base_path + "top3_multi-view_reasoning_RefSpatial_80.0k.json",
    #     base_path + "top3_other_RefSpatial_SAT_80k.json",
    #     base_path + "top3_pointing_spatial_reasoning_state_estimation_task_reasoning_RefSpatial_80.0k.json",
    #     base_path + "top3_trajectory_reasoning_SAT_RefSpatial_80k.json",

    #     # Data-scale experiment file (80k)
    #     base_path + "data_scale_exp_SAT_RefSpatial_SPAR-7M_RoboSpatial_PRISM_80.0k.json"
    # ]

    # Active set: the data-scale experiment files at 80k / 400k / 800k samples.
    scale_exp_file_names = (
        "data_scale_exp_SAT_RefSpatial_SPAR-7M_RoboSpatial_PRISM_80.0k.json",
        "data_scale_exp_SAT_RefSpatial_SPAR-7M_RoboSpatial_PRISM_400.0k.json",
        "data_scale_exp_SAT_RefSpatial_SPAR-7M_RoboSpatial_PRISM_800.0k.json",
    )
    json_files_to_process = [base_path + file_name for file_name in scale_exp_file_names]

    # Write one filtered copy next to each input file.
    filter_multi_image_samples(json_files_to_process)