# File size: 1,091 Bytes
# b148e11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import json
import os
import random
import jsonlines

def select_and_split_captions(input_path, output_dir, num_splits=6):
    """Split the test-set captions of a JSON Lines file into several JSON files.

    Records are read from ``input_path`` with ``jsonlines`` and kept only when
    their ``'test_set'`` entry is exactly ``True``. The kept records are cut
    into ``num_splits`` contiguous groups, preserving input order, and each
    group is written to ``<output_dir>/selected_captions_<i>.json`` as an
    indented JSON array. A one-line summary is printed per file.

    Every group has ``len(kept) // num_splits`` records except the last one,
    which also receives the remainder. When fewer records than ``num_splits``
    are kept, the leading files are therefore empty arrays.

    Args:
        input_path: Path of the JSON Lines file to read. Each record is
            assumed to be dict-like (it must support ``.get``).
        output_dir: Directory receiving the split files; created if missing.
        num_splits: Number of output files to produce; must be at least 1.

    Raises:
        ValueError: If ``num_splits`` is smaller than 1.
    """
    # Fail fast, before any I/O: 0 would otherwise surface as a
    # ZeroDivisionError and a negative value would silently write nothing.
    if num_splits < 1:
        raise ValueError(f'num_splits must be at least 1, got {num_splits!r}')

    with jsonlines.open(input_path) as reader:
        # `is True` deliberately rejects truthy non-boolean values.
        captions = [line for line in reader if line.get('test_set') is True]

    # An empty output_dir means "current directory"; makedirs('') would raise.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    total = len(captions)
    split_size = total // num_splits
    last_split = num_splits - 1
    for i in range(num_splits):
        start_idx = i * split_size
        # The final split runs to the end so the remainder is not dropped.
        end_idx = total if i == last_split else start_idx + split_size
        split_captions = captions[start_idx:end_idx]

        output_path = os.path.join(output_dir, f'selected_captions_{i}.json')
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(split_captions, f, indent=4)
        print(f'Saved {len(split_captions)} captions to {output_path}')

if __name__ == "__main__":
    # Script entry point: split the captions file into the default number of
    # parts, writing the results next to the input.
    select_and_split_captions(
        input_path='/root/captions/train.json',
        output_dir='/root/captions/',
    )