Spaces:
Sleeping
Sleeping
File size: 1,091 Bytes
b148e11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
import json
import os
import random
import jsonlines
def select_and_split_captions(input_path, output_dir, num_splits=6):
    """Split the test-set captions of a JSON Lines file into several JSON files.

    Records are read from ``input_path`` with ``jsonlines`` and kept only when
    their ``'test_set'`` field is exactly ``True``. The kept records are cut,
    in file order, into ``num_splits`` consecutive groups, and group ``i`` is
    written as an indented JSON array to ``selected_captions_<i>.json`` inside
    ``output_dir``. One summary line per written file is printed.

    Every group holds ``len(kept) // num_splits`` records except the last one,
    which additionally takes the remainder. When fewer records than
    ``num_splits`` are kept, all groups but the last are therefore empty.

    Args:
        input_path: Path of the JSON Lines file to read. Each line must decode
            to an object supporting ``.get`` (e.g. a JSON object).
        output_dir: Directory that receives the split files. It is created if
            it does not exist yet.
        num_splits: Number of output files to produce; must be at least 1.

    Raises:
        ValueError: If ``num_splits`` is smaller than 1.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (0) or a
    # silent no-op (negative values make ``range`` empty).
    if num_splits < 1:
        raise ValueError(f'num_splits must be at least 1, got {num_splits}')

    with jsonlines.open(input_path) as reader:
        # ``is True`` deliberately rejects truthy non-boolean values.
        captions = [record for record in reader if record.get('test_set') is True]

    # Create the target directory only after the input was read successfully.
    # An empty ``output_dir`` means "current directory" for os.path.join, and
    # os.makedirs('') would raise, so skip it in that case.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    total = len(captions)
    split_size = total // num_splits
    last_index = num_splits - 1

    for i in range(num_splits):
        start_idx = i * split_size
        # The final group runs to the end so the remainder is not dropped.
        end_idx = total if i == last_index else start_idx + split_size
        split_captions = captions[start_idx:end_idx]

        output_path = os.path.join(output_dir, f'selected_captions_{i}.json')
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(split_captions, f, indent=4)
        print(f'Saved {len(split_captions)} captions to {output_path}')
if __name__ == "__main__":
    # Script entry point: split the captions using the default number of
    # splits, writing the output files next to the input file.
    # NOTE(review): the input has a .json extension but is read with
    # jsonlines, i.e. it is expected to hold one JSON document per line.
    select_and_split_captions(
        input_path='/root/captions/train.json',
        output_dir='/root/captions/',
    )
|