Spaces:
Sleeping
Sleeping
| import json | |
| import os | |
| import random | |
| import jsonlines | |
def select_and_split_captions(input_path, output_dir, num_splits=6):
    """Split the test-set captions of a JSON Lines file into several JSON files.

    Records are read from ``input_path`` with ``jsonlines`` and only those
    whose ``'test_set'`` value is exactly ``True`` are kept. The kept records
    are cut, in file order, into ``num_splits`` consecutive chunks, and chunk
    ``i`` is dumped to ``<output_dir>/selected_captions_<i>.json``.

    Every chunk holds ``len(kept) // num_splits`` records except the last
    one, which also takes the remainder. Consequently, when fewer records
    than ``num_splits`` are kept, every file but the last contains ``[]``.

    Args:
        input_path: Path of the JSON Lines file to read. Each record must
            provide a ``.get`` method (i.e. be a JSON object).
        output_dir: Directory that receives the output files; it is created
            if it does not exist yet.
        num_splits: Number of output files to write; must be at least 1.

    Raises:
        ValueError: If ``num_splits`` is smaller than 1.
    """
    # Fail fast, before any I/O: zero would divide by zero further down and a
    # negative count would silently produce no output at all.
    if num_splits < 1:
        raise ValueError(f'num_splits must be at least 1, got {num_splits!r}')

    with jsonlines.open(input_path) as reader:
        # `is True` is deliberate: truthy non-boolean values (1, "yes", ...)
        # do not qualify a record for the test set.
        selected_captions = [
            record for record in reader if record.get('test_set') is True
        ]

    # Writing into a directory that does not exist would raise on open().
    os.makedirs(output_dir, exist_ok=True)

    split_size = len(selected_captions) // num_splits
    for i in range(num_splits):
        start_idx = i * split_size
        # The final chunk runs to the end so no remainder records are dropped.
        if i == num_splits - 1:
            end_idx = len(selected_captions)
        else:
            end_idx = start_idx + split_size
        split_captions = selected_captions[start_idx:end_idx]

        output_path = os.path.join(output_dir, f'selected_captions_{i}.json')
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(split_captions, f, indent=4)
        print(f'Saved {len(split_captions)} captions to {output_path}')
if __name__ == "__main__":
    # Script entry point: split the captions stored under /root/captions/
    # using the function's default number of splits, writing the results
    # back into that same directory.
    select_and_split_captions(
        input_path='/root/captions/train.json',
        output_dir='/root/captions/',
    )