File size: 364 Bytes
a745a5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import json
import random

input_path = "annotations/captions_train.jsonl"
output_path = "annotations/subset_20k.jsonl"

with open(input_path, "r") as f:
    data = [json.loads(line) for line in f]

subset = random.sample(data, 20000)

with open(output_path, "w") as f:
    for item in subset:
        f.write(json.dumps(item) + "\n")

print("20k subset created.")