Spaces:
Running
Running
File size: 364 Bytes
a745a5e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 | import json
import random
# Source annotations (one JSON object per line) and destination for the subset.
input_path = "annotations/captions_train.jsonl"
output_path = "annotations/subset_20k.jsonl"

# Number of records to draw for the subset.
SUBSET_SIZE = 20000


def load_jsonl(path):
    """Return the list of JSON records stored one per line in *path*.

    Whitespace-only lines are skipped, since ``json.loads`` would reject
    them (a trailing blank line is common in hand-edited JSONL files).

    Raises:
        OSError: if *path* cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so non-ASCII text decodes the same on every platform
    # instead of depending on the locale's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def sample_subset(records, size=SUBSET_SIZE):
    """Return up to *size* records drawn at random without replacement.

    When fewer than *size* records are available, all of them are returned
    (in random order) instead of letting ``random.sample`` raise ValueError.
    """
    return random.sample(records, min(size, len(records)))


def write_jsonl(path, records):
    """Write *records* to *path*, one JSON document per line."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(item) + "\n" for item in records)


def main():
    """Create the random subset file from the full annotation file."""
    data = load_jsonl(input_path)
    subset = sample_subset(data)
    write_jsonl(output_path, subset)
    if len(subset) < SUBSET_SIZE:
        # Be explicit when the input was too small to fill the subset.
        print(f"Only {len(subset)} records available; wrote all of them.")
    else:
        print("20k subset created.")


# Guard the entry point so importing this module has no side effects.
if __name__ == "__main__":
    main()