image_captioning / create_subset_20k.py
pchandragrid's picture
Deploy Streamlit app
a745a5e
raw
history blame contribute delete
364 Bytes
import json
import random
input_path = "annotations/captions_train.jsonl"
output_path = "annotations/subset_20k.jsonl"
with open(input_path, "r") as f:
data = [json.loads(line) for line in f]
subset = random.sample(data, 20000)
with open(output_path, "w") as f:
for item in subset:
f.write(json.dumps(item) + "\n")
print("20k subset created.")