Spaces:
Running
Running
| import json | |
| import random | |
| input_path = "annotations/captions_train.jsonl" | |
| output_path = "annotations/subset_20k.jsonl" | |
| with open(input_path, "r") as f: | |
| data = [json.loads(line) for line in f] | |
| subset = random.sample(data, 20000) | |
| with open(output_path, "w") as f: | |
| for item in subset: | |
| f.write(json.dumps(item) + "\n") | |
| print("20k subset created.") |