# Spaces: Running | File size: 1,669 Bytes | a745a5e
import json
import random
from pathlib import Path
from typing import Iterable
def create_subset(
input_path: str | Path,
output_path: str | Path,
size: int = 20_000,
) -> None:
"""
Create a random subset of a JSONL annotations file.
"""
input_path = Path(input_path)
output_path = Path(output_path)
with input_path.open("r") as f:
data = [json.loads(line) for line in f]
if size > len(data):
raise ValueError(f"Requested subset size {size} exceeds dataset size {len(data)}")
subset = random.sample(data, size)
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w") as f:
for item in subset:
f.write(json.dumps(item) + "\n")
def _main_from_cli(args: Iterable[str] | None = None) -> None:
    """
    Parse command-line options and run ``create_subset`` with them.

    ``args`` is the argument list to parse. When it is ``None`` the parser
    receives ``None`` and reads the process arguments itself. After the
    subset has been written a one-line confirmation is printed.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Create a random JSONL subset.")
    cli.add_argument(
        "--input",
        help="Input JSONL annotations path.",
        default="annotations/captions_train.jsonl",
    )
    cli.add_argument(
        "--output",
        help="Output JSONL path.",
        default="annotations/subset_20k.jsonl",
    )
    cli.add_argument(
        "--size",
        help="Number of samples to keep.",
        default=20_000,
        type=int,
    )

    # Materialise any iterable into a list for argparse; keep None as-is so
    # argparse uses its own default argument source.
    argv = None if args is None else list(args)
    options = cli.parse_args(argv)

    create_subset(options.input, options.output, options.size)
    print(f"Subset of {options.size} entries written to {options.output}")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    _main_from_cli(args=None)
|