import json import random from pathlib import Path from typing import Iterable def create_subset( input_path: str | Path, output_path: str | Path, size: int = 20_000, ) -> None: """ Create a random subset of a JSONL annotations file. """ input_path = Path(input_path) output_path = Path(output_path) with input_path.open("r") as f: data = [json.loads(line) for line in f] if size > len(data): raise ValueError(f"Requested subset size {size} exceeds dataset size {len(data)}") subset = random.sample(data, size) output_path.parent.mkdir(parents=True, exist_ok=True) with output_path.open("w") as f: for item in subset: f.write(json.dumps(item) + "\n") def _main_from_cli(args: Iterable[str] | None = None) -> None: """ Simple CLI wrapper when this module is executed as a script. """ import argparse parser = argparse.ArgumentParser(description="Create a random JSONL subset.") parser.add_argument( "--input", default="annotations/captions_train.jsonl", help="Input JSONL annotations path.", ) parser.add_argument( "--output", default="annotations/subset_20k.jsonl", help="Output JSONL path.", ) parser.add_argument( "--size", type=int, default=20_000, help="Number of samples to keep.", ) parsed = parser.parse_args(list(args) if args is not None else None) create_subset(parsed.input, parsed.output, parsed.size) print(f"Subset of {parsed.size} entries written to {parsed.output}") if __name__ == "__main__": _main_from_cli()