# Source: image_captioning/src/utils/data_subset.py
# Uploaded by pchandragrid; commit a745a5e ("Deploy Streamlit app")
import json
import random
from pathlib import Path
from typing import Iterable
def create_subset(
input_path: str | Path,
output_path: str | Path,
size: int = 20_000,
) -> None:
"""
Create a random subset of a JSONL annotations file.
"""
input_path = Path(input_path)
output_path = Path(output_path)
with input_path.open("r") as f:
data = [json.loads(line) for line in f]
if size > len(data):
raise ValueError(f"Requested subset size {size} exceeds dataset size {len(data)}")
subset = random.sample(data, size)
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("w") as f:
for item in subset:
f.write(json.dumps(item) + "\n")
def _main_from_cli(args: Iterable[str] | None = None) -> None:
    """
    Parse command-line options and write the requested JSONL subset.

    ``args`` is the argument list to parse. When it is ``None`` it is passed
    through to ``argparse`` unchanged, which then reads the process's own
    command line.
    """
    import argparse

    # Flag name paired with its add_argument keyword options, in the order
    # the options are registered.
    option_specs = (
        (
            "--input",
            {
                "default": "annotations/captions_train.jsonl",
                "help": "Input JSONL annotations path.",
            },
        ),
        (
            "--output",
            {
                "default": "annotations/subset_20k.jsonl",
                "help": "Output JSONL path.",
            },
        ),
        (
            "--size",
            {
                "type": int,
                "default": 20_000,
                "help": "Number of samples to keep.",
            },
        ),
    )

    cli = argparse.ArgumentParser(description="Create a random JSONL subset.")
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    # Materialize any iterable into a list; keep None as None.
    argv = None if args is None else list(args)
    options = cli.parse_args(argv)

    create_subset(options.input, options.output, options.size)
    print(f"Subset of {options.size} entries written to {options.output}")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    _main_from_cli()