File size: 1,669 Bytes
a745a5e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import json
import random
from pathlib import Path
from typing import Iterable


def create_subset(
    input_path: str | Path,
    output_path: str | Path,
    size: int = 20_000,
) -> None:
    """
    Create a random subset of a JSONL annotations file.
    """
    input_path = Path(input_path)
    output_path = Path(output_path)

    with input_path.open("r") as f:
        data = [json.loads(line) for line in f]

    if size > len(data):
        raise ValueError(f"Requested subset size {size} exceeds dataset size {len(data)}")

    subset = random.sample(data, size)

    output_path.parent.mkdir(parents=True, exist_ok=True)

    with output_path.open("w") as f:
        for item in subset:
            f.write(json.dumps(item) + "\n")


def _main_from_cli(args: Iterable[str] | None = None) -> None:
    """
    Command-line front end used when the module is run as a script.

    Parses ``--input``, ``--output`` and ``--size`` from ``args`` (or lets
    argparse read the process arguments when ``args`` is ``None``), calls
    ``create_subset`` with them, and prints a one-line confirmation.
    """
    from argparse import ArgumentParser

    # Each entry is (flag, keyword arguments for add_argument), registered in
    # this order so the generated help lists them the same way.
    option_table = (
        (
            "--input",
            {
                "default": "annotations/captions_train.jsonl",
                "help": "Input JSONL annotations path.",
            },
        ),
        (
            "--output",
            {
                "default": "annotations/subset_20k.jsonl",
                "help": "Output JSONL path.",
            },
        ),
        (
            "--size",
            {
                "type": int,
                "default": 20_000,
                "help": "Number of samples to keep.",
            },
        ),
    )

    cli = ArgumentParser(description="Create a random JSONL subset.")
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    # None makes argparse read the process arguments; otherwise materialise
    # the iterable into a list before handing it over.
    argv = None if args is None else list(args)
    options = cli.parse_args(argv)

    create_subset(options.input, options.output, options.size)
    print(f"Subset of {options.size} entries written to {options.output}")


# Run the CLI wrapper only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    _main_from_cli()