File size: 3,564 Bytes
1d1e600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
"""Load COCO 2017 caption annotations into a ``pandas.DataFrame``.

Mirrors notebook cell 2 with two small but important upgrades that don't
change behaviour at fixed seeds:

1. **Seeded sampling.** The notebook calls ``captions.sample(120000)`` with no
   ``random_state``, so two runs produce different subsets. We thread the
   seed through so every run is identical when the seed is fixed.
2. **Path validation.** The notebook constructs paths via ``f-string`` with no
   ``os.path.exists`` check; if the dataset is missing, training fails ten
   minutes in. We check the annotations file up front and raise a clear error.
"""

from __future__ import annotations

import json
from collections.abc import Callable
from pathlib import Path

import pandas as pd

from captioning.utils.logging import get_logger

# Module-level logger. The ``log.info`` calls in this module pass an event
# name plus keyword fields rather than a pre-formatted message.
log = get_logger(__name__)

# printf-style template: zero-pads an integer image id to 12 digits and
# appends the ``.jpg`` extension, e.g. ``42 -> "000000000042.jpg"``.
IMAGE_FILENAME_TEMPLATE = "%012d.jpg"  # Notebook cell 2: '%012d.jpg' % image_id


def load_coco_annotations(
    base_path: str | Path,
    annotations_filename: str = "captions_train2017.json",
    images_subdir: str = "train2017",
    sample_size: int = 120_000,
    seed: int | None = None,
    caption_preprocessor: Callable[[str], str] | None = None,
) -> pd.DataFrame:
    """Read COCO annotations and return a (image, caption) DataFrame.

    Mirrors notebook cell 2 + cell 4 (when ``caption_preprocessor`` is
    supplied). Only the annotations file is checked for existence; the image
    files themselves are not touched, their paths are merely constructed.

    Args:
        base_path: Path to the COCO root containing ``annotations/`` and the
            images sub-directory.
        annotations_filename: JSON file under ``base_path / 'annotations'``.
        images_subdir: Folder containing the JPEG files.
        sample_size: Number of caption rows to keep after sampling. Sampling
            is skipped, and every row is returned in file order, when the
            value is not positive (``-1`` by convention; ``0`` behaves the
            same) or is greater than or equal to the number of rows.
        seed: Random seed forwarded to ``DataFrame.sample`` as
            ``random_state``. ``None`` matches the notebook's
            non-deterministic behaviour.
        caption_preprocessor: Optional function applied to every value of the
            ``caption`` column after sampling. Pass
            ``preprocessing.preprocess_caption`` to reproduce cell 4. Left
            optional so callers can stage the preprocessing differently (the
            parity audit script applies it manually).

    Returns:
        ``pd.DataFrame`` with columns ``image`` and ``caption`` and a fresh
        ``0..n-1`` index. ``image`` is the string form of
        ``base_path / images_subdir / <12-digit id>.jpg``; it is absolute only
        when ``base_path`` itself is absolute, because the path is not
        resolved.

    Raises:
        FileNotFoundError: If ``annotations_filename`` is missing.
        KeyError: If the JSON payload has no ``"annotations"`` key, or an
            entry lacks ``"image_id"`` or ``"caption"``.
    """
    base_path = Path(base_path)
    annotations_path = base_path / "annotations" / annotations_filename
    # Fail fast with an actionable message instead of a bare open() error.
    if not annotations_path.is_file():
        raise FileNotFoundError(
            f"COCO annotations not found at {annotations_path}. "
            f"Run `python -m scripts.prepare_data` to download."
        )

    log.info("loading_coco", path=str(annotations_path))
    with annotations_path.open(encoding="utf-8") as f:
        payload = json.load(f)
    annotations = payload["annotations"]

    # One row per caption: an image with several captions appears several times.
    img_dir = base_path / images_subdir
    img_cap_pairs = [
        [str(img_dir / (IMAGE_FILENAME_TEMPLATE % sample["image_id"])), sample["caption"]]
        for sample in annotations
    ]
    df = pd.DataFrame(img_cap_pairs, columns=["image", "caption"])

    # Only sample when a strictly smaller, positive subset was requested;
    # DataFrame.sample would raise if asked for more rows than exist.
    if 0 < sample_size < len(df):
        df = df.sample(n=sample_size, random_state=seed)

    # Drop the original row labels left behind by sampling.
    df = df.reset_index(drop=True)

    if caption_preprocessor is not None:
        df["caption"] = df["caption"].apply(caption_preprocessor)

    log.info("coco_loaded", rows=len(df), unique_images=df["image"].nunique())
    return df