apoorvrajdev's picture
fix(ci): un-ignore src/captioning/data/ source package
1d1e600
"""Load COCO 2017 caption annotations into a ``pandas.DataFrame``.
Mirrors notebook cell 2 with two small but important upgrades that don't
change behaviour at fixed seeds:
1. **Seeded sampling.** The notebook calls ``captions.sample(120000)`` with no
``random_state``, so two runs produce different subsets. We thread the
seed through so every run is identical when the seed is fixed.
2. **Path validation.** The notebook constructs paths via ``f-string`` with no
``os.path.exists`` check; if the dataset is missing, training fails ten
minutes in. We check the annotations file up front and raise a clear error.
"""
from __future__ import annotations
import json
from collections.abc import Callable
from pathlib import Path
import pandas as pd
from captioning.utils.logging import get_logger
# Module-level logger; used below with an event name plus keyword fields.
log = get_logger(__name__)

# printf-style template: zero-pads an integer image id to 12 digits and
# appends ".jpg" (notebook cell 2: '%012d.jpg' % image_id).
IMAGE_FILENAME_TEMPLATE = "%012d.jpg"
def load_coco_annotations(
    base_path: str | Path,
    annotations_filename: str = "captions_train2017.json",
    images_subdir: str = "train2017",
    sample_size: int = 120_000,
    seed: int | None = None,
    caption_preprocessor: Callable[[str], str] | None = None,
) -> pd.DataFrame:
    """Build an (image, caption) table from a COCO captions JSON file.

    Mirrors notebook cell 2, plus cell 4 when ``caption_preprocessor`` is
    given. One row is produced per entry in the JSON ``"annotations"`` list.

    Args:
        base_path: COCO root directory; the annotations file is looked up
            under ``base_path / "annotations"``.
        annotations_filename: Name of the captions JSON file inside the
            ``annotations`` directory.
        images_subdir: Directory under ``base_path`` used to build each
            image path.
        sample_size: Number of rows to draw at random. Sampling is skipped
            (all rows are kept) when the value is not positive, e.g. ``-1``,
            or is not smaller than the number of available rows.
        seed: Forwarded to ``DataFrame.sample`` as ``random_state``. With
            ``None`` the sampled subset differs between runs.
        caption_preprocessor: Optional callable applied to every value of
            the ``caption`` column after sampling.

    Returns:
        ``pd.DataFrame`` with columns ``image`` (path string built from
        ``base_path / images_subdir`` and the zero-padded image id) and
        ``caption``, with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If the annotations file does not exist.
    """
    root = Path(base_path)
    ann_file = root / "annotations" / annotations_filename

    # Fail fast with an actionable message instead of erroring later on.
    if not ann_file.is_file():
        message = (
            f"COCO annotations not found at {ann_file}. "
            "Run `python -m scripts.prepare_data` to download."
        )
        raise FileNotFoundError(message)

    log.info("loading_coco", path=str(ann_file))
    payload = json.loads(ann_file.read_text(encoding="utf-8"))

    images_root = root / images_subdir
    rows = [
        [str(images_root / (IMAGE_FILENAME_TEMPLATE % entry["image_id"])), entry["caption"]]
        for entry in payload["annotations"]
    ]
    frame = pd.DataFrame(rows, columns=["image", "caption"])

    # Only draw a subset when a positive size smaller than the table is asked for.
    if 0 < sample_size < len(frame):
        frame = frame.sample(n=sample_size, random_state=seed)
    frame = frame.reset_index(drop=True)

    if caption_preprocessor is not None:
        frame["caption"] = frame["caption"].apply(caption_preprocessor)

    n_images = frame["image"].nunique()
    log.info("coco_loaded", rows=len(frame), unique_images=n_images)
    return frame