Spaces:

apoorvrajdev
/

image-captioning-api

Configuration error

App Files Files Community

image-captioning-api / src /captioning /data /coco.py

apoorvrajdev

fix(ci): un-ignore src/captioning/data/ source package

1d1e600 4 days ago

raw

history blame contribute delete

3.56 kB

	"""Load COCO 2017 caption annotations into a ``pandas.DataFrame``.

	Mirrors notebook cell 2 with two small but important upgrades that don't
	change behaviour at fixed seeds:

	1. Seeded sampling. The notebook calls ``captions.sample(120000)`` with no
	``random_state``, so two runs produce different subsets. We thread the
	seed through so every run is identical when the seed is fixed.
	2. Path validation. The notebook constructs paths via ``f-string`` with no
	``os.path.exists`` check; if the dataset is missing, training fails ten
	minutes in. We check the annotations file up front and raise a clear error.
	"""

	from __future__ import annotations

	import json
	from collections.abc import Callable
	from pathlib import Path

	import pandas as pd

	from captioning.utils.logging import get_logger

	# Module-level logger obtained from the project's ``get_logger`` helper with
	# this module's ``__name__``; calls in this file pass an event name plus
	# keyword-argument fields.
	log = get_logger(__name__)

	# %-format template for an image file name: the integer image id is
	# zero-padded to 12 digits and suffixed with ".jpg".
	IMAGE_FILENAME_TEMPLATE = "%012d.jpg" # Notebook cell 2: '%012d.jpg' % image_id


	def load_coco_annotations(
	base_path: str \| Path,
	annotations_filename: str = "captions_train2017.json",
	images_subdir: str = "train2017",
	sample_size: int = 120_000,
	seed: int \| None = None,
	caption_preprocessor: Callable[[str], str] \| None = None,
	) -> pd.DataFrame:
	"""Read COCO annotations and return a (image, caption) DataFrame.

	Mirrors notebook cell 2 + cell 4 (when ``caption_preprocessor`` is
	supplied). Returns the same columns and dtypes the notebook produces.

	Args:
	base_path: Path to the COCO root containing ``annotations/`` and the
	images sub-directory.
	annotations_filename: JSON file under ``base_path / 'annotations'``.
	images_subdir: Folder containing the JPEG files.
	sample_size: Number of caption rows to keep after sampling. Use
	``-1`` to disable sampling and keep everything.
	seed: Random seed for deterministic sampling. ``None`` matches the
	notebook's non-deterministic behaviour.
	caption_preprocessor: Optional function applied to the ``caption``
	column. Pass ``preprocessing.preprocess_caption`` to reproduce
	cell 4. Left optional so callers can stage the preprocessing
	differently (the parity audit script applies it manually).

	Returns:
	``pd.DataFrame`` with columns ``image`` (absolute path) and ``caption``
	(string), index reset.

	Raises:
	FileNotFoundError: If ``annotations_filename`` is missing.
	"""
	base_path = Path(base_path)
	annotations_path = base_path / "annotations" / annotations_filename
	if not annotations_path.is_file():
	raise FileNotFoundError(
	f"COCO annotations not found at {annotations_path}. "
	f"Run `python -m scripts.prepare_data` to download."
	)

	log.info("loading_coco", path=str(annotations_path))
	with annotations_path.open(encoding="utf-8") as f:
	payload = json.load(f)
	annotations = payload["annotations"]

	img_dir = base_path / images_subdir
	img_cap_pairs = [
	[str(img_dir / (IMAGE_FILENAME_TEMPLATE % sample["image_id"])), sample["caption"]]
	for sample in annotations
	]
	df = pd.DataFrame(img_cap_pairs, columns=["image", "caption"])

	if sample_size > 0 and sample_size < len(df):
	df = df.sample(n=sample_size, random_state=seed)

	df = df.reset_index(drop=True)

	if caption_preprocessor is not None:
	df["caption"] = df["caption"].apply(caption_preprocessor)

	log.info("coco_loaded", rows=len(df), unique_images=df["image"].nunique())
	return df