Spaces:
Configuration error
Configuration error
| """Load COCO 2017 caption annotations into a ``pandas.DataFrame``. | |
| Mirrors notebook cell 2 with two small but important upgrades that don't | |
| change behaviour at fixed seeds: | |
| 1. **Seeded sampling.** The notebook calls ``captions.sample(120000)`` with no | |
| ``random_state``, so two runs produce different subsets. We thread the | |
| seed through so every run is identical when the seed is fixed. | |
| 2. **Path validation.** The notebook constructs paths via ``f-string`` with no | |
| ``os.path.exists`` check; if the dataset is missing, training fails ten | |
| minutes in. We check the annotations file up front and raise a clear error. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from collections.abc import Callable | |
| from pathlib import Path | |
| import pandas as pd | |
| from captioning.utils.logging import get_logger | |
# Logger obtained from the project helper, keyed by this module's name.
# Calls in this file pass an event name plus keyword fields.
log = get_logger(__name__)

# printf-style pattern: zero-pads an integer image id to 12 digits and appends
# ".jpg" (notebook cell 2: '%012d.jpg' % image_id).
IMAGE_FILENAME_TEMPLATE = "%012d.jpg"
def load_coco_annotations(
    base_path: str | Path,
    annotations_filename: str = "captions_train2017.json",
    images_subdir: str = "train2017",
    sample_size: int = 120_000,
    seed: int | None = None,
    caption_preprocessor: Callable[[str], str] | None = None,
) -> pd.DataFrame:
    """Build an (image path, caption) table from a COCO captions JSON file.

    Follows notebook cell 2, and cell 4 as well when a
    ``caption_preprocessor`` is given.

    Args:
        base_path: COCO root directory. The annotations file is looked up at
            ``base_path / "annotations" / annotations_filename`` and image
            paths are built under ``base_path / images_subdir``.
        annotations_filename: Name of the captions JSON file inside the
            ``annotations`` directory.
        images_subdir: Directory (relative to ``base_path``) that holds the
            JPEG files. Image files themselves are not checked for existence.
        sample_size: Number of rows to draw at random. Sampling only happens
            when ``0 < sample_size < number of caption rows``; any other
            value (e.g. ``-1``) keeps every row in file order.
        seed: Forwarded to ``DataFrame.sample`` as ``random_state``. With
            ``None`` the drawn subset differs from run to run.
        caption_preprocessor: Optional callable applied to each value of the
            ``caption`` column after sampling. Pass
            ``preprocessing.preprocess_caption`` to reproduce cell 4.

    Returns:
        ``pd.DataFrame`` with columns ``image`` (string path; absolute only
        if ``base_path`` is absolute) and ``caption``, with a fresh
        0..n-1 index.

    Raises:
        FileNotFoundError: If the annotations file does not exist.
        KeyError: If the JSON has no ``"annotations"`` entry, or an entry
            lacks ``"image_id"`` / ``"caption"``.
    """
    coco_root = Path(base_path)
    annotations_path = coco_root.joinpath("annotations", annotations_filename)

    # Fail fast with an actionable message instead of deep inside training.
    if not annotations_path.is_file():
        raise FileNotFoundError(
            f"COCO annotations not found at {annotations_path}. "
            "Run `python -m scripts.prepare_data` to download."
        )

    log.info("loading_coco", path=str(annotations_path))
    payload = json.loads(annotations_path.read_text(encoding="utf-8"))

    # One row per caption; an image with several captions appears several times.
    image_root = coco_root / images_subdir
    rows = []
    for entry in payload["annotations"]:
        filename = IMAGE_FILENAME_TEMPLATE % entry["image_id"]
        rows.append([str(image_root / filename), entry["caption"]])
    frame = pd.DataFrame(rows, columns=["image", "caption"])

    # Down-sample only when it actually shrinks the table. An unsampled frame
    # already carries the default 0..n-1 index, so no reset is needed there.
    if 0 < sample_size < len(frame):
        frame = frame.sample(n=sample_size, random_state=seed).reset_index(drop=True)

    if caption_preprocessor is not None:
        frame["caption"] = frame["caption"].apply(caption_preprocessor)

    log.info("coco_loaded", rows=len(frame), unique_images=frame["image"].nunique())
    return frame