"""Data — dataset loaders, splits, and tf.data pipelines. Data lives separately from preprocessing because *I/O and randomness* are fundamentally different concerns from pure transforms. This package owns: coco.py Read COCO annotation JSONs into a (image_path, caption) DataFrame splits.py Deterministic image-level train/val splitting (NOT caption-level — preventing the same image from appearing in both splits) pipeline.py Compose preprocessing + tokenization into tf.data pipelines """ from captioning.data.coco import load_coco_annotations from captioning.data.pipeline import build_train_pipeline, build_val_pipeline from captioning.data.splits import make_image_level_splits __all__ = [ "build_train_pipeline", "build_val_pipeline", "load_coco_annotations", "make_image_level_splits", ]