Spaces:

apoorvrajdev
/

image-captioning-api

Configuration error

File size: 6,069 Bytes

08f1adc

"""Generate development-only model artifacts so the FastAPI backend can boot.

Why this script exists:
    The Phase 2 backend lifespan loads weights + tokenizer from
    ``models/v1.0.0/``. Until Phase 1 training has been run end-to-end on
    COCO, those files don't exist and ``uvicorn`` fails on startup with
    ``FileNotFoundError``. This script produces a *valid* but
    *not meaningfully trained* set of artefacts so:

      * the entire backend pipeline (lifespan, /healthz, /v1/captions,
        multipart upload, predictor wiring) can be exercised;
      * mypy/ruff/pytest stay green;
      * a recruiter reviewing the repo can run ``uvicorn`` and hit the API.

Captions returned by the bootstrapped model will be *gibberish* — every
weight is initialised by Keras's default initialiser and never trained.
That's deliberate and clearly documented; the goal is to verify the
serving system, not produce real predictions.

Usage::

    python -m scripts.bootstrap_dev_artifacts \\
        --config configs/base.yaml \\
        --output-dir models/v1.0.0

The script is idempotent — running it twice overwrites the previous
artefacts. To replace dev artefacts with real Phase 1 outputs, run
``scripts/train.py`` and copy ``model.h5`` + ``vocab.pkl`` into the same
directory.
"""

from __future__ import annotations

from pathlib import Path

import click

from captioning.config import load_config
from captioning.models.factory import build_caption_model
from captioning.preprocessing.tokenizer import CaptionTokenizer
from captioning.utils import configure_logging, get_logger

log = get_logger(__name__)

# A tiny synthetic corpus. Wrapped in [start] ... [end] to mirror exactly the
# pre-processed format the real training pipeline produces in cell 4. The
# vocabulary that comes out of fitting on this is small (~50 tokens), but
# that's fine: the model's vocab_size is taken from the fitted tokenizer at
# build time, so weights and decode tables stay in lockstep.
_DEV_CORPUS: list[str] = [
    "[start] a man riding a surfboard on a wave [end]",
    "[start] a woman holding a small dog in her arms [end]",
    "[start] a group of people standing on a beach [end]",
    "[start] a cat sitting on top of a wooden table [end]",
    "[start] a plate of food on a wooden table [end]",
    "[start] a red bus driving down a city street [end]",
    "[start] a child kicking a soccer ball in a park [end]",
    "[start] two birds sitting on a tree branch [end]",
    "[start] a kitchen with a stove and a refrigerator [end]",
    "[start] a person standing in front of a mountain [end]",
]


@click.command()
@click.option(
    "--config",
    "config_path",
    default=Path("configs/base.yaml"),
    show_default=True,
    type=click.Path(exists=True, path_type=Path),
    help="App config YAML. Architecture hyperparameters are read from `model.*`.",
)
@click.option(
    "--output-dir",
    default=Path("models/v1.0.0"),
    show_default=True,
    type=click.Path(path_type=Path),
    help="Directory that will contain model.h5, vocab.pkl, vocab.json.",
)
def main(config_path: Path, output_dir: Path) -> None:
    """Create model.h5 + vocab.pkl + vocab.json under ``output-dir``."""
    configure_logging()
    config = load_config(config_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    weights_filename = config.train.weights_filename
    weights_path = output_dir / weights_filename

    log.info("bootstrap_starting", output_dir=str(output_dir))

    # 1. Fit a tiny tokenizer on the synthetic corpus and save it.
    tokenizer = CaptionTokenizer(
        vocab_size=config.model.vocabulary_size,
        max_length=config.model.max_length,
    )
    tokenizer.fit(_DEV_CORPUS)
    tokenizer.save(output_dir)
    log.info(
        "tokenizer_saved",
        directory=str(output_dir),
        vocabulary_size=tokenizer.vocabulary_size,
    )

    # 2. Build the model with the *fitted* vocab size so the weights file
    #    matches the tokenizer that will be loaded next to it. Augmentation
    #    is left at its default (enabled) so the variable tree matches what
    #    a real Phase 1 ``model.fit`` produces — the predictor builds with
    #    the same defaults on load.
    model = build_caption_model(config, vocab_size=tokenizer.vocabulary_size)

    # 3. Force a forward pass so all variables are created before save. The
    #    sequence of calls mirrors ``CaptionPredictor._dummy_pass`` exactly,
    #    keeping save/load symmetric.
    import tensorflow as tf

    dummy_img = tf.zeros((1, 299, 299, 3), dtype=tf.float32)
    dummy_caps = tf.zeros((1, config.model.max_length), dtype=tf.int64)
    img_embed = model.cnn_model(dummy_img)
    encoded = model.encoder(img_embed, training=False)
    _ = model.decoder(
        dummy_caps[:, :-1],
        encoded,
        training=False,
        mask=tf.cast(dummy_caps[:, 1:] != 0, tf.int32),
    )
    if getattr(model, "image_aug", None) is not None:
        _ = model.image_aug(dummy_img, training=False)

    # 4. Mark the parent Model as built so HDF5 save/load round-trips. Real
    #    Phase 1 weights satisfy this implicitly via ``model.fit``; the
    #    bootstrap doesn't fit, so we set the flag explicitly. Predictor's
    #    ``_dummy_pass`` does the symmetric thing on load.
    model.built = True

    # 5. Save randomly-initialised weights. The file is structurally identical
    #    to a real Phase 1 checkpoint; only the values inside are untrained.
    model.save_weights(str(weights_path))
    log.info(
        "weights_saved",
        path=str(weights_path),
        warning="weights are randomly initialised; outputs will be gibberish",
    )

    click.echo(
        "\nDevelopment artefacts written:\n"
        f"  weights : {weights_path}\n"
        f"  vocab   : {output_dir / 'vocab.pkl'}\n"
        f"  vocab   : {output_dir / 'vocab.json'}\n"
        "\nThese are SMOKE-TEST artefacts only. Replace with real Phase 1 "
        "outputs before drawing any inference about model quality."
    )


if __name__ == "__main__":
    main()