Spaces:

apoorvrajdev
/

image-captioning-api

Configuration error

App Files Files Community

image-captioning-api / scripts /bootstrap_dev_artifacts.py

apoorvrajdev

feat(api): build production-grade FastAPI inference backend

08f1adc 27 days ago

raw

history blame contribute delete

6.07 kB

	"""Generate development-only model artifacts so the FastAPI backend can boot.

	Why this script exists:
	The Phase 2 backend lifespan loads weights + tokenizer from
	``models/v1.0.0/``. Until Phase 1 training has been run end-to-end on
	COCO, those files don't exist and ``uvicorn`` fails on startup with
	``FileNotFoundError``. This script produces a valid but
	not meaningfully trained set of artefacts so:

	* the entire backend pipeline (lifespan, /healthz, /v1/captions,
	  multipart upload, predictor wiring) can be exercised;
	* mypy/ruff/pytest stay green;
	* a recruiter reviewing the repo can run ``uvicorn`` and hit the API.

	Captions returned by the bootstrapped model will be gibberish — every
	weight is initialised by Keras's default initialiser and never trained.
	That's deliberate and clearly documented; the goal is to verify the
	serving system, not produce real predictions.

	Usage::

	    python -m scripts.bootstrap_dev_artifacts \\
	        --config configs/base.yaml \\
	        --output-dir models/v1.0.0

	The script is idempotent — running it twice overwrites the previous
	artefacts. To replace dev artefacts with real Phase 1 outputs, run
	``scripts/train.py`` and copy ``model.h5`` + ``vocab.pkl`` into the same
	directory.
	"""

	from __future__ import annotations

	from pathlib import Path

	import click

	from captioning.config import load_config
	from captioning.models.factory import build_caption_model
	from captioning.preprocessing.tokenizer import CaptionTokenizer
	from captioning.utils import configure_logging, get_logger

	# Module-level logger; ``main`` calls it with an event name plus keyword
	# fields (e.g. ``log.info("bootstrap_starting", output_dir=...)``).
	log = get_logger(__name__)

	# Hand-written stand-in captions used to fit the throwaway tokenizer.
	# Each one is wrapped in ``[start] ... [end]`` so it has the same shape as
	# the pre-processed captions the real training pipeline produces in
	# cell 4. Fitting on this yields only a small vocabulary (~50 tokens);
	# that is acceptable because the model's vocab_size is read back from the
	# fitted tokenizer at build time, keeping weights and decode tables in
	# lockstep.
	_DEV_CORPUS: list[str] = [
	    f"[start] {caption} [end]"
	    for caption in (
	        "a man riding a surfboard on a wave",
	        "a woman holding a small dog in her arms",
	        "a group of people standing on a beach",
	        "a cat sitting on top of a wooden table",
	        "a plate of food on a wooden table",
	        "a red bus driving down a city street",
	        "a child kicking a soccer ball in a park",
	        "two birds sitting on a tree branch",
	        "a kitchen with a stove and a refrigerator",
	        "a person standing in front of a mountain",
	    )
	]


	# NOTE: click uses the function docstring as the command's ``--help`` text,
	# so the docstring below is kept short; the longer walkthrough lives in the
	# inline comments.
	@click.command()
	@click.option(
	    "--config",
	    "config_path",
	    default=Path("configs/base.yaml"),
	    show_default=True,
	    type=click.Path(exists=True, path_type=Path),
	    help="App config YAML. Architecture hyperparameters are read from `model.*`.",
	)
	@click.option(
	    "--output-dir",
	    default=Path("models/v1.0.0"),
	    show_default=True,
	    type=click.Path(path_type=Path),
	    help="Directory that will contain model.h5, vocab.pkl, vocab.json.",
	)
	def main(config_path: Path, output_dir: Path) -> None:
	    """Create model.h5 + vocab.pkl + vocab.json under ``output-dir``."""
	    configure_logging()
	    cfg = load_config(config_path)

	    # Create the target directory up front; re-running over an existing
	    # directory is fine (``exist_ok=True``).
	    output_dir.mkdir(parents=True, exist_ok=True)
	    # The weights file name comes from the config rather than being
	    # hard-coded here.
	    weights_path = output_dir / cfg.train.weights_filename

	    log.info("bootstrap_starting", output_dir=str(output_dir))

	    # --- Tokenizer -------------------------------------------------------
	    # Fit on the built-in synthetic captions and persist next to the weights.
	    dev_tokenizer = CaptionTokenizer(
	        vocab_size=cfg.model.vocabulary_size,
	        max_length=cfg.model.max_length,
	    )
	    dev_tokenizer.fit(_DEV_CORPUS)
	    dev_tokenizer.save(output_dir)
	    log.info(
	        "tokenizer_saved",
	        directory=str(output_dir),
	        vocabulary_size=dev_tokenizer.vocabulary_size,
	    )

	    # --- Model -----------------------------------------------------------
	    # The vocab size is taken from the *fitted* tokenizer (not the config)
	    # so the saved weights line up with the tokenizer stored beside them.
	    # Augmentation stays at the factory default so the variable tree matches
	    # what a real Phase 1 ``model.fit`` produces and what the predictor
	    # rebuilds on load.
	    caption_model = build_caption_model(
	        cfg, vocab_size=dev_tokenizer.vocabulary_size
	    )

	    # TensorFlow is imported here rather than at module top level, as in the
	    # original layout.
	    import tensorflow as tf

	    # --- Variable creation -----------------------------------------------
	    # Run every sub-model once on zeros so all variables exist before the
	    # save. The call order follows ``CaptionPredictor._dummy_pass`` so that
	    # saving and loading stay symmetric; do not reorder.
	    # NOTE(review): 299x299x3 is hard-coded; presumably the CNN backbone's
	    # expected input size. Confirm against the model factory.
	    zero_image = tf.zeros((1, 299, 299, 3), dtype=tf.float32)
	    zero_tokens = tf.zeros((1, cfg.model.max_length), dtype=tf.int64)

	    image_features = caption_model.cnn_model(zero_image)
	    encoder_out = caption_model.encoder(image_features, training=False)

	    # Decoder input drops the last position; the mask is derived from the
	    # sequence with the first position dropped (non-zero ids count as real).
	    decoder_inputs = zero_tokens[:, :-1]
	    target_mask = tf.cast(zero_tokens[:, 1:] != 0, tf.int32)
	    caption_model.decoder(
	        decoder_inputs,
	        encoder_out,
	        training=False,
	        mask=target_mask,
	    )

	    # The augmentation layer is optional on the model; call it only if set.
	    augmenter = getattr(caption_model, "image_aug", None)
	    if augmenter is not None:
	        augmenter(zero_image, training=False)

	    # A real Phase 1 checkpoint gets ``built`` set implicitly by
	    # ``model.fit``. Nothing is fitted here, so flag the parent Model as
	    # built by hand so the HDF5 save/load round-trips; the predictor's
	    # ``_dummy_pass`` does the matching step on load.
	    caption_model.built = True

	    # --- Save ------------------------------------------------------------
	    # Same file structure as a trained checkpoint; only the values are
	    # untrained random initialisations.
	    caption_model.save_weights(str(weights_path))
	    log.info(
	        "weights_saved",
	        path=str(weights_path),
	        warning="weights are randomly initialised; outputs will be gibberish",
	    )

	    summary = (
	        "\nDevelopment artefacts written:\n"
	        f" weights : {weights_path}\n"
	        f" vocab : {output_dir / 'vocab.pkl'}\n"
	        f" vocab : {output_dir / 'vocab.json'}\n"
	        "\nThese are SMOKE-TEST artefacts only. Replace with real Phase 1 "
	        "outputs before drawing any inference about model quality."
	    )
	    click.echo(summary)


	# Script entry point: invoke the click command when the module is executed
	# directly (e.g. ``python -m scripts.bootstrap_dev_artifacts``).
	if __name__ == "__main__":
	    main()