Spaces:
Configuration error
Configuration error
| """Generate development-only model artifacts so the FastAPI backend can boot. | |
| Why this script exists: | |
| The Phase 2 backend lifespan loads weights + tokenizer from | |
| ``models/v1.0.0/``. Until Phase 1 training has been run end-to-end on | |
| COCO, those files don't exist and ``uvicorn`` fails on startup with | |
| ``FileNotFoundError``. This script produces a *valid* but | |
| *not meaningfully trained* set of artefacts so: | |
| * the entire backend pipeline (lifespan, /healthz, /v1/captions, | |
| multipart upload, predictor wiring) can be exercised; | |
| * mypy/ruff/pytest stay green; | |
| * a recruiter reviewing the repo can run ``uvicorn`` and hit the API. | |
| Captions returned by the bootstrapped model will be *gibberish* — every | |
| weight is initialised by Keras's default initialiser and never trained. | |
| That's deliberate and clearly documented; the goal is to verify the | |
| serving system, not produce real predictions. | |
| Usage:: | |
| python -m scripts.bootstrap_dev_artifacts \\ | |
| --config configs/base.yaml \\ | |
| --output-dir models/v1.0.0 | |
| The script is idempotent — running it twice overwrites the previous | |
| artefacts. To replace dev artefacts with real Phase 1 outputs, run | |
| ``scripts/train.py`` and copy ``model.h5`` + ``vocab.pkl`` into the same | |
| directory. | |
| """ | |
| from __future__ import annotations | |
| from pathlib import Path | |
| import click | |
| from captioning.config import load_config | |
| from captioning.models.factory import build_caption_model | |
| from captioning.preprocessing.tokenizer import CaptionTokenizer | |
| from captioning.utils import configure_logging, get_logger | |
| log = get_logger(__name__) | |
| # A tiny synthetic corpus. Wrapped in [start] ... [end] to mirror exactly the | |
| # pre-processed format the real training pipeline produces in cell 4. The | |
| # vocabulary that comes out of fitting on this is small (~50 tokens), but | |
| # that's fine: the model's vocab_size is taken from the fitted tokenizer at | |
| # build time, so weights and decode tables stay in lockstep. | |
| _DEV_CORPUS: list[str] = [ | |
| "[start] a man riding a surfboard on a wave [end]", | |
| "[start] a woman holding a small dog in her arms [end]", | |
| "[start] a group of people standing on a beach [end]", | |
| "[start] a cat sitting on top of a wooden table [end]", | |
| "[start] a plate of food on a wooden table [end]", | |
| "[start] a red bus driving down a city street [end]", | |
| "[start] a child kicking a soccer ball in a park [end]", | |
| "[start] two birds sitting on a tree branch [end]", | |
| "[start] a kitchen with a stove and a refrigerator [end]", | |
| "[start] a person standing in front of a mountain [end]", | |
| ] | |
| def main(config_path: Path, output_dir: Path) -> None: | |
| """Create model.h5 + vocab.pkl + vocab.json under ``output-dir``.""" | |
| configure_logging() | |
| config = load_config(config_path) | |
| output_dir.mkdir(parents=True, exist_ok=True) | |
| weights_filename = config.train.weights_filename | |
| weights_path = output_dir / weights_filename | |
| log.info("bootstrap_starting", output_dir=str(output_dir)) | |
| # 1. Fit a tiny tokenizer on the synthetic corpus and save it. | |
| tokenizer = CaptionTokenizer( | |
| vocab_size=config.model.vocabulary_size, | |
| max_length=config.model.max_length, | |
| ) | |
| tokenizer.fit(_DEV_CORPUS) | |
| tokenizer.save(output_dir) | |
| log.info( | |
| "tokenizer_saved", | |
| directory=str(output_dir), | |
| vocabulary_size=tokenizer.vocabulary_size, | |
| ) | |
| # 2. Build the model with the *fitted* vocab size so the weights file | |
| # matches the tokenizer that will be loaded next to it. Augmentation | |
| # is left at its default (enabled) so the variable tree matches what | |
| # a real Phase 1 ``model.fit`` produces — the predictor builds with | |
| # the same defaults on load. | |
| model = build_caption_model(config, vocab_size=tokenizer.vocabulary_size) | |
| # 3. Force a forward pass so all variables are created before save. The | |
| # sequence of calls mirrors ``CaptionPredictor._dummy_pass`` exactly, | |
| # keeping save/load symmetric. | |
| import tensorflow as tf | |
| dummy_img = tf.zeros((1, 299, 299, 3), dtype=tf.float32) | |
| dummy_caps = tf.zeros((1, config.model.max_length), dtype=tf.int64) | |
| img_embed = model.cnn_model(dummy_img) | |
| encoded = model.encoder(img_embed, training=False) | |
| _ = model.decoder( | |
| dummy_caps[:, :-1], | |
| encoded, | |
| training=False, | |
| mask=tf.cast(dummy_caps[:, 1:] != 0, tf.int32), | |
| ) | |
| if getattr(model, "image_aug", None) is not None: | |
| _ = model.image_aug(dummy_img, training=False) | |
| # 4. Mark the parent Model as built so HDF5 save/load round-trips. Real | |
| # Phase 1 weights satisfy this implicitly via ``model.fit``; the | |
| # bootstrap doesn't fit, so we set the flag explicitly. Predictor's | |
| # ``_dummy_pass`` does the symmetric thing on load. | |
| model.built = True | |
| # 5. Save randomly-initialised weights. The file is structurally identical | |
| # to a real Phase 1 checkpoint; only the values inside are untrained. | |
| model.save_weights(str(weights_path)) | |
| log.info( | |
| "weights_saved", | |
| path=str(weights_path), | |
| warning="weights are randomly initialised; outputs will be gibberish", | |
| ) | |
| click.echo( | |
| "\nDevelopment artefacts written:\n" | |
| f" weights : {weights_path}\n" | |
| f" vocab : {output_dir / 'vocab.pkl'}\n" | |
| f" vocab : {output_dir / 'vocab.json'}\n" | |
| "\nThese are SMOKE-TEST artefacts only. Replace with real Phase 1 " | |
| "outputs before drawing any inference about model quality." | |
| ) | |
| if __name__ == "__main__": | |
| main() | |