Spaces:
Configuration error
Configuration error
File size: 6,069 Bytes
08f1adc | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 | """Generate development-only model artifacts so the FastAPI backend can boot.
Why this script exists:
The Phase 2 backend lifespan loads weights + tokenizer from
``models/v1.0.0/``. Until Phase 1 training has been run end-to-end on
COCO, those files don't exist and ``uvicorn`` fails on startup with
``FileNotFoundError``. This script produces a *valid* but
*not meaningfully trained* set of artefacts so:
* the entire backend pipeline (lifespan, /healthz, /v1/captions,
multipart upload, predictor wiring) can be exercised;
* mypy/ruff/pytest stay green;
* a recruiter reviewing the repo can run ``uvicorn`` and hit the API.
Captions returned by the bootstrapped model will be *gibberish* — every
weight is initialised by Keras's default initialiser and never trained.
That's deliberate and clearly documented; the goal is to verify the
serving system, not produce real predictions.
Usage::
python -m scripts.bootstrap_dev_artifacts \\
--config configs/base.yaml \\
--output-dir models/v1.0.0
The script is idempotent — running it twice overwrites the previous
artefacts. To replace dev artefacts with real Phase 1 outputs, run
``scripts/train.py`` and copy ``model.h5`` + ``vocab.pkl`` into the same
directory.
"""
from __future__ import annotations
from pathlib import Path
import click
from captioning.config import load_config
from captioning.models.factory import build_caption_model
from captioning.preprocessing.tokenizer import CaptionTokenizer
from captioning.utils import configure_logging, get_logger
log = get_logger(__name__)
# A tiny synthetic corpus. Wrapped in [start] ... [end] to mirror exactly the
# pre-processed format the real training pipeline produces in cell 4. The
# vocabulary that comes out of fitting on this is small (~50 tokens), but
# that's fine: the model's vocab_size is taken from the fitted tokenizer at
# build time, so weights and decode tables stay in lockstep.
_DEV_CORPUS: list[str] = [
"[start] a man riding a surfboard on a wave [end]",
"[start] a woman holding a small dog in her arms [end]",
"[start] a group of people standing on a beach [end]",
"[start] a cat sitting on top of a wooden table [end]",
"[start] a plate of food on a wooden table [end]",
"[start] a red bus driving down a city street [end]",
"[start] a child kicking a soccer ball in a park [end]",
"[start] two birds sitting on a tree branch [end]",
"[start] a kitchen with a stove and a refrigerator [end]",
"[start] a person standing in front of a mountain [end]",
]
@click.command()
@click.option(
"--config",
"config_path",
default=Path("configs/base.yaml"),
show_default=True,
type=click.Path(exists=True, path_type=Path),
help="App config YAML. Architecture hyperparameters are read from `model.*`.",
)
@click.option(
"--output-dir",
default=Path("models/v1.0.0"),
show_default=True,
type=click.Path(path_type=Path),
help="Directory that will contain model.h5, vocab.pkl, vocab.json.",
)
def main(config_path: Path, output_dir: Path) -> None:
"""Create model.h5 + vocab.pkl + vocab.json under ``output-dir``."""
configure_logging()
config = load_config(config_path)
output_dir.mkdir(parents=True, exist_ok=True)
weights_filename = config.train.weights_filename
weights_path = output_dir / weights_filename
log.info("bootstrap_starting", output_dir=str(output_dir))
# 1. Fit a tiny tokenizer on the synthetic corpus and save it.
tokenizer = CaptionTokenizer(
vocab_size=config.model.vocabulary_size,
max_length=config.model.max_length,
)
tokenizer.fit(_DEV_CORPUS)
tokenizer.save(output_dir)
log.info(
"tokenizer_saved",
directory=str(output_dir),
vocabulary_size=tokenizer.vocabulary_size,
)
# 2. Build the model with the *fitted* vocab size so the weights file
# matches the tokenizer that will be loaded next to it. Augmentation
# is left at its default (enabled) so the variable tree matches what
# a real Phase 1 ``model.fit`` produces — the predictor builds with
# the same defaults on load.
model = build_caption_model(config, vocab_size=tokenizer.vocabulary_size)
# 3. Force a forward pass so all variables are created before save. The
# sequence of calls mirrors ``CaptionPredictor._dummy_pass`` exactly,
# keeping save/load symmetric.
import tensorflow as tf
dummy_img = tf.zeros((1, 299, 299, 3), dtype=tf.float32)
dummy_caps = tf.zeros((1, config.model.max_length), dtype=tf.int64)
img_embed = model.cnn_model(dummy_img)
encoded = model.encoder(img_embed, training=False)
_ = model.decoder(
dummy_caps[:, :-1],
encoded,
training=False,
mask=tf.cast(dummy_caps[:, 1:] != 0, tf.int32),
)
if getattr(model, "image_aug", None) is not None:
_ = model.image_aug(dummy_img, training=False)
# 4. Mark the parent Model as built so HDF5 save/load round-trips. Real
# Phase 1 weights satisfy this implicitly via ``model.fit``; the
# bootstrap doesn't fit, so we set the flag explicitly. Predictor's
# ``_dummy_pass`` does the symmetric thing on load.
model.built = True
# 5. Save randomly-initialised weights. The file is structurally identical
# to a real Phase 1 checkpoint; only the values inside are untrained.
model.save_weights(str(weights_path))
log.info(
"weights_saved",
path=str(weights_path),
warning="weights are randomly initialised; outputs will be gibberish",
)
click.echo(
"\nDevelopment artefacts written:\n"
f" weights : {weights_path}\n"
f" vocab : {output_dir / 'vocab.pkl'}\n"
f" vocab : {output_dir / 'vocab.json'}\n"
"\nThese are SMOKE-TEST artefacts only. Replace with real Phase 1 "
"outputs before drawing any inference about model quality."
)
if __name__ == "__main__":
main()
|