File size: 6,069 Bytes
08f1adc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""Generate development-only model artifacts so the FastAPI backend can boot.

Why this script exists:
    The Phase 2 backend lifespan loads weights + tokenizer from
    ``models/v1.0.0/``. Until Phase 1 training has been run end-to-end on
    COCO, those files don't exist and ``uvicorn`` fails on startup with
    ``FileNotFoundError``. This script produces a *valid* but
    *not meaningfully trained* set of artefacts so:

      * the entire backend pipeline (lifespan, /healthz, /v1/captions,
        multipart upload, predictor wiring) can be exercised;
      * mypy/ruff/pytest stay green;
      * a recruiter reviewing the repo can run ``uvicorn`` and hit the API.

Captions returned by the bootstrapped model will be *gibberish* — every
weight is initialised by Keras's default initialiser and never trained.
That's deliberate and clearly documented; the goal is to verify the
serving system, not produce real predictions.

Usage::

    python -m scripts.bootstrap_dev_artifacts \\
        --config configs/base.yaml \\
        --output-dir models/v1.0.0

The script is idempotent — running it twice overwrites the previous
artefacts. To replace dev artefacts with real Phase 1 outputs, run
``scripts/train.py`` and copy ``model.h5`` + ``vocab.pkl`` into the same
directory.
"""

from __future__ import annotations

from pathlib import Path

import click

from captioning.config import load_config
from captioning.models.factory import build_caption_model
from captioning.preprocessing.tokenizer import CaptionTokenizer
from captioning.utils import configure_logging, get_logger

log = get_logger(__name__)

# Synthetic stand-in for the training captions. Each bare caption below is
# wrapped in the "[start] ... [end]" markers so the strings have the same
# shape as the pre-processed captions the real training pipeline emits (cell 4
# of the original notebook, per the project's notes). Fitting the tokenizer on
# these ten sentences gives a very small vocabulary (roughly 50 tokens). That
# is acceptable because ``main`` builds the model with the *fitted*
# tokenizer's vocabulary size, so the saved weights and the decode tables
# always agree with each other.
_DEV_CORPUS: list[str] = [
    f"[start] {caption} [end]"
    for caption in (
        "a man riding a surfboard on a wave",
        "a woman holding a small dog in her arms",
        "a group of people standing on a beach",
        "a cat sitting on top of a wooden table",
        "a plate of food on a wooden table",
        "a red bus driving down a city street",
        "a child kicking a soccer ball in a park",
        "two birds sitting on a tree branch",
        "a kitchen with a stove and a refrigerator",
        "a person standing in front of a mountain",
    )
]


@click.command()
@click.option(
    "--config",
    "config_path",
    default=Path("configs/base.yaml"),
    show_default=True,
    type=click.Path(exists=True, path_type=Path),
    help="App config YAML. Architecture hyperparameters are read from `model.*`.",
)
@click.option(
    "--output-dir",
    default=Path("models/v1.0.0"),
    show_default=True,
    type=click.Path(path_type=Path),
    help="Directory that will contain model.h5, vocab.pkl, vocab.json.",
)
def main(config_path: Path, output_dir: Path) -> None:
    """Create model.h5 + vocab.pkl + vocab.json under ``output-dir``."""
    # NOTE: click shows the docstring above as the command's ``--help`` text,
    # so it is user-facing output; the detailed notes live in comments instead.
    #
    # Arguments (both filled in by click from the options declared above):
    #   config_path -- YAML file handed to ``load_config``; click requires
    #                  that the path exists.
    #   output_dir  -- directory, created if missing, that receives the
    #                  tokenizer files and the weights file.
    # Returns nothing; the result is the files written under ``output_dir``.
    configure_logging()
    settings = load_config(config_path)
    output_dir.mkdir(parents=True, exist_ok=True)
    # The weights file name comes from the config, not from a literal here.
    weights_path = output_dir / settings.train.weights_filename

    log.info("bootstrap_starting", output_dir=str(output_dir))

    # Step 1: fit the tokenizer on the tiny synthetic corpus and persist it
    # into the output directory.
    tokenizer = CaptionTokenizer(
        vocab_size=settings.model.vocabulary_size,
        max_length=settings.model.max_length,
    )
    tokenizer.fit(_DEV_CORPUS)
    tokenizer.save(output_dir)
    log.info(
        "tokenizer_saved",
        directory=str(output_dir),
        vocabulary_size=tokenizer.vocabulary_size,
    )

    # Step 2: build the model using the vocabulary size of the *fitted*
    # tokenizer (not the configured one), so the weights file matches the
    # tokenizer saved beside it. No augmentation override is passed, so
    # whatever default the factory applies is used -- presumably the same
    # default the predictor uses on load; confirm against the predictor.
    model = build_caption_model(settings, vocab_size=tokenizer.vocabulary_size)

    # Step 3: run every sub-model once on zero-filled inputs so that all
    # variables exist before saving. TensorFlow is imported here, not at module
    # import time. NOTE(review): the call sequence is meant to mirror
    # ``CaptionPredictor._dummy_pass`` (defined elsewhere) -- keep in sync.
    import tensorflow as tf

    probe_image = tf.zeros((1, 299, 299, 3), dtype=tf.float32)
    probe_tokens = tf.zeros((1, settings.model.max_length), dtype=tf.int64)
    image_features = model.cnn_model(probe_image)
    encoder_out = model.encoder(image_features, training=False)
    # Decoder input drops the last token; the mask is taken from the sequence
    # shifted by one position, with token id 0 treated as "masked out".
    decoder_input = probe_tokens[:, :-1]
    shifted_mask = tf.cast(probe_tokens[:, 1:] != 0, tf.int32)
    model.decoder(decoder_input, encoder_out, training=False, mask=shifted_mask)
    # The augmentation sub-model is optional; exercise it only when present.
    augmenter = getattr(model, "image_aug", None)
    if augmenter is not None:
        augmenter(probe_image, training=False)

    # Step 4: the parent model is never fitted or called as a whole here, so
    # flag it as built by hand; this is what lets the weights save below work
    # without a ``model.fit`` run.
    model.built = True

    # Step 5: write the untrained (randomly initialised) weights to disk.
    model.save_weights(str(weights_path))
    log.info(
        "weights_saved",
        path=str(weights_path),
        warning="weights are randomly initialised; outputs will be gibberish",
    )

    # Human-readable summary; the leading empty entry yields the blank line
    # that precedes the heading.
    summary_lines = [
        "",
        "Development artefacts written:",
        f"  weights : {weights_path}",
        f"  vocab   : {output_dir / 'vocab.pkl'}",
        f"  vocab   : {output_dir / 'vocab.json'}",
        "",
        "These are SMOKE-TEST artefacts only. Replace with real Phase 1 "
        "outputs before drawing any inference about model quality.",
    ]
    click.echo("\n".join(summary_lines))


# Script entry point: run the CLI only when this file is executed directly
# (e.g. ``python -m scripts.bootstrap_dev_artifacts``), never on import.
# ``main`` is a click command, so it is called with no arguments here and
# click supplies ``config_path`` / ``output_dir`` from the command line.
if __name__ == "__main__":
    main()