Spaces:
Configuration error
Configuration error
Commit ·
3a2e5f0
1
Parent(s): b2594db
feat: finalize Phase 1 modular ML architecture
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .paper-notebook.sha256 +1 -0
- configs/base.yaml +43 -0
- configs/train/debug.yaml +18 -0
- docs/PHASE_1_NOTES.md +350 -0
- pyproject.toml +1 -0
- requirements-dev.txt +1 -0
- scripts/__init__.py +1 -0
- scripts/evaluate.py +110 -0
- scripts/notebook_module_audit.py +244 -0
- scripts/predict.py +47 -0
- scripts/train.py +107 -0
- src/captioning/__init__.py +22 -0
- src/captioning/config/__init__.py +24 -0
- src/captioning/config/loader.py +45 -0
- src/captioning/config/schema.py +133 -0
- src/captioning/evaluation/__init__.py +9 -0
- src/captioning/evaluation/bleu.py +63 -0
- src/captioning/inference/__init__.py +21 -0
- src/captioning/inference/greedy.py +76 -0
- src/captioning/inference/image_loader.py +32 -0
- src/captioning/inference/predictor.py +131 -0
- src/captioning/models/__init__.py +29 -0
- src/captioning/models/captioning_model.py +98 -0
- src/captioning/models/embeddings.py +56 -0
- src/captioning/models/encoder_cnn.py +36 -0
- src/captioning/models/factory.py +66 -0
- src/captioning/models/transformer_decoder.py +130 -0
- src/captioning/models/transformer_encoder.py +45 -0
- src/captioning/preprocessing/__init__.py +35 -0
- src/captioning/preprocessing/augmentation.py +35 -0
- src/captioning/preprocessing/caption.py +58 -0
- src/captioning/preprocessing/image.py +62 -0
- src/captioning/preprocessing/tokenizer.py +203 -0
- src/captioning/py.typed +0 -0
- src/captioning/training/__init__.py +21 -0
- src/captioning/training/callbacks.py +55 -0
- src/captioning/training/losses.py +27 -0
- src/captioning/training/trainer.py +88 -0
- src/captioning/utils/__init__.py +20 -0
- src/captioning/utils/hashing.py +22 -0
- src/captioning/utils/logging.py +100 -0
- src/captioning/utils/seed.py +49 -0
- tests/__init__.py +0 -0
- tests/conftest.py +39 -0
- tests/unit/__init__.py +0 -0
- tests/unit/test_caption_preprocessing.py +68 -0
- tests/unit/test_config.py +89 -0
- tests/unit/test_evaluation.py +42 -0
- tests/unit/test_hashing.py +30 -0
- tests/unit/test_image_preprocessing.py +43 -0
.paper-notebook.sha256
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3170254b278cda6f641b264073a7e1d6bac639175f3611e30b14909ada984fcb
|
configs/base.yaml
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
+
# configs/base.yaml — single canonical config for training and inference.
|
| 3 |
+
# -----------------------------------------------------------------------------
|
| 4 |
+
# Unless noted otherwise, values mirror the IEEE notebook (cell 6 hyperparams + cell 21
|
| 5 |
+
# layer wiring) so behaviour is identical to the published research. Override
|
| 6 |
+
# any field on the CLI or via env var (CAPTIONING__TRAIN__BATCH_SIZE=32) — see
|
| 7 |
+
# src/captioning/config/schema.py for the full validated schema.
|
| 8 |
+
# =============================================================================
|
| 9 |
+
|
| 10 |
+
data:
|
| 11 |
+
# Local path; scripts/prepare_data.py downloads COCO into this directory.
|
| 12 |
+
base_path: data/coco2017
|
| 13 |
+
annotations_filename: captions_train2017.json
|
| 14 |
+
images_subdir: train2017
|
| 15 |
+
sample_size: 120000 # Notebook: captions.sample(120000)
|
| 16 |
+
train_val_split: 0.8 # Notebook cell 11: int(len(img_keys) * 0.8)
|
| 17 |
+
|
| 18 |
+
model:
|
| 19 |
+
embedding_dim: 512 # Notebook: EMBEDDING_DIM = 512
|
| 20 |
+
units: 512 # Notebook: UNITS = 512
|
| 21 |
+
max_length: 40 # Notebook: MAX_LENGTH = 40
|
| 22 |
+
vocabulary_size: 15000 # Notebook: VOCABULARY_SIZE = 15000
|
| 23 |
+
encoder_num_heads: 1 # Notebook cell 21: TransformerEncoderLayer(EMBEDDING_DIM, 1)
|
| 24 |
+
decoder_num_heads: 8 # Notebook cell 21: TransformerDecoderLayer(..., 8)
|
| 25 |
+
decoder_dropout_inner: 0.3 # Notebook cell 19: dropout_1 = Dropout(0.3)
|
| 26 |
+
decoder_dropout_outer: 0.5 # Notebook cell 19: dropout_2 = Dropout(0.5)
|
| 27 |
+
decoder_attention_dropout: 0.1 # Notebook cell 19: MultiHeadAttention(dropout=0.1)
|
| 28 |
+
|
| 29 |
+
train:
|
| 30 |
+
epochs: 10 # Notebook: EPOCHS = 10
|
| 31 |
+
batch_size: 64 # Notebook: BATCH_SIZE = 64
|
| 32 |
+
buffer_size: 1000 # Notebook: BUFFER_SIZE = 1000
|
| 33 |
+
early_stopping_patience: 3 # Notebook cell 22: EarlyStopping(patience=3, ...)
|
| 34 |
+
seed: 42 # NEW: pin RNGs (notebook didn't seed; results varied)
|
| 35 |
+
learning_rate: 0.001 # Keras Adam default — what the notebook uses implicitly
|
| 36 |
+
weights_filename: model.h5 # Notebook cell 30: caption_model.save_weights('model.h5')
|
| 37 |
+
|
| 38 |
+
serve:
|
| 39 |
+
max_upload_bytes: 10485760 # 10 MB — guard at the API edge
|
| 40 |
+
decode_strategy: greedy # "beam" is planned for Phase 1b (not implemented yet)
|
| 41 |
+
beam_width: 3
|
| 42 |
+
cors_allowed_origins:
|
| 43 |
+
- http://localhost:3000
|
configs/train/debug.yaml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
+
# configs/train/debug.yaml — fast end-to-end smoke run.
|
| 3 |
+
# -----------------------------------------------------------------------------
|
| 4 |
+
# Used by CI to verify the training pipeline imports and steps once without
|
| 5 |
+
# OOMing or producing NaNs. Loads on top of base.yaml so only the changed
|
| 6 |
+
# fields need to be listed.
|
| 7 |
+
#
|
| 8 |
+
# python -m scripts.train --config configs/base.yaml --override configs/train/debug.yaml
|
| 9 |
+
# =============================================================================
|
| 10 |
+
|
| 11 |
+
data:
|
| 12 |
+
sample_size: 64 # Tiny sample: only a few batches at the batch_size of 8 set below
|
| 13 |
+
|
| 14 |
+
train:
|
| 15 |
+
epochs: 1
|
| 16 |
+
batch_size: 8
|
| 17 |
+
buffer_size: 16
|
| 18 |
+
seed: 0
|
docs/PHASE_1_NOTES.md
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Phase 1 — Modularisation (closeout)
|
| 2 |
+
|
| 3 |
+
> Phase 1 lifts every line of code out of the IEEE notebook into a proper
|
| 4 |
+
> Python package, behind a parity validation gate. No behaviour changes —
|
| 5 |
+
> the same hyperparameters, the same TF ops, the same losses, the same
|
| 6 |
+
> generation algorithm. What changes is *structure*: testable, reusable, and
|
| 7 |
+
> ready for FastAPI to import directly in Phase 2.
|
| 8 |
+
|
| 9 |
+
## Updated folder structure
|
| 10 |
+
|
| 11 |
+
```
|
| 12 |
+
src/captioning/
|
| 13 |
+
├── __init__.py # Public API + version
|
| 14 |
+
├── py.typed # PEP 561 marker — package ships type hints
|
| 15 |
+
│
|
| 16 |
+
├── config/ # Typed configuration (Pydantic v2)
|
| 17 |
+
│ ├── __init__.py
|
| 18 |
+
│ ├── schema.py # AppConfig, ModelConfig, TrainConfig, DataConfig, ServeConfig
|
| 19 |
+
│ └── loader.py # load_config(yaml_path) -> AppConfig
|
| 20 |
+
│
|
| 21 |
+
├── preprocessing/ # Pure, stateless transforms (TRAIN ↔ SERVE shared)
|
| 22 |
+
│ ├── __init__.py
|
| 23 |
+
│ ├── caption.py # preprocess_caption — notebook cell 3
|
| 24 |
+
│ ├── image.py # preprocess_image_tensor + load_and_preprocess_image
|
| 25 |
+
│ ├── tokenizer.py # CaptionTokenizer (wraps TextVectorization)
|
| 26 |
+
│ └── augmentation.py # default_image_augmentation — notebook cell 15
|
| 27 |
+
│
|
| 28 |
+
├── data/ # Stateful: I/O + dataset construction
|
| 29 |
+
│ ├── __init__.py
|
| 30 |
+
│ ├── coco.py # load_coco_annotations — notebook cell 2
|
| 31 |
+
│ ├── splits.py # make_image_level_splits — notebook cell 11
|
| 32 |
+
│ └── pipeline.py # build_train/val_pipeline — notebook cells 13-14
|
| 33 |
+
│
|
| 34 |
+
├── models/ # Architecture (TF/Keras layers + top-level model)
|
| 35 |
+
│ ├── __init__.py
|
| 36 |
+
│ ├── encoder_cnn.py # InceptionV3 backbone — notebook cell 16
|
| 37 |
+
│ ├── transformer_encoder.py # 1-layer encoder — notebook cell 17
|
| 38 |
+
│ ├── embeddings.py # token + positional — notebook cell 18
|
| 39 |
+
│ ├── transformer_decoder.py # multi-head causal decoder — notebook cell 19
|
| 40 |
+
│ ├── captioning_model.py # ImageCaptioningModel — notebook cell 20
|
| 41 |
+
│ └── factory.py # build_caption_model(config, vocab_size) — notebook cell 21
|
| 42 |
+
│
|
| 43 |
+
├── training/ # Loss, callbacks, orchestration
|
| 44 |
+
│ ├── __init__.py
|
| 45 |
+
│ ├── losses.py # masked_sparse_categorical_crossentropy — notebook cell 22
|
| 46 |
+
│ ├── callbacks.py # EarlyStopping (+ Phase 1b ModelCheckpoint, CSVLogger)
|
| 47 |
+
│ └── trainer.py # Trainer.fit — notebook cell 23
|
| 48 |
+
│
|
| 49 |
+
├── inference/ # Generation + FastAPI-friendly singleton
|
| 50 |
+
│ ├── __init__.py
|
| 51 |
+
│ ├── image_loader.py # load_image_from_path — notebook cell 25
|
| 52 |
+
│ ├── greedy.py # generate_caption_greedy — notebook cell 25
|
| 53 |
+
│ └── predictor.py # CaptionPredictor (Phase 2 FastAPI imports this)
|
| 54 |
+
│
|
| 55 |
+
├── evaluation/ # Caption-quality metrics
|
| 56 |
+
│ ├── __init__.py
|
| 57 |
+
│ └── bleu.py # corpus BLEU-4 via sacrebleu (Phase 1b adds CIDEr/METEOR/ROUGE)
|
| 58 |
+
│
|
| 59 |
+
└── utils/ # Cross-cutting helpers
|
| 60 |
+
├── __init__.py
|
| 61 |
+
├── logging.py # structlog (JSON in prod, pretty in dev)
|
| 62 |
+
├── seed.py # set_global_seed
|
| 63 |
+
└── hashing.py # sha256_file (paper-notebook freeze)
|
| 64 |
+
|
| 65 |
+
configs/
|
| 66 |
+
├── base.yaml # Mirrors notebook cell 6 hyperparams
|
| 67 |
+
└── train/debug.yaml # CI smoke override (1 epoch, batch 8)
|
| 68 |
+
|
| 69 |
+
scripts/
|
| 70 |
+
├── __init__.py
|
| 71 |
+
├── train.py # python -m scripts.train --config configs/base.yaml
|
| 72 |
+
├── evaluate.py # BLEU-4 on val split, optional Markdown report
|
| 73 |
+
├── predict.py # CLI single-image inference
|
| 74 |
+
└── notebook_module_audit.py # **Parity gate** — must pass before Phase 1b changes anything
|
| 75 |
+
|
| 76 |
+
tests/
|
| 77 |
+
├── __init__.py
|
| 78 |
+
├── conftest.py # autouse seed fixture, tiny corpus fixture
|
| 79 |
+
└── unit/
|
| 80 |
+
├── __init__.py
|
| 81 |
+
├── test_caption_preprocessing.py # 7 parametrised cases vs notebook baseline
|
| 82 |
+
├── test_config.py # default values, validation, env override, YAML loading
|
| 83 |
+
├── test_evaluation.py # BLEU smoke (perfect=100, ragged refs)
|
| 84 |
+
├── test_hashing.py # streaming SHA-256
|
| 85 |
+
├── test_image_preprocessing.py # output shape + InceptionV3 range
|
| 86 |
+
├── test_splits.py # image-level disjointness, seed reproducibility
|
| 87 |
+
└── test_tokenizer.py # fit/save/load round-trip
|
| 88 |
+
|
| 89 |
+
.paper-notebook.sha256 # Locked notebook hash for `make freeze-paper-notebook`
|
| 90 |
+
```
|
| 91 |
+
|
| 92 |
+
## Migration summary (notebook → modules)
|
| 93 |
+
|
| 94 |
+
| Notebook cell | Lines extracted to | Behavioural change |
|
| 95 |
+
|---|---|---|
|
| 96 |
+
| 0 (imports) | spread across modules | none |
|
| 97 |
+
| 1 (`BASE_PATH`) | `configs/base.yaml::data.base_path` | none |
|
| 98 |
+
| 2 (load COCO) | `data/coco.py::load_coco_annotations` | + path-existence check (early failure); + seedable sampling (was non-deterministic) |
|
| 99 |
+
| 3 (caption preprocess) | `preprocessing/caption.py::preprocess_caption` | none — pre-compiled regex for marginal speed |
|
| 100 |
+
| 4 (apply preprocess) | done inside `load_coco_annotations` | none |
|
| 101 |
+
| 6 (hyperparams) | `config/schema.py` + `configs/base.yaml` | typed and validated; env-overridable |
|
| 102 |
+
| 7-9 (tokenizer fit + save) | `preprocessing/tokenizer.py::CaptionTokenizer.fit/.save` | + JSON sidecar for inspection; pickle preserved for compat |
|
| 103 |
+
| 10 (StringLookup) | `preprocessing/tokenizer.py::CaptionTokenizer._build_lookups` | none |
|
| 104 |
+
| 11 (image-level split) | `data/splits.py::make_image_level_splits` | + seedable; + uses `random.Random(seed)` to avoid mutating module-global RNG |
|
| 105 |
+
| 13 (load_data) | `data/pipeline.py::_make_load_data_fn` + `preprocessing/image.py` | none |
|
| 106 |
+
| 14 (tf.data) | `data/pipeline.py::build_{train,val}_pipeline` | none — val shuffle preserved for parity |
|
| 107 |
+
| 15 (augmentation) | `preprocessing/augmentation.py::default_image_augmentation` | none |
|
| 108 |
+
| 16 (CNN_Encoder) | `models/encoder_cnn.py::build_cnn_encoder` | none |
|
| 109 |
+
| 17 (TransformerEncoderLayer) | `models/transformer_encoder.py` | none |
|
| 110 |
+
| 18 (Embeddings) | `models/embeddings.py` | none |
|
| 111 |
+
| 19 (TransformerDecoderLayer) | `models/transformer_decoder.py` | globals → constructor args (`vocab_size`, `max_len`); same defaults |
|
| 112 |
+
| 20 (ImageCaptioningModel) | `models/captioning_model.py` | none — `training=True` quirk preserved (commented) |
|
| 113 |
+
| 21 (wiring) | `models/factory.py::build_caption_model` | none |
|
| 114 |
+
| 22 (compile) | `training/losses.py` + `training/callbacks.py` + `Trainer.compile` | none |
|
| 115 |
+
| 23 (fit) | `training/trainer.py::Trainer.fit` | + writes `history.json` if output_dir given |
|
| 116 |
+
| 25 (inference) | `inference/{image_loader,greedy,predictor}.py` | globals → arguments (`model`, `tokenizer`, `max_length`) |
|
| 117 |
+
| 30 (save_weights) | `scripts/train.py` final step | none |
|
| 118 |
+
|
| 119 |
+
**No silent behaviour rewrites.** The principal intentional, additive changes are
|
| 120 |
+
(a) seeds threaded through where the notebook had un-seeded randomness, and
|
| 121 |
+
(b) optional output-directory persistence in the `Trainer`. Both are gated
|
| 122 |
+
on caller arguments — passing `seed=None` or `output_dir=None` reproduces
|
| 123 |
+
notebook behaviour exactly.
|
| 124 |
+
|
| 125 |
+
### Behavioural quirks preserved on purpose
|
| 126 |
+
|
| 127 |
+
These are documented in code comments referencing this section.
|
| 128 |
+
|
| 129 |
+
1. **`compute_loss_and_acc` always passes `training=True`**
|
| 130 |
+
([captioning_model.py](../src/captioning/models/captioning_model.py)).
|
| 131 |
+
The notebook's `test_step` calls this with `training=False`, but the method
|
| 132 |
+
ignores the argument and hardcodes `training=True` to the encoder/decoder.
|
| 133 |
+
Result: dropout is active during validation in the IEEE results. We
|
| 134 |
+
preserve this for parity. Phase 1b will fix it in a clearly-marked commit
|
| 135 |
+
*after* the parity gate is green.
|
| 136 |
+
|
| 137 |
+
2. **Validation pipeline is shuffled**
|
| 138 |
+
([data/pipeline.py](../src/captioning/data/pipeline.py)).
|
| 139 |
+
`build_val_pipeline` mirrors notebook cell 14 and includes `.shuffle()`,
|
| 140 |
+
which is technically pointless for validation. Phase 1b removes it.
|
| 141 |
+
|
| 142 |
+
3. **Vocabulary closure timing**.
|
| 143 |
+
The notebook's `TransformerDecoderLayer.__init__` reads
|
| 144 |
+
`tokenizer.vocabulary_size()` from module scope. We require it to be
|
| 145 |
+
passed in. Functionally identical when callers pass the right value;
|
| 146 |
+
structurally cleaner.
|
| 147 |
+
|
| 148 |
+
## Parity validation status
|
| 149 |
+
|
| 150 |
+
The `scripts/notebook_module_audit.py` script implements **four parity
|
| 151 |
+
checks** comparing the modular path against re-implemented notebook cells:
|
| 152 |
+
|
| 153 |
+
| Stage | Check | Tolerance |
|
| 154 |
+
|---|---|---|
|
| 155 |
+
| 1 | Caption preprocessing — string equality on 7 edge cases | exact |
|
| 156 |
+
| 2 | Tokenizer vocabulary — set + ordering equality on a 20-caption corpus + encoding equality on a held-out caption | exact |
|
| 157 |
+
| 3 | Image preprocessing — `tf.allclose` between `Resizing → preprocess_input` two ways | atol=1e-5 |
|
| 158 |
+
| 4 | Decoder forward pass — shape + determinism at `training=False` | atol=1e-6 |
|
| 159 |
+
|
| 160 |
+
**Status:** ⚠️ **Audit is wired up but has not been executed yet.** The
|
| 161 |
+
project venv (`.venv/`) is on Python 3.13, which is outside the package
|
| 162 |
+
requirement `>=3.10,<3.13`. TensorFlow 2.15 has no 3.13 wheels, so the
|
| 163 |
+
runtime deps cannot install in this venv. The user must recreate the venv
|
| 164 |
+
on Python 3.10 or 3.11 before the parity gate can run end-to-end.
|
| 165 |
+
**Static-only verification done so far:** every Python file passes
|
| 166 |
+
`py_compile.compile(..., doraise=True)`.
|
| 167 |
+
|
| 168 |
+
A *full* BLEU/caption parity test (the kind that runs the IEEE notebook
|
| 169 |
+
end-to-end and compares against a checkpoint loaded by the modular path)
|
| 170 |
+
requires a trained `model.h5` checkpoint, which doesn't exist in this repo
|
| 171 |
+
yet. Once Phase 2 publishes one to HuggingFace Hub, the audit will be
|
| 172 |
+
extended with a fifth stage that loads the same weights both ways and
|
| 173 |
+
asserts caption equality on a fixed image set.
|
| 174 |
+
|
| 175 |
+
## Technical debt remaining
|
| 176 |
+
|
| 177 |
+
| # | Debt | Where | Phase that addresses it |
|
| 178 |
+
|---|---|---|---|
|
| 179 |
+
| 1 | `compute_loss_and_acc` ignores `training` parameter | [models/captioning_model.py](../src/captioning/models/captioning_model.py) | 1b |
|
| 180 |
+
| 2 | Val pipeline shuffles unnecessarily | [data/pipeline.py](../src/captioning/data/pipeline.py) | 1b |
|
| 181 |
+
| 3 | Beam search not implemented (greedy only) | [inference/predictor.py](../src/captioning/inference/predictor.py) | 1b |
|
| 182 |
+
| 4 | LR fixed at Adam default; no warmup/cosine | [training/trainer.py](../src/captioning/training/trainer.py) | 1b |
|
| 183 |
+
| 5 | Only BLEU; no CIDEr/METEOR/ROUGE | [evaluation/](../src/captioning/evaluation/) | 1b |
|
| 184 |
+
| 6 | No GitHub Actions yet (CI runs nothing) | `.github/workflows/` | 2 |
|
| 185 |
+
| 7 | No FastAPI app yet | [backend/](../backend/) | 2 |
|
| 186 |
+
| 8 | venv on Python 3.13 (incompatible with TF 2.15) | `.venv/` | **immediate — see Recommended next commits** |
|
| 187 |
+
| 9 | `models/factory.py` builds model classes lazily via `_build_*_class()` factories; this class-creation pattern is unconventional | `models/*.py` (`_build_*_class()` factories) | leaving as-is — it keeps TF out of the import path for unrelated callers |
|
| 188 |
+
| 10 | No notebook-vs-trained-checkpoint caption parity test | `scripts/notebook_module_audit.py` | 2 (after first HF Hub upload) |
|
| 189 |
+
|
| 190 |
+
## Readiness assessment for Phase 2 (FastAPI integration)
|
| 191 |
+
|
| 192 |
+
| Phase 2 requirement | Status |
|
| 193 |
+
|---|---|
|
| 194 |
+
| `CaptionPredictor` is a self-contained class | ✅ — [predictor.py](../src/captioning/inference/predictor.py), `from_artifacts()` is the entry point |
|
| 195 |
+
| Model load is decoupled from request handling | ✅ — `from_artifacts()` does the load; `predict_*()` methods are pure functions of inputs |
|
| 196 |
+
| Image preprocessing matches training byte-for-byte | ✅ — both paths share `preprocessing.image.preprocess_image_tensor` |
|
| 197 |
+
| Tokenizer reload from disk works | ✅ — `CaptionTokenizer.load(directory, vocab_size, max_length)` with vocab.pkl + JSON sidecar |
|
| 198 |
+
| Config validated at boot | ✅ — Pydantic `AppConfig` raises clearly on missing/typo'd fields |
|
| 199 |
+
| Structured logging | ✅ — `utils.logging` emits JSON in production |
|
| 200 |
+
| Warmup hook for first-request latency | ✅ — `predictor.warmup()` runs one dummy inference |
|
| 201 |
+
| Singleton-friendly | ✅ — caller holds the instance; FastAPI `lifespan` will own one |
|
| 202 |
+
| **Blocker for Phase 2:** trained `model.h5` available somewhere | ❌ — must train (or import from Kaggle notebook) before backend can serve a real caption |
|
| 203 |
+
|
| 204 |
+
**Verdict: package is structurally ready for Phase 2.** The remaining
|
| 205 |
+
gating item is producing or importing a `model.h5` checkpoint. Two paths:
|
| 206 |
+
|
| 207 |
+
1. **Re-train locally** — `python -m scripts.train --config configs/base.yaml`
|
| 208 |
+
(requires COCO downloaded into `data/coco2017/`; ~12-18 hrs on CPU).
|
| 209 |
+
2. **Import from Kaggle** — the existing IEEE notebook on Kaggle can be re-run
|
| 210 |
+
to produce `model.h5` + `vocab_coco.file`, then uploaded to HuggingFace
|
| 211 |
+
Hub. This is the recommended path because it preserves the published BLEU.
|
| 212 |
+
|
| 213 |
+
## Recommended next commits
|
| 214 |
+
|
| 215 |
+
Order matters: each commit should be reviewable in isolation. Break Phase 1
|
| 216 |
+
into the following sequence (one logical change per commit):
|
| 217 |
+
|
| 218 |
+
```
|
| 219 |
+
1. chore(venv): document Python 3.10 requirement; add setup script
|
| 220 |
+
2. feat(utils): structured logging, seed, sha256 helpers
|
| 221 |
+
3. feat(config): Pydantic v2 schema + YAML loader
|
| 222 |
+
4. feat(preprocessing): caption + image transforms + CaptionTokenizer wrapper
|
| 223 |
+
5. feat(data): COCO loader, image-level splits, tf.data pipelines
|
| 224 |
+
6. feat(models): CNN encoder, Transformer encoder/decoder, captioning model, factory
|
| 225 |
+
7. feat(training): loss + callbacks + Trainer.fit
|
| 226 |
+
8. feat(inference): greedy generation + CaptionPredictor singleton
|
| 227 |
+
9. feat(evaluation): corpus BLEU-4 via sacrebleu
|
| 228 |
+
10. feat(scripts): train, evaluate, predict CLI entry points
|
| 229 |
+
11. test: unit tests for pure functions and TF-dependent smoke checks
|
| 230 |
+
12. feat(parity): notebook-module audit script gating Phase 1b changes
|
| 231 |
+
13. chore(notebook): lock paper-notebook hash for freeze CI check
|
| 232 |
+
14. docs: Phase 1 closeout (this file)
|
| 233 |
+
```
|
| 234 |
+
|
| 235 |
+
A single feature-branch PR (`feat/phase-1-modularisation`) collapsing all of
|
| 236 |
+
the above is also acceptable — recruiter-grade reviewers will want to see
|
| 237 |
+
the migration table, parity audit, and tests in one place.
|
| 238 |
+
|
| 239 |
+
### Suggested commit messages (verbatim)
|
| 240 |
+
|
| 241 |
+
```
|
| 242 |
+
chore(venv): pin Python to 3.10 and document setup
|
| 243 |
+
|
| 244 |
+
The Phase 0 venv was created on Python 3.13, which has no
|
| 245 |
+
tensorflow-cpu==2.15.0 wheels and falls outside the package
|
| 246 |
+
requirement (>=3.10,<3.13). Recreate with:
|
| 247 |
+
|
| 248 |
+
py -3.10 -m venv .venv
|
| 249 |
+
.venv\Scripts\activate
|
| 250 |
+
pip install -r requirements-dev.txt -r requirements-eval.txt
|
| 251 |
+
pip install -e ".[hf,mlflow]"
|
| 252 |
+
```
|
| 253 |
+
|
| 254 |
+
```
|
| 255 |
+
feat(captioning): extract IEEE notebook into modular package
|
| 256 |
+
|
| 257 |
+
Lifts every line of notebooks/01_ieee_inceptionv3_transformer.ipynb into
|
| 258 |
+
src/captioning/ behind a parity validation gate. Mirrors the notebook's
|
| 259 |
+
behaviour byte-for-byte at fixed seeds; intentional additive improvements
|
| 260 |
+
(seeded sampling, output-dir persistence, JSON vocab sidecar) are gated on
|
| 261 |
+
caller arguments and disabled by default.
|
| 262 |
+
|
| 263 |
+
Sub-packages:
|
| 264 |
+
config/ Pydantic v2 schema + YAML loader
|
| 265 |
+
preprocessing/ caption + image transforms + CaptionTokenizer wrapper
|
| 266 |
+
data/ COCO loader + image-level splits + tf.data pipelines
|
| 267 |
+
models/ CNN encoder + Transformer encoder/decoder + factory
|
| 268 |
+
training/ loss + callbacks + Trainer
|
| 269 |
+
inference/ greedy generation + CaptionPredictor singleton
|
| 270 |
+
evaluation/ corpus BLEU-4 via sacrebleu
|
| 271 |
+
utils/ structured logging + seed + sha256
|
| 272 |
+
|
| 273 |
+
Adds CLI entry points (scripts/{train,evaluate,predict}.py), a parity
|
| 274 |
+
audit (scripts/notebook_module_audit.py), and a unit test suite covering
|
| 275 |
+
all pure-Python paths. The Predictor exposes from_artifacts() and
|
| 276 |
+
warmup() so Phase 2's FastAPI lifespan can wire it in unchanged.
|
| 277 |
+
```
|
| 278 |
+
|
| 279 |
+
```
|
| 280 |
+
test(captioning): unit tests for pure modules + tokenizer round-trip
|
| 281 |
+
|
| 282 |
+
Covers caption preprocessing (parametrised vs notebook baseline),
|
| 283 |
+
config schema (defaults, validation, env override, YAML loading),
|
| 284 |
+
image-level splits (disjointness, seed reproducibility, int truncation),
|
| 285 |
+
hashing (stream vs one-shot equality), evaluation (perfect=100, ragged
|
| 286 |
+
refs, length mismatch raises), tokenizer (fit/save/load round-trip,
|
| 287 |
+
unfitted-error contract), image preprocessing (shape + range).
|
| 288 |
+
|
| 289 |
+
TF-dependent tests use pytest.importorskip; pure-Python tests need no
|
| 290 |
+
ML deps and are CI-runnable in <5s.
|
| 291 |
+
```
|
| 292 |
+
|
| 293 |
+
```
|
| 294 |
+
feat(parity): notebook-module audit gating Phase 1b changes
|
| 295 |
+
|
| 296 |
+
Four-stage parity check: caption preprocessing (exact), tokenizer
|
| 297 |
+
vocabulary (set + ordering + encoding equality), image preprocessing
|
| 298 |
+
(tf.allclose, atol=1e-5), decoder forward pass (shape + determinism at
|
| 299 |
+
training=False). Each stage re-implements the relevant notebook cell
|
| 300 |
+
inline so the ground truth is colocated with the test. Synthetic inputs
|
| 301 |
+
let the audit run in seconds without needing the real COCO dataset.
|
| 302 |
+
|
| 303 |
+
Run: python -m scripts.notebook_module_audit
|
| 304 |
+
```
|
| 305 |
+
|
| 306 |
+
```
|
| 307 |
+
chore(notebook): lock paper-notebook hash for freeze CI check
|
| 308 |
+
|
| 309 |
+
Adds .paper-notebook.sha256 with the SHA-256 of
|
| 310 |
+
notebooks/01_ieee_inceptionv3_transformer.ipynb at the time of Phase 1
|
| 311 |
+
modularisation. The `make freeze-paper-notebook` target asserts this
|
| 312 |
+
hash on every CI run; any byte change to the notebook fails the check.
|
| 313 |
+
Phase 4 wires this into a required GitHub Actions status check on main.
|
| 314 |
+
```
|
| 315 |
+
|
| 316 |
+
```
|
| 317 |
+
docs: Phase 1 closeout (modularisation complete)
|
| 318 |
+
|
| 319 |
+
Migration table (notebook cell → module), parity validation status,
|
| 320 |
+
preserved behavioural quirks, technical debt remaining, readiness
|
| 321 |
+
assessment for Phase 2 FastAPI integration. Documents the venv setup
|
| 322 |
+
gap (Python 3.13 vs the supported 3.10/3.11 interpreters) as the single
|
| 323 |
+
remaining blocker before the parity audit can execute end-to-end.
|
| 324 |
+
```
|
| 325 |
+
|
| 326 |
+
## Verification checklist (run before tagging Phase 1)
|
| 327 |
+
|
| 328 |
+
```powershell
|
| 329 |
+
# 1. Recreate the venv with a supported Python (3.10 or 3.11).
|
| 330 |
+
py -3.10 -m venv .venv
|
| 331 |
+
.venv\Scripts\activate
|
| 332 |
+
pip install -r requirements-dev.txt -r requirements-eval.txt
|
| 333 |
+
pip install -e ".[hf,mlflow]"
|
| 334 |
+
|
| 335 |
+
# 2. Run static checks.
|
| 336 |
+
ruff check src/captioning scripts tests
|
| 337 |
+
ruff format --check src/captioning scripts tests
|
| 338 |
+
mypy src/captioning scripts
|
| 339 |
+
|
| 340 |
+
# 3. Run unit tests.
|
| 341 |
+
pytest tests/ -v
|
| 342 |
+
|
| 343 |
+
# 4. Run the parity audit (the gate).
|
| 344 |
+
python -m scripts.notebook_module_audit
|
| 345 |
+
|
| 346 |
+
# 5. Verify the paper notebook is byte-stable.
|
| 347 |
+
make freeze-paper-notebook
|
| 348 |
+
```
|
| 349 |
+
|
| 350 |
+
All five must pass green before merging Phase 1 and starting Phase 2.
|
pyproject.toml
CHANGED
|
@@ -123,6 +123,7 @@ dev = [
|
|
| 123 |
"nbstripout>=0.7,<1.0",
|
| 124 |
"types-PyYAML",
|
| 125 |
"types-requests",
|
|
|
|
| 126 |
]
|
| 127 |
|
| 128 |
# -----------------------------------------------------------------------------
|
|
|
|
| 123 |
"nbstripout>=0.7,<1.0",
|
| 124 |
"types-PyYAML",
|
| 125 |
"types-requests",
|
| 126 |
+
"pandas-stubs>=2.2,<3.0",
|
| 127 |
]
|
| 128 |
|
| 129 |
# -----------------------------------------------------------------------------
|
requirements-dev.txt
CHANGED
|
@@ -31,3 +31,4 @@ nbstripout==0.7.1
|
|
| 31 |
# ---- Type stubs --------------------------------------------------------------
|
| 32 |
types-PyYAML==6.0.12.20240311
|
| 33 |
types-requests==2.32.0.20240602
|
|
|
|
|
|
| 31 |
# ---- Type stubs --------------------------------------------------------------
|
| 32 |
types-PyYAML==6.0.12.20240311
|
| 33 |
types-requests==2.32.0.20240602
|
| 34 |
+
pandas-stubs==2.2.2.240603
|
scripts/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""CLI entry points. Thin wrappers around captioning package modules."""
|
scripts/evaluate.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate a trained model on the COCO validation split.
|
| 2 |
+
|
| 3 |
+
Usage:
|
| 4 |
+
python -m scripts.evaluate \\
|
| 5 |
+
--config configs/base.yaml \\
|
| 6 |
+
--weights models/v1.0.0/model.h5 \\
|
| 7 |
+
--tokenizer-dir models/v1.0.0 \\
|
| 8 |
+
--report docs/results/v1.0.0.md \\
|
| 9 |
+
--max-samples 500
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import json
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
import click
|
| 18 |
+
|
| 19 |
+
from captioning.config import load_config
|
| 20 |
+
from captioning.data import load_coco_annotations, make_image_level_splits
|
| 21 |
+
from captioning.evaluation import corpus_bleu_score
|
| 22 |
+
from captioning.inference import CaptionPredictor
|
| 23 |
+
from captioning.preprocessing import preprocess_caption
|
| 24 |
+
from captioning.utils import configure_logging, get_logger, set_global_seed
|
| 25 |
+
|
| 26 |
+
log = get_logger(__name__)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@click.command()
@click.option(
    "--config", "config_path", required=True, type=click.Path(exists=True, path_type=Path)
)
@click.option("--weights", required=True, type=click.Path(exists=True, path_type=Path))
@click.option("--tokenizer-dir", required=True, type=click.Path(exists=True, path_type=Path))
@click.option(
    "--report",
    "report_path",
    default=None,
    type=click.Path(path_type=Path),
    help="Optional path to write a Markdown report.",
)
@click.option(
    "--max-samples",
    default=500,
    type=int,
    help="Cap on validation examples (full val takes hours on CPU).",
)
def main(
    config_path: Path,
    weights: Path,
    tokenizer_dir: Path,
    report_path: Path | None,
    max_samples: int,
) -> None:
    """Compute corpus BLEU-4 on the val split and (optionally) write a report."""
    # NOTE: the docstring above doubles as the click ``--help`` text, so the
    # longer explanation lives in comments instead.
    #
    # Parameters (all injected by click):
    #   config_path    YAML config passed to ``load_config``.
    #   weights        Model weights file handed to ``CaptionPredictor.from_artifacts``.
    #   tokenizer_dir  Directory handed to ``CaptionPredictor.from_artifacts``.
    #   report_path    If given, a Markdown report is written there and a JSON
    #                  summary next to it (same stem, ``.json`` suffix).
    #   max_samples    Upper bound on the number of distinct validation images scored.
    configure_logging()
    config = load_config(config_path)
    set_global_seed(config.train.seed)

    df = load_coco_annotations(
        base_path=config.data.base_path,
        annotations_filename=config.data.annotations_filename,
        images_subdir=config.data.images_subdir,
        sample_size=config.data.sample_size,
        seed=config.train.seed,
        caption_preprocessor=preprocess_caption,
    )
    # Only the validation half of the split is needed for evaluation.
    _, _, val_imgs, val_caps = make_image_level_splits(
        df, train_fraction=config.data.train_val_split, seed=config.train.seed
    )

    # Group references by image so we get the COCO 5-references-per-image format.
    refs_by_image: dict[str, list[str]] = {}
    for img, cap in zip(val_imgs, val_caps, strict=True):
        refs_by_image.setdefault(img, []).append(cap)
    # dicts preserve insertion order, so the cap keeps the first images seen.
    image_paths = list(refs_by_image)[:max_samples]

    predictor = CaptionPredictor.from_artifacts(
        weights_path=weights, tokenizer_dir=tokenizer_dir, config=config
    )
    predictor.warmup()

    # predictions[i] and references[i] describe the same image.
    predictions: list[str] = [predictor.predict_path(path) for path in image_paths]
    references: list[list[str]] = [refs_by_image[path] for path in image_paths]

    bleu = corpus_bleu_score(predictions, references)
    log.info("evaluation_done", bleu4=bleu, n=len(predictions))
    click.echo(f"BLEU-4: {bleu:.2f} (n={len(predictions)})")

    if report_path is not None:
        report_path.parent.mkdir(parents=True, exist_ok=True)
        report_path.write_text(
            "# Evaluation v1\n\n"
            f"- BLEU-4: **{bleu:.2f}**\n"
            f"- Examples: {len(predictions)}\n"
            f"- Weights: `{weights}`\n",
            encoding="utf-8",
        )
        # Use a context manager so the JSON file is flushed and closed
        # deterministically instead of relying on garbage collection.
        with report_path.with_suffix(".json").open("w", encoding="utf-8") as json_file:
            json.dump({"bleu4": bleu, "n": len(predictions)}, json_file, indent=2)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
if __name__ == "__main__":
|
| 110 |
+
main()
|
scripts/notebook_module_audit.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Parity audit: do the extracted modules behave identically to the notebook?
|
| 2 |
+
|
| 3 |
+
This script is the contract that gates Phase 1b improvements. Until it passes
|
| 4 |
+
green, we do not change behaviour anywhere — only structure.
|
| 5 |
+
|
| 6 |
+
Strategy:
|
| 7 |
+
Each check re-implements the relevant notebook cell *inline* (so the
|
| 8 |
+
"ground truth" is colocated with the test) and compares the output to
|
| 9 |
+
what the modular path produces from the same synthetic input. Synthetic
|
| 10 |
+
inputs let the audit run in seconds without needing the full COCO dataset.
|
| 11 |
+
|
| 12 |
+
Stages checked:
|
| 13 |
+
1. Caption preprocessing — pure-string equality
|
| 14 |
+
2. Tokenizer vocabulary — ordered-list equality, plus encoding parity
|
| 15 |
+
3. Image preprocessing — tf.allclose, atol=1e-5
|
| 16 |
+
4. Model forward pass — inference determinism (isclose, atol=1e-6) and output shape
|
| 17 |
+
|
| 18 |
+
Run:
|
| 19 |
+
python -m scripts.notebook_module_audit
|
| 20 |
+
|
| 21 |
+
Exits non-zero if any check fails. CI uses this as a required job before
|
| 22 |
+
merging any change to ``src/captioning/``.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
from __future__ import annotations
|
| 26 |
+
|
| 27 |
+
import re
|
| 28 |
+
import sys
|
| 29 |
+
|
| 30 |
+
from captioning.config.schema import AppConfig
|
| 31 |
+
from captioning.preprocessing.caption import preprocess_caption
|
| 32 |
+
from captioning.preprocessing.image import preprocess_image_tensor
|
| 33 |
+
from captioning.preprocessing.tokenizer import CaptionTokenizer
|
| 34 |
+
from captioning.utils.logging import configure_logging, get_logger
|
| 35 |
+
from captioning.utils.seed import set_global_seed
|
| 36 |
+
|
| 37 |
+
log = get_logger(__name__)
|
| 38 |
+
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
# Stage 1: Caption preprocessing
|
| 41 |
+
# ---------------------------------------------------------------------------
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _notebook_preprocess(text: str) -> str:
|
| 45 |
+
"""Verbatim copy of notebook cell 3, kept here as the ground truth."""
|
| 46 |
+
text = text.lower()
|
| 47 |
+
text = re.sub(r"[^\w\s]", "", text)
|
| 48 |
+
text = re.sub(r"\s+", " ", text)
|
| 49 |
+
text = text.strip()
|
| 50 |
+
return "[start] " + text + " [end]"
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def check_caption_preprocessing() -> bool:
    """Compare ``preprocess_caption`` against the inline notebook reference.

    Returns:
        ``True`` when every sample string is normalised identically by both
        implementations; ``False`` (after logging each mismatch) otherwise.
    """
    samples = (
        "A man is standing on a beach with a surfboard.",
        " multiple spaces and a comma, period. ",
        "ALL CAPS!!!",
        " ",
        "Hyphens-and apostrophes' included.",
        "Emoji 😀 should be stripped",
        "Numbers 123 stay (regex \\w keeps them)",
    )

    # Collect (input, reference output, module output) for every disagreement.
    mismatches = [
        (raw, expected, actual)
        for raw in samples
        if (expected := _notebook_preprocess(raw)) != (actual := preprocess_caption(raw))
    ]

    for raw, expected, actual in mismatches:
        log.error("caption_preproc_mismatch", input=raw, expected=expected, got=actual)
    if mismatches:
        return False

    log.info("caption_preproc_ok", n=len(samples))
    return True
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# ---------------------------------------------------------------------------
|
| 79 |
+
# Stage 2: Tokenizer vocabulary
|
| 80 |
+
# ---------------------------------------------------------------------------
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def check_tokenizer_vocabulary() -> bool:
    """Check ``CaptionTokenizer`` against a directly-built ``TextVectorization``.

    Both paths are fitted on the same 20 synthetic captions. The check passes
    only if the two vocabularies are equal as ordered lists and a held-out
    sentence encodes to the same ids through both paths.

    Returns:
        ``True`` on full agreement, ``False`` (after logging) otherwise.
    """
    import tensorflow as tf

    max_tokens = 15000
    sequence_length = 40

    base_sentences = (
        "a man on a surfboard",
        "a dog in the park",
        "two children playing with a ball",
        "a cat sitting on a chair",
        "a man riding a bike on the street",
    )
    # Four passes over the five sentences -> 20 captions, same order as list * 4.
    captions = [preprocess_caption(sentence) for _ in range(4) for sentence in base_sentences]

    # Notebook-equivalent (cell 7): direct TextVectorization
    reference_layer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens, standardize=None, output_sequence_length=sequence_length
    )
    reference_layer.adapt(captions)
    reference_vocab = reference_layer.get_vocabulary()

    # Module path
    tokenizer = CaptionTokenizer(vocab_size=max_tokens, max_length=sequence_length)
    tokenizer.fit(captions)
    module_vocab = tokenizer.vocabulary

    if reference_vocab != module_vocab:
        log.error(
            "tokenizer_vocab_mismatch",
            notebook_n=len(reference_vocab),
            module_n=len(module_vocab),
            notebook_first=reference_vocab[:5],
            module_first=module_vocab[:5],
        )
        return False

    # Encoding parity on a held-out caption
    held_out = ["a man on a surfboard at the beach"]
    reference_ids = reference_layer(held_out).numpy().tolist()
    module_ids = tokenizer.encode(held_out).numpy().tolist()
    if reference_ids != module_ids:
        log.error("tokenizer_encode_mismatch", notebook=reference_ids, module=module_ids)
        return False

    log.info("tokenizer_vocab_ok", vocab_size=len(module_vocab))
    return True
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
# ---------------------------------------------------------------------------
|
| 133 |
+
# Stage 3: Image preprocessing
|
| 134 |
+
# ---------------------------------------------------------------------------
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def check_image_preprocessing() -> bool:
    """Check ``preprocess_image_tensor`` against the inline notebook pipeline.

    A seeded random ``uint8`` image of shape (640, 480, 3) is resized to
    299x299 and passed through InceptionV3's ``preprocess_input`` (the
    reference), then compared element-wise to the module output with an
    absolute tolerance of 1e-5.

    Returns:
        ``True`` when all elements are close, ``False`` (after logging the
        largest absolute difference) otherwise.
    """
    import tensorflow as tf

    set_global_seed(42)
    noise = tf.random.uniform((640, 480, 3), minval=0, maxval=255, dtype=tf.int32)
    raw = tf.cast(noise, tf.uint8)

    # Notebook-equivalent (cell 13)
    resized = tf.keras.layers.Resizing(299, 299)(raw)
    reference = tf.keras.applications.inception_v3.preprocess_input(resized)

    # Module path
    candidate = preprocess_image_tensor(raw)

    all_close = tf.reduce_all(tf.experimental.numpy.isclose(reference, candidate, atol=1e-5))
    if all_close:
        log.info("image_preproc_ok", shape=tuple(candidate.shape))
        return True

    worst = float(tf.reduce_max(tf.abs(reference - candidate)))
    log.error("image_preproc_mismatch", max_abs_diff=worst)
    return False
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
# ---------------------------------------------------------------------------
|
| 160 |
+
# Stage 4: Model forward pass
|
| 161 |
+
# ---------------------------------------------------------------------------
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def check_model_forward() -> bool:
    """Smoke-check the decoder's inference forward pass at a fixed seed.

    A single ``TransformerDecoderLayer`` is built from the default
    ``AppConfig`` with a 200-token vocabulary and called twice on the same
    random inputs with ``training=False``. The check passes when

    * the two outputs agree within ``atol=1e-6`` (no stochasticity at
      inference), and
    * the output shape is ``(batch, seq, vocab_size)``.

    NOTE(review): this does not compare against a notebook-built decoder;
    it only verifies determinism and shape of the modular one.

    Returns:
        ``True`` if both conditions hold, ``False`` (after logging) otherwise.
    """
    import tensorflow as tf

    from captioning.models.transformer_decoder import TransformerDecoderLayer

    set_global_seed(42)

    model_cfg = AppConfig().model
    vocab_size = 200  # tiny but exercising the same code paths
    decoder = TransformerDecoderLayer(
        embed_dim=model_cfg.embedding_dim,
        units=model_cfg.units,
        num_heads=model_cfg.decoder_num_heads,
        vocab_size=vocab_size,
        max_len=model_cfg.max_length,
    )

    batch = 2
    seq = model_cfg.max_length - 1
    # Keep this draw order: both tensors come from the seeded global RNG.
    enc_out = tf.random.normal((batch, 64, model_cfg.embedding_dim))
    ids = tf.random.uniform((batch, seq), minval=1, maxval=vocab_size, dtype=tf.int32)
    mask = tf.cast(ids != 0, tf.int32)

    first = decoder(ids, enc_out, training=False, mask=mask)
    second = decoder(ids, enc_out, training=False, mask=mask)

    # With training=False, dropout is off → identical outputs across calls.
    deterministic = tf.reduce_all(tf.experimental.numpy.isclose(first, second, atol=1e-6))
    if not deterministic:
        log.error("model_determinism_failed_at_inference")
        return False

    expected_shape = (batch, seq, vocab_size)
    actual_shape = tuple(first.shape)
    if actual_shape != expected_shape:
        log.error("model_shape_mismatch", expected=expected_shape, got=actual_shape)
        return False

    log.info("model_forward_ok", shape=expected_shape)
    return True
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
# ---------------------------------------------------------------------------
|
| 212 |
+
# Runner
|
| 213 |
+
# ---------------------------------------------------------------------------
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def main() -> int:
    """Run every parity check and report an exit status.

    Each check runs even if an earlier one failed or raised; an exception
    inside a check is logged and counted as a failure.

    Returns:
        ``0`` when all checks pass, ``1`` otherwise.
    """
    configure_logging()
    log.info("parity_audit_start")

    stages = (
        ("caption preprocessing", check_caption_preprocessing),
        ("tokenizer vocabulary", check_tokenizer_vocabulary),
        ("image preprocessing", check_image_preprocessing),
        ("model forward pass", check_model_forward),
    )

    outcomes: dict[str, bool] = {}
    for label, check in stages:
        try:
            passed = check()
        except Exception:  # audit boundary: report any error as a failed check
            log.exception("audit_check_errored", check=label)
            passed = False
        outcomes[label] = passed

    log.info("parity_audit_end", results=dict(outcomes))

    total = len(outcomes)
    failed = [label for label, passed in outcomes.items() if not passed]
    if not failed:
        print(f"\n[OK] parity audit: {total}/{total} checks passed")
        return 0

    print(f"\n[FAIL] parity audit: {len(failed)}/{total} checks failed: {failed}")
    return 1
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
if __name__ == "__main__":
|
| 244 |
+
sys.exit(main())
|
scripts/predict.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""CLI single-image inference.
|
| 2 |
+
|
| 3 |
+
Usage:
|
| 4 |
+
python -m scripts.predict \\
|
| 5 |
+
--config configs/base.yaml \\
|
| 6 |
+
--weights models/v1.0.0/model.h5 \\
|
| 7 |
+
--tokenizer-dir models/v1.0.0 \\
|
| 8 |
+
--image path/to/photo.jpg
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
|
| 15 |
+
import click
|
| 16 |
+
|
| 17 |
+
from captioning.config import load_config
|
| 18 |
+
from captioning.inference import CaptionPredictor
|
| 19 |
+
from captioning.utils import configure_logging, get_logger
|
| 20 |
+
|
| 21 |
+
log = get_logger(__name__)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@click.command()
@click.option(
    "--config", "config_path", required=True, type=click.Path(exists=True, path_type=Path)
)
@click.option("--weights", required=True, type=click.Path(exists=True, path_type=Path))
@click.option("--tokenizer-dir", required=True, type=click.Path(exists=True, path_type=Path))
@click.option("--image", required=True, type=click.Path(exists=True, path_type=Path))
def main(config_path: Path, weights: Path, tokenizer_dir: Path, image: Path) -> None:
    """Generate a caption for one image."""
    # The docstring is also the click ``--help`` text, so it stays a one-liner.
    # Flow: set up logging, load the YAML config, build the predictor from the
    # saved weights and tokenizer, warm it up, then echo the caption to stdout.
    configure_logging()
    app_config = load_config(config_path)

    caption_predictor = CaptionPredictor.from_artifacts(
        weights_path=weights, tokenizer_dir=tokenizer_dir, config=app_config
    )
    caption_predictor.warmup()

    click.echo(caption_predictor.predict_path(image))
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
if __name__ == "__main__":
|
| 47 |
+
main()
|
scripts/train.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Train the IEEE InceptionV3+Transformer captioning model.
|
| 2 |
+
|
| 3 |
+
Usage:
|
| 4 |
+
python -m scripts.train --config configs/base.yaml
|
| 5 |
+
python -m scripts.train --config configs/base.yaml --output-dir models/v1.0.0
|
| 6 |
+
|
| 7 |
+
The script orchestrates the same pipeline as the notebook, but each step is
|
| 8 |
+
imported from the modular package — making it the canonical example of how
|
| 9 |
+
the package is meant to be composed.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
import click
|
| 17 |
+
|
| 18 |
+
from captioning.config import load_config
|
| 19 |
+
from captioning.data import (
|
| 20 |
+
build_train_pipeline,
|
| 21 |
+
build_val_pipeline,
|
| 22 |
+
load_coco_annotations,
|
| 23 |
+
make_image_level_splits,
|
| 24 |
+
)
|
| 25 |
+
from captioning.models import build_caption_model
|
| 26 |
+
from captioning.preprocessing import CaptionTokenizer, preprocess_caption
|
| 27 |
+
from captioning.training import Trainer
|
| 28 |
+
from captioning.utils import configure_logging, get_logger, set_global_seed
|
| 29 |
+
|
| 30 |
+
log = get_logger(__name__)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@click.command()
@click.option(
    "--config",
    "config_path",
    required=True,
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
    help="YAML config file (e.g. configs/base.yaml).",
)
@click.option(
    "--output-dir",
    type=click.Path(path_type=Path),
    default="outputs/runs/latest",
    help="Where to save weights, vocab, and history.",
)
def main(config_path: Path, output_dir: Path) -> None:
    """Run the full training pipeline end-to-end."""
    # The docstring is also the click ``--help`` text, so it stays a one-liner.
    configure_logging()
    config = load_config(config_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    data_cfg = config.data
    model_cfg = config.model
    train_cfg = config.train
    seed = train_cfg.seed

    set_global_seed(seed)
    log.info("config_loaded", path=str(config_path), output_dir=str(output_dir))

    # Step 1: load and preprocess the COCO captions.
    annotations = load_coco_annotations(
        base_path=data_cfg.base_path,
        annotations_filename=data_cfg.annotations_filename,
        images_subdir=data_cfg.images_subdir,
        sample_size=data_cfg.sample_size,
        seed=seed,
        caption_preprocessor=preprocess_caption,
    )

    # Step 2: fit the tokenizer on every loaded caption and save it alongside
    # the other run artifacts.
    tokenizer = CaptionTokenizer(
        vocab_size=model_cfg.vocabulary_size,
        max_length=model_cfg.max_length,
    )
    tokenizer.fit(annotations["caption"])
    tokenizer.save(output_dir)

    # Step 3: split at the image level into train and validation sets.
    train_imgs, train_caps, val_imgs, val_caps = make_image_level_splits(
        annotations, train_fraction=data_cfg.train_val_split, seed=seed
    )

    # Step 4: build the tf.data pipelines; both share batch and buffer sizes.
    pipeline_options = {
        "batch_size": train_cfg.batch_size,
        "buffer_size": train_cfg.buffer_size,
    }
    train_ds = build_train_pipeline(train_imgs, train_caps, tokenizer, **pipeline_options)
    val_ds = build_val_pipeline(val_imgs, val_caps, tokenizer, **pipeline_options)

    # Step 5: build the model and train it.
    model = build_caption_model(config, vocab_size=tokenizer.vocabulary_size)
    Trainer(model, config).fit(train_ds, val_ds, output_dir=output_dir)

    # Step 6: write the final weights under the configured filename.
    final_weights = output_dir / train_cfg.weights_filename
    model.save_weights(str(final_weights))
    log.info("training_done", weights=str(final_weights))
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
if __name__ == "__main__":
|
| 107 |
+
main()
|
src/captioning/__init__.py
CHANGED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Captioning — production-grade extraction of the IEEE image-captioning research.
|
| 2 |
+
|
| 3 |
+
The package mirrors the IEEE notebook
|
| 4 |
+
(``notebooks/01_ieee_inceptionv3_transformer.ipynb``) but separates orthogonal
|
| 5 |
+
concerns into sub-packages so each piece is independently testable, composable,
|
| 6 |
+
and reusable from FastAPI / scripts.
|
| 7 |
+
|
| 8 |
+
Sub-package map:
|
| 9 |
+
config/ Pydantic settings + YAML loader (the project's "type system")
|
| 10 |
+
preprocessing/ Pure transforms on captions and images (no I/O, no state)
|
| 11 |
+
data/ COCO loaders, splits, tf.data pipelines (I/O + statefulness)
|
| 12 |
+
models/ Keras layers and models (CNN encoder + Transformer decoder)
|
| 13 |
+
training/ Losses, callbacks, training orchestration
|
| 14 |
+
inference/ Generation algorithms + a singleton-friendly Predictor
|
| 15 |
+
evaluation/ BLEU/CIDEr/METEOR/ROUGE (Phase 1b expands these)
|
| 16 |
+
utils/ Cross-cutting helpers (logging, seed, hashing, paths)
|
| 17 |
+
|
| 18 |
+
Public API is intentionally small. Everything else is internal and may change.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
__version__ = "0.1.0"
|
| 22 |
+
__all__ = ["__version__"]
|
src/captioning/config/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Configuration package — Pydantic schemas and YAML loaders.
|
| 2 |
+
|
| 3 |
+
Why a dedicated package? Configs are the project's *type system*. Every other
|
| 4 |
+
module accepts an `AppConfig` (or a sub-config) instead of pulling globals,
|
| 5 |
+
which makes them testable in isolation and trivially overridable in CI / serve.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from captioning.config.loader import load_config
|
| 9 |
+
from captioning.config.schema import (
|
| 10 |
+
AppConfig,
|
| 11 |
+
DataConfig,
|
| 12 |
+
ModelConfig,
|
| 13 |
+
ServeConfig,
|
| 14 |
+
TrainConfig,
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
__all__ = [
|
| 18 |
+
"AppConfig",
|
| 19 |
+
"DataConfig",
|
| 20 |
+
"ModelConfig",
|
| 21 |
+
"ServeConfig",
|
| 22 |
+
"TrainConfig",
|
| 23 |
+
"load_config",
|
| 24 |
+
]
|
src/captioning/config/loader.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""YAML-to-Pydantic config loader.
|
| 2 |
+
|
| 3 |
+
Why this exists separately from ``schema.py``:
|
| 4 |
+
* Schema is *what* a valid config looks like; loader is *how* you build one.
|
| 5 |
+
Splitting them lets tests build an ``AppConfig`` programmatically without
|
| 6 |
+
touching disk, and lets the loader gain features (env-file resolution,
|
| 7 |
+
multi-file merging) without changing the schema.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Any
|
| 14 |
+
|
| 15 |
+
import yaml
|
| 16 |
+
|
| 17 |
+
from captioning.config.schema import AppConfig
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def load_config(path: str | Path) -> AppConfig:
    """Load a YAML file into an ``AppConfig`` and validate it.

    Args:
        path: Path to a YAML file with the structure::

            data: {...}
            model: {...}
            train: {...}
            serve: {...}

          An empty file is treated as an empty mapping, so every field
          falls back to its schema default.

    Returns:
        A validated ``AppConfig`` instance.

    Raises:
        FileNotFoundError: If the YAML path does not exist or is not a file.
        TypeError: If the YAML document's top level is not a mapping
            (for example a list or a bare scalar).
        yaml.YAMLError: If the file is not well-formed YAML.
        pydantic.ValidationError: If any field fails validation.
    """
    path = Path(path)
    if not path.is_file():
        raise FileNotFoundError(f"Config file not found: {path}")

    with path.open(encoding="utf-8") as f:
        # ``safe_load`` returns None for an empty document; fall back to {}.
        raw: Any = yaml.safe_load(f) or {}

    # ``AppConfig(**raw)`` needs a mapping. Fail here with a message that
    # names the file rather than with Python's generic "argument after **
    # must be a mapping" error.
    if not isinstance(raw, dict):
        raise TypeError(
            f"Config file {path} must contain a top-level mapping, "
            f"got {type(raw).__name__}"
        )

    return AppConfig(**raw)
|
src/captioning/config/schema.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Typed configuration schemas (Pydantic v2 ``BaseSettings``).
|
| 2 |
+
|
| 3 |
+
These classes replace the bare globals ``MAX_LENGTH``, ``BATCH_SIZE``, ... that
|
| 4 |
+
the notebook holds in cell 6. The advantages of doing this:
|
| 5 |
+
|
| 6 |
+
1. **Type safety** — every field has a declared type and Pydantic validates
|
| 7 |
+
it at load time. A YAML typo (``batch_size: sixty-four``) raises an
|
| 8 |
+
error pointing at the file and field, not a mysterious training failure
|
| 9 |
+
six steps later.
|
| 10 |
+
2. **Env override** — ``CAPTIONING__TRAIN__BATCH_SIZE=32`` overrides
|
| 11 |
+
``train.batch_size`` without editing YAML. The double underscore is the
|
| 12 |
+
nesting delimiter (configurable below). Useful for CI smoke tests.
|
| 13 |
+
3. **Single source of truth** — every other module accepts a sub-config
|
| 14 |
+
(``ModelConfig``, ``TrainConfig``, ...) instead of pulling globals. That
|
| 15 |
+
makes them testable in isolation and trivially overridable in serve.
|
| 16 |
+
|
| 17 |
+
The schema mirrors the IEEE notebook 1:1 — same field names where reasonable,
|
| 18 |
+
same default values. Extending it (Phase 1b: warmup/cosine LR; Phase 3: model
|
| 19 |
+
registry) only adds new fields, never changes the meaning of existing ones.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
from pathlib import Path
|
| 25 |
+
|
| 26 |
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
| 27 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class _StrictModel(BaseModel):
    """Common parent of the sub-configs; unknown keys are an error.

    With Pydantic's default (``extra="ignore"``) a misspelled key such as
    ``vocabularsy_size`` would be dropped silently and the field would keep
    its default — for ML hyperparameters that means training with the wrong
    value and no warning. ``extra="forbid"`` turns the typo into a
    validation error at load time that names the offending key.

    ``AppConfig`` does not inherit from this class: it is a
    ``BaseSettings`` and is configured through ``SettingsConfigDict``
    rather than ``ConfigDict``.
    """

    # Reject any key that is not a declared field.
    model_config = ConfigDict(extra="forbid")
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
class DataConfig(_StrictModel):
    """Dataset location and sampling settings.

    Attributes:
        base_path: Root directory of the COCO dataset (the notebook's
            ``BASE_PATH``).
        annotations_filename: File name of the captions JSON under
            ``annotations/``.
        images_subdir: Directory below ``base_path`` that holds the JPEGs.
        sample_size: Number of caption pairs to sample; the notebook uses
            120k. ``-1`` means "use everything".
        train_val_split: Share of *images* (not captions) assigned to the
            training set. Splitting by image keeps one picture from landing
            in both sets through different captions, which would leak
            validation data into training.
    """

    base_path: Path = Path("data/coco2017")
    annotations_filename: str = "captions_train2017.json"
    images_subdir: str = "train2017"
    sample_size: int = 120_000
    train_val_split: float = 0.8

    @field_validator("train_val_split")
    @classmethod
    def _validate_split(cls, v: float) -> float:
        """Accept only fractions strictly between 0 and 1."""
        # Written as a positive guard so NaN (which fails every comparison)
        # still falls through to the error below.
        if 0.0 < v < 1.0:
            return v
        raise ValueError(f"train_val_split must be in (0, 1), got {v}")
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class ModelConfig(_StrictModel):
    """Architecture hyperparameters.

    Defaults match the IEEE paper / notebook cell 6 exactly. Changing any of
    these requires re-training and re-publishing the model card on HF Hub.

    Every field carries a range constraint so an impossible value (zero or
    negative size, dropout outside ``[0, 1]``) is rejected when the config is
    loaded instead of surfacing later as an opaque framework error.
    ``max_length`` must be at least 2 because the decoder is fed
    ``max_length - 1`` positions.
    """

    embedding_dim: int = Field(default=512, gt=0)
    units: int = Field(default=512, gt=0)
    max_length: int = Field(default=40, ge=2)
    vocabulary_size: int = Field(default=15_000, gt=0)
    # Notebook cell 21: TransformerEncoderLayer(EMBEDDING_DIM, 1)
    encoder_num_heads: int = Field(default=1, gt=0)
    # Notebook cell 21: TransformerDecoderLayer(..., 8)
    decoder_num_heads: int = Field(default=8, gt=0)
    # Notebook cell 19: dropout_1
    decoder_dropout_inner: float = Field(default=0.3, ge=0.0, le=1.0)
    # Notebook cell 19: dropout_2
    decoder_dropout_outer: float = Field(default=0.5, ge=0.0, le=1.0)
    # Notebook cell 19: MultiHeadAttention(dropout=0.1)
    decoder_attention_dropout: float = Field(default=0.1, ge=0.0, le=1.0)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
class TrainConfig(_StrictModel):
    """Optimisation hyperparameters.

    Counts and the learning rate are range-checked at load time so a bad
    value (``epochs: 0``, a negative batch size, a non-positive learning
    rate) fails with a message naming the field rather than part-way into a
    training run.
    """

    epochs: int = Field(default=10, gt=0)
    batch_size: int = Field(default=64, gt=0)
    # tf.data shuffle buffer. Left unconstrained on purpose: which sentinel
    # values the data pipeline accepts is not visible from this module.
    buffer_size: int = 1_000
    # 0 is allowed: it means "stop at the first epoch without improvement".
    early_stopping_patience: int = Field(default=3, ge=0)
    seed: int = 42  # NEW (not in notebook): pin RNGs for reproducibility
    # Notebook uses Keras Adam default == 1e-3
    learning_rate: float = Field(default=1e-3, gt=0.0)
    weights_filename: str = "model.h5"
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
class ServeConfig(_StrictModel):
    """Settings for the FastAPI backend (Phase 2).

    Defined here so the schema is complete and tests don't have to mock a
    sub-config's existence. Size-like fields are range-checked at load time
    so a zero or negative limit cannot reach the serving layer.
    """

    max_upload_bytes: int = Field(default=10 * 1024 * 1024, gt=0)  # 10 MB
    decode_strategy: str = "greedy"  # Phase 1b adds "beam"
    beam_width: int = Field(default=3, ge=1)
    cors_allowed_origins: list[str] = Field(default_factory=lambda: ["http://localhost:3000"])
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
class AppConfig(BaseSettings):
    """Root settings object holding one attribute per config section.

    Per the package notes it is built by
    ``captioning.config.loader.load_config(yaml_path)``. Environment
    variables are read with the ``CAPTIONING__`` prefix and ``__`` as the
    nesting delimiter (for example ``CAPTIONING__TRAIN__EPOCHS``), matched
    case-insensitively. Unknown top-level keys are rejected.
    """

    model_config = SettingsConfigDict(
        extra="forbid",  # Reject unknown keys — catches typos at load time
        case_sensitive=False,
        env_nested_delimiter="__",
        env_prefix="CAPTIONING__",
    )

    # Each section falls back to its own defaults when absent from the input.
    data: DataConfig = Field(default_factory=DataConfig)
    model: ModelConfig = Field(default_factory=ModelConfig)
    train: TrainConfig = Field(default_factory=TrainConfig)
    serve: ServeConfig = Field(default_factory=ServeConfig)
|
src/captioning/evaluation/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluation — caption-quality metrics.
|
| 2 |
+
|
| 3 |
+
Phase 1 ships a corpus-BLEU implementation only; Phase 1b expands to CIDEr,
|
| 4 |
+
METEOR, and ROUGE-L (which is why this is its own package, not a single file).
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from captioning.evaluation.bleu import corpus_bleu_score
|
| 8 |
+
|
| 9 |
+
__all__ = ["corpus_bleu_score"]
|
src/captioning/evaluation/bleu.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Corpus BLEU score (Phase 1 minimal implementation).
|
| 2 |
+
|
| 3 |
+
The IEEE paper reports BLEU ~24 on COCO val. The notebook does not include
|
| 4 |
+
the evaluation code that produced this number — we add it here so the new
|
| 5 |
+
modular pipeline can verify it matches the paper.
|
| 6 |
+
|
| 7 |
+
Phase 1 ships *one* metric (corpus BLEU-4 via ``sacrebleu``) on purpose:
|
| 8 |
+
* sacrebleu is the de-facto BLEU implementation. NLTK's BLEU has
|
| 9 |
+
idiosyncratic smoothing and produces slightly different numbers; we
|
| 10 |
+
use sacrebleu so the published number is reproducible by anyone with
|
| 11 |
+
pip.
|
| 12 |
+
* Phase 1b expands to BLEU-1..4, CIDEr, METEOR, ROUGE-L, all in this
|
| 13 |
+
package, all behind the same ``runner.py`` interface.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
from collections.abc import Sequence
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def corpus_bleu_score(
    predictions: Sequence[str],
    references: Sequence[Sequence[str]],
) -> float:
    """Compute corpus BLEU-4 via ``sacrebleu``.

    Args:
        predictions: One generated caption per evaluation example.
        references: One *list* of reference captions per evaluation example.
            COCO has up to 5 references per image; shorter lists are padded
            here with the empty string ``""`` so every reference slot has one
            entry per example.

    Returns:
        BLEU-4 in the 0-100 range (sacrebleu's convention). Divide by 100 to
        put it on NLTK's 0-1 scale; even then the two are not interchangeable
        because tokenisation and smoothing differ.

    Raises:
        ValueError: If ``predictions`` and ``references`` differ in length,
            or if both are empty (a corpus score over zero examples is
            undefined).
        ImportError: If sacrebleu is not installed. Install via the eval
            extras: ``pip install -e ".[eval]"`` or the requirements file.
    """
    # Validate arguments first: these checks are cheap and should surface
    # even on machines where the optional dependency is missing.
    if len(predictions) != len(references):
        raise ValueError(
            f"predictions ({len(predictions)}) and references "
            f"({len(references)}) must have the same length"
        )
    if not predictions:
        raise ValueError("predictions and references must not be empty")

    try:
        import sacrebleu
    except ImportError as e:
        raise ImportError(
            "sacrebleu is required for BLEU evaluation. "
            "Install it via `pip install -r requirements-eval.txt`."
        ) from e

    # sacrebleu's `corpus_bleu` expects parallel lists, one *per reference
    # slot*: refs_by_slot[slot_index][example_index].
    max_refs = max(len(refs) for refs in references)
    refs_by_slot = [
        [refs[slot] if slot < len(refs) else "" for refs in references]
        for slot in range(max_refs)
    ]

    bleu = sacrebleu.corpus_bleu(list(predictions), refs_by_slot)
    return float(bleu.score)
|
src/captioning/inference/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Inference — generation algorithms and the FastAPI-friendly ``CaptionPredictor``.
|
| 2 |
+
|
| 3 |
+
The notebook generates captions through a free-floating ``generate_caption``
|
| 4 |
+
function that closes over global state (``caption_model``, ``tokenizer``,
|
| 5 |
+
``MAX_LENGTH``). We keep the same algorithm but inject those dependencies
|
| 6 |
+
explicitly so it works inside a long-lived process (FastAPI lifespan).
|
| 7 |
+
|
| 8 |
+
image_loader.py ``load_image_from_path`` — used at request time
|
| 9 |
+
greedy.py ``generate_caption_greedy`` — the notebook's argmax decode loop
|
| 10 |
+
predictor.py ``CaptionPredictor`` — singleton wrapper for the API
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from captioning.inference.greedy import generate_caption_greedy
|
| 14 |
+
from captioning.inference.image_loader import load_image_from_path
|
| 15 |
+
from captioning.inference.predictor import CaptionPredictor
|
| 16 |
+
|
| 17 |
+
__all__ = [
|
| 18 |
+
"CaptionPredictor",
|
| 19 |
+
"generate_caption_greedy",
|
| 20 |
+
"load_image_from_path",
|
| 21 |
+
]
|
src/captioning/inference/greedy.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Greedy caption generation.
|
| 2 |
+
|
| 3 |
+
Mirrors notebook cell 25's ``generate_caption`` exactly. The notebook closes
|
| 4 |
+
over four globals (``caption_model``, ``tokenizer``, ``idx2word``,
|
| 5 |
+
``MAX_LENGTH``); we accept them as explicit arguments so the function is
|
| 6 |
+
callable from tests, scripts, FastAPI, and the parity audit.
|
| 7 |
+
|
| 8 |
+
The algorithm:
|
| 9 |
+
1. CNN-encode the image.
|
| 10 |
+
2. Transformer-encode the patch features.
|
| 11 |
+
3. Seed the caption with ``[start]``.
|
| 12 |
+
4. For each position 0 ... ``max_length - 2``:
|
| 13 |
+
a. Tokenise the partial caption (``[:, :-1]`` because TextVectorization
|
| 14 |
+
pads to ``max_length`` and we feed ``max_length - 1`` positions
|
| 15 |
+
into the decoder).
|
| 16 |
+
b. Decode and take the argmax at the current position.
|
| 17 |
+
c. Stop on ``[end]``; otherwise append the predicted word.
|
| 18 |
+
5. Strip the ``[start]`` prefix and return.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
from captioning.preprocessing.caption import END_TOKEN, START_TOKEN
|
| 24 |
+
from captioning.preprocessing.tokenizer import CaptionTokenizer
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def generate_caption_greedy(
    model,
    tokenizer: CaptionTokenizer,
    image_tensor,
    max_length: int,
    *,
    add_noise: bool = False,
) -> str:
    """Generate a caption for one image using greedy (argmax) decoding.

    Args:
        model: An ``ImageCaptioningModel`` whose weights have been loaded.
            Only its ``cnn_model``, ``encoder`` and ``decoder`` attributes
            are used here.
        tokenizer: Fitted ``CaptionTokenizer`` (the same one used at training).
        image_tensor: A ``[299, 299, 3]`` float tensor produced by
            ``inference.load_image_from_path`` (or ``preprocess_image_tensor``).
        max_length: Decode budget — equals ``config.model.max_length`` (40
            in the notebook). At most ``max_length - 1`` words are generated.
        add_noise: Replicates the notebook's ``add_noise`` knob; off by default.
            When on, Gaussian noise (scale 0.1) is added and the image is
            min-max rescaled.

    Returns:
        The generated caption string with the ``[start]`` sentinel removed.
        The ``[end]`` sentinel is naturally absent because the loop breaks on
        it. If no word is generated at all (the first prediction is
        ``[end]``, or ``max_length < 2``) the result is the empty string.
    """
    import numpy as np
    import tensorflow as tf

    img = image_tensor
    if add_noise:
        noise = tf.random.normal(img.shape) * 0.1
        img = img + noise
        img = (img - tf.reduce_min(img)) / (tf.reduce_max(img) - tf.reduce_min(img))

    # Encode the image once; only the text side changes between decode steps.
    img = tf.expand_dims(img, axis=0)
    img_embed = model.cnn_model(img)
    img_encoded = model.encoder(img_embed, training=False)

    y_inp = START_TOKEN
    for i in range(max_length - 1):
        # Drop the last position so the decoder sees ``max_length - 1`` tokens.
        tokenized = tokenizer.encode([y_inp])[:, :-1]
        mask = tf.cast(tokenized != 0, tf.int32)
        pred = model.decoder(tokenized, img_encoded, training=False, mask=mask)

        # Position ``i`` holds the prediction for the next word.
        pred_idx = np.argmax(pred[0, i, :])
        pred_idx = tf.convert_to_tensor(pred_idx)
        pred_word = tokenizer.decode_id(pred_idx)
        if pred_word == END_TOKEN:
            break
        y_inp += " " + pred_word

    caption = y_inp.replace(f"{START_TOKEN} ", "")
    # When nothing was appended, ``y_inp`` is the bare sentinel with no
    # trailing space, so the replace above leaves it untouched; return an
    # empty caption instead of leaking "[start]" to callers.
    return "" if caption == START_TOKEN else caption
|
src/captioning/inference/image_loader.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Inference-time image loader — same path as cell 25 of the notebook.
|
| 2 |
+
|
| 3 |
+
The training pipeline goes through ``data.pipeline.build_*_pipeline`` which
|
| 4 |
+
calls ``preprocessing.image.preprocess_image_tensor``. The inference path
|
| 5 |
+
must produce the same tensor for the same image, otherwise BLEU drops
|
| 6 |
+
silently. This module re-uses ``preprocess_image_tensor`` so train/serve
|
| 7 |
+
parity is by construction.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
from captioning.preprocessing.image import preprocess_image_tensor
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def load_image_from_path(image_path: str):
    """Read a JPEG/PNG from disk and produce a model-ready tensor.

    Mirrors the ``load_image_from_path`` helper in notebook cell 25.

    Args:
        image_path: Filesystem path to the image. ``str`` and ``tf.string``
            tensors are handed to TensorFlow unchanged; ``os.PathLike``
            objects such as ``pathlib.Path`` are converted with
            ``os.fspath`` first.

    Returns:
        The output of ``preprocess_image_tensor`` applied to the decoded
        3-channel image — per the package notes a ``tf.Tensor`` of shape
        ``[299, 299, 3]``, dtype ``float32``, with InceptionV3 normalisation.
    """
    import os

    import tensorflow as tf

    # ``tf.io.read_file`` needs a string (or string tensor); a ``Path``
    # object is not converted automatically, so normalise it here.
    if isinstance(image_path, os.PathLike):
        image_path = os.fspath(image_path)

    raw = tf.io.read_file(image_path)
    # channels=3 forces RGB output regardless of the file's own channel count.
    image = tf.io.decode_jpeg(raw, channels=3)
    return preprocess_image_tensor(image)
|
src/captioning/inference/predictor.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""``CaptionPredictor`` — stateful, FastAPI-friendly inference singleton.
|
| 2 |
+
|
| 3 |
+
Why a class around the existing functions:
|
| 4 |
+
* The FastAPI lifespan loads weights once at boot and reuses the same
|
| 5 |
+
model across every request. A predictor object is the natural home for
|
| 6 |
+
"loaded model + loaded tokenizer + decoded config".
|
| 7 |
+
* Tests can construct one with stub objects without monkey-patching globals.
|
| 8 |
+
* Phase 1b adds beam search; Phase 3 adds a model registry. Both extend
|
| 9 |
+
this class, not the functional callsites.
|
| 10 |
+
|
| 11 |
+
Construction is *not* the same as readiness: ``CaptionPredictor.warmup()``
|
| 12 |
+
runs one inference on a dummy tensor so the first real request doesn't pay
|
| 13 |
+
TF's lazy graph-build cost (typically 2-5 seconds).
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
from typing import Literal
|
| 20 |
+
|
| 21 |
+
from captioning.config.schema import AppConfig
|
| 22 |
+
from captioning.inference.greedy import generate_caption_greedy
|
| 23 |
+
from captioning.inference.image_loader import load_image_from_path
|
| 24 |
+
from captioning.preprocessing.tokenizer import CaptionTokenizer
|
| 25 |
+
from captioning.utils.logging import get_logger
|
| 26 |
+
|
| 27 |
+
log = get_logger(__name__)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class CaptionPredictor:
    """Bundle of model + tokenizer + config that turns images into captions.

    Public surface: ``from_artifacts`` (alternate constructor), ``warmup``,
    ``predict_tensor`` and ``predict_path``.
    """

    def __init__(
        self,
        model,
        tokenizer: CaptionTokenizer,
        config: AppConfig,
        *,
        decode_strategy: Literal["greedy"] = "greedy",
    ) -> None:
        """Store the collaborators after checking the decode strategy.

        Args:
            model: An ``ImageCaptioningModel`` whose weights the caller has
                already loaded.
            tokenizer: Fitted ``CaptionTokenizer``.
            config: Validated ``AppConfig``; ``config.model.max_length`` is
                read at prediction time.
            decode_strategy: Only ``"greedy"`` is accepted in Phase 1; the
                keyword exists so the signature stays stable once ``"beam"``
                arrives.

        Raises:
            NotImplementedError: For any strategy other than ``"greedy"``.
        """
        if decode_strategy != "greedy":
            raise NotImplementedError(
                f"Phase 1 supports decode_strategy='greedy' only, got {decode_strategy!r}"
            )
        self.model = model
        self.tokenizer = tokenizer
        self.config = config
        self.decode_strategy = decode_strategy

    @classmethod
    def from_artifacts(
        cls,
        weights_path: str | Path,
        tokenizer_dir: str | Path,
        config: AppConfig,
    ) -> CaptionPredictor:
        """Build a predictor from a weights file and a saved tokenizer.

        Args:
            weights_path: Location of the weights file (``model.h5`` in the
                notebook, cell 30).
            tokenizer_dir: Directory holding the saved vocabulary
                (``vocab.pkl`` and ``vocab.json``).
            config: Validated ``AppConfig``. ``model.max_length`` and
                ``model.vocabulary_size`` must match the trained weights.

        Returns:
            A ``CaptionPredictor`` with weights loaded.
        """
        from captioning.models.factory import build_caption_model

        model_cfg = config.model
        tokenizer = CaptionTokenizer.load(
            directory=tokenizer_dir,
            vocab_size=model_cfg.vocabulary_size,
            max_length=model_cfg.max_length,
        )
        model = build_caption_model(config, vocab_size=tokenizer.vocabulary_size)

        # Variables only exist after a forward pass, and ``load_weights``
        # needs them to exist, so run one throwaway pass first.
        cls._dummy_pass(model, config)
        weights_file = str(weights_path)
        model.load_weights(weights_file)

        log.info("predictor_loaded", weights=weights_file)
        return cls(model=model, tokenizer=tokenizer, config=config)

    def warmup(self) -> None:
        """Caption an all-zero image once so later calls skip first-run cost."""
        import tensorflow as tf

        blank_image = tf.zeros((299, 299, 3), dtype=tf.float32)
        generate_caption_greedy(
            self.model,
            self.tokenizer,
            blank_image,
            self.config.model.max_length,
        )
        log.info("predictor_warmed_up")

    def predict_tensor(self, image_tensor) -> str:
        """Return the greedy caption for an already-preprocessed image tensor."""
        decode_budget = self.config.model.max_length
        return generate_caption_greedy(self.model, self.tokenizer, image_tensor, decode_budget)

    def predict_path(self, image_path: str | Path) -> str:
        """Load the image at ``image_path`` and return its caption."""
        return self.predict_tensor(load_image_from_path(str(image_path)))

    # ------------------------------------------------------------- internal --

    @staticmethod
    def _dummy_pass(model, config: AppConfig) -> None:
        """Run CNN, encoder and decoder once on zeros to create their variables."""
        import tensorflow as tf

        images = tf.zeros((1, 299, 299, 3), dtype=tf.float32)
        captions = tf.zeros((1, config.model.max_length), dtype=tf.int64)
        decoder_input = captions[:, :-1]
        padding_mask = tf.cast(captions[:, 1:] != 0, tf.int32)

        # Same op sequence as a training step, minus the gradient update.
        features = model.cnn_model(images)
        encoded = model.encoder(features, training=False)
        model.decoder(decoder_input, encoded, training=False, mask=padding_mask)
|
src/captioning/models/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Models — Keras layers and the top-level captioning model.
|
| 2 |
+
|
| 3 |
+
Each layer is in its own file so the architecture reads top-to-bottom in a
|
| 4 |
+
file tree, not inside a 200-line cell. Layers compose through ``factory.py``,
|
| 5 |
+
which is the single place that wires hyperparameters from ``AppConfig``.
|
| 6 |
+
|
| 7 |
+
encoder_cnn.py InceptionV3 backbone, frozen ImageNet weights
|
| 8 |
+
transformer_encoder.py 1-layer Transformer encoder over image patches
|
| 9 |
+
embeddings.py Token + positional embeddings
|
| 10 |
+
transformer_decoder.py Multi-head causal decoder with cross-attention
|
| 11 |
+
captioning_model.py ``ImageCaptioningModel`` (custom train/test step)
|
| 12 |
+
factory.py ``build_caption_model(config, vocab_size)``
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from captioning.models.captioning_model import ImageCaptioningModel
|
| 16 |
+
from captioning.models.embeddings import Embeddings
|
| 17 |
+
from captioning.models.encoder_cnn import build_cnn_encoder
|
| 18 |
+
from captioning.models.factory import build_caption_model
|
| 19 |
+
from captioning.models.transformer_decoder import TransformerDecoderLayer
|
| 20 |
+
from captioning.models.transformer_encoder import TransformerEncoderLayer
|
| 21 |
+
|
| 22 |
+
__all__ = [
|
| 23 |
+
"Embeddings",
|
| 24 |
+
"ImageCaptioningModel",
|
| 25 |
+
"TransformerDecoderLayer",
|
| 26 |
+
"TransformerEncoderLayer",
|
| 27 |
+
"build_caption_model",
|
| 28 |
+
"build_cnn_encoder",
|
| 29 |
+
]
|
src/captioning/models/captioning_model.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""``ImageCaptioningModel`` — top-level Keras model with custom train/test step.
|
| 2 |
+
|
| 3 |
+
Mirrors notebook cell 20 verbatim. The model owns its own loss & accuracy
|
| 4 |
+
trackers (rather than using compile-time metrics) because the masked
|
| 5 |
+
arithmetic in ``calculate_loss`` / ``calculate_accuracy`` depends on the
|
| 6 |
+
caption padding mask, which Keras's standard metric API can't see.
|
| 7 |
+
|
| 8 |
+
Behavioural quirk preserved for parity (NOT a bug in our code):
|
| 9 |
+
The notebook's ``compute_loss_and_acc`` hardcodes ``training=True`` on
|
| 10 |
+
both the encoder and decoder calls, even when invoked from ``test_step``.
|
| 11 |
+
That means dropout is active during validation in the IEEE results.
|
| 12 |
+
We preserve this so BLEU matches the paper. Phase 1b will fix it in a
|
| 13 |
+
deliberate, clearly-marked commit.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _build_captioning_model_class():
    """Define and return the ``ImageCaptioningModel`` class.

    TensorFlow is imported inside this function rather than at module scope;
    the class is created when the function is called.
    """
    import tensorflow as tf

    class ImageCaptioningModel(tf.keras.Model):
        """Stitches CNN encoder + Transformer encoder + Transformer decoder.

        Loss and accuracy are tracked with two ``Mean`` metrics owned by the
        model; both are computed with the caption padding mask applied.
        """

        def __init__(self, cnn_model, encoder, decoder, image_aug=None) -> None:
            """Store the sub-models and create the running-mean trackers.

            Args:
                cnn_model: Image feature extractor, called on raw image batches.
                encoder: Layer applied to the CNN features.
                decoder: Layer called as ``decoder(tokens, encoder_output,
                    training=..., mask=...)``.
                image_aug: Optional callable applied to images in
                    ``train_step`` only.
            """
            super().__init__()
            self.cnn_model = cnn_model
            self.encoder = encoder
            self.decoder = decoder
            self.image_aug = image_aug
            self.loss_tracker = tf.keras.metrics.Mean(name="loss")
            self.acc_tracker = tf.keras.metrics.Mean(name="accuracy")

        # --- masked metrics (notebook cell 20) -----------------------------

        def calculate_loss(self, y_true, y_pred, mask):
            """Return the mean loss over positions where ``mask`` is true."""
            # NOTE(review): assumes ``self.loss`` (set via ``compile``) returns
            # one value per token rather than a reduced scalar — confirm.
            loss = self.loss(y_true, y_pred)
            mask = tf.cast(mask, dtype=loss.dtype)
            loss *= mask
            # No guard for an all-padding batch: a zero mask sum divides by 0.
            return tf.reduce_sum(loss) / tf.reduce_sum(mask)

        def calculate_accuracy(self, y_true, y_pred, mask):
            """Return the fraction of unmasked positions predicted correctly."""
            # argmax over axis 2 — presumably the vocabulary axis of a
            # (batch, seq, vocab) tensor; verify against the decoder output.
            accuracy = tf.equal(y_true, tf.argmax(y_pred, axis=2))
            accuracy = tf.math.logical_and(mask, accuracy)
            accuracy = tf.cast(accuracy, dtype=tf.float32)
            mask = tf.cast(mask, dtype=tf.float32)
            return tf.reduce_sum(accuracy) / tf.reduce_sum(mask)

        # --- shared loss/acc step (parity quirk: training=True hardcoded) --

        def compute_loss_and_acc(self, img_embed, captions, training=True):
            """Run encoder + decoder and return ``(loss, accuracy)``.

            The decoder input is ``captions`` without its last token and the
            target is ``captions`` without its first token (teacher forcing).
            Token id 0 is treated as padding when building the mask.
            """
            # Notebook quirk preserved: encoder/decoder always called with
            # training=True. The `training` parameter is intentionally unused.
            del training  # silence linters: this is deliberate
            encoder_output = self.encoder(img_embed, training=True)
            y_input = captions[:, :-1]
            y_true = captions[:, 1:]
            mask = y_true != 0
            y_pred = self.decoder(y_input, encoder_output, training=True, mask=mask)
            loss = self.calculate_loss(y_true, y_pred, mask)
            acc = self.calculate_accuracy(y_true, y_pred, mask)
            return loss, acc

        # --- Keras hooks ---------------------------------------------------

        def train_step(self, batch):
            """One optimisation step on an ``(images, captions)`` batch."""
            imgs, captions = batch
            if self.image_aug:
                imgs = self.image_aug(imgs)
            # The CNN runs outside the tape, so no gradient flows into it.
            img_embed = self.cnn_model(imgs)

            with tf.GradientTape() as tape:
                loss, acc = self.compute_loss_and_acc(img_embed, captions)

            # Only encoder and decoder variables are updated.
            train_vars = self.encoder.trainable_variables + self.decoder.trainable_variables
            grads = tape.gradient(loss, train_vars)
            # ``strict=`` on zip requires Python 3.10+.
            self.optimizer.apply_gradients(zip(grads, train_vars, strict=False))
            self.loss_tracker.update_state(loss)
            self.acc_tracker.update_state(acc)

            return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}

        def test_step(self, batch):
            """Evaluate one ``(images, captions)`` batch without augmentation."""
            imgs, captions = batch
            img_embed = self.cnn_model(imgs)
            # ``training=False`` is passed but ignored by compute_loss_and_acc
            # (see the parity quirk above).
            loss, acc = self.compute_loss_and_acc(img_embed, captions, training=False)
            self.loss_tracker.update_state(loss)
            self.acc_tracker.update_state(acc)
            return {"loss": self.loss_tracker.result(), "acc": self.acc_tracker.result()}

        @property
        def metrics(self):
            """The two trackers, exposed through Keras's ``metrics`` hook."""
            return [self.loss_tracker, self.acc_tracker]

    return ImageCaptioningModel
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
ImageCaptioningModel = _build_captioning_model_class()
|
src/captioning/models/embeddings.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Token + positional embedding layer.
|
| 2 |
+
|
| 3 |
+
Mirrors notebook cell 18 verbatim. The decoder learns its own positional
|
| 4 |
+
encoding (rather than using sinusoidal) — that's the published architecture,
|
| 5 |
+
preserved here.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _import_tf():
|
| 12 |
+
"""Local import keeps top-level package import lightweight.
|
| 13 |
+
|
| 14 |
+
Without this, ``from captioning.models import Embeddings`` would trigger
|
| 15 |
+
a multi-second TF import even for callers that don't use it.
|
| 16 |
+
"""
|
| 17 |
+
import tensorflow as tf
|
| 18 |
+
|
| 19 |
+
return tf
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# Defining the class inside a factory keeps the TF import out of module scope.
|
| 23 |
+
# Note: the factory is invoked once at module init (``Embeddings = _build_embeddings_class()``), so importing this module still imports TF.
|
| 24 |
+
def _build_embeddings_class():
    """Create and return the ``Embeddings`` Keras layer class."""
    tf = _import_tf()
    keras_layers = tf.keras.layers

    class Embeddings(keras_layers.Layer):
        """Adds a learned position embedding to each token embedding.

        Args:
            vocab_size: Number of rows in the token table
                (``CaptionTokenizer.vocabulary_size``).
            embed_dim: Width of every embedding vector
                (``model.embedding_dim``, default 512).
            max_len: Number of rows in the position table
                (``model.max_length``, default 40).
        """

        def __init__(self, vocab_size: int, embed_dim: int, max_len: int) -> None:
            super().__init__()
            self.token_embeddings = keras_layers.Embedding(vocab_size, embed_dim)
            self.position_embeddings = keras_layers.Embedding(
                max_len,
                embed_dim,
                input_shape=(None, max_len),
            )

        def call(self, input_ids):
            # Positions 0..L-1 for the current sequence length, with a leading
            # axis of size 1 so they broadcast across the batch.
            seq_len = tf.shape(input_ids)[-1]
            positions = tf.expand_dims(tf.range(start=0, limit=seq_len, delta=1), axis=0)
            token_part = self.token_embeddings(input_ids)
            position_part = self.position_embeddings(positions)
            return token_part + position_part

    return Embeddings
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
Embeddings = _build_embeddings_class()
|
src/captioning/models/encoder_cnn.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""InceptionV3 image encoder.
|
| 2 |
+
|
| 3 |
+
Mirrors notebook cell 16. The encoder is the *frozen* visual backbone that
|
| 4 |
+
turns a 299x299 RGB image into a sequence of 2048-dimensional feature vectors
|
| 5 |
+
(one per spatial position in InceptionV3's last conv layer). The Transformer
|
| 6 |
+
encoder/decoder learn on top of these features; the InceptionV3 weights are
|
| 7 |
+
never updated during training.
|
| 8 |
+
|
| 9 |
+
Why a build function and not a Keras layer? The CNN is constructed from a
|
| 10 |
+
pretrained model whose weights are downloaded the first time. Wrapping
|
| 11 |
+
construction in a function gives callers a single line to invoke, and lets
|
| 12 |
+
us add caching / offline-loading paths later without touching call sites.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def build_cnn_encoder():
    """Return InceptionV3 (ImageNet weights, no classifier head) as a patch encoder.

    The backbone's final feature map is reshaped so all spatial positions
    form a single sequence axis and the channel axis is kept last.

    Returns:
        A ``tf.keras.Model`` from image batches to ``[B, positions,
        channels]`` features. Per the package notes, ``[B, 299, 299, 3]``
        inputs give ``[B, 64, 2048]`` (8x8 positions of InceptionV3's
        ``mixed10`` output).

    Note:
        Nothing here sets ``trainable = False``; the backbone stays fixed
        during training only because the training step updates encoder and
        decoder variables exclusively.
    """
    import tensorflow as tf

    backbone = tf.keras.applications.InceptionV3(weights="imagenet", include_top=False)

    feature_map = backbone.output
    channels = feature_map.shape[-1]
    patch_sequence = tf.keras.layers.Reshape((-1, channels))(feature_map)

    return tf.keras.models.Model(backbone.input, patch_sequence)
|
src/captioning/models/factory.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""``build_caption_model(config, vocab_size)`` — single place to wire layers.
|
| 2 |
+
|
| 3 |
+
Mirrors notebook cell 21::
|
| 4 |
+
|
| 5 |
+
encoder = TransformerEncoderLayer(EMBEDDING_DIM, 1)
|
| 6 |
+
decoder = TransformerDecoderLayer(EMBEDDING_DIM, UNITS, 8)
|
| 7 |
+
cnn_model = CNN_Encoder()
|
| 8 |
+
caption_model = ImageCaptioningModel(
|
| 9 |
+
cnn_model=cnn_model,
|
| 10 |
+
encoder=encoder,
|
| 11 |
+
decoder=decoder,
|
| 12 |
+
image_aug=image_augmentation,
|
| 13 |
+
)
|
| 14 |
+
|
| 15 |
+
Pulling this into a factory function isolates "how layers are wired" from
|
| 16 |
+
"what hyperparameters they use", so Phase 1b ablations and Phase 5 model
|
| 17 |
+
swaps only touch this file.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
from captioning.config.schema import AppConfig
|
| 23 |
+
from captioning.models.captioning_model import ImageCaptioningModel
|
| 24 |
+
from captioning.models.encoder_cnn import build_cnn_encoder
|
| 25 |
+
from captioning.models.transformer_decoder import TransformerDecoderLayer
|
| 26 |
+
from captioning.models.transformer_encoder import TransformerEncoderLayer
|
| 27 |
+
from captioning.preprocessing.augmentation import default_image_augmentation
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def build_caption_model(
    config: AppConfig,
    vocab_size: int,
    *,
    use_augmentation: bool = True,
):
    """Assemble an uncompiled ``ImageCaptioningModel`` from the app config.

    Args:
        config: Validated app config; only its ``model`` section is read.
        vocab_size: Vocabulary size of the *fitted* tokenizer
            (``CaptionTokenizer.vocabulary_size``). The factory never touches
            tokenizer state — callers fit the tokenizer and pass the size in.
        use_augmentation: When True (default) the model receives
            ``default_image_augmentation()`` as ``image_aug``; when False it
            receives ``None``. Inference and evaluation paths pass False.

    Returns:
        An uncompiled ``ImageCaptioningModel``. The caller is responsible for
        ``model.compile(optimizer=..., loss=...)``.
    """
    model_cfg = config.model

    # Components are created in a fixed order: encoder, decoder, CNN, augmenter.
    patch_encoder = TransformerEncoderLayer(
        model_cfg.embedding_dim,
        model_cfg.encoder_num_heads,
    )

    decoder_kwargs = {
        "embed_dim": model_cfg.embedding_dim,
        "units": model_cfg.units,
        "num_heads": model_cfg.decoder_num_heads,
        "vocab_size": vocab_size,
        "max_len": model_cfg.max_length,
        "attention_dropout": model_cfg.decoder_attention_dropout,
        "inner_dropout": model_cfg.decoder_dropout_inner,
        "outer_dropout": model_cfg.decoder_dropout_outer,
    }
    caption_decoder = TransformerDecoderLayer(**decoder_kwargs)

    backbone = build_cnn_encoder()

    if use_augmentation:
        augmenter = default_image_augmentation()
    else:
        augmenter = None

    return ImageCaptioningModel(
        cnn_model=backbone,
        encoder=patch_encoder,
        decoder=caption_decoder,
        image_aug=augmenter,
    )
|
src/captioning/models/transformer_decoder.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Multi-head Transformer decoder with causal masking and cross-attention.
|
| 2 |
+
|
| 3 |
+
Mirrors notebook cell 19. Two changes from the notebook, both behaviour-
|
| 4 |
+
preserving when defaults match:
|
| 5 |
+
|
| 6 |
+
1. **Globals are now constructor arguments.** The notebook closes over
|
| 7 |
+
``tokenizer.vocabulary_size()`` and ``MAX_LENGTH`` from module scope.
|
| 8 |
+
We pass them in as ``vocab_size`` and ``max_len`` so the decoder can be
|
| 9 |
+
instantiated in tests, factories, and notebooks without setting up a
|
| 10 |
+
global tokenizer first.
|
| 11 |
+
2. **Dropout rates and attention head count are configurable** with the
|
| 12 |
+
notebook values as defaults. This costs nothing today and lets Phase 1b
|
| 13 |
+
ablations vary them without code changes.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
from captioning.models.embeddings import Embeddings
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _build_transformer_decoder_class():
    """Define ``TransformerDecoderLayer`` with TensorFlow imported locally.

    Returns:
        The ``TransformerDecoderLayer`` class itself (not an instance).
    """
    import tensorflow as tf

    keras_layers = tf.keras.layers

    class TransformerDecoderLayer(keras_layers.Layer):
        """Decoder block: masked self-attention, cross-attention, then an FFN.

        Args:
            embed_dim: Token/positional embedding dimension. Must equal the
                encoder's ``embed_dim``.
            units: Hidden dimension of the feed-forward sub-block.
            num_heads: Number of heads for both attention layers. Notebook
                uses 8.
            vocab_size: Width of the output projection (the layer emits
                softmax probabilities over the vocabulary).
            max_len: Maximum decode length, passed to ``Embeddings``.
            attention_dropout: Dropout passed to both ``MultiHeadAttention``
                layers. Notebook uses 0.1.
            inner_dropout: Dropout after the first dense layer in the FFN.
                Notebook uses 0.3.
            outer_dropout: Dropout after the final residual layernorm.
                Notebook uses 0.5.
        """

        def __init__(
            self,
            embed_dim: int,
            units: int,
            num_heads: int,
            vocab_size: int,
            max_len: int,
            attention_dropout: float = 0.1,
            inner_dropout: float = 0.3,
            outer_dropout: float = 0.5,
        ) -> None:
            super().__init__()
            # Sub-layers are assigned in the same order as the original code.
            self.embedding = Embeddings(vocab_size, embed_dim, max_len)

            attention_kwargs = {
                "num_heads": num_heads,
                "key_dim": embed_dim,
                "dropout": attention_dropout,
            }
            self.attention_1 = keras_layers.MultiHeadAttention(**attention_kwargs)
            self.attention_2 = keras_layers.MultiHeadAttention(**attention_kwargs)

            self.layernorm_1 = keras_layers.LayerNormalization()
            self.layernorm_2 = keras_layers.LayerNormalization()
            self.layernorm_3 = keras_layers.LayerNormalization()

            self.ffn_layer_1 = keras_layers.Dense(units, activation="relu")
            self.ffn_layer_2 = keras_layers.Dense(embed_dim)

            self.out = keras_layers.Dense(vocab_size, activation="softmax")

            self.dropout_1 = keras_layers.Dropout(inner_dropout)
            self.dropout_2 = keras_layers.Dropout(outer_dropout)

        def call(self, input_ids, encoder_output, training, mask=None):
            """Run one decoding pass and return per-position vocabulary scores.

            Args:
                input_ids: Token ids fed to the embedding layer.
                encoder_output: Encoder features used as key/value for the
                    cross-attention layer.
                training: Forwarded to the attention and dropout layers.
                mask: Optional padding mask indexed as ``[batch, position]``.
                    NOTE: when ``mask`` is None, neither a causal nor a
                    padding mask is applied.

            Returns:
                The softmax output of the final dense projection.
            """
            token_embeddings = self.embedding(input_ids)

            self_attention_mask = None
            cross_attention_mask = None

            if mask is not None:
                causal = self.get_causal_attention_mask(token_embeddings)
                # [B, T, 1]: gates query positions in cross-attention.
                cross_attention_mask = tf.cast(mask[:, :, tf.newaxis], dtype=tf.int32)
                # [B, 1, T] combined with the causal mask via element-wise min.
                key_padding = tf.cast(mask[:, tf.newaxis, :], dtype=tf.int32)
                self_attention_mask = tf.minimum(key_padding, causal)

            self_attended = self.attention_1(
                query=token_embeddings,
                value=token_embeddings,
                key=token_embeddings,
                attention_mask=self_attention_mask,
                training=training,
            )
            hidden_1 = self.layernorm_1(token_embeddings + self_attended)

            cross_attended = self.attention_2(
                query=hidden_1,
                value=encoder_output,
                key=encoder_output,
                attention_mask=cross_attention_mask,
                training=training,
            )
            hidden_2 = self.layernorm_2(hidden_1 + cross_attended)

            projected = self.ffn_layer_1(hidden_2)
            projected = self.dropout_1(projected, training=training)
            projected = self.ffn_layer_2(projected)

            projected = self.layernorm_3(projected + hidden_2)
            projected = self.dropout_2(projected, training=training)
            return self.out(projected)

        def get_causal_attention_mask(self, inputs):
            """Build a lower-triangular int32 mask tiled over the batch.

            Args:
                inputs: Tensor whose first two dimensions are read as
                    ``(batch, sequence_length)``.

            Returns:
                An int32 tensor of shape ``[batch, seq, seq]`` with 1 where
                the row index is >= the column index and 0 elsewhere.
            """
            dims = tf.shape(inputs)
            batch, seq_len = dims[0], dims[1]
            rows = tf.range(seq_len)[:, tf.newaxis]
            cols = tf.range(seq_len)
            lower_triangle = tf.cast(rows >= cols, dtype="int32")
            lower_triangle = tf.reshape(lower_triangle, (1, dims[1], dims[1]))
            repeats = tf.concat(
                [tf.expand_dims(batch, -1), tf.constant([1, 1], dtype=tf.int32)],
                axis=0,
            )
            return tf.tile(lower_triangle, repeats)

    return TransformerDecoderLayer


# Build the class once at import time and expose it under its public name.
TransformerDecoderLayer = _build_transformer_decoder_class()
|
src/captioning/models/transformer_encoder.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Single-layer Transformer encoder for image patch features.
|
| 2 |
+
|
| 3 |
+
Mirrors notebook cell 17 verbatim. The encoder is intentionally minimal
|
| 4 |
+
(1 attention head, 1 layer, 1 dense projection) because the *image* features
|
| 5 |
+
are already produced by InceptionV3 — the Transformer encoder's only job is
|
| 6 |
+
to project them into the decoder's embedding dimension and let the decoder
|
| 7 |
+
attend across patches.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _build_transformer_encoder_class():
    """Define ``TransformerEncoderLayer`` with TensorFlow imported locally.

    Returns:
        The ``TransformerEncoderLayer`` class itself (not an instance).
    """
    import tensorflow as tf

    keras_layers = tf.keras.layers

    class TransformerEncoderLayer(keras_layers.Layer):
        """Norm → Dense → self-attention → residual add → Norm.

        Args:
            embed_dim: Output width of the dense projection, also used as
                ``key_dim`` for attention. Must equal the decoder's embed_dim.
            num_heads: Attention heads. Notebook uses 1.
        """

        def __init__(self, embed_dim: int, num_heads: int) -> None:
            super().__init__()
            # Sub-layers are assigned in the same order as the original code.
            self.layer_norm_1 = keras_layers.LayerNormalization()
            self.layer_norm_2 = keras_layers.LayerNormalization()
            self.attention = keras_layers.MultiHeadAttention(
                num_heads=num_heads,
                key_dim=embed_dim,
            )
            self.dense = keras_layers.Dense(embed_dim, activation="relu")

        def call(self, x, training):
            """Project the input features and self-attend across them.

            Args:
                x: Input features; normalised, then passed through the dense
                    projection.
                training: Forwarded to the attention layer.

            Returns:
                Layer-normalised sum of the projected features and their
                self-attention output.
            """
            projected = self.dense(self.layer_norm_1(x))
            attended = self.attention(
                query=projected,
                value=projected,
                key=projected,
                attention_mask=None,
                training=training,
            )
            return self.layer_norm_2(projected + attended)

    return TransformerEncoderLayer


# Build the class once at import time and expose it under its public name.
TransformerEncoderLayer = _build_transformer_encoder_class()
|
src/captioning/preprocessing/__init__.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Preprocessing — pure transforms on captions and images.
|
| 2 |
+
|
| 3 |
+
Functions in this package take inputs and return outputs with no hidden state
|
| 4 |
+
and no disk I/O. That makes them trivially unit-testable and lets us share the
|
| 5 |
+
same logic across the training pipeline (where they're composed into tf.data
|
| 6 |
+
maps) and the inference path (where they're called once per request).
|
| 7 |
+
|
| 8 |
+
Modules:
|
| 9 |
+
caption.py ``preprocess_caption(text)`` — lower/strip/wrap with [start]/[end]
|
| 10 |
+
image.py ``preprocess_image_tensor(img)``, ``load_and_preprocess_image(path)``
|
| 11 |
+
tokenizer.py ``CaptionTokenizer`` — wraps tf.keras TextVectorization
|
| 12 |
+
augmentation.py ``default_image_augmentation()`` — Keras Sequential
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from captioning.preprocessing.augmentation import default_image_augmentation
|
| 16 |
+
from captioning.preprocessing.caption import (
|
| 17 |
+
END_TOKEN,
|
| 18 |
+
START_TOKEN,
|
| 19 |
+
preprocess_caption,
|
| 20 |
+
)
|
| 21 |
+
from captioning.preprocessing.image import (
|
| 22 |
+
load_and_preprocess_image,
|
| 23 |
+
preprocess_image_tensor,
|
| 24 |
+
)
|
| 25 |
+
from captioning.preprocessing.tokenizer import CaptionTokenizer
|
| 26 |
+
|
| 27 |
+
__all__ = [
|
| 28 |
+
"END_TOKEN",
|
| 29 |
+
"START_TOKEN",
|
| 30 |
+
"CaptionTokenizer",
|
| 31 |
+
"default_image_augmentation",
|
| 32 |
+
"load_and_preprocess_image",
|
| 33 |
+
"preprocess_caption",
|
| 34 |
+
"preprocess_image_tensor",
|
| 35 |
+
]
|
src/captioning/preprocessing/augmentation.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Image-augmentation pipeline (training only).
|
| 2 |
+
|
| 3 |
+
Mirrors notebook cell 15. Augmentation is deliberately separate from
|
| 4 |
+
``image.py``: augmentations introduce randomness and only run during training,
|
| 5 |
+
while ``preprocess_image_tensor`` is deterministic and runs in both train and
|
| 6 |
+
serve. Mixing them risks accidentally augmenting at inference time.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def default_image_augmentation() -> tf.keras.Sequential:  # type: ignore[name-defined]  # noqa: F821
    """Create the random image-augmentation stack applied during training.

    The model receives this object once, at construction (notebook cell 21::

        ImageCaptioningModel(..., image_aug=image_augmentation)

    ) and runs it only inside ``train_step`` (notebook cell 20).
    ``test_step`` skips augmentation, which is the behaviour we preserve.

    Returns:
        A ``tf.keras.Sequential`` of ``RandomFlip`` + ``RandomRotation`` +
        ``RandomContrast`` matching cell 15 exactly.
    """
    import tensorflow as tf

    keras_layers = tf.keras.layers
    # Order matters: flip, then rotate, then adjust contrast.
    stages = [
        keras_layers.RandomFlip("horizontal"),
        keras_layers.RandomRotation(0.2),
        keras_layers.RandomContrast(0.3),
    ]
    return tf.keras.Sequential(stages)
|
src/captioning/preprocessing/caption.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Caption text preprocessing.
|
| 2 |
+
|
| 3 |
+
Mirrors the IEEE notebook cell 3::
|
| 4 |
+
|
| 5 |
+
def preprocess(text):
|
| 6 |
+
text = text.lower()
|
| 7 |
+
text = re.sub(r"[^\\w\\s]", "", text)
|
| 8 |
+
text = re.sub("\\s+", " ", text)
|
| 9 |
+
text = text.strip()
|
| 10 |
+
text = "[start] " + text + " [end]"
|
| 11 |
+
return text
|
| 12 |
+
|
| 13 |
+
Why pull this out of the notebook:
|
| 14 |
+
* It's a *pure function*: same input → same output, no side effects.
|
| 15 |
+
Easiest possible thing to unit-test, and the lowest-risk module to verify
|
| 16 |
+
parity on (one ``assert preprocess_caption("Hello, World!") == "[start] hello world [end]"``
|
| 17 |
+
catches any divergence).
|
| 18 |
+
* The same logic runs at training time AND at inference time. Centralising
|
| 19 |
+
it eliminates the most common bug source in ML systems: train/serve skew.
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
import re
|
| 25 |
+
|
| 26 |
+
START_TOKEN = "[start]"
END_TOKEN = "[end]"

# Compiled once at import time: caption preprocessing runs once per caption
# during dataset prep, so the patterns are hoisted out of the function.
_PUNCTUATION_RE = re.compile(r"[^\w\s]")
_WHITESPACE_RE = re.compile(r"\s+")


def preprocess_caption(text: str) -> str:
    """Normalise a raw caption and wrap it in start/end sentinels.

    Steps, in order: lowercase; delete every character that is neither a word
    character nor whitespace; collapse each whitespace run to one space; trim
    both ends; surround with ``[start]`` / ``[end]`` separated by single
    spaces.

    Behaviour is byte-for-byte identical to the notebook's ``preprocess()``.

    Args:
        text: Raw caption string (any case, may contain punctuation).

    Returns:
        The normalised caption, e.g.::

            >>> preprocess_caption("A man, riding a Bike!")
            '[start] a man riding a bike [end]'

    Note:
        An empty (or punctuation-only) caption yields the two sentinels
        separated by two spaces, because the empty body is still joined in.
    """
    without_punctuation = _PUNCTUATION_RE.sub("", text.lower())
    body = _WHITESPACE_RE.sub(" ", without_punctuation).strip()
    return " ".join((START_TOKEN, body, END_TOKEN))
|
src/captioning/preprocessing/image.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Image preprocessing.
|
| 2 |
+
|
| 3 |
+
Mirrors notebook cell 13 (training pipeline) and cell 25 (inference path).
|
| 4 |
+
Both paths must produce *byte-identical* tensors — the model only saw 299x299
|
| 5 |
+
images normalised by ``inception_v3.preprocess_input`` during training, so
|
| 6 |
+
serving must do exactly that. Centralising the pipeline here is what
|
| 7 |
+
eliminates train/serve skew.
|
| 8 |
+
|
| 9 |
+
The two public functions split responsibilities:
|
| 10 |
+
* ``preprocess_image_tensor`` — operates on an already-decoded image
|
| 11 |
+
tensor. Used by the tf.data pipeline AND inference (after decode).
|
| 12 |
+
* ``load_and_preprocess_image`` — reads bytes from disk, decodes, then
|
| 13 |
+
calls ``preprocess_image_tensor``. Used at inference time.
|
| 14 |
+
|
| 15 |
+
Both use ``tf.keras.layers.Resizing(299, 299)`` (not ``tf.image.resize``)
|
| 16 |
+
because the notebook uses the layer form. ``Resizing`` defaults to bilinear
|
| 17 |
+
interpolation and rounds to nearest integer dims, which is the exact behaviour
|
| 18 |
+
that produced the IEEE BLEU score.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
from __future__ import annotations
|
| 22 |
+
|
| 23 |
+
INCEPTION_INPUT_SIZE = 299
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def preprocess_image_tensor(image: tf.Tensor) -> tf.Tensor:  # type: ignore[name-defined]  # noqa: F821
    """Resize an image to the InceptionV3 input size and normalise it.

    Args:
        image: A 3-D ``tf.Tensor`` of shape ``[H, W, 3]`` and dtype ``uint8``
            or ``float32``. The Resizing layer accepts both.

    Returns:
        ``tf.Tensor`` of shape ``[299, 299, 3]``, dtype ``float32``, with the
        InceptionV3 normalisation applied (pixel values in ``[-1, 1]``).
    """
    import tensorflow as tf

    side = INCEPTION_INPUT_SIZE
    # The layer form (not ``tf.image.resize``) is kept for notebook parity.
    resizer = tf.keras.layers.Resizing(side, side)
    resized = resizer(image)
    return tf.keras.applications.inception_v3.preprocess_input(resized)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def load_and_preprocess_image(image_path: str) -> tf.Tensor:  # type: ignore[name-defined]  # noqa: F821
    """Read an image file from disk and run it through ``preprocess_image_tensor``.

    Args:
        image_path: Path to a JPEG file. Strings, ``pathlib.Path`` (any
            ``os.PathLike``), and ``tf.string`` tensors all work — the latter
            matters because ``tf.data`` pipelines pass paths as tensors.

    Returns:
        A 3-D ``tf.Tensor`` ready to feed into the CNN encoder.

    Raises:
        tf.errors.NotFoundError: If the file does not exist.
        tf.errors.InvalidArgumentError: If the file cannot be decoded.
    """
    import os

    import tensorflow as tf

    # ``tf.io.read_file`` needs a string or string tensor; it cannot convert
    # path-like objects itself, so normalise them here. Strings and tensors
    # are not ``os.PathLike`` and pass through untouched.
    if isinstance(image_path, os.PathLike):
        image_path = os.fspath(image_path)

    raw = tf.io.read_file(image_path)
    image = tf.io.decode_jpeg(raw, channels=3)
    return preprocess_image_tensor(image)
|
src/captioning/preprocessing/tokenizer.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""``CaptionTokenizer`` — typed wrapper around ``tf.keras.layers.TextVectorization``.
|
| 2 |
+
|
| 3 |
+
Why a wrapper instead of using the Keras layer directly?
|
| 4 |
+
|
| 5 |
+
1. **Stable interface for the model.** The model code calls
|
| 6 |
+
``tokenizer.encode(captions)`` and ``tokenizer.decode_id(idx)``. The fact
|
| 7 |
+
that those happen to delegate to a Keras layer is an implementation
|
| 8 |
+
detail. In Phase 5 we may swap the implementation for HuggingFace
|
| 9 |
+
``tokenizers`` without rewriting the encoder, decoder, or inference loop.
|
| 10 |
+
2. **Persistence.** The notebook saves the *vocabulary list* with pickle, but
|
| 11 |
+
loading requires re-instantiating a layer and calling ``set_vocabulary``.
|
| 12 |
+
That ceremony belongs inside the wrapper, not at every call site.
|
| 13 |
+
3. **A JSON sidecar.** Pickle is fast but opaque and risky to load from
|
| 14 |
+
untrusted sources. We additionally write a ``vocab.json`` file (one token
|
| 15 |
+
per line, UTF-8) so humans and other tools can inspect the vocabulary.
|
| 16 |
+
|
| 17 |
+
The wrapper preserves the notebook's behaviour exactly: ``standardize=None``,
|
| 18 |
+
``output_sequence_length`` defaults to ``max_length``, and ``encode`` accepts
|
| 19 |
+
either a single string or a list of strings (matching the layer's call form
|
| 20 |
+
used in cells 7 and 25).
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
from __future__ import annotations
|
| 24 |
+
|
| 25 |
+
import json
|
| 26 |
+
import pickle
|
| 27 |
+
from collections.abc import Iterable
|
| 28 |
+
from pathlib import Path
|
| 29 |
+
|
| 30 |
+
VOCAB_PICKLE_FILENAME = "vocab.pkl"
VOCAB_JSON_FILENAME = "vocab.json"


class CaptionTokenizer:
    """Wrapper that owns a fitted ``TextVectorization`` layer + lookup tables.

    An instance starts unfitted. ``fit()`` or the ``load()`` classmethod
    installs the inner layer and the ``StringLookup`` tables; every other
    public member raises ``RuntimeError`` until one of them has run.
    """

    def __init__(self, vocab_size: int, max_length: int) -> None:
        """Construct an unfit tokenizer.

        Args:
            vocab_size: Maximum vocabulary size (notebook: ``VOCABULARY_SIZE``).
            max_length: Pad/truncate every caption to this many tokens
                (notebook: ``MAX_LENGTH``).
        """
        self.vocab_size = vocab_size
        self.max_length = max_length
        # All three are populated together by ``fit()`` / ``load()``.
        self._layer = None
        self._idx2word = None
        self._word2idx = None

    # ----------------------------------------------------------------- fit ----

    def fit(self, captions: Iterable[str]) -> None:
        """Adapt the underlying TextVectorization layer to the given captions.

        Args:
            captions: An iterable of *already preprocessed* captions
                (i.e. lower-cased, punctuation-stripped, wrapped in
                ``[start] ... [end]``). Mirrors notebook cell 7 which calls
                ``tokenizer.adapt(captions['caption'])`` *after* cell 4 has
                applied ``preprocess`` to every row. The iterable is
                materialised into a list before adapting.
        """
        import tensorflow as tf

        layer = tf.keras.layers.TextVectorization(
            max_tokens=self.vocab_size,
            standardize=None,
            output_sequence_length=self.max_length,
        )
        layer.adapt(list(captions))
        self._layer = layer
        self._build_lookups()

    # ----------------------------------------------------------- properties ---

    @property
    def vocabulary(self) -> list[str]:
        """Return the fitted vocabulary list (same order as TextVectorization).

        Raises:
            RuntimeError: If the tokenizer has not been fitted or loaded.
        """
        layer = self._require_fit()
        return list(layer.get_vocabulary())

    @property
    def vocabulary_size(self) -> int:
        """Number of tokens in the fitted vocabulary.

        Raises:
            RuntimeError: If the tokenizer has not been fitted or loaded.
        """
        return int(self._require_fit().vocabulary_size())

    @property
    def layer(self):
        """Direct access to the inner Keras ``TextVectorization`` layer.

        Raises:
            RuntimeError: If the tokenizer has not been fitted or loaded.
        """
        return self._require_fit()

    # -------------------------------------------------------- encode/decode ---

    def encode(self, text):
        """Encode ``text`` (str or list[str]) to integer-id tensor.

        Mirrors ``tokenizer(text)`` in notebook cells 7 and 25. Single string
        returns a 1-D tensor of shape ``[max_length]``; list returns 2-D.

        Raises:
            RuntimeError: If the tokenizer has not been fitted or loaded.
        """
        return self._require_fit()(text)

    def decode_id(self, idx) -> str:
        """Inverse-lookup a single integer id to its string token.

        Mirrors notebook cell 25's
        ``idx2word(pred_idx).numpy().decode('utf-8')``.

        Raises:
            RuntimeError: If the tokenizer has not been fitted or loaded.
        """
        self._require_fit()
        # By invariant, _idx2word is set together with _layer in fit/load;
        # the assert only narrows the type for static checkers.
        assert self._idx2word is not None
        word = self._idx2word(idx)
        return word.numpy().decode("utf-8")

    # ---------------------------------------------------------- persistence ---

    def save(self, directory: str | Path) -> None:
        """Save the vocabulary to ``directory/vocab.pkl`` and ``vocab.json``.

        The pickle matches notebook cell 9 exactly so old artefacts remain
        loadable. The JSON sidecar is human-inspectable. The directory is
        created (including parents) if it does not exist.

        Raises:
            RuntimeError: If the tokenizer has not been fitted or loaded
                (checked before anything is written to disk).
        """
        self._require_fit()
        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
        vocab = self.vocabulary
        with (directory / VOCAB_PICKLE_FILENAME).open("wb") as f:
            pickle.dump(vocab, f)
        with (directory / VOCAB_JSON_FILENAME).open("w", encoding="utf-8") as f:
            json.dump(vocab, f, ensure_ascii=False, indent=2)

    @classmethod
    def load(
        cls,
        directory: str | Path,
        vocab_size: int,
        max_length: int,
    ) -> CaptionTokenizer:
        """Load a previously saved vocabulary into a new tokenizer.

        ``vocab.pkl`` is preferred when present; ``vocab.json`` is the
        fallback.

        Args:
            directory: Directory containing ``vocab.pkl`` (or ``vocab.json``).
            vocab_size: Maximum vocabulary size — must match the saved vocab.
            max_length: Pad/truncate length — must match training-time value.

        Returns:
            A fitted ``CaptionTokenizer`` ready to ``encode`` and ``decode_id``.

        Raises:
            FileNotFoundError: If neither vocabulary file exists in
                ``directory``.
        """
        directory = Path(directory)
        pkl_path = directory / VOCAB_PICKLE_FILENAME
        json_path = directory / VOCAB_JSON_FILENAME

        # Resolve the vocabulary before importing TensorFlow so that a wrong
        # directory fails fast with a message naming both accepted files.
        if pkl_path.is_file():
            # SECURITY: unpickling can execute arbitrary code. Only load
            # artefact directories you trust; the JSON sidecar is the safe
            # format for vocabularies of unknown provenance.
            with pkl_path.open("rb") as f:
                vocab = pickle.load(f)
        elif json_path.is_file():
            with json_path.open(encoding="utf-8") as f:
                vocab = json.load(f)
        else:
            raise FileNotFoundError(
                f"No vocabulary found in {directory}: expected "
                f"{VOCAB_PICKLE_FILENAME} or {VOCAB_JSON_FILENAME}."
            )

        import tensorflow as tf

        tok = cls(vocab_size=vocab_size, max_length=max_length)
        layer = tf.keras.layers.TextVectorization(
            max_tokens=vocab_size,
            standardize=None,
            output_sequence_length=max_length,
        )
        layer.set_vocabulary(vocab)
        tok._layer = layer
        tok._build_lookups()
        return tok

    # -------------------------------------------------------------- internal --

    def _build_lookups(self) -> None:
        """Construct the ``StringLookup`` tables (word → idx and idx → word).

        Called only from ``fit()`` and ``load()``, *after* ``self._layer`` has
        been assigned, so the assertion below is a defensive no-op for mypy.
        """
        import tensorflow as tf

        assert self._layer is not None
        vocab = self._layer.get_vocabulary()
        self._word2idx = tf.keras.layers.StringLookup(mask_token="", vocabulary=vocab)
        self._idx2word = tf.keras.layers.StringLookup(mask_token="", vocabulary=vocab, invert=True)

    def _require_fit(self):
        """Validate that the tokenizer has been fitted; return the inner layer.

        Returning the layer (rather than only raising on the unfit state)
        gives callers a non-``None``-typed local for the rest of their body —
        which is what mypy needs to prove ``layer.get_vocabulary()`` etc.
        are valid calls.

        Raises:
            RuntimeError: If neither ``fit()`` nor ``load()`` has run.
        """
        if self._layer is None:
            raise RuntimeError(
                "CaptionTokenizer not fitted. Call `.fit(captions)` or "
                "`.load(directory, ...)` first."
            )
        return self._layer
|
src/captioning/py.typed
ADDED
|
File without changes
|
src/captioning/training/__init__.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Training — losses, callbacks, and the trainer that orchestrates ``model.fit``.
|
| 2 |
+
|
| 3 |
+
The notebook computes loss + masked accuracy inside the model's ``train_step``;
|
| 4 |
+
we keep that structure for parity but expose the loss function and callbacks
|
| 5 |
+
as standalone modules so they can be unit-tested and reused (e.g. by Phase 1b
|
| 6 |
+
beam-search evaluators).
|
| 7 |
+
|
| 8 |
+
losses.py ``masked_sparse_categorical_crossentropy`` — the same loss the notebook uses
|
| 9 |
+
callbacks.py ``default_callbacks(config)`` — early stopping (and Phase 4 checkpoint hooks)
|
| 10 |
+
trainer.py ``Trainer.fit()`` — wraps compile + fit + history serialization
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from captioning.training.callbacks import default_callbacks
|
| 14 |
+
from captioning.training.losses import masked_sparse_categorical_crossentropy
|
| 15 |
+
from captioning.training.trainer import Trainer
|
| 16 |
+
|
| 17 |
+
__all__ = [
|
| 18 |
+
"Trainer",
|
| 19 |
+
"default_callbacks",
|
| 20 |
+
"masked_sparse_categorical_crossentropy",
|
| 21 |
+
]
|
src/captioning/training/callbacks.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Default training callbacks.
|
| 2 |
+
|
| 3 |
+
Mirrors notebook cell 22 (``EarlyStopping(patience=3, restore_best_weights=True)``)
|
| 4 |
+
and adds Phase-2 hooks (``ModelCheckpoint``, ``CSVLogger``) that the trainer
|
| 5 |
+
will use. Each callback is created by a tiny factory so callers don't have to
|
| 6 |
+
import TF for the names.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
from captioning.config.schema import AppConfig
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def default_callbacks(
    config: AppConfig,
    *,
    output_dir: str | Path | None = None,
):
    """Build the callback list that ``Trainer.fit`` hands to ``model.fit``.

    ``EarlyStopping`` is always included. When ``output_dir`` is given, the
    directory is created (parents included) and two file-emitting callbacks
    are appended after it.

    Args:
        config: App config; only ``train.early_stopping_patience`` is read.
        output_dir: Optional directory. If provided, ``ModelCheckpoint``
            writes the best weights (monitoring ``val_loss``) to ``best.h5``
            and ``CSVLogger`` writes ``training_log.csv`` there.

    Returns:
        A list of ``tf.keras.callbacks`` instances, early stopping first.
    """
    # Function-scope import: TensorFlow is only loaded when callbacks are built.
    import tensorflow as tf

    keras_callbacks = tf.keras.callbacks
    early_stopping = keras_callbacks.EarlyStopping(
        patience=config.train.early_stopping_patience,
        restore_best_weights=True,
    )
    if output_dir is None:
        return [early_stopping]

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): newer Keras releases reportedly require weights-only
    # checkpoints to end in ``.weights.h5`` — confirm against the pinned
    # TF/Keras version before upgrading.
    checkpoint = keras_callbacks.ModelCheckpoint(
        filepath=str(target_dir / "best.h5"),
        save_weights_only=True,
        save_best_only=True,
        monitor="val_loss",
    )
    csv_logger = keras_callbacks.CSVLogger(str(target_dir / "training_log.csv"))
    return [early_stopping, checkpoint, csv_logger]
|
src/captioning/training/losses.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Training losses.
|
| 2 |
+
|
| 3 |
+
The notebook (cell 22) compiles the model with::
|
| 4 |
+
|
| 5 |
+
cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction="none")
|
| 6 |
+
|
| 7 |
+
Why ``reduction="none"``: the model's ``calculate_loss`` (cell 20) does the
|
| 8 |
+
reduction itself, multiplying by the padding mask before averaging. A built-in
|
| 9 |
+
reduction would average over the padded tokens too, biasing the loss.
|
| 10 |
+
|
| 11 |
+
We expose the loss via a tiny factory rather than a constant so callers don't
|
| 12 |
+
have to import TF themselves to get it.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def masked_sparse_categorical_crossentropy():
    """Create the loss object the captioning model is compiled with.

    The loss is a ``SparseCategoricalCrossentropy`` configured with
    ``from_logits=False`` and ``reduction="none"``. Per the module notes,
    the decoder's output layer already applies a softmax, so the loss
    receives probabilities, and the per-token values are left unreduced so
    the model can apply its padding mask before averaging.

    Returns:
        A ``tf.keras.losses.SparseCategoricalCrossentropy`` instance.
    """
    import tensorflow as tf

    loss_options = {"from_logits": False, "reduction": "none"}
    return tf.keras.losses.SparseCategoricalCrossentropy(**loss_options)
|
src/captioning/training/trainer.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""``Trainer`` — orchestration around ``model.compile + model.fit``.
|
| 2 |
+
|
| 3 |
+
Wraps notebook cells 22 and 23 in a class so:
|
| 4 |
+
* Tests can construct a Trainer with a tiny dataset and assert
|
| 5 |
+
``trainer.fit`` returns a sensible history dict.
|
| 6 |
+
* Phase 4 can replace the trainer with a CLI-driven main loop without
|
| 7 |
+
changing the notebook-equivalent behaviour.
|
| 8 |
+
|
| 9 |
+
The trainer is intentionally thin — no MLflow integration yet (Phase 2
|
| 10 |
+
adds it), no distributed strategy (out of scope for the IEEE notebook).
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import json
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
|
| 18 |
+
from captioning.config.schema import AppConfig
|
| 19 |
+
from captioning.training.callbacks import default_callbacks
|
| 20 |
+
from captioning.training.losses import masked_sparse_categorical_crossentropy
|
| 21 |
+
from captioning.utils.logging import get_logger
|
| 22 |
+
|
| 23 |
+
log = get_logger(__name__)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class Trainer:
    """Thin orchestration layer around an ``ImageCaptioningModel``.

    Holds the model and its config, compiles lazily on the first ``fit``
    call, and optionally persists the training history as JSON.
    """

    def __init__(self, model, config: AppConfig) -> None:
        """Store the model and config; nothing is compiled yet.

        Args:
            model: Result of ``build_caption_model(config, vocab_size)``.
            config: Validated ``AppConfig``.
        """
        self.model = model
        self.config = config
        self._compiled = False

    def compile(self) -> None:
        """Compile the model with Adam and the unreduced cross-entropy loss.

        The learning rate comes from ``config.train.learning_rate``.
        """
        import tensorflow as tf

        learning_rate = self.config.train.learning_rate
        self.model.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            loss=masked_sparse_categorical_crossentropy(),
        )
        self._compiled = True
        log.info("model_compiled", learning_rate=learning_rate)

    @staticmethod
    def _as_float(value):
        """Best-effort conversion of one logged metric value to ``float``.

        Values that cannot be converted are returned unchanged so an
        unusual metric never aborts the end of a training run here.
        """
        try:
            return float(value)
        except (TypeError, ValueError):
            return value

    def fit(
        self,
        train_dataset,
        val_dataset,
        *,
        output_dir: str | Path | None = None,
    ) -> dict[str, list[float]]:
        """Run ``model.fit`` and return a history dict.

        Compiles the model first if ``compile`` has not been called.

        Args:
            train_dataset: ``tf.data.Dataset`` from
                ``data.pipeline.build_train_pipeline``.
            val_dataset: ``tf.data.Dataset`` from
                ``data.pipeline.build_val_pipeline``.
            output_dir: If provided, callbacks write ``best.h5`` and
                ``training_log.csv`` here, and ``history.json`` is dumped at
                the end.

        Returns:
            The contents of ``history.history`` with every value coerced to
            a built-in ``float`` where possible.
        """
        if not self._compiled:
            self.compile()

        callbacks = default_callbacks(self.config, output_dir=output_dir)
        log.info("fit_start", epochs=self.config.train.epochs)
        history = self.model.fit(
            train_dataset,
            epochs=self.config.train.epochs,
            validation_data=val_dataset,
            callbacks=callbacks,
        )

        # Logged metrics may be NumPy scalars, which ``json`` cannot encode;
        # normalise once so the returned dict and the file agree.
        history_dict = {
            metric: [self._as_float(value) for value in values]
            for metric, values in history.history.items()
        }

        # An empty "loss" list (e.g. zero epochs) must not raise IndexError.
        losses = history_dict.get("loss")
        log.info("fit_end", final_loss=losses[-1] if losses else None)

        if output_dir is not None:
            out = Path(output_dir)
            # Do not rely on the callbacks factory having created the directory.
            out.mkdir(parents=True, exist_ok=True)
            with (out / "history.json").open("w", encoding="utf-8") as f:
                json.dump(history_dict, f, indent=2)

        return history_dict
|
src/captioning/utils/__init__.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Utils — cross-cutting helpers used by every other sub-package.
|
| 2 |
+
|
| 3 |
+
Kept deliberately small. If a "util" grows past a single function, that's a
|
| 4 |
+
signal it belongs in its own package, not here.
|
| 5 |
+
|
| 6 |
+
logging.py structlog setup (JSON in prod, pretty in dev)
|
| 7 |
+
seed.py ``set_global_seed`` for reproducibility
|
| 8 |
+
hashing.py ``sha256_file`` for the paper-notebook freeze check
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from captioning.utils.hashing import sha256_file
|
| 12 |
+
from captioning.utils.logging import configure_logging, get_logger
|
| 13 |
+
from captioning.utils.seed import set_global_seed
|
| 14 |
+
|
| 15 |
+
__all__ = [
|
| 16 |
+
"configure_logging",
|
| 17 |
+
"get_logger",
|
| 18 |
+
"set_global_seed",
|
| 19 |
+
"sha256_file",
|
| 20 |
+
]
|
src/captioning/utils/hashing.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""File-hashing helper used by the paper-notebook freeze CI check."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import hashlib
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
_CHUNK = 64 * 1024


def sha256_file(path: str | Path) -> str:
    """Compute the SHA-256 hex digest of the file at ``path``.

    The file is opened in binary mode and fed to the hash in ``_CHUNK``-byte
    reads, so memory use stays bounded regardless of file size.

    Args:
        path: Location of the file, as a string or ``Path``.

    Returns:
        The lowercase hexadecimal digest string.
    """
    digest = hashlib.sha256()
    with Path(path).open("rb") as stream:
        # ``read`` returns b"" at end of file, which ends the iteration.
        for block in iter(lambda: stream.read(_CHUNK), b""):
            digest.update(block)
    return digest.hexdigest()
|
src/captioning/utils/logging.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Structured logging setup.
|
| 2 |
+
|
| 3 |
+
Why structlog instead of stdlib `logging`?
|
| 4 |
+
* Logs are *data*, not strings. structlog emits dicts that grafana/Datadog/
|
| 5 |
+
Better Stack can index without regex parsing.
|
| 6 |
+
* The same code path produces colourised pretty logs in dev and JSON logs
|
| 7 |
+
in prod, controlled by ``APP_ENV``. Grep the same fields in either mode.
|
| 8 |
+
* Bound context (request IDs, model versions) propagates automatically.
|
| 9 |
+
|
| 10 |
+
Usage:
|
| 11 |
+
>>> from captioning.utils.logging import configure_logging, get_logger
|
| 12 |
+
>>> configure_logging()
|
| 13 |
+
>>> log = get_logger(__name__)
|
| 14 |
+
>>> log.info("training started", epoch=1, batch_size=64)
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import logging
|
| 20 |
+
import os
|
| 21 |
+
import sys
|
| 22 |
+
from typing import Any
|
| 23 |
+
|
| 24 |
+
import structlog
|
| 25 |
+
|
| 26 |
+
_CONFIGURED = False
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _resolve_level(level: str | int | None) -> int:
|
| 30 |
+
"""Coerce a log-level argument (or env default) to a numeric level.
|
| 31 |
+
|
| 32 |
+
Why this helper exists:
|
| 33 |
+
``logging.getLevelName`` is *bidirectional* — it returns ``int`` for
|
| 34 |
+
known names and ``str`` for unknown ones (e.g. ``"Level FOO"``). That
|
| 35 |
+
union return type defeats type narrowing and would be passed straight
|
| 36 |
+
through to ``structlog.make_filtering_bound_logger``, which requires
|
| 37 |
+
``int``. We resolve once here, fall back to ``INFO`` on unknown
|
| 38 |
+
names, and return a guaranteed ``int``.
|
| 39 |
+
"""
|
| 40 |
+
if level is None:
|
| 41 |
+
level = os.environ.get("LOG_LEVEL", "INFO")
|
| 42 |
+
if isinstance(level, int):
|
| 43 |
+
return level
|
| 44 |
+
resolved = logging.getLevelName(level.upper())
|
| 45 |
+
return resolved if isinstance(resolved, int) else logging.INFO
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def configure_logging(level: str | int | None = None, json_logs: bool | None = None) -> None:
    """Set up stdlib logging and structlog once per process.

    The first call wins: once the module-level ``_CONFIGURED`` flag is set,
    later calls return immediately and their arguments are ignored.

    Args:
        level: Log level name (``"INFO"``) or numeric value. ``None`` defers
            to the ``LOG_LEVEL`` environment variable, then ``INFO``.
        json_logs: ``True`` renders JSON, ``False`` renders colourised
            console output. ``None`` selects JSON only when ``APP_ENV`` is
            ``production`` (case-insensitive).
    """
    global _CONFIGURED
    if _CONFIGURED:
        return

    numeric_level = _resolve_level(level)

    use_json = json_logs
    if use_json is None:
        app_env = os.environ.get("APP_ENV", "development")
        use_json = app_env.lower() == "production"

    # structlog renders the full line itself, so stdlib only echoes the message.
    logging.basicConfig(format="%(message)s", stream=sys.stdout, level=numeric_level)

    if use_json:
        final_renderer: Any = structlog.processors.JSONRenderer()
    else:
        final_renderer = structlog.dev.ConsoleRenderer(colors=True)

    # Order matters: enrichment processors run first, the renderer last.
    pipeline: list[Any] = [
        structlog.contextvars.merge_contextvars,
        structlog.stdlib.add_log_level,
        structlog.stdlib.add_logger_name,
        structlog.processors.TimeStamper(fmt="iso", utc=True),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        final_renderer,
    ]

    structlog.configure(
        processors=pipeline,
        wrapper_class=structlog.make_filtering_bound_logger(numeric_level),
        context_class=dict,
        logger_factory=structlog.stdlib.LoggerFactory(),
        cache_logger_on_first_use=True,
    )
    _CONFIGURED = True
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def get_logger(name: str | None = None) -> structlog.stdlib.BoundLogger:
    """Return a structlog logger for ``name`` (typically ``__name__``).

    If logging has not been configured yet, ``configure_logging()`` is
    invoked with its defaults first. Because that function only acts on its
    first call, an explicit ``configure_logging(...)`` made after the first
    ``get_logger`` call has no effect.
    """
    if _CONFIGURED:
        return structlog.get_logger(name)
    configure_logging()
    return structlog.get_logger(name)
|
src/captioning/utils/seed.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Reproducibility helpers.
|
| 2 |
+
|
| 3 |
+
Why this matters: the IEEE notebook's ``random.shuffle`` of image keys (cell 11)
|
| 4 |
+
is non-deterministic without a seed, which means the same code can produce a
|
| 5 |
+
different train/val split on every run — and therefore different BLEU. Pinning
|
| 6 |
+
the seed makes results reproducible across machines and dates.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import os
|
| 12 |
+
import random
|
| 13 |
+
from typing import TYPE_CHECKING
|
| 14 |
+
|
| 15 |
+
if TYPE_CHECKING: # pragma: no cover
|
| 16 |
+
pass
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def set_global_seed(seed: int) -> None:
    """Seed Python, NumPy, and (when installed) TensorFlow from one integer.

    The seed is validated before anything is touched, so an out-of-range
    value never leaves the process half-seeded.

    ``PYTHONHASHSEED`` is also exported. The interpreter reads that variable
    at start-up, so setting it here cannot change hash randomisation in the
    running process; it is only inherited by child processes.

    TensorFlow's deterministic-ops mode is deliberately not enabled here.

    Args:
        seed: Integer in ``[0, 2**32 - 1]``, the range ``np.random.seed``
            accepts.

    Raises:
        ValueError: If ``seed`` is negative or larger than ``2**32 - 1``.
    """
    max_seed = 2**32 - 1
    if seed < 0:
        raise ValueError(f"seed must be non-negative, got {seed}")
    if seed > max_seed:
        # np.random.seed would raise ValueError anyway, but only after the
        # environment and ``random`` had already been mutated.
        raise ValueError(f"seed must be at most {max_seed}, got {seed}")

    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)

    # Imported lazily so the utils package doesn't pull NumPy at import time
    # for unrelated callers (e.g. config validation).
    import numpy as np

    np.random.seed(seed)

    try:
        import tensorflow as tf

        tf.random.set_seed(seed)
        tf.keras.utils.set_random_seed(seed)
    except ImportError:  # pragma: no cover
        # TF is an optional dep at the *utility* layer; ML callers always have it.
        pass
|
tests/__init__.py
ADDED
|
File without changes
|
tests/conftest.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared pytest fixtures and config.
|
| 2 |
+
|
| 3 |
+
Keeping fixtures here (rather than per-test) is the standard pytest pattern
|
| 4 |
+
and makes `pytest --fixtures` discoverable for new contributors.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from collections.abc import Iterator
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
import pytest
|
| 13 |
+
|
| 14 |
+
from captioning.utils.seed import set_global_seed
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@pytest.fixture(autouse=True)
def _seed_everything() -> Iterator[None]:
    """Reset the global RNG seed to 42 ahead of every test.

    Autouse, so tests get it without requesting it; there is no teardown.
    """
    set_global_seed(seed=42)
    yield None
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@pytest.fixture
def tiny_caption_corpus() -> list[str]:
    """Provide five fixed, sentinel-wrapped captions for tokenizer tests."""
    captions = (
        "[start] a man on a surfboard [end]",
        "[start] a dog in the park [end]",
        "[start] two children playing with a ball [end]",
        "[start] a cat sitting on a chair [end]",
        "[start] a man riding a bike on the street [end]",
    )
    # A fresh list per test so one test's mutation cannot leak into another.
    return list(captions)
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@pytest.fixture
def tmp_artifacts_dir(tmp_path: Path) -> Path:
    """Path to an ``artifacts`` directory under pytest's per-test temp dir.

    Only the path is built; the directory itself is not created here.
    """
    return tmp_path.joinpath("artifacts")
|
tests/unit/__init__.py
ADDED
|
File without changes
|
tests/unit/test_caption_preprocessing.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for ``captioning.preprocessing.caption.preprocess_caption``.
|
| 2 |
+
|
| 3 |
+
The function is the cheapest possible thing to test thoroughly, and it's also
|
| 4 |
+
the hottest train/serve-skew risk: any divergence here changes both the
|
| 5 |
+
training vocabulary and the inference path.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import re
|
| 11 |
+
|
| 12 |
+
import pytest
|
| 13 |
+
|
| 14 |
+
from captioning.preprocessing.caption import (
|
| 15 |
+
END_TOKEN,
|
| 16 |
+
START_TOKEN,
|
| 17 |
+
preprocess_caption,
|
| 18 |
+
)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _notebook_baseline(text: str) -> str:
    """Verbatim notebook cell 3 for parity comparison.

    Kept statement-for-statement identical to the reference on purpose; the
    parity test compares ``preprocess_caption`` against this function.

    Args:
        text: Raw caption.

    Returns:
        The cleaned caption wrapped as ``"[start] ... [end]"``.
    """
    # Lower-case first so the vocabulary is case-insensitive.
    text = text.lower()
    # Drop every character that is neither a word character nor whitespace.
    text = re.sub(r"[^\w\s]", "", text)
    # Collapse any run of whitespace (spaces, tabs, newlines) to one space.
    text = re.sub(r"\s+", " ", text)
    text = text.strip()
    return "[start] " + text + " [end]"
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@pytest.mark.parametrize(
    "raw",
    [
        "A man riding a bike",
        "ALL CAPS ARE LOWERED",
        "punctuation, removed!",
        # NOTE(review): presumably meant to exercise runs of spaces — confirm
        # the literal in the repository contains consecutive spaces.
        " multiple spaces ",
        "Numbers 123 stay",
        "Tabs\tand\nnewlines",
        "",
    ],
)
def test_matches_notebook_baseline(raw: str) -> None:
    """``preprocess_caption`` must agree with the notebook reference."""
    expected = _notebook_baseline(raw)
    actual = preprocess_caption(raw)
    assert actual == expected
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def test_wraps_in_sentinels() -> None:
    """Output starts with the start token and ends with the end token."""
    wrapped = preprocess_caption("hello world")
    expected_prefix = START_TOKEN + " "
    expected_suffix = " " + END_TOKEN
    assert wrapped.startswith(expected_prefix)
    assert wrapped.endswith(expected_suffix)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def test_idempotent_on_already_clean() -> None:
    """Already-clean input passes through unchanged apart from the sentinels."""
    clean = "a man riding a bike"
    processed = preprocess_caption(clean)
    # Peel the sentinels off; what remains should be the input itself.
    without_start = processed.removeprefix(f"{START_TOKEN} ")
    inner = without_start.removesuffix(f" {END_TOKEN}")
    assert inner == clean
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def test_strips_emoji_and_unicode_punct() -> None:
    """Emoji and punctuation are removed; this pins the current behaviour.

    Both are non-word, non-whitespace characters, so they are dropped, and
    the whitespace left behind collapses to a single space.
    """
    processed = preprocess_caption("hello 😀 world!")
    without_start = processed.removeprefix(f"{START_TOKEN} ")
    inner = without_start.removesuffix(f" {END_TOKEN}")
    assert inner == "hello world"
|
tests/unit/test_config.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the Pydantic config schema and YAML loader."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
import pytest
|
| 8 |
+
from pydantic import ValidationError
|
| 9 |
+
|
| 10 |
+
from captioning.config.loader import load_config
|
| 11 |
+
from captioning.config.schema import AppConfig, DataConfig, ModelConfig, TrainConfig
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def test_defaults_match_notebook_hyperparams() -> None:
    """Pin the default config to the notebook's hyperparameters.

    An accidental change to any default makes this test fail loudly.
    """
    cfg = AppConfig()
    model, train, data = cfg.model, cfg.train, cfg.data

    # Model section
    assert model.embedding_dim == 512
    assert model.units == 512
    assert model.max_length == 40
    assert model.vocabulary_size == 15_000
    assert model.encoder_num_heads == 1
    assert model.decoder_num_heads == 8

    # Training section
    assert train.epochs == 10
    assert train.batch_size == 64
    assert train.buffer_size == 1_000
    assert train.early_stopping_patience == 3

    # Data section
    assert data.sample_size == 120_000
    assert data.train_val_split == 0.8
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def test_split_validation_rejects_invalid_fractions() -> None:
    """``train_val_split`` values of 0.0, 1.0 and 1.5 are all rejected."""
    for bad_fraction in (0.0, 1.0, 1.5):
        with pytest.raises(ValidationError):
            DataConfig(train_val_split=bad_fraction)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def test_extra_keys_rejected() -> None:
    """An unknown key inside a section fails validation at construction.

    This is what ``extra="forbid"`` buys: typos surface at load time rather
    than at training time.
    """
    section_with_typo = {"embedding_dim": 512, "tpyo": True}
    with pytest.raises(ValidationError):
        AppConfig(model=section_with_typo)  # type: ignore[arg-type]
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def test_env_override(monkeypatch: pytest.MonkeyPatch) -> None:
    """``CAPTIONING__TRAIN__BATCH_SIZE`` overrides the default batch size."""
    monkeypatch.setenv("CAPTIONING__TRAIN__BATCH_SIZE", "32")
    overridden = AppConfig().train.batch_size
    assert overridden == 32
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def test_load_config_yaml(tmp_path: Path) -> None:
    """Values in a YAML file override defaults; omitted fields keep them."""
    # NOTE(review): the nesting inside this literal was reconstructed from
    # the assertions below — confirm it against the repository copy.
    yaml_text = """
    data:
      sample_size: 1000
    model:
      embedding_dim: 256
    train:
      epochs: 2
      batch_size: 8
    """
    config_file = tmp_path / "test.yaml"
    config_file.write_text(yaml_text, encoding="utf-8")

    loaded = load_config(config_file)

    assert loaded.data.sample_size == 1000
    assert loaded.model.embedding_dim == 256
    assert loaded.train.epochs == 2
    # Unspecified fields take defaults
    assert loaded.model.max_length == 40
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def test_load_config_missing_file(tmp_path: Path) -> None:
    """Loading a path that does not exist raises ``FileNotFoundError``."""
    missing = tmp_path / "does-not-exist.yaml"
    with pytest.raises(FileNotFoundError):
        load_config(missing)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def test_train_seed_default_is_42() -> None:
    """The default training seed is 42 (the notebook itself did not seed)."""
    default_seed = TrainConfig().seed
    assert default_seed == 42
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def test_modelconfig_independent_of_other_sections() -> None:
    """``ModelConfig`` can be built on its own, without an ``AppConfig``."""
    overrides = {"embedding_dim": 128, "vocabulary_size": 500}
    model_cfg = ModelConfig(**overrides)
    assert model_cfg.embedding_dim == 128
    assert model_cfg.vocabulary_size == 500
    # Defaults preserved
    assert model_cfg.max_length == 40
|
tests/unit/test_evaluation.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Smoke tests for the BLEU evaluator.
|
| 2 |
+
|
| 3 |
+
We don't validate sacrebleu's correctness here — that's its own test suite.
|
| 4 |
+
We *do* validate our adapter: parallel-list shape handling, ragged references,
|
| 5 |
+
and that perfect predictions score 100.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import pytest
|
| 11 |
+
|
| 12 |
+
sacrebleu = pytest.importorskip("sacrebleu")
|
| 13 |
+
|
| 14 |
+
from captioning.evaluation.bleu import corpus_bleu_score # noqa: E402
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def test_perfect_predictions_score_100() -> None:
    """Predictions identical to their single reference score 100 BLEU."""
    sentences = ["a man riding a bike", "a dog in the park"]
    references = [[sentence] for sentence in sentences]
    score = corpus_bleu_score(sentences, references)
    assert score == pytest.approx(100.0)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def test_completely_wrong_predictions_score_low() -> None:
    """Predictions sharing no words with the references score below 5."""
    references = [["a man riding a bike"], ["a dog in the park"]]
    predictions = ["xyz qrs", "abc def"]
    result = corpus_bleu_score(predictions, references)
    assert result >= 0.0
    assert result < 5.0
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def test_ragged_references_supported() -> None:
    """Items may carry different numbers of references (here 3 and 1)."""
    bike_references = ["a man riding a bike", "a person on a bicycle", "someone biking"]
    park_references = ["a dog in the park"]
    predictions = ["a man riding a bike", "a dog in the park"]
    result = corpus_bleu_score(predictions, [bike_references, park_references])
    assert result > 50.0
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def test_length_mismatch_raises() -> None:
    """Two predictions against one reference list raise ``ValueError``."""
    predictions = ["a", "b"]
    references = [["a"]]
    with pytest.raises(ValueError):
        corpus_bleu_score(predictions, references)
|
tests/unit/test_hashing.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for ``captioning.utils.hashing.sha256_file``."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import hashlib
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from captioning.utils.hashing import sha256_file
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def test_matches_oneshot_hash(tmp_path: Path) -> None:
    """The streamed digest equals ``hashlib.sha256`` over the whole payload."""
    payload = b"hello world\n" * 1000
    blob = tmp_path / "blob.bin"
    blob.write_bytes(payload)
    expected = hashlib.sha256(payload).hexdigest()
    assert sha256_file(blob) == expected
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def test_handles_empty_file(tmp_path: Path) -> None:
    """A zero-byte file hashes to the SHA-256 of the empty byte string."""
    empty_file = tmp_path / "empty.bin"
    empty_file.touch()
    expected = hashlib.sha256(b"").hexdigest()
    assert sha256_file(empty_file) == expected
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def test_handles_large_file(tmp_path: Path) -> None:
    """A 256 KB file, larger than one 64 KB chunk, hashes correctly."""
    payload = b"x" * (256 * 1024)  # 256 KB
    large_file = tmp_path / "large.bin"
    large_file.write_bytes(payload)
    expected = hashlib.sha256(payload).hexdigest()
    assert sha256_file(large_file) == expected
|
tests/unit/test_image_preprocessing.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for ``captioning.preprocessing.image``.
|
| 2 |
+
|
| 3 |
+
TF-dependent; auto-skipped if TF is unavailable.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import pytest
|
| 9 |
+
|
| 10 |
+
tf = pytest.importorskip("tensorflow")
|
| 11 |
+
|
| 12 |
+
from captioning.preprocessing.image import ( # noqa: E402
|
| 13 |
+
INCEPTION_INPUT_SIZE,
|
| 14 |
+
preprocess_image_tensor,
|
| 15 |
+
)
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def test_output_shape() -> None:
    """A 480x640 RGB image is resized to the square Inception input size."""
    raw = tf.random.uniform((480, 640, 3), minval=0, maxval=255, dtype=tf.int32)
    image = tf.cast(raw, tf.uint8)
    result = preprocess_image_tensor(image)
    expected_shape = (INCEPTION_INPUT_SIZE, INCEPTION_INPUT_SIZE, 3)
    assert tuple(result.shape) == expected_shape
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def test_output_in_inception_range() -> None:
    """Preprocessed pixel values lie within [-1, 1], allowing 1e-6 slack."""
    raw = tf.random.uniform((300, 300, 3), 0, 255, dtype=tf.int32)
    image = tf.cast(raw, tf.uint8)
    result = preprocess_image_tensor(image)
    tolerance = 1e-6
    assert float(tf.reduce_min(result)) >= -1.0 - tolerance
    assert float(tf.reduce_max(result)) <= 1.0 + tolerance
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def test_deterministic_on_same_input() -> None:
    """Preprocessing the same tensor twice gives element-wise equal output."""
    raw = tf.random.uniform((400, 500, 3), 0, 255, dtype=tf.int32)
    image = tf.cast(raw, tf.uint8)
    first = preprocess_image_tensor(image)
    second = preprocess_image_tensor(image)
    assert tf.reduce_all(tf.equal(first, second))
|