# image-captioning-api/configs/train/stabilized.yaml
# apoorvrajdev's picture
# feat(evaluation): add beam search, metrics pipeline, and stabilized training workflow
# 91a1214
# =============================================================================
# configs/train/stabilized.yaml — first experimental run with the opt-in
# training-stability primitives turned on.
# -----------------------------------------------------------------------------
# Identical to configs/base.yaml except for the four flags called out in the
# `train:` section below. Every other field mirrors the IEEE notebook verbatim
# so this run is comparable to the baseline at the same seed and architecture.
#
# Why a complete config (not a thin override)?
# scripts/train.py only accepts --config; there is no --override merge mode
# in the CLI (the README mentions one but it's aspirational, not implemented).
# Duplicating the values here is the smallest correct change that keeps
# base.yaml itself untouched — which was the explicit requirement for this
# experiment phase.
#
# Usage:
#   python -m scripts.train --config configs/train/stabilized.yaml \
#     --output-dir outputs/runs/stabilized
#
# Compare against the baseline by training the same code twice — once with
# configs/base.yaml, once with this file — and diffing the resulting
# results/<run_id>/metrics.json files.
# =============================================================================
# Dataset location and sampling. Keys are nested under `data:` so they load
# as one section instead of being hoisted to the document root.
data:
  # Paths below are presumably resolved by the training script relative to
  # the working directory — confirm against scripts/train.py.
  base_path: data/coco2017
  annotations_filename: captions_train2017.json
  images_subdir: train2017
  sample_size: 120000  # Same sample as base.yaml — comparability matters
  # NOTE(review): assumed to be the training fraction (0.8 train / 0.2 val);
  # verify against the data loader.
  train_val_split: 0.8
# Architecture hyper-parameters. Per the file header these are meant to match
# configs/base.yaml so the run stays comparable to the baseline; keys are
# nested under `model:` rather than left at the document root.
model:
  embedding_dim: 512
  units: 512
  max_length: 40
  vocabulary_size: 15000
  # Attention head counts for the encoder and decoder respectively.
  encoder_num_heads: 1
  decoder_num_heads: 8
  # Dropout rates; where each one is applied inside the decoder is defined by
  # the model code, not by this file.
  decoder_dropout_inner: 0.3
  decoder_dropout_outer: 0.5
  decoder_attention_dropout: 0.1
# Training-loop settings. The first group of keys is plain run configuration;
# the second group holds the opt-in stability settings this experiment is
# testing. All keys are nested under `train:` so the section loads as a
# mapping instead of a null value followed by root-level keys.
train:
  epochs: 10
  batch_size: 64
  buffer_size: 1000
  early_stopping_patience: 3
  seed: 42
  learning_rate: 0.001
  weights_filename: model.h5

  # ---- the four flags this experiment is actually testing -------------------
  # Label smoothing 0.1 softens the cross-entropy target so the decoder
  # cannot collapse onto a handful of high-frequency tokens. Standard
  # transformer captioning recipe (BLIP, ViT-GPT2, GIT all use it).
  label_smoothing: 0.1

  # Warmup + cosine decay replaces the bare constant Adam LR. Transformers
  # trained from scratch with no warmup tend to settle into a "safe captions"
  # basin where every output looks like "a man standing ...". Cosine decay
  # then anneals smoothly toward min_learning_rate.
  lr_schedule: cosine
  warmup_steps: 500  # ~1/3 of an epoch at batch 64, sample 120k
  cosine_decay_steps: null  # null -> trainer derives from steps_per_epoch * epochs
  min_learning_rate: 0.0

  # Restore conventional behaviour: dropout OFF during validation, accuracy
  # tracker weighted by token count. This gives a clean val_loss signal so
  # EarlyStopping fires on a real plateau rather than on dropout noise.
  honour_training_flag_in_test_step: true
# Serving and decoding defaults. Keys are nested under `serve:` and the CORS
# origin list is indented beneath its key so the section loads as one mapping.
serve:
  max_upload_bytes: 10485760  # 10 MiB (10 * 1024 * 1024)
  decode_strategy: greedy  # Decode strategy is selected at evaluate time
  beam_width: 4  # Stored defaults for `scripts.evaluate --decode-strategy beam`
  length_penalty: 0.7
  repetition_penalty: 1.0
  no_repeat_ngram_size: 3
  # Local development origins only (localhost / 127.0.0.1).
  cors_allowed_origins:
    - http://localhost:3000
    - http://localhost:5173
    - http://localhost:5174
    - http://127.0.0.1:5173
    - http://127.0.0.1:5174