Spaces:

apoorvrajdev
/

image-captioning-api

Configuration error

App Files Files Community

image-captioning-api / configs /train /stabilized.yaml

apoorvrajdev

feat(evaluation): add beam search, metrics pipeline, and stabilized training workflow

91a1214 21 days ago

raw

history blame contribute delete

3.4 kB

	# =============================================================================
	# configs/train/stabilized.yaml — first experimental run with the opt-in
	# training-stability primitives turned on.
	# -----------------------------------------------------------------------------
	# Identical to configs/base.yaml except for the four flags called out in the
	# `train:` section below. Every other field mirrors the IEEE notebook verbatim
	# so this run is comparable to the baseline at the same seed and architecture.
	#
	# Why a complete config (not a thin override)?
	# scripts/train.py only accepts --config; there is no --override merge mode
	# in the CLI (the README mentions one but it's aspirational, not implemented).
	# Duplicating the values here is the smallest correct change that keeps
	# base.yaml itself untouched — which was the explicit requirement for this
	# experiment phase.
	#
	# Usage:
	# python -m scripts.train --config configs/train/stabilized.yaml \
	# --output-dir outputs/runs/stabilized
	#
	# Compare against the baseline by training the same code twice — once with
	# configs/base.yaml, once with this file — and diffing the resulting
	# results/<run_id>/metrics.json files.
	# =============================================================================

	# Dataset location and sampling. Child keys are nested under `data:` with
	# two-space indentation so they load as one `data` mapping rather than as
	# stray top-level keys.
	data:
	  base_path: data/coco2017
	  annotations_filename: captions_train2017.json
	  images_subdir: train2017
	  sample_size: 120000  # Same sample as base.yaml — comparability matters
	  train_val_split: 0.8

	# Model hyperparameters. Child keys are nested under `model:` with
	# two-space indentation so they load as one `model` mapping. Per the file
	# header, these are meant to match the baseline config unchanged.
	model:
	  embedding_dim: 512
	  units: 512
	  max_length: 40
	  vocabulary_size: 15000
	  encoder_num_heads: 1
	  decoder_num_heads: 8
	  decoder_dropout_inner: 0.3
	  decoder_dropout_outer: 0.5
	  decoder_attention_dropout: 0.1

	# Training settings. Child keys (and the comments describing them) are
	# nested under `train:` with two-space indentation so they load as one
	# `train` mapping.
	train:
	  epochs: 10
	  batch_size: 64
	  buffer_size: 1000
	  early_stopping_patience: 3
	  seed: 42
	  learning_rate: 0.001
	  weights_filename: model.h5

	  # ---- the four flags this experiment is actually testing -----------------
	  # Label smoothing 0.1 softens the cross-entropy target so the decoder
	  # cannot collapse onto a handful of high-frequency tokens. Standard
	  # transformer captioning recipe (BLIP, ViT-GPT2, GIT all use it).
	  label_smoothing: 0.1

	  # Warmup + cosine decay replaces the bare constant Adam LR. Transformers
	  # trained from scratch with no warmup tend to settle into a "safe captions"
	  # basin where every output looks like "a man standing ...". Cosine decay
	  # then anneals smoothly toward min_learning_rate.
	  lr_schedule: cosine
	  warmup_steps: 500  # ~1/3 of an epoch at batch 64, sample 120k
	  cosine_decay_steps: null  # null -> trainer derives from steps_per_epoch * epochs
	  min_learning_rate: 0.0

	  # Restore conventional behaviour: dropout OFF during validation, accuracy
	  # tracker weighted by token count. This gives a clean val_loss signal so
	  # EarlyStopping fires on a real plateau rather than on dropout noise.
	  honour_training_flag_in_test_step: true

	# Serving / decoding defaults. Child keys are nested under `serve:` with
	# two-space indentation, and the CORS origins are nested under their own
	# key so they load as a list value of `cors_allowed_origins`.
	serve:
	  max_upload_bytes: 10485760  # 10 * 1024 * 1024 bytes
	  decode_strategy: greedy  # Decode strategy is selected at evaluate time
	  beam_width: 4  # Stored defaults for `scripts.evaluate --decode-strategy beam`
	  length_penalty: 0.7
	  repetition_penalty: 1.0
	  no_repeat_ngram_size: 3
	  cors_allowed_origins:
	    - http://localhost:3000
	    - http://localhost:5173
	    - http://localhost:5174
	    - http://127.0.0.1:5173
	    - http://127.0.0.1:5174