Spaces:

apoorvrajdev
/

image-captioning-api

Configuration error

App Files Files Community

image-captioning-api / scripts /train.py

apoorvrajdev

feat: finalize Phase 1 modular ML architecture

3a2e5f0 27 days ago

raw

history blame contribute delete

3.54 kB

	"""Train the IEEE InceptionV3+Transformer captioning model.

	Usage:
	python -m scripts.train --config configs/base.yaml
	python -m scripts.train --config configs/base.yaml --output-dir models/v1.0.0

	The script orchestrates the same pipeline as the notebook, but each step is
	imported from the modular package — making it the canonical example of how
	the package is meant to be composed.
	"""

	from __future__ import annotations

	from pathlib import Path

	import click

	from captioning.config import load_config
	from captioning.data import (
	build_train_pipeline,
	build_val_pipeline,
	load_coco_annotations,
	make_image_level_splits,
	)
	from captioning.models import build_caption_model
	from captioning.preprocessing import CaptionTokenizer, preprocess_caption
	from captioning.training import Trainer
	from captioning.utils import configure_logging, get_logger, set_global_seed

	# Module-level logger; main() emits its events through it as an event name
	# plus keyword fields (e.g. "config_loaded", "training_done").
	log = get_logger(__name__)


	@click.command()
	@click.option(
	    "--config",
	    "config_path",
	    type=click.Path(exists=True, dir_okay=False, path_type=Path),
	    required=True,
	    help="YAML config file (e.g. configs/base.yaml).",
	)
	@click.option(
	    "--output-dir",
	    default="outputs/runs/latest",
	    type=click.Path(path_type=Path),
	    help="Where to save weights, vocab, and history.",
	)
	def main(config_path: Path, output_dir: Path) -> None:
	    """Run the full training pipeline end-to-end."""
	    # NOTE: the docstring above stays a single line on purpose -- click
	    # renders a command's docstring as its ``--help`` text.
	    #
	    # Stages, in order: load captions -> fit/save tokenizer -> split by
	    # image -> build datasets -> train -> save final weights.
	    #
	    # Args:
	    #     config_path: Existing YAML config file (validated by click).
	    #     output_dir: Run directory; created here if it does not exist.
	    configure_logging()
	    config = load_config(config_path)
	    output_dir.mkdir(parents=True, exist_ok=True)

	    # One seed drives global seeding, caption loading, and the split.
	    train_cfg = config.train
	    seed = train_cfg.seed
	    set_global_seed(seed)
	    log.info("config_loaded", path=str(config_path), output_dir=str(output_dir))

	    # --- Stage 1: load COCO captions, preprocessing each caption ----------
	    data_cfg = config.data
	    captions_df = load_coco_annotations(
	        base_path=data_cfg.base_path,
	        annotations_filename=data_cfg.annotations_filename,
	        images_subdir=data_cfg.images_subdir,
	        sample_size=data_cfg.sample_size,
	        seed=seed,
	        caption_preprocessor=preprocess_caption,
	    )

	    # --- Stage 2: fit the tokenizer on all captions, save it to the run dir
	    model_cfg = config.model
	    tokenizer = CaptionTokenizer(
	        vocab_size=model_cfg.vocabulary_size,
	        max_length=model_cfg.max_length,
	    )
	    tokenizer.fit(captions_df["caption"])
	    tokenizer.save(output_dir)

	    # --- Stage 3: train/val split made at the image level ------------------
	    split = make_image_level_splits(
	        captions_df,
	        train_fraction=data_cfg.train_val_split,
	        seed=seed,
	    )
	    train_images, train_captions, val_images, val_captions = split

	    # --- Stage 4: tf.data pipelines (same loader settings for both) --------
	    loader_opts = {
	        "batch_size": train_cfg.batch_size,
	        "buffer_size": train_cfg.buffer_size,
	    }
	    train_ds = build_train_pipeline(
	        train_images, train_captions, tokenizer, **loader_opts
	    )
	    val_ds = build_val_pipeline(
	        val_images, val_captions, tokenizer, **loader_opts
	    )

	    # --- Stage 5: build the model and train it -----------------------------
	    model = build_caption_model(config, vocab_size=tokenizer.vocabulary_size)
	    Trainer(model, config).fit(train_ds, val_ds, output_dir=output_dir)

	    # --- Stage 6: write final weights under the configured filename --------
	    weights_path = output_dir / train_cfg.weights_filename
	    model.save_weights(str(weights_path))
	    log.info("training_done", weights=str(weights_path))


	# Script entry point: invoke the click command when this module is executed
	# directly (e.g. ``python -m scripts.train``) rather than imported.
	if __name__ == "__main__":
	    main()