Spaces:

apoorvrajdev
/

image-captioning-api

Configuration error

App Files Files Community

apoorvrajdev committed 27 days ago

Commit

b2594db

1 Parent(s): 430abdc

feat: bootstrap production-grade ML repository tooling

Browse files

Files changed (17) hide show

.env.example +48 -0
.gitignore +102 -12
.pre-commit-config.yaml +98 -0
.python-version +1 -0
LICENSE +21 -0
Makefile +177 -0
README.md +37 -37
docs/PHASE_0_NOTES.md +184 -0
docs/restructure-plan.md +199 -0
notebooks/01_ieee_inceptionv3_transformer.ipynb +786 -0
notebooks/README.md +81 -0
notebooks/image-captionin-using-dl.ipynb +0 -0
pyproject.toml +242 -0
requirements-dev.txt +33 -0
requirements-eval.txt +18 -0
requirements.txt +45 -0
src/captioning/__init__.py +0 -0

.env.example ADDED Viewed

	@@ -0,0 +1,48 @@

+# =============================================================================
+# .env.example — schema for environment variables.
+# -----------------------------------------------------------------------------
+# Copy this file to `.env` (which is gitignored) and fill in real values.
+# `pydantic-settings` automatically reads `.env` at startup and validates each
+# field. Variables prefixed CAPTIONING__ override nested config keys (see
+# `src/captioning/config/schema.py`); double underscore is the nesting delimiter.
+#
+# Example: CAPTIONING__TRAIN__BATCH_SIZE=32 overrides AppConfig.train.batch_size.
+# =============================================================================
+# ---- App-wide ----------------------------------------------------------------
+APP_ENV=development                          # development | staging | production
+LOG_LEVEL=INFO                               # DEBUG | INFO | WARNING | ERROR
+# ---- Backend (FastAPI) -------------------------------------------------------
+PORT=8000
+# Directory where weights/vocab are downloaded at startup. Empty in the image
+# layer; populated by `huggingface_hub.snapshot_download`. Use a writable path.
+MODEL_DIR=./models/cache
+MAX_UPLOAD_BYTES=10485760                    # 10 MB; rejects oversized images
+# Comma-separated list of allowed origins for CORS. In production, the Vercel
+# frontend URL only. NEVER use "*" in prod.
+CORS_ALLOWED_ORIGINS=http://localhost:3000,https://your-frontend.vercel.app
+# ---- HuggingFace Hub (model artefact storage) --------------------------------
+# Public model repo holding the trained weights, vocab.pkl, config.yaml.
+HF_REPO_ID=your-username/captioning-weights
+HF_REVISION=v1.0.0                           # Pin a specific tag for reproducibility
+# Optional: only needed for private repos or higher rate limits.
+# Generate at https://huggingface.co/settings/tokens (read-only is enough).
+HF_TOKEN=
+# ---- Experiment tracking (MLflow) --------------------------------------------
+# Local SQLite during dev; DagsHub URL in production.
+MLFLOW_TRACKING_URI=sqlite:///mlruns/mlflow.db
+# MLFLOW_TRACKING_URI=https://dagshub.com/your-username/captioning.mlflow
+# MLFLOW_TRACKING_USERNAME=your-username
+# MLFLOW_TRACKING_PASSWORD=your-dagshub-access-token
+MLFLOW_EXPERIMENT_NAME=captioning
+# ---- Observability (Phase 4) -------------------------------------------------
+# Sentry: error tracking. Free tier = 5k errors/mo. Get a DSN at sentry.io.
+# SENTRY_DSN=
+# SENTRY_TRACES_SAMPLE_RATE=0.1                # 10% of requests traced
+# ---- Frontend (Next.js) — also in frontend/.env.local ------------------------
+# NEXT_PUBLIC_API_URL=http://localhost:8000    # Backend base URL for client fetches

.gitignore CHANGED Viewed

@@ -1,30 +1,120 @@
-# Python
 __pycache__/
 *.py[cod]
-# Virtual environments
 venv/
 .venv/
-# Jupyter
 .ipynb_checkpoints/
-# Environment variables
-.env
-# ML Models
 *.h5
 *.pt
 *.pth
 *.onnx
-# Node
 node_modules/
-# Build
-dist/
-build/
-# OS
 .DS_Store
-Thumbs.db

+# =============================================================================
+# .gitignore — what NOT to commit
+# -----------------------------------------------------------------------------
+# Each block below is grouped by *why* the files are excluded so future readers
+# (and recruiters) understand the engineering rationale, not just the patterns.
+# =============================================================================
+# ---- Python bytecode / packaging ---------------------------------------------
+# Compiled artefacts. Regenerated automatically on every run.
 __pycache__/
 *.py[cod]
+*.egg-info/
+*.egg
+.eggs/
+build/
+dist/
+pip-wheel-metadata/
+# ---- Virtual environments ----------------------------------------------------
+# Per-developer; pinning is done via requirements.txt + .python-version.
 venv/
 .venv/
+env/
+.env-tf/
+# ---- Python tooling caches ---------------------------------------------------
+# Speed up local runs; nothing portable. Caches are recreated by the tools.
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+.tox/
+.coverage
+.coverage.*
+htmlcov/
+coverage.xml
+.nox/
+.hypothesis/
+# ---- Jupyter / notebooks -----------------------------------------------------
+# Checkpoints are autosaves; outputs are stripped by `nbstripout` pre-commit
+# so notebook diffs stay reviewable.
 .ipynb_checkpoints/
+*.ipynb_checkpoints
+# ---- ML / experiment tracking ------------------------------------------------
+# MLflow's local store, model artefacts, training run dumps. These are large
+# and should live in a model registry (HuggingFace Hub) or experiment-tracking
+# server (DagsHub MLflow), not in Git.
+mlruns/
+mlartifacts/
+outputs/
+runs/
+wandb/
+lightning_logs/
+# ---- Model weights / serialised artefacts ------------------------------------
+# Large binaries — published via HuggingFace Hub, not Git.
 *.h5
+*.keras
 *.pt
 *.pth
+*.ckpt
 *.onnx
+*.tflite
+*.pb
+*.savedmodel/
+*.safetensors
+# ---- Tokenizer / vocabulary artefacts ----------------------------------------
+# Pickles can carry RCE risk if blindly loaded from untrusted sources.
+# Vocabularies are versioned alongside their model in models/<version>/.
+*.pkl
+vocab_*.file
+sentencepiece.model
+# ---- Datasets ----------------------------------------------------------------
+# COCO is downloaded by `scripts/prepare_data.py`; never committed.
+data/
+datasets/
+*.tfrecord
+*.tfrecords
+# ---- Environment / secrets ---------------------------------------------------
+# `.env.example` is committed as the schema; `.env` never is.
+.env
+.env.local
+.env.*.local
+!.env.example
+# ---- Node / frontend ---------------------------------------------------------
 node_modules/
+.next/
+.turbo/
+.vercel/
+out/
+*.tsbuildinfo
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+# ---- Docker / build ----------------------------------------------------------
+.docker/
+# ---- Editors / IDEs ----------------------------------------------------------
+# Per-developer settings. The whole `.vscode/` directory is ignored, so any
+# workspace-shared `.vscode/*.json` must be committed explicitly with
+# `git add -f`; anything else stays local.
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# ---- OS noise ----------------------------------------------------------------
 .DS_Store
+Thumbs.db
+desktop.ini
+# ---- Claude / AI tooling -----------------------------------------------------
+# Local Claude Code session state. Contains user-specific settings.
+.claude/

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,98 @@

+# =============================================================================
+# .pre-commit-config.yaml — automated checks that run on `git commit`.
+# -----------------------------------------------------------------------------
+# Why pre-commit hooks?
+#   They make broken commits *physically impossible* — failed checks abort the
+#   commit. This catches lint/type/secret issues at the lowest-cost moment
+#   (before they enter history) and is what serious teams expect.
+#
+# Setup (one-time, per developer):
+#   pip install pre-commit
+#   pre-commit install                # registers the hooks in .git/hooks/
+#   pre-commit run --all-files        # run once over the whole repo
+#
+# After setup, hooks run automatically on every `git commit`. To bypass them
+# in an emergency: `git commit --no-verify` (do not commit this habit).
+# =============================================================================
+# Run hooks against staged files only by default (faster). The CI workflow
+# runs `pre-commit run --all-files` to catch anything missed locally.
+default_install_hook_types: [pre-commit]
+default_stages: [pre-commit]
+fail_fast: false                              # Show ALL failures, not just first
+repos:
+  # ---------------------------------------------------------------------------
+  # General hygiene: whitespace, line endings, accidentally-committed binaries.
+  # ---------------------------------------------------------------------------
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.6.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: mixed-line-ending
+        args: [--fix=lf]                      # Force LF; CRLF is a Windows trap
+      - id: check-yaml
+        exclude: ^(\.github/workflows/.*\.yml)$  # Some YAML uses GHA syntax
+      - id: check-toml
+      - id: check-merge-conflict
+      - id: check-added-large-files
+        args: [--maxkb=5000]                  # Reject >5MB blobs (use HF Hub)
+      - id: check-case-conflict
+      - id: detect-private-key
+  # ---------------------------------------------------------------------------
+  # Ruff: Python lint + format. Replaces black + isort + flake8 with one tool.
+  # Reads config from pyproject.toml so behaviour is identical here and in CI.
+  # ---------------------------------------------------------------------------
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.5.0
+    hooks:
+      - id: ruff
+        args: [--fix]                         # Auto-fix what's safely fixable
+      - id: ruff-format
+  # ---------------------------------------------------------------------------
+  # mypy: static type checking. Limited to package code so notebooks/scripts
+  # don't gate commits.
+  # ---------------------------------------------------------------------------
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.10.1
+    hooks:
+      - id: mypy
+        files: ^(src/captioning|backend/app)/
+        additional_dependencies:
+          - pydantic>=2.7
+          - pydantic-settings>=2.3
+          - types-PyYAML
+          - types-requests
+  # ---------------------------------------------------------------------------
+  # nbstripout: strips outputs from .ipynb files on commit.
+  # Why: notebook outputs include large base64-encoded images and run state,
+  # which makes diffs unreadable and can leak data. Outputs are a *render*
+  # of the code, not source — they belong in CI artefacts, not Git history.
+  # ---------------------------------------------------------------------------
+  - repo: https://github.com/kynan/nbstripout
+    rev: 0.7.1
+    hooks:
+      - id: nbstripout
+  # ---------------------------------------------------------------------------
+  # Prettier: format frontend (.ts, .tsx, .js, .jsx, .json, .md, .css). Limited
+  # to the frontend/ subtree to avoid stepping on Markdown owned by docs writers.
+  # ---------------------------------------------------------------------------
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: v4.0.0-alpha.8
+    hooks:
+      - id: prettier
+        files: ^frontend/.*\.(ts|tsx|js|jsx|json|md|css)$
+  # ---------------------------------------------------------------------------
+  # gitleaks: scans for accidentally committed credentials (API keys, tokens,
+  # private keys). Catches mistakes BEFORE they hit a public remote.
+  # ---------------------------------------------------------------------------
+  - repo: https://github.com/gitleaks/gitleaks
+    rev: v8.18.4
+    hooks:
+      - id: gitleaks

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.10

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 Apoorv Raj
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

Makefile ADDED Viewed

	@@ -0,0 +1,177 @@

+# =============================================================================
+# Makefile — common project commands.
+# -----------------------------------------------------------------------------
+# Why a Makefile when the team uses Windows + PowerShell?
+#   1. CI (Linux) runs these targets directly.
+#   2. The file is the canonical, discoverable command index — `make help`
+#      tells a new contributor (or a recruiter cloning the repo) the entire
+#      development workflow in one screen.
+#   3. Windows users can install Make via `winget install GnuWin32.Make`,
+#      use Git Bash, WSL, or just read the recipe lines under each target and
+#      run the underlying commands in PowerShell directly.
+#
+# Conventions:
+#   - `.PHONY` declares targets that don't produce a same-named file.
+#   - Target naming: `verb-noun` (e.g. `docker-build`, not `build_docker`).
+#   - Each target is annotated with a one-line `## description` comment that
+#     `make help` parses and prints automatically.
+# =============================================================================
+# Default Python interpreter. Override on Windows: `make PYTHON=py install`.
+PYTHON ?= python
+PIP    ?= $(PYTHON) -m pip
+NPM    ?= npm
+# Directories
+SRC_DIR      := src/captioning
+BACKEND_DIR  := backend
+FRONTEND_DIR := frontend
+TESTS_DIR    := tests
+NOTEBOOK_FROZEN := notebooks/01_ieee_inceptionv3_transformer.ipynb
+# ---- Default goal: show available targets -----------------------------------
+.DEFAULT_GOAL := help
+.PHONY: help
+help: ## Show this help message
+	@echo "Image Captioning System — available commands"
+	@echo ""
+	@grep -E '^[a-zA-Z_-]+:.*?## ' $(MAKEFILE_LIST) | \
+		awk 'BEGIN {FS = ":.*?## "} {printf "  \033[36m%-22s\033[0m %s\n", $$1, $$2}'
+	@echo ""
+# =============================================================================
+# Install / setup
+# =============================================================================
+.PHONY: install
+install: ## Install runtime dependencies only (slim, for Docker parity)
+	$(PIP) install --upgrade pip
+	$(PIP) install -r requirements.txt
+.PHONY: install-dev
+install-dev: ## Install runtime + dev + eval extras + the captioning package (editable)
+	$(PIP) install --upgrade pip
+	$(PIP) install -r requirements-dev.txt -r requirements-eval.txt
+	$(PIP) install -e ".[hf,mlflow]"
+.PHONY: install-hooks
+install-hooks: ## Register pre-commit hooks in .git/hooks/
+	pre-commit install
+	pre-commit install --hook-type commit-msg
+# =============================================================================
+# Code quality
+# =============================================================================
+.PHONY: lint
+lint: ## Run ruff lint checks (no fixes)
+	ruff check $(SRC_DIR) $(BACKEND_DIR) scripts $(TESTS_DIR)
+.PHONY: format
+format: ## Auto-fix lint issues and reformat
+	ruff check --fix $(SRC_DIR) $(BACKEND_DIR) scripts $(TESTS_DIR)
+	ruff format $(SRC_DIR) $(BACKEND_DIR) scripts $(TESTS_DIR)
+.PHONY: typecheck
+typecheck: ## Run mypy static type checks
+	mypy $(SRC_DIR) $(BACKEND_DIR)/app scripts
+.PHONY: pre-commit
+pre-commit: ## Run all pre-commit hooks against ALL files
+	pre-commit run --all-files
+# =============================================================================
+# Testing
+# =============================================================================
+.PHONY: test
+test: ## Run pytest (fast, unit + integration)
+	pytest $(TESTS_DIR) $(BACKEND_DIR)/app/tests -v
+.PHONY: test-cov
+test-cov: ## Run tests with coverage report
+	pytest $(TESTS_DIR) $(BACKEND_DIR)/app/tests \
+		--cov=$(SRC_DIR) --cov=$(BACKEND_DIR)/app \
+		--cov-report=term-missing --cov-report=xml --cov-report=html
+.PHONY: test-smoke
+test-smoke: ## Run only the fast smoke tests (used by Docker HEALTHCHECK CI step)
+	pytest $(TESTS_DIR) -v -m "not slow" --maxfail=1
+# =============================================================================
+# ML lifecycle (Phase 1+ — placeholders until scripts/ exists)
+# =============================================================================
+.PHONY: train
+train: ## Train the IEEE InceptionV3+Transformer model from configs/base.yaml
+	$(PYTHON) -m scripts.train --config configs/base.yaml
+.PHONY: eval
+eval: ## Evaluate the latest model on COCO val (BLEU, CIDEr, METEOR, ROUGE)
+	$(PYTHON) -m scripts.evaluate --config configs/base.yaml --report docs/results/latest.md
+.PHONY: predict
+predict: ## CLI single-image inference (usage: make predict IMAGE=path/to/img.jpg)
+	$(PYTHON) -m scripts.predict --image $(IMAGE)
+# =============================================================================
+# Backend (FastAPI)
+# =============================================================================
+.PHONY: serve
+serve: ## Run the FastAPI backend locally with hot reload
+	uvicorn app.main:app --app-dir $(BACKEND_DIR) --host 0.0.0.0 --port 8000 --reload
+# =============================================================================
+# Docker
+# =============================================================================
+.PHONY: docker-build
+docker-build: ## Build the backend Docker image (slim, no HF extras)
+	docker build -f $(BACKEND_DIR)/Dockerfile -t captioning-backend:latest .
+.PHONY: docker-build-hf
+docker-build-hf: ## Build the backend image WITH HuggingFace baselines (~2.3 GB)
+	docker build --build-arg INSTALL_HF=1 -f $(BACKEND_DIR)/Dockerfile -t captioning-backend:hf-latest .
+.PHONY: docker-up
+docker-up: ## Start backend + frontend + mlflow via docker compose
+	docker compose up --build
+.PHONY: docker-down
+docker-down: ## Stop docker compose stack
+	docker compose down
+# =============================================================================
+# Reproducibility / paper integrity
+# =============================================================================
+.PHONY: freeze-paper-notebook
+freeze-paper-notebook: ## CI guard: assert the IEEE notebook hasn't been modified
+	@$(PYTHON) -c "import hashlib, sys; \
+h = hashlib.sha256(open('$(NOTEBOOK_FROZEN)', 'rb').read()).hexdigest(); \
+expected = open('.paper-notebook.sha256').read().strip() if __import__('os').path.exists('.paper-notebook.sha256') else None; \
+sys.exit(0) if expected is None else (print(f'ERROR: notebook hash {h} != frozen {expected}') or sys.exit(1)) if h != expected else (print('OK: paper notebook is byte-stable'), sys.exit(0))"
+.PHONY: lock-paper-notebook
+lock-paper-notebook: ## Record the current notebook hash as the frozen reference
+	@$(PYTHON) -c "import hashlib; \
+h = hashlib.sha256(open('$(NOTEBOOK_FROZEN)', 'rb').read()).hexdigest(); \
+open('.paper-notebook.sha256', 'w').write(h + '\n'); \
+print(f'Locked paper notebook at {h}')"
+# =============================================================================
+# Cleanup
+# =============================================================================
+.PHONY: clean
+clean: ## Remove build artefacts, caches, and test outputs (NOT models/)
+	rm -rf build/ dist/ *.egg-info src/*.egg-info
+	rm -rf .pytest_cache .mypy_cache .ruff_cache .coverage htmlcov coverage.xml
+	find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
+	find . -type f -name "*.pyc" -delete 2>/dev/null || true
+.PHONY: clean-all
+clean-all: clean ## clean + remove mlruns/, outputs/, and downloaded models cache
+	rm -rf mlruns/ outputs/ models/cache/

README.md CHANGED Viewed

@@ -24,10 +24,10 @@ OR explore the full pipeline here:
 The notebook includes:
-- End-to-end training pipeline
-- COCO dataset integration
-- Transformer-based caption generation
-- GPU-enabled execution
 ---
@@ -37,25 +37,25 @@ This project is backed by an **IEEE published research paper**:
 [![IEEE Paper](https://img.shields.io/badge/View%20Research%20Paper-IEEE-blue)](https://ieeexplore.ieee.org/document/10675203)
-📄 **Title:** AI Narratives: Bridging Visual Content and Linguistic Expression
 ---
 ### 🧠 Key Contributions
-- Designed a hybrid **CNN + Transformer architecture** for image captioning
-- Leveraged **InceptionV3** for visual feature extraction
-- Implemented **attention-based sequence generation**
-- Achieved improved caption quality using **BLEU evaluation**
-- Compared multiple CNN backbones (VGG, ResNet, Inception)
 ---
 ### 🚀 Practical Impact
-- Combines **computer vision and NLP** for real-world multimodal applications
-- Demonstrates ability to build **end-to-end deep learning pipelines**
-- Trained and evaluated on **COCO benchmark dataset** used in industry research
 # 🧠 Model Overview
@@ -82,7 +82,7 @@ Image → CNN Encoder → Feature Embeddings → Transformer Decoder → Caption
 # 📸 Sample Outputs
 ### 🟢 Example 1
-**Generated Caption:**
 `a man is standing on a beach with a surfboard`
 *<img width="923" height="906" alt="image" src="https://github.com/user-attachments/assets/64e8412b-1d49-404c-a5b2-1da121b224e2" />
@@ -91,7 +91,7 @@ Image → CNN Encoder → Feature Embeddings → Transformer Decoder → Caption
 ---
 ### 🟢 Example 2
-**Generated Caption:**
 `a man riding a motorcycle on a street`
 *<img width="832" height="857" alt="image" src="https://github.com/user-attachments/assets/c802d420-a1c1-48be-8e79-599f193c72cd" />
 *
@@ -119,10 +119,10 @@ The model is trained on the **COCO 2017 Dataset**, a large-scale benchmark datas
 Dataset characteristics:
-- 200,000+ images
-- 80 object categories
-- Multiple captions per image
-- Rich annotations for training
 ---
@@ -143,13 +143,13 @@ The project follows a complete deep learning workflow:
 # 🧰 Technologies Used
-- Python
-- TensorFlow / Keras
-- CNN (InceptionV3)
-- Transformer Architecture
-- NumPy, Pandas
-- Matplotlib
-- Jupyter Notebook
 ---
@@ -177,32 +177,32 @@ Key contributions:
 - Integration of **CNN + Transformer architecture**
 - Improved caption generation using **attention mechanisms**
 - Comparative analysis of CNN encoders (VGG, ResNet, Inception)
-- Enhanced tokenization strategies for better language modeling
 ---
 # ⚠️ Limitations
-- Struggles with highly complex or cluttered scenes
-- May generate generic captions for rare objects
-- Requires large datasets and compute for training
 ---
 # 🚀 Future Improvements
-- Replace CNN with **Vision Transformer (ViT)**
-- Use pretrained models like **BLIP / CLIP**
-- Optimize inference using **TensorRT / ONNX**
-- Deploy as **FastAPI-based real-time API**
-- Multi-GPU distributed training
 ---
 # 👨‍💻 Author
-**Apoorv Raj**
-AI Systems Engineer | Deep Learning | ML Infrastructure
 ---

 The notebook includes:
+- End-to-end training pipeline
+- COCO dataset integration
+- Transformer-based caption generation
+- GPU-enabled execution
 ---
 [![IEEE Paper](https://img.shields.io/badge/View%20Research%20Paper-IEEE-blue)](https://ieeexplore.ieee.org/document/10675203)
+📄 **Title:** AI Narratives: Bridging Visual Content and Linguistic Expression
 ---
 ### 🧠 Key Contributions
+- Designed a hybrid **CNN + Transformer architecture** for image captioning
+- Leveraged **InceptionV3** for visual feature extraction
+- Implemented **attention-based sequence generation**
+- Achieved improved caption quality using **BLEU evaluation**
+- Compared multiple CNN backbones (VGG, ResNet, Inception)
 ---
 ### 🚀 Practical Impact
+- Combines **computer vision and NLP** for real-world multimodal applications
+- Demonstrates ability to build **end-to-end deep learning pipelines**
+- Trained and evaluated on **COCO benchmark dataset** used in industry research
 # 🧠 Model Overview
 # 📸 Sample Outputs
 ### 🟢 Example 1
+**Generated Caption:**
 `a man is standing on a beach with a surfboard`
 *<img width="923" height="906" alt="image" src="https://github.com/user-attachments/assets/64e8412b-1d49-404c-a5b2-1da121b224e2" />
 ---
 ### 🟢 Example 2
+**Generated Caption:**
 `a man riding a motorcycle on a street`
 *<img width="832" height="857" alt="image" src="https://github.com/user-attachments/assets/c802d420-a1c1-48be-8e79-599f193c72cd" />
 *
 Dataset characteristics:
+- 200,000+ images
+- 80 object categories
+- Multiple captions per image
+- Rich annotations for training
 ---
 # 🧰 Technologies Used
+- Python
+- TensorFlow / Keras
+- CNN (InceptionV3)
+- Transformer Architecture
+- NumPy, Pandas
+- Matplotlib
+- Jupyter Notebook
 ---
 - Integration of **CNN + Transformer architecture**
 - Improved caption generation using **attention mechanisms**
 - Comparative analysis of CNN encoders (VGG, ResNet, Inception)
+- Enhanced tokenization strategies for better language modeling
 ---
 # ⚠️ Limitations
+- Struggles with highly complex or cluttered scenes
+- May generate generic captions for rare objects
+- Requires large datasets and compute for training
 ---
 # 🚀 Future Improvements
+- Replace CNN with **Vision Transformer (ViT)**
+- Use pretrained models like **BLIP / CLIP**
+- Optimize inference using **TensorRT / ONNX**
+- Deploy as **FastAPI-based real-time API**
+- Multi-GPU distributed training
 ---
 # 👨‍💻 Author
+**Apoorv Raj**
+AI Systems Engineer | Deep Learning | ML Infrastructure
 ---

docs/PHASE_0_NOTES.md ADDED Viewed

	@@ -0,0 +1,184 @@

+# Phase 0 — Bootstrap (decision log)
+> Phase 0 establishes the engineering scaffolding the rest of the project will
+> stand on. Nothing here changes the model; everything here changes how the
+> repo *looks and behaves* to the next person who clones it (including
+> recruiters and CI runners).
+## What this phase delivers
+| Artefact | Purpose |
+|---|---|
+| [`notebooks/01_ieee_inceptionv3_transformer.ipynb`](../notebooks/01_ieee_inceptionv3_transformer.ipynb) | Renamed from `image-captionin-using-dl.ipynb` via `git mv` to preserve history. Now the canonical, frozen IEEE artefact. |
+| [`notebooks/README.md`](../notebooks/README.md) | Documents the frozen-notebook policy and conventions for any new notebooks. |
+| [`pyproject.toml`](../pyproject.toml) | Single source of truth for the `captioning` Python package, dependency groups, and tool config (ruff/mypy/pytest/coverage). |
+| [`requirements.txt`](../requirements.txt) | Pinned runtime deps, used directly by Docker and CI (mirrors `[project.dependencies]`). |
+| [`requirements-dev.txt`](../requirements-dev.txt) | Pinned dev deps (lint, type-check, test, hooks). |
+| [`requirements-eval.txt`](../requirements-eval.txt) | Pinned metric deps, kept separate to avoid bloating the serving image. |
+| [`.python-version`](../.python-version) | Pins Python 3.10 for `pyenv` users. |
+| [`.env.example`](../.env.example) | Schema for `pydantic-settings`-loaded env vars. |
+| [`.pre-commit-config.yaml`](../.pre-commit-config.yaml) | Hooks: general hygiene (`pre-commit-hooks`), ruff, mypy, nbstripout, prettier (frontend), gitleaks. |
+| [`Makefile`](../Makefile) | Discoverable command index (`make help`). |
+| [`LICENSE`](../LICENSE) | MIT license, attribution to original author. |
+| [`.gitignore`](../.gitignore) | Production-grade exclusions, organised by purpose with explanatory comments. |
+| [`docs/restructure-plan.md`](./restructure-plan.md) | Public-facing engineering plan for Phases 0–4. |
+---
+## Decisions and reasoning
+### 1. Why `src/` layout over flat layout?
+A flat layout (`captioning/` at repo root) lets test code accidentally import
+from the working tree instead of the *installed* package. That hides bugs that
+would only surface in production, where the tree layout is gone. The `src/`
+layout forces every test, every script, and every import to go through the
+installed package — exactly the path users will follow. This is the layout
+the [Python Packaging Authority recommends](https://packaging.python.org/en/latest/discussions/src-layout-vs-flat-layout/),
+and it's what widely used Python projects such as pytest, Flask, and attrs use.
+### 2. Why `pyproject.toml` AND `requirements.txt`?
+They serve different audiences:
+- **`pyproject.toml`** is the *source of truth* for the package — its name,
+  version, abstract dependency ranges, optional extras, and tool configuration.
+  When you `pip install -e .[dev]`, this is what pip reads.
+- **`requirements.txt`** is the *concretely pinned snapshot* — used by Docker
+  builds, CI runners, and anyone who wants `pip install -r requirements.txt`
+  without cloning the source. It's regenerable from `pyproject.toml` via
+  `pip-compile`, but committing it explicitly makes installs deterministic and
+  diffable.
+Phase 5+ will switch to `pip-compile` for automated regeneration; for now,
+manual mirroring is simpler and beginner-readable.
+### 3. Why pin `tensorflow-cpu==2.15.0` so hard?
+Two independent reasons stack:
+1. **`tensorflow-cpu` (not `tensorflow`)**: the GPU build pulls ~600 MB of
+   CUDA libraries that are useless on CPU-only HuggingFace Spaces. Splitting
+   the wheel keeps the serving image well under 1.5 GB.
+2. **2.15 specifically**: TF 2.16 swapped to Keras 3 by default. The IEEE
+   notebook uses `tf.keras.layers.TextVectorization` with the Keras 2
+   save/load API. Upgrading silently changes vocab serialisation, which
+   silently changes BLEU. Pinning is the difference between
+   *reproducible-published-result* and *reproducibility theatre*.
+When Phase 5+ migrates to a modern multimodal backbone, this pin will move
+in a deliberate, tested step — not by accident.
+### 4. Why Ruff over Black + isort + flake8?
+Ruff replaces all three with one tool that runs 10–100x faster, reads config
+from a single section in `pyproject.toml`, and ships its own formatter
+(`ruff format`) that is a near drop-in replacement for Black (over 99.9% of
+lines format identically on Black-formatted code). One install, one
+config, one cache. Recruiters reading the repo see the modern Python tool;
+CI runs faster; `make format` is one command, not three.
+### 5. Why `nbstripout` is non-negotiable in pre-commit
+Notebook outputs include base64-encoded images, full DataFrames, and
+sometimes credentials printed by accident. Committed notebook diffs without
+output stripping are unreadable (`+aaaaaaaaaa[base64]+aaaaa…`) and
+occasionally leak data. `nbstripout` strips the outputs from every cell on commit,
+keeping notebook history clean and reviewable.
+### 6. Why include a `Makefile` on a Windows project?
+Three reasons:
+1. **CI runs on Linux** — every CI job uses the same Make targets, so the
+   commands you run locally match what CI runs.
+2. **Discoverability** — `make help` is one command that prints every
+   high-level operation with a one-line description. A new contributor (or
+   recruiter cloning the repo) sees the entire workflow in one screen.
+3. **Tooling availability** — Make is a 5-second install on Windows
+   (`winget install GnuWin32.Make`, Git Bash, or WSL). PowerShell users who
+   skip Make can still read the Makefile and run the underlying commands
+   directly.
+### 7. Why a `freeze-paper-notebook` Make target?
+The IEEE paper points reviewers at the notebook. If the notebook drifts from
+what the paper describes, reviewers running it will see numbers that don't
+match the paper — and that's a scientific integrity issue, not a software
+issue. The target hashes the notebook and asserts it matches a locked
+SHA-256. Phase 4 wires this into CI as a required check on `main`.
+### 8. Why split optional deps into `[hf]`, `[eval]`, `[mlflow]`, `[dev]`?
+The slim production image (`backend:latest`) does NOT need transformers,
+torch, pycocoevalcap, or MLflow. Bundling them adds ~1.5 GB of dependencies
+the production code never imports. Extras let `pip install -e ".[hf]"` add
+the HuggingFace baselines for the Phase 3 comparison demo, while
+`pip install -r requirements.txt` keeps the production install lean.
+### 9. Why MIT license?
+The IEEE paper is published under IEEE's standard terms; the *code* is
+covered separately. MIT is the most permissive widely recognised license —
+it lets recruiters, students, and other researchers freely fork, learn from,
+and extend the code. For a recruiter-grade portfolio project, permissive
+licensing signals "I want this work to be useful," which is the right tone.
+### 10. Why folder name `configs/` (plural), not `config/` (singular)?
+`config/` was the empty folder shipped with the template. The plural form
+`configs/` is the convention in modern Python ML projects (FastAPI's own
+example apps, Hydra projects, the official `transformers` repo) because
+it holds multiple files (one per environment, model variant, or run).
+Phase 1 creates `configs/` with content; the empty `config/` folder will
+be removed in the Phase 1 commit that introduces the YAML files.
+---
+## What this phase deliberately does NOT do
+- **No code is moved out of the notebook yet.** That's Phase 1, behind a
+  parity validation gate.
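+- **No `src/captioning/` modules are created.** Only the empty package marker
+  `src/captioning/__init__.py` ships in this commit; further empty `__init__.py`
+  files would just be churn, so Phase 1 will create the subpackages with real code.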
+- **No Dockerfile or docker-compose.yml.** They depend on `backend/app/`
+  existing; both arrive in Phase 1.
+- **No GitHub Actions workflows.** They live in Phase 2, after there is
+  Python code to lint and type-check.
+- **No README rewrite.** The current README accurately describes the
+  research; the demo-link rewrite happens in Phase 2 once a live URL exists.
+This restraint is deliberate. Each phase ships a coherent slice of value;
+running ahead would create half-built features and vague commits.
+---
+## Local setup checklist for the developer
+After pulling this commit, on a fresh dev box:
+```bash
+# 1. Create a Python 3.10 virtual environment.
+python -m venv .venv
+.venv\Scripts\activate              # PowerShell
+# source .venv/bin/activate         # Linux/macOS
+# 2. Install dev dependencies + the package (editable).
+make install-dev
+# Or, without Make:
+#   pip install -r requirements-dev.txt -r requirements-eval.txt
+#   pip install -e ".[hf,mlflow]"
+# 3. Register pre-commit hooks.
+make install-hooks
+# Or:  pre-commit install
+# 4. (Optional) Lock the paper notebook's hash, so CI can enforce parity.
+make lock-paper-notebook
+# 5. Verify everything works.
+make pre-commit                     # Run all hooks against all files
+make test                           # No tests yet — exits cleanly with "no tests collected"
+```
+The first `make install-dev` will take a few minutes (TensorFlow is large).
+Subsequent runs hit the wheel cache and complete in seconds.

docs/restructure-plan.md ADDED Viewed

	@@ -0,0 +1,199 @@

+# Production Restructuring Plan
+> Public, in-repo copy of the engineering plan that drives the transition from
+> a single-notebook research project into a deployable multimodal AI platform.
+> The original (with internal exploration notes) lives in the developer's
+> `~/.claude/plans/` directory; this version is the canonical public artefact.
+## Context
+This repository is the engineering home of an IEEE-published image-captioning
+research project. The published artefact is a single Jupyter notebook
+([`notebooks/01_ieee_inceptionv3_transformer.ipynb`](../notebooks/01_ieee_inceptionv3_transformer.ipynb))
+implementing **InceptionV3 (frozen) + custom Keras Transformer decoder**
+trained on **COCO 2017**, reporting **BLEU ~24**.
+**Goal**: convert the repo into a recruiter-grade, production-style
+multimodal AI platform with a live free-tier demo, while **preserving the
+IEEE notebook byte-for-byte** as the canonical research artefact.
+**Constraints**:
+- Hosting budget: **$0/month** → HuggingFace Spaces (backend) + Vercel free
+  (frontend) + HuggingFace Hub (model artefacts) + DagsHub free MLflow.
+- Multimodal scope (v1): **Tier 1 only** — add three pretrained HuggingFace
+  models (BLIP-base, ViT-GPT2, GIT-base-coco) for a side-by-side comparison
+  demo. Tier 2/3/4 are listed under *Future work* only.
+---
+## 1. Folder Structure (target)
+```
+image-captioning-system/
+├── notebooks/
+│   └── 01_ieee_inceptionv3_transformer.ipynb   # FROZEN
+├── src/captioning/                             # Installable Python package
+│   ├── config/                                 # Pydantic settings + YAML loader
+│   ├── data/                                   # COCO loaders, preprocess, splits
+│   ├── tokenizer/                              # CaptionTokenizer (Keras TextVectorization wrapper)
+│   ├── models/                                 # CNN encoder, Transformer decoder, factory
+│   ├── training/                               # Trainer, losses, metrics, callbacks
+│   ├── inference/                              # Greedy + beam search predictors
+│   ├── evaluation/                             # BLEU, CIDEr, METEOR, ROUGE
+│   ├── io/                                     # Checkpoints, image decoding, HF Hub I/O
+│   └── utils/                                  # Logging, seeding, timing
+├── configs/                                    # YAML hyperparameters (validated by Pydantic)
+├── scripts/                                    # CLI entrypoints (train, eval, predict, upload)
+├── models/                                     # Local checkpoint registry (gitignored content)
+├── backend/                                    # FastAPI service (depends on src/captioning)
+├── frontend/                                   # Next.js 14 + TypeScript + Tailwind + shadcn/ui
+├── tests/                                      # ML-core tests (unit + integration)
+├── docs/                                       # Architecture, ADRs, results, deployment
+├── .github/workflows/                          # CI, CD, model-eval
+├── docker-compose.yml                          # Local dev: backend + frontend + mlflow
+├── pyproject.toml                              # Single source of truth for the package
+└── Makefile                                    # Discoverable command index
+```
+**Key architectural rules**:
+- `src/captioning/` is the ML core; `backend/app/` imports from it. Never
+  reverse the dependency.
+- The IEEE notebook is **frozen** — `make freeze-paper-notebook` is a CI
+  check that fails on any byte change.
+- Model weights are **never committed**; they live in HuggingFace Hub
+  (`yourname/captioning-weights`) and are downloaded at backend startup.
+- Configuration is **YAML files validated by Pydantic v2 models (`BaseSettings`
+  from the `pydantic-settings` package)**, not Hydra. Env vars override via
+  `CAPTIONING__TRAIN__BATCH_SIZE=32` syntax (double underscore is the nesting delimiter).
+---
+## 2. Migration Strategy
+**Approach: verbatim refactor first, improvements second.** Reproducibility
+of the IEEE BLEU score is non-negotiable; behaviour parity must be proven
+*before* any improvement is made.
+### Phase 1a — "Lift and shift" (parity goal: BLEU within ±0.3 of notebook)
+| Step | Notebook cell | Target module |
+|---|---|---|
+| 1 | Hyperparams | `configs/base.yaml` + `src/captioning/config/schema.py` |
+| 2 | Caption preprocess | `data/preprocess.py::preprocess_caption` |
+| 3 | COCO loader | `data/coco.py::load_coco_annotations` |
+| 4 | Tokenizer | `tokenizer/vectorizer.py::CaptionTokenizer` |
+| 5 | Splits | `data/splits.py::make_splits(seed=...)` |
+| 6 | Image preprocess | `data/preprocess.py::preprocess_image` |
+| 7 | tf.data pipeline | `data/pipeline.py::build_{train,val}_pipeline` |
+| 8 | Augmentation | `data/augmentation.py::default_augmentation` |
+| 9 | InceptionV3 encoder | `models/encoder_cnn.py` |
+| 10 | Transformer encoder | `models/transformer_encoder.py` |
+| 11 | Embeddings | `models/embeddings.py` |
+| 12 | Transformer decoder | `models/transformer_decoder.py` |
+| 13 | Captioning model | `models/captioning_model.py` |
+| 14 | Wiring | `models/factory.py::build_caption_model(config)` |
+| 15 | Loss + compile | `training/losses.py` + `training/trainer.py` |
+| 16 | Fit | `training/trainer.py::Trainer.fit` |
+| 17 | Inference | `inference/greedy.py`, `inference/predictor.py` |
+| 18 | Save weights | `io/checkpoints.py` + `scripts/train.py` |
+### Parity validation gate
+`scripts/notebook_module_audit.py` runs both pipelines on a fixed 100-image
+fixture and asserts:
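+- Tokenizer vocabulary identical, including order (list equality — token IDs are vocabulary indices, so set equality alone would miss a reordering).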
+- Image preprocessing tensor-equal (`np.allclose`, atol=1e-5).
+- Model outputs (post-softmax token probabilities — the notebook decoder ends in a softmax layer, not raw logits) equal at fixed weights (atol=1e-4).
+- Captions on 20 fixed images byte-equal between notebook and module path.
+### Phase 1b — Quality improvements (only after parity is green)
+1. Masked accuracy as a reusable metric (the notebook already computes it inside `ImageCaptioningModel`, but only plots loss).
+2. Beam search inference.
+3. Warmup + cosine LR schedule (the notebook uses `Adam()` with its constant default learning rate).
+4. CIDEr / METEOR / ROUGE-L (paper reports BLEU only).
+5. `vocab.json` sidecar alongside `vocab.pkl`.
+6. Label smoothing.
+---
+## 3. Implementation Roadmap
+| Phase | Deliverable | Effort | Recruiter signal |
+|---|---|---|---|
+| **0** | Repo bootstrap (this phase) | 3 hrs | Clean repo, lint passes from commit 1 |
+| **1** | Modular ML core + backend MVP | ~15 hrs | Working FastAPI for the IEEE model, runnable via `docker compose up` |
+| **2** | CI/CD + first deploy (HF Space + Vercel) | ~12 hrs | Live demo URL on LinkedIn |
+| **3** | Tier 1 multimodal: BLIP/ViT-GPT2/GIT comparison demo | ~20 hrs | The screenshot recruiters share |
+| **4** | Polish + observability (Sentry, Prometheus, ADRs) | ~8 hrs | Reads as production-grade, not a research one-off |
+### Future work (out of scope for v1)
+- **Tier 2**: ViT + Transformer fine-tune on COCO via Kaggle GPU (BLEU 24 → 32+).
+- **Tier 3**: Anthropic Claude vision endpoint as a "Frontier" tab.
+- **Tier 4**: VQA "Ask the image" extension reusing Tier 3 infra.
+- Self-hosted compose on a VPS with Caddy TLS and DVC dataset versioning.
+---
+## 4. Deployment Stack (free-tier)
+| Layer | Service | Why |
+|---|---|---|
+| Backend hosting | HuggingFace Spaces (Docker SDK, free CPU) | 16 GB RAM, ML-native, recruiter-clickable |
+| Frontend hosting | Vercel free | Next.js native; per-PR preview URLs |
+| Model artefacts | HuggingFace Hub | Free, unlimited public, versioned, model cards |
+| Experiment tracking | MLflow on DagsHub free | Public read-only tracking server |
+| Errors | Sentry free (5k errors/mo) | Error tracking within the $0 budget; wired up in Phase 4 |
+| Uptime | UptimeRobot free | Doubles as HF Space wake-up keeper |
+| Domain | None (use `*.hf.space` and `*.vercel.app`) | $0 budget |
+---
+## 5. Trade-offs Decided
+| Decision | Alternative rejected | Reason |
+|---|---|---|
+| FastAPI | Flask | Async, OpenAPI, Pydantic, lifespan |
+| Next.js 14 App Router | Streamlit | Streamlit screams "research demo" |
+| TanStack Query | Redux | Server state belongs in a server-state lib |
+| YAML + Pydantic | Hydra | Hydra is overkill for 1–3 active configs |
+| MLflow on DagsHub | W&B | DagsHub public free; no recruiter login |
+| Keep TextVectorization | HF tokenizer in v1 | Changes vocab → breaks paper parity |
+| Verbatim refactor first | Clean rewrite | IEEE BLEU reproducibility non-negotiable |
+| `tensorflow-cpu==2.15.0` pinned | Floating TF | TF 2.16+ defaults to Keras 3, which breaks the notebook's Keras 2 `TextVectorization` save/load path |
+| HF Spaces backend | Fly.io paid | Free-tier-only constraint |
+| Multipart uploads | Base64 in JSON | 33% overhead, no streaming |
+| `--workers 1` uvicorn | Multi-worker | TF graph + InceptionV3 ×N OOMs |
+| Tier 1 only (HF baselines) | Tier 2/3/4 in v1 | User selected Tier 1; others as future work |
+---
+## 6. Verification Plan
+**Phase 1**:
+- `pytest tests/ -v` → all green; coverage ≥ 70% on `src/captioning/`.
+- `python scripts/notebook_module_audit.py` → parity assertions all pass.
+- `docker compose up` → `curl -F "file=@sample.jpg" http://localhost:8000/v1/captions`
+  returns valid caption JSON.
+**Phase 2**:
+- GitHub Actions `ci.yml` green on a PR.
+- HF Space URL serves `/v1/model/info`.
+- Vercel preview URL renders frontend; uploading a sample image returns a caption.
+**Phase 3**:
+- `GET /v1/models` returns 4 entries.
+- `POST /v1/compare` returns 4 captions; total latency < 15s on HF Space CPU.
+- `model-eval.yml` posts a BLEU comparison comment on a test PR.
+**Phase 4**:
+- `/metrics` exposes `caption_inference_seconds` histogram.
+- DagsHub MLflow link shows ≥ 1 logged run with metrics.
+- `make freeze-paper-notebook` fails when notebook bytes change; passes when restored.

notebooks/01_ieee_inceptionv3_transformer.ipynb ADDED Viewed

	@@ -0,0 +1,786 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "AldVDvOgcpbc"
+   },
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "import os\n",
+    "import json\n",
+    "import pandas as pd\n",
+    "import re\n",
+    "import numpy as np\n",
+    "import time\n",
+    "import matplotlib.pyplot as plt\n",
+    "import collections\n",
+    "import random\n",
+    "import requests\n",
+    "import json\n",
+    "from math import sqrt\n",
+    "from PIL import Image\n",
+    "from tqdm.auto import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "BASE_PATH = '../input/coco-2017-dataset/coco2017'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(f'{BASE_PATH}/annotations/captions_train2017.json', 'r') as f:\n",
+    "    data = json.load(f)\n",
+    "    data = data['annotations']\n",
+    "\n",
+    "img_cap_pairs = []\n",
+    "\n",
+    "for sample in data:\n",
+    "    img_name = '%012d.jpg' % sample['image_id']\n",
+    "    img_cap_pairs.append([img_name, sample['caption']])\n",
+    "\n",
+    "captions = pd.DataFrame(img_cap_pairs, columns=['image', 'caption'])\n",
+    "captions['image'] = captions['image'].apply(\n",
+    "    lambda x: f'{BASE_PATH}/train2017/{x}'\n",
+    ")\n",
+    "captions = captions.sample(120000)\n",
+    "captions = captions.reset_index(drop=True)\n",
+    "captions.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "rWbe_xuhFaJp"
+   },
+   "outputs": [],
+   "source": [
+    "def preprocess(text):\n",
+    "    text = text.lower()\n",
+    "    text = re.sub(r'[^\\w\\s]', '', text)\n",
+    "    text = re.sub('\\s+', ' ', text)\n",
+    "    text = text.strip()\n",
+    "    text = '[start] ' + text + ' [end]'\n",
+    "    return text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "v_ouwWhKnEy5",
+    "outputId": "d190c744-d31e-430b-ed85-eb0295010c1d"
+   },
+   "outputs": [],
+   "source": [
+    "captions['caption'] = captions['caption'].apply(preprocess)\n",
+    "captions.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "6RBuExHWnGEt",
+    "outputId": "0242452f-4d17-4af6-a9bb-3bea7b09568e"
+   },
+   "outputs": [],
+   "source": [
+    "random_row = captions.sample(1).iloc[0]\n",
+    "print(random_row.caption)\n",
+    "print()\n",
+    "im = Image.open(random_row.image)\n",
+    "im"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "nSTivH_FSSf2"
+   },
+   "outputs": [],
+   "source": [
+    "MAX_LENGTH = 40\n",
+    "VOCABULARY_SIZE = 15000\n",
+    "BATCH_SIZE = 64\n",
+    "BUFFER_SIZE = 1000\n",
+    "EMBEDDING_DIM = 512\n",
+    "UNITS = 512\n",
+    "EPOCHS = 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "X8MGUNtBN2sz"
+   },
+   "outputs": [],
+   "source": [
+    "tokenizer = tf.keras.layers.TextVectorization(\n",
+    "    max_tokens=VOCABULARY_SIZE,\n",
+    "    standardize=None,\n",
+    "    output_sequence_length=MAX_LENGTH)\n",
+    "\n",
+    "tokenizer.adapt(captions['caption'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer.vocabulary_size()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "\n",
+    "pickle.dump(tokenizer.get_vocabulary(), open('vocab_coco.file', 'wb'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "qvhg-6eKN3nz"
+   },
+   "outputs": [],
+   "source": [
+    "word2idx = tf.keras.layers.StringLookup(\n",
+    "    mask_token=\"\",\n",
+    "    vocabulary=tokenizer.get_vocabulary())\n",
+    "\n",
+    "idx2word = tf.keras.layers.StringLookup(\n",
+    "    mask_token=\"\",\n",
+    "    vocabulary=tokenizer.get_vocabulary(),\n",
+    "    invert=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "Yrca2aN2N5WL"
+   },
+   "outputs": [],
+   "source": [
+    "img_to_cap_vector = collections.defaultdict(list)\n",
+    "for img, cap in zip(captions['image'], captions['caption']):\n",
+    "    img_to_cap_vector[img].append(cap)\n",
+    "\n",
+    "img_keys = list(img_to_cap_vector.keys())\n",
+    "random.shuffle(img_keys)\n",
+    "\n",
+    "slice_index = int(len(img_keys)*0.8)\n",
+    "img_name_train_keys, img_name_val_keys = (img_keys[:slice_index], \n",
+    "                                          img_keys[slice_index:])\n",
+    "\n",
+    "train_imgs = []\n",
+    "train_captions = []\n",
+    "for imgt in img_name_train_keys:\n",
+    "    capt_len = len(img_to_cap_vector[imgt])\n",
+    "    train_imgs.extend([imgt] * capt_len)\n",
+    "    train_captions.extend(img_to_cap_vector[imgt])\n",
+    "\n",
+    "val_imgs = []\n",
+    "val_captions = []\n",
+    "for imgv in img_name_val_keys:\n",
+    "    capv_len = len(img_to_cap_vector[imgv])\n",
+    "    val_imgs.extend([imgv] * capv_len)\n",
+    "    val_captions.extend(img_to_cap_vector[imgv])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "UHN3Q1YDN5TD",
+    "outputId": "0b0af2ea-f6d7-48c9-ba30-14d8d9c98418"
+   },
+   "outputs": [],
+   "source": [
+    "len(train_imgs), len(train_captions), len(val_imgs), len(val_captions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "12c-7FHzOFSq"
+   },
+   "outputs": [],
+   "source": [
+    "def load_data(img_path, caption):\n",
+    "    img = tf.io.read_file(img_path)\n",
+    "    img = tf.io.decode_jpeg(img, channels=3)\n",
+    "    img = tf.keras.layers.Resizing(299, 299)(img)\n",
+    "    img = tf.keras.applications.inception_v3.preprocess_input(img)\n",
+    "    caption = tokenizer(caption)\n",
+    "    return img, caption"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "vHk83y3eOFPz"
+   },
+   "outputs": [],
+   "source": [
+    "train_dataset = tf.data.Dataset.from_tensor_slices(\n",
+    "    (train_imgs, train_captions))\n",
+    "\n",
+    "train_dataset = train_dataset.map(\n",
+    "    load_data, num_parallel_calls=tf.data.AUTOTUNE\n",
+    "    ).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)\n",
+    "\n",
+    "val_dataset = tf.data.Dataset.from_tensor_slices(\n",
+    "    (val_imgs, val_captions))\n",
+    "\n",
+    "val_dataset = val_dataset.map(\n",
+    "    load_data, num_parallel_calls=tf.data.AUTOTUNE\n",
+    "    ).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "bQr_bgk11eMF"
+   },
+   "outputs": [],
+   "source": [
+    "image_augmentation = tf.keras.Sequential(\n",
+    "    [\n",
+    "        tf.keras.layers.RandomFlip(\"horizontal\"),\n",
+    "        tf.keras.layers.RandomRotation(0.2),\n",
+    "        tf.keras.layers.RandomContrast(0.3),\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "H9GDJ9_1nIMO"
+   },
+   "outputs": [],
+   "source": [
+    "def CNN_Encoder():\n",
+    "    inception_v3 = tf.keras.applications.InceptionV3(\n",
+    "        include_top=False,\n",
+    "        weights='imagenet'\n",
+    "    )\n",
+    "\n",
+    "    output = inception_v3.output\n",
+    "    output = tf.keras.layers.Reshape(\n",
+    "        (-1, output.shape[-1]))(output)\n",
+    "\n",
+    "    cnn_model = tf.keras.models.Model(inception_v3.input, output)\n",
+    "    return cnn_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "jMy5MrE2PdHV"
+   },
+   "outputs": [],
+   "source": [
+    "class TransformerEncoderLayer(tf.keras.layers.Layer):\n",
+    "\n",
+    "    def __init__(self, embed_dim, num_heads):\n",
+    "        super().__init__()\n",
+    "        self.layer_norm_1 = tf.keras.layers.LayerNormalization()\n",
+    "        self.layer_norm_2 = tf.keras.layers.LayerNormalization()\n",
+    "        self.attention = tf.keras.layers.MultiHeadAttention(\n",
+    "            num_heads=num_heads, key_dim=embed_dim)\n",
+    "        self.dense = tf.keras.layers.Dense(embed_dim, activation=\"relu\")\n",
+    "    \n",
+    "\n",
+    "    def call(self, x, training):\n",
+    "        x = self.layer_norm_1(x)\n",
+    "        x = self.dense(x)\n",
+    "\n",
+    "        attn_output = self.attention(\n",
+    "            query=x,\n",
+    "            value=x,\n",
+    "            key=x,\n",
+    "            attention_mask=None,\n",
+    "            training=training\n",
+    "        )\n",
+    "\n",
+    "        x = self.layer_norm_2(x + attn_output)\n",
+    "        return x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "MFqNFts0duGB"
+   },
+   "outputs": [],
+   "source": [
+    "class Embeddings(tf.keras.layers.Layer):\n",
+    "\n",
+    "    def __init__(self, vocab_size, embed_dim, max_len):\n",
+    "        super().__init__()\n",
+    "        self.token_embeddings = tf.keras.layers.Embedding(\n",
+    "            vocab_size, embed_dim)\n",
+    "        self.position_embeddings = tf.keras.layers.Embedding(\n",
+    "            max_len, embed_dim, input_shape=(None, max_len))\n",
+    "    \n",
+    "\n",
+    "    def call(self, input_ids):\n",
+    "        length = tf.shape(input_ids)[-1]\n",
+    "        position_ids = tf.range(start=0, limit=length, delta=1)\n",
+    "        position_ids = tf.expand_dims(position_ids, axis=0)\n",
+    "\n",
+    "        token_embeddings = self.token_embeddings(input_ids)\n",
+    "        position_embeddings = self.position_embeddings(position_ids)\n",
+    "\n",
+    "        return token_embeddings + position_embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "pcbCQqrDnJ4-"
+   },
+   "outputs": [],
+   "source": [
+    "class TransformerDecoderLayer(tf.keras.layers.Layer):\n",
+    "\n",
+    "    def __init__(self, embed_dim, units, num_heads):\n",
+    "        super().__init__()\n",
+    "        self.embedding = Embeddings(\n",
+    "            tokenizer.vocabulary_size(), embed_dim, MAX_LENGTH)\n",
+    "\n",
+    "        self.attention_1 = tf.keras.layers.MultiHeadAttention(\n",
+    "            num_heads=num_heads, key_dim=embed_dim, dropout=0.1\n",
+    "        )\n",
+    "        self.attention_2 = tf.keras.layers.MultiHeadAttention(\n",
+    "            num_heads=num_heads, key_dim=embed_dim, dropout=0.1\n",
+    "        )\n",
+    "\n",
+    "        self.layernorm_1 = tf.keras.layers.LayerNormalization()\n",
+    "        self.layernorm_2 = tf.keras.layers.LayerNormalization()\n",
+    "        self.layernorm_3 = tf.keras.layers.LayerNormalization()\n",
+    "\n",
+    "        self.ffn_layer_1 = tf.keras.layers.Dense(units, activation=\"relu\")\n",
+    "        self.ffn_layer_2 = tf.keras.layers.Dense(embed_dim)\n",
+    "\n",
+    "        self.out = tf.keras.layers.Dense(tokenizer.vocabulary_size(), activation=\"softmax\")\n",
+    "\n",
+    "        self.dropout_1 = tf.keras.layers.Dropout(0.3)\n",
+    "        self.dropout_2 = tf.keras.layers.Dropout(0.5)\n",
+    "    \n",
+    "\n",
+    "    def call(self, input_ids, encoder_output, training, mask=None):\n",
+    "        embeddings = self.embedding(input_ids)\n",
+    "\n",
+    "        combined_mask = None\n",
+    "        padding_mask = None\n",
+    "        \n",
+    "        if mask is not None:\n",
+    "            causal_mask = self.get_causal_attention_mask(embeddings)\n",
+    "            padding_mask = tf.cast(mask[:, :, tf.newaxis], dtype=tf.int32)\n",
+    "            combined_mask = tf.cast(mask[:, tf.newaxis, :], dtype=tf.int32)\n",
+    "            combined_mask = tf.minimum(combined_mask, causal_mask)\n",
+    "\n",
+    "        attn_output_1 = self.attention_1(\n",
+    "            query=embeddings,\n",
+    "            value=embeddings,\n",
+    "            key=embeddings,\n",
+    "            attention_mask=combined_mask,\n",
+    "            training=training\n",
+    "        )\n",
+    "\n",
+    "        out_1 = self.layernorm_1(embeddings + attn_output_1)\n",
+    "\n",
+    "        attn_output_2 = self.attention_2(\n",
+    "            query=out_1,\n",
+    "            value=encoder_output,\n",
+    "            key=encoder_output,\n",
+    "            attention_mask=padding_mask,\n",
+    "            training=training\n",
+    "        )\n",
+    "\n",
+    "        out_2 = self.layernorm_2(out_1 + attn_output_2)\n",
+    "\n",
+    "        ffn_out = self.ffn_layer_1(out_2)\n",
+    "        ffn_out = self.dropout_1(ffn_out, training=training)\n",
+    "        ffn_out = self.ffn_layer_2(ffn_out)\n",
+    "\n",
+    "        ffn_out = self.layernorm_3(ffn_out + out_2)\n",
+    "        ffn_out = self.dropout_2(ffn_out, training=training)\n",
+    "        preds = self.out(ffn_out)\n",
+    "        return preds\n",
+    "\n",
+    "\n",
+    "    def get_causal_attention_mask(self, inputs):\n",
+    "        input_shape = tf.shape(inputs)\n",
+    "        batch_size, sequence_length = input_shape[0], input_shape[1]\n",
+    "        i = tf.range(sequence_length)[:, tf.newaxis]\n",
+    "        j = tf.range(sequence_length)\n",
+    "        mask = tf.cast(i >= j, dtype=\"int32\")\n",
+    "        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))\n",
+    "        mult = tf.concat(\n",
+    "            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],\n",
+    "            axis=0\n",
+    "        )\n",
+    "        return tf.tile(mask, mult)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "9_NmSUaVys9R"
+   },
+   "outputs": [],
+   "source": [
+    "class ImageCaptioningModel(tf.keras.Model):\n",
+    "\n",
+    "    def __init__(self, cnn_model, encoder, decoder, image_aug=None):\n",
+    "        super().__init__()\n",
+    "        self.cnn_model = cnn_model\n",
+    "        self.encoder = encoder\n",
+    "        self.decoder = decoder\n",
+    "        self.image_aug = image_aug\n",
+    "        self.loss_tracker = tf.keras.metrics.Mean(name=\"loss\")\n",
+    "        self.acc_tracker = tf.keras.metrics.Mean(name=\"accuracy\")\n",
+    "\n",
+    "\n",
+    "    def calculate_loss(self, y_true, y_pred, mask):\n",
+    "        loss = self.loss(y_true, y_pred)\n",
+    "        mask = tf.cast(mask, dtype=loss.dtype)\n",
+    "        loss *= mask\n",
+    "        return tf.reduce_sum(loss) / tf.reduce_sum(mask)\n",
+    "\n",
+    "\n",
+    "    def calculate_accuracy(self, y_true, y_pred, mask):\n",
+    "        accuracy = tf.equal(y_true, tf.argmax(y_pred, axis=2))\n",
+    "        accuracy = tf.math.logical_and(mask, accuracy)\n",
+    "        accuracy = tf.cast(accuracy, dtype=tf.float32)\n",
+    "        mask = tf.cast(mask, dtype=tf.float32)\n",
+    "        return tf.reduce_sum(accuracy) / tf.reduce_sum(mask)\n",
+    "    \n",
+    "\n",
+    "    def compute_loss_and_acc(self, img_embed, captions, training=True):\n",
+    "        encoder_output = self.encoder(img_embed, training=True)\n",
+    "        y_input = captions[:, :-1]\n",
+    "        y_true = captions[:, 1:]\n",
+    "        mask = (y_true != 0)\n",
+    "        y_pred = self.decoder(\n",
+    "            y_input, encoder_output, training=True, mask=mask\n",
+    "        )\n",
+    "        loss = self.calculate_loss(y_true, y_pred, mask)\n",
+    "        acc = self.calculate_accuracy(y_true, y_pred, mask)\n",
+    "        return loss, acc\n",
+    "\n",
+    "    \n",
+    "    def train_step(self, batch):\n",
+    "        imgs, captions = batch\n",
+    "\n",
+    "        if self.image_aug:\n",
+    "            imgs = self.image_aug(imgs)\n",
+    "        \n",
+    "        img_embed = self.cnn_model(imgs)\n",
+    "\n",
+    "        with tf.GradientTape() as tape:\n",
+    "            loss, acc = self.compute_loss_and_acc(\n",
+    "                img_embed, captions\n",
+    "            )\n",
+    "    \n",
+    "        train_vars = (\n",
+    "            self.encoder.trainable_variables + self.decoder.trainable_variables\n",
+    "        )\n",
+    "        grads = tape.gradient(loss, train_vars)\n",
+    "        self.optimizer.apply_gradients(zip(grads, train_vars))\n",
+    "        self.loss_tracker.update_state(loss)\n",
+    "        self.acc_tracker.update_state(acc)\n",
+    "\n",
+    "        return {\"loss\": self.loss_tracker.result(), \"acc\": self.acc_tracker.result()}\n",
+    "    \n",
+    "\n",
+    "    def test_step(self, batch):\n",
+    "        imgs, captions = batch\n",
+    "\n",
+    "        img_embed = self.cnn_model(imgs)\n",
+    "\n",
+    "        loss, acc = self.compute_loss_and_acc(\n",
+    "            img_embed, captions, training=False\n",
+    "        )\n",
+    "\n",
+    "        self.loss_tracker.update_state(loss)\n",
+    "        self.acc_tracker.update_state(acc)\n",
+    "\n",
+    "        return {\"loss\": self.loss_tracker.result(), \"acc\": self.acc_tracker.result()}\n",
+    "\n",
+    "    @property\n",
+    "    def metrics(self):\n",
+    "        return [self.loss_tracker, self.acc_tracker]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "GqWpcsje0Hkh",
+    "outputId": "477f4a81-1e19-445a-d64d-cedad90a2893"
+   },
+   "outputs": [],
+   "source": [
+    "encoder = TransformerEncoderLayer(EMBEDDING_DIM, 1)\n",
+    "decoder = TransformerDecoderLayer(EMBEDDING_DIM, UNITS, 8)\n",
+    "\n",
+    "cnn_model = CNN_Encoder()\n",
+    "caption_model = ImageCaptioningModel(\n",
+    "    cnn_model=cnn_model, encoder=encoder, decoder=decoder, image_aug=image_augmentation,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "bayNssgNX6QN"
+   },
+   "outputs": [],
+   "source": [
+    "cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(\n",
+    "    from_logits=False, reduction=\"none\"\n",
+    ")\n",
+    "\n",
+    "early_stopping = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)\n",
+    "\n",
+    "caption_model.compile(\n",
+    "    optimizer=tf.keras.optimizers.Adam(),\n",
+    "    loss=cross_entropy\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "1RYo-MRVYn49"
+   },
+   "outputs": [],
+   "source": [
+    "history = caption_model.fit(\n",
+    "    train_dataset,\n",
+    "    epochs=EPOCHS,\n",
+    "    validation_data=val_dataset,\n",
+    "    callbacks=[early_stopping]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.plot(history.history['loss'], label='train_loss')\n",
+    "plt.plot(history.history['val_loss'], label='validation loss')\n",
+    "plt.legend()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "3ErlQQICtj_g"
+   },
+   "outputs": [],
+   "source": [
+    "def load_image_from_path(img_path):\n",
+    "    img = tf.io.read_file(img_path)\n",
+    "    img = tf.io.decode_jpeg(img, channels=3)\n",
+    "    img = tf.keras.layers.Resizing(299, 299)(img)\n",
+    "    img = tf.keras.applications.inception_v3.preprocess_input(img)\n",
+    "    return img\n",
+    "\n",
+    "\n",
+    "def generate_caption(img_path, add_noise=False):\n",
+    "    img = load_image_from_path(img_path)\n",
+    "    \n",
+    "    if add_noise:\n",
+    "        noise = tf.random.normal(img.shape)*0.1\n",
+    "        img = img + noise\n",
+    "        img = (img - tf.reduce_min(img))/(tf.reduce_max(img) - tf.reduce_min(img))\n",
+    "    \n",
+    "    img = tf.expand_dims(img, axis=0)\n",
+    "    img_embed = caption_model.cnn_model(img)\n",
+    "    img_encoded = caption_model.encoder(img_embed, training=False)\n",
+    "\n",
+    "    y_inp = '[start]'\n",
+    "    for i in range(MAX_LENGTH-1):\n",
+    "        tokenized = tokenizer([y_inp])[:, :-1]\n",
+    "        mask = tf.cast(tokenized != 0, tf.int32)\n",
+    "        pred = caption_model.decoder(\n",
+    "            tokenized, img_encoded, training=False, mask=mask)\n",
+    "        \n",
+    "        pred_idx = np.argmax(pred[0, i, :])\n",
+    "        pred_idx = tf.convert_to_tensor(pred_idx)\n",
+    "        pred_word = idx2word(pred_idx).numpy().decode('utf-8')\n",
+    "        if pred_word == '[end]':\n",
+    "            break\n",
+    "        \n",
+    "        y_inp += ' ' + pred_word\n",
+    "    \n",
+    "    y_inp = y_inp.replace('[start] ', '')\n",
+    "    return y_inp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "27_bJe_M1Drr"
+   },
+   "outputs": [],
+   "source": [
+    "idx = random.randrange(0, len(captions))\n",
+    "img_path = captions.iloc[idx].image\n",
+    "\n",
+    "pred_caption = generate_caption(img_path)\n",
+    "print('Predicted Caption:', pred_caption)\n",
+    "print()\n",
+    "Image.open(img_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "img_url = \"https://images.unsplash.com/photo-1714981725936-8817d6a32dd3?w=500&auto=format&fit=crop&q=60&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHx0b3BpYy1mZWVkfDV8Qm4tRGpyY0Jyd298fGVufDB8fHx8fA%3D%3D\"\n",
+    "im = Image.open(requests.get(img_url, stream=True).raw)\n",
+    "im = im.convert('RGB')\n",
+    "im.save('tmp.jpg')\n",
+    "\n",
+    "pred_caption = generate_caption('tmp.jpg', add_noise=False)\n",
+    "print('Predicted Caption:', pred_caption)\n",
+    "print()\n",
+    "im"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "img_url = \"https://images.unsplash.com/photo-1669173733011-6f1afef8d5e6?w=500&auto=format&fit=crop&q=60&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHx0b3BpYy1mZWVkfDE0fEJuLURqcmNCcndvfHxlbnwwfHx8fHw%3D\"\n",
+    "im = Image.open(requests.get(img_url, stream=True).raw)\n",
+    "im = im.convert('RGB')\n",
+    "im.save('tmp.jpg')\n",
+    "\n",
+    "pred_caption = generate_caption('tmp.jpg', add_noise=False)\n",
+    "print('Predicted Caption:', pred_caption)\n",
+    "print()\n",
+    "im"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "img_url = \"https://images.unsplash.com/photo-1499676988064-a3779763470e?w=500&auto=format&fit=crop&q=60&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHx0b3BpYy1mZWVkfDEzMnxCbi1EanJjQnJ3b3x8ZW58MHx8fHx8\"\n",
+    "im = Image.open(requests.get(img_url, stream=True).raw)\n",
+    "im = im.convert('RGB')\n",
+    "im.save('tmp.jpg')\n",
+    "\n",
+    "pred_caption = generate_caption('tmp.jpg', add_noise=False)\n",
+    "print('Predicted Caption:', pred_caption)\n",
+    "print()\n",
+    "im"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caption_model.save_weights('model.h5')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "XG69m29gs6W4"
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kaggle": {
+   "accelerator": "gpu",
+   "dataSources": [
+    {
+     "databundleVersionId": 1495989,
+     "datasetId": 857191,
+     "sourceId": 1462296,
+     "sourceType": "datasetVersion"
+    }
+   ],
+   "isGpuEnabled": true,
+   "isInternetEnabled": true,
+   "language": "python",
+   "sourceType": "notebook"
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

notebooks/README.md ADDED Viewed

	@@ -0,0 +1,81 @@

+# Notebooks
+This directory holds Jupyter notebooks. Each notebook has a specific role in
+the project lifecycle, and the rules are different for each one.
+---
+## `01_ieee_inceptionv3_transformer.ipynb` — **FROZEN**
+This notebook is the **canonical research artefact** behind the IEEE
+publication [*AI Narratives: Bridging Visual Content and Linguistic
+Expression*](https://ieeexplore.ieee.org/document/10675203). It contains the
+exact training pipeline, hyperparameters, and inference code used to produce
+the BLEU ~24 score reported in the paper.
+### Why is it frozen?
+Reproducibility of a published result is non-negotiable. If the notebook drifts
+from what the paper describes, anyone trying to reproduce the result —
+reviewers, future students, recruiters running the demo — will see numbers that
+don't match the paper. That breaks scientific trust.
+### Rules
+1. **Do not edit cells.** No improvements, no refactors, no comment fixes.
+2. **Do not re-run cells with different seeds.** The outputs of the paper
+   run are the reference outputs — `nbstripout` strips them from the file on
+   commit, but the cell structure must stay identical.
+3. **Improvements go into the modular package** at [`src/captioning/`](../src/captioning/),
+   never back into this notebook.
+4. **Parity is enforced in CI.** The `make freeze-paper-notebook` target
+   computes a SHA-256 of this file and asserts it matches the locked hash in
+   `.paper-notebook.sha256`. If you change a cell, CI fails until you either
+   revert OR explicitly re-lock with `make lock-paper-notebook` AND update
+   the paper / model card to reflect the new behaviour.
+### When this rule changes
+The frozen state lifts when (and only when) we publish a v2 of the paper or
+explicitly mark a re-run in the changelog. Until then, treat this file like
+a museum exhibit.
+---
+## `02_dataset_eda.ipynb` — exploratory (Phase 1+)
+Dataset inspection. Caption length distributions, vocabulary coverage, image
+dimension histograms, class balance across COCO super-categories. This
+notebook **may** be edited freely; it's a working scratchpad, not a published
+artefact.
+## `03_attention_visualization.ipynb` — exploratory (Phase 4+)
+Visualisations of decoder attention weights over image patches. Used to
+generate the figures in [`docs/results/`](../docs/results/). Outputs are
+stripped by `nbstripout` on commit; PNGs land in `docs/images/attention/`
+when explicitly exported.
+---
+## Conventions for new notebooks
+If you add a new notebook:
+- **Number it** (`04_*`, `05_*`) so the lifecycle order is obvious.
+- **Use prose Markdown cells** between code cells — a notebook reads like a
+  short paper, not a Python script.
+- **Do not import from `notebooks/`** elsewhere in the codebase. Notebooks
+  consume the `captioning` package; they never define library code.
+- **Strip outputs before committing.** `nbstripout` does this automatically
+  if you ran `make install-hooks`. Without that hook, run `nbstripout
+  notebooks/your_notebook.ipynb` manually before `git add`.
+---
+## Why notebooks at all?
+Notebooks are excellent for *exploration* — narrative, mixed media, iterative
+data wrangling. They are bad for *libraries* — no testing, no type-checking,
+no module reuse, hidden cell-execution-order bugs. The IEEE notebook stays
+because the paper points at it; everything else lives in `src/captioning/`.

notebooks/image-captionin-using-dl.ipynb DELETED Viewed

The diff for this file is too large to render. See raw diff

pyproject.toml ADDED Viewed

	@@ -0,0 +1,242 @@

+# =============================================================================
+# pyproject.toml — single source of truth for the `captioning` Python package
+# -----------------------------------------------------------------------------
+# This file follows PEP 621 (project metadata) and PEP 517/518 (build system).
+# It replaces a scattered mix of setup.py + requirements.txt + setup.cfg with
+# one canonical config. `pip install -e .` installs the package from `src/`.
+#
+# Why src/ layout?  It prevents accidental imports of the package from the
+# repo root during testing — every test exercises the *installed* package,
+# the way users will actually import it. This is the layout used by the
+# Python Packaging Authority's example projects and recommended by pytest.
+# =============================================================================
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+# -----------------------------------------------------------------------------
+# Project metadata — what `pip show captioning` will display.
+# -----------------------------------------------------------------------------
+[project]
+name = "captioning"
+version = "0.1.0"
+description = "IEEE-published CNN+Transformer image captioning, restructured into a production-grade multimodal AI platform."
+readme = "README.md"
+requires-python = ">=3.10,<3.13"
+license = { text = "MIT" }
+authors = [
+    { name = "Apoorv Raj" },
+]
+keywords = [
+    "image-captioning",
+    "multimodal",
+    "transformer",
+    "computer-vision",
+    "tensorflow",
+    "fastapi",
+]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+# -----------------------------------------------------------------------------
+# Runtime dependencies for the core ML library + FastAPI backend.
+# Pinned hard. Floating versions across TF + transformers + torch is the most
+# common source of silent BLEU drift between training runs and deployments.
+#
+# - tensorflow-cpu (NOT tensorflow): saves ~600 MB and removes the CUDA driver
+#   dependency. We are deploying to CPU-only HuggingFace Spaces. If you ever
+#   train on a GPU box, install `tensorflow==2.15.0` in that environment only.
+# - 2.15.0 specifically: TF 2.16 swapped to Keras 3 by default and broke the
+#   `tf.keras.layers.TextVectorization` saving behaviour the IEEE notebook
+#   relies on. Stay on 2.15 for v1; upgrade is a deliberate Phase-5+ task.
+# - pydantic 2.x: supported by FastAPI >= 0.100 and required by
+#   pydantic-settings 2.x. Faster and stricter than v1.
+# -----------------------------------------------------------------------------
+dependencies = [
+    "tensorflow-cpu==2.15.0",
+    "numpy>=1.26,<2.0",          # NumPy 2.0 broke TF 2.15 binary compat
+    "pandas>=2.1,<3.0",
+    "pillow>=10.0,<11.0",
+    "pyyaml>=6.0,<7.0",
+    "pydantic>=2.7,<3.0",
+    "pydantic-settings>=2.3,<3.0",
+    "fastapi>=0.111,<1.0",
+    "uvicorn[standard]>=0.30,<1.0",
+    "python-multipart>=0.0.9",   # FastAPI multipart form data (image upload)
+    "huggingface-hub>=0.23,<1.0",  # Pulls weights from HF Hub at startup
+    "structlog>=24.1,<25.0",     # Structured JSON logs in prod, pretty in dev
+    "anyio>=4.3,<5.0",           # Thread-pool offload for sync TF inference
+    "tqdm>=4.66,<5.0",
+    "click>=8.1,<9.0",           # CLI for scripts/
+]
+# -----------------------------------------------------------------------------
+# Optional dependency groups — installed via `pip install ".[dev,eval]"`.
+# Splitting these keeps the production Docker image small (Phase 1 backend
+# image is ~1.1 GB; adding `hf` extras takes it to ~2.3 GB which is the
+# Phase 3 comparison image).
+# -----------------------------------------------------------------------------
+[project.optional-dependencies]
+# Tier-1 multimodal upgrade: BLIP, ViT-GPT2, GIT models from HuggingFace.
+# torch CPU is large (~700 MB); only install when serving the comparison demo.
+hf = [
+    "transformers==4.41.2",
+    "torch==2.3.0",
+    "sentencepiece>=0.2.0",
+    "accelerate>=0.30,<1.0",
+]
+# Evaluation metrics. Pulled separately because pycocoevalcap drags Java
+# dependencies (METEOR), which we don't want in the serving image.
+eval = [
+    "sacrebleu>=2.4,<3.0",
+    "nltk>=3.8,<4.0",
+    "rouge-score>=0.1.2",
+    "pycocoevalcap>=1.2",
+    "matplotlib>=3.8,<4.0",
+]
+# Experiment tracking. Local SQLite by default; points at DagsHub in prod.
+mlflow = [
+    "mlflow>=2.13,<3.0",
+]
+# Developer tooling: lint, type-check, test. Never deployed.
+dev = [
+    "ruff>=0.5,<1.0",
+    "mypy>=1.10,<2.0",
+    "pytest>=8.2,<9.0",
+    "pytest-cov>=5.0,<6.0",
+    "pytest-asyncio>=0.23,<1.0",
+    "httpx>=0.27,<1.0",          # FastAPI TestClient backend
+    "pre-commit>=3.7,<4.0",
+    "nbstripout>=0.7,<1.0",
+    "types-PyYAML",
+    "types-requests",
+]
+# -----------------------------------------------------------------------------
+# Where pip should install the package from (the `src/` layout).
+# -----------------------------------------------------------------------------
+[tool.setuptools.packages.find]
+where = ["src"]
+include = ["captioning*"]
+[tool.setuptools.package-data]
+"captioning" = ["py.typed"]      # PEP 561: ship type hints with the package
+# =============================================================================
+# Tooling configuration — co-located so a single file owns project policy.
+# =============================================================================
+# ---- Ruff: linter + formatter (replaces black + isort + flake8) -------------
+# We prefer Ruff because it runs ~100x faster and is the de-facto modern
+# default in the Python ecosystem. One tool, one config, one cache.
+[tool.ruff]
+line-length = 100
+target-version = "py310"
+src = ["src", "backend", "scripts", "tests"]
+extend-exclude = [
+    "notebooks",                 # Notebooks have their own conventions
+    "outputs",
+    "mlruns",
+    "frontend",
+]
+[tool.ruff.lint]
+# Curated rule set — pragmatic defaults, not the full strict catalogue.
+select = [
+    "E",   # pycodestyle errors
+    "W",   # pycodestyle warnings
+    "F",   # pyflakes
+    "I",   # isort import sorting
+    "B",   # flake8-bugbear (likely bugs)
+    "UP",  # pyupgrade (modern syntax)
+    "SIM", # flake8-simplify
+    "RET", # flake8-return
+    "PTH", # flake8-use-pathlib (prefer pathlib over os.path)
+    "RUF", # Ruff's own rules
+]
+ignore = [
+    "E501",  # line length — formatter handles it; lint warnings are noise
+    "B008",  # function call in default arg (FastAPI's Depends() pattern)
+]
+[tool.ruff.lint.per-file-ignores]
+"tests/**" = ["B011"]            # `assert False` in tests is fine
+"scripts/**" = ["T201"]          # print() in CLI scripts is fine
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+docstring-code-format = true
+# ---- Mypy: static type checker -----------------------------------------------
+# We only enforce types on our own code; third-party untyped libs are tolerated.
+[tool.mypy]
+python_version = "3.10"
+strict = false                   # Start lenient; tighten as types stabilise
+warn_unused_configs = true
+warn_redundant_casts = true
+warn_unused_ignores = true
+warn_no_return = true
+no_implicit_optional = true
+files = ["src/captioning", "backend/app", "scripts"]
+[[tool.mypy.overrides]]
+module = [
+    "tensorflow.*",
+    "transformers.*",
+    "huggingface_hub.*",
+    "PIL.*",
+    "nltk.*",
+    "sacrebleu.*",
+    "rouge_score.*",
+    "pycocoevalcap.*",
+]
+ignore_missing_imports = true
+# ---- Pytest -------------------------------------------------------------------
+[tool.pytest.ini_options]
+minversion = "8.0"
+testpaths = ["tests", "backend/app/tests"]
+addopts = [
+    "-ra",                       # Show short summary for non-passing tests
+    "--strict-markers",
+    "--strict-config",
+    "--showlocals",
+]
+markers = [
+    "slow: tests that take >10 seconds (run with -m slow)",
+    "gpu: tests requiring a GPU (skipped in CI by default)",
+]
+filterwarnings = [
+    "ignore::DeprecationWarning:tensorflow.*",
+    "ignore::FutureWarning:tensorflow.*",
+]
+# ---- Coverage -----------------------------------------------------------------
+[tool.coverage.run]
+branch = true
+source = ["src/captioning", "backend/app"]
+omit = ["*/tests/*", "*/__init__.py"]
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "raise NotImplementedError",
+    "if TYPE_CHECKING:",
+    "if __name__ == .__main__.:",
+]
+show_missing = true
+skip_covered = false

requirements-dev.txt ADDED Viewed

	@@ -0,0 +1,33 @@

+# =============================================================================
+# requirements-dev.txt — developer tooling; never installed in production.
+# -----------------------------------------------------------------------------
+# Install alongside the runtime deps:
+#   pip install -r requirements.txt -r requirements-dev.txt
+#
+# Or in one go via the package extras:
+#   pip install -e ".[dev]"
+# =============================================================================
+# Inherit runtime deps so devs get a complete environment in one command.
+-r requirements.txt
+# ---- Lint / format / type-check ----------------------------------------------
+# Ruff replaces black + isort + flake8. ~100x faster, single config in pyproject.
+ruff==0.5.0
+mypy==1.10.1
+# ---- Test runner -------------------------------------------------------------
+pytest==8.2.2
+pytest-cov==5.0.0
+pytest-asyncio==0.23.7
+httpx==0.27.0                    # FastAPI TestClient transport
+# ---- Pre-commit / notebook hygiene -------------------------------------------
+# `pre-commit` orchestrates the hooks; `nbstripout` strips notebook outputs on
+# commit so notebook diffs stay reviewable (and avoid leaking PII in cell outputs).
+pre-commit==3.7.1
+nbstripout==0.7.1
+# ---- Type stubs --------------------------------------------------------------
+types-PyYAML==6.0.12.20240311
+types-requests==2.32.0.20240602

requirements-eval.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+# =============================================================================
+# requirements-eval.txt — caption-quality metrics. Used by `scripts/evaluate.py`
+# and by the `model-eval.yml` GitHub Action; NOT bundled into the serving image.
+# -----------------------------------------------------------------------------
+# pycocoevalcap drags Java for METEOR; we keep it in a separate file so the
+# slim production image stays Java-free.
+# =============================================================================
+-r requirements.txt
+# ---- Metrics -----------------------------------------------------------------
+sacrebleu==2.4.2                 # BLEU-1..4, corpus + sentence level
+nltk==3.8.1                      # BLEU + tokenisation utilities
+rouge-score==0.1.2               # ROUGE-L for caption quality
+pycocoevalcap==1.2               # CIDEr + METEOR (the COCO standard)
+# ---- Plotting / reports ------------------------------------------------------
+matplotlib==3.9.0

requirements.txt ADDED Viewed

	@@ -0,0 +1,45 @@

+# =============================================================================
+# requirements.txt — runtime dependencies for the FastAPI backend (slim).
+# -----------------------------------------------------------------------------
+# This file mirrors `[project.dependencies]` in pyproject.toml so that
+# Docker builds and CI can `pip install -r requirements.txt` without needing
+# the package source available.
+#
+# To regenerate from pyproject.toml later (recommended after Phase 1):
+#   pip install pip-tools
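+#   pip-compile pyproject.toml -o requirements.txt
+# (No --extra flags: this file mirrors the core runtime deps only; the `hf`
+# extra would pull transformers + torch into the slim image.)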
+#
+# All versions are pinned. Floating versions across TF + transformers + torch
+# is the most common silent source of BLEU drift between runs and deploys.
+# =============================================================================
+# ---- Core ML framework -------------------------------------------------------
+# CPU-only TF: deploy target is CPU HuggingFace Spaces. Pinned at 2.15 because
+# 2.16+ ships Keras 3 by default, which breaks the IEEE notebook's
+# `tf.keras.layers.TextVectorization` save/load semantics.
+tensorflow-cpu==2.15.0
+# ---- Numerics / data ---------------------------------------------------------
+# NumPy <2.0 because TF 2.15 was built against the 1.x ABI.
+numpy==1.26.4
+pandas==2.2.2
+pillow==10.3.0
+pyyaml==6.0.1
+# ---- Config & validation -----------------------------------------------------
+pydantic==2.7.4
+pydantic-settings==2.3.4
+# ---- API server --------------------------------------------------------------
+fastapi==0.111.0
+uvicorn[standard]==0.30.1
+python-multipart==0.0.9
+# ---- Model artefact pull at startup ------------------------------------------
+huggingface-hub==0.23.4
+# ---- Logging / async / CLI ---------------------------------------------------
+structlog==24.2.0
+anyio==4.4.0
+tqdm==4.66.4
+click==8.1.7

src/captioning/__init__.py ADDED Viewed

File without changes