Spaces:
Configuration error
Configuration error
Commit ·
b2594db
1
Parent(s): 430abdc
feat: bootstrap production-grade ML repository tooling
Browse files- .env.example +48 -0
- .gitignore +102 -12
- .pre-commit-config.yaml +98 -0
- .python-version +1 -0
- LICENSE +21 -0
- Makefile +177 -0
- README.md +37 -37
- docs/PHASE_0_NOTES.md +184 -0
- docs/restructure-plan.md +199 -0
- notebooks/01_ieee_inceptionv3_transformer.ipynb +786 -0
- notebooks/README.md +81 -0
- notebooks/image-captionin-using-dl.ipynb +0 -0
- pyproject.toml +242 -0
- requirements-dev.txt +33 -0
- requirements-eval.txt +18 -0
- requirements.txt +45 -0
- src/captioning/__init__.py +0 -0
.env.example
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
+
# .env.example — schema for environment variables.
|
| 3 |
+
# -----------------------------------------------------------------------------
|
| 4 |
+
# Copy this file to `.env` (which is gitignored) and fill in real values.
|
| 5 |
+
# `pydantic-settings` automatically reads `.env` at startup and validates each
|
| 6 |
+
# field. Variables prefixed CAPTIONING__ override nested config keys (see
|
| 7 |
+
# `src/captioning/config/schema.py`); double underscore is the nesting delimiter.
|
| 8 |
+
#
|
| 9 |
+
# Example: CAPTIONING__TRAIN__BATCH_SIZE=32 overrides AppConfig.train.batch_size.
|
| 10 |
+
# =============================================================================
|
| 11 |
+
|
| 12 |
+
# ---- App-wide ----------------------------------------------------------------
|
| 13 |
+
# development | staging | production
APP_ENV=development
|
| 14 |
+
# DEBUG | INFO | WARNING | ERROR
LOG_LEVEL=INFO
|
| 15 |
+
|
| 16 |
+
# ---- Backend (FastAPI) -------------------------------------------------------
|
| 17 |
+
PORT=8000
|
| 18 |
+
# Directory where weights/vocab are downloaded at startup. Empty in the image
|
| 19 |
+
# layer; populated by `huggingface_hub.snapshot_download`. Use a writable path.
|
| 20 |
+
MODEL_DIR=./models/cache
|
| 21 |
+
# 10 MB; rejects oversized images
MAX_UPLOAD_BYTES=10485760
|
| 22 |
+
# Comma-separated list of allowed origins for CORS. In production, the Vercel
|
| 23 |
+
# frontend URL only. NEVER use "*" in prod.
|
| 24 |
+
CORS_ALLOWED_ORIGINS=http://localhost:3000,https://your-frontend.vercel.app
|
| 25 |
+
|
| 26 |
+
# ---- HuggingFace Hub (model artefact storage) --------------------------------
|
| 27 |
+
# Public model repo holding the trained weights, vocab.pkl, config.yaml.
|
| 28 |
+
HF_REPO_ID=your-username/captioning-weights
|
| 29 |
+
# Pin a specific tag for reproducibility
HF_REVISION=v1.0.0
|
| 30 |
+
# Optional: only needed for private repos or higher rate limits.
|
| 31 |
+
# Generate at https://huggingface.co/settings/tokens (read-only is enough).
|
| 32 |
+
HF_TOKEN=
|
| 33 |
+
|
| 34 |
+
# ---- Experiment tracking (MLflow) --------------------------------------------
|
| 35 |
+
# Local SQLite during dev; DagsHub URL in production.
|
| 36 |
+
MLFLOW_TRACKING_URI=sqlite:///mlruns/mlflow.db
|
| 37 |
+
# MLFLOW_TRACKING_URI=https://dagshub.com/your-username/captioning.mlflow
|
| 38 |
+
# MLFLOW_TRACKING_USERNAME=your-username
|
| 39 |
+
# MLFLOW_TRACKING_PASSWORD=your-dagshub-access-token
|
| 40 |
+
MLFLOW_EXPERIMENT_NAME=captioning
|
| 41 |
+
|
| 42 |
+
# ---- Observability (Phase 4) -------------------------------------------------
|
| 43 |
+
# Sentry: error tracking. Free tier = 5k errors/mo. Get a DSN at sentry.io.
|
| 44 |
+
# SENTRY_DSN=
|
| 45 |
+
# SENTRY_TRACES_SAMPLE_RATE=0.1 # 10% of requests traced
|
| 46 |
+
|
| 47 |
+
# ---- Frontend (Next.js) — also in frontend/.env.local ------------------------
|
| 48 |
+
# NEXT_PUBLIC_API_URL=http://localhost:8000 # Backend base URL for client fetches
|
.gitignore
CHANGED
|
@@ -1,30 +1,120 @@
|
|
| 1 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
__pycache__/
|
| 3 |
*.py[cod]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
-
# Virtual environments
|
|
|
|
| 6 |
venv/
|
| 7 |
.venv/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
# Jupyter
|
|
|
|
|
|
|
| 10 |
.ipynb_checkpoints/
|
|
|
|
| 11 |
|
| 12 |
-
#
|
| 13 |
-
.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
#
|
|
|
|
| 16 |
*.h5
|
|
|
|
| 17 |
*.pt
|
| 18 |
*.pth
|
|
|
|
| 19 |
*.onnx
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
node_modules/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
#
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
# OS
|
| 29 |
.DS_Store
|
| 30 |
-
Thumbs.db
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
+
# .gitignore — what NOT to commit
|
| 3 |
+
# -----------------------------------------------------------------------------
|
| 4 |
+
# Each block below is grouped by *why* the files are excluded so future readers
|
| 5 |
+
# (and recruiters) understand the engineering rationale, not just the patterns.
|
| 6 |
+
# =============================================================================
|
| 7 |
+
|
| 8 |
+
# ---- Python bytecode / packaging ---------------------------------------------
|
| 9 |
+
# Compiled artefacts. Regenerated automatically on every run.
|
| 10 |
__pycache__/
|
| 11 |
*.py[cod]
|
| 12 |
+
*.egg-info/
|
| 13 |
+
*.egg
|
| 14 |
+
.eggs/
|
| 15 |
+
build/
|
| 16 |
+
dist/
|
| 17 |
+
pip-wheel-metadata/
|
| 18 |
|
| 19 |
+
# ---- Virtual environments ----------------------------------------------------
|
| 20 |
+
# Per-developer; pinning is done via requirements.txt + .python-version.
|
| 21 |
venv/
|
| 22 |
.venv/
|
| 23 |
+
env/
|
| 24 |
+
.env-tf/
|
| 25 |
+
|
| 26 |
+
# ---- Python tooling caches ---------------------------------------------------
|
| 27 |
+
# Speed up local runs; nothing portable. Caches are recreated by the tools.
|
| 28 |
+
.pytest_cache/
|
| 29 |
+
.mypy_cache/
|
| 30 |
+
.ruff_cache/
|
| 31 |
+
.tox/
|
| 32 |
+
.coverage
|
| 33 |
+
.coverage.*
|
| 34 |
+
htmlcov/
|
| 35 |
+
coverage.xml
|
| 36 |
+
.nox/
|
| 37 |
+
.hypothesis/
|
| 38 |
|
| 39 |
+
# ---- Jupyter / notebooks -----------------------------------------------------
|
| 40 |
+
# Checkpoints are autosaves; outputs are stripped by `nbstripout` pre-commit
|
| 41 |
+
# so notebook diffs stay reviewable.
|
| 42 |
.ipynb_checkpoints/
|
| 43 |
+
*.ipynb_checkpoints
|
| 44 |
|
| 45 |
+
# ---- ML / experiment tracking ------------------------------------------------
|
| 46 |
+
# MLflow's local store, model artefacts, training run dumps. These are large
|
| 47 |
+
# and should live in a model registry (HuggingFace Hub) or experiment-tracking
|
| 48 |
+
# server (DagsHub MLflow), not in Git.
|
| 49 |
+
mlruns/
|
| 50 |
+
mlartifacts/
|
| 51 |
+
outputs/
|
| 52 |
+
runs/
|
| 53 |
+
wandb/
|
| 54 |
+
lightning_logs/
|
| 55 |
|
| 56 |
+
# ---- Model weights / serialised artefacts ------------------------------------
|
| 57 |
+
# Large binaries — published via HuggingFace Hub, not Git.
|
| 58 |
*.h5
|
| 59 |
+
*.keras
|
| 60 |
*.pt
|
| 61 |
*.pth
|
| 62 |
+
*.ckpt
|
| 63 |
*.onnx
|
| 64 |
+
*.tflite
|
| 65 |
+
*.pb
|
| 66 |
+
*.savedmodel/
|
| 67 |
+
*.safetensors
|
| 68 |
+
|
| 69 |
+
# ---- Tokenizer / vocabulary artefacts ----------------------------------------
|
| 70 |
+
# Pickles can carry RCE risk if blindly loaded from untrusted sources.
|
| 71 |
+
# Vocabularies are versioned alongside their model in models/<version>/.
|
| 72 |
+
*.pkl
|
| 73 |
+
vocab_*.file
|
| 74 |
+
sentencepiece.model
|
| 75 |
+
|
| 76 |
+
# ---- Datasets ----------------------------------------------------------------
|
| 77 |
+
# COCO is downloaded by `scripts/prepare_data.py`; never committed.
|
| 78 |
+
data/
|
| 79 |
+
datasets/
|
| 80 |
+
*.tfrecord
|
| 81 |
+
*.tfrecords
|
| 82 |
|
| 83 |
+
# ---- Environment / secrets ---------------------------------------------------
|
| 84 |
+
# `.env.example` is committed as the schema; `.env` never is.
|
| 85 |
+
.env
|
| 86 |
+
.env.local
|
| 87 |
+
.env.*.local
|
| 88 |
+
!.env.example
|
| 89 |
+
|
| 90 |
+
# ---- Node / frontend ---------------------------------------------------------
|
| 91 |
node_modules/
|
| 92 |
+
.next/
|
| 93 |
+
.turbo/
|
| 94 |
+
.vercel/
|
| 95 |
+
out/
|
| 96 |
+
*.tsbuildinfo
|
| 97 |
+
npm-debug.log*
|
| 98 |
+
yarn-debug.log*
|
| 99 |
+
yarn-error.log*
|
| 100 |
|
| 101 |
+
# ---- Docker / build ----------------------------------------------------------
|
| 102 |
+
.docker/
|
| 103 |
+
|
| 104 |
+
# ---- Editors / IDEs ----------------------------------------------------------
|
| 105 |
+
# Per-developer settings. Workspace-shared settings should go in `.vscode/*.json`
|
| 106 |
+
# explicitly committed with `git add -f` (the directory is ignored); anything else stays local.
|
| 107 |
+
.vscode/
|
| 108 |
+
.idea/
|
| 109 |
+
*.swp
|
| 110 |
+
*.swo
|
| 111 |
+
*~
|
| 112 |
|
| 113 |
+
# ---- OS noise ----------------------------------------------------------------
|
| 114 |
.DS_Store
|
| 115 |
+
Thumbs.db
|
| 116 |
+
desktop.ini
|
| 117 |
+
|
| 118 |
+
# ---- Claude / AI tooling -----------------------------------------------------
|
| 119 |
+
# Local Claude Code session state. Contains user-specific settings.
|
| 120 |
+
.claude/
|
.pre-commit-config.yaml
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
+
# .pre-commit-config.yaml — automated checks that run on `git commit`.
|
| 3 |
+
# -----------------------------------------------------------------------------
|
| 4 |
+
# Why pre-commit hooks?
|
| 5 |
+
# They stop broken commits by default — failed checks abort the
|
| 6 |
+
# commit. This catches lint/type/secret issues at the lowest-cost moment
|
| 7 |
+
# (before they enter history) and is what serious teams expect.
|
| 8 |
+
#
|
| 9 |
+
# Setup (one-time, per developer):
|
| 10 |
+
# pip install pre-commit
|
| 11 |
+
# pre-commit install # registers the hooks in .git/hooks/
|
| 12 |
+
# pre-commit run --all-files # run once over the whole repo
|
| 13 |
+
#
|
| 14 |
+
# After setup, hooks run automatically on every `git commit`. To bypass them
|
| 15 |
+
# in an emergency: `git commit --no-verify` (do not make this a habit).
|
| 16 |
+
# =============================================================================
|
| 17 |
+
|
| 18 |
+
# Run hooks against staged files only by default (faster). The CI workflow
|
| 19 |
+
# runs `pre-commit run --all-files` to catch anything missed locally.
|
| 20 |
+
default_install_hook_types: [pre-commit]
|
| 21 |
+
default_stages: [pre-commit]
|
| 22 |
+
fail_fast: false # Show ALL failures, not just first
|
| 23 |
+
|
| 24 |
+
repos:
|
| 25 |
+
# ---------------------------------------------------------------------------
|
| 26 |
+
# General hygiene: whitespace, line endings, accidentally-committed binaries.
|
| 27 |
+
# ---------------------------------------------------------------------------
|
| 28 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
| 29 |
+
rev: v4.6.0
|
| 30 |
+
hooks:
|
| 31 |
+
- id: trailing-whitespace
|
| 32 |
+
- id: end-of-file-fixer
|
| 33 |
+
- id: mixed-line-ending
|
| 34 |
+
args: [--fix=lf] # Force LF; CRLF is a Windows trap
|
| 35 |
+
- id: check-yaml
|
| 36 |
+
exclude: ^(\.github/workflows/.*\.yml)$ # Some YAML uses GHA syntax
|
| 37 |
+
- id: check-toml
|
| 38 |
+
- id: check-merge-conflict
|
| 39 |
+
- id: check-added-large-files
|
| 40 |
+
args: [--maxkb=5000] # Reject >5MB blobs (use HF Hub)
|
| 41 |
+
- id: check-case-conflict
|
| 42 |
+
- id: detect-private-key
|
| 43 |
+
|
| 44 |
+
# ---------------------------------------------------------------------------
|
| 45 |
+
# Ruff: Python lint + format. Replaces black + isort + flake8 with one tool.
|
| 46 |
+
# Reads config from pyproject.toml so behaviour is identical here and in CI.
|
| 47 |
+
# ---------------------------------------------------------------------------
|
| 48 |
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
| 49 |
+
rev: v0.5.0
|
| 50 |
+
hooks:
|
| 51 |
+
- id: ruff
|
| 52 |
+
args: [--fix] # Auto-fix what's safely fixable
|
| 53 |
+
- id: ruff-format
|
| 54 |
+
|
| 55 |
+
# ---------------------------------------------------------------------------
|
| 56 |
+
# mypy: static type checking. Limited to package code so notebooks/scripts
|
| 57 |
+
# don't gate commits.
|
| 58 |
+
# ---------------------------------------------------------------------------
|
| 59 |
+
- repo: https://github.com/pre-commit/mirrors-mypy
|
| 60 |
+
rev: v1.10.1
|
| 61 |
+
hooks:
|
| 62 |
+
- id: mypy
|
| 63 |
+
files: ^(src/captioning|backend/app)/
|
| 64 |
+
additional_dependencies:
|
| 65 |
+
- pydantic>=2.7
|
| 66 |
+
- pydantic-settings>=2.3
|
| 67 |
+
- types-PyYAML
|
| 68 |
+
- types-requests
|
| 69 |
+
|
| 70 |
+
# ---------------------------------------------------------------------------
|
| 71 |
+
# nbstripout: strips outputs from .ipynb files on commit.
|
| 72 |
+
# Why: notebook outputs include large base64-encoded images and run state,
|
| 73 |
+
# which makes diffs unreadable and can leak data. Outputs are a *render*
|
| 74 |
+
# of the code, not source — they belong in CI artefacts, not Git history.
|
| 75 |
+
# ---------------------------------------------------------------------------
|
| 76 |
+
- repo: https://github.com/kynan/nbstripout
|
| 77 |
+
rev: 0.7.1
|
| 78 |
+
hooks:
|
| 79 |
+
- id: nbstripout
|
| 80 |
+
|
| 81 |
+
# ---------------------------------------------------------------------------
|
| 82 |
+
# Prettier: format frontend (.ts, .tsx, .json, .md, .css). Limited to the
|
| 83 |
+
# frontend/ subtree to avoid stepping on Markdown owned by docs writers.
|
| 84 |
+
# ---------------------------------------------------------------------------
|
| 85 |
+
- repo: https://github.com/pre-commit/mirrors-prettier
|
| 86 |
+
rev: v4.0.0-alpha.8
|
| 87 |
+
hooks:
|
| 88 |
+
- id: prettier
|
| 89 |
+
files: ^frontend/.*\.(ts|tsx|js|jsx|json|md|css)$
|
| 90 |
+
|
| 91 |
+
# ---------------------------------------------------------------------------
|
| 92 |
+
# gitleaks: scans for accidentally committed credentials (API keys, tokens,
|
| 93 |
+
# private keys). Catches mistakes BEFORE they hit a public remote.
|
| 94 |
+
# ---------------------------------------------------------------------------
|
| 95 |
+
- repo: https://github.com/gitleaks/gitleaks
|
| 96 |
+
rev: v8.18.4
|
| 97 |
+
hooks:
|
| 98 |
+
- id: gitleaks
|
.python-version
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
3.10
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2024 Apoorv Raj
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
Makefile
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
+
# Makefile — common project commands.
|
| 3 |
+
# -----------------------------------------------------------------------------
|
| 4 |
+
# Why a Makefile when the team uses Windows + PowerShell?
|
| 5 |
+
# 1. CI (Linux) runs these targets directly.
|
| 6 |
+
# 2. The file is the canonical, discoverable command index — `make help`
|
| 7 |
+
# tells a new contributor (or a recruiter cloning the repo) the entire
|
| 8 |
+
# development workflow in one screen.
|
| 9 |
+
# 3. Windows users can install Make via `winget install GnuWin32.Make`,
|
| 10 |
+
# use Git Bash, WSL, or just read the recipe lines and run the underlying
|
| 11 |
+
# command in PowerShell directly.
|
| 12 |
+
#
|
| 13 |
+
# Conventions:
|
| 14 |
+
# - `.PHONY` declares targets that don't produce a same-named file.
|
| 15 |
+
# - Target naming: lowercase, hyphen-separated (e.g. `docker-build`, not `build_docker`).
|
| 16 |
+
# - Each target is annotated with a one-line `## description` comment that
|
| 17 |
+
# `make help` parses and prints automatically.
|
| 18 |
+
# =============================================================================
|
| 19 |
+
|
| 20 |
+
# Default Python interpreter. Override on Windows: `make PYTHON=py install`.
|
| 21 |
+
PYTHON ?= python
|
| 22 |
+
PIP ?= $(PYTHON) -m pip
|
| 23 |
+
NPM ?= npm
|
| 24 |
+
|
| 25 |
+
# Directories
|
| 26 |
+
SRC_DIR := src/captioning
|
| 27 |
+
BACKEND_DIR := backend
|
| 28 |
+
FRONTEND_DIR := frontend
|
| 29 |
+
TESTS_DIR := tests
|
| 30 |
+
NOTEBOOK_FROZEN := notebooks/01_ieee_inceptionv3_transformer.ipynb
|
| 31 |
+
|
| 32 |
+
# ---- Default goal: show available targets -----------------------------------
|
| 33 |
+
.DEFAULT_GOAL := help
|
| 34 |
+
|
| 35 |
+
.PHONY: help
|
| 36 |
+
help: ## Show this help message
|
| 37 |
+
@echo "Image Captioning System — available commands"
|
| 38 |
+
@echo ""
|
| 39 |
+
@grep -E '^[a-zA-Z_-]+:.*?## ' $(MAKEFILE_LIST) | \
|
| 40 |
+
awk 'BEGIN {FS = ":.*?## "} {printf " \033[36m%-22s\033[0m %s\n", $$1, $$2}'
|
| 41 |
+
@echo ""
|
| 42 |
+
|
| 43 |
+
# =============================================================================
|
| 44 |
+
# Install / setup
|
| 45 |
+
# =============================================================================
|
| 46 |
+
|
| 47 |
+
.PHONY: install
|
| 48 |
+
install: ## Install runtime dependencies only (slim, for Docker parity)
|
| 49 |
+
$(PIP) install --upgrade pip
|
| 50 |
+
$(PIP) install -r requirements.txt
|
| 51 |
+
|
| 52 |
+
.PHONY: install-dev
|
| 53 |
+
install-dev: ## Install runtime + dev + eval extras + the captioning package (editable)
|
| 54 |
+
$(PIP) install --upgrade pip
|
| 55 |
+
$(PIP) install -r requirements-dev.txt -r requirements-eval.txt
|
| 56 |
+
$(PIP) install -e ".[hf,mlflow]"
|
| 57 |
+
|
| 58 |
+
.PHONY: install-hooks
|
| 59 |
+
install-hooks: ## Register pre-commit hooks in .git/hooks/
|
| 60 |
+
pre-commit install
|
| 61 |
+
pre-commit install --hook-type commit-msg
|
| 62 |
+
|
| 63 |
+
# =============================================================================
|
| 64 |
+
# Code quality
|
| 65 |
+
# =============================================================================
|
| 66 |
+
|
| 67 |
+
.PHONY: lint
|
| 68 |
+
lint: ## Run ruff lint checks (no fixes)
|
| 69 |
+
ruff check $(SRC_DIR) $(BACKEND_DIR) scripts $(TESTS_DIR)
|
| 70 |
+
|
| 71 |
+
.PHONY: format
|
| 72 |
+
format: ## Auto-fix lint issues and reformat
|
| 73 |
+
ruff check --fix $(SRC_DIR) $(BACKEND_DIR) scripts $(TESTS_DIR)
|
| 74 |
+
ruff format $(SRC_DIR) $(BACKEND_DIR) scripts $(TESTS_DIR)
|
| 75 |
+
|
| 76 |
+
.PHONY: typecheck
|
| 77 |
+
typecheck: ## Run mypy static type checks
|
| 78 |
+
mypy $(SRC_DIR) $(BACKEND_DIR)/app scripts
|
| 79 |
+
|
| 80 |
+
.PHONY: pre-commit
|
| 81 |
+
pre-commit: ## Run all pre-commit hooks against ALL files
|
| 82 |
+
pre-commit run --all-files
|
| 83 |
+
|
| 84 |
+
# =============================================================================
|
| 85 |
+
# Testing
|
| 86 |
+
# =============================================================================
|
| 87 |
+
|
| 88 |
+
.PHONY: test
|
| 89 |
+
test: ## Run pytest (fast, unit + integration)
|
| 90 |
+
pytest $(TESTS_DIR) $(BACKEND_DIR)/app/tests -v
|
| 91 |
+
|
| 92 |
+
.PHONY: test-cov
|
| 93 |
+
test-cov: ## Run tests with coverage report
|
| 94 |
+
pytest $(TESTS_DIR) $(BACKEND_DIR)/app/tests \
|
| 95 |
+
--cov=$(SRC_DIR) --cov=$(BACKEND_DIR)/app \
|
| 96 |
+
--cov-report=term-missing --cov-report=xml --cov-report=html
|
| 97 |
+
|
| 98 |
+
.PHONY: test-smoke
|
| 99 |
+
test-smoke: ## Run only the fast smoke tests (used by Docker HEALTHCHECK CI step)
|
| 100 |
+
pytest $(TESTS_DIR) -v -m "not slow" --maxfail=1
|
| 101 |
+
|
| 102 |
+
# =============================================================================
|
| 103 |
+
# ML lifecycle (Phase 1+ — placeholders until scripts/ exists)
|
| 104 |
+
# =============================================================================
|
| 105 |
+
|
| 106 |
+
.PHONY: train
|
| 107 |
+
train: ## Train the IEEE InceptionV3+Transformer model from configs/base.yaml
|
| 108 |
+
$(PYTHON) -m scripts.train --config configs/base.yaml
|
| 109 |
+
|
| 110 |
+
.PHONY: eval
|
| 111 |
+
eval: ## Evaluate the latest model on COCO val (BLEU, CIDEr, METEOR, ROUGE)
|
| 112 |
+
$(PYTHON) -m scripts.evaluate --config configs/base.yaml --report docs/results/latest.md
|
| 113 |
+
|
| 114 |
+
.PHONY: predict
|
| 115 |
+
predict: ## CLI single-image inference (usage: make predict IMAGE=path/to/img.jpg)
|
| 116 |
+
$(PYTHON) -m scripts.predict --image $(IMAGE)
|
| 117 |
+
|
| 118 |
+
# =============================================================================
|
| 119 |
+
# Backend (FastAPI)
|
| 120 |
+
# =============================================================================
|
| 121 |
+
|
| 122 |
+
.PHONY: serve
|
| 123 |
+
serve: ## Run the FastAPI backend locally with hot reload
|
| 124 |
+
uvicorn app.main:app --app-dir $(BACKEND_DIR) --host 0.0.0.0 --port 8000 --reload
|
| 125 |
+
|
| 126 |
+
# =============================================================================
|
| 127 |
+
# Docker
|
| 128 |
+
# =============================================================================
|
| 129 |
+
|
| 130 |
+
.PHONY: docker-build
|
| 131 |
+
docker-build: ## Build the backend Docker image (slim, no HF extras)
|
| 132 |
+
docker build -f $(BACKEND_DIR)/Dockerfile -t captioning-backend:latest .
|
| 133 |
+
|
| 134 |
+
.PHONY: docker-build-hf
|
| 135 |
+
docker-build-hf: ## Build the backend image WITH HuggingFace baselines (~2.3 GB)
|
| 136 |
+
docker build --build-arg INSTALL_HF=1 -f $(BACKEND_DIR)/Dockerfile -t captioning-backend:hf-latest .
|
| 137 |
+
|
| 138 |
+
.PHONY: docker-up
|
| 139 |
+
docker-up: ## Start backend + frontend + mlflow via docker compose
|
| 140 |
+
docker compose up --build
|
| 141 |
+
|
| 142 |
+
.PHONY: docker-down
|
| 143 |
+
docker-down: ## Stop docker compose stack
|
| 144 |
+
docker compose down
|
| 145 |
+
|
| 146 |
+
# =============================================================================
|
| 147 |
+
# Reproducibility / paper integrity
|
| 148 |
+
# =============================================================================
|
| 149 |
+
|
| 150 |
+
.PHONY: freeze-paper-notebook
|
| 151 |
+
freeze-paper-notebook: ## CI guard: assert the IEEE notebook hasn't been modified
|
| 152 |
+
@$(PYTHON) -c "import hashlib, sys; \
|
| 153 |
+
h = hashlib.sha256(open('$(NOTEBOOK_FROZEN)', 'rb').read()).hexdigest(); \
|
| 154 |
+
expected = open('.paper-notebook.sha256').read().strip() if __import__('os').path.exists('.paper-notebook.sha256') else None; \
|
| 155 |
+
sys.exit(0) if expected is None else (print(f'ERROR: notebook hash {h} != frozen {expected}') or sys.exit(1)) if h != expected else (print('OK: paper notebook is byte-stable'), sys.exit(0))"
|
| 156 |
+
|
| 157 |
+
.PHONY: lock-paper-notebook
|
| 158 |
+
lock-paper-notebook: ## Record the current notebook hash as the frozen reference
|
| 159 |
+
@$(PYTHON) -c "import hashlib; \
|
| 160 |
+
h = hashlib.sha256(open('$(NOTEBOOK_FROZEN)', 'rb').read()).hexdigest(); \
|
| 161 |
+
open('.paper-notebook.sha256', 'w').write(h + '\n'); \
|
| 162 |
+
print(f'Locked paper notebook at {h}')"
|
| 163 |
+
|
| 164 |
+
# =============================================================================
|
| 165 |
+
# Cleanup
|
| 166 |
+
# =============================================================================
|
| 167 |
+
|
| 168 |
+
.PHONY: clean
|
| 169 |
+
clean: ## Remove build artefacts, caches, and test outputs (NOT models/)
|
| 170 |
+
rm -rf build/ dist/ *.egg-info src/*.egg-info
|
| 171 |
+
rm -rf .pytest_cache .mypy_cache .ruff_cache .coverage htmlcov coverage.xml
|
| 172 |
+
find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
|
| 173 |
+
find . -type f -name "*.pyc" -delete 2>/dev/null || true
|
| 174 |
+
|
| 175 |
+
.PHONY: clean-all
|
| 176 |
+
clean-all: clean ## clean + remove mlruns/, outputs/, and downloaded models cache
|
| 177 |
+
rm -rf mlruns/ outputs/ models/cache/
|
README.md
CHANGED
|
@@ -24,10 +24,10 @@ OR explore the full pipeline here:
|
|
| 24 |
|
| 25 |
The notebook includes:
|
| 26 |
|
| 27 |
-
- End-to-end training pipeline
|
| 28 |
-
- COCO dataset integration
|
| 29 |
-
- Transformer-based caption generation
|
| 30 |
-
- GPU-enabled execution
|
| 31 |
|
| 32 |
---
|
| 33 |
|
|
@@ -37,25 +37,25 @@ This project is backed by an **IEEE published research paper**:
|
|
| 37 |
|
| 38 |
[](https://ieeexplore.ieee.org/document/10675203)
|
| 39 |
|
| 40 |
-
📄 **Title:** AI Narratives: Bridging Visual Content and Linguistic Expression
|
| 41 |
|
| 42 |
---
|
| 43 |
|
| 44 |
### 🧠 Key Contributions
|
| 45 |
|
| 46 |
-
- Designed a hybrid **CNN + Transformer architecture** for image captioning
|
| 47 |
-
- Leveraged **InceptionV3** for visual feature extraction
|
| 48 |
-
- Implemented **attention-based sequence generation**
|
| 49 |
-
- Achieved improved caption quality using **BLEU evaluation**
|
| 50 |
-
- Compared multiple CNN backbones (VGG, ResNet, Inception)
|
| 51 |
|
| 52 |
---
|
| 53 |
|
| 54 |
### 🚀 Practical Impact
|
| 55 |
|
| 56 |
-
- Combines **computer vision and NLP** for real-world multimodal applications
|
| 57 |
-
- Demonstrates ability to build **end-to-end deep learning pipelines**
|
| 58 |
-
- Trained and evaluated on **COCO benchmark dataset** used in industry research
|
| 59 |
|
| 60 |
# 🧠 Model Overview
|
| 61 |
|
|
@@ -82,7 +82,7 @@ Image → CNN Encoder → Feature Embeddings → Transformer Decoder → Caption
|
|
| 82 |
# 📸 Sample Outputs
|
| 83 |
|
| 84 |
### 🟢 Example 1
|
| 85 |
-
**Generated Caption:**
|
| 86 |
`a man is standing on a beach with a surfboard`
|
| 87 |
|
| 88 |
*<img width="923" height="906" alt="image" src="https://github.com/user-attachments/assets/64e8412b-1d49-404c-a5b2-1da121b224e2" />
|
|
@@ -91,7 +91,7 @@ Image → CNN Encoder → Feature Embeddings → Transformer Decoder → Caption
|
|
| 91 |
---
|
| 92 |
|
| 93 |
### 🟢 Example 2
|
| 94 |
-
**Generated Caption:**
|
| 95 |
`a man riding a motorcycle on a street`
|
| 96 |
*<img width="832" height="857" alt="image" src="https://github.com/user-attachments/assets/c802d420-a1c1-48be-8e79-599f193c72cd" />
|
| 97 |
*
|
|
@@ -119,10 +119,10 @@ The model is trained on the **COCO 2017 Dataset**, a large-scale benchmark datas
|
|
| 119 |
|
| 120 |
Dataset characteristics:
|
| 121 |
|
| 122 |
-
- 200,000+ images
|
| 123 |
-
- 80 object categories
|
| 124 |
-
- Multiple captions per image
|
| 125 |
-
- Rich annotations for training
|
| 126 |
|
| 127 |
---
|
| 128 |
|
|
@@ -143,13 +143,13 @@ The project follows a complete deep learning workflow:
|
|
| 143 |
|
| 144 |
# 🧰 Technologies Used
|
| 145 |
|
| 146 |
-
- Python
|
| 147 |
-
- TensorFlow / Keras
|
| 148 |
-
- CNN (InceptionV3)
|
| 149 |
-
- Transformer Architecture
|
| 150 |
-
- NumPy, Pandas
|
| 151 |
-
- Matplotlib
|
| 152 |
-
- Jupyter Notebook
|
| 153 |
|
| 154 |
---
|
| 155 |
|
|
@@ -177,32 +177,32 @@ Key contributions:
|
|
| 177 |
- Integration of **CNN + Transformer architecture**
|
| 178 |
- Improved caption generation using **attention mechanisms**
|
| 179 |
- Comparative analysis of CNN encoders (VGG, ResNet, Inception)
|
| 180 |
-
- Enhanced tokenization strategies for better language modeling
|
| 181 |
|
| 182 |
---
|
| 183 |
|
| 184 |
# ⚠️ Limitations
|
| 185 |
|
| 186 |
-
- Struggles with highly complex or cluttered scenes
|
| 187 |
-
- May generate generic captions for rare objects
|
| 188 |
-
- Requires large datasets and compute for training
|
| 189 |
|
| 190 |
---
|
| 191 |
|
| 192 |
# 🚀 Future Improvements
|
| 193 |
|
| 194 |
-
- Replace CNN with **Vision Transformer (ViT)**
|
| 195 |
-
- Use pretrained models like **BLIP / CLIP**
|
| 196 |
-
- Optimize inference using **TensorRT / ONNX**
|
| 197 |
-
- Deploy as **FastAPI-based real-time API**
|
| 198 |
-
- Multi-GPU distributed training
|
| 199 |
|
| 200 |
---
|
| 201 |
|
| 202 |
# 👨💻 Author
|
| 203 |
|
| 204 |
-
**Apoorv Raj**
|
| 205 |
-
AI Systems Engineer | Deep Learning | ML Infrastructure
|
| 206 |
|
| 207 |
---
|
| 208 |
|
|
|
|
| 24 |
|
| 25 |
The notebook includes:
|
| 26 |
|
| 27 |
+
- End-to-end training pipeline
|
| 28 |
+
- COCO dataset integration
|
| 29 |
+
- Transformer-based caption generation
|
| 30 |
+
- GPU-enabled execution
|
| 31 |
|
| 32 |
---
|
| 33 |
|
|
|
|
| 37 |
|
| 38 |
[Read the paper on IEEE Xplore](https://ieeexplore.ieee.org/document/10675203)
|
| 39 |
|
| 40 |
+
📄 **Title:** AI Narratives: Bridging Visual Content and Linguistic Expression
|
| 41 |
|
| 42 |
---
|
| 43 |
|
| 44 |
### 🧠 Key Contributions
|
| 45 |
|
| 46 |
+
- Designed a hybrid **CNN + Transformer architecture** for image captioning
|
| 47 |
+
- Leveraged **InceptionV3** for visual feature extraction
|
| 48 |
+
- Implemented **attention-based sequence generation**
|
| 49 |
+
- Achieved improved caption quality using **BLEU evaluation**
|
| 50 |
+
- Compared multiple CNN backbones (VGG, ResNet, Inception)
|
| 51 |
|
| 52 |
---
|
| 53 |
|
| 54 |
### 🚀 Practical Impact
|
| 55 |
|
| 56 |
+
- Combines **computer vision and NLP** for real-world multimodal applications
|
| 57 |
+
- Demonstrates ability to build **end-to-end deep learning pipelines**
|
| 58 |
+
- Trained and evaluated on **COCO benchmark dataset** used in industry research
|
| 59 |
|
| 60 |
# 🧠 Model Overview
|
| 61 |
|
|
|
|
| 82 |
# 📸 Sample Outputs
|
| 83 |
|
| 84 |
### 🟢 Example 1
|
| 85 |
+
**Generated Caption:**
|
| 86 |
`a man is standing on a beach with a surfboard`
|
| 87 |
|
| 88 |
*<img width="923" height="906" alt="image" src="https://github.com/user-attachments/assets/64e8412b-1d49-404c-a5b2-1da121b224e2" />
|
|
|
|
| 91 |
---
|
| 92 |
|
| 93 |
### 🟢 Example 2
|
| 94 |
+
**Generated Caption:**
|
| 95 |
`a man riding a motorcycle on a street`
|
| 96 |
*<img width="832" height="857" alt="image" src="https://github.com/user-attachments/assets/c802d420-a1c1-48be-8e79-599f193c72cd" />
|
| 97 |
*
|
|
|
|
| 119 |
|
| 120 |
Dataset characteristics:
|
| 121 |
|
| 122 |
+
- 200,000+ images
|
| 123 |
+
- 80 object categories
|
| 124 |
+
- Multiple captions per image
|
| 125 |
+
- Rich annotations for training
|
| 126 |
|
| 127 |
---
|
| 128 |
|
|
|
|
| 143 |
|
| 144 |
# 🧰 Technologies Used
|
| 145 |
|
| 146 |
+
- Python
|
| 147 |
+
- TensorFlow / Keras
|
| 148 |
+
- CNN (InceptionV3)
|
| 149 |
+
- Transformer Architecture
|
| 150 |
+
- NumPy, Pandas
|
| 151 |
+
- Matplotlib
|
| 152 |
+
- Jupyter Notebook
|
| 153 |
|
| 154 |
---
|
| 155 |
|
|
|
|
| 177 |
- Integration of **CNN + Transformer architecture**
|
| 178 |
- Improved caption generation using **attention mechanisms**
|
| 179 |
- Comparative analysis of CNN encoders (VGG, ResNet, Inception)
|
| 180 |
+
- Enhanced tokenization strategies for better language modeling
|
| 181 |
|
| 182 |
---
|
| 183 |
|
| 184 |
# ⚠️ Limitations
|
| 185 |
|
| 186 |
+
- Struggles with highly complex or cluttered scenes
|
| 187 |
+
- May generate generic captions for rare objects
|
| 188 |
+
- Requires large datasets and compute for training
|
| 189 |
|
| 190 |
---
|
| 191 |
|
| 192 |
# 🚀 Future Improvements
|
| 193 |
|
| 194 |
+
- Replace CNN with **Vision Transformer (ViT)**
|
| 195 |
+
- Use pretrained models like **BLIP / CLIP**
|
| 196 |
+
- Optimize inference using **TensorRT / ONNX**
|
| 197 |
+
- Deploy as **FastAPI-based real-time API**
|
| 198 |
+
- Multi-GPU distributed training
|
| 199 |
|
| 200 |
---
|
| 201 |
|
| 202 |
# 👨‍💻 Author
|
| 203 |
|
| 204 |
+
**Apoorv Raj**
|
| 205 |
+
AI Systems Engineer | Deep Learning | ML Infrastructure
|
| 206 |
|
| 207 |
---
|
| 208 |
|
docs/PHASE_0_NOTES.md
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Phase 0 — Bootstrap (decision log)
|
| 2 |
+
|
| 3 |
+
> Phase 0 establishes the engineering scaffolding the rest of the project will
|
| 4 |
+
> stand on. Nothing here changes the model; everything here changes how the
|
| 5 |
+
> repo *looks and behaves* to the next person who clones it (including
|
| 6 |
+
> recruiters and CI runners).
|
| 7 |
+
|
| 8 |
+
## What this phase delivers
|
| 9 |
+
|
| 10 |
+
| Artefact | Purpose |
|
| 11 |
+
|---|---|
|
| 12 |
+
| [`notebooks/01_ieee_inceptionv3_transformer.ipynb`](../notebooks/01_ieee_inceptionv3_transformer.ipynb) | Renamed from `image-captionin-using-dl.ipynb` via `git mv` to preserve history. Now the canonical, frozen IEEE artefact. |
|
| 13 |
+
| [`notebooks/README.md`](../notebooks/README.md) | Documents the frozen-notebook policy and conventions for any new notebooks. |
|
| 14 |
+
| [`pyproject.toml`](../pyproject.toml) | Single source of truth for the `captioning` Python package, dependency groups, and tool config (ruff/mypy/pytest/coverage). |
|
| 15 |
+
| [`requirements.txt`](../requirements.txt) | Pinned runtime deps, used directly by Docker and CI (mirrors `[project.dependencies]`). |
|
| 16 |
+
| [`requirements-dev.txt`](../requirements-dev.txt) | Pinned dev deps (lint, type-check, test, hooks). |
|
| 17 |
+
| [`requirements-eval.txt`](../requirements-eval.txt) | Pinned metric deps, kept separate to avoid bloating the serving image. |
|
| 18 |
+
| [`.python-version`](../.python-version) | Pins Python 3.10 for `pyenv` users. |
|
| 19 |
+
| [`.env.example`](../.env.example) | Schema for `pydantic-settings`-loaded env vars. |
|
| 20 |
+
| [`.pre-commit-config.yaml`](../.pre-commit-config.yaml) | Hooks: ruff, mypy, nbstripout, prettier (frontend), gitleaks. |
|
| 21 |
+
| [`Makefile`](../Makefile) | Discoverable command index (`make help`). |
|
| 22 |
+
| [`LICENSE`](../LICENSE) | MIT license, attribution to original author. |
|
| 23 |
+
| [`.gitignore`](../.gitignore) | Production-grade exclusions, organised by purpose with explanatory comments. |
|
| 24 |
+
| [`docs/restructure-plan.md`](./restructure-plan.md) | Public-facing engineering plan for Phases 0–4. |
|
| 25 |
+
|
| 26 |
+
---
|
| 27 |
+
|
| 28 |
+
## Decisions and reasoning
|
| 29 |
+
|
| 30 |
+
### 1. Why `src/` layout over flat layout?
|
| 31 |
+
|
| 32 |
+
A flat layout (`captioning/` at repo root) lets test code accidentally import
|
| 33 |
+
from the working tree instead of the *installed* package. That hides bugs that
|
| 34 |
+
would only surface in production, where the tree layout is gone. The `src/`
|
| 35 |
+
layout forces every test, every script, and every import to go through the
|
| 36 |
+
installed package — exactly the path users will follow. This is the layout
|
| 37 |
+
the [Python Packaging Authority recommends](https://packaging.python.org/en/latest/discussions/src-layout-vs-flat-layout/),
|
| 38 |
+
and it's what widely used Python projects such as Flask, pytest, and attrs use.
|
| 39 |
+
|
| 40 |
+
### 2. Why `pyproject.toml` AND `requirements.txt`?
|
| 41 |
+
|
| 42 |
+
They serve different audiences:
|
| 43 |
+
|
| 44 |
+
- **`pyproject.toml`** is the *source of truth* for the package — its name,
|
| 45 |
+
version, abstract dependency ranges, optional extras, and tool configuration.
|
| 46 |
+
When you `pip install -e .[dev]`, this is what pip reads.
|
| 47 |
+
- **`requirements.txt`** is the *concretely pinned snapshot* — used by Docker
|
| 48 |
+
builds, CI runners, and anyone who wants `pip install -r requirements.txt`
|
| 49 |
+
without installing the package itself. It's regenerable from `pyproject.toml` via
|
| 50 |
+
`pip-compile`, but committing it explicitly makes installs deterministic and
|
| 51 |
+
diffable.
|
| 52 |
+
|
| 53 |
+
Phase 5+ will switch to `pip-compile` for automated regeneration; for now,
|
| 54 |
+
manual mirroring is simpler and beginner-readable.
|
| 55 |
+
|
| 56 |
+
### 3. Why pin `tensorflow-cpu==2.15.0` so hard?
|
| 57 |
+
|
| 58 |
+
Two independent reasons stack:
|
| 59 |
+
|
| 60 |
+
1. **`tensorflow-cpu` (not `tensorflow`)**: the GPU build pulls ~600 MB of
|
| 61 |
+
CUDA libraries that are useless on CPU-only HuggingFace Spaces. Splitting
|
| 62 |
+
the wheel keeps the serving image well under 1.5 GB.
|
| 63 |
+
2. **2.15 specifically**: TF 2.16 swapped to Keras 3 by default. The IEEE
|
| 64 |
+
notebook uses `tf.keras.layers.TextVectorization` with the Keras 2
|
| 65 |
+
save/load API. Upgrading silently changes vocab serialisation, which
|
| 66 |
+
silently changes BLEU. Pinning is the difference between
|
| 67 |
+
*reproducible-published-result* and *reproducibility theatre*.
|
| 68 |
+
|
| 69 |
+
When Phase 5+ migrates to a modern multimodal backbone, this pin will move
|
| 70 |
+
in a deliberate, tested step — not by accident.
|
| 71 |
+
|
| 72 |
+
### 4. Why Ruff over Black + isort + flake8?
|
| 73 |
+
|
| 74 |
+
Ruff replaces all three with one tool that runs 10–100x faster, reads config
|
| 75 |
+
from a single section in `pyproject.toml`, and ships its own formatter
|
| 76 |
+
(`ruff format`) designed as a near-identical drop-in replacement for Black. One install, one
|
| 77 |
+
config, one cache. Recruiters reading the repo see the modern Python tool;
|
| 78 |
+
CI runs faster; `make format` is one command, not three.
|
| 79 |
+
|
| 80 |
+
### 5. Why `nbstripout` is non-negotiable in pre-commit
|
| 81 |
+
|
| 82 |
+
Notebook outputs include base64-encoded images, full DataFrames, and
|
| 83 |
+
sometimes credentials printed by accident. Committed notebook diffs without
|
| 84 |
+
output stripping are unreadable (`+aaaaaaaaaa[base64]+aaaaa…`) and
|
| 85 |
+
occasionally leak data. `nbstripout` strips all cell outputs on commit,
|
| 86 |
+
keeping notebook history clean and reviewable.
|
| 87 |
+
|
| 88 |
+
### 6. Why include a `Makefile` on a Windows project?
|
| 89 |
+
|
| 90 |
+
Three reasons:
|
| 91 |
+
|
| 92 |
+
1. **CI runs on Linux** — every CI job uses the same Make targets, so the
|
| 93 |
+
commands you run locally match what CI runs.
|
| 94 |
+
2. **Discoverability** — `make help` is one command that prints every
|
| 95 |
+
high-level operation with a one-line description. A new contributor (or
|
| 96 |
+
recruiter cloning the repo) sees the entire workflow in one screen.
|
| 97 |
+
3. **Tooling availability** — Make is a 5-second install on Windows
|
| 98 |
+
(`winget install GnuWin32.Make`, Git Bash, or WSL). PowerShell users who
|
| 99 |
+
skip Make can still read the Makefile and run the underlying commands
|
| 100 |
+
directly.
|
| 101 |
+
|
| 102 |
+
### 7. Why a `freeze-paper-notebook` Make target?
|
| 103 |
+
|
| 104 |
+
The IEEE paper points reviewers at the notebook. If the notebook drifts from
|
| 105 |
+
what the paper describes, reviewers running it will see numbers that don't
|
| 106 |
+
match the paper — and that's a scientific integrity issue, not a software
|
| 107 |
+
issue. The target hashes the notebook and asserts it matches a locked
|
| 108 |
+
SHA-256. Phase 4 wires this into CI as a required check on `main`.
|
| 109 |
+
|
| 110 |
+
### 8. Why split optional deps into `[hf]`, `[eval]`, `[mlflow]`, `[dev]`?
|
| 111 |
+
|
| 112 |
+
The slim production image (`backend:latest`) does NOT need transformers,
|
| 113 |
+
torch, pycocoevalcap, or MLflow. Bundling them adds ~1.5 GB of dependencies
|
| 114 |
+
the production code never imports. Extras let `pip install -e ".[hf]"` add
|
| 115 |
+
the HuggingFace baselines for the Phase 3 comparison demo, while
|
| 116 |
+
`pip install -r requirements.txt` keeps the production install lean.
|
| 117 |
+
|
| 118 |
+
### 9. Why MIT license?
|
| 119 |
+
|
| 120 |
+
The IEEE paper is published under IEEE's standard terms; the *code* is
|
| 121 |
+
covered separately. MIT is one of the most permissive widely recognised licenses —
|
| 122 |
+
it lets recruiters, students, and other researchers freely fork, learn from,
|
| 123 |
+
and extend the code. For a recruiter-grade portfolio project, permissive
|
| 124 |
+
licensing signals "I want this work to be useful," which is the right tone.
|
| 125 |
+
|
| 126 |
+
### 10. Why folder name `configs/` (plural), not `config/` (singular)?
|
| 127 |
+
|
| 128 |
+
`config/` was the empty folder shipped with the template. The plural form
|
| 129 |
+
`configs/` is the convention in modern Python ML projects (FastAPI's own
|
| 130 |
+
example apps, Hydra projects, the official `transformers` repo) because
|
| 131 |
+
it holds multiple files (one per environment, model variant, or run).
|
| 132 |
+
Phase 1 creates `configs/` with content; the empty `config/` folder will
|
| 133 |
+
be removed in the Phase 1 commit that introduces the YAML files.
|
| 134 |
+
|
| 135 |
+
---
|
| 136 |
+
|
| 137 |
+
## What this phase deliberately does NOT do
|
| 138 |
+
|
| 139 |
+
- **No code is moved out of the notebook yet.** That's Phase 1, behind a
|
| 140 |
+
parity validation gate.
|
| 141 |
+
- **No `src/captioning/` modules are created.** Empty `__init__.py` files
|
| 142 |
+
would just be churn; Phase 1 will create them with real code.
|
| 143 |
+
- **No Dockerfile or docker-compose.yml.** They depend on `backend/app/`
|
| 144 |
+
existing; both arrive in Phase 1.
|
| 145 |
+
- **No GitHub Actions workflows.** They live in Phase 2, after there is
|
| 146 |
+
Python code to lint and type-check.
|
| 147 |
+
- **No README rewrite.** The current README accurately describes the
|
| 148 |
+
research; the demo-link rewrite happens in Phase 2 once a live URL exists.
|
| 149 |
+
|
| 150 |
+
This restraint is deliberate. Each phase ships a coherent slice of value;
|
| 151 |
+
running ahead would create half-built features and vague commits.
|
| 152 |
+
|
| 153 |
+
---
|
| 154 |
+
|
| 155 |
+
## Local setup checklist for the developer
|
| 156 |
+
|
| 157 |
+
After pulling this commit, on a fresh dev box:
|
| 158 |
+
|
| 159 |
+
```bash
|
| 160 |
+
# 1. Create a Python 3.10 virtual environment.
|
| 161 |
+
python -m venv .venv
|
| 162 |
+
.venv\Scripts\activate # PowerShell
|
| 163 |
+
# source .venv/bin/activate # Linux/macOS
|
| 164 |
+
|
| 165 |
+
# 2. Install dev dependencies + the package (editable).
|
| 166 |
+
make install-dev
|
| 167 |
+
# Or, without Make:
|
| 168 |
+
# pip install -r requirements-dev.txt -r requirements-eval.txt
|
| 169 |
+
# pip install -e ".[hf,mlflow]"
|
| 170 |
+
|
| 171 |
+
# 3. Register pre-commit hooks.
|
| 172 |
+
make install-hooks
|
| 173 |
+
# Or: pre-commit install
|
| 174 |
+
|
| 175 |
+
# 4. (Optional) Lock the paper notebook's hash, so CI can enforce parity.
|
| 176 |
+
make lock-paper-notebook
|
| 177 |
+
|
| 178 |
+
# 5. Verify everything works.
|
| 179 |
+
make pre-commit # Run all hooks against all files
|
| 180 |
+
make test # No tests yet — pytest reports "no tests collected" (exit code 5, so this fails unless the Make target tolerates it)
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
The first `make install-dev` will take a few minutes (TensorFlow is large).
|
| 184 |
+
Subsequent runs hit the wheel cache and complete in seconds.
|
docs/restructure-plan.md
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Production Restructuring Plan
|
| 2 |
+
|
| 3 |
+
> Public, in-repo copy of the engineering plan that drives the transition from
|
| 4 |
+
> a single-notebook research project into a deployable multimodal AI platform.
|
| 5 |
+
> The original (with internal exploration notes) lives in the developer's
|
| 6 |
+
> `~/.claude/plans/` directory; this version is the canonical public artefact.
|
| 7 |
+
|
| 8 |
+
## Context
|
| 9 |
+
|
| 10 |
+
This repository is the engineering home of an IEEE-published image-captioning
|
| 11 |
+
research project. The published artefact is a single Jupyter notebook
|
| 12 |
+
([`notebooks/01_ieee_inceptionv3_transformer.ipynb`](../notebooks/01_ieee_inceptionv3_transformer.ipynb))
|
| 13 |
+
implementing **InceptionV3 (frozen) + custom Keras Transformer decoder**
|
| 14 |
+
trained on **COCO 2017**, reporting **BLEU ~24**.
|
| 15 |
+
|
| 16 |
+
**Goal**: convert the repo into a recruiter-grade, production-style
|
| 17 |
+
multimodal AI platform with a live free-tier demo, while **preserving the
|
| 18 |
+
IEEE notebook byte-for-byte** as the canonical research artefact.
|
| 19 |
+
|
| 20 |
+
**Constraints**:
|
| 21 |
+
|
| 22 |
+
- Hosting budget: **$0/month** → HuggingFace Spaces (backend) + Vercel free
|
| 23 |
+
(frontend) + HuggingFace Hub (model artefacts) + DagsHub free MLflow.
|
| 24 |
+
- Multimodal scope (v1): **Tier 1 only** — add three pretrained HuggingFace
|
| 25 |
+
models (BLIP-base, ViT-GPT2, GIT-base-coco) for a side-by-side comparison
|
| 26 |
+
demo. Tier 2/3/4 are listed under *Future work* only.
|
| 27 |
+
|
| 28 |
+
---
|
| 29 |
+
|
| 30 |
+
## 1. Folder Structure (target)
|
| 31 |
+
|
| 32 |
+
```
|
| 33 |
+
image-captioning-system/
|
| 34 |
+
├── notebooks/
|
| 35 |
+
│ └── 01_ieee_inceptionv3_transformer.ipynb # FROZEN
|
| 36 |
+
├── src/captioning/ # Installable Python package
|
| 37 |
+
│ ├── config/ # Pydantic settings + YAML loader
|
| 38 |
+
│ ├── data/ # COCO loaders, preprocess, splits
|
| 39 |
+
│ ├── tokenizer/ # CaptionTokenizer (Keras TextVectorization wrapper)
|
| 40 |
+
│ ├── models/ # CNN encoder, Transformer decoder, factory
|
| 41 |
+
│ ├── training/ # Trainer, losses, metrics, callbacks
|
| 42 |
+
│ ├── inference/ # Greedy + beam search predictors
|
| 43 |
+
│ ├── evaluation/ # BLEU, CIDEr, METEOR, ROUGE
|
| 44 |
+
│ ├── io/ # Checkpoints, image decoding, HF Hub I/O
|
| 45 |
+
│ └── utils/ # Logging, seeding, timing
|
| 46 |
+
├── configs/ # YAML hyperparameters (validated by Pydantic)
|
| 47 |
+
├── scripts/ # CLI entrypoints (train, eval, predict, upload)
|
| 48 |
+
├── models/ # Local checkpoint registry (gitignored content)
|
| 49 |
+
├── backend/ # FastAPI service (depends on src/captioning)
|
| 50 |
+
├── frontend/ # Next.js 14 + TypeScript + Tailwind + shadcn/ui
|
| 51 |
+
├── tests/ # ML-core tests (unit + integration)
|
| 52 |
+
├── docs/ # Architecture, ADRs, results, deployment
|
| 53 |
+
├── .github/workflows/ # CI, CD, model-eval
|
| 54 |
+
├── docker-compose.yml # Local dev: backend + frontend + mlflow
|
| 55 |
+
├── pyproject.toml # Single source of truth for the package
|
| 56 |
+
└── Makefile # Discoverable command index
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
**Key architectural rules**:
|
| 60 |
+
|
| 61 |
+
- `src/captioning/` is the ML core; `backend/app/` imports from it. Never
|
| 62 |
+
reverse the dependency.
|
| 63 |
+
- The IEEE notebook is **frozen** — `make freeze-paper-notebook` is a CI
|
| 64 |
+
check that fails on any byte change.
|
| 65 |
+
- Model weights are **never committed**; they live in HuggingFace Hub
|
| 66 |
+
(`yourname/captioning-weights`) and are downloaded at backend startup.
|
| 67 |
+
- Configuration is **YAML files validated by Pydantic v2 BaseSettings**, not
|
| 68 |
+
Hydra. Env vars override via `CAPTIONING__TRAIN__BATCH_SIZE=32` syntax.
|
| 69 |
+
|
| 70 |
+
---
|
| 71 |
+
|
| 72 |
+
## 2. Migration Strategy
|
| 73 |
+
|
| 74 |
+
**Approach: verbatim refactor first, improvements second.** Reproducibility
|
| 75 |
+
of the IEEE BLEU score is non-negotiable; behaviour parity must be proven
|
| 76 |
+
*before* any improvement is made.
|
| 77 |
+
|
| 78 |
+
### Phase 1a — "Lift and shift" (parity goal: BLEU within ±0.3 of notebook)
|
| 79 |
+
|
| 80 |
+
| Step | Notebook cell | Target module |
|
| 81 |
+
|---|---|---|
|
| 82 |
+
| 1 | Hyperparams | `configs/base.yaml` + `src/captioning/config/schema.py` |
|
| 83 |
+
| 2 | Caption preprocess | `data/preprocess.py::preprocess_caption` |
|
| 84 |
+
| 3 | COCO loader | `data/coco.py::load_coco_annotations` |
|
| 85 |
+
| 4 | Tokenizer | `tokenizer/vectorizer.py::CaptionTokenizer` |
|
| 86 |
+
| 5 | Splits | `data/splits.py::make_splits(seed=...)` |
|
| 87 |
+
| 6 | Image preprocess | `data/preprocess.py::preprocess_image` |
|
| 88 |
+
| 7 | tf.data pipeline | `data/pipeline.py::build_{train,val}_pipeline` |
|
| 89 |
+
| 8 | Augmentation | `data/augmentation.py::default_augmentation` |
|
| 90 |
+
| 9 | InceptionV3 encoder | `models/encoder_cnn.py` |
|
| 91 |
+
| 10 | Transformer encoder | `models/transformer_encoder.py` |
|
| 92 |
+
| 11 | Embeddings | `models/embeddings.py` |
|
| 93 |
+
| 12 | Transformer decoder | `models/transformer_decoder.py` |
|
| 94 |
+
| 13 | Captioning model | `models/captioning_model.py` |
|
| 95 |
+
| 14 | Wiring | `models/factory.py::build_caption_model(config)` |
|
| 96 |
+
| 15 | Loss + compile | `training/losses.py` + `training/trainer.py` |
|
| 97 |
+
| 16 | Fit | `training/trainer.py::Trainer.fit` |
|
| 98 |
+
| 17 | Inference | `inference/greedy.py`, `inference/predictor.py` |
|
| 99 |
+
| 18 | Save weights | `io/checkpoints.py` + `scripts/train.py` |
|
| 100 |
+
|
| 101 |
+
### Parity validation gate
|
| 102 |
+
|
| 103 |
+
`scripts/notebook_module_audit.py` runs both pipelines on a fixed 100-image
|
| 104 |
+
fixture and asserts:
|
| 105 |
+
|
| 106 |
+
- Tokenizer vocabulary identical (set equality).
|
| 107 |
+
- Image preprocessing tensor-equal (`np.allclose`, atol=1e-5).
|
| 108 |
+
- Model output logits equal at fixed weights (atol=1e-4).
|
| 109 |
+
- Captions on 20 fixed images byte-equal between notebook and module path.
|
| 110 |
+
|
| 111 |
+
### Phase 1b — Quality improvements (only after parity is green)
|
| 112 |
+
|
| 113 |
+
1. Masked accuracy metric (notebook tracks loss only).
|
| 114 |
+
2. Beam search inference.
|
| 115 |
+
3. Warmup + cosine LR schedule (replaces bare Adam).
|
| 116 |
+
4. CIDEr / METEOR / ROUGE-L (paper reports BLEU only).
|
| 117 |
+
5. `vocab.json` sidecar alongside `vocab.pkl`.
|
| 118 |
+
6. Label smoothing.
|
| 119 |
+
|
| 120 |
+
---
|
| 121 |
+
|
| 122 |
+
## 3. Implementation Roadmap
|
| 123 |
+
|
| 124 |
+
| Phase | Deliverable | Effort | Recruiter signal |
|
| 125 |
+
|---|---|---|---|
|
| 126 |
+
| **0** | Repo bootstrap (this phase) | 3 hrs | Clean repo, lint passes from commit 1 |
|
| 127 |
+
| **1** | Modular ML core + backend MVP | ~15 hrs | Working FastAPI for the IEEE model, runnable via `docker compose up` |
|
| 128 |
+
| **2** | CI/CD + first deploy (HF Space + Vercel) | ~12 hrs | Live demo URL on LinkedIn |
|
| 129 |
+
| **3** | Tier 1 multimodal: BLIP/ViT-GPT2/GIT comparison demo | ~20 hrs | The screenshot recruiters share |
|
| 130 |
+
| **4** | Polish + observability (Sentry, Prometheus, ADRs) | ~8 hrs | Reads as production-grade, not a research one-off |
|
| 131 |
+
|
| 132 |
+
### Future work (out of scope for v1)
|
| 133 |
+
|
| 134 |
+
- **Tier 2**: ViT + Transformer fine-tune on COCO via Kaggle GPU (BLEU 24 → 32+).
|
| 135 |
+
- **Tier 3**: Anthropic Claude vision endpoint as a "Frontier" tab.
|
| 136 |
+
- **Tier 4**: VQA "Ask the image" extension reusing Tier 3 infra.
|
| 137 |
+
- Self-hosted compose on a VPS with Caddy TLS and DVC dataset versioning.
|
| 138 |
+
|
| 139 |
+
---
|
| 140 |
+
|
| 141 |
+
## 4. Deployment Stack (free-tier)
|
| 142 |
+
|
| 143 |
+
| Layer | Service | Why |
|
| 144 |
+
|---|---|---|
|
| 145 |
+
| Backend hosting | HuggingFace Spaces (Docker SDK, free CPU) | 16 GB RAM, ML-native, recruiter-clickable |
|
| 146 |
+
| Frontend hosting | Vercel free | Next.js native; per-PR preview URLs |
|
| 147 |
+
| Model artefacts | HuggingFace Hub | Free, unlimited public, versioned, model cards |
|
| 148 |
+
| Experiment tracking | MLflow on DagsHub free | Public read-only tracking server |
|
| 149 |
+
| Errors | Sentry free (5k errors/mo) | |
|
| 150 |
+
| Uptime | UptimeRobot free | Doubles as HF Space wake-up keeper |
|
| 151 |
+
| Domain | None (use `*.hf.space` and `*.vercel.app`) | $0 budget |
|
| 152 |
+
|
| 153 |
+
---
|
| 154 |
+
|
| 155 |
+
## 5. Trade-offs Decided
|
| 156 |
+
|
| 157 |
+
| Decision | Alternative rejected | Reason |
|
| 158 |
+
|---|---|---|
|
| 159 |
+
| FastAPI | Flask | Async, OpenAPI, Pydantic, lifespan |
|
| 160 |
+
| Next.js 14 App Router | Streamlit | Streamlit screams "research demo" |
|
| 161 |
+
| TanStack Query | Redux | Server state belongs in a server-state lib |
|
| 162 |
+
| YAML + Pydantic | Hydra | Hydra is overkill for 1–3 active configs |
|
| 163 |
+
| MLflow on DagsHub | W&B | DagsHub public free; no recruiter login |
|
| 164 |
+
| Keep TextVectorization | HF tokenizer in v1 | Changes vocab → breaks paper parity |
|
| 165 |
+
| Verbatim refactor first | Clean rewrite | IEEE BLEU reproducibility non-negotiable |
|
| 166 |
+
| `tensorflow-cpu==2.15.0` pinned | Floating TF | TF 2.16 broke Keras 2 compat with notebook |
|
| 167 |
+
| HF Spaces backend | Fly.io paid | Free-tier-only constraint |
|
| 168 |
+
| Multipart uploads | Base64 in JSON | Base64 adds ~33% payload overhead and cannot be streamed |
|
| 169 |
+
| `--workers 1` uvicorn | Multi-worker | TF graph + InceptionV3 ×N OOMs |
|
| 170 |
+
| Tier 1 only (HF baselines) | Tier 2/3/4 in v1 | User selected Tier 1; others as future work |
|
| 171 |
+
|
| 172 |
+
---
|
| 173 |
+
|
| 174 |
+
## 6. Verification Plan
|
| 175 |
+
|
| 176 |
+
**Phase 1**:
|
| 177 |
+
|
| 178 |
+
- `pytest tests/ -v` → all green; coverage ≥ 70% on `src/captioning/`.
|
| 179 |
+
- `python scripts/notebook_module_audit.py` → parity assertions all pass.
|
| 180 |
+
- `docker compose up` → `curl -F "file=@sample.jpg" http://localhost:8000/v1/captions`
|
| 181 |
+
returns valid caption JSON.
|
| 182 |
+
|
| 183 |
+
**Phase 2**:
|
| 184 |
+
|
| 185 |
+
- GitHub Actions `ci.yml` green on a PR.
|
| 186 |
+
- HF Space URL serves `/v1/model/info`.
|
| 187 |
+
- Vercel preview URL renders frontend; uploading a sample image returns a caption.
|
| 188 |
+
|
| 189 |
+
**Phase 3**:
|
| 190 |
+
|
| 191 |
+
- `GET /v1/models` returns 4 entries.
|
| 192 |
+
- `POST /v1/compare` returns 4 captions; total latency < 15s on HF Space CPU.
|
| 193 |
+
- `model-eval.yml` posts a BLEU comparison comment on a test PR.
|
| 194 |
+
|
| 195 |
+
**Phase 4**:
|
| 196 |
+
|
| 197 |
+
- `/metrics` exposes `caption_inference_seconds` histogram.
|
| 198 |
+
- DagsHub MLflow link shows ≥ 1 logged run with metrics.
|
| 199 |
+
- `make freeze-paper-notebook` fails when notebook bytes change; passes when restored.
|
notebooks/01_ieee_inceptionv3_transformer.ipynb
ADDED
|
@@ -0,0 +1,786 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"metadata": {
|
| 7 |
+
"id": "AldVDvOgcpbc"
|
| 8 |
+
},
|
| 9 |
+
"outputs": [],
|
| 10 |
+
"source": [
|
| 11 |
+
"import tensorflow as tf\n",
|
| 12 |
+
"import os\n",
|
| 13 |
+
"import json\n",
|
| 14 |
+
"import pandas as pd\n",
|
| 15 |
+
"import re\n",
|
| 16 |
+
"import numpy as np\n",
|
| 17 |
+
"import time\n",
|
| 18 |
+
"import matplotlib.pyplot as plt\n",
|
| 19 |
+
"import collections\n",
|
| 20 |
+
"import random\n",
|
| 21 |
+
"import requests\n",
|
| 22 |
+
"import json\n",
|
| 23 |
+
"from math import sqrt\n",
|
| 24 |
+
"from PIL import Image\n",
|
| 25 |
+
"from tqdm.auto import tqdm"
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"cell_type": "code",
|
| 30 |
+
"execution_count": null,
|
| 31 |
+
"metadata": {},
|
| 32 |
+
"outputs": [],
|
| 33 |
+
"source": [
|
| 34 |
+
"BASE_PATH = '../input/coco-2017-dataset/coco2017'"
|
| 35 |
+
]
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
"cell_type": "code",
|
| 39 |
+
"execution_count": null,
|
| 40 |
+
"metadata": {},
|
| 41 |
+
"outputs": [],
|
| 42 |
+
"source": [
|
| 43 |
+
"with open(f'{BASE_PATH}/annotations/captions_train2017.json', 'r') as f:\n",
|
| 44 |
+
" data = json.load(f)\n",
|
| 45 |
+
" data = data['annotations']\n",
|
| 46 |
+
"\n",
|
| 47 |
+
"img_cap_pairs = []\n",
|
| 48 |
+
"\n",
|
| 49 |
+
"for sample in data:\n",
|
| 50 |
+
" img_name = '%012d.jpg' % sample['image_id']\n",
|
| 51 |
+
" img_cap_pairs.append([img_name, sample['caption']])\n",
|
| 52 |
+
"\n",
|
| 53 |
+
"captions = pd.DataFrame(img_cap_pairs, columns=['image', 'caption'])\n",
|
| 54 |
+
"captions['image'] = captions['image'].apply(\n",
|
| 55 |
+
" lambda x: f'{BASE_PATH}/train2017/{x}'\n",
|
| 56 |
+
")\n",
|
| 57 |
+
"captions = captions.sample(120000)\n",
|
| 58 |
+
"captions = captions.reset_index(drop=True)\n",
|
| 59 |
+
"captions.head()"
|
| 60 |
+
]
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"cell_type": "code",
|
| 64 |
+
"execution_count": null,
|
| 65 |
+
"metadata": {
|
| 66 |
+
"id": "rWbe_xuhFaJp"
|
| 67 |
+
},
|
| 68 |
+
"outputs": [],
|
| 69 |
+
"source": [
|
| 70 |
+
"def preprocess(text):\n",
|
| 71 |
+
" text = text.lower()\n",
|
| 72 |
+
" text = re.sub(r'[^\\w\\s]', '', text)\n",
|
| 73 |
+
" text = re.sub('\\s+', ' ', text)\n",
|
| 74 |
+
" text = text.strip()\n",
|
| 75 |
+
" text = '[start] ' + text + ' [end]'\n",
|
| 76 |
+
" return text"
|
| 77 |
+
]
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"cell_type": "code",
|
| 81 |
+
"execution_count": null,
|
| 82 |
+
"metadata": {
|
| 83 |
+
"id": "v_ouwWhKnEy5",
|
| 84 |
+
"outputId": "d190c744-d31e-430b-ed85-eb0295010c1d"
|
| 85 |
+
},
|
| 86 |
+
"outputs": [],
|
| 87 |
+
"source": [
|
| 88 |
+
"captions['caption'] = captions['caption'].apply(preprocess)\n",
|
| 89 |
+
"captions.head()"
|
| 90 |
+
]
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"cell_type": "code",
|
| 94 |
+
"execution_count": null,
|
| 95 |
+
"metadata": {
|
| 96 |
+
"id": "6RBuExHWnGEt",
|
| 97 |
+
"outputId": "0242452f-4d17-4af6-a9bb-3bea7b09568e"
|
| 98 |
+
},
|
| 99 |
+
"outputs": [],
|
| 100 |
+
"source": [
|
| 101 |
+
"random_row = captions.sample(1).iloc[0]\n",
|
| 102 |
+
"print(random_row.caption)\n",
|
| 103 |
+
"print()\n",
|
| 104 |
+
"im = Image.open(random_row.image)\n",
|
| 105 |
+
"im"
|
| 106 |
+
]
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"cell_type": "code",
|
| 110 |
+
"execution_count": null,
|
| 111 |
+
"metadata": {
|
| 112 |
+
"id": "nSTivH_FSSf2"
|
| 113 |
+
},
|
| 114 |
+
"outputs": [],
|
| 115 |
+
"source": [
|
| 116 |
+
"MAX_LENGTH = 40\n",
|
| 117 |
+
"VOCABULARY_SIZE = 15000\n",
|
| 118 |
+
"BATCH_SIZE = 64\n",
|
| 119 |
+
"BUFFER_SIZE = 1000\n",
|
| 120 |
+
"EMBEDDING_DIM = 512\n",
|
| 121 |
+
"UNITS = 512\n",
|
| 122 |
+
"EPOCHS = 10"
|
| 123 |
+
]
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"cell_type": "code",
|
| 127 |
+
"execution_count": null,
|
| 128 |
+
"metadata": {
|
| 129 |
+
"id": "X8MGUNtBN2sz"
|
| 130 |
+
},
|
| 131 |
+
"outputs": [],
|
| 132 |
+
"source": [
|
| 133 |
+
"tokenizer = tf.keras.layers.TextVectorization(\n",
|
| 134 |
+
" max_tokens=VOCABULARY_SIZE,\n",
|
| 135 |
+
" standardize=None,\n",
|
| 136 |
+
" output_sequence_length=MAX_LENGTH)\n",
|
| 137 |
+
"\n",
|
| 138 |
+
"tokenizer.adapt(captions['caption'])"
|
| 139 |
+
]
|
| 140 |
+
},
|
| 141 |
+
{
|
| 142 |
+
"cell_type": "code",
|
| 143 |
+
"execution_count": null,
|
| 144 |
+
"metadata": {},
|
| 145 |
+
"outputs": [],
|
| 146 |
+
"source": [
|
| 147 |
+
"tokenizer.vocabulary_size()"
|
| 148 |
+
]
|
| 149 |
+
},
|
| 150 |
+
{
|
| 151 |
+
"cell_type": "code",
|
| 152 |
+
"execution_count": null,
|
| 153 |
+
"metadata": {},
|
| 154 |
+
"outputs": [],
|
| 155 |
+
"source": [
|
| 156 |
+
"import pickle\n",
|
| 157 |
+
"\n",
|
| 158 |
+
"pickle.dump(tokenizer.get_vocabulary(), open('vocab_coco.file', 'wb'))"
|
| 159 |
+
]
|
| 160 |
+
},
|
| 161 |
+
{
|
| 162 |
+
"cell_type": "code",
|
| 163 |
+
"execution_count": null,
|
| 164 |
+
"metadata": {
|
| 165 |
+
"id": "qvhg-6eKN3nz"
|
| 166 |
+
},
|
| 167 |
+
"outputs": [],
|
| 168 |
+
"source": [
|
| 169 |
+
"word2idx = tf.keras.layers.StringLookup(\n",
|
| 170 |
+
" mask_token=\"\",\n",
|
| 171 |
+
" vocabulary=tokenizer.get_vocabulary())\n",
|
| 172 |
+
"\n",
|
| 173 |
+
"idx2word = tf.keras.layers.StringLookup(\n",
|
| 174 |
+
" mask_token=\"\",\n",
|
| 175 |
+
" vocabulary=tokenizer.get_vocabulary(),\n",
|
| 176 |
+
" invert=True)"
|
| 177 |
+
]
|
| 178 |
+
},
|
| 179 |
+
{
|
| 180 |
+
"cell_type": "code",
|
| 181 |
+
"execution_count": null,
|
| 182 |
+
"metadata": {
|
| 183 |
+
"id": "Yrca2aN2N5WL"
|
| 184 |
+
},
|
| 185 |
+
"outputs": [],
|
| 186 |
+
"source": [
|
| 187 |
+
"img_to_cap_vector = collections.defaultdict(list)\n",
|
| 188 |
+
"for img, cap in zip(captions['image'], captions['caption']):\n",
|
| 189 |
+
" img_to_cap_vector[img].append(cap)\n",
|
| 190 |
+
"\n",
|
| 191 |
+
"img_keys = list(img_to_cap_vector.keys())\n",
|
| 192 |
+
"random.shuffle(img_keys)\n",
|
| 193 |
+
"\n",
|
| 194 |
+
"slice_index = int(len(img_keys)*0.8)\n",
|
| 195 |
+
"img_name_train_keys, img_name_val_keys = (img_keys[:slice_index], \n",
|
| 196 |
+
" img_keys[slice_index:])\n",
|
| 197 |
+
"\n",
|
| 198 |
+
"train_imgs = []\n",
|
| 199 |
+
"train_captions = []\n",
|
| 200 |
+
"for imgt in img_name_train_keys:\n",
|
| 201 |
+
" capt_len = len(img_to_cap_vector[imgt])\n",
|
| 202 |
+
" train_imgs.extend([imgt] * capt_len)\n",
|
| 203 |
+
" train_captions.extend(img_to_cap_vector[imgt])\n",
|
| 204 |
+
"\n",
|
| 205 |
+
"val_imgs = []\n",
|
| 206 |
+
"val_captions = []\n",
|
| 207 |
+
"for imgv in img_name_val_keys:\n",
|
| 208 |
+
" capv_len = len(img_to_cap_vector[imgv])\n",
|
| 209 |
+
" val_imgs.extend([imgv] * capv_len)\n",
|
| 210 |
+
" val_captions.extend(img_to_cap_vector[imgv])"
|
| 211 |
+
]
|
| 212 |
+
},
|
| 213 |
+
{
|
| 214 |
+
"cell_type": "code",
|
| 215 |
+
"execution_count": null,
|
| 216 |
+
"metadata": {
|
| 217 |
+
"id": "UHN3Q1YDN5TD",
|
| 218 |
+
"outputId": "0b0af2ea-f6d7-48c9-ba30-14d8d9c98418"
|
| 219 |
+
},
|
| 220 |
+
"outputs": [],
|
| 221 |
+
"source": [
|
| 222 |
+
"len(train_imgs), len(train_captions), len(val_imgs), len(val_captions)"
|
| 223 |
+
]
|
| 224 |
+
},
|
| 225 |
+
{
|
| 226 |
+
"cell_type": "code",
|
| 227 |
+
"execution_count": null,
|
| 228 |
+
"metadata": {
|
| 229 |
+
"id": "12c-7FHzOFSq"
|
| 230 |
+
},
|
| 231 |
+
"outputs": [],
|
| 232 |
+
"source": [
|
| 233 |
+
"def load_data(img_path, caption):\n",
|
| 234 |
+
" img = tf.io.read_file(img_path)\n",
|
| 235 |
+
" img = tf.io.decode_jpeg(img, channels=3)\n",
|
| 236 |
+
" img = tf.keras.layers.Resizing(299, 299)(img)\n",
|
| 237 |
+
" img = tf.keras.applications.inception_v3.preprocess_input(img)\n",
|
| 238 |
+
" caption = tokenizer(caption)\n",
|
| 239 |
+
" return img, caption"
|
| 240 |
+
]
|
| 241 |
+
},
|
| 242 |
+
{
|
| 243 |
+
"cell_type": "code",
|
| 244 |
+
"execution_count": null,
|
| 245 |
+
"metadata": {
|
| 246 |
+
"id": "vHk83y3eOFPz"
|
| 247 |
+
},
|
| 248 |
+
"outputs": [],
|
| 249 |
+
"source": [
|
| 250 |
+
"train_dataset = tf.data.Dataset.from_tensor_slices(\n",
|
| 251 |
+
" (train_imgs, train_captions))\n",
|
| 252 |
+
"\n",
|
| 253 |
+
"train_dataset = train_dataset.map(\n",
|
| 254 |
+
" load_data, num_parallel_calls=tf.data.AUTOTUNE\n",
|
| 255 |
+
" ).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)\n",
|
| 256 |
+
"\n",
|
| 257 |
+
"val_dataset = tf.data.Dataset.from_tensor_slices(\n",
|
| 258 |
+
" (val_imgs, val_captions))\n",
|
| 259 |
+
"\n",
|
| 260 |
+
"val_dataset = val_dataset.map(\n",
|
| 261 |
+
" load_data, num_parallel_calls=tf.data.AUTOTUNE\n",
|
| 262 |
+
" ).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)"
|
| 263 |
+
]
|
| 264 |
+
},
|
| 265 |
+
{
|
| 266 |
+
"cell_type": "code",
|
| 267 |
+
"execution_count": null,
|
| 268 |
+
"metadata": {
|
| 269 |
+
"id": "bQr_bgk11eMF"
|
| 270 |
+
},
|
| 271 |
+
"outputs": [],
|
| 272 |
+
"source": [
|
| 273 |
+
"image_augmentation = tf.keras.Sequential(\n",
|
| 274 |
+
" [\n",
|
| 275 |
+
" tf.keras.layers.RandomFlip(\"horizontal\"),\n",
|
| 276 |
+
" tf.keras.layers.RandomRotation(0.2),\n",
|
| 277 |
+
" tf.keras.layers.RandomContrast(0.3),\n",
|
| 278 |
+
" ]\n",
|
| 279 |
+
")"
|
| 280 |
+
]
|
| 281 |
+
},
|
| 282 |
+
{
|
| 283 |
+
"cell_type": "code",
|
| 284 |
+
"execution_count": null,
|
| 285 |
+
"metadata": {
|
| 286 |
+
"id": "H9GDJ9_1nIMO"
|
| 287 |
+
},
|
| 288 |
+
"outputs": [],
|
| 289 |
+
"source": [
|
| 290 |
+
"def CNN_Encoder():\n",
|
| 291 |
+
" inception_v3 = tf.keras.applications.InceptionV3(\n",
|
| 292 |
+
" include_top=False,\n",
|
| 293 |
+
" weights='imagenet'\n",
|
| 294 |
+
" )\n",
|
| 295 |
+
"\n",
|
| 296 |
+
" output = inception_v3.output\n",
|
| 297 |
+
" output = tf.keras.layers.Reshape(\n",
|
| 298 |
+
" (-1, output.shape[-1]))(output)\n",
|
| 299 |
+
"\n",
|
| 300 |
+
" cnn_model = tf.keras.models.Model(inception_v3.input, output)\n",
|
| 301 |
+
" return cnn_model"
|
| 302 |
+
]
|
| 303 |
+
},
|
| 304 |
+
{
|
| 305 |
+
"cell_type": "code",
|
| 306 |
+
"execution_count": null,
|
| 307 |
+
"metadata": {
|
| 308 |
+
"id": "jMy5MrE2PdHV"
|
| 309 |
+
},
|
| 310 |
+
"outputs": [],
|
| 311 |
+
"source": [
|
| 312 |
+
"class TransformerEncoderLayer(tf.keras.layers.Layer):\n",
|
| 313 |
+
"\n",
|
| 314 |
+
" def __init__(self, embed_dim, num_heads):\n",
|
| 315 |
+
" super().__init__()\n",
|
| 316 |
+
" self.layer_norm_1 = tf.keras.layers.LayerNormalization()\n",
|
| 317 |
+
" self.layer_norm_2 = tf.keras.layers.LayerNormalization()\n",
|
| 318 |
+
" self.attention = tf.keras.layers.MultiHeadAttention(\n",
|
| 319 |
+
" num_heads=num_heads, key_dim=embed_dim)\n",
|
| 320 |
+
" self.dense = tf.keras.layers.Dense(embed_dim, activation=\"relu\")\n",
|
| 321 |
+
" \n",
|
| 322 |
+
"\n",
|
| 323 |
+
" def call(self, x, training):\n",
|
| 324 |
+
" x = self.layer_norm_1(x)\n",
|
| 325 |
+
" x = self.dense(x)\n",
|
| 326 |
+
"\n",
|
| 327 |
+
" attn_output = self.attention(\n",
|
| 328 |
+
" query=x,\n",
|
| 329 |
+
" value=x,\n",
|
| 330 |
+
" key=x,\n",
|
| 331 |
+
" attention_mask=None,\n",
|
| 332 |
+
" training=training\n",
|
| 333 |
+
" )\n",
|
| 334 |
+
"\n",
|
| 335 |
+
" x = self.layer_norm_2(x + attn_output)\n",
|
| 336 |
+
" return x"
|
| 337 |
+
]
|
| 338 |
+
},
|
| 339 |
+
{
|
| 340 |
+
"cell_type": "code",
|
| 341 |
+
"execution_count": null,
|
| 342 |
+
"metadata": {
|
| 343 |
+
"id": "MFqNFts0duGB"
|
| 344 |
+
},
|
| 345 |
+
"outputs": [],
|
| 346 |
+
"source": [
|
| 347 |
+
"class Embeddings(tf.keras.layers.Layer):\n",
|
| 348 |
+
"\n",
|
| 349 |
+
" def __init__(self, vocab_size, embed_dim, max_len):\n",
|
| 350 |
+
" super().__init__()\n",
|
| 351 |
+
" self.token_embeddings = tf.keras.layers.Embedding(\n",
|
| 352 |
+
" vocab_size, embed_dim)\n",
|
| 353 |
+
" self.position_embeddings = tf.keras.layers.Embedding(\n",
|
| 354 |
+
" max_len, embed_dim, input_shape=(None, max_len))\n",
|
| 355 |
+
" \n",
|
| 356 |
+
"\n",
|
| 357 |
+
" def call(self, input_ids):\n",
|
| 358 |
+
" length = tf.shape(input_ids)[-1]\n",
|
| 359 |
+
" position_ids = tf.range(start=0, limit=length, delta=1)\n",
|
| 360 |
+
" position_ids = tf.expand_dims(position_ids, axis=0)\n",
|
| 361 |
+
"\n",
|
| 362 |
+
" token_embeddings = self.token_embeddings(input_ids)\n",
|
| 363 |
+
" position_embeddings = self.position_embeddings(position_ids)\n",
|
| 364 |
+
"\n",
|
| 365 |
+
" return token_embeddings + position_embeddings"
|
| 366 |
+
]
|
| 367 |
+
},
|
| 368 |
+
{
|
| 369 |
+
"cell_type": "code",
|
| 370 |
+
"execution_count": null,
|
| 371 |
+
"metadata": {
|
| 372 |
+
"id": "pcbCQqrDnJ4-"
|
| 373 |
+
},
|
| 374 |
+
"outputs": [],
|
| 375 |
+
"source": [
|
| 376 |
+
"class TransformerDecoderLayer(tf.keras.layers.Layer):\n",
|
| 377 |
+
"\n",
|
| 378 |
+
" def __init__(self, embed_dim, units, num_heads):\n",
|
| 379 |
+
" super().__init__()\n",
|
| 380 |
+
" self.embedding = Embeddings(\n",
|
| 381 |
+
" tokenizer.vocabulary_size(), embed_dim, MAX_LENGTH)\n",
|
| 382 |
+
"\n",
|
| 383 |
+
" self.attention_1 = tf.keras.layers.MultiHeadAttention(\n",
|
| 384 |
+
" num_heads=num_heads, key_dim=embed_dim, dropout=0.1\n",
|
| 385 |
+
" )\n",
|
| 386 |
+
" self.attention_2 = tf.keras.layers.MultiHeadAttention(\n",
|
| 387 |
+
" num_heads=num_heads, key_dim=embed_dim, dropout=0.1\n",
|
| 388 |
+
" )\n",
|
| 389 |
+
"\n",
|
| 390 |
+
" self.layernorm_1 = tf.keras.layers.LayerNormalization()\n",
|
| 391 |
+
" self.layernorm_2 = tf.keras.layers.LayerNormalization()\n",
|
| 392 |
+
" self.layernorm_3 = tf.keras.layers.LayerNormalization()\n",
|
| 393 |
+
"\n",
|
| 394 |
+
" self.ffn_layer_1 = tf.keras.layers.Dense(units, activation=\"relu\")\n",
|
| 395 |
+
" self.ffn_layer_2 = tf.keras.layers.Dense(embed_dim)\n",
|
| 396 |
+
"\n",
|
| 397 |
+
" self.out = tf.keras.layers.Dense(tokenizer.vocabulary_size(), activation=\"softmax\")\n",
|
| 398 |
+
"\n",
|
| 399 |
+
" self.dropout_1 = tf.keras.layers.Dropout(0.3)\n",
|
| 400 |
+
" self.dropout_2 = tf.keras.layers.Dropout(0.5)\n",
|
| 401 |
+
" \n",
|
| 402 |
+
"\n",
|
| 403 |
+
" def call(self, input_ids, encoder_output, training, mask=None):\n",
|
| 404 |
+
" embeddings = self.embedding(input_ids)\n",
|
| 405 |
+
"\n",
|
| 406 |
+
" combined_mask = None\n",
|
| 407 |
+
" padding_mask = None\n",
|
| 408 |
+
" \n",
|
| 409 |
+
" if mask is not None:\n",
|
| 410 |
+
" causal_mask = self.get_causal_attention_mask(embeddings)\n",
|
| 411 |
+
" padding_mask = tf.cast(mask[:, :, tf.newaxis], dtype=tf.int32)\n",
|
| 412 |
+
" combined_mask = tf.cast(mask[:, tf.newaxis, :], dtype=tf.int32)\n",
|
| 413 |
+
" combined_mask = tf.minimum(combined_mask, causal_mask)\n",
|
| 414 |
+
"\n",
|
| 415 |
+
" attn_output_1 = self.attention_1(\n",
|
| 416 |
+
" query=embeddings,\n",
|
| 417 |
+
" value=embeddings,\n",
|
| 418 |
+
" key=embeddings,\n",
|
| 419 |
+
" attention_mask=combined_mask,\n",
|
| 420 |
+
" training=training\n",
|
| 421 |
+
" )\n",
|
| 422 |
+
"\n",
|
| 423 |
+
" out_1 = self.layernorm_1(embeddings + attn_output_1)\n",
|
| 424 |
+
"\n",
|
| 425 |
+
" attn_output_2 = self.attention_2(\n",
|
| 426 |
+
" query=out_1,\n",
|
| 427 |
+
" value=encoder_output,\n",
|
| 428 |
+
" key=encoder_output,\n",
|
| 429 |
+
" attention_mask=padding_mask,\n",
|
| 430 |
+
" training=training\n",
|
| 431 |
+
" )\n",
|
| 432 |
+
"\n",
|
| 433 |
+
" out_2 = self.layernorm_2(out_1 + attn_output_2)\n",
|
| 434 |
+
"\n",
|
| 435 |
+
" ffn_out = self.ffn_layer_1(out_2)\n",
|
| 436 |
+
" ffn_out = self.dropout_1(ffn_out, training=training)\n",
|
| 437 |
+
" ffn_out = self.ffn_layer_2(ffn_out)\n",
|
| 438 |
+
"\n",
|
| 439 |
+
" ffn_out = self.layernorm_3(ffn_out + out_2)\n",
|
| 440 |
+
" ffn_out = self.dropout_2(ffn_out, training=training)\n",
|
| 441 |
+
" preds = self.out(ffn_out)\n",
|
| 442 |
+
" return preds\n",
|
| 443 |
+
"\n",
|
| 444 |
+
"\n",
|
| 445 |
+
" def get_causal_attention_mask(self, inputs):\n",
|
| 446 |
+
" input_shape = tf.shape(inputs)\n",
|
| 447 |
+
" batch_size, sequence_length = input_shape[0], input_shape[1]\n",
|
| 448 |
+
" i = tf.range(sequence_length)[:, tf.newaxis]\n",
|
| 449 |
+
" j = tf.range(sequence_length)\n",
|
| 450 |
+
" mask = tf.cast(i >= j, dtype=\"int32\")\n",
|
| 451 |
+
" mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))\n",
|
| 452 |
+
" mult = tf.concat(\n",
|
| 453 |
+
" [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],\n",
|
| 454 |
+
" axis=0\n",
|
| 455 |
+
" )\n",
|
| 456 |
+
" return tf.tile(mask, mult)"
|
| 457 |
+
]
|
| 458 |
+
},
|
| 459 |
+
{
|
| 460 |
+
"cell_type": "code",
|
| 461 |
+
"execution_count": null,
|
| 462 |
+
"metadata": {
|
| 463 |
+
"id": "9_NmSUaVys9R"
|
| 464 |
+
},
|
| 465 |
+
"outputs": [],
|
| 466 |
+
"source": [
|
| 467 |
+
"class ImageCaptioningModel(tf.keras.Model):\n",
|
| 468 |
+
"\n",
|
| 469 |
+
" def __init__(self, cnn_model, encoder, decoder, image_aug=None):\n",
|
| 470 |
+
" super().__init__()\n",
|
| 471 |
+
" self.cnn_model = cnn_model\n",
|
| 472 |
+
" self.encoder = encoder\n",
|
| 473 |
+
" self.decoder = decoder\n",
|
| 474 |
+
" self.image_aug = image_aug\n",
|
| 475 |
+
" self.loss_tracker = tf.keras.metrics.Mean(name=\"loss\")\n",
|
| 476 |
+
" self.acc_tracker = tf.keras.metrics.Mean(name=\"accuracy\")\n",
|
| 477 |
+
"\n",
|
| 478 |
+
"\n",
|
| 479 |
+
" def calculate_loss(self, y_true, y_pred, mask):\n",
|
| 480 |
+
" loss = self.loss(y_true, y_pred)\n",
|
| 481 |
+
" mask = tf.cast(mask, dtype=loss.dtype)\n",
|
| 482 |
+
" loss *= mask\n",
|
| 483 |
+
" return tf.reduce_sum(loss) / tf.reduce_sum(mask)\n",
|
| 484 |
+
"\n",
|
| 485 |
+
"\n",
|
| 486 |
+
" def calculate_accuracy(self, y_true, y_pred, mask):\n",
|
| 487 |
+
" accuracy = tf.equal(y_true, tf.argmax(y_pred, axis=2))\n",
|
| 488 |
+
" accuracy = tf.math.logical_and(mask, accuracy)\n",
|
| 489 |
+
" accuracy = tf.cast(accuracy, dtype=tf.float32)\n",
|
| 490 |
+
" mask = tf.cast(mask, dtype=tf.float32)\n",
|
| 491 |
+
" return tf.reduce_sum(accuracy) / tf.reduce_sum(mask)\n",
|
| 492 |
+
" \n",
|
| 493 |
+
"\n",
|
| 494 |
+
" def compute_loss_and_acc(self, img_embed, captions, training=True):\n",
|
| 495 |
+
" encoder_output = self.encoder(img_embed, training=True)\n",
|
| 496 |
+
" y_input = captions[:, :-1]\n",
|
| 497 |
+
" y_true = captions[:, 1:]\n",
|
| 498 |
+
" mask = (y_true != 0)\n",
|
| 499 |
+
" y_pred = self.decoder(\n",
|
| 500 |
+
" y_input, encoder_output, training=True, mask=mask\n",
|
| 501 |
+
" )\n",
|
| 502 |
+
" loss = self.calculate_loss(y_true, y_pred, mask)\n",
|
| 503 |
+
" acc = self.calculate_accuracy(y_true, y_pred, mask)\n",
|
| 504 |
+
" return loss, acc\n",
|
| 505 |
+
"\n",
|
| 506 |
+
" \n",
|
| 507 |
+
" def train_step(self, batch):\n",
|
| 508 |
+
" imgs, captions = batch\n",
|
| 509 |
+
"\n",
|
| 510 |
+
" if self.image_aug:\n",
|
| 511 |
+
" imgs = self.image_aug(imgs)\n",
|
| 512 |
+
" \n",
|
| 513 |
+
" img_embed = self.cnn_model(imgs)\n",
|
| 514 |
+
"\n",
|
| 515 |
+
" with tf.GradientTape() as tape:\n",
|
| 516 |
+
" loss, acc = self.compute_loss_and_acc(\n",
|
| 517 |
+
" img_embed, captions\n",
|
| 518 |
+
" )\n",
|
| 519 |
+
" \n",
|
| 520 |
+
" train_vars = (\n",
|
| 521 |
+
" self.encoder.trainable_variables + self.decoder.trainable_variables\n",
|
| 522 |
+
" )\n",
|
| 523 |
+
" grads = tape.gradient(loss, train_vars)\n",
|
| 524 |
+
" self.optimizer.apply_gradients(zip(grads, train_vars))\n",
|
| 525 |
+
" self.loss_tracker.update_state(loss)\n",
|
| 526 |
+
" self.acc_tracker.update_state(acc)\n",
|
| 527 |
+
"\n",
|
| 528 |
+
" return {\"loss\": self.loss_tracker.result(), \"acc\": self.acc_tracker.result()}\n",
|
| 529 |
+
" \n",
|
| 530 |
+
"\n",
|
| 531 |
+
" def test_step(self, batch):\n",
|
| 532 |
+
" imgs, captions = batch\n",
|
| 533 |
+
"\n",
|
| 534 |
+
" img_embed = self.cnn_model(imgs)\n",
|
| 535 |
+
"\n",
|
| 536 |
+
" loss, acc = self.compute_loss_and_acc(\n",
|
| 537 |
+
" img_embed, captions, training=False\n",
|
| 538 |
+
" )\n",
|
| 539 |
+
"\n",
|
| 540 |
+
" self.loss_tracker.update_state(loss)\n",
|
| 541 |
+
" self.acc_tracker.update_state(acc)\n",
|
| 542 |
+
"\n",
|
| 543 |
+
" return {\"loss\": self.loss_tracker.result(), \"acc\": self.acc_tracker.result()}\n",
|
| 544 |
+
"\n",
|
| 545 |
+
" @property\n",
|
| 546 |
+
" def metrics(self):\n",
|
| 547 |
+
" return [self.loss_tracker, self.acc_tracker]"
|
| 548 |
+
]
|
| 549 |
+
},
|
| 550 |
+
{
|
| 551 |
+
"cell_type": "code",
|
| 552 |
+
"execution_count": null,
|
| 553 |
+
"metadata": {
|
| 554 |
+
"id": "GqWpcsje0Hkh",
|
| 555 |
+
"outputId": "477f4a81-1e19-445a-d64d-cedad90a2893"
|
| 556 |
+
},
|
| 557 |
+
"outputs": [],
|
| 558 |
+
"source": [
|
| 559 |
+
"encoder = TransformerEncoderLayer(EMBEDDING_DIM, 1)\n",
|
| 560 |
+
"decoder = TransformerDecoderLayer(EMBEDDING_DIM, UNITS, 8)\n",
|
| 561 |
+
"\n",
|
| 562 |
+
"cnn_model = CNN_Encoder()\n",
|
| 563 |
+
"caption_model = ImageCaptioningModel(\n",
|
| 564 |
+
" cnn_model=cnn_model, encoder=encoder, decoder=decoder, image_aug=image_augmentation,\n",
|
| 565 |
+
")"
|
| 566 |
+
]
|
| 567 |
+
},
|
| 568 |
+
{
|
| 569 |
+
"cell_type": "code",
|
| 570 |
+
"execution_count": null,
|
| 571 |
+
"metadata": {
|
| 572 |
+
"id": "bayNssgNX6QN"
|
| 573 |
+
},
|
| 574 |
+
"outputs": [],
|
| 575 |
+
"source": [
|
| 576 |
+
"cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(\n",
|
| 577 |
+
" from_logits=False, reduction=\"none\"\n",
|
| 578 |
+
")\n",
|
| 579 |
+
"\n",
|
| 580 |
+
"early_stopping = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)\n",
|
| 581 |
+
"\n",
|
| 582 |
+
"caption_model.compile(\n",
|
| 583 |
+
" optimizer=tf.keras.optimizers.Adam(),\n",
|
| 584 |
+
" loss=cross_entropy\n",
|
| 585 |
+
")"
|
| 586 |
+
]
|
| 587 |
+
},
|
| 588 |
+
{
|
| 589 |
+
"cell_type": "code",
|
| 590 |
+
"execution_count": null,
|
| 591 |
+
"metadata": {
|
| 592 |
+
"id": "1RYo-MRVYn49"
|
| 593 |
+
},
|
| 594 |
+
"outputs": [],
|
| 595 |
+
"source": [
|
| 596 |
+
"history = caption_model.fit(\n",
|
| 597 |
+
" train_dataset,\n",
|
| 598 |
+
" epochs=EPOCHS,\n",
|
| 599 |
+
" validation_data=val_dataset,\n",
|
| 600 |
+
" callbacks=[early_stopping]\n",
|
| 601 |
+
")"
|
| 602 |
+
]
|
| 603 |
+
},
|
| 604 |
+
{
|
| 605 |
+
"cell_type": "code",
|
| 606 |
+
"execution_count": null,
|
| 607 |
+
"metadata": {},
|
| 608 |
+
"outputs": [],
|
| 609 |
+
"source": [
|
| 610 |
+
"plt.plot(history.history['loss'], label='train_loss')\n",
|
| 611 |
+
"plt.plot(history.history['val_loss'], label='validation loss')\n",
|
| 612 |
+
"plt.legend()\n",
|
| 613 |
+
"plt.show()"
|
| 614 |
+
]
|
| 615 |
+
},
|
| 616 |
+
{
|
| 617 |
+
"cell_type": "code",
|
| 618 |
+
"execution_count": null,
|
| 619 |
+
"metadata": {
|
| 620 |
+
"id": "3ErlQQICtj_g"
|
| 621 |
+
},
|
| 622 |
+
"outputs": [],
|
| 623 |
+
"source": [
|
| 624 |
+
"def load_image_from_path(img_path):\n",
|
| 625 |
+
" img = tf.io.read_file(img_path)\n",
|
| 626 |
+
" img = tf.io.decode_jpeg(img, channels=3)\n",
|
| 627 |
+
" img = tf.keras.layers.Resizing(299, 299)(img)\n",
|
| 628 |
+
" img = tf.keras.applications.inception_v3.preprocess_input(img)\n",
|
| 629 |
+
" return img\n",
|
| 630 |
+
"\n",
|
| 631 |
+
"\n",
|
| 632 |
+
"def generate_caption(img_path, add_noise=False):\n",
|
| 633 |
+
" img = load_image_from_path(img_path)\n",
|
| 634 |
+
" \n",
|
| 635 |
+
" if add_noise:\n",
|
| 636 |
+
" noise = tf.random.normal(img.shape)*0.1\n",
|
| 637 |
+
" img = img + noise\n",
|
| 638 |
+
" img = (img - tf.reduce_min(img))/(tf.reduce_max(img) - tf.reduce_min(img))\n",
|
| 639 |
+
" \n",
|
| 640 |
+
" img = tf.expand_dims(img, axis=0)\n",
|
| 641 |
+
" img_embed = caption_model.cnn_model(img)\n",
|
| 642 |
+
" img_encoded = caption_model.encoder(img_embed, training=False)\n",
|
| 643 |
+
"\n",
|
| 644 |
+
" y_inp = '[start]'\n",
|
| 645 |
+
" for i in range(MAX_LENGTH-1):\n",
|
| 646 |
+
" tokenized = tokenizer([y_inp])[:, :-1]\n",
|
| 647 |
+
" mask = tf.cast(tokenized != 0, tf.int32)\n",
|
| 648 |
+
" pred = caption_model.decoder(\n",
|
| 649 |
+
" tokenized, img_encoded, training=False, mask=mask)\n",
|
| 650 |
+
" \n",
|
| 651 |
+
" pred_idx = np.argmax(pred[0, i, :])\n",
|
| 652 |
+
" pred_idx = tf.convert_to_tensor(pred_idx)\n",
|
| 653 |
+
" pred_word = idx2word(pred_idx).numpy().decode('utf-8')\n",
|
| 654 |
+
" if pred_word == '[end]':\n",
|
| 655 |
+
" break\n",
|
| 656 |
+
" \n",
|
| 657 |
+
" y_inp += ' ' + pred_word\n",
|
| 658 |
+
" \n",
|
| 659 |
+
" y_inp = y_inp.replace('[start] ', '')\n",
|
| 660 |
+
" return y_inp"
|
| 661 |
+
]
|
| 662 |
+
},
|
| 663 |
+
{
|
| 664 |
+
"cell_type": "code",
|
| 665 |
+
"execution_count": null,
|
| 666 |
+
"metadata": {
|
| 667 |
+
"id": "27_bJe_M1Drr"
|
| 668 |
+
},
|
| 669 |
+
"outputs": [],
|
| 670 |
+
"source": [
|
| 671 |
+
"idx = random.randrange(0, len(captions))\n",
|
| 672 |
+
"img_path = captions.iloc[idx].image\n",
|
| 673 |
+
"\n",
|
| 674 |
+
"pred_caption = generate_caption(img_path)\n",
|
| 675 |
+
"print('Predicted Caption:', pred_caption)\n",
|
| 676 |
+
"print()\n",
|
| 677 |
+
"Image.open(img_path)"
|
| 678 |
+
]
|
| 679 |
+
},
|
| 680 |
+
{
|
| 681 |
+
"cell_type": "code",
|
| 682 |
+
"execution_count": null,
|
| 683 |
+
"metadata": {},
|
| 684 |
+
"outputs": [],
|
| 685 |
+
"source": [
|
| 686 |
+
"img_url = \"https://images.unsplash.com/photo-1714981725936-8817d6a32dd3?w=500&auto=format&fit=crop&q=60&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHx0b3BpYy1mZWVkfDV8Qm4tRGpyY0Jyd298fGVufDB8fHx8fA%3D%3D\"\n",
|
| 687 |
+
"im = Image.open(requests.get(img_url, stream=True).raw)\n",
|
| 688 |
+
"im = im.convert('RGB')\n",
|
| 689 |
+
"im.save('tmp.jpg')\n",
|
| 690 |
+
"\n",
|
| 691 |
+
"pred_caption = generate_caption('tmp.jpg', add_noise=False)\n",
|
| 692 |
+
"print('Predicted Caption:', pred_caption)\n",
|
| 693 |
+
"print()\n",
|
| 694 |
+
"im"
|
| 695 |
+
]
|
| 696 |
+
},
|
| 697 |
+
{
|
| 698 |
+
"cell_type": "code",
|
| 699 |
+
"execution_count": null,
|
| 700 |
+
"metadata": {},
|
| 701 |
+
"outputs": [],
|
| 702 |
+
"source": [
|
| 703 |
+
"img_url = \"https://images.unsplash.com/photo-1669173733011-6f1afef8d5e6?w=500&auto=format&fit=crop&q=60&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHx0b3BpYy1mZWVkfDE0fEJuLURqcmNCcndvfHxlbnwwfHx8fHw%3D\"\n",
|
| 704 |
+
"im = Image.open(requests.get(img_url, stream=True).raw)\n",
|
| 705 |
+
"im = im.convert('RGB')\n",
|
| 706 |
+
"im.save('tmp.jpg')\n",
|
| 707 |
+
"\n",
|
| 708 |
+
"pred_caption = generate_caption('tmp.jpg', add_noise=False)\n",
|
| 709 |
+
"print('Predicted Caption:', pred_caption)\n",
|
| 710 |
+
"print()\n",
|
| 711 |
+
"im"
|
| 712 |
+
]
|
| 713 |
+
},
|
| 714 |
+
{
|
| 715 |
+
"cell_type": "code",
|
| 716 |
+
"execution_count": null,
|
| 717 |
+
"metadata": {},
|
| 718 |
+
"outputs": [],
|
| 719 |
+
"source": [
|
| 720 |
+
"img_url = \"https://images.unsplash.com/photo-1499676988064-a3779763470e?w=500&auto=format&fit=crop&q=60&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHx0b3BpYy1mZWVkfDEzMnxCbi1EanJjQnJ3b3x8ZW58MHx8fHx8\"\n",
|
| 721 |
+
"im = Image.open(requests.get(img_url, stream=True).raw)\n",
|
| 722 |
+
"im = im.convert('RGB')\n",
|
| 723 |
+
"im.save('tmp.jpg')\n",
|
| 724 |
+
"\n",
|
| 725 |
+
"pred_caption = generate_caption('tmp.jpg', add_noise=False)\n",
|
| 726 |
+
"print('Predicted Caption:', pred_caption)\n",
|
| 727 |
+
"print()\n",
|
| 728 |
+
"im"
|
| 729 |
+
]
|
| 730 |
+
},
|
| 731 |
+
{
|
| 732 |
+
"cell_type": "code",
|
| 733 |
+
"execution_count": null,
|
| 734 |
+
"metadata": {},
|
| 735 |
+
"outputs": [],
|
| 736 |
+
"source": [
|
| 737 |
+
"caption_model.save_weights('model.h5')"
|
| 738 |
+
]
|
| 739 |
+
},
|
| 740 |
+
{
|
| 741 |
+
"cell_type": "code",
|
| 742 |
+
"execution_count": null,
|
| 743 |
+
"metadata": {
|
| 744 |
+
"id": "XG69m29gs6W4"
|
| 745 |
+
},
|
| 746 |
+
"outputs": [],
|
| 747 |
+
"source": []
|
| 748 |
+
}
|
| 749 |
+
],
|
| 750 |
+
"metadata": {
|
| 751 |
+
"kaggle": {
|
| 752 |
+
"accelerator": "gpu",
|
| 753 |
+
"dataSources": [
|
| 754 |
+
{
|
| 755 |
+
"databundleVersionId": 1495989,
|
| 756 |
+
"datasetId": 857191,
|
| 757 |
+
"sourceId": 1462296,
|
| 758 |
+
"sourceType": "datasetVersion"
|
| 759 |
+
}
|
| 760 |
+
],
|
| 761 |
+
"isGpuEnabled": true,
|
| 762 |
+
"isInternetEnabled": true,
|
| 763 |
+
"language": "python",
|
| 764 |
+
"sourceType": "notebook"
|
| 765 |
+
},
|
| 766 |
+
"kernelspec": {
|
| 767 |
+
"display_name": "Python 3",
|
| 768 |
+
"language": "python",
|
| 769 |
+
"name": "python3"
|
| 770 |
+
},
|
| 771 |
+
"language_info": {
|
| 772 |
+
"codemirror_mode": {
|
| 773 |
+
"name": "ipython",
|
| 774 |
+
"version": 3
|
| 775 |
+
},
|
| 776 |
+
"file_extension": ".py",
|
| 777 |
+
"mimetype": "text/x-python",
|
| 778 |
+
"name": "python",
|
| 779 |
+
"nbconvert_exporter": "python",
|
| 780 |
+
"pygments_lexer": "ipython3",
|
| 781 |
+
"version": "3.7.12"
|
| 782 |
+
}
|
| 783 |
+
},
|
| 784 |
+
"nbformat": 4,
|
| 785 |
+
"nbformat_minor": 4
|
| 786 |
+
}
|
notebooks/README.md
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Notebooks
|
| 2 |
+
|
| 3 |
+
This directory holds Jupyter notebooks. Each notebook has a specific role in
|
| 4 |
+
the project lifecycle, and the rules are different for each one.
|
| 5 |
+
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
## `01_ieee_inceptionv3_transformer.ipynb` — **FROZEN**
|
| 9 |
+
|
| 10 |
+
This notebook is the **canonical research artefact** behind the IEEE
|
| 11 |
+
publication [*AI Narratives: Bridging Visual Content and Linguistic
|
| 12 |
+
Expression*](https://ieeexplore.ieee.org/document/10675203). It contains the
|
| 13 |
+
exact training pipeline, hyperparameters, and inference code used to produce
|
| 14 |
+
the BLEU ~24 score reported in the paper.
|
| 15 |
+
|
| 16 |
+
### Why is it frozen?
|
| 17 |
+
|
| 18 |
+
Reproducibility of a published result is non-negotiable. If the notebook drifts
|
| 19 |
+
from what the paper describes, anyone trying to reproduce the result —
|
| 20 |
+
reviewers, future students, recruiters running the demo — will see numbers that
|
| 21 |
+
don't match the paper. That breaks scientific trust.
|
| 22 |
+
|
| 23 |
+
### Rules
|
| 24 |
+
|
| 25 |
+
1. **Do not edit cells.** No improvements, no refactors, no comment fixes.
|
| 26 |
+
2. **Do not re-run cells with different seeds.** Cell outputs are not kept in
|
| 27 |
+
git — they are stripped on commit by `nbstripout` — but the notebook
|
| 28 |
+
structure must stay identical.
|
| 29 |
+
3. **Improvements go into the modular package** at [`src/captioning/`](../src/captioning/),
|
| 30 |
+
never back into this notebook.
|
| 31 |
+
4. **Parity is enforced in CI.** The `make freeze-paper-notebook` target
|
| 32 |
+
computes a SHA-256 of this file and asserts it matches the locked hash in
|
| 33 |
+
`.paper-notebook.sha256`. If you change a cell, CI fails until you either
|
| 34 |
+
revert OR explicitly re-lock with `make lock-paper-notebook` AND update
|
| 35 |
+
the paper / model card to reflect the new behaviour.
|
| 36 |
+
|
| 37 |
+
### When this rule changes
|
| 38 |
+
|
| 39 |
+
The frozen state lifts when (and only when) we publish a v2 of the paper or
|
| 40 |
+
explicitly mark a re-run in the changelog. Until then, treat this file like
|
| 41 |
+
a museum exhibit.
|
| 42 |
+
|
| 43 |
+
---
|
| 44 |
+
|
| 45 |
+
## `02_dataset_eda.ipynb` — exploratory (Phase 1+)
|
| 46 |
+
|
| 47 |
+
Dataset inspection. Caption length distributions, vocabulary coverage, image
|
| 48 |
+
dimension histograms, class balance across COCO super-categories. This
|
| 49 |
+
notebook **may** be edited freely; it's a working scratchpad, not a published
|
| 50 |
+
artefact.
|
| 51 |
+
|
| 52 |
+
## `03_attention_visualization.ipynb` — exploratory (Phase 4+)
|
| 53 |
+
|
| 54 |
+
Visualisations of decoder attention weights over image patches. Used to
|
| 55 |
+
generate the figures in [`docs/results/`](../docs/results/). Outputs are
|
| 56 |
+
stripped by `nbstripout` on commit; PNGs land in `docs/images/attention/`
|
| 57 |
+
when explicitly exported.
|
| 58 |
+
|
| 59 |
+
---
|
| 60 |
+
|
| 61 |
+
## Conventions for new notebooks
|
| 62 |
+
|
| 63 |
+
If you add a new notebook:
|
| 64 |
+
|
| 65 |
+
- **Number it** (`04_*`, `05_*`) so the lifecycle order is obvious.
|
| 66 |
+
- **Use prose Markdown cells** between code cells — a notebook reads like a
|
| 67 |
+
short paper, not a Python script.
|
| 68 |
+
- **Do not import from `notebooks/`** elsewhere in the codebase. Notebooks
|
| 69 |
+
consume the `captioning` package; they never define library code.
|
| 70 |
+
- **Strip outputs before committing.** `nbstripout` does this automatically
|
| 71 |
+
if you ran `make install-hooks`. Without that hook, run `nbstripout
|
| 72 |
+
notebooks/your_notebook.ipynb` manually before `git add`.
|
| 73 |
+
|
| 74 |
+
---
|
| 75 |
+
|
| 76 |
+
## Why notebooks at all?
|
| 77 |
+
|
| 78 |
+
Notebooks are excellent for *exploration* — narrative, mixed media, iterative
|
| 79 |
+
data wrangling. They are bad for *libraries* — no testing, no type-checking,
|
| 80 |
+
no module reuse, hidden cell-execution-order bugs. The IEEE notebook stays
|
| 81 |
+
because the paper points at it; everything else lives in `src/captioning/`.
|
notebooks/image-captionin-using-dl.ipynb
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
pyproject.toml
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
+
# pyproject.toml — single source of truth for the `captioning` Python package
|
| 3 |
+
# -----------------------------------------------------------------------------
|
| 4 |
+
# This file follows PEP 621 (project metadata) and PEP 517/518 (build system).
|
| 5 |
+
# It replaces a scattered mix of setup.py + requirements.txt + setup.cfg with
|
| 6 |
+
# one canonical config. `pip install -e .` installs the package from `src/`.
|
| 7 |
+
#
|
| 8 |
+
# Why src/ layout? It prevents accidental imports of the package from the
|
| 9 |
+
# repo root during testing — every test exercises the *installed* package,
|
| 10 |
+
# the way users will actually import it. This is the layout used by the
|
| 11 |
+
# Python Packaging Authority's example projects and recommended by pytest.
|
| 12 |
+
# =============================================================================
|
| 13 |
+
|
| 14 |
+
[build-system]
|
| 15 |
+
requires = ["setuptools>=68", "wheel"]
|
| 16 |
+
build-backend = "setuptools.build_meta"
|
| 17 |
+
|
| 18 |
+
# -----------------------------------------------------------------------------
|
| 19 |
+
# Project metadata — what `pip show captioning` will display.
|
| 20 |
+
# -----------------------------------------------------------------------------
|
| 21 |
+
[project]
|
| 22 |
+
name = "captioning"
|
| 23 |
+
version = "0.1.0"
|
| 24 |
+
description = "IEEE-published CNN+Transformer image captioning, restructured into a production-grade multimodal AI platform."
|
| 25 |
+
readme = "README.md"
|
| 26 |
+
requires-python = ">=3.10,<3.13"
|
| 27 |
+
license = { text = "MIT" }
|
| 28 |
+
authors = [
|
| 29 |
+
{ name = "Apoorv Raj" },
|
| 30 |
+
]
|
| 31 |
+
keywords = [
|
| 32 |
+
"image-captioning",
|
| 33 |
+
"multimodal",
|
| 34 |
+
"transformer",
|
| 35 |
+
"computer-vision",
|
| 36 |
+
"tensorflow",
|
| 37 |
+
"fastapi",
|
| 38 |
+
]
|
| 39 |
+
classifiers = [
|
| 40 |
+
"Development Status :: 3 - Alpha",
|
| 41 |
+
"Intended Audience :: Developers",
|
| 42 |
+
"Intended Audience :: Science/Research",
|
| 43 |
+
"License :: OSI Approved :: MIT License",
|
| 44 |
+
"Programming Language :: Python :: 3",
|
| 45 |
+
"Programming Language :: Python :: 3.10",
|
| 46 |
+
"Programming Language :: Python :: 3.11",
|
| 47 |
+
"Programming Language :: Python :: 3.12",
|
| 48 |
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
| 49 |
+
]
|
| 50 |
+
|
| 51 |
+
# -----------------------------------------------------------------------------
|
| 52 |
+
# Runtime dependencies for the core ML library + FastAPI backend.
|
| 53 |
+
# ML frameworks are pinned exactly. Floating versions across TF + transformers + torch are the most
|
| 54 |
+
# common source of silent BLEU drift between training runs and deployments.
|
| 55 |
+
#
|
| 56 |
+
# - tensorflow-cpu (NOT tensorflow): saves ~600 MB and removes the CUDA driver
|
| 57 |
+
# dependency. We are deploying to CPU-only HuggingFace Spaces. If you ever
|
| 58 |
+
# train on a GPU box, install `tensorflow==2.15.0` in that environment only.
|
| 59 |
+
# - 2.15.0 specifically: TF 2.16 swapped to Keras 3 by default and broke the
|
| 60 |
+
# `tf.keras.layers.TextVectorization` saving behaviour the IEEE notebook
|
| 61 |
+
# relies on. Stay on 2.15 for v1; upgrade is a deliberate Phase-5+ task.
|
| 62 |
+
# - pydantic 2.x: supported by FastAPI >= 0.100; required by pydantic-settings 2.x. Faster and stricter than v1.
|
| 63 |
+
# -----------------------------------------------------------------------------
|
| 64 |
+
dependencies = [
|
| 65 |
+
"tensorflow-cpu==2.15.0",
|
| 66 |
+
"numpy>=1.26,<2.0", # NumPy 2.0 broke TF 2.15 binary compat
|
| 67 |
+
"pandas>=2.1,<3.0",
|
| 68 |
+
"pillow>=10.0,<11.0",
|
| 69 |
+
"pyyaml>=6.0,<7.0",
|
| 70 |
+
"pydantic>=2.7,<3.0",
|
| 71 |
+
"pydantic-settings>=2.3,<3.0",
|
| 72 |
+
"fastapi>=0.111,<1.0",
|
| 73 |
+
"uvicorn[standard]>=0.30,<1.0",
|
| 74 |
+
"python-multipart>=0.0.9", # FastAPI multipart form data (image upload)
|
| 75 |
+
"huggingface-hub>=0.23,<1.0", # Pulls weights from HF Hub at startup
|
| 76 |
+
"structlog>=24.1,<25.0", # Structured JSON logs in prod, pretty in dev
|
| 77 |
+
"anyio>=4.3,<5.0", # Thread-pool offload for sync TF inference
|
| 78 |
+
"tqdm>=4.66,<5.0",
|
| 79 |
+
"click>=8.1,<9.0", # CLI for scripts/
|
| 80 |
+
]
|
| 81 |
+
|
| 82 |
+
# -----------------------------------------------------------------------------
|
| 83 |
+
# Optional dependency groups — installed via `pip install ".[dev,eval]"`.
|
| 84 |
+
# Splitting these keeps the production Docker image small (Phase 1 backend
|
| 85 |
+
# image is ~1.1 GB; adding `hf` extras takes it to ~2.3 GB which is the
|
| 86 |
+
# Phase 3 comparison image).
|
| 87 |
+
# -----------------------------------------------------------------------------
|
| 88 |
+
[project.optional-dependencies]
|
| 89 |
+
|
| 90 |
+
# Tier-1 multimodal upgrade: BLIP, ViT-GPT2, GIT models from HuggingFace.
|
| 91 |
+
# torch CPU is large (~700 MB); only install when serving the comparison demo.
|
| 92 |
+
hf = [
|
| 93 |
+
"transformers==4.41.2",
|
| 94 |
+
"torch==2.3.0",
|
| 95 |
+
"sentencepiece>=0.2.0",
|
| 96 |
+
"accelerate>=0.30,<1.0",
|
| 97 |
+
]
|
| 98 |
+
|
| 99 |
+
# Evaluation metrics. Pulled separately because pycocoevalcap drags Java
|
| 100 |
+
# dependencies (METEOR), which we don't want in the serving image.
|
| 101 |
+
eval = [
|
| 102 |
+
"sacrebleu>=2.4,<3.0",
|
| 103 |
+
"nltk>=3.8,<4.0",
|
| 104 |
+
"rouge-score>=0.1.2",
|
| 105 |
+
"pycocoevalcap>=1.2",
|
| 106 |
+
"matplotlib>=3.8,<4.0",
|
| 107 |
+
]
|
| 108 |
+
|
| 109 |
+
# Experiment tracking. Local SQLite by default; points at DagsHub in prod.
|
| 110 |
+
mlflow = [
|
| 111 |
+
"mlflow>=2.13,<3.0",
|
| 112 |
+
]
|
| 113 |
+
|
| 114 |
+
# Developer tooling: lint, type-check, test. Never deployed.
|
| 115 |
+
dev = [
|
| 116 |
+
"ruff>=0.5,<1.0",
|
| 117 |
+
"mypy>=1.10,<2.0",
|
| 118 |
+
"pytest>=8.2,<9.0",
|
| 119 |
+
"pytest-cov>=5.0,<6.0",
|
| 120 |
+
"pytest-asyncio>=0.23,<1.0",
|
| 121 |
+
"httpx>=0.27,<1.0", # FastAPI TestClient backend
|
| 122 |
+
"pre-commit>=3.7,<4.0",
|
| 123 |
+
"nbstripout>=0.7,<1.0",
|
| 124 |
+
"types-PyYAML",
|
| 125 |
+
"types-requests",
|
| 126 |
+
]
|
| 127 |
+
|
| 128 |
+
# -----------------------------------------------------------------------------
|
| 129 |
+
# Where pip should install the package from (the `src/` layout).
|
| 130 |
+
# -----------------------------------------------------------------------------
|
| 131 |
+
[tool.setuptools.packages.find]
|
| 132 |
+
where = ["src"]
|
| 133 |
+
include = ["captioning*"]
|
| 134 |
+
|
| 135 |
+
[tool.setuptools.package-data]
|
| 136 |
+
"captioning" = ["py.typed"] # PEP 561: ship type hints with the package
|
| 137 |
+
|
| 138 |
+
# =============================================================================
|
| 139 |
+
# Tooling configuration — co-located so a single file owns project policy.
|
| 140 |
+
# =============================================================================
|
| 141 |
+
|
| 142 |
+
# ---- Ruff: linter + formatter (replaces black + isort + flake8) -------------
|
| 143 |
+
# We prefer Ruff because it runs ~100x faster and is the de-facto modern
|
| 144 |
+
# default in the Python ecosystem. One tool, one config, one cache.
|
| 145 |
+
[tool.ruff]
|
| 146 |
+
line-length = 100
|
| 147 |
+
target-version = "py310"
|
| 148 |
+
src = ["src", "backend", "scripts", "tests"]
|
| 149 |
+
extend-exclude = [
|
| 150 |
+
"notebooks", # Notebooks have their own conventions
|
| 151 |
+
"outputs",
|
| 152 |
+
"mlruns",
|
| 153 |
+
"frontend",
|
| 154 |
+
]
|
| 155 |
+
|
| 156 |
+
[tool.ruff.lint]
|
| 157 |
+
# Curated rule set — pragmatic defaults, not the full strict catalogue.
|
| 158 |
+
select = [
|
| 159 |
+
"E", # pycodestyle errors
|
| 160 |
+
"W", # pycodestyle warnings
|
| 161 |
+
"F", # pyflakes
|
| 162 |
+
"I", # isort import sorting
|
| 163 |
+
"B", # flake8-bugbear (likely bugs)
|
| 164 |
+
"UP", # pyupgrade (modern syntax)
|
| 165 |
+
"SIM", # flake8-simplify
|
| 166 |
+
"RET", # flake8-return
|
| 167 |
+
"PTH", # flake8-use-pathlib (prefer pathlib over os.path)
|
| 168 |
+
"RUF", # Ruff's own rules
|
| 169 |
+
]
|
| 170 |
+
ignore = [
|
| 171 |
+
"E501", # line length — formatter handles it; lint warnings are noise
|
| 172 |
+
"B008", # function call in default arg (FastAPI's Depends() pattern)
|
| 173 |
+
]
|
| 174 |
+
|
| 175 |
+
[tool.ruff.lint.per-file-ignores]
|
| 176 |
+
"tests/**" = ["B011"] # B011 flags `assert False`; that is fine in tests
|
| 177 |
+
"scripts/**" = ["T201"] # print() in CLI scripts is fine
|
| 178 |
+
|
| 179 |
+
[tool.ruff.format]
|
| 180 |
+
quote-style = "double"
|
| 181 |
+
indent-style = "space"
|
| 182 |
+
docstring-code-format = true
|
| 183 |
+
|
| 184 |
+
# ---- Mypy: static type checker -----------------------------------------------
|
| 185 |
+
# We only enforce types on our own code; third-party untyped libs are tolerated.
|
| 186 |
+
[tool.mypy]
|
| 187 |
+
python_version = "3.10"
|
| 188 |
+
strict = false # Start lenient; tighten as types stabilise
|
| 189 |
+
warn_unused_configs = true
|
| 190 |
+
warn_redundant_casts = true
|
| 191 |
+
warn_unused_ignores = true
|
| 192 |
+
warn_no_return = true
|
| 193 |
+
no_implicit_optional = true
|
| 194 |
+
files = ["src/captioning", "backend/app", "scripts"]
|
| 195 |
+
|
| 196 |
+
[[tool.mypy.overrides]]
|
| 197 |
+
module = [
|
| 198 |
+
"tensorflow.*",
|
| 199 |
+
"transformers.*",
|
| 200 |
+
"huggingface_hub.*",
|
| 201 |
+
"PIL.*",
|
| 202 |
+
"nltk.*",
|
| 203 |
+
"sacrebleu.*",
|
| 204 |
+
"rouge_score.*",
|
| 205 |
+
"pycocoevalcap.*",
|
| 206 |
+
]
|
| 207 |
+
ignore_missing_imports = true
|
| 208 |
+
|
| 209 |
+
# ---- Pytest -------------------------------------------------------------------
|
| 210 |
+
[tool.pytest.ini_options]
|
| 211 |
+
minversion = "8.0"
|
| 212 |
+
testpaths = ["tests", "backend/app/tests"]
|
| 213 |
+
addopts = [
|
| 214 |
+
"-ra", # Show short summary for non-passing tests
|
| 215 |
+
"--strict-markers",
|
| 216 |
+
"--strict-config",
|
| 217 |
+
"--showlocals",
|
| 218 |
+
]
|
| 219 |
+
markers = [
|
| 220 |
+
"slow: tests that take >10 seconds (run with -m slow)",
|
| 221 |
+
"gpu: tests requiring a GPU (skipped in CI by default)",
|
| 222 |
+
]
|
| 223 |
+
filterwarnings = [
|
| 224 |
+
"ignore::DeprecationWarning:tensorflow.*",
|
| 225 |
+
"ignore::FutureWarning:tensorflow.*",
|
| 226 |
+
]
|
| 227 |
+
|
| 228 |
+
# ---- Coverage -----------------------------------------------------------------
|
| 229 |
+
[tool.coverage.run]
|
| 230 |
+
branch = true
|
| 231 |
+
source = ["src/captioning", "backend/app"]
|
| 232 |
+
omit = ["*/tests/*", "*/__init__.py"]
|
| 233 |
+
|
| 234 |
+
[tool.coverage.report]
|
| 235 |
+
exclude_lines = [
|
| 236 |
+
"pragma: no cover",
|
| 237 |
+
"raise NotImplementedError",
|
| 238 |
+
"if TYPE_CHECKING:",
|
| 239 |
+
"if __name__ == .__main__.:",
|
| 240 |
+
]
|
| 241 |
+
show_missing = true
|
| 242 |
+
skip_covered = false
|
requirements-dev.txt
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
+
# requirements-dev.txt — developer tooling; never installed in production.
|
| 3 |
+
# -----------------------------------------------------------------------------
|
| 4 |
+
# Install alongside the runtime deps:
|
| 5 |
+
# pip install -r requirements-dev.txt (this file already includes requirements.txt via `-r`)
|
| 6 |
+
#
|
| 7 |
+
# Or in one go via the package extras:
|
| 8 |
+
# pip install -e ".[dev]"
|
| 9 |
+
# =============================================================================
|
| 10 |
+
|
| 11 |
+
# Inherit runtime deps so devs get a complete environment in one command.
|
| 12 |
+
-r requirements.txt
|
| 13 |
+
|
| 14 |
+
# ---- Lint / format / type-check ----------------------------------------------
|
| 15 |
+
# Ruff replaces black + isort + flake8. ~100x faster, single config in pyproject.
|
| 16 |
+
ruff==0.5.0
|
| 17 |
+
mypy==1.10.1
|
| 18 |
+
|
| 19 |
+
# ---- Test runner -------------------------------------------------------------
|
| 20 |
+
pytest==8.2.2
|
| 21 |
+
pytest-cov==5.0.0
|
| 22 |
+
pytest-asyncio==0.23.7
|
| 23 |
+
httpx==0.27.0 # FastAPI TestClient transport
|
| 24 |
+
|
| 25 |
+
# ---- Pre-commit / notebook hygiene -------------------------------------------
|
| 26 |
+
# `pre-commit` orchestrates the hooks; `nbstripout` strips notebook outputs on
|
| 27 |
+
# commit so notebook diffs stay reviewable (and avoid leaking PII in cell outputs).
|
| 28 |
+
pre-commit==3.7.1
|
| 29 |
+
nbstripout==0.7.1
|
| 30 |
+
|
| 31 |
+
# ---- Type stubs --------------------------------------------------------------
|
| 32 |
+
types-PyYAML==6.0.12.20240311
|
| 33 |
+
types-requests==2.32.0.20240602
|
requirements-eval.txt
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
+
# requirements-eval.txt — caption-quality metrics. Used by `scripts/evaluate.py`
|
| 3 |
+
# and by the `model-eval.yml` GitHub Action; NOT bundled into the serving image.
|
| 4 |
+
# -----------------------------------------------------------------------------
|
| 5 |
+
# pycocoevalcap drags Java for METEOR; we keep it in a separate file so the
|
| 6 |
+
# slim production image stays Java-free.
|
| 7 |
+
# =============================================================================
|
| 8 |
+
|
| 9 |
+
-r requirements.txt
|
| 10 |
+
|
| 11 |
+
# ---- Metrics -----------------------------------------------------------------
|
| 12 |
+
sacrebleu==2.4.2 # BLEU-1..4, corpus + sentence level
|
| 13 |
+
nltk==3.8.1 # BLEU + tokenisation utilities
|
| 14 |
+
rouge-score==0.1.2 # ROUGE-L for caption quality
|
| 15 |
+
pycocoevalcap==1.2 # CIDEr + METEOR (the COCO standard)
|
| 16 |
+
|
| 17 |
+
# ---- Plotting / reports ------------------------------------------------------
|
| 18 |
+
matplotlib==3.9.0
|
requirements.txt
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
+
# requirements.txt — runtime dependencies for the FastAPI backend (slim).
|
| 3 |
+
# -----------------------------------------------------------------------------
|
| 4 |
+
# This file mirrors `[project.dependencies]` in pyproject.toml so that
|
| 5 |
+
# Docker builds and CI can `pip install -r requirements.txt` without needing
|
| 6 |
+
# the package source available.
|
| 7 |
+
#
|
| 8 |
+
# To regenerate from pyproject.toml later (recommended after Phase 1):
|
| 9 |
+
# pip install pip-tools
|
| 10 |
+
# pip-compile pyproject.toml -o requirements.txt
|
| 11 |
+
#
|
| 12 |
+
# All versions are pinned. Floating versions across TF + transformers + torch
|
| 13 |
+
# are the most common silent source of BLEU drift between runs and deploys.
|
| 14 |
+
# =============================================================================
|
| 15 |
+
|
| 16 |
+
# ---- Core ML framework -------------------------------------------------------
|
| 17 |
+
# CPU-only TF: deploy target is CPU HuggingFace Spaces. Pinned at 2.15 because
|
| 18 |
+
# 2.16+ ships Keras 3 by default, which breaks the IEEE notebook's
|
| 19 |
+
# `tf.keras.layers.TextVectorization` save/load semantics.
|
| 20 |
+
tensorflow-cpu==2.15.0
|
| 21 |
+
|
| 22 |
+
# ---- Numerics / data ---------------------------------------------------------
|
| 23 |
+
# NumPy <2.0 because TF 2.15 was built against the 1.x ABI.
|
| 24 |
+
numpy==1.26.4
|
| 25 |
+
pandas==2.2.2
|
| 26 |
+
pillow==10.3.0
|
| 27 |
+
pyyaml==6.0.1
|
| 28 |
+
|
| 29 |
+
# ---- Config & validation -----------------------------------------------------
|
| 30 |
+
pydantic==2.7.4
|
| 31 |
+
pydantic-settings==2.3.4
|
| 32 |
+
|
| 33 |
+
# ---- API server --------------------------------------------------------------
|
| 34 |
+
fastapi==0.111.0
|
| 35 |
+
uvicorn[standard]==0.30.1
|
| 36 |
+
python-multipart==0.0.9
|
| 37 |
+
|
| 38 |
+
# ---- Model artefact pull at startup ------------------------------------------
|
| 39 |
+
huggingface-hub==0.23.4
|
| 40 |
+
|
| 41 |
+
# ---- Logging / async / CLI ---------------------------------------------------
|
| 42 |
+
structlog==24.2.0
|
| 43 |
+
anyio==4.4.0
|
| 44 |
+
tqdm==4.66.4
|
| 45 |
+
click==8.1.7
|
src/captioning/__init__.py
ADDED
|
File without changes
|