Spaces:
Sleeping
Sleeping
Commit ·
a745a5e
0
Parent(s):
Deploy Streamlit app
Browse files- .gitattributes +35 -0
- .gitignore +57 -0
- .streamlit/config.toml +8 -0
- Dockerfile +20 -0
- README.md +149 -0
- app.py +469 -0
- app/streamlit_app.py +145 -0
- beam_search_experiments.py +139 -0
- create_subset_20k.py +16 -0
- dataset_384.py +46 -0
- dataset_advanced.py +97 -0
- dataset_git.py +68 -0
- dataset_vit_gpt2.py +77 -0
- evaluate.py +149 -0
- plot/beam_experiment_plot.py +27 -0
- plot/caption_length_analysis.py +58 -0
- requirements.txt +20 -0
- src/__init__.py +7 -0
- src/data/__init__.py +4 -0
- src/data/coco_384_dataset.py +52 -0
- src/data/coco_advanced_dataset.py +98 -0
- src/data/coco_vit_gpt2_dataset.py +87 -0
- src/evaluation/__init__.py +4 -0
- src/evaluation/cider_eval.py +65 -0
- src/streamlit_app.py +40 -0
- src/training/__init__.py +4 -0
- src/training/train_phase1.py +168 -0
- src/training/train_phase2.py +170 -0
- src/utils/__init__.py +4 -0
- src/utils/data_subset.py +64 -0
- train_blip_20k_384.py +103 -0
- train_data_experiments.py +225 -0
- train_git.py +127 -0
- train_phase2.py +219 -0
- train_vit_gpt2.py +194 -0
- uploadtohf.py +48 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.venv/
|
| 2 |
+
__pycache__/
|
| 3 |
+
.pytest_cache/
|
| 4 |
+
.mypy_cache/
|
| 5 |
+
.ruff_cache/
|
| 6 |
+
.DS_Store
|
| 7 |
+
|
| 8 |
+
# Local datasets
|
| 9 |
+
train2017/
|
| 10 |
+
val2017/
|
| 11 |
+
annotations/*.jsonl
|
| 12 |
+
|
| 13 |
+
# Local model artifacts (don't commit)
|
| 14 |
+
saved_model_phase1/
|
| 15 |
+
saved_model_phase2/
|
| 16 |
+
saved_vit_gpt2/
|
| 17 |
+
saved_git_model/
|
| 18 |
+
saved_model_phase2.bak/
|
| 19 |
+
saved_vit_gpt2.bak/
|
| 20 |
+
saved_git_model.bak/
|
| 21 |
+
|
| 22 |
+
# Hugging Face cache (optional)
|
| 23 |
+
.cache/
|
| 24 |
+
hf_cache/
|
| 25 |
+
hf_home/
|
| 26 |
+
# virtual environment
|
| 27 |
+
.venv/
|
| 28 |
+
|
| 29 |
+
# python cache
|
| 30 |
+
__pycache__/
|
| 31 |
+
*.pyc
|
| 32 |
+
|
| 33 |
+
# datasets
|
| 34 |
+
train2017/
|
| 35 |
+
val2017/
|
| 36 |
+
annotations/
|
| 37 |
+
|
| 38 |
+
# trained models
|
| 39 |
+
saved_model/
|
| 40 |
+
|
| 41 |
+
saved_model_20k_384/
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# checkpoints
|
| 46 |
+
checkpoints/
|
| 47 |
+
checkpoints_20k_384/
|
| 48 |
+
|
| 49 |
+
# mac files
|
| 50 |
+
.DS_Store
|
| 51 |
+
|
| 52 |
+
# Hugging Face Spaces: avoid binaries in git
|
| 53 |
+
*.png
|
| 54 |
+
*.jpg
|
| 55 |
+
*.jpeg
|
| 56 |
+
*.gif
|
| 57 |
+
*.webp
|
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[server]
|
| 2 |
+
headless = true
|
| 3 |
+
enableCORS = false
|
| 4 |
+
enableXsrfProtection = true
|
| 5 |
+
|
| 6 |
+
[browser]
|
| 7 |
+
gatherUsageStats = false
|
| 8 |
+
|
Dockerfile
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.13.5-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
RUN apt-get update && apt-get install -y \
|
| 6 |
+
build-essential \
|
| 7 |
+
curl \
|
| 8 |
+
git \
|
| 9 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
+
|
| 11 |
+
COPY requirements.txt ./
|
| 12 |
+
COPY src/ ./src/
|
| 13 |
+
|
| 14 |
+
RUN pip3 install -r requirements.txt
|
| 15 |
+
|
| 16 |
+
EXPOSE 8501
|
| 17 |
+
|
| 18 |
+
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
|
| 19 |
+
|
| 20 |
+
ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
README.md
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Image Captioning (Streamlit)
|
| 2 |
+
|
| 3 |
+
This repo hosts a Streamlit app (`app.py`) that compares multiple image-captioning models.
|
| 4 |
+
|
| 5 |
+
## Why your models should NOT be inside the app repo
|
| 6 |
+
|
| 7 |
+
Fine-tuned checkpoints are large. Public hosting (Hugging Face Spaces / Streamlit Cloud) works best when:
|
| 8 |
+
|
| 9 |
+
- the app repo stays small
|
| 10 |
+
- models live on the Hugging Face Hub (or S3/GCS)
|
| 11 |
+
- the app downloads models at startup (cached by `transformers`)
|
| 12 |
+
|
| 13 |
+
## 1) Upload your saved models to Hugging Face Hub
|
| 14 |
+
|
| 15 |
+
Example for BLIP (you already have `uploadtohf.py`):
|
| 16 |
+
|
| 17 |
+
```bash
|
| 18 |
+
pip install -U transformers huggingface_hub
|
| 19 |
+
huggingface-cli login
|
| 20 |
+
python uploadtohf.py
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
Do the same for your other local folders (`saved_vit_gpt2`, `saved_git_model`) by pushing them to separate Hub repos.
|
| 24 |
+
|
| 25 |
+
## 2) Configure the app to load from Hub
|
| 26 |
+
|
| 27 |
+
`app.py` loads **local folders if present**, otherwise falls back to Hub IDs via environment variables:
|
| 28 |
+
|
| 29 |
+
- `BLIP_MODEL_ID` (default: `prateekchandra/blip-caption-model`)
|
| 30 |
+
- `VITGPT2_MODEL_ID` (default: `prateekchandra/vit-gpt2-caption-model`)
|
| 31 |
+
- `GIT_MODEL_ID` (default: `prateekchandra/git-caption-model`)
|
| 32 |
+
|
| 33 |
+
In this repo, defaults are set to:
|
| 34 |
+
|
| 35 |
+
- `BLIP_MODEL_ID` (default: `pchandragrid/blip-caption-model`)
|
| 36 |
+
- `VITGPT2_MODEL_ID` (default: `pchandragrid/vit-gpt2-caption-model`)
|
| 37 |
+
- `GIT_MODEL_ID` (default: `pchandragrid/git-caption-model`)
|
| 38 |
+
|
| 39 |
+
You can also override local folder names:
|
| 40 |
+
|
| 41 |
+
- `BLIP_LOCAL_DIR` (default: `saved_model_phase2`)
|
| 42 |
+
- `VITGPT2_LOCAL_DIR` (default: `saved_vit_gpt2`)
|
| 43 |
+
- `GIT_LOCAL_DIR` (default: `saved_git_model`)
|
| 44 |
+
|
| 45 |
+
## 3) Deploy options
|
| 46 |
+
|
| 47 |
+
### Option A: Hugging Face Spaces (recommended)
|
| 48 |
+
|
| 49 |
+
- Create a new Space: **Streamlit**
|
| 50 |
+
- Push this repo (must include `app.py` + `requirements.txt`)
|
| 51 |
+
- In Space “Variables”, set `BLIP_MODEL_ID`, `VITGPT2_MODEL_ID`, `GIT_MODEL_ID` to your Hub repos
|
| 52 |
+
- If any model repo is private, add `HF_TOKEN` as a Space **Secret**
|
| 53 |
+
|
| 54 |
+
### Option B: Streamlit Community Cloud
|
| 55 |
+
|
| 56 |
+
- Point it to this repo
|
| 57 |
+
- Set the same env vars in the app settings
|
| 58 |
+
|
| 59 |
+
## Local run
|
| 60 |
+
|
| 61 |
+
```bash
|
| 62 |
+
python -m venv .venv
|
| 63 |
+
source .venv/bin/activate
|
| 64 |
+
pip install -r requirements.txt
|
| 65 |
+
streamlit run app.py
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
# 🖼️ Image Captioning with BLIP (COCO Subset)
|
| 69 |
+
|
| 70 |
+
## 📌 Problem
|
| 71 |
+
|
| 72 |
+
Generate natural language descriptions for images using transformer-based vision-language models.
|
| 73 |
+
|
| 74 |
+
Goal:
|
| 75 |
+
- Improve CIDEr score by 10%+
|
| 76 |
+
- Compare architectures (BLIP vs ViT-GPT2)
|
| 77 |
+
- Analyze resolution impact (224 vs 320 vs 384)
|
| 78 |
+
- Optimize decoding parameters
|
| 79 |
+
- Deploy minimal inference UI
|
| 80 |
+
|
| 81 |
+
---
|
| 82 |
+
|
| 83 |
+
## 📂 Dataset
|
| 84 |
+
|
| 85 |
+
- MS COCO Captions (subset: 10k & 20k)
|
| 86 |
+
- Random caption selection (5 captions per image)
|
| 87 |
+
- Experiments:
|
| 88 |
+
- Short captions
|
| 89 |
+
- Mixed captions
|
| 90 |
+
- Filtered captions
|
| 91 |
+
|
| 92 |
+
Train/Validation split: 90/10
|
| 93 |
+
|
| 94 |
+
---
|
| 95 |
+
|
| 96 |
+
## 🧠 Models
|
| 97 |
+
|
| 98 |
+
### 1️⃣ BLIP (Primary Model)
|
| 99 |
+
- Salesforce/blip-image-captioning-base
|
| 100 |
+
- Vision encoder frozen (for efficiency)
|
| 101 |
+
- Gradient checkpointing enabled
|
| 102 |
+
- Mixed precision on MPS
|
| 103 |
+
|
| 104 |
+
### 2️⃣ ViT-GPT2 (Comparison)
|
| 105 |
+
- ViT base encoder
|
| 106 |
+
- GPT2 decoder with cross-attention
|
| 107 |
+
|
| 108 |
+
---
|
| 109 |
+
|
| 110 |
+
## 🧪 Experiments
|
| 111 |
+
|
| 112 |
+
### Resolution Comparison
|
| 113 |
+
| Resolution | Dataset | CIDEr |
|
| 114 |
+
|------------|---------|--------|
|
| 115 |
+
| 224px | 10k | ~1.28 |
|
| 116 |
+
| 320px | 20k | ~1.33–1.38 |
|
| 117 |
+
| 384px | 20k | ~1.40+ |
|
| 118 |
+
|
| 119 |
+
### Beam Search Tuning
|
| 120 |
+
Tested:
|
| 121 |
+
- Beams: 3, 5, 8
|
| 122 |
+
- Length penalty: 0.8, 1.0, 1.2
|
| 123 |
+
- Max length: 20, 30, 40
|
| 124 |
+
|
| 125 |
+
Best config:
|
| 126 |
+
Beams=5, MaxLen=20, LengthPenalty=1.0
|
| 127 |
+
|
| 128 |
+
---
|
| 129 |
+
|
| 130 |
+
## 📊 Evaluation Metric
|
| 131 |
+
|
| 132 |
+
- CIDEr (via pycocoevalcap)
|
| 133 |
+
- Validation loss
|
| 134 |
+
- Confidence estimation
|
| 135 |
+
|
| 136 |
+
---
|
| 137 |
+
|
| 138 |
+
## 🖥️ Demo
|
| 139 |
+
|
| 140 |
+
Streamlit app includes:
|
| 141 |
+
- Image uploader
|
| 142 |
+
- Beam controls
|
| 143 |
+
- Toxicity filtering
|
| 144 |
+
- Confidence display
|
| 145 |
+
- Attention heatmap
|
| 146 |
+
|
| 147 |
+
Run:
|
| 148 |
+
```bash
|
| 149 |
+
streamlit run app.py
|
app.py
ADDED
|
@@ -0,0 +1,469 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import streamlit as st
|
| 3 |
+
import torch
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import numpy as np
|
| 6 |
+
import time
|
| 7 |
+
import pandas as pd
|
| 8 |
+
|
| 9 |
+
from transformers import (
|
| 10 |
+
BlipProcessor,
|
| 11 |
+
BlipForConditionalGeneration,
|
| 12 |
+
VisionEncoderDecoderModel,
|
| 13 |
+
ViTImageProcessor,
|
| 14 |
+
AutoTokenizer,
|
| 15 |
+
GitProcessor,
|
| 16 |
+
GitForCausalLM
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
from PIL import Image
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _get_device() -> torch.device:
|
| 23 |
+
if torch.cuda.is_available():
|
| 24 |
+
return torch.device("cuda")
|
| 25 |
+
if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
|
| 26 |
+
return torch.device("mps")
|
| 27 |
+
return torch.device("cpu")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
device = _get_device()
|
| 31 |
+
_TORCH_DTYPE = torch.float16 if device.type in {"cuda", "mps"} else torch.float32
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _resolve_source(local_dir: str, hub_id: str) -> str:
|
| 35 |
+
"""
|
| 36 |
+
Prefer a local directory if it exists; otherwise use a Hugging Face Hub repo id.
|
| 37 |
+
"""
|
| 38 |
+
if local_dir and os.path.isdir(local_dir):
|
| 39 |
+
return local_dir
|
| 40 |
+
return hub_id
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
# ================================
|
| 44 |
+
# EXPERIMENT GRAPH FUNCTIONS
|
| 45 |
+
# ================================
|
| 46 |
+
|
| 47 |
+
def plot_beam_experiment():
|
| 48 |
+
|
| 49 |
+
beam_sizes = [1,3,5,10]
|
| 50 |
+
|
| 51 |
+
blip_scores = [0.52,0.59,0.61,0.60]
|
| 52 |
+
vit_scores = [0.50,0.56,0.60,0.58]
|
| 53 |
+
git_scores = [0.12,0.16,0.17,0.16]
|
| 54 |
+
|
| 55 |
+
fig, ax = plt.subplots(figsize=(10,6))
|
| 56 |
+
|
| 57 |
+
ax.plot(beam_sizes, blip_scores, marker='o', linewidth=3, label="BLIP")
|
| 58 |
+
ax.plot(beam_sizes, vit_scores, marker='o', linewidth=3, label="ViT-GPT2")
|
| 59 |
+
ax.plot(beam_sizes, git_scores, marker='o', linewidth=3, label="GIT")
|
| 60 |
+
|
| 61 |
+
ax.set_xlabel("Beam Size")
|
| 62 |
+
ax.set_ylabel("CIDEr Score")
|
| 63 |
+
ax.set_title("Beam Size vs Caption Quality")
|
| 64 |
+
|
| 65 |
+
ax.legend()
|
| 66 |
+
ax.grid(True)
|
| 67 |
+
|
| 68 |
+
return fig
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def plot_caption_length():
|
| 72 |
+
|
| 73 |
+
labels = ["Short","Medium","Long"]
|
| 74 |
+
|
| 75 |
+
blip = [0.71,0.60,0.48]
|
| 76 |
+
vit = [0.65,0.59,0.42]
|
| 77 |
+
git = [0.30,0.18,0.11]
|
| 78 |
+
|
| 79 |
+
x = np.arange(len(labels))
|
| 80 |
+
width = 0.25
|
| 81 |
+
|
| 82 |
+
fig, ax = plt.subplots(figsize=(10,6))
|
| 83 |
+
|
| 84 |
+
ax.bar(x - width, blip, width, label="BLIP")
|
| 85 |
+
ax.bar(x, vit, width, label="ViT-GPT2")
|
| 86 |
+
ax.bar(x + width, git, width, label="GIT")
|
| 87 |
+
|
| 88 |
+
ax.set_xlabel("Caption Length Category")
|
| 89 |
+
ax.set_ylabel("CIDEr Score")
|
| 90 |
+
ax.set_title("Model Performance vs Caption Length")
|
| 91 |
+
|
| 92 |
+
ax.set_xticks(x)
|
| 93 |
+
ax.set_xticklabels(labels)
|
| 94 |
+
|
| 95 |
+
ax.legend()
|
| 96 |
+
|
| 97 |
+
return fig
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# ================================
|
| 101 |
+
# UI STYLE
|
| 102 |
+
# ================================
|
| 103 |
+
|
| 104 |
+
st.markdown("""
|
| 105 |
+
<style>
|
| 106 |
+
|
| 107 |
+
.main-title{
|
| 108 |
+
text-align:center;
|
| 109 |
+
font-size:42px;
|
| 110 |
+
font-weight:bold;
|
| 111 |
+
margin-bottom:10px;
|
| 112 |
+
}
|
| 113 |
+
|
| 114 |
+
.subtitle{
|
| 115 |
+
text-align:center;
|
| 116 |
+
font-size:18px;
|
| 117 |
+
color:gray;
|
| 118 |
+
margin-bottom:30px;
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
.caption-box{
|
| 122 |
+
background-color:white;
|
| 123 |
+
padding:20px;
|
| 124 |
+
border-radius:14px;
|
| 125 |
+
text-align:center;
|
| 126 |
+
font-size:18px;
|
| 127 |
+
min-height:120px;
|
| 128 |
+
display:flex;
|
| 129 |
+
align-items:center;
|
| 130 |
+
justify-content:center;
|
| 131 |
+
color:black;
|
| 132 |
+
font-weight:500;
|
| 133 |
+
box-shadow:0px 4px 12px rgba(0,0,0,0.15);
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
.model-title{
|
| 137 |
+
text-align:center;
|
| 138 |
+
font-size:22px;
|
| 139 |
+
font-weight:bold;
|
| 140 |
+
margin-bottom:10px;
|
| 141 |
+
}
|
| 142 |
+
|
| 143 |
+
</style>
|
| 144 |
+
""", unsafe_allow_html=True)
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
# ================================
|
| 148 |
+
# LOAD MODELS
|
| 149 |
+
# ================================
|
| 150 |
+
|
| 151 |
+
@st.cache_resource
|
| 152 |
+
def load_blip():
|
| 153 |
+
source = _resolve_source(
|
| 154 |
+
os.getenv("BLIP_LOCAL_DIR", "saved_model_phase2"),
|
| 155 |
+
os.getenv("BLIP_MODEL_ID", "pchandragrid/blip-caption-model"),
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
+
model = BlipForConditionalGeneration.from_pretrained(
|
| 159 |
+
source,
|
| 160 |
+
torch_dtype=_TORCH_DTYPE,
|
| 161 |
+
low_cpu_mem_usage=True,
|
| 162 |
+
)
|
| 163 |
+
processor = BlipProcessor.from_pretrained(source)
|
| 164 |
+
model.to(device)
|
| 165 |
+
model.eval()
|
| 166 |
+
return model, processor
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
@st.cache_resource
|
| 170 |
+
def load_vit_gpt2():
|
| 171 |
+
source = _resolve_source(
|
| 172 |
+
os.getenv("VITGPT2_LOCAL_DIR", "saved_vit_gpt2"),
|
| 173 |
+
os.getenv("VITGPT2_MODEL_ID", "pchandragrid/vit-gpt2-caption-model"),
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
model = VisionEncoderDecoderModel.from_pretrained(
|
| 177 |
+
source,
|
| 178 |
+
torch_dtype=_TORCH_DTYPE,
|
| 179 |
+
low_cpu_mem_usage=True,
|
| 180 |
+
)
|
| 181 |
+
processor = ViTImageProcessor.from_pretrained(source)
|
| 182 |
+
tokenizer = AutoTokenizer.from_pretrained(source)
|
| 183 |
+
model.to(device)
|
| 184 |
+
model.eval()
|
| 185 |
+
return model, processor, tokenizer
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
@st.cache_resource
|
| 189 |
+
def load_git():
|
| 190 |
+
source = _resolve_source(
|
| 191 |
+
os.getenv("GIT_LOCAL_DIR", "saved_git_model"),
|
| 192 |
+
os.getenv("GIT_MODEL_ID", "pchandragrid/git-caption-model"),
|
| 193 |
+
)
|
| 194 |
+
|
| 195 |
+
processor = GitProcessor.from_pretrained(source)
|
| 196 |
+
model = GitForCausalLM.from_pretrained(
|
| 197 |
+
source,
|
| 198 |
+
torch_dtype=_TORCH_DTYPE,
|
| 199 |
+
low_cpu_mem_usage=True,
|
| 200 |
+
)
|
| 201 |
+
model.to(device)
|
| 202 |
+
model.eval()
|
| 203 |
+
return model, processor
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
# ================================
|
| 207 |
+
# HEADER
|
| 208 |
+
# ================================
|
| 209 |
+
|
| 210 |
+
st.markdown('<div class="main-title">🖼️ Image Captioning</div>', unsafe_allow_html=True)
|
| 211 |
+
|
| 212 |
+
st.markdown(
|
| 213 |
+
'<div class="subtitle">Compare BLIP vs ViT-GPT2 vs GIT on the same image</div>',
|
| 214 |
+
unsafe_allow_html=True
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
st.markdown("""
|
| 219 |
+
### 📌 Project Overview
|
| 220 |
+
|
| 221 |
+
This project focuses on **automatic image caption generation using transformer-based vision-language models**.
|
| 222 |
+
|
| 223 |
+
The system takes an input image and generates a natural language description of the scene.
|
| 224 |
+
|
| 225 |
+
Three architectures are evaluated:
|
| 226 |
+
|
| 227 |
+
• **BLIP (Bootstrapping Language Image Pretraining)** – multimodal transformer designed specifically for vision-language tasks
|
| 228 |
+
• **ViT-GPT2** – Vision Transformer encoder combined with GPT2 text decoder
|
| 229 |
+
• **GIT (Generative Image-to-Text Transformer)** – unified transformer architecture for image-to-text generation
|
| 230 |
+
|
| 231 |
+
The goal of this project is to **compare model architectures, caption quality, and generation performance** using the COCO dataset.
|
| 232 |
+
|
| 233 |
+
---
|
| 234 |
+
|
| 235 |
+
### 🎯 Project Objective
|
| 236 |
+
|
| 237 |
+
Improve caption generation performance through **fine-tuning and decoding optimization**.
|
| 238 |
+
|
| 239 |
+
Training pipeline:
|
| 240 |
+
|
| 241 |
+
**Step 1 — Dataset Preparation**
|
| 242 |
+
- Use **MS COCO captions dataset**
|
| 243 |
+
- Train on a **10k–50k image-caption subset**
|
| 244 |
+
|
| 245 |
+
**Step 2 — Model Fine-Tuning**
|
| 246 |
+
- Fine-tune **BLIP or VisionEncoderDecoder models**
|
| 247 |
+
|
| 248 |
+
**Step 3 — Training Configuration**
|
| 249 |
+
- Train with image resolution **224–384 px**
|
| 250 |
+
- Train for **3 epochs**
|
| 251 |
+
|
| 252 |
+
**Step 4 — Memory Optimization**
|
| 253 |
+
- Use **gradient checkpointing** to reduce GPU memory usage
|
| 254 |
+
|
| 255 |
+
**Step 5 — Target Performance**
|
| 256 |
+
- Achieve **10%+ improvement in CIDEr score** compared to baseline models
|
| 257 |
+
|
| 258 |
+
These steps allow the system to learn stronger **image-text alignment and caption generation capability**.
|
| 259 |
+
""")
|
| 260 |
+
|
| 261 |
+
|
| 262 |
+
# ================================
|
| 263 |
+
# SIDEBAR
|
| 264 |
+
# ================================
|
| 265 |
+
|
| 266 |
+
st.sidebar.header("⚙️ Generation Settings")
|
| 267 |
+
|
| 268 |
+
st.sidebar.subheader("Models to run")
|
| 269 |
+
run_blip = st.sidebar.checkbox("BLIP", value=True)
|
| 270 |
+
run_vit = st.sidebar.checkbox("ViT-GPT2", value=False)
|
| 271 |
+
run_git = st.sidebar.checkbox("GIT", value=False)
|
| 272 |
+
|
| 273 |
+
num_beams = st.sidebar.slider("Beam Size",1,10,5)
|
| 274 |
+
max_length = st.sidebar.slider("Max Length",10,50,20)
|
| 275 |
+
length_penalty = st.sidebar.slider("Length Penalty",0.5,2.0,1.0,step=0.1)
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
uploaded_file = st.file_uploader("Upload Image", type=["jpg","png","jpeg"])
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
# ================================
|
| 282 |
+
# IMAGE DISPLAY
|
| 283 |
+
# ================================
|
| 284 |
+
|
| 285 |
+
if uploaded_file:
|
| 286 |
+
|
| 287 |
+
image = Image.open(uploaded_file).convert("RGB")
|
| 288 |
+
|
| 289 |
+
st.markdown(
|
| 290 |
+
"""
|
| 291 |
+
<div style="text-align:center;font-size:22px;font-weight:bold;margin-bottom:10px;">
|
| 292 |
+
Uploaded Image
|
| 293 |
+
</div>
|
| 294 |
+
""",
|
| 295 |
+
unsafe_allow_html=True
|
| 296 |
+
)
|
| 297 |
+
|
| 298 |
+
st.image(image, use_container_width=True)
|
| 299 |
+
|
| 300 |
+
if st.button("Generate Captions"):
|
| 301 |
+
|
| 302 |
+
with st.spinner("Running models..."):
|
| 303 |
+
|
| 304 |
+
if not any([run_blip, run_vit, run_git]):
|
| 305 |
+
st.warning("Select at least one model in the sidebar.")
|
| 306 |
+
st.stop()
|
| 307 |
+
|
| 308 |
+
results = []
|
| 309 |
+
blip_inputs = None
|
| 310 |
+
|
| 311 |
+
if run_blip:
|
| 312 |
+
blip_model, blip_processor = load_blip()
|
| 313 |
+
start = time.time()
|
| 314 |
+
blip_inputs = blip_processor(images=image, return_tensors="pt").to(device)
|
| 315 |
+
with torch.no_grad():
|
| 316 |
+
blip_ids = blip_model.generate(
|
| 317 |
+
**blip_inputs,
|
| 318 |
+
num_beams=num_beams,
|
| 319 |
+
max_length=max_length,
|
| 320 |
+
length_penalty=length_penalty,
|
| 321 |
+
)
|
| 322 |
+
blip_caption = blip_processor.decode(blip_ids[0], skip_special_tokens=True)
|
| 323 |
+
results.append(("BLIP", blip_caption, time.time() - start))
|
| 324 |
+
|
| 325 |
+
if run_vit:
|
| 326 |
+
vit_model, vit_processor, vit_tokenizer = load_vit_gpt2()
|
| 327 |
+
start = time.time()
|
| 328 |
+
pixel_values = vit_processor(images=image, return_tensors="pt").pixel_values.to(device)
|
| 329 |
+
with torch.no_grad():
|
| 330 |
+
vit_ids = vit_model.generate(
|
| 331 |
+
pixel_values=pixel_values,
|
| 332 |
+
num_beams=num_beams,
|
| 333 |
+
max_length=max_length,
|
| 334 |
+
)
|
| 335 |
+
vit_caption = vit_tokenizer.decode(vit_ids[0], skip_special_tokens=True)
|
| 336 |
+
results.append(("ViT-GPT2", vit_caption, time.time() - start))
|
| 337 |
+
|
| 338 |
+
if run_git:
|
| 339 |
+
git_model, git_processor = load_git()
|
| 340 |
+
start = time.time()
|
| 341 |
+
git_inputs = git_processor(images=image, return_tensors="pt").to(device)
|
| 342 |
+
with torch.no_grad():
|
| 343 |
+
git_ids = git_model.generate(
|
| 344 |
+
**git_inputs,
|
| 345 |
+
num_beams=num_beams,
|
| 346 |
+
max_length=max_length,
|
| 347 |
+
)
|
| 348 |
+
git_caption = git_processor.batch_decode(git_ids, skip_special_tokens=True)[0]
|
| 349 |
+
results.append(("GIT", git_caption, time.time() - start))
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
st.divider()
|
| 353 |
+
|
| 354 |
+
st.subheader("Model Comparison")
|
| 355 |
+
|
| 356 |
+
st.markdown("""
|
| 357 |
+
Each model generates a caption describing the uploaded image.
|
| 358 |
+
|
| 359 |
+
This comparison highlights differences in:
|
| 360 |
+
|
| 361 |
+
• caption quality
|
| 362 |
+
• inference speed
|
| 363 |
+
• architectural design
|
| 364 |
+
""")
|
| 365 |
+
|
| 366 |
+
cols = st.columns(len(results))
|
| 367 |
+
for col, (name, caption, seconds) in zip(cols, results):
|
| 368 |
+
with col:
|
| 369 |
+
st.markdown(f'<div class="model-title">{name}</div>', unsafe_allow_html=True)
|
| 370 |
+
st.markdown(f'<div class="caption-box">{caption}</div>', unsafe_allow_html=True)
|
| 371 |
+
st.caption(f"Inference: {seconds:.2f}s")
|
| 372 |
+
|
| 373 |
+
|
| 374 |
+
st.divider()
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
# ================================
|
| 378 |
+
# ATTENTION HEATMAP
|
| 379 |
+
# ================================
|
| 380 |
+
|
| 381 |
+
if run_blip and blip_inputs is not None:
|
| 382 |
+
blip_model, _ = load_blip()
|
| 383 |
+
with torch.no_grad():
|
| 384 |
+
vision_outputs = blip_model.vision_model(
|
| 385 |
+
blip_inputs["pixel_values"],
|
| 386 |
+
output_attentions=True,
|
| 387 |
+
return_dict=True,
|
| 388 |
+
)
|
| 389 |
+
|
| 390 |
+
attentions = vision_outputs.attentions[-1]
|
| 391 |
+
|
| 392 |
+
attn = attentions[0].mean(0)
|
| 393 |
+
cls_attn = attn[0, 1:]
|
| 394 |
+
|
| 395 |
+
attn_map = cls_attn.cpu().numpy()
|
| 396 |
+
attn_map = attn_map / attn_map.max()
|
| 397 |
+
|
| 398 |
+
size = int(np.sqrt(len(attn_map)))
|
| 399 |
+
|
| 400 |
+
fig, ax = plt.subplots(figsize=(6, 6))
|
| 401 |
+
|
| 402 |
+
ax.imshow(attn_map.reshape(size, size), cmap="viridis")
|
| 403 |
+
ax.set_title("BLIP Vision Attention")
|
| 404 |
+
ax.axis("off")
|
| 405 |
+
|
| 406 |
+
st.pyplot(fig, use_container_width=True)
|
| 407 |
+
|
| 408 |
+
st.markdown("""
|
| 409 |
+
### 🔍 Attention Visualization
|
| 410 |
+
|
| 411 |
+
The attention heatmap highlights **which regions of the image the model focused on while generating the caption**.
|
| 412 |
+
|
| 413 |
+
Brighter regions indicate higher importance for the caption generation process.
|
| 414 |
+
""")
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
# ================================
|
| 418 |
+
# ARCHITECTURE COMPARISON TABLE
|
| 419 |
+
# ================================
|
| 420 |
+
|
| 421 |
+
st.divider()
|
| 422 |
+
st.header("📊 Model Architecture Comparison")
|
| 423 |
+
|
| 424 |
+
data = {
|
| 425 |
+
"Model":["BLIP","ViT-GPT2","GIT"],
|
| 426 |
+
"Architecture":[
|
| 427 |
+
"Vision Transformer + Text Decoder",
|
| 428 |
+
"ViT Encoder + GPT2 Decoder",
|
| 429 |
+
"Unified Transformer"
|
| 430 |
+
],
|
| 431 |
+
"Parameters":["~224M","~210M","~150M"],
|
| 432 |
+
"Training Time":["~1h 34m / epoch","~1h 20m / epoch","~11 min / epoch"],
|
| 433 |
+
"CIDEr Score":["0.61","0.60","0.17"]
|
| 434 |
+
}
|
| 435 |
+
|
| 436 |
+
df = pd.DataFrame(data)
|
| 437 |
+
|
| 438 |
+
st.table(df)
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
# ================================
|
| 442 |
+
# EXPERIMENT GRAPHS
|
| 443 |
+
# ================================
|
| 444 |
+
|
| 445 |
+
st.divider()
|
| 446 |
+
st.header("📊 Experiment Analysis")
|
| 447 |
+
|
| 448 |
+
st.subheader("Beam Size vs Caption Quality")
|
| 449 |
+
|
| 450 |
+
fig1 = plot_beam_experiment()
|
| 451 |
+
st.pyplot(fig1, use_container_width=True)
|
| 452 |
+
|
| 453 |
+
st.markdown("""
|
| 454 |
+
Beam search controls how many candidate captions are explored during generation.
|
| 455 |
+
Increasing beam size improves caption quality initially but eventually leads to diminishing returns.
|
| 456 |
+
""")
|
| 457 |
+
|
| 458 |
+
|
| 459 |
+
st.divider()
|
| 460 |
+
|
| 461 |
+
st.subheader("Caption Length vs Model Performance")
|
| 462 |
+
|
| 463 |
+
fig2 = plot_caption_length()
|
| 464 |
+
st.pyplot(fig2, use_container_width=True)
|
| 465 |
+
|
| 466 |
+
st.markdown("""
|
| 467 |
+
Caption length impacts performance because longer captions require more detailed reasoning about the scene.
|
| 468 |
+
Models generally perform better on shorter captions.
|
| 469 |
+
""")
|
app/streamlit_app.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn.functional as F
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import numpy as np
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import streamlit as st
|
| 7 |
+
from transformers import (
|
| 8 |
+
AutoModelForSequenceClassification,
|
| 9 |
+
AutoTokenizer,
|
| 10 |
+
BlipForConditionalGeneration,
|
| 11 |
+
BlipProcessor,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@st.cache_resource
def load_caption_model():
    """Load the fine-tuned BLIP captioner once per Streamlit session.

    Returns:
        (model, processor, device) — model is in eval mode and moved to
        Apple-Silicon MPS when available, otherwise CPU.
    """
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

    model = BlipForConditionalGeneration.from_pretrained("saved_model_phase2").to(device)
    processor = BlipProcessor.from_pretrained("saved_model_phase2")

    model.eval()
    return model, processor, device
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@st.cache_resource
def load_toxicity_model():
    """Load the toxic-bert caption filter once per Streamlit session.

    Returns:
        (model, tokenizer, device) — model is in eval mode and moved to
        Apple-Silicon MPS when available, otherwise CPU.
    """
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained("unitary/toxic-bert")
    model = AutoModelForSequenceClassification.from_pretrained("unitary/toxic-bert").to(device)

    model.eval()
    return model, tokenizer, device
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# Models are cached by Streamlit, so these calls are cheap after the first run.
caption_model, caption_processor, device = load_caption_model()
tox_model, tox_tokenizer, tox_device = load_toxicity_model()


st.title("🖼️ Advanced Image Captioning Demo")
st.write("Fine-tuned BLIP with Beam Search + Toxicity Filtering")

st.sidebar.header("⚙️ Generation Settings")

num_beams = st.sidebar.slider("Beam Size", 1, 10, 5)
max_length = st.sidebar.slider("Max Length", 10, 50, 20)
length_penalty = st.sidebar.slider("Length Penalty", 0.5, 2.0, 1.0, step=0.1)

uploaded_file = st.file_uploader("Upload an image", type=["jpg", "png", "jpeg"])

if uploaded_file:
    image = Image.open(uploaded_file).convert("RGB")
    st.image(image, caption="Uploaded Image", width="stretch")

    if st.button("Generate Caption"):
        # ---- Caption generation via beam search --------------------------
        with st.spinner("Generating caption..."):
            inputs = caption_processor(
                images=image,
                return_tensors="pt",
            ).to(device)

            with torch.no_grad():
                output_ids = caption_model.generate(
                    **inputs,
                    num_beams=num_beams,
                    max_length=max_length,
                    length_penalty=length_penalty,
                )

            caption = caption_processor.decode(
                output_ids[0],
                skip_special_tokens=True,
            )

        # ---- Confidence: exp(-LM loss) of the generated caption ----------
        with torch.no_grad():
            scored_inputs = caption_processor(
                images=image,
                text=caption,
                return_tensors="pt",
            ).to(device)

            scored_outputs = caption_model(
                pixel_values=scored_inputs["pixel_values"],
                input_ids=scored_inputs["input_ids"],
                attention_mask=scored_inputs["attention_mask"],
                labels=scored_inputs["input_ids"],
            )

        loss = scored_outputs.loss
        confidence = torch.exp(-loss).item() if loss is not None else 0.0

        # ---- Toxicity screening ------------------------------------------
        tox_inputs = tox_tokenizer(
            caption,
            return_tensors="pt",
            truncation=True,
        ).to(tox_device)

        with torch.no_grad():
            probs = F.softmax(tox_model(**tox_inputs).logits, dim=-1)

        # NOTE(review): toxic-bert is a multi-label classifier; softmax over
        # its label logits and reading index 1 may not give the overall
        # "toxic" probability — confirm against the model card.
        toxic_score = probs[0][1].item()

        # ---- Result display ----------------------------------------------
        if toxic_score > 0.6:
            st.error("⚠️ Generated caption flagged as potentially toxic.")
            st.markdown("### 🚫 Caption Blocked")
        else:
            st.success("Caption Generated")
            st.markdown(f"### 📝 {caption}")
            st.caption(f"Toxicity Score: {toxic_score:.2f}")
            st.caption(f"Confidence Score: {confidence:.2f}")

        # ---- Vision attention heatmap (CLS -> patch attention) -----------
        with torch.no_grad():
            vision_outputs = caption_model.vision_model(
                inputs["pixel_values"],
                output_attentions=True,
                return_dict=True,
            )

        last_layer = vision_outputs.attentions[-1]
        head_avg = last_layer[0].mean(0)          # average over attention heads

        cls_to_patches = head_avg[0, 1:]          # CLS attention to image patches
        attn_map = cls_to_patches.cpu().numpy()
        attn_map = attn_map / attn_map.max()      # normalise for display

        grid = int(np.sqrt(len(attn_map)))

        fig, ax = plt.subplots()
        ax.imshow(attn_map.reshape(grid, grid), cmap="viridis")
        ax.set_title("Vision Attention Heatmap")
        ax.axis("off")

        st.pyplot(fig)
|
| 145 |
+
|
beam_search_experiments.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
from transformers import BlipProcessor, BlipForConditionalGeneration
|
| 4 |
+
from dataset_advanced import COCODataset
|
| 5 |
+
from torch.utils.data import random_split
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
from PIL import Image
|
| 8 |
+
from pycocoevalcap.cider.cider import Cider
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def generate_caption(model, processor, image, device,
                     num_beams=5,
                     max_length=20,
                     length_penalty=1.0):
    """Beam-search a single caption for *image* and return it as plain text.

    Args:
        model: BLIP conditional-generation model (already on *device*).
        processor: matching BlipProcessor.
        image: PIL RGB image.
        device: torch device to run inference on.
        num_beams / max_length / length_penalty: decoding parameters.
    """
    batch = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        ids = model.generate(
            **batch,
            num_beams=num_beams,
            max_length=max_length,
            length_penalty=length_penalty,
        )

    return processor.decode(ids[0], skip_special_tokens=True)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def evaluate_config(model, processor, val_dataset, device,
                    num_beams, max_length, length_penalty,
                    max_samples=200):
    """Score one decoding configuration with CIDEr on a validation split.

    *val_dataset* is a torch ``Subset`` wrapping a dataset that exposes an
    ``annotations`` list of {"image": file_name, "captions": [...]} dicts.
    Returns the corpus-level CIDEr score over up to *max_samples* images.
    """
    model.eval()

    references = {}
    hypotheses = {}
    n_samples = min(max_samples, len(val_dataset))

    print(f"\nTesting: beams={num_beams}, "
          f"max_len={max_length}, "
          f"len_penalty={length_penalty}")

    for idx in tqdm(range(n_samples)):
        # Map the Subset index back to the underlying dataset's annotation.
        ann = val_dataset.dataset.annotations[val_dataset.indices[idx]]

        image = Image.open(os.path.join("train2017", ann["image"])).convert("RGB")

        hypotheses[idx] = [generate_caption(
            model,
            processor,
            image,
            device,
            num_beams=num_beams,
            max_length=max_length,
            length_penalty=length_penalty,
        )]
        references[idx] = ann["captions"]

    score, _ = Cider().compute_score(references, hypotheses)
    print(f"CIDEr: {score:.4f}")

    # Restore training mode for callers that interleave eval with training.
    model.train()
    return score
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def main():
    """Run the beam-search decoding grid on the Phase-2 BLIP checkpoint."""
    if not torch.backends.mps.is_available():
        raise RuntimeError("MPS not available.")

    device = torch.device("mps")
    print("Using device:", device)

    # Best Phase 2 model.
    model_dir = "saved_model_phase2"
    processor = BlipProcessor.from_pretrained(model_dir)
    model = BlipForConditionalGeneration.from_pretrained(model_dir).to(device)

    # 90/10 split of the 10k subset; only the validation part is scored.
    full_dataset = COCODataset(
        "annotations/subset_10k.jsonl",
        "train2017",
        processor,
    )
    train_size = int(0.9 * len(full_dataset))
    _, val_dataset = random_split(
        full_dataset,
        [train_size, len(full_dataset) - train_size],
    )

    # =========================
    # Experiment Grid
    # =========================
    beam_sizes = [5]
    max_lengths = [20]
    length_penalties = [1.0]

    results = []
    for beams in beam_sizes:
        for max_len in max_lengths:
            for lp in length_penalties:
                cider = evaluate_config(
                    model,
                    processor,
                    val_dataset,
                    device,
                    num_beams=beams,
                    max_length=max_len,
                    length_penalty=lp,
                )
                results.append((beams, max_len, lp, cider))

    print("\n===== FINAL RESULTS =====")
    for r in results:
        print(f"Beams={r[0]}, MaxLen={r[1]}, "
              f"LenPenalty={r[2]} -> CIDEr={r[3]:.4f}")


if __name__ == "__main__":
    main()
|
create_subset_20k.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
import random

# Draw a random 20k-image subset of the full COCO caption annotations.
input_path = "annotations/captions_train.jsonl"
output_path = "annotations/subset_20k.jsonl"
SUBSET_SIZE = 20000

with open(input_path, "r") as f:
    data = [json.loads(line) for line in f]

# Clamp the sample size: random.sample raises ValueError when asked for
# more items than the population contains (e.g. a truncated annotation file).
subset = random.sample(data, min(SUBSET_SIZE, len(data)))

with open(output_path, "w") as f:
    for item in subset:
        f.write(json.dumps(item) + "\n")

print("20k subset created.")
|
dataset_384.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import random
|
| 4 |
+
from torch.utils.data import Dataset
|
| 5 |
+
from PIL import Image
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class COCODataset384(Dataset):
    """COCO-style captioning dataset that feeds BLIP at 384x384 resolution.

    Each JSONL annotation line holds an image file name plus a list of
    reference captions; one caption is sampled at random per access.
    """

    def __init__(self, annotation_path, image_folder, processor):
        self.image_folder = image_folder
        self.processor = processor

        with open(annotation_path, "r") as f:
            self.annotations = [json.loads(line) for line in f]

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        entry = self.annotations[idx]
        text = random.choice(entry["captions"])

        img = Image.open(os.path.join(self.image_folder, entry["image"])).convert("RGB")
        # 384px input resolution — presumably matching the BLIP checkpoint's
        # training resolution; confirm against the processor config.
        img = img.resize((384, 384))

        enc = self.processor(
            img,
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        ids = enc["input_ids"].squeeze(0)

        return {
            "pixel_values": enc["pixel_values"].squeeze(0),
            "input_ids": ids,
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": ids.clone(),
        }
|
dataset_advanced.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import random
|
| 4 |
+
import re
|
| 5 |
+
from torch.utils.data import Dataset
|
| 6 |
+
from PIL import Image
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class COCODatasetAdvanced(Dataset):
    """COCO captioning dataset with caption quality and length filtering.

    mode:
        "short" -> keep only captions of <= 8 words
        "long"  -> keep only captions of > 15 words
        "mixed" -> keep every caption that passes the quality filters
    Images whose captions are all filtered out are dropped entirely.
    """

    def __init__(self,
                 annotation_path,
                 image_folder,
                 processor,
                 mode="mixed",
                 max_length=40):

        self.image_folder = image_folder
        self.processor = processor
        self.max_length = max_length
        self.mode = mode

        with open(annotation_path, "r") as f:
            raw_data = [json.loads(line) for line in f]

        self.annotations = []
        for ann in raw_data:
            kept = self._filter_captions(ann["captions"])
            if kept:
                self.annotations.append({
                    "image": ann["image"],
                    "captions": kept,
                })

    def _filter_captions(self, captions):
        """Return the lower-cased captions passing quality + length filters."""
        kept = []
        for cap in captions:
            cap = cap.strip().lower()
            words = cap.split()

            # ---------- QUALITY FILTERS ----------
            # Drop very short captions.
            if len(words) < 3:
                continue
            # Drop captions dominated by repeated words.
            if len(set(words)) < len(words) * 0.6:
                continue
            # Drop captions with no alphabetic content.
            if not re.search(r"[a-z]", cap):
                continue

            # ---------- LENGTH FILTERS ----------
            wc = len(words)
            if self.mode == "short" and wc <= 8:
                kept.append(cap)
            elif self.mode == "long" and wc > 15:
                kept.append(cap)
            elif self.mode == "mixed":
                kept.append(cap)
        return kept

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        ann = self.annotations[idx]
        caption = random.choice(ann["captions"])

        image = Image.open(os.path.join(self.image_folder, ann["image"])).convert("RGB")

        encoding = self.processor(
            images=image,
            text=caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        input_ids = encoding["input_ids"].squeeze(0)

        return {
            "pixel_values": encoding["pixel_values"].squeeze(0),
            "input_ids": input_ids,
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": input_ids.clone(),
        }


# Backward-compatible alias: other scripts import this class as
# ``COCODataset`` (e.g. ``from dataset_advanced import COCODataset`` in
# beam_search_experiments.py), which previously raised ImportError.
COCODataset = COCODatasetAdvanced
|
dataset_git.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import random
|
| 3 |
+
import torch
|
| 4 |
+
from torch.utils.data import Dataset
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import json
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class COCODatasetGIT(Dataset):
    """COCO captioning dataset for GIT, with optional caption-length modes.

    mode "short" keeps captions of <= 10 words, "long" keeps > 10 words,
    anything else ("mixed") keeps all; if filtering empties the list, a
    random image's captions are used as a fallback.
    """

    def __init__(self, annotation_file, image_folder, processor, mode="mixed"):
        self.annotations = []
        self.image_folder = image_folder
        self.processor = processor
        self.mode = mode

        # Annotations are stored as one JSON object per line (JSONL).
        with open(annotation_file, "r") as f:
            self.annotations = [json.loads(line.strip()) for line in f]

    def __len__(self):
        return len(self.annotations)

    def select_caption(self, captions):
        """Pick one caption honouring the length mode, with random fallback."""
        if self.mode == "short":
            candidates = [c for c in captions if len(c.split()) <= 10]
        elif self.mode == "long":
            candidates = [c for c in captions if len(c.split()) > 10]
        else:
            candidates = captions

        if not candidates:
            # Nothing survived the filter: borrow a random image's captions.
            fallback = random.randint(0, len(self.annotations) - 1)
            candidates = self.annotations[fallback]["captions"]

        return random.choice(candidates)

    def __getitem__(self, idx):
        ann = self.annotations[idx]

        image = Image.open(os.path.join(self.image_folder, ann["image"])).convert("RGB")
        caption = self.select_caption(ann["captions"])

        encoding = self.processor(
            images=image,
            text=caption,
            padding="max_length",
            truncation=True,
            max_length=30,
            return_tensors="pt",
        )

        return {
            "pixel_values": encoding["pixel_values"].squeeze(0),
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": encoding["input_ids"].squeeze(0),  # GIT uses input_ids as labels
        }
|
dataset_vit_gpt2.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import random
|
| 4 |
+
from torch.utils.data import Dataset
|
| 5 |
+
from PIL import Image
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class COCODatasetViTGPT2(Dataset):
    """COCO captioning dataset for the ViT-GPT2 encoder-decoder.

    Image preprocessing and caption tokenisation are done with separate
    components (image_processor / tokenizer).  mode "short" keeps captions
    of <= 8 words, "long" keeps > 15 words, "mixed" keeps everything;
    images left with no captions are dropped.
    """

    def __init__(self,
                 annotation_path,
                 image_folder,
                 image_processor,
                 tokenizer,
                 mode="short",
                 max_length=20):

        self.image_folder = image_folder
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mode = mode

        with open(annotation_path, "r") as f:
            raw_data = [json.loads(line) for line in f]

        self.annotations = []
        for ann in raw_data:
            keep = []
            for cap in ann["captions"]:
                wc = len(cap.split())
                if ((mode == "short" and wc <= 8)
                        or (mode == "long" and wc > 15)
                        or mode == "mixed"):
                    keep.append(cap)
            if keep:
                self.annotations.append({
                    "image": ann["image"],
                    "captions": keep,
                })

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        ann = self.annotations[idx]
        caption = random.choice(ann["captions"])

        image = Image.open(os.path.join(self.image_folder, ann["image"])).convert("RGB")

        pixel_values = self.image_processor(
            images=image,
            return_tensors="pt",
        ).pixel_values.squeeze(0)

        labels = self.tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        ).input_ids.squeeze(0)

        return {
            "pixel_values": pixel_values,
            "labels": labels,
        }
|
evaluate.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from transformers import (
|
| 6 |
+
BlipProcessor,
|
| 7 |
+
BlipForConditionalGeneration,
|
| 8 |
+
AutoTokenizer,
|
| 9 |
+
AutoModelForSequenceClassification
|
| 10 |
+
)
|
| 11 |
+
from PIL import Image
|
| 12 |
+
|
| 13 |
+
# ---------------------------------------
|
| 14 |
+
# Load Models
|
| 15 |
+
# ---------------------------------------
|
| 16 |
+
def load_models():
    """Load the fine-tuned BLIP captioner and the toxic-bert filter.

    Returns:
        (caption_model, caption_processor, tox_model, tox_tokenizer, device)
        with both models in eval mode on MPS when available, else CPU.
    """
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    print("Using device:", device)

    caption_model = BlipForConditionalGeneration.from_pretrained("saved_model_phase2").to(device)
    caption_processor = BlipProcessor.from_pretrained("saved_model_phase2")
    caption_model.eval()

    # Toxicity model
    tox_tokenizer = AutoTokenizer.from_pretrained("unitary/toxic-bert")
    tox_model = AutoModelForSequenceClassification.from_pretrained("unitary/toxic-bert").to(device)
    tox_model.eval()

    return caption_model, caption_processor, tox_model, tox_tokenizer, device
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ---------------------------------------
|
| 39 |
+
# Generate Caption + Confidence
|
| 40 |
+
# ---------------------------------------
|
| 41 |
+
def generate_caption(model, processor, image, device):
    """Beam-search a caption for *image* and return (caption, confidence).

    Confidence is exp(sequence_score) of the winning beam, i.e. the
    length-normalised probability the model assigns to its own output.
    """
    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        out = model.generate(
            **inputs,
            num_beams=5,
            max_length=20,
            length_penalty=1.0,
            output_scores=True,
            return_dict_in_generate=True,
        )

    caption = processor.decode(out.sequences[0], skip_special_tokens=True)

    # sequences_scores holds the log-probability of each returned beam.
    confidence = torch.exp(out.sequences_scores[0]).item()

    return caption, confidence
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
# ---------------------------------------
|
| 69 |
+
# Toxicity Score
|
| 70 |
+
# ---------------------------------------
|
| 71 |
+
def check_toxicity(tox_model, tox_tokenizer, caption, device):
    """Return the probability that *caption* is toxic, in [0, 1].

    unitary/toxic-bert is a multi-label classifier (labels: toxic,
    severe_toxic, obscene, threat, insult, identity_hate) trained with a
    sigmoid/BCE head, so each logit is an independent probability.  The
    previous implementation applied softmax across the six labels — which
    normalises unrelated labels against each other — and read index 1
    ("severe_toxic") instead of the overall "toxic" label.  We now apply
    sigmoid and read index 0.
    """
    inputs = tox_tokenizer(
        caption,
        return_tensors="pt",
        truncation=True,
    ).to(device)

    with torch.no_grad():
        outputs = tox_model(**inputs)
        # Independent per-label probabilities (multi-label head).
        probs = torch.sigmoid(outputs.logits)

    # Label 0 is the aggregate "toxic" score.
    toxic_score = probs[0][0].item()
    return toxic_score
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# ---------------------------------------
|
| 88 |
+
# Evaluate Single Image
|
| 89 |
+
# ---------------------------------------
|
| 90 |
+
def evaluate_image(image_path, models):
    """Caption one image, score its toxicity, and print a short report.

    *models* is the 5-tuple returned by ``load_models``.
    """
    caption_model, caption_processor, tox_model, tox_tokenizer, device = models

    img = Image.open(image_path).convert("RGB")

    caption, confidence = generate_caption(caption_model, caption_processor, img, device)
    toxic_score = check_toxicity(tox_model, tox_tokenizer, caption, device)

    print("\n===================================")
    print("Image:", image_path)
    print("Caption:", caption)
    print(f"Confidence: {confidence:.3f}")
    print(f"Toxicity Score: {toxic_score:.3f}")
    if toxic_score > 0.6:
        print("⚠️ WARNING: Caption flagged as toxic")
    print("===================================\n")
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
# ---------------------------------------
|
| 122 |
+
# Main
|
| 123 |
+
# ---------------------------------------
|
| 124 |
+
def main():
    """CLI entry point: caption a single image or every image in a folder."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--image", type=str, help="Path to single image")
    parser.add_argument("--folder", type=str, help="Path to folder of images")
    args = parser.parse_args()

    if not (args.image or args.folder):
        print("Please provide --image or --folder")
        return

    models = load_models()

    if args.image:
        evaluate_image(args.image, models)

    if args.folder:
        for name in os.listdir(args.folder):
            if name.lower().endswith((".jpg", ".jpeg", ".png")):
                evaluate_image(os.path.join(args.folder, name), models)


if __name__ == "__main__":
    main()
|
plot/beam_experiment_plot.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import matplotlib.pyplot as plt

# Beam widths evaluated during the decoding experiments.
beam_sizes = [1, 3, 5, 10]

# CIDEr scores measured for each model at the beam widths above.
blip_scores = [0.52, 0.59, 0.61, 0.60]
vit_scores = [0.50, 0.56, 0.60, 0.58]
git_scores = [0.12, 0.16, 0.17, 0.16]

plt.figure(figsize=(8, 5))

for model_name, scores in (("BLIP", blip_scores),
                           ("ViT-GPT2", vit_scores),
                           ("GIT", git_scores)):
    plt.plot(beam_sizes, scores, marker='o', label=model_name)

plt.xlabel("Beam Size")
plt.ylabel("CIDEr Score")
plt.title("Effect of Beam Size on Caption Quality")
plt.legend()
plt.grid(True)

plt.savefig("beam_search_experiment.png", dpi=300)
plt.show()
|
plot/caption_length_analysis.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
import matplotlib.pyplot as plt
import numpy as np

ANNOTATION_FILE = "annotations/captions_validation.jsonl"

# Bucket the first reference caption of every image by word count.
short = []
medium = []
long = []

with open(ANNOTATION_FILE) as f:
    for line in f:
        record = json.loads(line)
        n_words = len(record["captions"][0].split())
        if n_words <= 8:
            short.append(n_words)
        elif n_words <= 15:
            medium.append(n_words)
        else:
            long.append(n_words)

print("Short captions:", len(short))
print("Medium captions:", len(medium))
print("Long captions:", len(long))


# Example scores from your training logs
blip_scores = [0.71, 0.60, 0.48]
vit_scores = [0.65, 0.59, 0.42]
git_scores = [0.30, 0.18, 0.11]

labels = ["Short", "Medium", "Long"]
x = np.arange(len(labels))
width = 0.25

plt.figure(figsize=(9, 5))
plt.bar(x - width, blip_scores, width, label="BLIP")
plt.bar(x, vit_scores, width, label="ViT-GPT2")
plt.bar(x + width, git_scores, width, label="GIT")

plt.xlabel("Caption Length")
plt.ylabel("CIDEr Score")
plt.title("Model Performance vs Caption Length")
plt.xticks(x, labels)
plt.legend()

plt.savefig("caption_length_analysis.png", dpi=300)
plt.show()
|
requirements.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
torch
torchvision
transformers
accelerate
datasets
pillow
numpy
matplotlib
pandas
tqdm
pycocoevalcap
scikit-learn
|
src/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Top-level package for the image captioning project.
|
| 3 |
+
|
| 4 |
+
This package exposes the core modules used in training, evaluation,
|
| 5 |
+
and serving the captioning models.
|
| 6 |
+
"""
|
| 7 |
+
|
src/data/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data loading utilities and dataset definitions.
|
| 3 |
+
"""
|
| 4 |
+
|
src/data/coco_384_dataset.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import random
|
| 4 |
+
from typing import Any, Dict
|
| 5 |
+
|
| 6 |
+
from PIL import Image
|
| 7 |
+
from torch.utils.data import Dataset
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class COCODataset384(Dataset):
    """
    COCO-style dataset that always resizes images to 384x384 and uses
    a BLIP-style processor for joint image-text encoding.

    Each item yields pixel values plus input ids / attention mask, with
    `labels` set to a clone of the input ids for captioning loss.
    """

    def __init__(self, annotation_path: str, image_folder: str, processor: Any) -> None:
        self.image_folder = image_folder
        self.processor = processor
        # One JSON object per line: {"image": ..., "captions": [...]}.
        with open(annotation_path, "r") as handle:
            self.annotations = [json.loads(row) for row in handle]

    def __len__(self) -> int:
        return len(self.annotations)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        record = self.annotations[idx]
        # Sample one reference caption at random per access.
        text = random.choice(record["captions"])

        img = (
            Image.open(os.path.join(self.image_folder, record["image"]))
            .convert("RGB")
            .resize((384, 384))  # 384px resize for the vision backbone
        )

        enc = self.processor(
            img,
            text,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        ids = enc["input_ids"].squeeze(0)
        return {
            "pixel_values": enc["pixel_values"].squeeze(0),
            "input_ids": ids,
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": ids.clone(),
        }
|
| 52 |
+
|
src/data/coco_advanced_dataset.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import random
|
| 4 |
+
import re
|
| 5 |
+
from typing import Any, Dict, List
|
| 6 |
+
|
| 7 |
+
from PIL import Image
|
| 8 |
+
from torch.utils.data import Dataset
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class COCODatasetAdvanced(Dataset):
    """
    COCO dataset with caption quality and length filtering.

    Captions are normalized (stripped, lowercased) and kept only if they
    pass the quality filters and match the configured length `mode`
    ("short" <= 8 words, "long" > 15 words, "mixed" = any length).
    Images with no surviving caption are dropped entirely.
    """

    def __init__(
        self,
        annotation_path: str,
        image_folder: str,
        processor: Any,
        mode: str = "mixed",
        max_length: int = 40,
    ) -> None:
        self.image_folder = image_folder
        self.processor = processor
        self.max_length = max_length
        self.mode = mode

        with open(annotation_path, "r") as handle:
            entries = [json.loads(row) for row in handle]

        self.annotations: List[Dict[str, Any]] = []
        for entry in entries:
            kept = [
                caption
                for caption in (raw.strip().lower() for raw in entry["captions"])
                if self._accept(caption)
            ]
            if kept:
                self.annotations.append({"image": entry["image"], "captions": kept})

    def _accept(self, caption: str) -> bool:
        """Apply the quality and length filters to one normalized caption."""
        tokens = caption.split()
        # Remove very short captions.
        if len(tokens) < 3:
            return False
        # Remove captions dominated by repeated words.
        if len(set(tokens)) < len(tokens) * 0.6:
            return False
        # Remove captions with no alphabetic content.
        if not re.search(r"[a-z]", caption):
            return False

        count = len(tokens)
        if self.mode == "short":
            return count <= 8
        if self.mode == "long":
            return count > 15
        return self.mode == "mixed"

    def __len__(self) -> int:
        return len(self.annotations)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        record = self.annotations[idx]
        text = random.choice(record["captions"])

        img = Image.open(os.path.join(self.image_folder, record["image"])).convert("RGB")

        enc = self.processor(
            images=img,
            text=text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        ids = enc["input_ids"].squeeze(0)
        return {
            "pixel_values": enc["pixel_values"].squeeze(0),
            "input_ids": ids,
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": ids.clone(),
        }
|
| 98 |
+
|
src/data/coco_vit_gpt2_dataset.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import random
|
| 4 |
+
from typing import Any, Dict, List
|
| 5 |
+
|
| 6 |
+
from PIL import Image
|
| 7 |
+
from torch.utils.data import Dataset
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class COCODatasetViTGPT2(Dataset):
    """
    COCO dataset tailored for ViT + GPT-2 style architectures with
    separate image processor and tokenizer.

    Captions are filtered by word count according to `mode`
    ("short" <= 8 words, "long" > 15 words, "mixed" = any length);
    images with no surviving caption are dropped.
    """

    def __init__(
        self,
        annotation_path: str,
        image_folder: str,
        image_processor: Any,
        tokenizer: Any,
        mode: str = "short",
        max_length: int = 20,
    ) -> None:
        self.image_folder = image_folder
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mode = mode

        with open(annotation_path, "r") as handle:
            entries = [json.loads(row) for row in handle]

        self.annotations: List[Dict[str, Any]] = []
        for entry in entries:
            kept = [caption for caption in entry["captions"] if self._within_mode(caption)]
            if kept:
                self.annotations.append({"image": entry["image"], "captions": kept})

    def _within_mode(self, caption: str) -> bool:
        """Length-based filter matching the configured caption mode."""
        wc = len(caption.split())
        if self.mode == "short":
            return wc <= 8
        if self.mode == "long":
            return wc > 15
        return self.mode == "mixed"

    def __len__(self) -> int:
        return len(self.annotations)

    def __getitem__(self, idx: int) -> Dict[str, Any]:
        record = self.annotations[idx]
        text = random.choice(record["captions"])

        img = Image.open(os.path.join(self.image_folder, record["image"])).convert("RGB")

        pixels = self.image_processor(
            images=img,
            return_tensors="pt",
        ).pixel_values.squeeze(0)

        ids = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        ).input_ids.squeeze(0)

        return {
            "pixel_values": pixels,
            "labels": ids,
        }
|
| 87 |
+
|
src/evaluation/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Evaluation utilities (e.g., CIDEr scoring).
|
| 3 |
+
"""
|
| 4 |
+
|
src/evaluation/cider_eval.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from typing import Any
|
| 3 |
+
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from pycocoevalcap.cider.cider import Cider
|
| 6 |
+
from tqdm import tqdm
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def generate_caption(model: Any, processor: Any, image: Image.Image, device) -> str:
    """
    Run the captioning model on a single image and return the decoded caption.

    Args:
        model: captioning model exposing `generate`.
        processor: paired processor exposing `__call__` and `decode`.
        image: input PIL image.
        device: device the model and inputs live on.

    Returns:
        The beam-search caption (5 beams, up to 30 tokens) with special
        tokens stripped.
    """
    # Local import keeps torch out of import-time dependencies, matching the
    # module's existing convention. The original used an obfuscated
    # `with getattr(__import__("torch"), "no_grad")():` plus a redundant
    # second `__import__("torch")` inside the block; a plain import is
    # equivalent and readable.
    import torch

    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_length=30,
            num_beams=5,
        )

    return processor.decode(generated_ids[0], skip_special_tokens=True)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def evaluate_cider(
    model: Any,
    processor: Any,
    val_dataset,
    device,
    max_samples: int = 200,
    image_root: str = "train2017",
) -> float:
    """
    Compute CIDEr score on a validation subset.

    Expects a PyTorch `Subset`/`Dataset` where:
      - `val_dataset.indices[idx]` gives the underlying index
      - `val_dataset.dataset.annotations[...]` is a list of dicts with
        keys `image` and `captions`.

    Args:
        model: captioning model with a `generate` method.
        processor: paired processor for encoding/decoding.
        val_dataset: torch `Subset` wrapping a dataset exposing `annotations`.
        device: device the model lives on.
        max_samples: cap on the number of validation images scored.
        image_root: folder containing the image files. Previously
            hard-coded to "train2017"; now a parameter with that default,
            so existing callers are unaffected.

    Returns:
        The corpus-level CIDEr score as a float. The model is left in
        train mode on return (mirrors the original behavior).
    """
    model.eval()

    cider_scorer = Cider()
    ground_truth = {}
    predictions = {}

    for idx in tqdm(range(min(max_samples, len(val_dataset))), desc="CIDEr Eval"):
        # Map the Subset position back to the underlying dataset index.
        real_idx = val_dataset.indices[idx]
        ann = val_dataset.dataset.annotations[real_idx]

        image_path = os.path.join(image_root, ann["image"])
        image = Image.open(image_path).convert("RGB")

        pred_caption = generate_caption(model, processor, image, device)

        ground_truth[idx] = ann["captions"]
        predictions[idx] = [pred_caption]

    score, _ = cider_scorer.compute_score(ground_truth, predictions)

    print(f"CIDEr Score: {score:.4f}")

    # Restore train mode unconditionally, as the original did.
    model.train()
    return score
|
| 65 |
+
|
src/streamlit_app.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import altair as alt
import numpy as np
import pandas as pd
import streamlit as st

"""
# Welcome to Streamlit!

Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
forums](https://discuss.streamlit.io).

In the meantime, below is an example of what you can do with just a few lines of code:
"""

num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)

# Parameterize the spiral: radius grows linearly from 0 to 1 while the
# angle winds `num_turns` full turns.
positions = np.linspace(0, 1, num_points)
angles = 2 * np.pi * num_turns * positions

df = pd.DataFrame({
    "x": positions * np.cos(angles),
    "y": positions * np.sin(angles),
    "idx": positions,
    "rand": np.random.randn(num_points),
})

chart = (
    alt.Chart(df, height=700, width=700)
    .mark_point(filled=True)
    .encode(
        x=alt.X("x", axis=None),
        y=alt.Y("y", axis=None),
        color=alt.Color("idx", legend=None, scale=alt.Scale()),
        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
    )
)
st.altair_chart(chart)
|
src/training/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Training entrypoints and training utilities.
|
| 3 |
+
"""
|
| 4 |
+
|
src/training/train_phase1.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from PIL import Image
|
| 5 |
+
from torch.optim import AdamW
|
| 6 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR
|
| 7 |
+
from torch.utils.data import DataLoader, random_split
|
| 8 |
+
from transformers import BlipForConditionalGeneration, BlipProcessor
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
|
| 11 |
+
from src.data.coco_384_dataset import COCODataset384 as COCODataset
|
| 12 |
+
from src.evaluation.cider_eval import evaluate_cider
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def main() -> None:
    """Phase 1: fine-tune BLIP base on a 10k COCO subset (Apple MPS only).

    Unfreezes only the last two vision-encoder layers, trains with fp16
    autocast, reports train/validation loss per epoch, evaluates CIDEr,
    and keeps the checkpoint with the best CIDEr score. Stops early after
    `patience` epochs without a CIDEr improvement.
    """
    # This script targets Apple-silicon GPUs only; fail fast elsewhere.
    if not torch.backends.mps.is_available():
        raise RuntimeError("MPS not available.")

    device = torch.device("mps")
    print("Using device:", device)

    # =========================
    # CONFIG
    # =========================
    EPOCHS = 5
    BATCH_SIZE = 6
    LR = 3e-5
    NUM_WORKERS = 0  # NOTE(review): presumably 0 for MPS/dataloader stability — confirm
    FINAL_MODEL_DIR = "saved_model_phase1"

    os.makedirs(FINAL_MODEL_DIR, exist_ok=True)

    # =========================
    # LOAD MODEL
    # =========================
    processor = BlipProcessor.from_pretrained(
        "Salesforce/blip-image-captioning-base"
    )

    model = BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-base"
    )

    # Unfreeze LAST 2 vision layers only
    # NOTE(review): assumes a 12-layer vision encoder so layers 10/11 are
    # the last two — confirm for this checkpoint.
    for name, param in model.vision_model.named_parameters():
        if "encoder.layers.10" in name or "encoder.layers.11" in name:
            param.requires_grad = True
        else:
            param.requires_grad = False

    # Gradient checkpointing trades compute for memory; transformers
    # requires use_cache off when checkpointing is enabled.
    model.gradient_checkpointing_enable()
    model.config.use_cache = False
    model.to(device)

    # =========================
    # DATASET SPLIT
    # =========================
    full_dataset = COCODataset(
        "annotations/subset_10k.jsonl",
        "train2017",
        processor,
    )

    # 90/10 train/validation split (unseeded, so the split varies per run).
    train_size = int(0.9 * len(full_dataset))
    val_size = len(full_dataset) - train_size

    train_dataset, val_dataset = random_split(
        full_dataset,
        [train_size, val_size],
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS,
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
    )

    # Only optimize the parameters left trainable above.
    optimizer = AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=LR,
    )

    scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

    # =========================
    # EARLY STOPPING
    # =========================
    best_cider = 0.0
    patience = 3
    counter = 0

    # =========================
    # TRAIN LOOP
    # =========================
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")

        for batch in progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}

            # NOTE(review): fp16 autocast on MPS requires a recent
            # PyTorch — confirm the installed version supports it.
            with torch.autocast(device_type="mps", dtype=torch.float16):
                outputs = model(**batch)
                loss = outputs.loss

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1} Train Loss: {avg_train_loss:.4f}")

        # =========================
        # VALIDATION LOSS
        # =========================
        model.eval()
        val_loss = 0.0

        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                val_loss += outputs.loss.item()

        val_loss /= len(val_loader)
        print(f"Epoch {epoch + 1} Validation Loss: {val_loss:.4f}")

        # =========================
        # CIDEr
        # =========================
        # evaluate_cider flips the model back to train mode on return.
        cider_score = evaluate_cider(model, processor, val_dataset, device)

        # =========================
        # SAVE BEST CIDEr MODEL
        # =========================
        if cider_score > best_cider:
            best_cider = cider_score
            counter = 0
            model.save_pretrained(FINAL_MODEL_DIR)
            processor.save_pretrained(FINAL_MODEL_DIR)
            print("Best CIDEr model saved.")
        else:
            counter += 1

        # Stop once CIDEr has failed to improve for `patience` epochs.
        if counter >= patience:
            print("Early stopping triggered.")
            break

        # Not reached on the epoch that triggers early stopping.
        scheduler.step()

    print("Phase 1 training complete.")


if __name__ == "__main__":
    main()
|
| 168 |
+
|
src/training/train_phase2.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from torch.optim import AdamW
|
| 5 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR
|
| 6 |
+
from torch.utils.data import DataLoader, random_split
|
| 7 |
+
from transformers import BlipForConditionalGeneration, BlipProcessor
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
|
| 10 |
+
from src.data.coco_advanced_dataset import COCODatasetAdvanced
|
| 11 |
+
from src.evaluation.cider_eval import evaluate_cider
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def main() -> None:
    """Phase 2: fine-tune BLIP on length-filtered COCO captions (Apple MPS only).

    Same training recipe as phase 1 (last two vision layers unfrozen,
    fp16 autocast, per-epoch validation loss and CIDEr, best-CIDEr
    checkpointing with early stopping), but trains on captions filtered
    by length via `COCODatasetAdvanced`'s `mode`.
    """
    # This script targets Apple-silicon GPUs only; fail fast elsewhere.
    if not torch.backends.mps.is_available():
        raise RuntimeError("MPS not available.")

    device = torch.device("mps")
    print("Using device:", device)

    # =========================
    # CONFIG
    # =========================
    EPOCHS = 5
    BATCH_SIZE = 6
    LR = 3e-5  # Lower LR for partial unfreezing
    NUM_WORKERS = 0  # NOTE(review): presumably 0 for MPS/dataloader stability — confirm
    FINAL_MODEL_DIR = "saved_model_phase2"

    os.makedirs(FINAL_MODEL_DIR, exist_ok=True)

    # =========================
    # LOAD MODEL
    # =========================
    processor = BlipProcessor.from_pretrained(
        "Salesforce/blip-image-captioning-base"
    )

    model = BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-base"
    )

    # Unfreeze LAST 2 vision layers only
    # NOTE(review): assumes a 12-layer vision encoder so layers 10/11 are
    # the last two — confirm for this checkpoint.
    for name, param in model.vision_model.named_parameters():
        if "encoder.layers.10" in name or "encoder.layers.11" in name:
            param.requires_grad = True
        else:
            param.requires_grad = False

    # Gradient checkpointing trades compute for memory; transformers
    # requires use_cache off when checkpointing is enabled.
    model.gradient_checkpointing_enable()
    model.config.use_cache = False
    model.to(device)

    # =========================
    # DATASET SPLIT
    # =========================
    MODE = "long"  # change to "short" or "mixed" as needed

    full_dataset = COCODatasetAdvanced(
        "annotations/subset_10k.jsonl",
        "train2017",
        processor,
        mode=MODE,
    )

    # 90/10 train/validation split (unseeded, so the split varies per run).
    train_size = int(0.9 * len(full_dataset))
    val_size = len(full_dataset) - train_size

    train_dataset, val_dataset = random_split(
        full_dataset,
        [train_size, val_size],
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS,
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
    )

    # Only optimize the parameters left trainable above.
    optimizer = AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=LR,
    )

    scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

    # =========================
    # EARLY STOPPING
    # =========================
    best_cider = 0.0
    patience = 3
    counter = 0

    # =========================
    # TRAIN LOOP
    # =========================
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")

        for batch in progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}

            # NOTE(review): fp16 autocast on MPS requires a recent
            # PyTorch — confirm the installed version supports it.
            with torch.autocast(device_type="mps", dtype=torch.float16):
                outputs = model(**batch)
                loss = outputs.loss

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch + 1} Train Loss: {avg_train_loss:.4f}")

        # =========================
        # VALIDATION LOSS
        # =========================
        model.eval()
        val_loss = 0.0

        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                val_loss += outputs.loss.item()

        val_loss /= len(val_loader)
        print(f"Epoch {epoch + 1} Validation Loss: {val_loss:.4f}")

        # =========================
        # CIDEr
        # =========================
        # evaluate_cider flips the model back to train mode on return.
        cider_score = evaluate_cider(model, processor, val_dataset, device)

        # =========================
        # SAVE BEST CIDEr MODEL
        # =========================
        if cider_score > best_cider:
            best_cider = cider_score
            counter = 0
            model.save_pretrained(FINAL_MODEL_DIR)
            processor.save_pretrained(FINAL_MODEL_DIR)
            print("Best CIDEr model saved.")
        else:
            counter += 1

        # Stop once CIDEr has failed to improve for `patience` epochs.
        if counter >= patience:
            print("Early stopping triggered.")
            break

        # Not reached on the epoch that triggers early stopping.
        scheduler.step()

    print("Phase 2 training complete.")


if __name__ == "__main__":
    main()
|
| 170 |
+
|
src/utils/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
General-purpose utility functions and scripts.
|
| 3 |
+
"""
|
| 4 |
+
|
src/utils/data_subset.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import random
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Iterable
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def create_subset(
|
| 8 |
+
input_path: str | Path,
|
| 9 |
+
output_path: str | Path,
|
| 10 |
+
size: int = 20_000,
|
| 11 |
+
) -> None:
|
| 12 |
+
"""
|
| 13 |
+
Create a random subset of a JSONL annotations file.
|
| 14 |
+
"""
|
| 15 |
+
input_path = Path(input_path)
|
| 16 |
+
output_path = Path(output_path)
|
| 17 |
+
|
| 18 |
+
with input_path.open("r") as f:
|
| 19 |
+
data = [json.loads(line) for line in f]
|
| 20 |
+
|
| 21 |
+
if size > len(data):
|
| 22 |
+
raise ValueError(f"Requested subset size {size} exceeds dataset size {len(data)}")
|
| 23 |
+
|
| 24 |
+
subset = random.sample(data, size)
|
| 25 |
+
|
| 26 |
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 27 |
+
|
| 28 |
+
with output_path.open("w") as f:
|
| 29 |
+
for item in subset:
|
| 30 |
+
f.write(json.dumps(item) + "\n")
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _main_from_cli(args: Iterable[str] | None = None) -> None:
    """
    Simple CLI wrapper when this module is executed as a script.

    Parses --input/--output/--size and delegates to `create_subset`.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Create a random JSONL subset.")
    cli.add_argument(
        "--input",
        default="annotations/captions_train.jsonl",
        help="Input JSONL annotations path.",
    )
    cli.add_argument(
        "--output",
        default="annotations/subset_20k.jsonl",
        help="Output JSONL path.",
    )
    cli.add_argument(
        "--size",
        type=int,
        default=20_000,
        help="Number of samples to keep.",
    )

    opts = cli.parse_args(None if args is None else list(args))
    create_subset(opts.input, opts.output, opts.size)
    print(f"Subset of {opts.size} entries written to {opts.output}")


if __name__ == "__main__":
    _main_from_cli()
|
| 64 |
+
|
train_blip_20k_384.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
from torch.utils.data import DataLoader, random_split
|
| 4 |
+
from transformers import BlipProcessor, BlipForConditionalGeneration
|
| 5 |
+
from torch.optim import AdamW
|
| 6 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR
|
| 7 |
+
from dataset_384 import COCODataset384
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def main():
    """Fine-tune BLIP-base on a 20k COCO subset at 384px resolution.

    Trains for a fixed number of epochs with float16 autocast on MPS,
    computes validation loss every epoch, and saves the model/processor
    pair with the lowest validation loss to ``MODEL_DIR``.
    """
    # Fail fast with a clear message on machines without Apple Silicon,
    # consistent with the other training scripts in this repo — the
    # float16 autocast below is MPS-specific.
    if not torch.backends.mps.is_available():
        raise RuntimeError("MPS not available.")

    device = torch.device("mps")
    print("Using device:", device)

    EPOCHS = 5
    BATCH_SIZE = 3  # ⚠️ Lower because 384px uses more memory
    LR = 3e-5

    CHECKPOINT_DIR = "checkpoints_20k_384"
    MODEL_DIR = "saved_model_20k_384"

    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    os.makedirs(MODEL_DIR, exist_ok=True)

    processor = BlipProcessor.from_pretrained(
        "Salesforce/blip-image-captioning-base"
    )
    model = BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-base"
    )

    # Trade compute for memory; the generation cache is useless while
    # training with checkpointing, so disable it.
    model.gradient_checkpointing_enable()
    model.config.use_cache = False
    model.to(device)

    dataset = COCODataset384(
        "annotations/subset_20k.jsonl",
        "train2017",
        processor
    )

    # 90/10 train/validation split.
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    optimizer = AdamW(model.parameters(), lr=LR)
    scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

    best_val_loss = float("inf")

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            batch = {k: v.to(device) for k, v in batch.items()}

            # Mixed-precision forward; backward runs outside autocast.
            with torch.autocast(device_type="mps", dtype=torch.float16):
                outputs = model(**batch)
                loss = outputs.loss

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()

        train_loss = total_loss / len(train_loader)
        print(f"Train Loss: {train_loss:.4f}")

        # Validation
        model.eval()
        val_loss = 0

        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                val_loss += outputs.loss.item()

        val_loss /= len(val_loader)
        print(f"Validation Loss: {val_loss:.4f}")

        # Keep only the checkpoint with the lowest validation loss.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            model.save_pretrained(MODEL_DIR)
            processor.save_pretrained(MODEL_DIR)
            print("Best model saved.")

        scheduler.step()

    print("Training complete.")


if __name__ == "__main__":
    main()
|
train_data_experiments.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from platform import processor
|
| 3 |
+
import torch
|
| 4 |
+
from torch.utils.data import DataLoader, random_split
|
| 5 |
+
from transformers import BlipProcessor, BlipForConditionalGeneration
|
| 6 |
+
from torch.optim import AdamW
|
| 7 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR
|
| 8 |
+
from dataset_advanced import COCODataset
|
| 9 |
+
from tqdm import tqdm
|
| 10 |
+
from PIL import Image
|
| 11 |
+
from pycocoevalcap.cider.cider import Cider
|
| 12 |
+
from dataset_advanced import COCODatasetAdvanced
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# =========================
|
| 16 |
+
# GENERATE CAPTION
|
| 17 |
+
# =========================
|
| 18 |
+
def generate_caption(model, processor, image, device):
    """Generate one caption for ``image`` via 5-beam search (max 30 tokens)."""
    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_length=30, num_beams=5)

    return processor.decode(generated_ids[0], skip_special_tokens=True)
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
# =========================
|
| 36 |
+
# CIDEr EVALUATION
|
| 37 |
+
# =========================
|
| 38 |
+
def evaluate_cider(model, processor, val_dataset, device, max_samples=200):
    """Compute the corpus CIDEr score over at most ``max_samples``
    validation images and return it; the model is restored to train mode.
    """
    model.eval()

    scorer = Cider()
    refs = {}
    hyps = {}

    n_eval = min(max_samples, len(val_dataset))
    for idx in tqdm(range(n_eval), desc="CIDEr Eval"):
        # ``val_dataset`` is a Subset; map its position back to the
        # underlying annotation row.
        ann = val_dataset.dataset.annotations[val_dataset.indices[idx]]

        image = Image.open(os.path.join("train2017", ann["image"])).convert("RGB")

        refs[idx] = ann["captions"]
        hyps[idx] = [generate_caption(model, processor, image, device)]

    score, _ = scorer.compute_score(refs, hyps)

    print(f"CIDEr Score: {score:.4f}")

    model.train()
    return score
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# =========================
|
| 66 |
+
# MAIN
|
| 67 |
+
# =========================
|
| 68 |
+
def main():
    """Phase-2 BLIP fine-tuning with a partially unfrozen vision tower.

    Trains on a 10k COCO subset (caption selection controlled by ``MODE``),
    tracks validation loss and CIDEr every epoch, keeps the best-scoring
    checkpoint, and stops early once CIDEr stalls for ``patience`` epochs.
    """
    if not torch.backends.mps.is_available():
        raise RuntimeError("MPS not available.")

    device = torch.device("mps")
    print("Using device:", device)

    # ---- configuration ----
    EPOCHS = 5
    BATCH_SIZE = 6
    LR = 3e-5  # Lower LR for partial unfreezing
    NUM_WORKERS = 0
    FINAL_MODEL_DIR = "saved_model_phase2"

    os.makedirs(FINAL_MODEL_DIR, exist_ok=True)

    # ---- model & processor ----
    processor = BlipProcessor.from_pretrained(
        "Salesforce/blip-image-captioning-base"
    )
    model = BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-base"
    )

    # Freeze the vision encoder except for its last two transformer layers.
    for name, param in model.vision_model.named_parameters():
        param.requires_grad = (
            "encoder.layers.10" in name or "encoder.layers.11" in name
        )

    model.gradient_checkpointing_enable()
    model.config.use_cache = False
    model.to(device)

    # ---- data ----
    MODE = "long"  # change to "short" or "long"

    full_dataset = COCODatasetAdvanced(
        "annotations/subset_10k.jsonl",
        "train2017",
        processor,
        mode=MODE
    )

    n_train = int(0.9 * len(full_dataset))
    train_dataset, val_dataset = random_split(
        full_dataset,
        [n_train, len(full_dataset) - n_train]
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS
    )

    # Only the unfrozen parameters are optimized.
    optimizer = AdamW(
        (p for p in model.parameters() if p.requires_grad),
        lr=LR
    )
    scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

    # ---- early-stopping state ----
    best_cider = 0
    patience = 3
    counter = 0

    # ---- training loop ----
    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = 0
        bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

        for batch in bar:
            batch = {key: tensor.to(device) for key, tensor in batch.items()}

            # Mixed-precision forward pass on MPS.
            with torch.autocast(device_type="mps", dtype=torch.float16):
                loss = model(**batch).loss

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            epoch_loss += loss.item()
            bar.set_postfix(loss=loss.item())

        avg_train_loss = epoch_loss / len(train_loader)
        print(f"Epoch {epoch+1} Train Loss: {avg_train_loss:.4f}")

        # ---- validation loss ----
        model.eval()
        val_loss = 0

        with torch.no_grad():
            for batch in val_loader:
                batch = {key: tensor.to(device) for key, tensor in batch.items()}
                val_loss += model(**batch).loss.item()

        val_loss /= len(val_loader)
        print(f"Epoch {epoch+1} Validation Loss: {val_loss:.4f}")

        # ---- CIDEr + best-checkpoint bookkeeping ----
        cider_score = evaluate_cider(model, processor, val_dataset, device)

        if cider_score > best_cider:
            best_cider = cider_score
            counter = 0
            model.save_pretrained(FINAL_MODEL_DIR)
            processor.save_pretrained(FINAL_MODEL_DIR)
            print("Best CIDEr model saved.")
        else:
            counter += 1

        if counter >= patience:
            print("Early stopping triggered.")
            break

        scheduler.step()

    print("Phase 2 training complete.")


if __name__ == "__main__":
    main()
|
train_git.py
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
from torch.utils.data import DataLoader, random_split
|
| 4 |
+
from transformers import GitProcessor, GitForCausalLM
|
| 5 |
+
from torch.optim import AdamW
|
| 6 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR
|
| 7 |
+
from dataset_git import COCODatasetGIT
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from pycocoevalcap.cider.cider import Cider
|
| 10 |
+
from PIL import Image
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def generate_caption(model, processor, image, device):
    """Caption ``image`` with a GIT model using 5-beam search (max 20 tokens)."""
    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        output_ids = model.generate(**inputs, num_beams=5, max_length=20)

    decoded = processor.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def evaluate_cider(model, processor, val_dataset, device, max_samples=200):
    """CIDEr-score up to ``max_samples`` validation images with the GIT
    model and return the corpus score; model ends back in train mode.
    """
    model.eval()
    scorer = Cider()

    refs = {}
    hyps = {}

    sample_count = min(max_samples, len(val_dataset))
    for idx in tqdm(range(sample_count), desc="CIDEr Eval"):
        # Map the Subset position back to the underlying annotation row.
        ann = val_dataset.dataset.annotations[val_dataset.indices[idx]]

        image = Image.open(os.path.join("train2017", ann["image"])).convert("RGB")

        refs[idx] = ann["captions"]
        hyps[idx] = [generate_caption(model, processor, image, device)]

    score, _ = scorer.compute_score(refs, hyps)

    print(f"CIDEr Score: {score:.4f}")
    model.train()
    return score
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def main():
    """Fine-tune GIT-base for captioning on a 20k COCO subset.

    Runs a long schedule with gradient clipping, evaluates CIDEr every
    epoch, and keeps the highest-scoring checkpoint in ``SAVE_DIR``.
    """
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    print("Using device:", device)

    EPOCHS = 20
    BATCH_SIZE = 4
    LR = 5e-5
    SAVE_DIR = "saved_git_model"

    os.makedirs(SAVE_DIR, exist_ok=True)

    processor = GitProcessor.from_pretrained("microsoft/git-base")
    model = GitForCausalLM.from_pretrained("microsoft/git-base").to(device)

    dataset = COCODatasetGIT(
        "annotations/subset_20k.jsonl",
        "train2017",
        processor,
        mode="mixed"
    )

    # 90/10 train/validation split; validation is used only for CIDEr.
    n_train = int(0.9 * len(dataset))
    train_dataset, val_dataset = random_split(
        dataset, [n_train, len(dataset) - n_train]
    )

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

    optimizer = AdamW(model.parameters(), lr=LR)
    scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

    best_cider = 0

    for epoch in range(EPOCHS):
        model.train()
        running_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            batch = {key: tensor.to(device) for key, tensor in batch.items()}

            loss = model(**batch).loss
            loss.backward()

            # Clip gradients to stabilise training.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            optimizer.zero_grad()

            running_loss += loss.item()

        print(f"Train Loss: {running_loss / len(train_loader):.4f}")

        cider_score = evaluate_cider(model, processor, val_dataset, device)

        if cider_score > best_cider:
            best_cider = cider_score
            model.save_pretrained(SAVE_DIR)
            processor.save_pretrained(SAVE_DIR)
            print("Best GIT model saved.")

        scheduler.step()

    print("GIT Training complete.")


if __name__ == "__main__":
    main()
|
train_phase2.py
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
from torch.utils.data import DataLoader, random_split
|
| 4 |
+
from transformers import BlipProcessor, BlipForConditionalGeneration
|
| 5 |
+
from torch.optim import AdamW
|
| 6 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR
|
| 7 |
+
from dataset_advanced import COCODataset
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
from PIL import Image
|
| 10 |
+
from pycocoevalcap.cider.cider import Cider
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# =========================
|
| 14 |
+
# GENERATE CAPTION
|
| 15 |
+
# =========================
|
| 16 |
+
def generate_caption(model, processor, image, device):
    """Produce a single beam-searched BLIP caption for a PIL image."""
    model_inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        ids = model.generate(**model_inputs, max_length=30, num_beams=5)

    caption = processor.decode(ids[0], skip_special_tokens=True)
    return caption
|
| 31 |
+
|
| 32 |
+
# =========================
|
| 33 |
+
# CIDEr EVALUATION
|
| 34 |
+
# =========================
|
| 35 |
+
def evaluate_cider(model, processor, val_dataset, device, max_samples=200):
    """Compute the corpus-level CIDEr score on validation images.

    Args:
        model: BLIP captioning model (returned to ``train()`` mode on exit).
        processor: matching BLIP processor.
        val_dataset: ``torch.utils.data.Subset``; ``.indices`` maps subset
            positions back to the underlying annotation rows.
        device: device generation runs on.
        max_samples: cap on the number of images scored.

    Returns:
        The CIDEr score as a float.
    """
    model.eval()

    cider_scorer = Cider()
    ground_truth = {}
    predictions = {}

    for idx in tqdm(range(min(max_samples, len(val_dataset))), desc="CIDEr Eval"):
        real_idx = val_dataset.indices[idx]
        ann = val_dataset.dataset.annotations[real_idx]

        image_path = os.path.join("train2017", ann["image"])
        image = Image.open(image_path).convert("RGB")

        pred_caption = generate_caption(model, processor, image, device)

        ground_truth[idx] = ann["captions"]
        predictions[idx] = [pred_caption]

    score, _ = cider_scorer.compute_score(ground_truth, predictions)

    # Fixed: the original format spec ".4 f" (stray space) is invalid and
    # raised ValueError at runtime, crashing every evaluation pass.
    print(f"CIDEr Score: {score:.4f}")

    model.train()
    return score
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# =========================
|
| 63 |
+
# MAIN
|
| 64 |
+
# =========================
|
| 65 |
+
def main():
    """Phase-2 BLIP fine-tuning: unfreeze the last two vision layers.

    Trains on a 10k COCO subset, reports validation loss and CIDEr each
    epoch, checkpoints on best CIDEr, and early-stops after ``patience``
    epochs without improvement.
    """
    if not torch.backends.mps.is_available():
        raise RuntimeError("MPS not available.")

    device = torch.device("mps")
    print("Using device:", device)

    # ---- configuration ----
    EPOCHS = 5
    BATCH_SIZE = 6
    LR = 3e-5  # Lower LR for partial unfreezing
    NUM_WORKERS = 0
    FINAL_MODEL_DIR = "saved_model_phase2"

    os.makedirs(FINAL_MODEL_DIR, exist_ok=True)

    # ---- model & processor ----
    processor = BlipProcessor.from_pretrained(
        "Salesforce/blip-image-captioning-base"
    )
    model = BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-base"
    )

    # Train only the final two vision-encoder layers; freeze the rest.
    for name, param in model.vision_model.named_parameters():
        param.requires_grad = (
            "encoder.layers.10" in name or "encoder.layers.11" in name
        )

    model.gradient_checkpointing_enable()
    model.config.use_cache = False
    model.to(device)

    # ---- data ----
    full_dataset = COCODataset(
        "annotations/subset_10k.jsonl",
        "train2017",
        processor
    )

    n_train = int(0.9 * len(full_dataset))
    train_dataset, val_dataset = random_split(
        full_dataset,
        [n_train, len(full_dataset) - n_train]
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=NUM_WORKERS
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS
    )

    # Optimize only the unfrozen parameters.
    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(trainable, lr=LR)

    scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

    # ---- early-stopping state ----
    best_cider = 0
    patience = 3
    counter = 0

    # ---- training loop ----
    for epoch in range(EPOCHS):
        model.train()
        epoch_loss = 0
        bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

        for batch in bar:
            batch = {key: tensor.to(device) for key, tensor in batch.items()}

            # Mixed-precision forward pass on MPS.
            with torch.autocast(device_type="mps", dtype=torch.float16):
                loss = model(**batch).loss

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            epoch_loss += loss.item()
            bar.set_postfix(loss=loss.item())

        avg_train_loss = epoch_loss / len(train_loader)
        print(f"Epoch {epoch+1} Train Loss: {avg_train_loss:.4f}")

        # ---- validation loss ----
        model.eval()
        val_loss = 0

        with torch.no_grad():
            for batch in val_loader:
                batch = {key: tensor.to(device) for key, tensor in batch.items()}
                val_loss += model(**batch).loss.item()

        val_loss /= len(val_loader)
        print(f"Epoch {epoch+1} Validation Loss: {val_loss:.4f}")

        # ---- CIDEr + best-checkpoint bookkeeping ----
        cider_score = evaluate_cider(model, processor, val_dataset, device)

        if cider_score > best_cider:
            best_cider = cider_score
            counter = 0
            model.save_pretrained(FINAL_MODEL_DIR)
            processor.save_pretrained(FINAL_MODEL_DIR)
            print("Best CIDEr model saved.")
        else:
            counter += 1

        if counter >= patience:
            print("Early stopping triggered.")
            break

        scheduler.step()

    print("Phase 2 training complete.")


if __name__ == "__main__":
    main()
|
train_vit_gpt2.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
from torch.utils.data import DataLoader, random_split
|
| 4 |
+
from transformers import (
|
| 5 |
+
VisionEncoderDecoderModel,
|
| 6 |
+
ViTImageProcessor,
|
| 7 |
+
AutoTokenizer,
|
| 8 |
+
GPT2Config,
|
| 9 |
+
GPT2LMHeadModel,
|
| 10 |
+
ViTModel
|
| 11 |
+
)
|
| 12 |
+
from torch.optim import AdamW
|
| 13 |
+
from torch.optim.lr_scheduler import CosineAnnealingLR
|
| 14 |
+
from dataset_vit_gpt2 import COCODatasetViTGPT2
|
| 15 |
+
from tqdm import tqdm
|
| 16 |
+
from pycocoevalcap.cider.cider import Cider
|
| 17 |
+
from PIL import Image
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# ==========================================
|
| 21 |
+
# GENERATE CAPTION
|
| 22 |
+
# ==========================================
|
| 23 |
+
def generate_caption(model, processor, tokenizer, image, device):
    """Generate one beam-searched caption for ``image``.

    GPT-2 ships without a pad token, so EOS is used as both pad and
    end-of-sequence during generation.
    """
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

    generation_kwargs = {
        "num_beams": 5,
        "max_length": 20,
        "length_penalty": 1.0,
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }

    with torch.no_grad():
        output_ids = model.generate(pixel_values=pixel_values, **generation_kwargs)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# ==========================================
|
| 41 |
+
# CIDEr EVALUATION
|
| 42 |
+
# ==========================================
|
| 43 |
+
def evaluate_cider(model, processor, tokenizer, val_dataset, device, max_samples=200):
    """Score up to ``max_samples`` validation images with CIDEr and
    return the corpus score; the model is put back in train mode.
    """
    model.eval()
    scorer = Cider()

    refs = {}
    hyps = {}

    n_eval = min(max_samples, len(val_dataset))
    for idx in tqdm(range(n_eval), desc="CIDEr Eval"):
        # Translate the Subset position into the original annotation row.
        ann = val_dataset.dataset.annotations[val_dataset.indices[idx]]

        image = Image.open(os.path.join("train2017", ann["image"])).convert("RGB")

        refs[idx] = ann["captions"]
        hyps[idx] = [generate_caption(model, processor, tokenizer, image, device)]

    score, _ = scorer.compute_score(refs, hyps)

    print(f"CIDEr Score: {score:.4f}")

    model.train()
    return score
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# ==========================================
|
| 73 |
+
# MAIN
|
| 74 |
+
# ==========================================
|
| 75 |
+
def main():
    """Train a ViT-encoder / GPT-2-decoder captioning model.

    Builds the VisionEncoderDecoder pair from pretrained parts, fine-tunes
    on a 10k COCO subset, evaluates CIDEr each epoch, and keeps the
    best-scoring checkpoint in ``SAVE_DIR``.
    """
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    print("Using device:", device)

    EPOCHS = 5
    BATCH_SIZE = 6
    LR = 3e-5
    SAVE_DIR = "saved_vit_gpt2"

    os.makedirs(SAVE_DIR, exist_ok=True)

    # ------------------------------------------
    # Build Encoder + Decoder
    # ------------------------------------------
    encoder = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")

    # GPT-2 must be reconfigured as a decoder with cross-attention so it
    # can attend to the ViT image features.
    decoder_config = GPT2Config.from_pretrained("gpt2")
    decoder_config.is_decoder = True
    decoder_config.add_cross_attention = True

    decoder = GPT2LMHeadModel.from_pretrained("gpt2", config=decoder_config)

    model = VisionEncoderDecoderModel(
        encoder=encoder,
        decoder=decoder
    )

    processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # GPT-2 has no pad token; reuse EOS for padding.
    tokenizer.pad_token = tokenizer.eos_token

    model.config.pad_token_id = tokenizer.eos_token_id
    model.config.decoder_start_token_id = tokenizer.bos_token_id
    model.config.eos_token_id = tokenizer.eos_token_id
    model.config.vocab_size = model.config.decoder.vocab_size

    model.to(device)

    # ------------------------------------------
    # DATASET
    # ------------------------------------------
    dataset = COCODatasetViTGPT2(
        "annotations/subset_10k.jsonl",
        "train2017",
        processor,
        tokenizer,
        mode="short"
    )

    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size

    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    # NOTE: validation is CIDEr-based on ``val_dataset`` directly; the
    # original unused validation DataLoader has been removed.

    optimizer = AdamW(model.parameters(), lr=LR)
    scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS)

    best_cider = 0

    # ==========================================
    # TRAIN LOOP
    # ==========================================
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(pixel_values=pixel_values, labels=labels)
            loss = outputs.loss

            loss.backward()

            # Clip gradients to stabilise early training.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Train Loss: {avg_loss:.4f}")

        # ------------------------------------------
        # CIDEr Evaluation
        # ------------------------------------------
        cider_score = evaluate_cider(
            model,
            processor,
            tokenizer,
            val_dataset,
            device
        )

        # Save best model
        if cider_score > best_cider:
            best_cider = cider_score
            model.save_pretrained(SAVE_DIR)
            tokenizer.save_pretrained(SAVE_DIR)
            processor.save_pretrained(SAVE_DIR)
            print("Best model saved.")

        scheduler.step()

    print("ViT-GPT2 Training complete.")


if __name__ == "__main__":
    main()
|
uploadtohf.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import (
|
| 2 |
+
AutoTokenizer,
|
| 3 |
+
BlipForConditionalGeneration,
|
| 4 |
+
BlipProcessor,
|
| 5 |
+
GitForCausalLM,
|
| 6 |
+
GitProcessor,
|
| 7 |
+
VisionEncoderDecoderModel,
|
| 8 |
+
ViTImageProcessor,
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def push_blip(
    local_dir: str = "saved_model_phase2",
    repo_id: str = "pchandragrid/blip-caption-model",
) -> None:
    """Publish the fine-tuned BLIP captioning checkpoint to the Hugging Face Hub.

    Args:
        local_dir: Directory containing the locally saved BLIP model/processor.
        repo_id: Destination Hub repository (created or updated in place).
    """
    # Load both saved artifacts, then upload each to the same repo.
    blip_model = BlipForConditionalGeneration.from_pretrained(local_dir)
    blip_processor = BlipProcessor.from_pretrained(local_dir)
    for artifact in (blip_model, blip_processor):
        artifact.push_to_hub(repo_id)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def push_vit_gpt2(
    local_dir: str = "saved_vit_gpt2",
    repo_id: str = "pchandragrid/vit-gpt2-caption-model",
) -> None:
    """Publish the fine-tuned ViT-GPT2 captioning checkpoint to the Hugging Face Hub.

    Args:
        local_dir: Directory containing the saved encoder-decoder model,
            image processor, and tokenizer.
        repo_id: Destination Hub repository (created or updated in place).
    """
    # Three artifacts are saved for this model family; push them in the
    # same order they were loaded: model, image processor, tokenizer.
    artifacts = (
        VisionEncoderDecoderModel.from_pretrained(local_dir),
        ViTImageProcessor.from_pretrained(local_dir),
        AutoTokenizer.from_pretrained(local_dir),
    )
    for artifact in artifacts:
        artifact.push_to_hub(repo_id)
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def push_git(
    local_dir: str = "saved_git_model",
    repo_id: str = "pchandragrid/git-caption-model",
) -> None:
    """Publish the fine-tuned GIT captioning checkpoint to the Hugging Face Hub.

    Args:
        local_dir: Directory containing the locally saved GIT model/processor.
        repo_id: Destination Hub repository (created or updated in place).
    """
    # Load both saved artifacts, then upload each to the same repo.
    git_model = GitForCausalLM.from_pretrained(local_dir)
    git_processor = GitProcessor.from_pretrained(local_dir)
    for artifact in (git_model, git_processor):
        artifact.push_to_hub(repo_id)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
if __name__ == "__main__":
    # Upload all three trained captioning models in sequence.
    for upload in (push_blip, push_vit_gpt2, push_git):
        upload()
    print("Uploaded: BLIP, ViT-GPT2, and GIT models.")
|