In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import json
import random
import requests
from tqdm import tqdm
from pycocotools.coco import COCO
from pathlib import Path


# --- Sampling configuration ---
SUBSET_SIZE = 30_000  # how many images are drawn for the subset
RANDOM_SEED = 42      # seed handed to the RNG before sampling

# --- Output layout (everything lives under OUTPUT_DIR) ---
OUTPUT_DIR = Path("data/processed")
IMAGES_DIR = OUTPUT_DIR.joinpath("images")
CAPTIONS_OUT = OUTPUT_DIR.joinpath("captions.json")
SPLITS_OUT = OUTPUT_DIR.joinpath("splits.json")

# Create the output tree up front; existing directories are left alone.
for _out_dir in (OUTPUT_DIR, IMAGES_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)


In [12]:
# Make sure the COCO train captions file is available locally, downloading and
# unpacking the official annotation archive only when it is missing.
ANNOT_DIR = Path("annotations")
ANNOT_DIR.mkdir(exist_ok=True)

train_caps_path = ANNOT_DIR / "captions_train2017.json"

if not train_caps_path.exists():
    import zipfile

    print("Downloading COCO captions_train2017.json (~45 MB)...")
    url = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
    zip_path = ANNOT_DIR / "annotations_trainval2017.zip"

    # Stream to a ".part" file and move it into place only once the transfer
    # has finished, so an interrupted download never sits under the final name.
    partial_zip = zip_path.with_name(zip_path.name + ".part")
    with requests.get(url, stream=True, timeout=30) as r:
        # Fail loudly on 4xx/5xx instead of saving an error page as the archive.
        r.raise_for_status()
        with open(partial_zip, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
    partial_zip.replace(zip_path)

    # Unzip only the captions file and normalise it to the expected name.
    with zipfile.ZipFile(zip_path, "r") as z:
        member = next(
            (
                name
                for name in z.namelist()
                if name.rsplit("/", 1)[-1] == train_caps_path.name
            ),
            None,
        )
        if member is None:
            raise FileNotFoundError(
                f"{train_caps_path.name} not found inside {zip_path}"
            )
        extracted_path = Path(z.extract(member, ANNOT_DIR))
    extracted_path.replace(train_caps_path)

print("Captions file ready:", train_caps_path)


Captions file ready: annotations/captions_train2017.json


In [13]:
print("Loading captions_train2017.json ...")
# Hand the annotation path to COCO as a plain string.
annotation_file = str(train_caps_path)
coco = COCO(annotation_file)

# Iterating the image index yields its keys, i.e. the image ids.
image_ids = list(coco.imgs)
print(f"Total train images: {len(image_ids)}")


Loading captions_train2017.json ...
loading annotations into memory...
Done (t=0.49s)
creating index...
index created!
Total train images: 118287


In [14]:
# Seed the module-level RNG so the draw below is repeatable for a given
# ordering of image_ids.
random.seed(RANDOM_SEED)
# random.sample picks SUBSET_SIZE distinct ids (no replacement).
sampled_ids = random.sample(image_ids, SUBSET_SIZE)

print("Sampled:", len(sampled_ids))


Sampled: 30000


In [15]:
# Snapshot what later cells need for each sampled image (file name, download
# URL, caption texts) into a plain dict keyed by the stringified image id.
metadata = {
    str(img_id): {
        "file_name": coco.imgs[img_id]["file_name"],
        "url": coco.imgs[img_id]["coco_url"],
        "captions": [ann["caption"] for ann in coco.imgToAnns[img_id]],
    }
    for img_id in sampled_ids
}

len(metadata)


30000

In [16]:
print("Previewing 3 random metadata samples:\n")

# Take the first three entries in insertion order and print each record.
preview_items = list(metadata.items())[:3]
for img_id, item in preview_items:
    print("Image ID:", img_id)
    print("File name:", item["file_name"])
    print("URL:", item["url"])
    print("Captions:")
    for c in item["captions"]:
        print(" -", c)
    print("-" * 60)


Previewing 3 random metadata samples:

Image ID: 68130
File name: 000000068130.jpg
URL: http://images.cocodataset.org/train2017/000000068130.jpg
Captions:
 - a group of people crossing a city street 
 - A group of people cross a cross walk in a big city
 - a lot of people crossing a crosswalk 
 - Many people are crossing the street in front of some large buildings.
 - a photo taken on the corner facing old looking buildings
------------------------------------------------------------
Image ID: 222195
File name: 000000222195.jpg
URL: http://images.cocodataset.org/train2017/000000222195.jpg
Captions:
 - A cay laying on top of a blue couch arm next to a wall.
 - A cat lying down with a packaged toothbrush on its head.
 - The cat is irritated that there is a packaged toothbrush resting on its head.
 - A close up view of cat carrying a toothbrush on its head.
 - A grey and black cat with a toothbrush on its head.
------------------------------------------------------------
Image ID: 133386


In [17]:
def download_image(url, out_path):
    """Download ``url`` to ``out_path`` and report whether it succeeded.

    The body is streamed into a sibling ``<name>.part`` file and moved to
    ``out_path`` only after the whole response has been written, so
    ``out_path`` never holds a truncated image that an "already exists"
    check would mistake for a finished download.

    Args:
        url: Address of the image to fetch.
        out_path: Destination file (``str`` or ``Path``).

    Returns:
        True if the server answered 200 and the file was written completely;
        False on any other status code or on any error during the transfer.
    """
    out_path = Path(out_path)
    tmp_path = out_path.with_name(out_path.name + ".part")
    try:
        # The context manager releases the connection on every exit path.
        with requests.get(url, timeout=10, stream=True) as r:
            if r.status_code != 200:
                return False
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(8192):
                    if chunk:
                        f.write(chunk)
        tmp_path.replace(out_path)
        return True
    except Exception:
        # Still best-effort (any failure -> False), but unlike a bare
        # ``except:`` this lets KeyboardInterrupt/SystemExit propagate so a
        # long download loop can actually be stopped.
        try:
            tmp_path.unlink()
        except OSError:
            pass  # nothing was written, or cleanup itself failed
        return False

print("Downloading images...")

# Fetch every image that is not on disk yet; report the ones that fail.
for img_id, item in tqdm(metadata.items()):
    url = item["url"]
    out_path = IMAGES_DIR / item["file_name"]
    if not out_path.exists():
        if not download_image(url, out_path):
            print("FAILED DOWNLOAD:", url)


Downloading images...


100%|██████████| 30000/30000 [6:22:54<00:00, 1.31it/s] 


In [18]:
print("Writing captions.json ...")
# Serialise the whole mapping first, then write the text in one call.
captions_json = json.dumps(metadata, indent=2)
with CAPTIONS_OUT.open("w") as f:
    f.write(captions_json)


Writing captions.json ...


In [19]:
print("Creating splits...")

# Shuffle with a dedicated RNG seeded from RANDOM_SEED instead of the global
# one: the split then depends only on the id order and the seed, so re-running
# this cell (or running other random-consuming cells first) writes the same
# splits.json.
ids = list(metadata.keys())
random.Random(RANDOM_SEED).shuffle(ids)

# 85% train / 10% val; test receives whatever remains so no id is dropped.
n = len(ids)
n_train = int(0.85 * n)
n_val = int(0.10 * n)
n_test = n - n_train - n_val

splits = {
    "train": ids[:n_train],
    "val": ids[n_train:n_train + n_val],
    "test": ids[n_train + n_val:],
}

# Invariants: sizes match the plan and the three splits partition the ids.
assert len(splits["test"]) == n_test
assert sum(len(part) for part in splits.values()) == n
assert len(set().union(*splits.values())) == n, "splits overlap"

with open(SPLITS_OUT, "w") as f:
    json.dump(splits, f, indent=2)

print("Splits saved:")
print({k: len(v) for k, v in splits.items()})
print("Done!")


Creating splits...
Splits saved:
{'train': 25500, 'val': 3000, 'test': 1500}
Done!


## Unit Tests

In [5]:
import yaml
from transformers import T5TokenizerFast

# Parse the project config, then load the tokenizer it names.
with open("configs/default.yaml") as config_file:
    cfg = yaml.safe_load(config_file)

t5_name = cfg["model"]["t5_name"]
tokenizer = T5TokenizerFast.from_pretrained(t5_name)


In [6]:
from data.loaders import get_coco_dataloaders
import torch

train_loader, val_loader, test_loader = get_coco_dataloaders(
    batch_size=4, data_dir="data/processed"
)

# Pull a single batch and report the shape of each tensor it carries.
batch = next(iter(train_loader))

for label, key in (
    ("Image batch shape:", "pixel_values"),  # Expect [B, 3, H, W]
    ("Input IDs shape:", "input_ids"),
    ("Attention mask shape:", "attention_mask"),
):
    print(label, batch[key].shape)

# Show one caption: first row of token ids with the padding filtered out.
tokenizer_pad = batch["input_ids"][0]
non_pad_mask = tokenizer_pad != tokenizer.pad_token_id
decoded = tokenizer.decode(tokenizer_pad[non_pad_mask], skip_special_tokens=True)
print("Caption example:", decoded)


Image batch shape: torch.Size([4, 3, 224, 224])
Input IDs shape: torch.Size([4, 64])
Attention mask shape: torch.Size([4, 64])
Caption example: A man in brown sitting on a motorcycle


In [None]:
import yaml
import torch
from models.vision_t5 import VisionT5
from models.encoder_projection_t5 import ImageProjection
import models.encoders as encoders
from train import build_model

# Read the project config that drives model construction.
with open("configs/default.yaml") as config_file:
    config = yaml.safe_load(config_file)

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model, tokenizer = build_model(config)
model.to(device)

model_cfg = config["model"]
print("Model loaded with encoder:", model_cfg["encoder"])
print("T5 model:", model_cfg["t5_name"])


Model loaded with encoder: ResnetCNNEncoder
T5 model: t5-small


In [8]:
# Smoke test: one forward pass on a fresh training batch must yield a loss.
batch = next(iter(train_loader))
pixel_values = batch["pixel_values"].to(device)
input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)

# Padding positions are set to -100 in the labels.
labels = input_ids.clone()
labels[labels == tokenizer.pad_token_id] = -100

# torch.cuda.amp.autocast() is deprecated and CUDA-only. Use the
# device-agnostic torch.autocast and enable mixed precision only on CUDA,
# since `device` falls back to "cpu" when no GPU is available.
device_type = torch.device(device).type
use_amp = device_type == "cuda"

with torch.autocast(device_type=device_type, enabled=use_amp):
    outputs = model(
        pixel_values=pixel_values,
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
    )

print("Forward pass OK. Loss:", outputs.loss.item())


 with torch.cuda.amp.autocast():


DEBUG — encoder hidden shape: torch.Size([4, 1, 512])
DEBUG — encoder hidden mean: 0.001422882080078125
Forward pass OK. Loss: 6.275390625


In [None]:
from torch.optim import AdamW
from torch.amp import GradScaler

# Smoke test: one optimisation step (forward, scaled backward, optimizer step)
# on the batch loaded earlier must complete without errors.
optimizer = AdamW(model.parameters(), lr=1e-4)

# torch.cuda.amp.GradScaler / autocast are deprecated and CUDA-only. The
# torch.amp equivalents take the device type explicitly; mixed precision and
# loss scaling are switched on only for CUDA, since `device` may be "cpu".
device_type = torch.device(device).type
use_amp = device_type == "cuda"
scaler = GradScaler(device_type, enabled=use_amp)

model.train()

pixel_values = batch["pixel_values"].to(device)
input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)

# Padding positions are set to -100 in the labels.
labels = input_ids.clone()
labels[labels == tokenizer.pad_token_id] = -100

optimizer.zero_grad()

with torch.autocast(device_type=device_type, enabled=use_amp):
    outputs = model(
        pixel_values=pixel_values,
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
    )
    loss = outputs.loss

print("Loss before backward:", loss.item())

# With the scaler disabled these calls reduce to a plain backward()/step().
scaler.scale(loss).backward()
scaler.step(optimizer)
scaler.update()

print(" Training step passed (no errors)")


In [10]:
# Run generation on the wrapped T5 with a single pad token as input_ids,
# then show both the raw token ids and their decoded text.
pad_only_input = torch.tensor([[tokenizer.pad_token_id]]).to(device)
ids = model.t5.generate(input_ids=pad_only_input, max_length=10)
print(ids)

decoded_text = tokenizer.decode(ids[0], skip_special_tokens=True)
print(decoded_text)

tensor([[0, 3, 0, 0, 0, 0, 3, 0, 0, 3]], device='cuda:0')
 


In [14]:
# Print the decoder_start_token_id stored in the wrapped T5 model's config.
print("Decoder start token:", model.t5.config.decoder_start_token_id)


Decoder start token: 0


In [15]:
# Same probe as a one-liner report: generate from a lone pad token and print
# only the decoded text.
pad_only_input = torch.tensor([[tokenizer.pad_token_id]]).to(device)
test_ids = model.t5.generate(input_ids=pad_only_input, max_length=10)

unconditioned_text = tokenizer.decode(test_ids[0], skip_special_tokens=True)
print("Unconditioned generation:", unconditioned_text)


Unconditioned generation: 


In [16]:
from inference import generate_caption

model.eval()

# Keep the batch dimension while selecting only the first image.
sample_img = batch["pixel_values"][:1].to(device)

# Inspect the magnitude of the vision encoder output...
vision_out = model.vision_encoder(sample_img)
image_embeds = vision_out["image_embeds"]
print("Vision out mean:", image_embeds.abs().mean().item())

# ...and of the same embeddings after the projector.
proj = model.projector(image_embeds)
print("Projected mean:", proj.abs().mean().item())
print("Projected shape:", proj.shape)

# End-to-end caption for the same image.
caption = generate_caption(model, tokenizer, sample_img, device=device)
print("Generated caption:", caption)


Vision out mean: 0.11751262843608856
Projected mean: 0.14342853426933289
Projected shape: torch.Size([1, 512])
Generated caption: e e e e e e e e e e e e e e e e


In [18]:
model.eval()

# First example of the batch only, with the batch dimension kept.
pixel_values = batch["pixel_values"][:1].to(device)
input_ids = batch["input_ids"][:1].to(device)
attention_mask = batch["attention_mask"][:1].to(device)

# Padding positions are set to -100 in the labels.
labels = input_ids.clone()
labels[labels == tokenizer.pad_token_id] = -100

# Gradient-free forward pass, as in validation.
with torch.no_grad():
    outputs = model(
        pixel_values=pixel_values,
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels,
    )

print("Val-style loss:", outputs.loss.item())

# Preview caption: model prediction next to the de-padded reference caption.
pred = generate_caption(model, tokenizer, pixel_values, device=device)
first_row = input_ids[0]
gt_ids = first_row[first_row != tokenizer.pad_token_id]
gt_caption = tokenizer.decode(gt_ids, skip_special_tokens=True)

print("Prediction:", pred)
print("Ground Truth:", gt_caption)


DEBUG — encoder hidden shape: torch.Size([1, 1, 512])
DEBUG — encoder hidden mean: 0.01798422262072563
Val-style loss: 8.727174758911133
Prediction: e e e e e e e e e e e e e e e e
Ground Truth: A woman dancing around with a umbrella in her hand.
