Ryanfafa commited on
Commit
19ea5c5
·
verified ·
1 Parent(s): 346fd4f

Upload 7 files

Browse files
image_captioning/__init__.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Image captioning package: EfficientNetB0 encoder + GPT-2 decoder.
3
+
4
+ This package exposes the main components:
5
+ - ImageCaptioningModel (in model.py)
6
+ - dataset/dataloader utilities (in dataset.py)
7
+ - training, evaluation, and inference scripts.
8
+ """
9
+
10
+ from .model import ImageCaptioningModel # noqa: F401
11
+
image_captioning/config.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+ import numpy as np
7
+ import torch
8
+
9
+
10
@dataclass
class PathsConfig:
    """Dataset and checkpoint path configuration.

    Tailored to the visually impaired dataset layout:
      - Images: <data_root>/visual_dataset/*.jpg
      - Text:   <data_root>/visual_text/visual.token.txt
                <data_root>/visual_text/visual.trainImages.txt
                <data_root>/visual_text/visual.testImages.txt
    """

    data_root: str = "/Users/ryan/Downloads/visuallyimpair"
    images_dir_name: str = "visual_dataset"
    text_dir_name: str = "visual_text"

    def _join(self, *parts: str) -> str:
        # Thin wrapper so the join strategy lives in exactly one place.
        return os.path.join(*parts)

    @property
    def images_dir(self) -> str:
        """Directory containing the .jpg image files."""
        return self._join(self.data_root, self.images_dir_name)

    @property
    def text_dir(self) -> str:
        """Directory containing the caption and split-list text files."""
        return self._join(self.data_root, self.text_dir_name)

    @property
    def token_file(self) -> str:
        """Path to the tab-separated caption file."""
        return self._join(self.text_dir, "visual.token.txt")

    @property
    def train_list_file(self) -> str:
        """Path to the file listing training image names."""
        return self._join(self.text_dir, "visual.trainImages.txt")

    @property
    def test_list_file(self) -> str:
        """Path to the file listing test image names."""
        return self._join(self.text_dir, "visual.testImages.txt")
48
+
49
+
50
@dataclass
class TrainingConfig:
    """Hyperparameters and training-related configuration."""

    # Optimization
    learning_rate: float = 5e-5
    batch_size: int = 16
    num_epochs: int = 10
    warmup_steps: int = 500
    max_caption_length: int = 50
    gradient_accumulation_steps: int = 1
    num_workers: int = 4
    mixed_precision: bool = True
    patience: int = 3  # early-stopping patience, in epochs
    max_grad_norm: float = 1.0

    # Model-specific: how many visual prefix tokens feed the decoder
    prefix_length: int = 1

    # Logging / checkpoints
    output_dir: str = "checkpoints"
    log_dir: str = "runs"

    # Reproducibility
    seed: int = 42
76
+
77
+
78
def get_device() -> torch.device:
    """Pick the best available compute device and report the choice.

    Returns
    -------
    torch.device
        ``cuda`` when a GPU is visible to PyTorch, otherwise ``cpu``.
    """
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        print("Using CUDA for training/inference.")
    else:
        print("CUDA not available, falling back to CPU.")
    return torch.device("cuda" if use_cuda else "cpu")
90
+
91
+
92
def set_seed(seed: int) -> None:
    """Seed Python, NumPy, and PyTorch RNGs for reproducible runs."""
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seeder(seed)

    # Trade cuDNN autotuning speed for bitwise-reproducible kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
104
+
105
+
106
def ensure_dir(path: str) -> None:
    """Create *path* (including parents) if missing; no-op when present."""
    os.makedirs(path, exist_ok=True)
112
+
image_captioning/dataset.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ from typing import Dict, List, Optional, Tuple
4
+
5
+ import torch
6
+ from PIL import Image
7
+ from torch import Tensor
8
+ from torch.utils.data import DataLoader, Dataset, Subset
9
+ from torchvision import transforms
10
+ from transformers import GPT2TokenizerFast
11
+
12
+ from .config import PathsConfig, TrainingConfig
13
+
14
+
15
+ IMAGENET_MEAN = [0.485, 0.456, 0.406]
16
+ IMAGENET_STD = [0.229, 0.224, 0.225]
17
+
18
+
19
def train_image_transform() -> transforms.Compose:
    """Training-time preprocessing with mild random augmentation.

    Augmentation strength is deliberately moderate so the semantic content
    of the scene (what the caption describes) is left intact.
    """
    pipeline = [
        transforms.Resize(256),
        transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.ColorJitter(
            brightness=0.2,
            contrast=0.2,
            saturation=0.2,
            hue=0.05,
        ),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return transforms.Compose(pipeline)
41
+
42
+
43
def eval_image_transform() -> transforms.Compose:
    """Deterministic validation/test preprocessing.

    Resize, center-crop to 224x224, tensorize, ImageNet-normalize — no
    randomness, so repeated evaluation passes see identical inputs.
    """
    pipeline = [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return transforms.Compose(pipeline)
57
+
58
+
59
class ImageCaptionDataset(Dataset):
    """Dataset for the visually impaired image-captioning data.

    Expected layout:
      - Images: <data_root>/visual_dataset/*.jpg
      - Text:
          - visual.token.txt       (image#idx<TAB>caption)
          - visual.trainImages.txt (one image filename per line)
          - visual.testImages.txt  (one image filename per line)

    Each item is a dict with the preprocessed image tensor, padded token
    IDs, an attention mask, loss labels (-100 on padding), the raw caption
    string, and the image filename.
    """

    def __init__(
        self,
        paths_cfg: PathsConfig,
        tokenizer: GPT2TokenizerFast,
        split: str = "train",
        training_cfg: Optional[TrainingConfig] = None,
        transform: Optional[transforms.Compose] = None,
        random_caption: bool = True,
    ) -> None:
        """Load caption and split files; raises if files are missing.

        Parameters
        ----------
        paths_cfg:
            Paths configuration pointing at the dataset layout above.
        tokenizer:
            GPT-2 tokenizer (must have a pad token configured).
        split:
            One of {'train', 'val', 'test'}.
        training_cfg:
            Optional training configuration (max_caption_length is used).
        transform:
            Optional image transform; defaults to the deterministic eval
            transform so the class is usable stand-alone.
        random_caption:
            When True, sample a caption per access; otherwise use the first.
        """
        super().__init__()

        if split not in {"train", "val", "test"}:
            raise ValueError("split must be one of {'train', 'val', 'test'}")

        self.paths_cfg = paths_cfg
        self.tokenizer = tokenizer
        self.training_cfg = training_cfg or TrainingConfig()
        # create_dataloader() normally supplies train/eval-specific
        # transforms; the eval transform is a safe deterministic default.
        self.transform = transform or eval_image_transform()
        self.random_caption = random_caption
        self.max_length: int = int(self.training_cfg.max_caption_length)

        self.captions_by_image: Dict[str, List[str]] = self._load_captions(
            self.paths_cfg.token_file
        )

        # Only a single test list exists in this dataset; reuse it for the
        # 'val' split as well.
        if split == "train":
            list_file = self.paths_cfg.train_list_file
        else:
            list_file = self.paths_cfg.test_list_file
        self.image_ids: List[str] = self._load_image_ids(list_file, split)

        print(f"Loaded {len(self.image_ids)} {split} images with captions.")

    @staticmethod
    def _load_captions(token_path: str) -> Dict[str, List[str]]:
        """Parse visual.token.txt into {image filename: [captions, ...]}."""
        if not os.path.exists(token_path):
            raise FileNotFoundError(f"Caption file not found: {token_path}")

        captions_by_image: Dict[str, List[str]] = {}
        with open(token_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    key, caption = line.split("\t", 1)
                except ValueError as exc:
                    raise ValueError(f"Malformed line in {token_path}: {line}") from exc
                # Keys look like 'image.jpg#0'; drop the caption index.
                img_name = key.split("#")[0]
                captions_by_image.setdefault(img_name, []).append(caption.strip())
        return captions_by_image

    def _load_image_ids(self, list_file: str, split: str) -> List[str]:
        """Read a split's image list, keeping only images that have captions."""
        if not os.path.exists(list_file):
            raise FileNotFoundError(
                f"Image list file for split '{split}' not found: {list_file}"
            )

        image_ids: List[str] = []
        with open(list_file, "r", encoding="utf-8") as f:
            for line in f:
                img_name = line.strip()
                # Skip blanks and uncaptioned images to avoid runtime issues.
                if img_name and img_name in self.captions_by_image:
                    image_ids.append(img_name)

        if not image_ids:
            raise RuntimeError(f"No images with captions found for split '{split}'.")
        return image_ids

    def __len__(self) -> int:
        return len(self.image_ids)

    def __getitem__(self, idx: int) -> Dict[str, Tensor]:
        """Return one preprocessed (image, caption) training example."""
        img_name = self.image_ids[idx]
        img_path = os.path.join(self.paths_cfg.images_dir, img_name)
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"Image file not found: {img_path}")

        image = Image.open(img_path).convert("RGB")
        image_tensor = self.transform(image)

        caption_list = self.captions_by_image[img_name]
        if not caption_list:
            raise RuntimeError(f"No captions available for image {img_name}")

        # During training, sample among up to three captions per image for
        # variety; for evaluation always take the first for determinism.
        # Only leading/trailing whitespace is stripped so the raw text is
        # preserved before tokenization.
        if self.random_caption:
            caption = random.choice(caption_list[:3])
        else:
            caption = caption_list[0]
        caption = caption.strip()

        # Tokenize without extra special tokens to keep a direct mapping
        # between the raw caption string and the token sequence.
        token_ids: List[int] = self.tokenizer.encode(
            caption,
            add_special_tokens=False,
        )

        # Wrap the caption with explicit BOS/EOS markers. BUGFIX: the
        # previous `bos_token_id or eos_token_id` would wrongly fall back
        # whenever a tokenizer's BOS id is the falsy value 0; compare
        # against None instead.
        eos_token_id = self.tokenizer.eos_token_id
        bos_token_id = (
            self.tokenizer.bos_token_id
            if self.tokenizer.bos_token_id is not None
            else eos_token_id
        )
        seq_ids: List[int] = [bos_token_id] + token_ids + [eos_token_id]

        # Truncate to max_length. BUGFIX: keep an EOS in the final position
        # so truncated captions still carry an end-of-sequence marker;
        # previously truncation could silently drop EOS entirely.
        if len(seq_ids) > self.max_length:
            seq_ids = seq_ids[: self.max_length]
            seq_ids[-1] = eos_token_id

        # Pad to max_length with pad_token_id and build the attention mask.
        pad_id = self.tokenizer.pad_token_id
        input_ids = torch.full((self.max_length,), pad_id, dtype=torch.long)
        attention_mask = torch.zeros(self.max_length, dtype=torch.long)

        seq_len = len(seq_ids)
        input_ids[:seq_len] = torch.tensor(seq_ids, dtype=torch.long)
        attention_mask[:seq_len] = 1

        # Loss labels mirror input_ids; padding positions are set to -100
        # so cross-entropy ignores them.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100

        return {
            "image": image_tensor,
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
            "caption": caption,
            "image_id": img_name,
        }
220
+
221
+
222
def create_tokenizer() -> GPT2TokenizerFast:
    """Load the GPT-2 fast tokenizer, guaranteeing a pad token exists.

    GPT-2 ships without a pad token, so EOS is reused for padding when
    none is configured.
    """
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    return tokenizer
231
+
232
+
233
+ def _infer_category_from_filename(filename: str) -> str:
234
+ """
235
+ Infer a coarse category label from an image filename.
236
+
237
+ Heuristic:
238
+ - Strip directory and extension.
239
+ - Remove trailing digits to group files like 'bench1.jpg', 'bench25.jpg'
240
+ into the same category 'bench'.
241
+ """
242
+
243
+ base = os.path.basename(filename)
244
+ stem, _ext = os.path.splitext(base)
245
+
246
+ # Remove trailing digits
247
+ i = len(stem)
248
+ while i > 0 and stem[i - 1].isdigit():
249
+ i -= 1
250
+ category = stem[:i] or stem
251
+
252
+ return category
253
+
254
+
255
def _balanced_train_val_indices(
    dataset: ImageCaptionDataset,
    val_ratio: float = 0.2,
) -> Tuple[List[int], List[int]]:
    """Deterministically split dataset indices into train/validation sets.

    The validation set targets roughly ``val_ratio`` of the total size and
    draws the same number of images from every filename-derived category,
    capped by the smallest category so counts stay balanced.
    """
    num_items = len(dataset.image_ids)
    if num_items == 0:
        raise RuntimeError("Cannot create train/val split from an empty dataset.")

    # Bucket indices by inferred category; sort within buckets so the
    # split is reproducible run to run.
    buckets: Dict[str, List[int]] = {}
    for idx, img_name in enumerate(dataset.image_ids):
        buckets.setdefault(_infer_category_from_filename(img_name), []).append(idx)
    for bucket in buckets.values():
        bucket.sort()

    categories = sorted(buckets)
    num_categories = len(categories)

    # Even per-category validation quota: an equal share of the desired
    # total, never exceeding what the smallest category can provide.
    target_val_size = max(1, int(round(val_ratio * num_items)))
    smallest_bucket = min(len(buckets[cat]) for cat in categories)
    per_category = min(
        smallest_bucket,
        max(1, int(round(target_val_size / max(1, num_categories)))),
    )

    val_indices: List[int] = []
    train_indices: List[int] = []
    for cat in categories:
        bucket = buckets[cat]
        val_indices.extend(bucket[:per_category])
        train_indices.extend(bucket[per_category:])

    return train_indices, val_indices
307
+
308
+
309
def create_dataloader(
    paths_cfg: PathsConfig,
    training_cfg: TrainingConfig,
    split: str,
    tokenizer: Optional[GPT2TokenizerFast] = None,
    shuffle: Optional[bool] = None,
) -> Tuple[DataLoader, GPT2TokenizerFast]:
    """Factory function to create a DataLoader for a given split.

    Parameters
    ----------
    paths_cfg:
        Paths configuration.
    training_cfg:
        Training configuration containing batch size, max caption length, etc.
    split:
        One of {'train', 'val', 'test'}.
    tokenizer:
        Optional pre-initialized GPT-2 tokenizer. If None, a new one is created.
    shuffle:
        Optional flag to override shuffle behavior. If None, shuffle is True
        for the 'train' split and False otherwise.

    Notes
    -----
    Train and val are a balanced 80/20 category split of the training list;
    test uses the dedicated test list. BUGFIX: the 'val' split previously
    reused the augmented training dataset (random crops/flips/jitter and
    random caption sampling), making validation metrics noisy and
    non-reproducible. Validation now uses the deterministic eval transform
    and a fixed caption per image.
    """
    if tokenizer is None:
        tokenizer = create_tokenizer()

    if shuffle is None:
        shuffle = split == "train"

    if split == "test":
        dataset = ImageCaptionDataset(
            paths_cfg=paths_cfg,
            tokenizer=tokenizer,
            split="test",
            training_cfg=training_cfg,
            transform=eval_image_transform(),
            random_caption=False,
        )
    elif split in {"train", "val"}:
        # The full (augmented) training dataset defines the index split.
        full_train_dataset = ImageCaptionDataset(
            paths_cfg=paths_cfg,
            tokenizer=tokenizer,
            split="train",
            training_cfg=training_cfg,
            transform=train_image_transform(),
            random_caption=True,  # always randomize captions during training
        )
        train_indices, val_indices = _balanced_train_val_indices(
            full_train_dataset,
            val_ratio=0.2,
        )

        if split == "train":
            dataset = Subset(full_train_dataset, train_indices)
        else:
            # Second dataset over the same image list with deterministic
            # preprocessing. Both datasets read the same list/caption files,
            # so image_ids ordering is identical and the indices computed
            # above remain valid.
            eval_dataset = ImageCaptionDataset(
                paths_cfg=paths_cfg,
                tokenizer=tokenizer,
                split="train",
                training_cfg=training_cfg,
                transform=eval_image_transform(),
                random_caption=False,
            )
            dataset = Subset(eval_dataset, val_indices)
    else:
        raise ValueError("split must be one of {'train', 'val', 'test'}")

    dataloader = DataLoader(
        dataset,
        batch_size=training_cfg.batch_size,
        shuffle=shuffle,
        num_workers=training_cfg.num_workers,
        pin_memory=torch.cuda.is_available(),
    )

    return dataloader, tokenizer
385
+
image_captioning/evaluate.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ from typing import Dict, List, Tuple
5
+
6
+ import torch
7
+ from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu
8
+ from nltk.translate.meteor_score import single_meteor_score
9
+ from rouge_score import rouge_scorer
10
+
11
+ from .config import PathsConfig, TrainingConfig, get_device, set_seed
12
+ from .dataset import create_dataloader, create_tokenizer
13
+ from .model import ImageCaptioningModel
14
+
15
+
16
def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for evaluation."""
    parser = argparse.ArgumentParser(description="Evaluate image captioning model on test set.")
    add = parser.add_argument
    add("--data_root", type=str, default="/Users/ryan/Downloads/visuallyimpair", help="Root path to dataset.")
    add("--checkpoint", type=str, required=True, help="Path to model checkpoint (.pt).")
    add("--batch_size", type=int, default=16, help="Batch size for evaluation.")
    add("--max_length", type=int, default=50, help="Maximum caption length during generation.")
    add("--num_beams", type=int, default=3, help="Number of beams for beam search.")
    add("--seed", type=int, default=42, help="Random seed.")
    add("--output_samples", type=str, default="evaluation_samples.jsonl", help="File to save sample predictions.")
    return parser.parse_args()
30
+
31
+
32
def _unigram_f1(ref_tokens: List[str], hyp_tokens: List[str]) -> float:
    """Unigram F1 fallback used when METEOR's WordNet data is unavailable."""
    ref_set = set(ref_tokens)
    hyp_set = set(hyp_tokens)
    if not ref_set or not hyp_set:
        return 0.0
    overlap = len(ref_set & hyp_set)
    precision = overlap / len(hyp_set)
    recall = overlap / len(ref_set)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)


def compute_metrics(
    references: List[List[str]],
    hypotheses: List[str],
) -> Dict[str, float]:
    """Compute BLEU (1-4), METEOR, and ROUGE-L metrics.

    Parameters
    ----------
    references:
        One list of reference caption strings per sample.
    hypotheses:
        One predicted caption string per sample.

    Raises
    ------
    ValueError
        If the inputs are empty or their lengths do not match.

    Notes
    -----
    BUGFIX: NLTK's ``corpus_bleu`` expects token lists
    (``list[list[list[str]]]`` references, ``list[list[str]]`` hypotheses).
    Raw strings were previously passed, so NLTK iterated them character by
    character and BLEU was effectively computed over characters instead of
    words. Captions are now whitespace-tokenized before scoring, matching
    the tokenization already used for METEOR.
    """
    if not references or not hypotheses:
        raise ValueError("References and hypotheses must be non-empty.")
    if len(references) != len(hypotheses):
        raise ValueError("Number of references and hypotheses must match.")

    # Whitespace tokenization for word-level BLEU.
    tokenized_refs = [[ref.split() for ref in ref_list] for ref_list in references]
    tokenized_hyps = [hyp.split() for hyp in hypotheses]

    smoothie = SmoothingFunction().method4

    # BLEU-1 through BLEU-4 differ only in their n-gram weights.
    bleu_weight_sets = {
        "BLEU-1": (1.0, 0.0, 0.0, 0.0),
        "BLEU-2": (0.5, 0.5, 0.0, 0.0),
        "BLEU-3": (1.0 / 3, 1.0 / 3, 1.0 / 3, 0.0),
        "BLEU-4": (0.25, 0.25, 0.25, 0.25),
    }
    metrics: Dict[str, float] = {
        name: corpus_bleu(
            tokenized_refs,
            tokenized_hyps,
            weights=weights,
            smoothing_function=smoothie,
        )
        for name, weights in bleu_weight_sets.items()
    }

    # METEOR over the first reference only; fall back to a simple unigram
    # F1 when NLTK's WordNet data is missing (LookupError).
    meteor_scores: List[float] = []
    for ref_list, hyp in zip(references, hypotheses):
        ref_tokens = ref_list[0].split()
        hyp_tokens = hyp.split()
        try:
            meteor_scores.append(single_meteor_score(ref_tokens, hyp_tokens))
        except LookupError:
            meteor_scores.append(_unigram_f1(ref_tokens, hyp_tokens))
    metrics["METEOR"] = sum(meteor_scores) / max(1, len(meteor_scores))

    # ROUGE-L F-measure against the first reference only.
    rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    rouge_l_scores = [
        rouge.score(ref_list[0], hyp)["rougeL"].fmeasure
        for ref_list, hyp in zip(references, hypotheses)
    ]
    metrics["ROUGE-L"] = sum(rouge_l_scores) / max(1, len(rouge_l_scores))

    return metrics
113
+
114
+
115
def run_evaluation(args: argparse.Namespace) -> None:
    """Run evaluation on the test set, compute metrics, and save sample predictions."""
    paths_cfg = PathsConfig(data_root=args.data_root)
    training_cfg = TrainingConfig(
        batch_size=args.batch_size,
        max_caption_length=args.max_length,
        num_epochs=1,
    )

    set_seed(args.seed)
    device = get_device()

    tokenizer = create_tokenizer()
    test_loader, tokenizer = create_dataloader(
        paths_cfg=paths_cfg,
        training_cfg=training_cfg,
        split="test",
        tokenizer=tokenizer,
        shuffle=False,
    )

    # Restore the trained model onto the selected device.
    model = ImageCaptioningModel(training_cfg=training_cfg)
    model.load_state_dict(torch.load(args.checkpoint, map_location=device))
    model.to(device)
    model.eval()

    references: List[List[str]] = []
    hypotheses: List[str] = []
    saved_samples: List[Dict[str, str]] = []
    sample_budget = 50  # cap on predictions written to disk

    with torch.no_grad():
        for batch in test_loader:
            images = batch["image"].to(device)
            # The raw caption strings from the dataset serve as references;
            # generate() is invoked one image at a time to respect its
            # constraints.
            for idx, ref_caption in enumerate(batch["caption"]):
                prediction = model.generate(
                    images=images[idx : idx + 1],
                    max_length=args.max_length,
                    num_beams=args.num_beams,
                )[0]

                references.append([ref_caption])
                hypotheses.append(prediction)

                if len(saved_samples) < sample_budget:
                    saved_samples.append(
                        {
                            "image_id": batch["image_id"][idx],
                            "reference": ref_caption,
                            "prediction": prediction,
                        }
                    )

    metrics = compute_metrics(references, hypotheses)

    print("Evaluation metrics:")
    for name, value in metrics.items():
        print(f" {name}: {value:.4f}")

    # Persist sample predictions as JSON lines.
    output_path = args.output_samples
    with open(output_path, "w", encoding="utf-8") as f:
        for sample in saved_samples:
            f.write(json.dumps(sample) + "\n")

    print(f"Saved {len(saved_samples)} sample predictions to {output_path}")
194
+
195
+
196
def main() -> None:
    """CLI entry point: validate the checkpoint path, then evaluate."""
    args = parse_args()
    if not os.path.exists(args.checkpoint):
        raise FileNotFoundError(f"Checkpoint not found: {args.checkpoint}")
    run_evaluation(args)


if __name__ == "__main__":
    main()
207
+
image_captioning/inference.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ from typing import List
4
+
5
+ import torch
6
+ from PIL import Image
7
+ from torchvision import transforms
8
+
9
+ from .config import PathsConfig, TrainingConfig, get_device, set_seed
10
+ from .dataset import IMAGENET_MEAN, IMAGENET_STD, create_tokenizer
11
+ from .model import ImageCaptioningModel
12
+
13
+
14
def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for inference."""
    parser = argparse.ArgumentParser(description="Run image captioning inference on a single image.")
    add = parser.add_argument
    add("--image", type=str, required=True, help="Path to image file.")
    add("--checkpoint", type=str, default="checkpoints/best_model.pt", help="Path to model checkpoint.")
    add("--max_length", type=int, default=50, help="Maximum caption length.")
    add("--num_beams", type=int, default=3, help="Number of beams for beam search.")
    add("--seed", type=int, default=42, help="Random seed.")
    add("--data_root", type=str, default="/Users/ryan/Downloads/visuallyimpair", help="Root path to dataset (for consistency).")
    return parser.parse_args()
27
+
28
+
29
def build_preprocess_transform() -> transforms.Compose:
    """Deterministic preprocessing matching the training-time eval pipeline.

    Resize, center-crop to 224x224, tensorize, ImageNet-normalize.
    """
    steps = [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return transforms.Compose(steps)
42
+
43
+
44
def load_image(image_path: str) -> torch.Tensor:
    """Load one image file and preprocess it into a single-item batch.

    Returns a tensor of shape (1, 3, 224, 224); raises FileNotFoundError
    when the path does not exist.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")

    rgb_image = Image.open(image_path).convert("RGB")
    preprocess = build_preprocess_transform()
    return preprocess(rgb_image).unsqueeze(0)  # (1, 3, 224, 224)
56
+
57
+
58
def run_inference(args: argparse.Namespace) -> List[str]:
    """Generate and print a caption for the image given on the command line.

    Returns the list of generated caption strings.
    """
    set_seed(args.seed)
    device = get_device()

    # Kept for consistency with the other entry points / future extensions.
    _paths_cfg = PathsConfig(data_root=args.data_root)
    training_cfg = TrainingConfig(max_caption_length=args.max_length)

    tokenizer = create_tokenizer()

    model = ImageCaptioningModel(training_cfg=training_cfg)

    if not os.path.exists(args.checkpoint):
        raise FileNotFoundError(f"Checkpoint not found: {args.checkpoint}")
    model.load_state_dict(torch.load(args.checkpoint, map_location=device))
    model.to(device)
    model.eval()

    image_tensor = load_image(args.image).to(device)

    captions = model.generate(
        images=image_tensor,
        max_length=args.max_length,
        num_beams=args.num_beams,
    )

    for caption in captions:
        print(f"Caption: {caption}")

    return captions
92
+
93
+
94
def main() -> None:
    """CLI entry point for single-image captioning."""
    run_inference(parse_args())


if __name__ == "__main__":
    main()
101
+
image_captioning/model.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import contextlib
2
+ from dataclasses import dataclass
3
+ from typing import List, Optional, Tuple
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch import Tensor
8
+ from torchvision import models
9
+ from transformers import GPT2LMHeadModel, GPT2TokenizerFast
10
+
11
+ from .config import TrainingConfig, get_device
12
+
13
+
14
@dataclass
class ImageCaptioningOutput:
    """Container for model outputs.

    Attributes
    ----------
    logits:
        Predicted token logits of shape (batch_size, seq_len, vocab_size);
        seq_len counts text tokens only (visual prefix positions removed).
    loss:
        Optional cross-entropy loss over caption tokens.
    """

    logits: Tensor
    loss: Optional[Tensor] = None
30
+
31
+
32
class EfficientNetB0Encoder(nn.Module):
    """EfficientNet-B0 backbone that yields pooled image features.

    The torchvision classification head is discarded; forward() returns
    only the pooled feature vector (1280-dim for B0).
    """

    def __init__(self, pretrained: bool = True) -> None:
        super().__init__()
        backbone = models.efficientnet_b0(pretrained=pretrained)
        self.features = backbone.features
        self.avgpool = backbone.avgpool
        self.flatten = nn.Flatten()
        # The classifier's in_features equals the pooled feature width.
        self.out_dim: int = backbone.classifier[1].in_features

    def forward(self, images: Tensor) -> Tensor:
        """Map a (batch, 3, 224, 224) image batch to (batch, out_dim) features."""
        pooled = self.avgpool(self.features(images))
        return self.flatten(pooled)
63
+
64
+
65
class ImageCaptioningModel(nn.Module):
    """
    Image captioning model with an EfficientNet-B0 vision encoder and GPT-2 decoder.

    The model projects visual features into a sequence of prefix embeddings that
    are concatenated with GPT-2 token embeddings. GPT-2 then predicts caption tokens.
    """

    def __init__(
        self,
        training_cfg: Optional[TrainingConfig] = None,
        pretrained_encoder: bool = True,
    ) -> None:
        """
        Parameters
        ----------
        training_cfg:
            Training configuration; a default TrainingConfig is built when omitted.
        pretrained_encoder:
            Whether to load ImageNet weights for the EfficientNet-B0 encoder.
        """

        super().__init__()

        self.training_cfg = training_cfg or TrainingConfig()
        self.device: torch.device = get_device()

        # Vision encoder
        self.encoder = EfficientNetB0Encoder(pretrained=pretrained_encoder)

        # Text decoder (GPT-2 small)
        self.tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
        if self.tokenizer.pad_token is None:
            # GPT-2 ships without a pad token; reuse EOS so padding is valid.
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.gpt2 = GPT2LMHeadModel.from_pretrained("gpt2")
        self.gpt2.config.pad_token_id = self.tokenizer.pad_token_id

        # Number of visual prefix tokens prepended to every caption.
        self.prefix_length: int = int(self.training_cfg.prefix_length)
        if self.prefix_length < 1:
            raise ValueError("prefix_length must be >= 1")

        # Project image features to a flat vector that is reshaped into
        # prefix_length embeddings of GPT-2's hidden size.
        self.visual_projection = nn.Linear(
            self.encoder.out_dim,
            self.gpt2.config.n_embd * self.prefix_length,
        )

        # Debug shapes are printed once on the first forward pass only.
        self._printed_debug: bool = False

        self.to(self.device)

    # --------------------------------------------------------------------- #
    # Internal utilities
    # --------------------------------------------------------------------- #
    def encode_images(self, images: Tensor) -> Tensor:
        """
        Encode images and produce visual prefix embeddings.

        Parameters
        ----------
        images:
            Tensor of shape (batch_size, 3, H, W).

        Returns
        -------
        Tensor of shape (batch_size, prefix_length, hidden_size).
        """

        assert images.dim() == 4, f"Expected images of shape (B,3,H,W), got {images.shape}"
        img_features = self.encoder(images)  # (B, encoder_out_dim)
        batch_size = img_features.size(0)

        prefix_embeddings = self.visual_projection(img_features)
        # Reshape the flat projection into a sequence of prefix tokens.
        prefix_embeddings = prefix_embeddings.view(
            batch_size,
            self.prefix_length,
            self.gpt2.config.n_embd,
        )
        return prefix_embeddings

    # --------------------------------------------------------------------- #
    # Forward (training)
    # --------------------------------------------------------------------- #
    def forward(
        self,
        images: Tensor,
        captions: Tensor,
        attention_mask: Optional[Tensor] = None,
        labels: Optional[Tensor] = None,
    ) -> ImageCaptioningOutput:
        """
        Forward pass for training.

        Parameters
        ----------
        images:
            Tensor of shape (batch_size, 3, 224, 224).
        captions:
            Token IDs of shape (batch_size, seq_len).
        attention_mask:
            Optional attention mask of shape (batch_size, seq_len).
        labels:
            Optional target token IDs of shape (batch_size, seq_len).
            If provided, cross-entropy loss is computed, ignoring positions
            with label -100.
        """

        images = images.to(self.device)
        captions = captions.to(self.device)
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.device)
        if labels is not None:
            labels = labels.to(self.device)

        batch_size, seq_len = captions.shape
        assert images.size(0) == batch_size, "Batch size mismatch between images and captions."

        prefix_embeddings = self.encode_images(images)  # (B, P, H)

        # Embed caption tokens and prepend the visual prefix.
        token_embeddings = self.gpt2.transformer.wte(captions)  # (B, T, H)
        inputs_embeds = torch.cat([prefix_embeddings, token_embeddings], dim=1)  # (B, P+T, H)

        if attention_mask is not None:
            # Prefix positions are always attended to.
            prefix_mask = torch.ones(
                batch_size,
                self.prefix_length,
                dtype=attention_mask.dtype,
                device=attention_mask.device,
            )
            extended_attention_mask = torch.cat([prefix_mask, attention_mask], dim=1)
        else:
            extended_attention_mask = None

        if not self._printed_debug:
            print(f"[DEBUG] images shape: {images.shape}")
            print(f"[DEBUG] captions shape: {captions.shape}")
            print(f"[DEBUG] prefix_embeddings: {prefix_embeddings.shape}")
            print(f"[DEBUG] token_embeddings: {token_embeddings.shape}")
            print(f"[DEBUG] inputs_embeds shape: {inputs_embeds.shape}")
            if extended_attention_mask is not None:
                print(f"[DEBUG] attention_mask shape: {extended_attention_mask.shape}")
            self._printed_debug = True

        outputs = self.gpt2(
            inputs_embeds=inputs_embeds,
            attention_mask=extended_attention_mask,
            use_cache=False,
            return_dict=True,
        )

        # Remove visual prefix positions from the logits so that
        # the returned logits only correspond to text tokens.
        logits = outputs.logits[:, self.prefix_length :, :]  # (B, T, V)

        loss: Optional[Tensor] = None
        if labels is not None:
            if labels.shape != (batch_size, seq_len):
                raise ValueError(
                    f"labels shape {labels.shape} does not match captions shape {(batch_size, seq_len)}"
                )

            # Shift logits and labels for next-token prediction.
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()

            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
            )

        return ImageCaptioningOutput(logits=logits, loss=loss)

    # --------------------------------------------------------------------- #
    # Generation (inference)
    # --------------------------------------------------------------------- #
    @torch.no_grad()
    def generate(
        self,
        images: Tensor,
        max_length: int = 50,
        num_beams: int = 1,
        temperature: float = 1.0,
        top_k: int = 0,
        eos_token_id: Optional[int] = None,
        length_penalty: float = 0.0,
        repetition_penalty: float = 1.0,
    ) -> List[str]:
        """
        Generate captions for a batch of images using a simple beam search.

        Notes
        -----
        - For simplicity and clarity, this implementation currently supports
          batch_size == 1. A ValueError is raised otherwise.
        - ``repetition_penalty`` follows the CTRL formulation: the logit of any
          token already present in a beam is divided by the penalty when
          positive and multiplied when negative. The default of 1.0 is a
          no-op. (Previously the parameter was accepted but silently ignored.)
        """

        self.eval()

        images = images.to(self.device)
        batch_size = images.size(0)
        if batch_size != 1:
            raise ValueError(f"generate currently supports batch_size == 1, got {batch_size}")

        # Use explicit `is None` checks: token id 0 is a valid id and a bare
        # `or` would silently replace it with the tokenizer default.
        if eos_token_id is None:
            eos_token_id = self.tokenizer.eos_token_id
        bos_token_id = self.tokenizer.bos_token_id
        if bos_token_id is None:
            bos_token_id = self.tokenizer.eos_token_id

        prefix_embeddings = self.encode_images(images)  # (1, P, H)

        # Each beam is (token_ids, log_prob)
        beams: List[Tuple[List[int], float]] = [([], 0.0)]

        def _length_normalized_score(tokens: List[int], score: float) -> float:
            # With length_penalty == 0 the raw log-probability is used.
            if length_penalty is None or length_penalty == 0.0:
                return score
            length = max(1, len(tokens))
            return score / (length ** length_penalty)

        for _ in range(max_length):
            all_candidates: List[Tuple[List[int], float]] = []
            for seq, score in beams:
                if seq and seq[-1] == eos_token_id:
                    # If already finished, keep as-is
                    all_candidates.append((seq, score))
                    continue

                # Build a 2D tensor of token IDs with shape (1, L); an empty
                # beam is seeded with BOS.
                if seq:
                    input_ids = torch.tensor(
                        [seq],
                        device=self.device,
                        dtype=torch.long,
                    )  # (1, L)
                else:
                    input_ids = torch.tensor(
                        [[bos_token_id]],
                        device=self.device,
                        dtype=torch.long,
                    )  # (1, 1)

                token_embeddings = self.gpt2.transformer.wte(input_ids)  # (1, L, H)
                inputs_embeds = torch.cat([prefix_embeddings, token_embeddings], dim=1)

                attention_mask = torch.ones(
                    inputs_embeds.size()[:-1],
                    dtype=torch.long,
                    device=self.device,
                )

                outputs = self.gpt2(
                    inputs_embeds=inputs_embeds,
                    attention_mask=attention_mask,
                    use_cache=False,
                    return_dict=True,
                )

                # Temperature is clamped away from zero to avoid division by 0.
                logits = outputs.logits[:, -1, :] / max(temperature, 1e-5)

                # Apply CTRL-style repetition penalty to previously generated
                # tokens before computing probabilities.
                if repetition_penalty != 1.0 and seq:
                    for prev_token in set(seq):
                        if logits[0, prev_token] > 0:
                            logits[0, prev_token] /= repetition_penalty
                        else:
                            logits[0, prev_token] *= repetition_penalty

                if top_k > 0:
                    topk_logits, topk_indices = torch.topk(logits, top_k, dim=-1)
                    log_probs = torch.log_softmax(topk_logits, dim=-1)
                    for i in range(top_k):
                        token_id = int(topk_indices[0, i])
                        candidate = (seq + [token_id], score + float(log_probs[0, i]))
                        all_candidates.append(candidate)
                else:
                    log_probs = torch.log_softmax(logits, dim=-1)
                    topk_log_probs, topk_indices = torch.topk(log_probs, num_beams, dim=-1)
                    for i in range(num_beams):
                        token_id = int(topk_indices[0, i])
                        candidate = (seq + [token_id], score + float(topk_log_probs[0, i]))
                        all_candidates.append(candidate)

            # Select best beams. With num_beams=1 and length_penalty=0 this
            # reduces to simple greedy decoding, which is fully deterministic.
            beams = sorted(
                all_candidates,
                key=lambda x: _length_normalized_score(x[0], x[1]),
                reverse=True,
            )[:num_beams]

            # If all beams ended with EOS, stop early
            if all(seq and seq[-1] == eos_token_id for seq, _ in beams):
                break

        best_seq, _ = max(
            beams,
            key=lambda x: _length_normalized_score(x[0], x[1]),
        )

        # Truncate at EOS if present
        if eos_token_id in best_seq:
            best_seq = best_seq[: best_seq.index(eos_token_id)]

        caption = self.tokenizer.decode(best_seq, skip_special_tokens=True)
        # Normalize whitespace so the final caption is a single, clean string.
        caption = " ".join(caption.strip().split())
        return [caption]

    # --------------------------------------------------------------------- #
    # Dummy test helper
    # --------------------------------------------------------------------- #
    def test_dummy(self) -> None:
        """
        Run a dummy forward pass to verify the model works end-to-end.

        Asserts that the output logits have shape (2, 20, vocab_size) when
        captions have length 20.
        """

        self.eval()
        vocab_size = int(self.gpt2.config.vocab_size)

        dummy_images = torch.randn(2, 3, 224, 224, device=self.device)
        dummy_captions = torch.randint(0, vocab_size, (2, 20), device=self.device)

        with torch.no_grad(), contextlib.ExitStack() as stack:
            # Autocast only applies on CUDA; CPU runs in full precision.
            if self.device.type == "cuda":
                stack.enter_context(torch.cuda.amp.autocast())

            outputs = self(dummy_images, dummy_captions)

        logits = outputs.logits
        assert logits.shape == (2, 20, vocab_size), (
            f"Output shape mismatch: expected (2, 20, {vocab_size}), "
            f"got {tuple(logits.shape)}"
        )
        print("✓ Model architecture verified successfully!")
382
+
image_captioning/train.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import math
3
+ import os
4
+ from typing import Tuple
5
+
6
+ import torch
7
+ from torch import Tensor
8
+ from torch.optim import AdamW
9
+ from torch.utils.tensorboard import SummaryWriter
10
+ from tqdm import tqdm
11
+ from transformers import get_cosine_schedule_with_warmup
12
+
13
+ from .config import PathsConfig, TrainingConfig, ensure_dir, get_device, set_seed
14
+ from .dataset import create_dataloader, create_tokenizer
15
+ from .model import ImageCaptioningModel
16
+
17
+
18
def parse_args() -> argparse.Namespace:
    """
    Parse command-line arguments for training.
    """

    parser = argparse.ArgumentParser(description="Train EfficientNetB0 + GPT-2 image captioning model.")

    # (flag, type, default, help) — one entry per CLI option.
    option_table = [
        ("--data_root", str, "/Users/ryan/Downloads/visuallyimpair", "Root path to dataset."),
        ("--epochs", int, 10, "Number of training epochs."),
        ("--batch_size", int, 16, "Batch size."),
        ("--lr", float, 5e-5, "Learning rate."),
        ("--warmup_steps", int, 500, "Number of warmup steps."),
        ("--max_length", int, 50, "Maximum caption length."),
        ("--grad_accum_steps", int, 1, "Gradient accumulation steps."),
        ("--output_dir", str, "checkpoints", "Directory to save checkpoints."),
        ("--log_dir", str, "runs", "Directory for TensorBoard logs."),
        ("--patience", int, 10, "Early stopping patience based on validation loss."),
        ("--seed", int, 42, "Random seed."),
    ]
    for flag, flag_type, default, help_text in option_table:
        parser.add_argument(flag, type=flag_type, default=default, help=help_text)

    return parser.parse_args()
36
+
37
+
38
def create_training_config_from_args(args: argparse.Namespace) -> TrainingConfig:
    """
    Create a TrainingConfig instance using command-line arguments.
    """

    cfg = TrainingConfig()

    # Map each config attribute to its CLI-provided value; the accumulation
    # step count is clamped to at least 1.
    overrides = {
        "learning_rate": args.lr,
        "batch_size": args.batch_size,
        "num_epochs": args.epochs,
        "warmup_steps": args.warmup_steps,
        "max_caption_length": args.max_length,
        "gradient_accumulation_steps": max(1, args.grad_accum_steps),
        "output_dir": args.output_dir,
        "log_dir": args.log_dir,
        "patience": args.patience,
        "seed": args.seed,
    }
    for attr, value in overrides.items():
        setattr(cfg, attr, value)

    return cfg
55
+
56
+
57
def validate_dataloader(
    train_loader,
    device: torch.device,
) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """
    Fetch a single batch from the DataLoader to validate dataset loading.

    Returns
    -------
    Tuple of (images, input_ids, attention_mask, labels), each moved to
    the target device.
    """

    try:
        first_batch = next(iter(train_loader))
    except StopIteration as exc:
        raise RuntimeError("Training DataLoader is empty. Check your dataset configuration.") from exc

    # Move every expected tensor to the device and report its shape.
    field_names = ("images", "input_ids", "attention_mask", "labels")
    batch_keys = ("image", "input_ids", "attention_mask", "labels")
    tensors = tuple(first_batch[key].to(device) for key in batch_keys)

    for name, tensor in zip(field_names, tensors):
        print(f"[DATA] {name} batch shape: {tensor.shape}")

    return tensors
85
+
86
+
87
def train_one_epoch(
    model: ImageCaptioningModel,
    train_loader,
    optimizer: AdamW,
    scheduler,
    device: torch.device,
    cfg: TrainingConfig,
    epoch: int,
    scaler: torch.cuda.amp.GradScaler,
    writer: SummaryWriter,
) -> float:
    """
    Train the model for a single epoch.

    Returns
    -------
    float
        The average training loss over the epoch (also logged to TensorBoard
        under "Loss/train").
    """

    model.train()
    running_loss = 0.0
    num_steps = 0

    grad_accum_steps = cfg.gradient_accumulation_steps
    # True while gradients have been accumulated but not yet applied.
    pending_update = False

    def _apply_optimizer_step() -> None:
        # Unscale before clipping so the norm is computed on true gradients.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()

    progress = tqdm(train_loader, desc=f"Epoch {epoch} [train]", unit="batch")
    for step, batch in enumerate(progress):
        images = batch["image"].to(device)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.cuda.amp.autocast(enabled=(device.type == "cuda" and cfg.mixed_precision)):
            outputs = model(
                images=images,
                captions=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            loss = outputs.loss
            if loss is None:
                raise RuntimeError("Model did not return a loss during training.")

            # Scale down so accumulated gradients average over micro-batches.
            loss = loss / grad_accum_steps

        scaler.scale(loss).backward()
        pending_update = True

        if (step + 1) % grad_accum_steps == 0:
            _apply_optimizer_step()
            pending_update = False

        # Undo the accumulation scaling when reporting the loss.
        running_loss += loss.item() * grad_accum_steps
        num_steps += 1
        avg_loss = running_loss / num_steps
        progress.set_postfix({"loss": f"{avg_loss:.4f}"})

    # Bug fix: when the number of batches is not divisible by grad_accum_steps,
    # the final partial accumulation used to be dropped — gradients were
    # computed but never stepped or zeroed, and would leak into the next
    # epoch's first update. Flush them here.
    if pending_update:
        _apply_optimizer_step()

    epoch_loss = running_loss / max(1, num_steps)
    writer.add_scalar("Loss/train", epoch_loss, epoch)
    return epoch_loss
146
+
147
+
148
def evaluate(
    model: ImageCaptioningModel,
    val_loader,
    device: torch.device,
    cfg: TrainingConfig,
    epoch: int,
    writer: SummaryWriter,
) -> float:
    """
    Evaluate the model on a validation split and return the average loss.

    The mean loss is also logged to TensorBoard under "Loss/val".
    """

    model.eval()
    total_loss = 0.0
    batches_seen = 0

    with torch.no_grad():
        progress = tqdm(val_loader, desc=f"Epoch {epoch} [val]", unit="batch")
        for batch in progress:
            # Move the whole batch onto the target device.
            images = batch["image"].to(device)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                images=images,
                captions=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            if outputs.loss is None:
                raise RuntimeError("Model did not return a loss during validation.")

            total_loss += outputs.loss.item()
            batches_seen += 1
            progress.set_postfix({"val_loss": f"{total_loss / batches_seen:.4f}"})

    val_loss = total_loss / max(1, batches_seen)
    writer.add_scalar("Loss/val", val_loss, epoch)
    return val_loss
190
+
191
+
192
def main() -> None:
    """
    Entry point: configure, build data/model/optimizer, and run the
    train/validate loop with checkpointing and early stopping.
    """

    args = parse_args()

    # Configuration and setup
    paths_cfg = PathsConfig(data_root=args.data_root)
    training_cfg = create_training_config_from_args(args)

    ensure_dir(training_cfg.output_dir)
    ensure_dir(training_cfg.log_dir)

    set_seed(training_cfg.seed)
    device = get_device()

    # Data — the train loader may extend the tokenizer, so the returned
    # tokenizer is reused for the validation split.
    tokenizer = create_tokenizer()
    train_loader, tokenizer = create_dataloader(
        paths_cfg=paths_cfg,
        training_cfg=training_cfg,
        split="train",
        tokenizer=tokenizer,
        shuffle=True,
    )
    val_loader, _ = create_dataloader(
        paths_cfg=paths_cfg,
        training_cfg=training_cfg,
        split="val",
        tokenizer=tokenizer,
        shuffle=False,
    )

    # Validate dataset loading by pulling (and printing) one batch up front.
    validate_dataloader(train_loader, device)

    # Model
    model = ImageCaptioningModel(training_cfg=training_cfg)

    optimizer = AdamW(model.parameters(), lr=training_cfg.learning_rate)

    # Total optimizer steps = ceil(batches / accumulation) per epoch, times epochs.
    total_training_steps = math.ceil(
        len(train_loader) / max(1, training_cfg.gradient_accumulation_steps)
    ) * training_cfg.num_epochs

    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=training_cfg.warmup_steps,
        num_training_steps=total_training_steps,
    )

    # AMP scaler is a no-op on CPU or when mixed precision is disabled.
    scaler = torch.cuda.amp.GradScaler(enabled=(device.type == "cuda" and training_cfg.mixed_precision))
    writer = SummaryWriter(log_dir=training_cfg.log_dir)

    best_val_loss = float("inf")
    epochs_without_improvement = 0

    try:
        for epoch in range(1, training_cfg.num_epochs + 1):
            train_loss = train_one_epoch(
                model=model,
                train_loader=train_loader,
                optimizer=optimizer,
                scheduler=scheduler,
                device=device,
                cfg=training_cfg,
                epoch=epoch,
                scaler=scaler,
                writer=writer,
            )

            val_loss = evaluate(
                model=model,
                val_loader=val_loader,
                device=device,
                cfg=training_cfg,
                epoch=epoch,
                writer=writer,
            )

            print(f"[EPOCH {epoch}] train_loss={train_loss:.4f} val_loss={val_loss:.4f}")

            # Checkpointing: keep only the best model by validation loss.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                epochs_without_improvement = 0
                best_path = os.path.join(training_cfg.output_dir, "best_model.pt")
                torch.save(model.state_dict(), best_path)
                print(f"[CHECKPOINT] Saved new best model to {best_path}")
            else:
                epochs_without_improvement += 1
                print(
                    f"[EARLY STOP] No improvement for {epochs_without_improvement} "
                    f"epoch(s) (patience={training_cfg.patience})."
                )

            if epochs_without_improvement >= training_cfg.patience:
                print("Early stopping triggered.")
                break
    except Exception as exc:  # noqa: BLE001
        # Surface the failure in the log, then re-raise for a proper traceback.
        print(f"[ERROR] Training failed with error: {exc}")
        raise
    finally:
        # Always flush TensorBoard logs, even on failure or early stop.
        writer.close()


if __name__ == "__main__":
    main()
297
+