Spaces:

pavanpraneeth
/

CaptionIQ

Sleeping

File size: 4,238 Bytes

290f366

"""
CaptionIQ — Attention-Based CNN-LSTM Caption Generation Model
Uses Bahdanau (additive) attention over spatial CNN features for
image-specific caption generation.
"""

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Dense, LSTM, Embedding, Dropout, Concatenate, Layer
)
from tensorflow.keras.optimizers import Adam

import os, sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from src.config import (
    EMBED_DIM, LSTM_UNITS, DROPOUT_RATE, FEATURE_DIM,
    FEATURE_LOCATIONS, ATTENTION_DIM, LEARNING_RATE,
)


class BahdanauAttention(Layer):
    """
    Additive (Bahdanau-style) attention that pools spatial image features.

    The layer takes a pair ``[features, hidden]`` and evaluates:
        energy  = tanh(W1(features) + W2(hidden))
        weights = softmax(V(energy))      # normalised across locations
        context = sum(weights * features) # summed across locations

    The query (``hidden``) decides how much each image location contributes
    to the returned context vector, so the pooled summary can change with
    the caption state.

    NOTE(review): the class is not registered with Keras' serialization
    registry in this file; reloading a saved model presumably needs
    ``custom_objects={"BahdanauAttention": BahdanauAttention}`` — confirm
    against the loading code.
    """

    def __init__(self, units, **kwargs):
        """
        Create the three projection sub-layers.

        Args:
            units: Output width of the feature and query projections
                (the size of the attention "energy" space).
            **kwargs: Passed through untouched to ``Layer.__init__``
                (e.g. ``name``).
        """
        super().__init__(**kwargs)
        self.units = units
        # Attribute names and ``name=`` strings are deliberately left as in
        # the original so any existing saved weights keep matching.
        self.W1 = Dense(units, name="att_features")  # projects image features
        self.W2 = Dense(units, name="att_hidden")    # projects the query state
        self.V = Dense(1, name="att_score")          # one scalar per location

    def call(self, inputs):
        """
        Compute the attention-weighted sum of ``features``.

        Args:
            inputs: Two-element sequence ``(features, hidden)`` where
                ``features`` is (batch, locations, feature_dim) and
                ``hidden`` is (batch, lstm_units).

        Returns:
            Context tensor of shape (batch, feature_dim).
        """
        features, hidden = inputs

        # Give the query a length-1 "locations" axis so it broadcasts
        # against every spatial location: (batch, 1, lstm_units).
        query = tf.expand_dims(hidden, axis=1)

        # Project both operands into the shared attention space.
        projected_features = self.W1(features)  # (batch, locations, units)
        projected_query = self.W2(query)        # (batch, 1, units)
        energy = tf.nn.tanh(projected_features + projected_query)

        # One logit per location, normalised over the locations axis.
        logits = self.V(energy)                            # (batch, locations, 1)
        attention_weights = tf.nn.softmax(logits, axis=1)  # (batch, locations, 1)

        # Weight each location's feature vector and collapse the locations axis.
        weighted_features = attention_weights * features
        return tf.reduce_sum(weighted_features, axis=1)    # (batch, feature_dim)

    def get_config(self):
        """Return the base layer config extended with ``units``."""
        return {**super().get_config(), "units": self.units}


def build_model(vocab_size: int, max_length: int) -> Model:
    """
    Assemble and compile the attention-based captioning network.

    Data flow (sizes come from ``src.config``):
        image_input   : (FEATURE_LOCATIONS, FEATURE_DIM) spatial features
        caption_input : (max_length,) token ids
                        -> Embedding(EMBED_DIM, mask_zero=True)
                        -> Dropout(DROPOUT_RATE)
                        -> LSTM(LSTM_UNITS), final state only
        attention     : BahdanauAttention(ATTENTION_DIM) over the image
                        features, queried by the LSTM output
        head          : Concatenate(context, LSTM output)
                        -> Dense(LSTM_UNITS, relu)
                        -> Dropout(DROPOUT_RATE)
                        -> Dense(vocab_size, softmax)

    Args:
        vocab_size: Number of token ids, including index 0 which the
            embedding treats as padding (``mask_zero=True``).
        max_length: Length of the (padded) caption token sequence.

    Returns:
        A Keras ``Model`` named "CaptionIQ", compiled with Adam at
        ``LEARNING_RATE`` and the "categorical_crossentropy" loss.
    """
    # --- Inputs -----------------------------------------------------------
    image_input = Input(
        shape=(FEATURE_LOCATIONS, FEATURE_DIM),
        name="image_input",
    )
    caption_input = Input(shape=(max_length,), name="caption_input")

    # --- Caption encoder: embed -> dropout -> LSTM ------------------------
    embedding_layer = Embedding(
        vocab_size,
        EMBED_DIM,
        mask_zero=True,
        name="caption_embedding",
    )
    embedded_tokens = embedding_layer(caption_input)

    caption_dropout = Dropout(DROPOUT_RATE, name="caption_dropout")
    regularised_tokens = caption_dropout(embedded_tokens)

    caption_encoder = LSTM(LSTM_UNITS, name="caption_lstm")
    caption_state = caption_encoder(regularised_tokens)

    # --- Attention: caption state queries the image locations -------------
    attention_layer = BahdanauAttention(ATTENTION_DIM, name="attention")
    image_context = attention_layer([image_input, caption_state])

    # --- Prediction head --------------------------------------------------
    fused = Concatenate(name="merge")([image_context, caption_state])
    hidden = Dense(LSTM_UNITS, activation="relu", name="dense_relu")(fused)
    hidden = Dropout(DROPOUT_RATE, name="dense_dropout")(hidden)
    next_word_probs = Dense(
        vocab_size, activation="softmax", name="output"
    )(hidden)

    # --- Wrap up and compile ----------------------------------------------
    captioner = Model(
        inputs=[image_input, caption_input],
        outputs=next_word_probs,
        name="CaptionIQ",
    )
    optimizer = Adam(learning_rate=LEARNING_RATE)
    captioner.compile(loss="categorical_crossentropy", optimizer=optimizer)

    return captioner


def print_model_summary(vocab_size: int = 5000, max_length: int = 34):
    """
    Build the captioning model and print its layer-by-layer summary.

    Args:
        vocab_size: Vocabulary size forwarded to ``build_model``.
        max_length: Caption length forwarded to ``build_model``.

    Returns:
        The model that was built, so callers can keep using it.
    """
    captioner = build_model(vocab_size, max_length)
    captioner.summary()
    return captioner


# Script entry point: when run directly, build the model with the default
# vocabulary size and caption length and print its architecture summary.
if __name__ == "__main__":
    print_model_summary()