BriranSus committed on
Commit
a667af8
·
1 Parent(s): f56b89a

initial commit

Browse files
Files changed (2) hide show
  1. app.py +225 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import torch.nn as nn
4
+ import torchvision.transforms as transforms
5
+ from transformers import ViTModel
6
+ from PIL import Image
7
+ import pickle
8
+ import re
9
+
10
class Vocabulary:
    """Word-level vocabulary mapping between tokens and integer ids.

    Ids 0-3 are reserved for the special tokens <PAD>, <SOS>, <EOS> and
    <UNK>; regular words would be assigned ids starting at 4.
    """

    def __init__(self, freq_threshold=5):
        # Minimum corpus frequency a word needs before earning an id
        # (only stored here; the vocabulary is built at training time).
        self.freq_threshold = freq_threshold
        self.itos = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.stoi = {word: idx for idx, word in self.itos.items()}
        self.index = 4  # next id to hand out to a new word

    def __len__(self):
        return len(self.itos)

    def tokenizer(self, text):
        """Lower-case *text* and split it into alphanumeric word tokens."""
        return re.findall(r"\w+", text.lower())

    def numericalize(self, text):
        """Convert *text* into a list of token ids, mapping unknown words to <UNK>."""
        unk = self.stoi["<UNK>"]
        return [self.stoi.get(token, unk) for token in self.tokenizer(text)]
34
+
35
class Encoder(nn.Module):
    """ViT-MAE backbone followed by a small projection head.

    Maps a batch of images to per-patch feature vectors of size
    *embed_dim* for the attention-based decoder to attend over.
    """

    def __init__(self, embed_dim, freeze=False):
        super().__init__()
        self.vit = ViTModel.from_pretrained("facebook/vit-mae-base")

        if freeze:
            # Keep the pretrained backbone fixed; only the head trains.
            for p in self.vit.parameters():
                p.requires_grad = False

        self.linear = nn.Sequential(
            nn.Linear(self.vit.config.hidden_size, embed_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(embed_dim, embed_dim),
            nn.LayerNorm(embed_dim),
        )

    def forward(self, images):
        """Return (batch, num_patches, embed_dim) patch features for *images*."""
        hidden = self.vit(pixel_values=images).last_hidden_state
        # Drop the leading CLS-style token; keep only the patch embeddings.
        patches = hidden[:, 1:, :]
        return self.linear(patches)
57
+
58
class MultiHeadAttention(nn.Module):
    """Single-query multi-head attention from a decoder state over encoder features.

    The query is the decoder's current hidden state (one vector per batch
    element); keys and values come from the encoder's patch features.
    Returns one context vector of size *encoder_dim* per batch element.
    """

    def __init__(self, hidden_dim, encoder_dim, num_heads=4):
        super().__init__()
        assert hidden_dim % num_heads == 0, "hidden_dim must be divisible by num_heads"

        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.head_dim = hidden_dim // num_heads

        # Projection layers (creation order kept stable for reproducible init).
        self.query = nn.Linear(hidden_dim, hidden_dim)
        self.key = nn.Linear(encoder_dim, hidden_dim)
        self.value = nn.Linear(encoder_dim, hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, encoder_dim)

    def forward(self, hidden, encoder_outputs):
        """Attend *hidden* (B, hidden_dim) over *encoder_outputs* (B, N, encoder_dim)."""
        batch, num_patches, _ = encoder_outputs.shape
        heads, dim = self.num_heads, self.head_dim

        # Project and split into heads.
        q = self.query(hidden).view(batch, heads, dim)
        k = self.key(encoder_outputs).view(batch, num_patches, heads, dim).transpose(1, 2)
        v = self.value(encoder_outputs).view(batch, num_patches, heads, dim).transpose(1, 2)

        # Scaled dot-product attention for the single query position.
        weights = torch.softmax(
            torch.matmul(q.unsqueeze(2), k.transpose(-2, -1)) / (dim ** 0.5), dim=-1
        )  # (B, H, 1, N)
        mixed = torch.matmul(weights, v)  # (B, H, 1, head_dim)
        flat = mixed.transpose(1, 2).contiguous().view(batch, self.hidden_dim)
        return self.fc_out(flat)
83
+
84
class Decoder(nn.Module):
    """LSTM caption decoder with multi-head attention over encoder features.

    At each step the previous word embedding is concatenated with an
    attention context (computed from the top LSTM layer's hidden state)
    and fed through the LSTM; a linear layer maps the output to
    vocabulary logits.
    """

    def __init__(self, embed_dim, hidden_dim, vocab_size, encoder_dim=256, num_layers=2, dropout=0.3, num_heads=4):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.dropout = nn.Dropout(dropout)
        # Inter-layer LSTM dropout is only meaningful for stacked LSTMs.
        self.lstm = nn.LSTM(embed_dim + encoder_dim, hidden_dim, num_layers,
                            batch_first=True,
                            dropout=dropout if num_layers > 1 else 0)
        self.attention = MultiHeadAttention(hidden_dim, encoder_dim, num_heads=num_heads)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def _step(self, token, states, features):
        """Run one decoding step; return (vocab logits, new LSTM states)."""
        emb = self.embedding(token).squeeze(1)
        # Attend with the top layer's current hidden state.
        context = self.attention(states[0][-1], features)
        lstm_input = torch.cat((emb, context), dim=1).unsqueeze(1)
        out, states = self.lstm(lstm_input, states)
        return self.fc(out.squeeze(1)), states

    def generate(self, features, max_len=50, start_index=1, end_index=2, beam_size=3, beam_search=True):
        """Decode a caption (list of token ids) from patch *features*.

        NOTE(review): both paths effectively assume a batch of one image;
        the `.item()` calls would fail for B > 1.
        """
        B = features.size(0)
        device = features.device

        states = (torch.zeros(self.lstm.num_layers, B, self.lstm.hidden_size, device=device),
                  torch.zeros(self.lstm.num_layers, B, self.lstm.hidden_size, device=device))

        if not beam_search:
            # Greedy decoding: always take the argmax token.
            generated = []
            current = torch.LongTensor([start_index]).to(device).unsqueeze(0)
            for _ in range(max_len):
                logits, states = self._step(current, states, features)
                predicted = logits.argmax(dim=1).item()
                generated.append(predicted)
                if predicted == end_index:
                    break
                current = torch.LongTensor([predicted]).to(device).unsqueeze(0)
            return generated

        # Beam search. FIX: start from a single seed beam — seeding with
        # beam_size identical copies (the previous behaviour) only produced
        # duplicate hypotheses on the first expansion.
        beams = [([start_index], 0.0, states)]
        for _ in range(max_len):
            new_beams = []
            for seq, log_prob, (h, c) in beams:
                # FIX: a finished hypothesis must not be extended further;
                # extending past <EOS> kept lowering its score unfairly.
                if seq[-1] == end_index:
                    new_beams.append((seq, log_prob, (h, c)))
                    continue
                current = torch.LongTensor([seq[-1]]).to(device).unsqueeze(0)
                logits, (h_new, c_new) = self._step(current, (h, c), features)
                log_probs = torch.log_softmax(logits, dim=1)
                top_log_probs, top_indices = log_probs.topk(beam_size, dim=1)
                for k in range(beam_size):
                    new_beams.append((seq + [top_indices[0, k].item()],
                                      log_prob + top_log_probs[0, k].item(),
                                      (h_new, c_new)))
            beams = sorted(new_beams, key=lambda b: b[1], reverse=True)[:beam_size]
            if all(seq[-1] == end_index for seq, _, _ in beams):
                break

        best_seq = beams[0][0]
        # Strip the leading <SOS> so callers receive content tokens only.
        if best_seq[0] == start_index:
            best_seq = best_seq[1:]
        return best_seq
140
+
141
class Model(nn.Module):
    """Thin encoder/decoder wrapper used for caption inference."""

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def generate(self, images, max_len=50):
        """Encode *images* and beam-search decode a caption token list."""
        return self.decoder.generate(self.encoder(images), max_len=max_len, beam_search=True)
151
+
152
# ---------------------------------------------------------------------------
# App setup: device, hyper-parameters, and saved artifacts.
# ---------------------------------------------------------------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EMBED_DIM = 256    # encoder output / word-embedding size (must match training)
HIDDEN_DIM = 512   # LSTM hidden size (must match training)
VOCAB_PATH = "vocab.pkl"
MODEL_PATH = "vit_lstm.pth"

print("Loading Vocabulary...")
try:
    # NOTE: pickle executes arbitrary code on load — only deploy a
    # vocab.pkl produced by your own training run.
    with open(VOCAB_PATH, "rb") as f:
        vocab = pickle.load(f)
    print(f"Vocabulary Loaded. Size: {len(vocab)}")
except FileNotFoundError:
    raise RuntimeError("vocab.pkl not found! Please upload it.")

print("Initializing Model...")
encoder = Encoder(EMBED_DIM, freeze=True)
decoder = Decoder(EMBED_DIM, HIDDEN_DIM, len(vocab))
model = Model(encoder, decoder).to(DEVICE)

print("Loading Weights...")
try:
    checkpoint = torch.load(MODEL_PATH, map_location=DEVICE)
    # Accept either a raw state_dict or a training checkpoint wrapper.
    if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        model.load_state_dict(checkpoint)
    print("Model Loaded Successfully!")
except FileNotFoundError:
    raise RuntimeError("vit_lstm.pth not found! Please upload it.")
except Exception as e:
    # Best-effort: keep serving with whatever weights we have.
    print(f"Warning loading weights: {e}")

# FIX: always switch to eval mode, even when weight loading only emitted a
# warning — previously eval() was skipped on that path, leaving dropout
# active and making captions nondeterministic.
model.eval()

# Standard ImageNet preprocessing, matching what the ViT backbone expects.
inference_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
190
+
191
def generate_caption(image):
    """Gradio handler: produce a caption string for an uploaded PIL image.

    Returns a human-readable message instead of raising, so the UI always
    shows something sensible.
    """
    if image is None:
        return "Please upload an image."

    try:
        tensor = inference_transform(image.convert("RGB")).unsqueeze(0).to(DEVICE)

        with torch.no_grad():
            token_ids = model.generate(tensor)

        # Map ids back to words, stopping at <EOS> and hiding control tokens.
        words = []
        for token_id in token_ids:
            word = vocab.itos.get(token_id, "<UNK>")
            if word == "<EOS>":
                break
            if word not in ("<SOS>", "<PAD>"):
                words.append(word)

        return " ".join(words)

    except Exception as e:
        return f"Error occurred: {str(e)}"
215
+
216
# Gradio UI wiring: a single image input mapped to a caption textbox.
iface = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=gr.Textbox(label="Generated Caption"),
    title="ViT + LSTM Image Captioning",
    description="Upload an image to generate a caption using a Vision Transformer (Encoder) and LSTM (Decoder) architecture."
)

# Launch the web app only when run as a script (Spaces imports this module).
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ torchvision
3
+ transformers
4
+ gradio
5
+ pillow
6
+ numpy