Add application file
- Dockerfile +11 -0
- __pycache__/main.cpython-312.pyc +0 -0
- __pycache__/model.cpython-312.pyc +0 -0
- debug_weights.py +167 -0
- main.py +103 -0
- model.py +224 -0
- requirements.txt +6 -0
- vocab.pkl +3 -0
Dockerfile
ADDED
@@ -0,0 +1,11 @@
FROM python:3.10

WORKDIR /app

COPY . /app

RUN pip install --no-cache-dir -r requirements.txt

EXPOSE 7860

CMD ["python", "main.py"]
__pycache__/main.cpython-312.pyc
ADDED
Binary file (3.7 kB)
__pycache__/model.cpython-312.pyc
ADDED
Binary file (10.3 kB)
debug_weights.py
ADDED
@@ -0,0 +1,167 @@
import torch
import torch.nn as nn
import torchvision.models as models
import sys
import os
import pickle
import re
from collections import Counter

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

EMBED_DIM = 512
HIDDEN_DIM = 512
MAX_LEN = 25

# Vocabulary class
class Vocabulary:
    def __init__(self, freq_threshold=5):
        self.freq_threshold = freq_threshold
        self.itos = {0: "pad", 1: "startofseq", 2: "endofseq", 3: "unk"}
        self.stoi = {v: k for k, v in self.itos.items()}
        self.index = 4

    def __len__(self):
        return len(self.itos)

    def tokenizer(self, text):
        text = text.lower()
        tokens = re.findall(r"\w+", text)
        return tokens

    def build_vocabulary(self, sentence_list):
        frequencies = Counter()
        for sentence in sentence_list:
            tokens = self.tokenizer(sentence)
            frequencies.update(tokens)

        for word, freq in frequencies.items():
            if freq >= self.freq_threshold:
                self.stoi[word] = self.index
                self.itos[self.index] = word
                self.index += 1

    def numericalize(self, text):
        tokens = self.tokenizer(text)
        numericalized = []
        for token in tokens:
            if token in self.stoi:
                numericalized.append(self.stoi[token])
            else:
                numericalized.append(self.stoi["unk"])
        return numericalized


class Encoder(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        self.backbone = nn.Sequential(*list(resnet.children())[:-1])
        self.fc = nn.Linear(resnet.fc.in_features, embed_dim)
        self.bn = nn.BatchNorm1d(embed_dim)

    def forward(self, x):
        with torch.no_grad():
            features = self.backbone(x)
        features = features.reshape(features.size(0), -1)
        features = self.bn(self.fc(features))
        return features


class Decoder(nn.Module):
    def __init__(self, embed_dim, hidden_dim, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            batch_first=True
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, states=None):
        emb = self.embedding(x)
        outputs, states = self.lstm(emb, states)
        logits = self.fc(outputs)
        return logits, states


class CaptionModel(nn.Module):
    def __init__(self, embed_dim, hidden_dim, vocab_size):
        super().__init__()
        self.encoder = Encoder(embed_dim)
        self.decoder = Decoder(embed_dim, hidden_dim, vocab_size)


# Main debug
script_dir = os.path.dirname(os.path.abspath(__file__))
CHECKPOINT_PATH = os.path.join(script_dir, "best_checkpoint.pth")
VOCAB_PATH = os.path.join(script_dir, "vocab.pkl")

print("=" * 80)
print("LOADING CHECKPOINT")
print("=" * 80)

checkpoint = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
print(f"\nCheckpoint keys: {list(checkpoint.keys())}")

print("\nCheckpoint model_state_dict keys:")
checkpoint_keys = set(checkpoint["model_state_dict"].keys())
for key in sorted(checkpoint_keys):
    shape = checkpoint["model_state_dict"][key].shape
    print(f"  {key}: {shape}")

# Load vocab
with open(VOCAB_PATH, "rb") as f:
    vocab = pickle.load(f)

vocab_size = len(vocab)
print(f"\nVocab size: {vocab_size}")

# Create model
model = CaptionModel(
    EMBED_DIM,
    HIDDEN_DIM,
    vocab_size
).to(DEVICE)

print("\n" + "=" * 80)
print("MODEL STATE DICT KEYS")
print("=" * 80)

model_keys = set(model.state_dict().keys())
for key in sorted(model_keys):
    shape = model.state_dict()[key].shape
    print(f"  {key}: {shape}")

# Check differences
print("\n" + "=" * 80)
print("COMPARISON")
print("=" * 80)

print("\nKeys in checkpoint but NOT in model:")
for key in sorted(checkpoint_keys - model_keys):
    print(f"  {key}")

print("\nKeys in model but NOT in checkpoint:")
for key in sorted(model_keys - checkpoint_keys):
    print(f"  {key}")

print("\nKeys in both but with different shapes:")
for key in sorted(checkpoint_keys & model_keys):
    cp_shape = checkpoint["model_state_dict"][key].shape
    model_shape = model.state_dict()[key].shape
    if cp_shape != model_shape:
        print(f"  {key}")
        print(f"    Checkpoint: {cp_shape}")
        print(f"    Model:      {model_shape}")

print("\n" + "=" * 80)
print("ATTEMPTING TO LOAD WEIGHTS")
print("=" * 80)

try:
    model.load_state_dict(checkpoint["model_state_dict"])
    print("SUCCESS: Weights loaded successfully!")
except Exception as e:
    print(f"ERROR: {e}")
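If the comparison reveals mismatched key names (for example, different module prefixes), a common follow-up, not part of this script, is a non-strict load so PyTorch reports what it skipped. A minimal sketch, reusing the `model` and `checkpoint` objects built above:

# Hypothetical follow-up: tolerate missing/unexpected keys and list them.
result = model.load_state_dict(checkpoint["model_state_dict"], strict=False)
print("Missing keys:", result.missing_keys)
print("Unexpected keys:", result.unexpected_keys)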
main.py
ADDED
@@ -0,0 +1,103 @@
from fastapi import FastAPI, UploadFile, File
from fastapi.middleware.cors import CORSMiddleware
from PIL import Image
import io
import torch
import pickle
import os
import uvicorn
# Import from model.py
from model import (
    Vocabulary,
    ResNetEncoder,
    DecoderLSTM,
    ImageCaptioningModel,
    generate_caption,
    transform,
    EMBED_DIM,
    HIDDEN_DIM,
)

app = FastAPI(title="Image Captioning API")

# -------------------------
# Enable CORS
# -------------------------
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# -------------------------
# Paths (relative to main.py)
# -------------------------
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

VOCAB_PATH = os.path.join(BASE_DIR, "vocab.pkl")
CHECKPOINT_PATH = os.path.join(BASE_DIR, "best_checkpoint.pth")

# -------------------------
# Load Vocabulary
# -------------------------
class CustomUnpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if name == "Vocabulary":
            return Vocabulary
        return super().find_class(module, name)

with open(VOCAB_PATH, "rb") as f:
    vocab = CustomUnpickler(f).load()

vocab_size = len(vocab)

# -------------------------
# Build Model
# -------------------------
encoder = ResNetEncoder(EMBED_DIM)
decoder = DecoderLSTM(EMBED_DIM, HIDDEN_DIM, vocab_size)

model = ImageCaptioningModel(encoder, decoder).to(DEVICE)

# -------------------------
# Load Weights
# -------------------------
checkpoint = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
model.load_state_dict(checkpoint["model_state_dict"])

model.eval()

print("✅ Model Loaded Successfully")

# -------------------------
# Health Check
# -------------------------
@app.get("/")
def root():
    return {"message": "Image Captioning API Running"}

# -------------------------
# Caption Endpoint
# -------------------------
@app.post("/caption")
async def caption_image(file: UploadFile = File(...)):
    contents = await file.read()

    image = Image.open(io.BytesIO(contents)).convert("RGB")
    image = transform(image)

    caption = generate_caption(model, image, vocab)

    return {
        "caption": caption
    }

if __name__ == "__main__":
    uvicorn.run("main:app", host="0.0.0.0", port=7860)
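For reference, a minimal client sketch for exercising the `/caption` endpoint once the server is up. It assumes the API is reachable at http://localhost:7860, that a test image exists at the placeholder path `example.jpg`, and that the `requests` package is installed (it is not listed in requirements.txt):

import requests

URL = "http://localhost:7860/caption"  # assumed local deployment

# POST the image as multipart/form-data under the field name "file",
# matching the UploadFile parameter of caption_image.
with open("example.jpg", "rb") as f:  # placeholder image path
    resp = requests.post(URL, files={"file": ("example.jpg", f, "image/jpeg")})

print(resp.json())  # e.g. {"caption": "..."}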
model.py
ADDED
@@ -0,0 +1,224 @@
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.models as models
from PIL import Image
import pickle
import sys
import os
import re
from collections import Counter

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

EMBED_DIM = 512
HIDDEN_DIM = 512
MAX_LEN = 25


# -----------------------
# Vocabulary
# -----------------------
class Vocabulary:
    def __init__(self, freq_threshold=5):
        self.freq_threshold = freq_threshold
        self.itos = {0: "pad", 1: "startofseq", 2: "endofseq", 3: "unk"}
        self.stoi = {v: k for k, v in self.itos.items()}
        self.index = 4

    def __len__(self):
        return len(self.itos)

    def tokenizer(self, text):
        text = text.lower()
        tokens = re.findall(r"\w+", text)
        return tokens

    def build_vocabulary(self, sentence_list):
        frequencies = Counter()
        for sentence in sentence_list:
            tokens = self.tokenizer(sentence)
            frequencies.update(tokens)

        for word, freq in frequencies.items():
            if freq >= self.freq_threshold:
                self.stoi[word] = self.index
                self.itos[self.index] = word
                self.index += 1

    def numericalize(self, text):
        tokens = self.tokenizer(text)
        numericalized = []
        for token in tokens:
            if token in self.stoi:
                numericalized.append(self.stoi[token])
            else:
                numericalized.append(self.stoi["unk"])
        return numericalized


# -----------------------
# Encoder
# -----------------------
class ResNetEncoder(nn.Module):
    def __init__(self, embed_dim):
        super().__init__()
        resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        for param in resnet.parameters():
            param.requires_grad = True
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)

        self.fc = nn.Linear(resnet.fc.in_features, embed_dim)
        self.batch_norm = nn.BatchNorm1d(embed_dim, momentum=0.01)

    def forward(self, images):
        with torch.no_grad():
            features = self.resnet(images)  # (batch_size, 2048, 1, 1)
        features = features.view(features.size(0), -1)
        features = self.fc(features)
        features = self.batch_norm(features)
        return features


# -----------------------
# Decoder
# -----------------------
class DecoderLSTM(nn.Module):
    def __init__(self, embed_dim, hidden_dim, vocab_size, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, features, captions):
        # remove the last token for input
        captions_in = captions[:, :-1]
        emb = self.embedding(captions_in)
        features = features.unsqueeze(1)
        lstm_input = torch.cat((features, emb), dim=1)
        outputs, _ = self.lstm(lstm_input)
        logits = self.fc(outputs)
        return logits


# -----------------------
# Caption Model
# -----------------------
class ImageCaptioningModel(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, images, captions):
        features = self.encoder(images)
        outputs = self.decoder(features, captions)
        return outputs


# -----------------------
# Caption generator
# -----------------------
def generate_caption(model, image, vocab):
    model.eval()

    image = image.unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        # Get image features
        features = model.encoder(image)  # (1, embed_dim)

        # Start with the start token
        word_idx = vocab.stoi["startofseq"]
        sentence = []

        # Initialize hidden state for LSTM
        h = None

        for _ in range(MAX_LEN):
            # Create input: concatenate features with embedding of previous word
            word_tensor = torch.tensor([word_idx]).to(DEVICE)
            emb = model.decoder.embedding(word_tensor)  # (1, embed_dim)

            if h is None:
                # First step: concatenate features with embedding
                lstm_input = torch.cat([features.unsqueeze(1), emb.unsqueeze(1)], dim=1)  # (1, 2, embed_dim)
            else:
                lstm_input = emb.unsqueeze(1)  # (1, 1, embed_dim)

            # Forward through LSTM
            output, h_new = model.decoder.lstm(lstm_input, h)
            h = h_new

            # Predict next token
            logits = model.decoder.fc(output[:, -1, :])  # (1, vocab_size)
            predicted = logits.argmax(1).item()

            # Get token from vocab
            token = vocab.itos[predicted]

            if token == "endofseq":
                break

            sentence.append(token)
            word_idx = predicted

    return " ".join(sentence)


# -----------------------
# Image transform
# -----------------------
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])


# -----------------------
# Main
# -----------------------
def main():
    image_path = sys.argv[1]

    # Get the directory where this script is located
    script_dir = os.path.dirname(os.path.abspath(__file__))
    CHECKPOINT_PATH = os.path.join(script_dir, "best_checkpoint.pth")
    VOCAB_PATH = os.path.join(script_dir, "vocab.pkl")

    # load vocab
    with open(VOCAB_PATH, "rb") as f:
        vocab = pickle.load(f)

    vocab_size = len(vocab)

    # rebuild model
    encoder = ResNetEncoder(EMBED_DIM)
    decoder = DecoderLSTM(EMBED_DIM, HIDDEN_DIM, vocab_size)
    model = ImageCaptioningModel(encoder, decoder).to(DEVICE)

    # load checkpoint
    checkpoint = torch.load(CHECKPOINT_PATH, map_location=DEVICE)

    model.load_state_dict(checkpoint["model_state_dict"])

    model.eval()

    # load image
    img = Image.open(image_path).convert("RGB")
    img = transform(img)

    caption = generate_caption(model, img, vocab)

    print("\nCaption:", caption)


if __name__ == "__main__":
    main()
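As a quick sanity check of the Vocabulary round-trip, a small sketch using a toy corpus (freq_threshold=1 so every word survives; the shipped vocab.pkl was built with the default threshold of 5):

from model import Vocabulary

vocab = Vocabulary(freq_threshold=1)
vocab.build_vocabulary(["a dog runs on grass", "a dog plays"])

# Known words map to their indices; unseen words fall back to "unk" (index 3).
ids = vocab.numericalize("a dog jumps")
print(ids)                           # [4, 5, 3]
print([vocab.itos[i] for i in ids])  # ['a', 'dog', 'unk']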
requirements.txt
ADDED
@@ -0,0 +1,6 @@
fastapi
uvicorn
torch
torchvision
pillow
python-multipart
vocab.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c3878a91256421ba64776cf69d22693c0a37e49d0303d84d8853c1c5ca937452
size 174488
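vocab.pkl is tracked with Git LFS, so the repository stores only this pointer; the 174 kB pickle itself is fetched on checkout. A sketch for verifying a fetched copy against the oid above (assuming vocab.pkl sits in the working directory):

import hashlib

# sha256 recorded in the LFS pointer above.
EXPECTED_OID = "c3878a91256421ba64776cf69d22693c0a37e49d0303d84d8853c1c5ca937452"

with open("vocab.pkl", "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()

print("OK" if digest == EXPECTED_OID else f"mismatch: {digest}")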