first commit
- app.py +124 -0
- dataset.py +182 -0
- predict.py +158 -0
- train.py +123 -0
app.py
ADDED
@@ -0,0 +1,124 @@
import gradio as gr
import torch
import torch.nn as nn
from torchvision.models import resnet50, ResNet50_Weights
from torchvision import transforms
from PIL import Image


class Vocabulary:
    def __init__(self):
        self.itos = {}
        self.stoi = {}

    def load(self, stoi, itos):
        self.stoi = stoi
        self.itos = itos


class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = resnet50(weights=ResNet50_Weights.DEFAULT)
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        with torch.no_grad():
            features = self.resnet(images)
        features = features.view(features.size(0), -1)
        features = self.linear(features)
        features = self.bn(features)
        return features


class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        embeddings = self.embed(captions)
        inputs = torch.cat((features.unsqueeze(1), embeddings), 1)
        hiddens, _ = self.lstm(inputs)
        outputs = self.linear(hiddens)
        return outputs

    def sample(self, features, vocab, max_len=30):
        output_ids = []
        states = None

        inputs = features.unsqueeze(1)

        for _ in range(max_len):
            hiddens, states = self.lstm(inputs, states)
            outputs = self.linear(hiddens.squeeze(1))
            predicted = outputs.argmax(1)
            output_ids.append(predicted.item())

            if vocab.itos[predicted.item()] == "<end>":
                break

            inputs = self.embed(predicted).unsqueeze(1)

        return output_ids


checkpoint_path = "./checkpoints/caption_model_epoch30.pth"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

embed_size = 256
hidden_size = 512
num_layers = 1

checkpoint = torch.load(checkpoint_path, map_location=device)

vocab = Vocabulary()
vocab.load(checkpoint['vocab_stoi'], checkpoint['vocab_itos'])
vocab_size = len(vocab.stoi)

encoder = EncoderCNN(embed_size).to(device)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)

encoder.load_state_dict(checkpoint['encoder_state_dict'])
decoder.load_state_dict(checkpoint['decoder_state_dict'])

encoder.eval()
decoder.eval()

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])


def generate_caption(image):
    image = Image.fromarray(image).convert("RGB")
    image = transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        features = encoder(image)
        output_ids = decoder.sample(features, vocab)

    caption = []
    for idx in output_ids:
        word = vocab.itos[idx]
        if word == "<end>":
            break
        caption.append(word)

    return ' '.join(caption)


demo = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="numpy"),
    outputs="text",
    title="Skin Disease Image Captioning",
    description="Upload an image of a skin disease to generate a descriptive caption using your trained model."
)

if __name__ == "__main__":
    demo.launch()
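For a quick smoke test without opening the browser UI, generate_caption can be called directly, since the Gradio input is a numpy array. A minimal sketch, assuming app.py has been imported or run; sample.jpg is a placeholder path, not a file in this repo:

import numpy as np
from PIL import Image

# Placeholder path: point this at any RGB image on disk.
arr = np.array(Image.open("sample.jpg").convert("RGB"))
print(generate_caption(arr))  # greedy-decoded caption as a string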
dataset.py
ADDED
@@ -0,0 +1,182 @@
import os
import json
from PIL import Image
from collections import Counter

import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torchvision.transforms as transforms
import spacy

# ===== Load spaCy English tokenizer =====
spacy_eng = spacy.load("en_core_web_sm")


class Vocabulary:
    def __init__(self, freq_threshold):
        """
        freq_threshold: minimum word frequency to keep in vocab
        """
        self.freq_threshold = freq_threshold

        self.itos = {0: "<pad>", 1: "<start>", 2: "<end>", 3: "<unk>"}
        self.stoi = {v: k for k, v in self.itos.items()}

    def __len__(self):
        return len(self.itos)

    @staticmethod
    def tokenizer_eng(text):
        """
        Uses the spaCy tokenizer to split a sentence into a list of tokens.
        """
        return [tok.text.lower() for tok in spacy_eng.tokenizer(text)]

    def build_vocabulary(self, sentence_list):
        """
        Builds vocab: {word -> index} for all words with freq >= threshold
        """
        frequencies = Counter()
        idx = 4  # Start indexing after special tokens

        for sentence in sentence_list:
            tokens = self.tokenizer_eng(sentence)
            frequencies.update(tokens)

        for word, freq in frequencies.items():
            if freq >= self.freq_threshold:
                self.stoi[word] = idx
                self.itos[idx] = word
                idx += 1

    def numericalize(self, text):
        """
        Converts a text caption to a list of vocab indices.
        """
        tokenized_text = self.tokenizer_eng(text)
        return [
            self.stoi.get(token, self.stoi["<unk>"])
            for token in tokenized_text
        ]


class CaptionDataset(Dataset):
    def __init__(self, images_dir, captions_file, vocab, transform=None):
        """
        images_dir: path to images/train or images/val
        captions_file: JSON file
        vocab: Vocabulary object
        transform: torchvision transform
        """
        self.images_dir = images_dir
        self.vocab = vocab
        self.transform = transform

        # Load JSON
        with open(captions_file, 'r') as f:
            data = json.load(f)

        self.images = data["images"]
        self.annotations = data["annotations"]

        # Create map: image_id -> file_name
        self.id_to_filename = {img["id"]: img["file_name"] for img in self.images}

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        ann = self.annotations[index]
        image_id = ann["image_id"]
        caption = ann["caption"]

        # Build image path
        img_path = os.path.join(self.images_dir, self.id_to_filename[image_id])

        # Open image
        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        # Numericalize caption + add <start> and <end> tokens
        numericalized_caption = [self.vocab.stoi["<start>"]]
        numericalized_caption += self.vocab.numericalize(caption)
        numericalized_caption.append(self.vocab.stoi["<end>"])

        return image, torch.tensor(numericalized_caption)


def build_vocab_from_json(captions_file, freq_threshold):
    """
    Builds a Vocabulary object from a JSON file.
    """
    with open(captions_file, 'r') as f:
        data = json.load(f)

    all_captions = [ann["caption"] for ann in data["annotations"]]

    vocab = Vocabulary(freq_threshold)
    vocab.build_vocabulary(all_captions)

    return vocab


def my_collate_fn(batch):
    """
    Custom collate_fn for variable-length captions:
    pads captions in the batch to the max length in the batch.
    """
    images = []
    captions = []

    for img, cap in batch:
        images.append(img)
        captions.append(cap)

    images = torch.stack(images, dim=0)
    captions = pad_sequence(captions, batch_first=True, padding_value=0)  # pad with <pad> token idx 0

    return images, captions


# ====== Test block ======
if __name__ == "__main__":
    # === Paths ===
    captions_train_json = "./Dataset/annotations/captions_train.json"
    images_train_dir = "./Dataset/images/train/"

    # === Build vocab ===
    vocab = build_vocab_from_json(captions_train_json, freq_threshold=2)
    print(f"Vocab size: {len(vocab)}")

    # === Transforms ===
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])

    # === Create dataset ===
    train_dataset = CaptionDataset(
        images_dir=images_train_dir,
        captions_file=captions_train_json,
        vocab=vocab,
        transform=transform
    )

    # === DataLoader with custom collate_fn ===
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=4,
        shuffle=True,
        collate_fn=my_collate_fn  # ✅ REQUIRED for variable-length captions
    )

    # === Test loop ===
    for idx, (images, captions) in enumerate(train_loader):
        print(f"\nBatch {idx + 1}")
        print("Images shape:", images.shape)      # [B, 3, H, W]
        print("Captions shape:", captions.shape)  # [B, T] (padded)
        print("Sample caption:", captions[0])
        break  # one batch test only
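CaptionDataset expects a COCO-style annotations file: an "images" list mapping id to file_name, and an "annotations" list pairing image_id with caption. The sketch below writes a minimal example of that layout; the id, file name, and caption text are invented for illustration:

import json

example = {
    "images": [
        {"id": 1, "file_name": "Acne/Acne1.jpg"}  # illustrative file name
    ],
    "annotations": [
        {"image_id": 1, "caption": "a close-up photo of inflamed acne lesions"}  # illustrative caption
    ]
}

with open("captions_example.json", "w") as f:
    json.dump(example, f, indent=2)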
predict.py
ADDED
@@ -0,0 +1,158 @@
import torch
import torch.nn as nn
from torchvision.models import resnet50, ResNet50_Weights
from torchvision import transforms
from PIL import Image

import os

# ===========
# Vocabulary
# ===========

class Vocabulary:
    def __init__(self):
        self.itos = {}
        self.stoi = {}

    def load(self, stoi, itos):
        self.stoi = stoi
        self.itos = itos

# ===========
# Encoder
# ===========

class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = resnet50(weights=ResNet50_Weights.DEFAULT)
        modules = list(resnet.children())[:-1]
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        with torch.no_grad():
            features = self.resnet(images)               # [B, 2048, 1, 1]
        # view() keeps the batch dim even when B == 1, unlike .squeeze()
        features = features.view(features.size(0), -1)   # [B, 2048]
        features = self.linear(features)                 # [B, embed_size]
        features = self.bn(features)                     # [B, embed_size]
        return features

# ===========
# Decoder
# ===========

class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        embeddings = self.embed(captions)
        inputs = torch.cat((features.unsqueeze(1), embeddings), 1)
        hiddens, _ = self.lstm(inputs)
        outputs = self.linear(hiddens)
        return outputs

    def sample(self, features, vocab, max_len=30):
        """
        Generates a caption for given image features using greedy search.
        """
        output_ids = []
        states = None

        inputs = features.unsqueeze(1)  # [B, 1, embed_size]

        for _ in range(max_len):
            hiddens, states = self.lstm(inputs, states)  # [B, 1, hidden]
            outputs = self.linear(hiddens.squeeze(1))    # [B, vocab_size]
            predicted = outputs.argmax(1)                # [B]
            output_ids.append(predicted.item())

            if vocab.itos[predicted.item()] == "<end>":
                break

            inputs = self.embed(predicted).unsqueeze(1)

        return output_ids

# ===========
# Predict block
# ===========

def predict(image_path, checkpoint_path):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    embed_size = 256
    hidden_size = 512
    num_layers = 1

    # === Load checkpoint ===
    checkpoint = torch.load(checkpoint_path, map_location=device)

    # === Load vocab ===
    vocab = Vocabulary()
    vocab.load(checkpoint['vocab_stoi'], checkpoint['vocab_itos'])
    vocab_size = len(vocab.stoi)

    # === Load models ===
    encoder = EncoderCNN(embed_size).to(device)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)

    encoder.load_state_dict(checkpoint['encoder_state_dict'])
    decoder.load_state_dict(checkpoint['decoder_state_dict'])

    encoder.eval()
    decoder.eval()

    # === Image transform ===
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor()
    ])

    image = Image.open(image_path).convert("RGB")
    image = transform(image).unsqueeze(0).to(device)  # [1, 3, 224, 224]

    # === Encode ===
    features = encoder(image)

    # === Decode ===
    output_ids = decoder.sample(features, vocab)

    # === Convert IDs to words ===
    caption = []
    for idx in output_ids:
        word = vocab.itos[idx]
        if word == "<end>":
            break
        caption.append(word)

    final_caption = ' '.join(caption)
    print(f"\n📝 Predicted caption: {final_caption}\n")


if __name__ == "__main__":
    # ✅ Change these!
    image_path = r"C:\Users\Jayasimma D\Documents\Skin_Disease_Captioning\Dataset\images\train\Albinism\Albinism2.jpg"  # 🔍 your test image path
    checkpoint_path = "./checkpoints/caption_model_epoch5.pth"

    predict(image_path, checkpoint_path)
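If load_state_dict fails here, the usual cause is a checkpoint whose vocabulary size differs from the decoder being built. A quick sanity check, assuming the checkpoint layout produced by train.py below:

import torch

ckpt = torch.load("./checkpoints/caption_model_epoch5.pth", map_location="cpu")
print(list(ckpt.keys()))        # expect encoder/decoder/optimizer state dicts plus vocab_stoi/vocab_itos
print(len(ckpt["vocab_stoi"]))  # must match the vocab_size used to build DecoderRNN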
train.py
ADDED
@@ -0,0 +1,123 @@
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.models import resnet50, ResNet50_Weights
import torchvision.transforms as transforms

from dataset import build_vocab_from_json, CaptionDataset, my_collate_fn


class EncoderCNN(nn.Module):
    def __init__(self, embed_size):
        super(EncoderCNN, self).__init__()
        resnet = resnet50(weights=ResNet50_Weights.DEFAULT)
        modules = list(resnet.children())[:-1]  # remove FC layer
        self.resnet = nn.Sequential(*modules)
        self.linear = nn.Linear(resnet.fc.in_features, embed_size)
        self.bn = nn.BatchNorm1d(embed_size, momentum=0.01)

    def forward(self, images):
        with torch.no_grad():
            features = self.resnet(images)  # frozen backbone: [B, 2048, 1, 1]
        # flatten to [B, 2048]; unlike .squeeze(), view() keeps the batch dim when B == 1
        features = features.view(features.size(0), -1)
        features = self.linear(features)
        features = self.bn(features)
        return features


class DecoderRNN(nn.Module):
    def __init__(self, embed_size, hidden_size, vocab_size, num_layers=1):
        super(DecoderRNN, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        embeddings = self.embed(captions[:, :-1])  # Exclude <end>
        inputs = torch.cat((features.unsqueeze(1), embeddings), 1)  # Add image feature at t=0
        hiddens, _ = self.lstm(inputs)
        outputs = self.linear(hiddens)
        return outputs


embed_size = 256
hidden_size = 512
num_layers = 1
learning_rate = 3e-4
num_epochs = 30
batch_size = 8

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

captions_train_json = "./Dataset/annotations/captions_train.json"
images_train_dir = "./Dataset/images/train/"

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

vocab = build_vocab_from_json(captions_train_json, freq_threshold=2)
vocab_size = len(vocab)

train_dataset = CaptionDataset(
    images_dir=images_train_dir,
    captions_file=captions_train_json,
    vocab=vocab,
    transform=transform
)

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,  # avoid a final batch of size 1, which BatchNorm1d rejects in train mode
    collate_fn=my_collate_fn
)


encoder = EncoderCNN(embed_size).to(device)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=0)  # ignore <pad> (index 0) in the loss
params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())
optimizer = optim.Adam(params, lr=learning_rate)


encoder.train()
decoder.train()

os.makedirs("checkpoints", exist_ok=True)

for epoch in range(num_epochs):
    for idx, (imgs, captions) in enumerate(train_loader):
        imgs, captions = imgs.to(device), captions.to(device)

        features = encoder(imgs)
        outputs = decoder(features, captions)

        outputs = outputs[:, 1:, :]  # drop the prediction made at the image step -> [B, T-1, vocab_size]

        outputs = outputs.reshape(-1, vocab_size)
        targets = captions[:, 1:].reshape(-1)  # shift targets past <start>

        loss = criterion(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if idx % 50 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}] Batch [{idx}/{len(train_loader)}] Loss: {loss.item():.4f}")

    torch.save({
        'epoch': epoch + 1,
        'encoder_state_dict': encoder.state_dict(),
        'decoder_state_dict': decoder.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'vocab_stoi': vocab.stoi,
        'vocab_itos': vocab.itos,
    }, f"checkpoints/caption_model_epoch{epoch+1}.pth")

    print(f"✅ Saved model to checkpoints/caption_model_epoch{epoch+1}.pth")

print("Training complete ✅")
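Each checkpoint also stores the optimizer state and the epoch counter, so an interrupted run can be resumed. A minimal sketch under that assumption; the epoch-10 path is illustrative, and the encoder, decoder, optimizer, and vocab are assumed to be rebuilt exactly as above first:

# Illustrative resume path; any checkpoint saved by the loop above works.
ckpt = torch.load("checkpoints/caption_model_epoch10.pth", map_location=device)
encoder.load_state_dict(ckpt['encoder_state_dict'])
decoder.load_state_dict(ckpt['decoder_state_dict'])
optimizer.load_state_dict(ckpt['optimizer_state_dict'])
start_epoch = ckpt['epoch']  # then loop: for epoch in range(start_epoch, num_epochs)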