Spaces:

RishabA
/

image-captioning

Sleeping

App Files Files Community

RishabA committed on Apr 7, 2025

Commit

2978074

verified ·

1 Parent(s): 42296ae

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -28

app.py CHANGED Viewed

@@ -13,7 +13,6 @@ n_layers = 6
 n_heads = 8
 tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
 transform = transforms.Compose(
     [
         transforms.Resize(image_size),
@@ -23,10 +22,9 @@ transform = transforms.Compose(
     ]
 )
-# Instantiate your model
 model = CaptioningTransformer(
     image_size=image_size,
-    in_channels=3,  # RGB images
     vocab_size=tokenizer.vocab_size,
     device=device,
     patch_size=patch_size,
@@ -35,53 +33,51 @@ model = CaptioningTransformer(
     n_heads=n_heads,
 ).to(device)
-# Load your pre-trained weights (make sure the .pt file is in your repo)
 model_path = "image_captioning_model.pt"
 model.load_state_dict(torch.load(model_path, map_location=device))
 model.eval()
-# This is your existing inference function (you can modify as needed)
-def make_prediction(model, sos_token, eos_token, image, max_len, temp, device):
-    log_tokens = [sos_token]  # Start with the start-of-sequence token
     with torch.inference_mode():
-        # Get image embeddings from the encoder
         image_embedding = model.encoder(image.to(device))
         for _ in range(max_len):
             input_tokens = torch.cat(log_tokens, dim=1)
             data_pred = model.decoder(input_tokens.to(device), image_embedding)
-            # Get the logits for the most recent token only
             dist = torch.distributions.Categorical(logits=data_pred[:, -1] / temp)
             next_tokens = dist.sample().reshape(1, 1)
             log_tokens.append(next_tokens.cpu())
-            if next_tokens.item() == 102:  # Assuming 102 is your [SEP] token
                 break
     return torch.cat(log_tokens, dim=1)
-# Define the Gradio prediction function
 def predict(image: Image.Image):
-    # Preprocess the image
-    img_tensor = transform(image).unsqueeze(0)  # Shape: (1, 3, image_size, image_size)
-    # Create a start-of-sequence token (assuming 101 is your [CLS] token)
     sos_token = 101 * torch.ones(1, 1).long().to(device)
-    # Generate caption tokens using your inference function
-    tokens = make_prediction(
-        model, sos_token, 102, img_tensor, max_len=50, temp=0.5, device=device
-    )
-    # Decode tokens to text (skipping special tokens)
     caption = tokenizer.decode(tokens[0], skip_special_tokens=True)
     return caption
-# Create a Gradio interface
-iface = gr.Interface(
-    fn=predict,
-    inputs=gr.Image(type="pil"),
-    outputs="text",
-    title="Image Captioning Model",
-    description="Upload an image and get a caption generated by the model.",
-)
 if __name__ == "__main__":
-    iface.launch()

 n_heads = 8
 tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
 transform = transforms.Compose(
     [
         transforms.Resize(image_size),
     ]
 )
 model = CaptioningTransformer(
     image_size=image_size,
+    in_channels=3,
     vocab_size=tokenizer.vocab_size,
     device=device,
     patch_size=patch_size,
     n_heads=n_heads,
 ).to(device)
 model_path = "image_captioning_model.pt"
 model.load_state_dict(torch.load(model_path, map_location=device))
 model.eval()
+def make_prediction(
+    model, sos_token, eos_token, image, max_len=50, temp=0.5, device=device
+):
+    """Sample a caption token sequence for one image.
+
+    The image is encoded once with ``model.encoder``; ``model.decoder`` is
+    then called repeatedly on the tokens generated so far, and the next token
+    is sampled from the temperature-scaled logits of the last position.
+
+    Args:
+        model: Object exposing ``encoder(image)`` and
+            ``decoder(tokens, image_embedding)``.
+        sos_token: Tensor holding the start token; it is concatenated along
+            dim 1 with the ``(1, 1)`` sampled tokens.
+        eos_token: Integer token id that ends generation when it is sampled.
+        image: Image tensor passed to the encoder.
+        max_len: Maximum number of tokens to sample.
+        temp: Divisor applied to the logits before sampling.
+        device: Device the encoder and decoder inputs are moved to.
+
+    Returns:
+        CPU tensor holding the start token followed by every sampled token
+        (including ``eos_token`` when it was sampled).
+    """
+    # Sampled tokens are stored on the CPU, so keep the start token there as
+    # well; torch.cat rejects a list that mixes CPU and accelerator tensors.
+    log_tokens = [sos_token.cpu()]
     with torch.inference_mode():
         image_embedding = model.encoder(image.to(device))
         for _ in range(max_len):
             input_tokens = torch.cat(log_tokens, dim=1)
             data_pred = model.decoder(input_tokens.to(device), image_embedding)
+            # Only the logits of the most recent position pick the next token.
             dist = torch.distributions.Categorical(logits=data_pred[:, -1] / temp)
+            # reshape(1, 1) means exactly one sequence is generated per call.
             next_tokens = dist.sample().reshape(1, 1)
             log_tokens.append(next_tokens.cpu())
+            # Stop as soon as the caller-supplied end token is sampled.
+            if next_tokens.item() == eos_token:
                 break
     return torch.cat(log_tokens, dim=1)
 def predict(image: Image.Image):
+    """Generate a caption string for an uploaded PIL image.
+
+    Returns an empty string when no image was provided.
+    """
+    # Gradio passes None when the button is clicked without an image.
+    if image is None:
+        return ""
+    # unsqueeze(0) adds the leading batch dimension.
+    img_tensor = transform(image).unsqueeze(0)
+    # 101 / 102 are presumably the tokenizer's [CLS] / [SEP] ids, used here
+    # as start and end tokens -- TODO confirm against the tokenizer.
     sos_token = 101 * torch.ones(1, 1).long().to(device)
+    tokens = make_prediction(model, sos_token, 102, img_tensor)
     caption = tokenizer.decode(tokens[0], skip_special_tokens=True)
     return caption
+# Gradio Blocks UI: one row with two columns -- the image input and the
+# "Generate Caption" button in the first, the caption textbox in the second.
+with gr.Blocks(css=".block-title { font-size: 24px; font-weight: bold; }") as demo:
+    gr.Markdown("<div class='block-title'>Image Captioning with PyTorch</div>")
+    gr.Markdown("Upload an image and get a descriptive caption about the image:")
+    with gr.Row():
+        with gr.Column():
+            image_input = gr.Image(type="pil", label="Your Image")
+            generate_button = gr.Button("Generate Caption")
+        with gr.Column():
+            caption_output = gr.Textbox(
+                label="Caption Output",
+                placeholder="Your generated caption will appear here...",
+            )
+    # Clicking the button calls predict with the uploaded image and writes
+    # the returned caption into the textbox.
+    generate_button.click(fn=predict, inputs=image_input, outputs=caption_output)
 if __name__ == "__main__":
+    # NOTE(review): share=True requests a public share link; confirm it is
+    # still wanted when the app is hosted on Hugging Face Spaces.
+    demo.launch(share=True)