Spaces:
Sleeping
Sleeping
Agelakis - Unit 8 Assignment
Browse files- app.py +40 -115
- requirements.txt +4 -4
app.py
CHANGED
|
@@ -1,115 +1,40 @@
|
|
| 1 |
-
import gradio as gr
|
| 2 |
-
from transformers import
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
#
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
)
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
image : Image uploaded by the user.
|
| 42 |
-
question: Optional natural-language question about the image.
|
| 43 |
-
labels : Optional comma-separated classification labels for CLIP.
|
| 44 |
-
|
| 45 |
-
Returns:
|
| 46 |
-
caption (str) : Generated caption for the image.
|
| 47 |
-
vqa_answer (str) : Answer to the user's question.
|
| 48 |
-
clip_output (str) : Zero-shot classification probabilities.
|
| 49 |
-
"""
|
| 50 |
-
|
| 51 |
-
# -----------------------------
|
| 52 |
-
# IMAGE CAPTIONING USING BLIP
|
| 53 |
-
# -----------------------------
|
| 54 |
-
caption_result = caption_pipeline(image)
|
| 55 |
-
caption = caption_result[0]["generated_text"] # extract caption text
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
# ----------------------------------------------------
|
| 59 |
-
# VISUAL QUESTION ANSWERING (only if question given)
|
| 60 |
-
# ----------------------------------------------------
|
| 61 |
-
if question and question.strip(): # check if the user provided a question
|
| 62 |
-
vqa_result = vqa_pipeline(image=image, question=question)
|
| 63 |
-
vqa_answer = vqa_result[0]["answer"]
|
| 64 |
-
else:
|
| 65 |
-
vqa_answer = "No question provided."
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
# ----------------------------------------------------
|
| 69 |
-
# ZERO-SHOT IMAGE CLASSIFICATION USING CLIP
|
| 70 |
-
# ----------------------------------------------------
|
| 71 |
-
if labels and labels.strip(): # ensure labels exist
|
| 72 |
-
# Convert comma-separated text into clean list of labels
|
| 73 |
-
candidate_labels = [l.strip() for l in labels.split(",") if l.strip()]
|
| 74 |
-
|
| 75 |
-
if candidate_labels:
|
| 76 |
-
# CLIP requires parameter name 'images=' instead of 'image'
|
| 77 |
-
clip_result = clip_pipeline(images=image, candidate_labels=candidate_labels)
|
| 78 |
-
|
| 79 |
-
# Format classification scores nicely for display
|
| 80 |
-
clip_output = "\n".join(
|
| 81 |
-
f"{item['label']}: {round(item['score'] * 100, 1)}%"
|
| 82 |
-
for item in clip_result
|
| 83 |
-
)
|
| 84 |
-
else:
|
| 85 |
-
clip_output = "No valid labels provided."
|
| 86 |
-
else:
|
| 87 |
-
clip_output = "No labels provided."
|
| 88 |
-
|
| 89 |
-
# Return results of all three AI tasks
|
| 90 |
-
return caption, vqa_answer, clip_output
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
# ----------------------------------------------------------
|
| 94 |
-
# CREATE THE GRADIO USER INTERFACE
|
| 95 |
-
# ----------------------------------------------------------
|
| 96 |
-
# ----------------------------------------------------------
# Gradio UI: wire the multimodal pipeline into a web form.
# ----------------------------------------------------------
ui_inputs = [
    gr.Image(type="pil", label="Upload an image"),
    gr.Textbox(label="Ask a question about the image (optional)"),
    gr.Textbox(
        label="Enter CLIP classification labels (comma-separated)",
        placeholder="e.g., man, boy, park, snow, happiness",
    ),
]
ui_outputs = [
    gr.Textbox(label="Generated Caption"),
    gr.Textbox(label="VQA Answer"),
    gr.Textbox(label="CLIP Classification Scores"),
]

demo = gr.Interface(
    fn=process_image,  # runs captioning, VQA and CLIP classification
    inputs=ui_inputs,
    outputs=ui_outputs,
    title="Multimodal AI — Captioning + VQA + Zero-Shot Classification",
)

# Serve the app (Hugging Face Spaces picks this up automatically).
demo.launch()
|
|
|
|
| 1 |
+
import gradio as gr
import torch
from PIL import Image
from transformers import AutoTokenizer, ViTImageProcessor, VisionEncoderDecoderModel

# --------------------------------------------------------------------------
# Model setup: ViT encoder + GPT-2 decoder checkpoint for image captioning.
# Weights are fetched from the Hugging Face Hub on first run.
# --------------------------------------------------------------------------
model_name = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(model_name)
processor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Prefer the GPU when one is available; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
| 14 |
+
|
| 15 |
+
# Caption function
|
| 16 |
+
# Caption function
def predict_caption(image, max_length=32, num_beams=4):
    """Generate a natural-language caption for *image*.

    Parameters
    ----------
    image : PIL.Image.Image or None
        Image supplied by the Gradio UI; ``None`` when nothing was uploaded.
    max_length : int, optional
        Maximum number of tokens to generate (default 32, matching the
        previous hard-coded value).
    num_beams : int, optional
        Beam-search width used during generation (default 4, matching the
        previous hard-coded value).

    Returns
    -------
    str
        The generated caption (whitespace-stripped), or a short prompt
        asking the user to upload an image when *image* is ``None``.
    """
    if image is None:
        return "Upload an image."
    # The ViT image processor expects 3-channel RGB input; convert
    # e.g. RGBA or grayscale uploads before preprocessing.
    if image.mode != "RGB":
        image = image.convert("RGB")

    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)

    # Inference only — disable autograd bookkeeping for speed and memory.
    with torch.no_grad():
        output_ids = model.generate(
            pixel_values, max_length=max_length, num_beams=num_beams
        )

    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return caption.strip()
|
| 29 |
+
|
| 30 |
+
# UI
|
| 31 |
+
# --- User interface --------------------------------------------------------
image_box = gr.Image(type="pil", label="Upload Image")
caption_box = gr.Textbox(label="Caption")

demo = gr.Interface(
    fn=predict_caption,
    inputs=image_box,
    outputs=caption_box,
    title="AI Image Captioning",
    description="Upload an image to get an AI-generated caption.",
)

# Start the web server only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
gradio
|
| 2 |
-
transformers
|
| 3 |
-
torch
|
| 4 |
-
Pillow
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
transformers
|
| 3 |
+
torch
|
| 4 |
+
Pillow
|