ProfRom committed (verified)
Commit 20351ba · 1 parent: 5fc8594

Agelakis - Unit 8 Assignment

Files changed (2):
  1. app.py  +40 -115
  2. requirements.txt  +4 -4
app.py CHANGED
@@ -1,115 +1,40 @@
- import gradio as gr
- from transformers import pipeline
-
- # ----------------------------------------------------------
- # LOAD ALL THREE MODELS USED IN THE MULTIMODAL AI SYSTEM
- # ----------------------------------------------------------
-
- # 1. BLIP Image Captioning Model
- #    - Takes an image as input and generates a natural language
- #      description of its contents.
- caption_pipeline = pipeline(
-     task="image-to-text",
-     model="Salesforce/blip-image-captioning-base"
- )
-
- # 2. BLIP Visual Question Answering Model
- #    - Takes an image AND a natural language question as input.
- #    - Produces a short text answer based on the image content.
- vqa_pipeline = pipeline(
-     task="visual-question-answering",
-     model="Salesforce/blip-vqa-base"
- )
-
- # 3. CLIP Zero-Shot Image Classification Model
- #    - Compares the image with a list of user-provided labels.
- #    - Returns a probability score for each label without training.
- clip_pipeline = pipeline(
-     task="zero-shot-image-classification",
-     model="openai/clip-vit-base-patch32"
- )
-
-
- # ----------------------------------------------------------
- # PROCESS FUNCTION — RUNS ALL THREE AI TASKS
- # ----------------------------------------------------------
- def process_image(image, question, labels):
-     """
-     Runs captioning, VQA, and zero-shot classification on the input image.
-
-     Parameters:
-         image   : Image uploaded by the user.
-         question: Optional natural-language question about the image.
-         labels  : Optional comma-separated classification labels for CLIP.
-
-     Returns:
-         caption (str)     : Generated caption for the image.
-         vqa_answer (str)  : Answer to the user's question.
-         clip_output (str) : Zero-shot classification probabilities.
-     """
-
-     # -----------------------------
-     # IMAGE CAPTIONING USING BLIP
-     # -----------------------------
-     caption_result = caption_pipeline(image)
-     caption = caption_result[0]["generated_text"]  # extract caption text
-
-
-     # ----------------------------------------------------
-     # VISUAL QUESTION ANSWERING (only if question given)
-     # ----------------------------------------------------
-     if question and question.strip():  # check if the user provided a question
-         vqa_result = vqa_pipeline(image=image, question=question)
-         vqa_answer = vqa_result[0]["answer"]
-     else:
-         vqa_answer = "No question provided."
-
-
-     # ----------------------------------------------------
-     # ZERO-SHOT IMAGE CLASSIFICATION USING CLIP
-     # ----------------------------------------------------
-     if labels and labels.strip():  # ensure labels exist
-         # Convert comma-separated text into clean list of labels
-         candidate_labels = [l.strip() for l in labels.split(",") if l.strip()]
-
-         if candidate_labels:
-             # CLIP requires parameter name 'images=' instead of 'image'
-             clip_result = clip_pipeline(images=image, candidate_labels=candidate_labels)
-
-             # Format classification scores nicely for display
-             clip_output = "\n".join(
-                 f"{item['label']}: {round(item['score'] * 100, 1)}%"
-                 for item in clip_result
-             )
-         else:
-             clip_output = "No valid labels provided."
-     else:
-         clip_output = "No labels provided."
-
-     # Return results of all three AI tasks
-     return caption, vqa_answer, clip_output
-
-
- # ----------------------------------------------------------
- # CREATE THE GRADIO USER INTERFACE
- # ----------------------------------------------------------
- demo = gr.Interface(
-     fn=process_image,  # function that executes model inference
-     inputs=[
-         gr.Image(type="pil", label="Upload an image"),  # image input
-         gr.Textbox(label="Ask a question about the image (optional)"),  # VQA input
-         gr.Textbox(
-             label="Enter CLIP classification labels (comma-separated)",
-             placeholder="e.g., man, boy, park, snow, happiness",
-         ),
-     ],
-     outputs=[
-         gr.Textbox(label="Generated Caption"),  # BLIP caption output
-         gr.Textbox(label="VQA Answer"),  # VQA answer output
-         gr.Textbox(label="CLIP Classification Scores"),  # CLIP zero-shot output
-     ],
-     title="Multimodal AI — Captioning + VQA + Zero-Shot Classification",
- )
-
- # Launch the web application on Hugging Face Spaces or locally
- demo.launch()
 
+ import gradio as gr
+ from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+ from PIL import Image
+ import torch
+
+ # Load model
+ model_name = "nlpconnect/vit-gpt2-image-captioning"
+ model = VisionEncoderDecoderModel.from_pretrained(model_name)
+ processor = ViTImageProcessor.from_pretrained(model_name)
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model.to(device)
+
+ # Caption function
+ def predict_caption(image):
+     if image is None:
+         return "Upload an image."
+     if image.mode != "RGB":
+         image = image.convert("RGB")
+
+     pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
+
+     with torch.no_grad():
+         output_ids = model.generate(pixel_values, max_length=32, num_beams=4)
+
+     caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+     return caption.strip()
+
+ # UI
+ demo = gr.Interface(
+     fn=predict_caption,
+     inputs=gr.Image(type="pil", label="Upload Image"),
+     outputs=gr.Textbox(label="Caption"),
+     title="AI Image Captioning",
+     description="Upload an image to get an AI-generated caption."
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
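The rewritten app.py keeps its inference logic in a plain function, so it can be smoke-tested without starting the Gradio server. A minimal sketch, assuming app.py is importable from the working directory and a local test image named sample.jpg exists (both are assumptions, not part of this commit):

    # smoke_test.py (hypothetical helper, not part of this commit)
    from PIL import Image

    from app import predict_caption  # importing app.py loads the model once

    # "sample.jpg" is a placeholder path; any image Pillow can open works.
    image = Image.open("sample.jpg")
    print(predict_caption(image))  # prints a short generated caption

Because demo.launch() is guarded by `if __name__ == "__main__":`, importing the module loads the model but does not start the web server. The same nlpconnect/vit-gpt2-image-captioning checkpoint is also reachable through the one-line transformers API, pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning"), as the removed version of this file did for BLIP.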
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
- gradio==4.25.0
- transformers==4.36.2
- torch
- Pillow
+ gradio
+ transformers
+ torch
+ Pillow
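The updated requirements.txt drops the version pins, so each Space rebuild resolves to whatever gradio, transformers, torch, and Pillow releases are current at build time. If a working build later needs to be re-pinned, one way is to record the versions it actually resolved to; a minimal diagnostic sketch (hypothetical, not part of this commit):

    # check_versions.py (hypothetical diagnostic, not part of this commit)
    import PIL
    import gradio
    import torch
    import transformers

    # Print resolved versions in requirements.txt pin syntax.
    for name, module in [("gradio", gradio), ("transformers", transformers),
                         ("torch", torch), ("Pillow", PIL)]:
        print(f"{name}=={module.__version__}")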