ProfRom committed on
Commit 4cde0da · verified · 1 Parent(s): 15db435

Adapa - Sanity Check 3

Files changed (1)
  1. app.py +62 -14
app.py CHANGED
@@ -2,23 +2,71 @@
 import gradio as gr
 from transformers import pipeline
 
-# Load image captioning pipeline
-captioner = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
+# BLIP captioning
+caption_pipeline = pipeline(
+    task="image-to-text",
+    model="Salesforce/blip-image-captioning-base"
+)
 
-def generate_caption(image):
-    if image is None:
-        return "Please upload an image."
-    result = captioner(image)
-    return result[0]['generated_text']
+# BLIP VQA
+vqa_pipeline = pipeline(
+    task="visual-question-answering",
+    model="Salesforce/blip-vqa-base"
+)
+
+# CLIP zero-shot classification
+clip_pipeline = pipeline(
+    task="zero-shot-image-classification",
+    model="openai/clip-vit-base-patch32"
+)
+
+def process_image(image, question, labels):
+    # Caption
+    caption_result = caption_pipeline(image)
+    caption = caption_result[0]["generated_text"]
+
+    # VQA
+    if question and question.strip():
+        vqa_result = vqa_pipeline(image=image, question=question)
+        vqa_answer = vqa_result[0]["answer"]
+    else:
+        vqa_answer = "No question provided."
+
+    # CLIP Classification
+    if labels and labels.strip():
+        candidate_labels = [l.strip() for l in labels.split(",") if l.strip()]
+        if candidate_labels:
+            # NOTE: use 'images=' or positional arg
+            clip_result = clip_pipeline(images=image, candidate_labels=candidate_labels)
+            clip_output = "\n".join(
+                f"{item['label']}: {round(item['score'] * 100, 1)}%"
+                for item in clip_result
+            )
+        else:
+            clip_output = "No valid labels provided."
+    else:
+        clip_output = "No labels provided."
+
+    return caption, vqa_answer, clip_output
 
+
 demo = gr.Interface(
-    fn=generate_caption,
-    inputs=gr.Image(type="pil", label="Upload an image"),
-    outputs=gr.Textbox(label="Generated Caption"),
-    title="Image Captioning Demo",
-    description="Multimodal model: Vision → Language"
+    fn=process_image,
+    inputs=[
+        gr.Image(type="pil", label="Upload an image"),
+        gr.Textbox(label="Ask a question about the image (optional)"),
+        gr.Textbox(
+            label="Enter CLIP classification labels (comma-separated)",
+            placeholder="e.g., man, boy, park, snow, happiness",
+        ),
+    ],
+    outputs=[
+        gr.Textbox(label="Generated Caption"),
+        gr.Textbox(label="VQA Answer"),
+        gr.Textbox(label="CLIP Classification Scores"),
+    ],
+    title="Multimodal AI — Captioning + VQA + Zero-Shot Classification",
 )
 
-if __name__ == "__main__":
-    demo.launch()
+demo.launch()
 
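
For a quick sanity check outside Gradio, the same three pipelines can be exercised directly. This is a minimal sketch, not part of the commit: the path "test.jpg", the question, and the label list are hypothetical examples; the pipeline calls mirror those in app.py.

from PIL import Image
from transformers import pipeline

captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
vqa = pipeline("visual-question-answering", model="Salesforce/blip-vqa-base")
clip = pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch32")

img = Image.open("test.jpg").convert("RGB")  # hypothetical local test image

# Captioning returns a list of dicts with "generated_text"
print(captioner(img)[0]["generated_text"])

# VQA returns a list of dicts with "answer"
print(vqa(image=img, question="What is happening in the image?")[0]["answer"])

# Zero-shot classification returns a list of dicts with "label" and "score"
for item in clip(images=img, candidate_labels=["man", "boy", "park", "snow", "happiness"]):
    print(f"{item['label']}: {item['score']:.3f}")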