cconklin committed on
Commit
a8887c5
·
verified ·
1 Parent(s): 5e841d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -31
app.py CHANGED
@@ -2,40 +2,82 @@ import torch
2
  from transformers import pipeline
3
  import gradio as gr
4
 
5
- # Choose device: GPU if available, otherwise CPU. On Hugging Face Spaces, unless you explicitly pick a GPU runtime, you’re on CPU only
6
- if torch.cuda.is_available():
7
- vqa = pipeline(
8
- task="visual-question-answering",
9
- model="Salesforce/blip-vqa-base",
10
- torch_dtype=torch.float16,#newer versions of TRANSFORMERS in Hugging face is torch_dtype not dtype. dtype is still working fine in Google Colab space
11
- device=0, # GPU
12
- use_fast=False,
13
- )
14
- else:
15
- vqa = pipeline(
16
- task="visual-question-answering",
17
- model="Salesforce/blip-vqa-base",
18
- device=-1, # CPU
19
- use_fast=False,
20
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  def answer_question(image, question):
23
- if not question:
 
 
 
24
  return "Please type a question about the image."
25
- # vqa returns a list of dicts like [{'score':..., 'answer':...}]
26
  result = vqa(question=question, image=image)
27
- return result[0]["answer"]
28
-
29
- demo = gr.Interface(
30
- fn=answer_question,
31
- inputs=[
32
- gr.Image(type="pil", label="Upload an image"),
33
- gr.Textbox(label="Question", placeholder="e.g. What is the weather in this image?"),
34
- ],
35
- outputs=gr.Textbox(label="Answer"),
36
- title="BLIP Visual Question Answering",
37
- description="Ask a question about the uploaded image using Salesforce/blip-vqa-base.",
38
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  if __name__ == "__main__":
41
- demo.launch()
 
2
  from transformers import pipeline
3
  import gradio as gr
4
 
5
# Pick the inference device once: CUDA GPU index 0 when available, otherwise CPU (-1).
DEVICE = 0 if torch.cuda.is_available() else -1
7
+
8
# --- Load pipelines ---
# VQA pipeline: (image, question) -> short textual answer.
vqa = pipeline(
    model="Salesforce/blip-vqa-base",
    task="visual-question-answering",
    # Half precision is only worthwhile on the GPU path; CPU stays at full precision.
    torch_dtype=torch.float16 if DEVICE == 0 else None,
    device=DEVICE,
    use_fast=False,
)
17
+
18
# Captioning pipeline: image -> descriptive sentence.
captioner = pipeline(
    model="Salesforce/blip-image-captioning-base",
    task="image-to-text",
    # Half precision is only worthwhile on the GPU path; CPU stays at full precision.
    torch_dtype=torch.float16 if DEVICE == 0 else None,
    device=DEVICE,
    use_fast=False,
)
26
+
27
# --- App functions ---
def generate_caption(image):
    """Return a short auto-generated caption for *image*, or '' when no image is given."""
    if image is None:
        return ""
    # The captioning pipeline yields a list like [{'generated_text': '...'}].
    outputs = captioner(image)
    caption = outputs[0].get("generated_text", "")
    return caption.strip()
35
 
36
def answer_question(image, question):
    """Return the VQA model's answer to *question* about *image*.

    Falls back to a user-facing prompt string when either the image
    or the question is missing/blank.
    """
    if image is None:
        return "Please upload an image first."
    # Guard against None as well as empty/whitespace-only questions.
    if not (question and question.strip()):
        return "Please type a question about the image."
    # vqa returns a list of dicts like [{'score': ..., 'answer': ...}].
    predictions = vqa(question=question, image=image)
    return predictions[0].get("answer", "")
44
+
45
# --- Gradio UI ---
with gr.Blocks() as demo:
    gr.Markdown("# BLIP Captioning + Visual Question Answering")
    gr.Markdown(
        "1) Upload an image to generate a caption. \n"
        "2) Ask a question about the image to get an answer. \n"
        "Models: `Salesforce/blip-image-captioning-base` and `Salesforce/blip-vqa-base`."
    )

    # Layout: image input on the left, caption/answer boxes stacked on the right.
    with gr.Row():
        image_in = gr.Image(type="pil", label="Upload an image")
        with gr.Column():
            caption_out = gr.Textbox(label="Caption (auto-generated)", lines=2)
            answer_out = gr.Textbox(label="Answer", lines=2)

    question_in = gr.Textbox(
        label="Question",
        placeholder="e.g., What is in the image? How many people are there? What color is the car?",
    )

    with gr.Row():
        clear_btn = gr.Button("Clear")
        answer_btn = gr.Button("Submit")

    def _reset():
        # Blank out every input/output widget in one shot.
        return None, "", "", ""

    # Wire events: auto-caption on image change, VQA on submit, reset on clear.
    image_in.change(fn=generate_caption, inputs=image_in, outputs=caption_out)
    answer_btn.click(fn=answer_question, inputs=[image_in, question_in], outputs=answer_out)
    clear_btn.click(fn=_reset, inputs=None, outputs=[image_in, question_in, caption_out, answer_out])

    gr.Markdown(
        "**Note:** This demo may produce incorrect outputs. Do not use for medical/legal decisions."
    )
81
 
82
  if __name__ == "__main__":
83
+ demo.launch()