Spaces:

ProfRom
/

TestSpace

Sleeping

App Files Community

ProfRom committed on 18 days ago

Commit

1aa944c

verified ·

1 Parent(s): 32d3122

Deleon - Final submission

Browse files

Files changed (1) hide show

app.py +125 -32

app.py CHANGED Viewed

@@ -1,42 +1,135 @@
 import torch
-from transformers import BlipProcessor, BlipForQuestionAnswering
 import gradio as gr
-from PIL import Image
-# Load model + processor
-processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
-model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
-# Move to GPU if available (Spaces free tier = CPU, but this keeps it safe)
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
 def answer_question(image, question):
     if image is None:
-        return "Please upload an image."
-    if not question:
-        return "Please enter a question."
-    # Process inputs
-    inputs = processor(image, question, return_tensors="pt").to(device)
-    # Generate answer
-    output = model.generate(**inputs)
-    answer = processor.decode(output[0], skip_special_tokens=True)
-    return answer
-# Gradio Interface
-demo = gr.Interface(
-    fn=answer_question,
-    inputs=[
-        gr.Image(type="pil", label="Upload an image"),
-        gr.Textbox(label="Question", placeholder="Example: What is in this image?")
-    ],
-    outputs=gr.Textbox(label="Answer"),
-    title="BLIP Visual Question Answering",
-    description="Upload an image and ask a question about it using a multimodal AI model.",
-)
 if __name__ == "__main__":
     demo.launch()

+import time
 import torch
+from transformers import (
+    BlipProcessor,
+    BlipForConditionalGeneration,
+    BlipForQuestionAnswering,
+    pipeline,
+)
 import gradio as gr
+# Module-level setup: these statements run once when the file is imported,
+# so every model below is loaded before any request is handled.
+# Device string used for the torch models: "cuda" when available, else "cpu".
+TORCH_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Integer device index handed to pipeline(): 0 when CUDA was selected, -1 otherwise.
+PIPELINE_DEVICE = 0 if TORCH_DEVICE == "cuda" else -1
+# Human-readable device name; used in the startup log and the latency summary.
+DEVICE_LABEL = "GPU (CUDA)" if TORCH_DEVICE == "cuda" else "CPU"
+print(f"[startup] Loading models on {DEVICE_LABEL}...")
+# Image captioning: processor and model come from the same checkpoint.
+caption_processor = BlipProcessor.from_pretrained(
+    "Salesforce/blip-image-captioning-base"
+)
+# The model is moved to the selected device and put in evaluation mode.
+caption_model = (
+    BlipForConditionalGeneration.from_pretrained(
+        "Salesforce/blip-image-captioning-base"
+    )
+    .to(TORCH_DEVICE)
+    .eval()
+)
+# Visual question answering: same processor class, separate VQA checkpoint.
+vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
+vqa_model = (
+    BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
+    .to(TORCH_DEVICE)
+    .eval()
+)
+# Text sentiment classifier, applied to the generated caption and answer.
+sentiment = pipeline(
+    task="sentiment-analysis",
+    model="distilbert-base-uncased-finetuned-sst-2-english",
+    device=PIPELINE_DEVICE,
+)
+print("[startup] Models loaded.")
+@torch.no_grad()
+def generate_caption(image):
+    """Return a caption for ``image`` produced by the captioning model.
+
+    The image is encoded with ``caption_processor``, the encoding is moved to
+    ``TORCH_DEVICE``, and ``caption_model`` generates at most 50 new tokens.
+    The first generated sequence is decoded without special tokens and
+    returned with surrounding whitespace stripped.
+    """
+    encoded = caption_processor(images=image, return_tensors="pt")
+    encoded = encoded.to(TORCH_DEVICE)
+    generated = caption_model.generate(max_new_tokens=50, **encoded)
+    first_sequence = generated[0]
+    text = caption_processor.decode(first_sequence, skip_special_tokens=True)
+    return text.strip()
+@torch.no_grad()
 def answer_question(image, question):
+    """Return the VQA model's answer to ``question`` about ``image``.
+
+    Image and question are encoded together with ``vqa_processor``, moved to
+    ``TORCH_DEVICE``, and ``vqa_model`` generates at most 20 new tokens. The
+    first generated sequence is decoded without special tokens and returned
+    with surrounding whitespace stripped.
+    """
+    encoded = vqa_processor(images=image, text=question, return_tensors="pt")
+    encoded = encoded.to(TORCH_DEVICE)
+    generated = vqa_model.generate(max_new_tokens=20, **encoded)
+    first_sequence = generated[0]
+    text = vqa_processor.decode(first_sequence, skip_special_tokens=True)
+    return text.strip()
+def analyze(image, question):
+    """Run captioning, VQA and sentiment analysis for one image/question pair.
+
+    Returns a 5-tuple of strings: caption, answer, caption sentiment, answer
+    sentiment, and a latency summary. When the image is missing, a prompt is
+    returned in the first slot; when the question is empty or whitespace, a
+    prompt is returned in the second slot. In both cases every other slot is
+    the empty string and no model is called.
+    """
     if image is None:
+        return "You need to upload an image big dawg.", "", "", "", ""
+    cleaned_question = question.strip() if question else ""
+    if not cleaned_question:
+        return "", "This is not optional. Ask me a question about the picture you uploaded.", "", "", ""
+    rgb_image = image.convert("RGB")
+
+    def _describe(text):
+        # Format the first sentiment prediction as "LABEL (score)".
+        top = sentiment(text)[0]
+        return f"{top['label']} ({top['score']:.2f})"
+
+    # Each stage is timed on its own so the summary can show a breakdown.
+    started = time.perf_counter()
+    caption = generate_caption(rgb_image)
+    caption_seconds = time.perf_counter() - started
+
+    started = time.perf_counter()
+    answer = answer_question(rgb_image, cleaned_question)
+    vqa_seconds = time.perf_counter() - started
+
+    started = time.perf_counter()
+    cap_sent_str = _describe(caption)
+    ans_sent_str = _describe(answer)
+    sentiment_seconds = time.perf_counter() - started
+
+    total_seconds = caption_seconds + vqa_seconds + sentiment_seconds
+    latency_str = (
+        f"Caption: {caption_seconds:.2f}s | "
+        f"VQA: {vqa_seconds:.2f}s | "
+        f"Sentiment: {sentiment_seconds:.2f}s | "
+        f"Total: {total_seconds:.2f}s ({DEVICE_LABEL})"
+    )
+    return caption, answer, cap_sent_str, ans_sent_str, latency_str
+# Markdown text rendered at the top of the page via gr.Markdown below.
+DESCRIPTION = """
+# Multimodal Image Understanding Pipeline
+Upload an image and ask a question about the uploaded image. The app returns an image caption,
+answers your question, analyzes sentiment, and reports latency.
+"""
+# UI definition: one row holding two columns.
+with gr.Blocks(title="Multimodal Image Understanding") as demo:
+    gr.Markdown(DESCRIPTION)
+    with gr.Row():
+        # First column: the inputs and the submit button.
+        with gr.Column():
+            # type="pil" so analyze() receives an object it can call .convert() on.
+            image_in = gr.Image(type="pil", label="Image")
+            question_in = gr.Textbox(
+                label="Question",
+                placeholder="What was that one movie with Billy Crystal?",
+            )
+            submit_btn = gr.Button("Analyze This!", variant="secondary")
+        # Second column: one textbox per value returned by analyze().
+        with gr.Column():
+            caption_out = gr.Textbox(label="Generated caption")
+            answer_out = gr.Textbox(label="Answer to question")
+            cap_sent_out = gr.Textbox(label="Sentiment of caption")
+            ans_sent_out = gr.Textbox(label="Sentiment of answer")
+            timing_out = gr.Textbox(label="Latency breakdown")
+    # The outputs list follows the order of analyze()'s 5-tuple:
+    # caption, answer, caption sentiment, answer sentiment, latency.
+    submit_btn.click(
+        fn=analyze,
+        inputs=[image_in, question_in],
+        outputs=[caption_out, answer_out, cap_sent_out, ans_sent_out, timing_out],
+    )
+# Start the app only when this file is executed as a script.
 if __name__ == "__main__":
     demo.launch()