ProfRom committed on
Commit 2c6a099 · verified · 1 Parent(s): 5b6977e

Poojary - Sanity Check 2

Files changed (1): app.py +94 -37
app.py CHANGED
@@ -1,43 +1,100 @@
-
-import torch
-from transformers import pipeline
-import gradio as gr
-
-# Choose device: GPU if available, otherwise CPU. On Hugging Face Spaces,
-# you are CPU-only unless you explicitly pick a GPU runtime.
-if torch.cuda.is_available():
-    vqa = pipeline(
-        task="visual-question-answering",
-        model="Salesforce/blip-vqa-base",
-        torch_dtype=torch.float16,  # the Transformers version on Hugging Face Spaces expects torch_dtype rather than dtype; dtype still works in Google Colab
-        device=0,  # GPU
-        use_fast=False,
-    )
-else:
-    vqa = pipeline(
-        task="visual-question-answering",
-        model="Salesforce/blip-vqa-base",
-        device=-1,  # CPU
-        use_fast=False,
-    )
-
-def answer_question(image, question):
-    if not question:
-        return "Please type a question about the image."
-    # vqa returns a list of dicts like [{'score': ..., 'answer': ...}]
-    result = vqa(question=question, image=image)
-    return result[0]["answer"]
-
-demo = gr.Interface(
-    fn=answer_question,
-    inputs=[
-        gr.Image(type="pil", label="Upload an image"),
-        gr.Textbox(label="Question", placeholder="e.g. What is the weather in this image?"),
-    ],
-    outputs=gr.Textbox(label="Answer"),
-    title="BLIP Visual Question Answering",
-    description="Ask a question about the uploaded image using Salesforce/blip-vqa-base.",
-)
-
-if __name__ == "__main__":
-    demo.launch()
+
+import gradio as gr
+import torch
+import os
+import tempfile
+from huggingface_hub import login
+from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering, infer_device, PaliGemmaForConditionalGeneration
+from accelerate import Accelerator
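+# (tempfile and Accelerator are imported but not used below)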
+
+# login to Hugging Face
+login(token=os.getenv('HF_TOKEN'))
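+# HF_TOKEN must be available (e.g. as a Space secret); the PaliGemma weights below are gated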
+
+# Set the device
+device = infer_device()
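+# infer_device() selects an available accelerator (e.g. "cuda") and falls back to "cpu"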
+
+# MODEL 1: BLIP-VQA
+processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
+model = AutoModelForVisualQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
+
+# Define inference function for Model 1
+def process_image(image, prompt):
+    # Cast inputs to the model's own dtype; a hard-coded torch.float16 would
+    # mismatch the float32 weights loaded above and fail at generate time
+    inputs = processor(image, text=prompt, return_tensors="pt").to(device, model.dtype)
+
+    try:
+        # Generate output from the model
+        output = model.generate(**inputs, max_new_tokens=10)
+
+        # Decode and return the output
+        decoded_output = processor.batch_decode(output, skip_special_tokens=True)[0].strip()
+
+        # Remove the prompt from the output if the model echoed it
+        if decoded_output.startswith(prompt):
+            return decoded_output[len(prompt):].strip()
+        return decoded_output
+    except Exception as e:
+        print(f"Error in Model 1: {e}")
+        return "An error occurred during processing for Model 1."
+
+
+# MODEL 2: PaliGemma
+# Load the processor from the same checkpoint as the model ("mix", not "pt")
+processor2 = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224")
+model2 = PaliGemmaForConditionalGeneration.from_pretrained(
+    "google/paligemma-3b-mix-224",
+    torch_dtype=torch.bfloat16
+).to(device)
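+# PaliGemma expects short task prompts, e.g. "answer en <question>" for English VQA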
+
+
+# Define inference function for Model 2
+def process_image2(image, prompt):
+    inputs2 = processor2(
+        text=prompt,
+        images=image,
+        return_tensors="pt"
+    ).to(device, model2.dtype)
+
+    try:
+        output = model2.generate(**inputs2, max_new_tokens=10)
+        # Decode only the newly generated tokens; PaliGemma's output repeats the prompt tokens
+        decoded_output = processor2.batch_decode(
+            output[:, inputs2["input_ids"].shape[1]:],
+            skip_special_tokens=True
+        )[0].strip()
+
+        return decoded_output
+    except Exception as e:
+        print(f"Error in Model 2: {e}")
+        return "An error occurred during processing for Model 2. Ensure your hardware supports bfloat16 or adjust the torch_dtype."
+
+
+# GRADIO INTERFACE
+inputs_model1 = [
+    gr.Image(type="pil"),
+    gr.Textbox(label="Prompt", placeholder="Enter your question")
+]
+inputs_model2 = [
+    gr.Image(type="pil"),
+    gr.Textbox(label="Prompt", placeholder="Enter your question")
+]
+
+outputs_model1 = gr.Textbox(label="Answer")
+outputs_model2 = gr.Textbox(label="Answer")
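+# Each tab gets its own component instances; Gradio components are not meant to be shared across Interfaces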
+
+# Create the Gradio apps for each model
+model1_inf = gr.Interface(
+    fn=process_image,
+    inputs=inputs_model1,
+    outputs=outputs_model1,
+    title="Model 1: BLIP-VQA-Base",
+    description="Ask a question about the uploaded image using BLIP."
+)
+
+model2_inf = gr.Interface(
+    fn=process_image2,
+    inputs=inputs_model2,
+    outputs=outputs_model2,
+    title="Model 2: PaliGemma",
+    description="Ask a question about the uploaded image using PaliGemma."
+)
+
+demo = gr.TabbedInterface([model1_inf, model2_inf], ["Model 1 (BLIP)", "Model 2 (PaliGemma)"])
+demo.launch(share=True)
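
A quick sanity check of this revision outside the UI is to call the two inference functions directly. This is only a sketch, not part of the commit: it assumes a local test image example.jpg and access to the gated PaliGemma weights, and the "answer en" prefix follows the PaliGemma prompt convention.

    from PIL import Image

    img = Image.open("example.jpg")
    print(process_image(img, "What color is the sky?"))
    print(process_image2(img, "answer en What color is the sky?"))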