ProfRom committed · Commit 5b6977e · verified · 1 Parent(s): cb8a643
Files changed (1):
  1. app.py +37 -30
app.py CHANGED
@@ -1,36 +1,43 @@
- 
- #define model and processor
- processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
- model = AutoModelForVisualQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
- device = infer_device()
- 
- # Define inference function
- def process_image(image, prompt):
-     # Process the image and prompt using the processor
-     inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16)
- 
-     try:
-         # Generate output from the model
-         output = model.generate(**inputs, max_new_tokens=10)
- 
-         # Decode and return the output
-         decoded_output = processor.batch_decode(output, skip_special_tokens=True)[0].strip()
- 
-         # remove prompt from output
-         if decoded_output.startswith(prompt):
-             return decoded_output[len(prompt):].strip()
-         return decoded_output
-     except IndexError as e:
-         print(f"IndexError: {e}")
-         return "An error occurred during processing."
- 
- # Define the Gradio interface
- inputs = [
-     gr.Image(type="pil"),
-     gr.Textbox(label="Prompt", placeholder="Enter your question")
- ]
- outputs = gr.Textbox(label="Answer")
- # Create the Gradio app
- demo = gr.Interface(fn=process_image, inputs=inputs, outputs=outputs, title="Visual Question Answering", description="Upload an image and ask questions to get answers.")
- # Launch the app
- demo.launch()
 
+ 
+ import torch
+ from transformers import pipeline
+ import gradio as gr
+ 
+ # Choose the device: GPU if available, otherwise CPU. On Hugging Face Spaces you run on CPU unless you explicitly pick a GPU runtime.
+ if torch.cuda.is_available():
+     vqa = pipeline(
+         task="visual-question-answering",
+         model="Salesforce/blip-vqa-base",
+         torch_dtype=torch.float16,  # the Transformers version used on Hugging Face here expects torch_dtype rather than dtype; dtype still works fine in Google Colab
+         device=0,   # GPU
+         use_fast=False,
+     )
+ else:
+     vqa = pipeline(
+         task="visual-question-answering",
+         model="Salesforce/blip-vqa-base",
+         device=-1,  # CPU
+         use_fast=False,
+     )
+ 
+ def answer_question(image, question):
+     if not question:
+         return "Please type a question about the image."
+     # vqa returns a list of dicts like [{'score': ..., 'answer': ...}]
+     result = vqa(question=question, image=image)
+     return result[0]["answer"]
+ 
+ demo = gr.Interface(
+     fn=answer_question,
+     inputs=[
+         gr.Image(type="pil", label="Upload an image"),
+         gr.Textbox(label="Question", placeholder="e.g. What is the weather in this image?"),
+     ],
+     outputs=gr.Textbox(label="Answer"),
+     title="BLIP Visual Question Answering",
+     description="Ask a question about the uploaded image using Salesforce/blip-vqa-base.",
+ )
+ 
+ if __name__ == "__main__":
+     demo.launch()
+ 
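A quick way to sanity-check the new pipeline call outside Gradio is a small local script. The sketch below reuses the same model and CPU device as app.py; the image path "example.jpg" and the question text are illustrative placeholders, not files or values from this commit.

    from PIL import Image
    from transformers import pipeline

    # Same pipeline as app.py, forced to CPU for a quick local check.
    vqa = pipeline(
        task="visual-question-answering",
        model="Salesforce/blip-vqa-base",
        device=-1,
    )

    # "example.jpg" is a placeholder path, not a file from this repo.
    image = Image.open("example.jpg")
    result = vqa(question="What is in the picture?", image=image)
    # result is a list of dicts like [{'score': ..., 'answer': ...}];
    # app.py returns result[0]["answer"], the top-scoring answer.
    print(result[0]["answer"])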