File size: 1,585 Bytes
a0e6aa2
 
 
 
 
7c75d85
a0e6aa2
 
 
 
75e38c4
60dffdb
75e38c4
60dffdb
 
7c75d85
 
75e38c4
 
 
7c75d85
75e38c4
 
a0e6aa2
 
75e38c4
a0e6aa2
75e38c4
7c75d85
60dffdb
7c75d85
75e38c4
 
 
 
 
 
ac5df15
75e38c4
 
7c75d85
75e38c4
 
7c75d85
a0e6aa2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import gradio as gr
from transformers import BlipProcessor, BlipForQuestionAnswering
from PIL import Image
import torch

# Load processor and model
# BlipProcessor bundles the image preprocessor and the question tokenizer;
# BlipForQuestionAnswering is the generative VQA model. from_pretrained
# downloads the weights on first run (then reads the local HF cache).
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
# Inference-only app: disable dropout / training-mode layers.
model.eval()

# Resize function
def resize_image(image):
    """Return a copy of *image* downscaled so its longest side is <= 512 px.

    Args:
        image: a PIL image (or None when nothing has been uploaded yet).

    Returns:
        A resized copy with aspect ratio preserved, or None if *image*
        is None.

    Note:
        ``Image.thumbnail`` resizes *in place*, so we operate on a copy to
        avoid mutating the caller's object (the original code modified the
        uploaded image directly).
    """
    if image is None:
        return None
    max_size = 512
    resized = image.copy()  # keep the caller's image intact
    resized.thumbnail((max_size, max_size))  # no-op if already small enough
    return resized

# Answer question function
def answer_question(resized_image, question):
    """Run BLIP visual question answering over a preprocessed image.

    Args:
        resized_image: PIL image already downscaled by ``resize_image``,
            or None when no image has been uploaded.
        question: the user's question; may be None or blank before the
            user has typed anything.

    Returns:
        The model's decoded answer string, or a prompt asking the user to
        supply both inputs when either is missing.
    """
    # Guard against missing inputs. gr.Textbox can deliver None (not just
    # ""), and the original `question.strip()` raised AttributeError then.
    if resized_image is None or not question or not question.strip():
        return "Please upload an image and ask a question."

    inputs = processor(resized_image, question, return_tensors="pt")
    # Inference only -- no gradient tracking needed.
    with torch.no_grad():
        output = model.generate(**inputs)
    return processor.decode(output[0], skip_special_tokens=True)

# Gradio UI: declare the widgets and wire the two callbacks together.
with gr.Blocks(title="BLIP VQA App (Salesforce/blip-vqa-base)") as demo:
    gr.Markdown("## 📷 Visual Question Answering with BLIP VQA\nUpload an image and ask a question about it.")

    uploader = gr.Image(type="pil", label="Upload Image")
    # Invisible holder for the downscaled copy of the upload.
    shrunk = gr.State()

    question_box = gr.Textbox(label="Question", placeholder="What is in the image?")
    submit = gr.Button("Ask")
    answer_box = gr.Textbox(label="Answer")

    # Whenever a new image arrives, cache its resized version in state.
    uploader.change(fn=resize_image, inputs=uploader, outputs=shrunk)

    # Clicking "Ask" runs VQA over the cached image plus the typed question.
    submit.click(fn=answer_question, inputs=[shrunk, question_box], outputs=answer_box)

demo.launch()