ProfRom committed · Commit 5b6977e · verified · 1 Parent(s): cb8a643
Files changed (1):
  1. app.py +37 -30
app.py CHANGED
@@ -1,36 +1,43 @@
- 
- #define model and processor
- processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
- model = AutoModelForVisualQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
- device = infer_device()
- 
- # Define inference function
- def process_image(image, prompt):
-     # Process the image and prompt using the processor
-     inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16)
- 
-     try:
-         # Generate output from the model
-         output = model.generate(**inputs, max_new_tokens=10)
- 
-         # Decode and return the output
-         decoded_output = processor.batch_decode(output, skip_special_tokens=True)[0].strip()
- 
-         # remove prompt from output
-         if decoded_output.startswith(prompt):
-             return decoded_output[len(prompt):].strip()
-         return decoded_output
-     except IndexError as e:
-         print(f"IndexError: {e}")
-         return "An error occurred during processing."
- 
- # Define the Gradio interface
- inputs = [
-     gr.Image(type="pil"),
-     gr.Textbox(label="Prompt", placeholder="Enter your question")
- ]
- outputs = gr.Textbox(label="Answer")
- # Create the Gradio app
- demo = gr.Interface(fn=process_image, inputs=inputs, outputs=outputs, title="Visual Question Answering", description="Upload an image and ask questions to get answers.")
- # Launch the app
- demo.launch()
 
+ 
+ import torch
+ from transformers import pipeline
+ import gradio as gr
+ 
+ # Choose the device: GPU if available, otherwise CPU. On Hugging Face Spaces you run on CPU unless you explicitly pick a GPU runtime.
+ if torch.cuda.is_available():
+     vqa = pipeline(
+         task="visual-question-answering",
+         model="Salesforce/blip-vqa-base",
+         torch_dtype=torch.float16,  # the Transformers version used on Hugging Face here expects torch_dtype rather than dtype; dtype still works fine in Google Colab
+         device=0,   # GPU
+         use_fast=False,
+     )
+ else:
+     vqa = pipeline(
+         task="visual-question-answering",
+         model="Salesforce/blip-vqa-base",
+         device=-1,  # CPU
+         use_fast=False,
+     )
+ 
+ def answer_question(image, question):
+     if not question:
+         return "Please type a question about the image."
+     # vqa returns a list of dicts like [{'score': ..., 'answer': ...}]
+     result = vqa(question=question, image=image)
+     return result[0]["answer"]
+ 
+ demo = gr.Interface(
+     fn=answer_question,
+     inputs=[
+         gr.Image(type="pil", label="Upload an image"),
+         gr.Textbox(label="Question", placeholder="e.g. What is the weather in this image?"),
+     ],
+     outputs=gr.Textbox(label="Answer"),
+     title="BLIP Visual Question Answering",
+     description="Ask a question about the uploaded image using Salesforce/blip-vqa-base.",
+ )
+ 
+ if __name__ == "__main__":
+     demo.launch()
+ 
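A quick way to sanity-check the new pipeline call outside Gradio is a small local script. The sketch below reuses the same model and CPU device as app.py; the image path "example.jpg" and the question text are illustrative placeholders, not files or values from this commit.

    from PIL import Image
    from transformers import pipeline

    # Same pipeline as app.py, forced to CPU for a quick local check.
    vqa = pipeline(
        task="visual-question-answering",
        model="Salesforce/blip-vqa-base",
        device=-1,
    )

    # "example.jpg" is a placeholder path, not a file from this repo.
    image = Image.open("example.jpg")
    result = vqa(question="What is in the picture?", image=image)
    # result is a list of dicts like [{'score': ..., 'answer': ...}];
    # app.py returns result[0]["answer"], the top-scoring answer.
    print(result[0]["answer"])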