ProfRom committed on
Commit 2c6a099 · verified · 1 Parent(s): 5b6977e

Poojary - Sanity Check 2

Files changed (1): app.py +94 -37
app.py CHANGED
@@ -1,43 +1,100 @@
-
-import torch
-from transformers import pipeline
-import gradio as gr
-
-# Choose device: GPU if available, otherwise CPU. On Hugging Face Spaces,
-# you are CPU-only unless you explicitly pick a GPU runtime.
-if torch.cuda.is_available():
-    vqa = pipeline(
-        task="visual-question-answering",
-        model="Salesforce/blip-vqa-base",
-        torch_dtype=torch.float16,  # the Transformers version on Hugging Face Spaces expects torch_dtype rather than dtype; dtype still works in Google Colab
-        device=0,  # GPU
-        use_fast=False,
-    )
-else:
-    vqa = pipeline(
-        task="visual-question-answering",
-        model="Salesforce/blip-vqa-base",
-        device=-1,  # CPU
-        use_fast=False,
-    )
-
-def answer_question(image, question):
-    if not question:
-        return "Please type a question about the image."
-    # vqa returns a list of dicts like [{'score': ..., 'answer': ...}]
-    result = vqa(question=question, image=image)
-    return result[0]["answer"]
-
-demo = gr.Interface(
-    fn=answer_question,
-    inputs=[
-        gr.Image(type="pil", label="Upload an image"),
-        gr.Textbox(label="Question", placeholder="e.g. What is the weather in this image?"),
-    ],
-    outputs=gr.Textbox(label="Answer"),
-    title="BLIP Visual Question Answering",
-    description="Ask a question about the uploaded image using Salesforce/blip-vqa-base.",
-)
-
-if __name__ == "__main__":
-    demo.launch()
+
+import gradio as gr
+import torch
+import os
+import tempfile
+from huggingface_hub import login
+from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering, infer_device, PaliGemmaForConditionalGeneration
+from accelerate import Accelerator
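+# (tempfile and Accelerator are imported but not used below)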
+
+# login to Hugging Face
+login(token=os.getenv('HF_TOKEN'))
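+# HF_TOKEN must be available (e.g. as a Space secret); the PaliGemma weights below are gated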
+
+# Set the device
+device = infer_device()
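+# infer_device() selects an available accelerator (e.g. "cuda") and falls back to "cpu"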
+
+# MODEL 1: BLIP-VQA
+processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
+model = AutoModelForVisualQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
+
+# Define inference function for Model 1
+def process_image(image, prompt):
+    # Cast inputs to the model's own dtype; a hard-coded torch.float16 would
+    # mismatch the float32 weights loaded above and fail at generate time
+    inputs = processor(image, text=prompt, return_tensors="pt").to(device, model.dtype)
+
+    try:
+        # Generate output from the model
+        output = model.generate(**inputs, max_new_tokens=10)
+
+        # Decode and return the output
+        decoded_output = processor.batch_decode(output, skip_special_tokens=True)[0].strip()
+
+        # Remove the prompt from the output if the model echoed it
+        if decoded_output.startswith(prompt):
+            return decoded_output[len(prompt):].strip()
+        return decoded_output
+    except Exception as e:
+        print(f"Error in Model 1: {e}")
+        return "An error occurred during processing for Model 1."
+
+
+# MODEL 2: PaliGemma
+# Load the processor from the same checkpoint as the model ("mix", not "pt")
+processor2 = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224")
+model2 = PaliGemmaForConditionalGeneration.from_pretrained(
+    "google/paligemma-3b-mix-224",
+    torch_dtype=torch.bfloat16
+).to(device)
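+# PaliGemma expects short task prompts, e.g. "answer en <question>" for English VQA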
+
+
+# Define inference function for Model 2
+def process_image2(image, prompt):
+    inputs2 = processor2(
+        text=prompt,
+        images=image,
+        return_tensors="pt"
+    ).to(device, model2.dtype)
+
+    try:
+        output = model2.generate(**inputs2, max_new_tokens=10)
+        # Decode only the newly generated tokens; PaliGemma's output repeats the prompt tokens
+        decoded_output = processor2.batch_decode(
+            output[:, inputs2["input_ids"].shape[1]:],
+            skip_special_tokens=True
+        )[0].strip()
+
+        return decoded_output
+    except Exception as e:
+        print(f"Error in Model 2: {e}")
+        return "An error occurred during processing for Model 2. Ensure your hardware supports bfloat16 or adjust the torch_dtype."
+
+
+# GRADIO INTERFACE
+inputs_model1 = [
+    gr.Image(type="pil"),
+    gr.Textbox(label="Prompt", placeholder="Enter your question")
+]
+inputs_model2 = [
+    gr.Image(type="pil"),
+    gr.Textbox(label="Prompt", placeholder="Enter your question")
+]
+
+outputs_model1 = gr.Textbox(label="Answer")
+outputs_model2 = gr.Textbox(label="Answer")
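+# Each tab gets its own component instances; Gradio components are not meant to be shared across Interfaces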
+
+# Create the Gradio apps for each model
+model1_inf = gr.Interface(
+    fn=process_image,
+    inputs=inputs_model1,
+    outputs=outputs_model1,
+    title="Model 1: BLIP-VQA-Base",
+    description="Ask a question about the uploaded image using BLIP."
+)
+
+model2_inf = gr.Interface(
+    fn=process_image2,
+    inputs=inputs_model2,
+    outputs=outputs_model2,
+    title="Model 2: PaliGemma",
+    description="Ask a question about the uploaded image using PaliGemma."
+)
+
+demo = gr.TabbedInterface([model1_inf, model2_inf], ["Model 1 (BLIP)", "Model 2 (PaliGemma)"])
+demo.launch(share=True)
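
A quick sanity check of this revision outside the UI is to call the two inference functions directly. This is only a sketch, not part of the commit: it assumes a local test image example.jpg and access to the gated PaliGemma weights, and the "answer en" prefix follows the PaliGemma prompt convention.

    from PIL import Image

    img = Image.open("example.jpg")
    print(process_image(img, "What color is the sky?"))
    print(process_image2(img, "answer en What color is the sky?"))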