ProfRom committed
Commit 235a4e1 · verified · 1 Parent(s): 648030e

Ngo - Final Assignment submission

Files changed (2):
  1. app.py +34 -92
  2. requirements.txt +3 -3
app.py CHANGED
@@ -1,99 +1,41 @@
 import gradio as gr
 import torch
-import os
-import tempfile
-from huggingface_hub import login
-from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering, infer_device, PaliGemmaForConditionalGeneration
-from accelerate import Accelerator
-
-# login to Hugging Face
-login(token=os.getenv('HF_TOKEN'))
-
-# Set the device
-device = infer_device()
-
-# MODEL 1: BLIP-VQA
-processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
-model = AutoModelForVisualQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
-
-# Define inference function for Model 1
-def process_image(image, prompt):
-    inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16)
-
-    try:
-        # Generate output from the model
-        output = model.generate(**inputs, max_new_tokens=10)
-
-        # Decode and return the output
-        decoded_output = processor.batch_decode(output, skip_special_tokens=True)[0].strip()
-
-        # remove prompt from output
-        if decoded_output.startswith(prompt):
-            return decoded_output[len(prompt):].strip()
-        return decoded_output
-    except Exception as e:
-        print(f"Error in Model 1: {e}")
-        return "An error occurred during processing for Model 1."
-
-
-# MODEL 2: PaliGemma
-processor2 = AutoProcessor.from_pretrained("google/paligemma-3b-pt-224")
-model2 = PaliGemmaForConditionalGeneration.from_pretrained(
-    "google/paligemma-3b-mix-224",
-    torch_dtype=torch.bfloat16
-).to(device)
-
-
-# Define inference function for Model 2
-def process_image2(image, prompt):
-    inputs2 = processor2(
-        text=prompt,
-        images=image,
-        return_tensors="pt"
-    ).to(device, model2.dtype)
-
-    try:
-        output = model2.generate(**inputs2, max_new_tokens=10)
-        decoded_output = processor2.batch_decode(
-            output[:, inputs2["input_ids"].shape[1]:],
-            skip_special_tokens=True
-        )[0].strip()
-
-        return decoded_output
-    except Exception as e:
-        print(f"Error in Model 2: {e}")
-        return "An error occurred during processing for Model 2. Ensure your hardware supports bfloat16 or adjust the torch_dtype."
-
-
-# GRADIO INTERFACE
-inputs_model1 = [
-    gr.Image(type="pil"),
-    gr.Textbox(label="Prompt", placeholder="Enter your question")
-]
-inputs_model2 = [
-    gr.Image(type="pil"),
-    gr.Textbox(label="Prompt", placeholder="Enter your question")
-]
-
-outputs_model1 = gr.Textbox(label="Answer")
-outputs_model2 = gr.Textbox(label="Answer")
-
-# Create the Gradio apps for each model
-model1_inf = gr.Interface(
-    fn=process_image,
-    inputs=inputs_model1,
-    outputs=outputs_model1,
-    title="Model 1: BLIP-VQA-Base",
-    description="Ask a question about the uploaded image using BLIP."
-)
-
-model2_inf = gr.Interface(
-    fn=process_image2,
-    inputs=inputs_model2,
-    outputs=outputs_model2,
-    title="Model 2: PaliGemma",
-    description="Ask a question about the uploaded image using PaliGemma."
-)
-
-demo = gr.TabbedInterface([model1_inf, model2_inf], ["Model 1 (BLIP)", "Model 2 (PaliGemma)"])
-demo.launch(share=True)
 
 import gradio as gr
+from transformers import Blip2Processor, Blip2ForConditionalGeneration
 import torch
+
+# Load pre-trained BLIP-2 model and processor
+processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+model = Blip2ForConditionalGeneration.from_pretrained(
+    "Salesforce/blip2-opt-2.7b",
+    torch_dtype=torch.float16
+)
+
+def predict(image, question=None):
+    # If no question is provided, generate a caption
+    if question is None or question.strip() == "":
+        inputs = processor(image, return_tensors="pt")
+    else:
+        inputs = processor(image, question, return_tensors="pt")
+
+    # Move to GPU if available; cast floating-point inputs to the model's
+    # dtype so the float16 weights don't clash with float32 pixel values
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model.to(device)
+    inputs = inputs.to(device, model.dtype)
+
+    # Generate output
+    out = model.generate(**inputs, max_new_tokens=50)
+    result = processor.decode(out[0], skip_special_tokens=True)
+    return result
+
+# Gradio interface
+iface = gr.Interface(
+    fn=predict,
+    inputs=[
+        gr.Image(type="pil", label="Upload Image"),
+        gr.Textbox(label="Optional Question", placeholder="Ask something about the image...")
+    ],
+    outputs=gr.Textbox(label="Result"),
+    title="BLIP-2 Multimodal Assistant",
+    description="Upload an image and get a caption. Optionally, ask a question about the image."
+)
+
+iface.launch()
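A caveat on the new version, not part of the commit itself: the checkpoint is loaded in float16 unconditionally, but half precision is only dependable on CUDA; on a CPU-only Space, float16 generation can error out or run extremely slowly. A minimal device-aware loading sketch, assuming the same Salesforce/blip2-opt-2.7b checkpoint, with the dtype switch as the only change:

import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration

# Pick the dtype from the hardware: float16 on GPU, float32 on CPU
# (assumption: half-precision generate is not reliable on CPU backends)
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=dtype
).to(device)

Moving the model to the device once at startup, as above, would also avoid the per-request model.to(device) call inside predict.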
 
requirements.txt CHANGED
@@ -1,4 +1,4 @@
-transformers
+gradio>=4.0
+transformers>=4.30
 torch
-peft
-gradio
+pillow
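pillow is the new runtime dependency here: with gr.Image(type="pil"), Gradio hands predict a PIL.Image. A quick smoke test of the pipeline outside Gradio (a sketch only: example.jpg is a hypothetical local file, and predict is assumed to be defined as in app.py above):

from PIL import Image

img = Image.open("example.jpg")  # hypothetical test image
print(predict(img))                                 # no question -> caption
print(predict(img, "What is shown in the image?"))  # question -> VQA answer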