ProfRom committed on
Commit
59ea8cc
·
verified ·
1 Parent(s): 1aa944c

Anderson - Final submission

Browse files
Files changed (1) hide show
  1. app.py +29 -125
app.py CHANGED
@@ -1,135 +1,39 @@
1
- import time
2
  import torch
3
- from transformers import (
4
- BlipProcessor,
5
- BlipForConditionalGeneration,
6
- BlipForQuestionAnswering,
7
- pipeline,
8
- )
9
  import gradio as gr
 
10
 
11
- TORCH_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
12
- PIPELINE_DEVICE = 0 if TORCH_DEVICE == "cuda" else -1
13
- DEVICE_LABEL = "GPU (CUDA)" if TORCH_DEVICE == "cuda" else "CPU"
14
-
15
 
16
- print(f"[startup] Loading models on {DEVICE_LABEL}...")
17
-
18
- caption_processor = BlipProcessor.from_pretrained(
19
- "Salesforce/blip-image-captioning-base"
20
- )
21
- caption_model = (
22
- BlipForConditionalGeneration.from_pretrained(
23
- "Salesforce/blip-image-captioning-base"
24
- )
25
- .to(TORCH_DEVICE)
26
- .eval()
27
- )
28
-
29
- vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
30
- vqa_model = (
31
- BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
32
- .to(TORCH_DEVICE)
33
- .eval()
34
- )
35
-
36
- sentiment = pipeline(
37
- task="sentiment-analysis",
38
- model="distilbert-base-uncased-finetuned-sst-2-english",
39
- device=PIPELINE_DEVICE,
40
- )
41
-
42
- print("[startup] Models loaded.")
43
-
44
 
45
- @torch.no_grad()
46
- def generate_caption(image):
47
- inputs = caption_processor(images=image, return_tensors="pt").to(TORCH_DEVICE)
48
- output_ids = caption_model.generate(**inputs, max_new_tokens=50)
49
- return caption_processor.decode(output_ids[0], skip_special_tokens=True).strip()
50
-
51
-
52
- @torch.no_grad()
53
- def answer_question(image, question):
54
- inputs = vqa_processor(images=image, text=question, return_tensors="pt").to(
55
- TORCH_DEVICE
56
- )
57
- output_ids = vqa_model.generate(**inputs, max_new_tokens=20)
58
- return vqa_processor.decode(output_ids[0], skip_special_tokens=True).strip()
59
-
60
 
61
- def analyze(image, question):
62
  if image is None:
63
- return "You need to upload an image big dawg.", "", "", "", ""
64
-
65
- if not question or not question.strip():
66
- return "", "This is not optional. Ask me a question about the picture you uploaded.", "", "", ""
67
-
68
- image = image.convert("RGB")
69
- question = question.strip()
70
-
71
- timings = {}
72
-
73
- t0 = time.perf_counter()
74
- caption = generate_caption(image)
75
- timings["caption"] = time.perf_counter() - t0
76
-
77
- t0 = time.perf_counter()
78
- answer = answer_question(image, question)
79
- timings["vqa"] = time.perf_counter() - t0
80
-
81
- t0 = time.perf_counter()
82
- cap_sent = sentiment(caption)[0]
83
- cap_sent_str = f"{cap_sent['label']} ({cap_sent['score']:.2f})"
84
-
85
- ans_sent = sentiment(answer)[0]
86
- ans_sent_str = f"{ans_sent['label']} ({ans_sent['score']:.2f})"
87
-
88
- timings["sentiment"] = time.perf_counter() - t0
89
-
90
-
91
- latency_str = (
92
- f"Caption: {timings['caption']:.2f}s | "
93
- f"VQA: {timings['vqa']:.2f}s | "
94
- f"Sentiment: {timings['sentiment']:.2f}s | "
95
- f"Total: {sum(timings.values()):.2f}s ({DEVICE_LABEL})"
96
- )
97
-
98
- return caption, answer, cap_sent_str, ans_sent_str, latency_str
99
-
100
 
101
- DESCRIPTION = """
102
- # Multimodal Image Understanding Pipeline
103
-
104
- Upload an image and ask a question about the uploaded image. The app returns an image caption,
105
- answers your question, analyzes sentiment, and reports latency.
106
- """
107
-
108
- with gr.Blocks(title="Multimodal Image Understanding") as demo:
109
- gr.Markdown(DESCRIPTION)
110
-
111
- with gr.Row():
112
- with gr.Column():
113
- image_in = gr.Image(type="pil", label="Image")
114
- question_in = gr.Textbox(
115
- label="Question",
116
- placeholder="What was that one movie with Billy Crystal?",
117
- )
118
- submit_btn = gr.Button("Analyze This!", variant="secondary")
119
-
120
- with gr.Column():
121
- caption_out = gr.Textbox(label="Generated caption")
122
- answer_out = gr.Textbox(label="Answer to question")
123
- cap_sent_out = gr.Textbox(label="Sentiment of caption")
124
- ans_sent_out = gr.Textbox(label="Sentiment of answer")
125
- timing_out = gr.Textbox(label="Latency breakdown")
126
-
127
- submit_btn.click(
128
- fn=analyze,
129
- inputs=[image_in, question_in],
130
- outputs=[caption_out, answer_out, cap_sent_out, ans_sent_out, timing_out],
131
- )
132
-
133
-
134
  if __name__ == "__main__":
135
  demo.launch()
 
 
1
  import torch
 
 
 
 
 
 
2
  import gradio as gr
3
+ from transformers import BlipProcessor, BlipForQuestionAnswering
4
 
5
# Checkpoint shared by the processor and the VQA model.
model_name = "Salesforce/blip-vqa-base"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and the model from the same checkpoint, then place the
# model on the selected device (``.to`` returns the module itself).
processor = BlipProcessor.from_pretrained(model_name)
model = BlipForQuestionAnswering.from_pretrained(model_name).to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
def answer_question(image, question):
    """Answer a free-text question about an image with the loaded VQA model.

    Args:
        image: The image to inspect, or ``None`` when nothing was uploaded.
            The UI declares this input with ``type="pil"``, so this is
            presumably a PIL image (verify against the caller).
        question: The question text. ``None``, empty, and whitespace-only
            values are all treated as "no question given".

    Returns:
        The decoded answer string, or a short prompt asking the user to
        supply whichever input is missing.
    """
    if image is None:
        return "Please upload an image."

    # A whitespace-only question is as useless as an empty one; reject it
    # here instead of running the model on it.
    question = question.strip() if question else ""
    if not question:
        return "Please type a question about the image."

    inputs = processor(images=image, text=question, return_tensors="pt").to(device)

    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=20)

    return processor.decode(output[0], skip_special_tokens=True)
26
+
27
# Input widgets, listed in the order answer_question takes its arguments.
_image_input = gr.Image(type="pil", label="Upload an image")
_question_input = gr.Textbox(label="Question", placeholder="e.g. What animal is this?")

# Single-function UI: both inputs are passed to answer_question and its
# return value is shown in the "Answer" textbox.
demo = gr.Interface(
    title="BLIP Visual Question Answering",
    description="Ask a question about an uploaded image using Salesforce/blip-vqa-base.",
    fn=answer_question,
    inputs=[_image_input, _question_input],
    outputs=gr.Textbox(label="Answer"),
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()