ProfRom committed on
Commit
1aa944c
·
verified ·
1 Parent(s): 32d3122

Deleon - Final submission

Browse files
Files changed (1) hide show
  1. app.py +125 -32
app.py CHANGED
@@ -1,42 +1,135 @@
 
1
  import torch
2
- from transformers import BlipProcessor, BlipForQuestionAnswering
 
 
 
 
 
3
  import gradio as gr
4
- from PIL import Image
5
 
6
# Resolve the compute device up front: CUDA when torch reports one, CPU
# otherwise (the free Spaces tier is CPU-only, so the fallback matters).
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the BLIP VQA processor and model from the same checkpoint, then place
# the model on the selected device.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
 
 
 
 
 
 
 
 
def answer_question(image, question):
    """Answer a free-text question about an uploaded image.

    Args:
        image: The uploaded picture, or ``None`` when nothing was uploaded.
        question: The text typed by the user; may be ``None`` or empty.

    Returns:
        The decoded model answer, or a short prompt asking for whichever
        input is missing.
    """
    if image is None:
        return "Please upload an image."

    # Normalise before validating so a question made only of spaces or
    # newlines is rejected instead of being sent to the model.
    question = question.strip() if question else ""
    if not question:
        return "Please enter a question."

    # Encode image + question as PyTorch tensors on the model's device.
    inputs = processor(image, question, return_tensors="pt").to(device)

    # Generate answer tokens and decode the first (only) sequence.
    output = model.generate(**inputs)
    return processor.decode(output[0], skip_special_tokens=True)
28
-
29
# Gradio Interface: build the widgets first, then hand them to gr.Interface.
image_input = gr.Image(type="pil", label="Upload an image")
question_input = gr.Textbox(
    label="Question",
    placeholder="Example: What is in this image?",
)
answer_output = gr.Textbox(label="Answer")

demo = gr.Interface(
    fn=answer_question,
    inputs=[image_input, question_input],
    outputs=answer_output,
    title="BLIP Visual Question Answering",
    description="Upload an image and ask a question about it using a multimodal AI model.",
)


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
 
1
+ import time
2
  import torch
3
+ from transformers import (
4
+ BlipProcessor,
5
+ BlipForConditionalGeneration,
6
+ BlipForQuestionAnswering,
7
+ pipeline,
8
+ )
9
  import gradio as gr
 
10
 
11
# Decide once where inference runs. torch wants a device string, the
# transformers pipeline wants an index (0 = first GPU, -1 = CPU), and the UI
# shows a human-readable label.
if torch.cuda.is_available():
    TORCH_DEVICE, PIPELINE_DEVICE, DEVICE_LABEL = "cuda", 0, "GPU (CUDA)"
else:
    TORCH_DEVICE, PIPELINE_DEVICE, DEVICE_LABEL = "cpu", -1, "CPU"


print(f"[startup] Loading models on {DEVICE_LABEL}...")

# Image captioning: BLIP processor + conditional-generation model.
caption_processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
caption_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
caption_model = caption_model.to(TORCH_DEVICE)
caption_model = caption_model.eval()

# Visual question answering: BLIP processor + VQA model.
vqa_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
vqa_model = vqa_model.to(TORCH_DEVICE)
vqa_model = vqa_model.eval()

# Text sentiment classifier applied to the caption and the answer.
sentiment = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=PIPELINE_DEVICE,
)

print("[startup] Models loaded.")
43
+
44
 
45
def generate_caption(image):
    """Produce a caption for *image* with the captioning model.

    Args:
        image: Picture handed to ``caption_processor`` as ``images``.

    Returns:
        The decoded caption with special tokens removed and surrounding
        whitespace stripped.
    """
    # Inference only, so gradient tracking is switched off for the whole call.
    with torch.no_grad():
        model_inputs = caption_processor(images=image, return_tensors="pt")
        model_inputs = model_inputs.to(TORCH_DEVICE)
        # Generation is capped at 50 new tokens.
        token_ids = caption_model.generate(**model_inputs, max_new_tokens=50)
        text = caption_processor.decode(token_ids[0], skip_special_tokens=True)
    return text.strip()
50
+
51
+
52
def answer_question(image, question):
    """Answer *question* about *image* with the VQA model.

    Args:
        image: Picture handed to ``vqa_processor`` as ``images``.
        question: Question text handed to ``vqa_processor`` as ``text``.

    Returns:
        The decoded answer with special tokens removed and surrounding
        whitespace stripped.
    """
    # Inference only, so gradient tracking is switched off for the whole call.
    with torch.no_grad():
        encoded = vqa_processor(images=image, text=question, return_tensors="pt")
        encoded = encoded.to(TORCH_DEVICE)
        # Generation is capped at 20 new tokens.
        generated = vqa_model.generate(**encoded, max_new_tokens=20)
        decoded = vqa_processor.decode(generated[0], skip_special_tokens=True)
    return decoded.strip()
59
+
60
+
61
def _timed(fn, *args):
    """Call ``fn(*args)`` and return ``(result, elapsed_seconds)``."""
    started = time.perf_counter()
    result = fn(*args)
    return result, time.perf_counter() - started


def _describe_sentiment(text):
    """Classify *text* and format the top prediction as ``LABEL (0.00)``."""
    top = sentiment(text)[0]
    return f"{top['label']} ({top['score']:.2f})"


def _describe_sentiments(caption, answer):
    """Return the formatted sentiment of the caption and of the answer."""
    return _describe_sentiment(caption), _describe_sentiment(answer)


def analyze(image, question):
    """Caption the image, answer the question, and score both for sentiment.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
        question: The question text; may be ``None``, empty or whitespace.

    Returns:
        A 5-tuple ``(caption, answer, caption_sentiment, answer_sentiment,
        latency)`` of strings, in the order the UI output boxes are wired.
        When an input is missing, a message fills the caption slot (no
        image) or the answer slot (no question) and the other slots are
        empty strings.
    """
    if image is None:
        return "You need to upload an image big dawg.", "", "", "", ""

    if not question or not question.strip():
        return "", "This is not optional. Ask me a question about the picture you uploaded.", "", "", ""

    image = image.convert("RGB")
    question = question.strip()

    # Each stage is timed separately so the UI can show a latency breakdown.
    timings = {}
    caption, timings["caption"] = _timed(generate_caption, image)
    answer, timings["vqa"] = _timed(answer_question, image, question)
    # A single measurement covers both sentiment calls.
    (cap_sent_str, ans_sent_str), timings["sentiment"] = _timed(
        _describe_sentiments, caption, answer
    )

    latency_str = (
        f"Caption: {timings['caption']:.2f}s | "
        f"VQA: {timings['vqa']:.2f}s | "
        f"Sentiment: {timings['sentiment']:.2f}s | "
        f"Total: {sum(timings.values()):.2f}s ({DEVICE_LABEL})"
    )

    return caption, answer, cap_sent_str, ans_sent_str, latency_str
99
+
100
 
101
# Markdown shown at the top of the page.
DESCRIPTION = (
    "\n"
    "# Multimodal Image Understanding Pipeline\n"
    "\n"
    "Upload an image and ask a question about the uploaded image. "
    "The app returns an image caption,\n"
    "answers your question, analyzes sentiment, and reports latency.\n"
)

with gr.Blocks(title="Multimodal Image Understanding") as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # First column: the two inputs and the button that triggers analysis.
        with gr.Column():
            image_in = gr.Image(type="pil", label="Image")
            question_in = gr.Textbox(
                label="Question",
                placeholder="What was that one movie with Billy Crystal?",
            )
            submit_btn = gr.Button("Analyze This!", variant="secondary")

        # Second column: one textbox per value returned by analyze().
        with gr.Column():
            caption_out = gr.Textbox(label="Generated caption")
            answer_out = gr.Textbox(label="Answer to question")
            cap_sent_out = gr.Textbox(label="Sentiment of caption")
            ans_sent_out = gr.Textbox(label="Sentiment of answer")
            timing_out = gr.Textbox(label="Latency breakdown")

    # The output order must match the tuple order returned by analyze().
    submit_btn.click(
        analyze,
        [image_in, question_in],
        [caption_out, answer_out, cap_sent_out, ans_sent_out, timing_out],
    )


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()