ProfRom committed
Commit ea35d18 · verified
1 Parent(s): f2f22f7

Guimond - Final Assignment submission

Files changed (2)
  1. app.py +169 -314
  2. requirements.txt +9 -9
app.py CHANGED
@@ -1,322 +1,177 @@
-
- # app.py — Lazy Loaded Multimodal AI System
- #
- # Models load ONLY when needed to avoid memory overflow
- # Works on Hugging Face free CPU Spaces
-
- import torch
  import gradio as gr
-
- device = torch.device("cpu")
-
-
- # ---------------------------------------------------------
- # LAZY MODEL LOADERS
- # ---------------------------------------------------------
-
- def load_caption_model():
-     from transformers import BlipProcessor, BlipForConditionalGeneration
-     model_name = "Salesforce/blip-image-captioning-base"
-     processor = BlipProcessor.from_pretrained(model_name)
-     model = BlipForConditionalGeneration.from_pretrained(model_name).to(device)
-     return processor, model
-
-
- def load_sentiment_model():
-     from transformers import pipeline
-     return pipeline(
-         "sentiment-analysis",
-         model="distilbert-base-uncased-finetuned-sst-2-english"
-     )
-
-
- def load_vqa_model():
-     from transformers import BlipProcessor, BlipForQuestionAnswering
-     model_name = "Salesforce/blip-vqa-base"
-     processor = BlipProcessor.from_pretrained(model_name)
-     model = BlipForQuestionAnswering.from_pretrained(model_name).to(device)
-     return processor, model
-
-
- def load_detr_model():
-     from transformers import DetrImageProcessor, DetrForObjectDetection
-     processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
-     model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50").to(device)
-     return processor, model
-
-
- def load_vit_model():
-     from transformers import ViTImageProcessor, ViTForImageClassification
-     model_name = "google/vit-base-patch16-224"
-     processor = ViTImageProcessor.from_pretrained(model_name)
-     model = ViTForImageClassification.from_pretrained(model_name).to(device)
-     return processor, model
-
-
- # NEW — more verbose, less repetitive rewrite model
- def load_llm():
-     from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-     name = "google/flan-t5-large"
-     tokenizer = AutoTokenizer.from_pretrained(name)
-     model = AutoModelForSeq2SeqLM.from_pretrained(name).to(device)
-     return tokenizer, model
-
-
- # ---------------------------------------------------------
- # TASKS
- # ---------------------------------------------------------
-
- def generate_caption(image):
-     processor, model = load_caption_model()
-     inputs = processor(images=image, return_tensors="pt").to(device)
-     with torch.no_grad():
-         out_ids = model.generate(**inputs, max_new_tokens=30)
-     return processor.decode(out_ids[0], skip_special_tokens=True)
-
-
- def analyze_sentiment(text):
-     sentiment = load_sentiment_model()
-     out = sentiment(text)[0]
-     return out["label"], round(out["score"] * 100, 2)
-
-
- def vqa_answer(image, question):
-     processor, model = load_vqa_model()
-     inputs = processor(images=image, text=question, return_tensors="pt").to(device)
-     with torch.no_grad():
-         out = model.generate(**inputs)
-     return processor.decode(out[0], skip_special_tokens=True)
-
-
- def detect_objects(image):
-     processor, model = load_detr_model()
-     inputs = processor(images=image, return_tensors="pt").to(device)
-
-     with torch.no_grad():
-         outputs = model(**inputs)
-
-     target_sizes = torch.tensor([image.size[::-1]])
-     results = processor.post_process_object_detection(outputs, target_sizes=target_sizes)[0]
-
-     detections = []
-     for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
-         if score > 0.3:
-             detections.append(
-                 f"{model.config.id2label[label.item()]} (score {round(score.item(), 2)})"
-             )
-     if len(detections) == 0:
-         return ["No high-confidence objects detected"]
-     return detections
-
-
- def classify_scene(image):
-     processor, model = load_vit_model()
-     inputs = processor(images=image, return_tensors="pt").to(device)
-     with torch.no_grad():
-         logits = model(**inputs).logits
-     label = logits.argmax(-1).item()
-     return model.config.id2label[label]
-
-
- # ---------------------------------------------------------
- # REWRITE CAPTIONS (8 STYLE SYSTEM + LENGTH SLIDER)
- # ---------------------------------------------------------
-
- def _build_style_prompt(caption, style):
-     base = (
-         "Rewrite the following image caption. "
-         "Keep the original meaning and important details, "
-         "but change the wording significantly and avoid repeating sentences verbatim. "
-         "Do not just copy the original text.\n\n"
-         f"Original caption:\n{caption}\n\n"
-     )
-
-     if style == "Short":
-         return (
-             base
-             + "Now produce a shorter, compact version in one or two sentences."
-         )
-     elif style == "Creative":
-         return (
-             base
-             + "Rewrite it in a colorful, imaginative, and richly descriptive style."
          )
-     elif style == "Technical":
-         return (
-             base
-             + "Rewrite it in a highly technical, analytical style using precise visual terminology."
-         )
-     elif style == "Humorous":
-         return (
-             base
-             + "Rewrite it with a fun, humorous, witty tone while keeping the meaning."
-         )
-     elif style == "Poetic":
-         return (
-             base
-             + "Rewrite it in a poetic, rhythmic, metaphorical style using sensory language."
-         )
-     elif style == "Cinematic":
-         return (
-             base
-             + "Rewrite it as if describing an epic cinematic movie scene with dramatic, vivid imagery."
-         )
-     elif style == "Journalistic":
-         return (
-             base
-             + "Rewrite it in a factual, neutral, journalistic news-reporting style."
-         )
-     elif style == "Academic":
-         return (
-             base
-             + "Rewrite it in a formal, academic style with clear, analytical phrasing."
-         )
-     else:
-         # Fallback: treat unknown style as creative rewrite
-         return (
-             base
-             + "Rewrite it in a natural, descriptive style."
-         )
-
-
- def rewrite_caption(caption, style, length):
-     tokenizer, model = load_llm()
-
-     prompt = _build_style_prompt(caption, style)
-
-     # Tokenize
-     inputs = tokenizer(prompt, return_tensors="pt").to(device)
-
-     # First pass: normal creative decoding
-     with torch.no_grad():
-         outputs = model.generate(
-             **inputs,
-             max_new_tokens=length,
-             do_sample=True,
-             temperature=0.9,
-             top_p=0.9,
-             no_repeat_ngram_size=3,
-             repetition_penalty=1.2,
-         )
-
-     rewritten = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-
-     # If the model basically echoed the caption, try a second, more forceful pass.
-     if rewritten.lower().strip() == caption.lower().strip():
-         strong_prompt = (
-             "Paraphrase and expand the following caption. "
-             "Use different wording and add extra detail, but keep the meaning. "
-             "Do not repeat the original sentence exactly.\n\n"
-             f"Original caption:\n{caption}"
-         )
-         strong_inputs = tokenizer(strong_prompt, return_tensors="pt").to(device)
-
-         with torch.no_grad():
-             outputs2 = model.generate(
-                 **strong_inputs,
-                 max_new_tokens=length,
-                 do_sample=True,
-                 temperature=1.0,
-                 top_p=0.95,
-                 no_repeat_ngram_size=3,
-                 repetition_penalty=1.3,
-             )
-         rewritten2 = tokenizer.decode(outputs2[0], skip_special_tokens=True).strip()
-
-         # Only replace if it actually changed something
-         if rewritten2 and rewritten2.lower().strip() != caption.lower().strip():
-             rewritten = rewritten2
-
-     return rewritten

-
- def extract_metadata(image):
-     width, height = image.size
-     meta = f"Dimensions: {width} x {height}\n"
-     meta += "EXIF data detected\n" if "exif" in image.info else "No EXIF data available\n"
-     return meta
-
-
- # ---------------------------------------------------------
- # MAIN LOOP
- # ---------------------------------------------------------
-
- def process_all(image, question, style, length):
-     if image is None:
-         return ["No image"] * 8
-
-     caption = generate_caption(image)
-     sentiment_label, sentiment_score = analyze_sentiment(caption)
-     vqa = vqa_answer(image, question) if question else "No question asked"
-     objects = detect_objects(image)
-     scene = classify_scene(image)
-     rewritten = rewrite_caption(caption, style, length)
-     metadata = extract_metadata(image)
-
-     return caption, sentiment_label, sentiment_score, vqa, objects, scene, rewritten, metadata
-
-
- # ---------------------------------------------------------
- # GRADIO UI
- # ---------------------------------------------------------
-
- with gr.Blocks(title="Multimodal AI System (Lazy Loaded)") as demo:
-     gr.Markdown("# **Multimodal AI System**")
-
-     with gr.Row():
-         image_input = gr.Image(type="pil", label="Upload Image")
-         question_input = gr.Textbox(label="Ask a Question")
-
-     style_input = gr.Dropdown(
-         [
-             "Short",
-             "Creative",
-             "Technical",
-             "Humorous",
-             "Poetic",
-             "Cinematic",
-             "Journalistic",
-             "Academic"
-         ],
-         label="Rewrite Style"
-     )
-
-     # New: length slider
-     length_slider = gr.Slider(
-         minimum=20,
-         maximum=200,
-         value=80,
-         step=10,
-         label="Rewrite Length (Max Tokens)"
      )

-     run_btn = gr.Button("Run All Tools")
-
-     caption = gr.Textbox(label="Generated Caption")
-     sentiment_label = gr.Textbox(label="Sentiment Label")
-     sentiment_score = gr.Number(label="Sentiment Score")
-     vqa_output = gr.Textbox(label="VQA Answer")
-     objects_output = gr.JSON(label="Detected Objects")
-     scene_output = gr.Textbox(label="Scene Classification")
-     rewritten_output = gr.Textbox(label="Rewritten Caption")
-     metadata_output = gr.Textbox(label="Image Metadata")
-
-     run_btn.click(
-         process_all,
-         [image_input, question_input, style_input, length_slider],
-         [
-             caption,
-             sentiment_label,
-             sentiment_score,
-             vqa_output,
-             objects_output,
-             scene_output,
-             rewritten_output,
-             metadata_output
-         ]
      )
-

  if __name__ == "__main__":
-     demo.launch()
-

+ # ==============================================================================
+ # Josh Guimond
+ # Unit 8 Assignment: End-to-End AI Solution Implementation
+ # ARIN 460
+ # 12/03/2025
+
+ # Description: This script implements a multimodal AI web app using Gradio to
+ # run two image captioning models, a text “vibe” classifier, and NLP metrics on
+ # uploaded images, allowing direct comparison of model captions to ground-truth
+ # descriptions.
+ # ==============================================================================
+
+ # Video: https://youtu.be/pXCO00lK2UE
+ # Space: https://huggingface.co/spaces/jguimond/assignment_8_v3
+
+ # ==============================================================================
+ # SECTION 1: SETUP & INSTALLATIONS
+ # ==============================================================================
+ # Install libraries
  import gradio as gr
+ from transformers import pipeline, AutoTokenizer, AutoModelForImageTextToText
+ from sentence_transformers import SentenceTransformer, util
+ import evaluate
+ import warnings
+ import logging
+
+ # Filter out the "FutureWarning" and "UserWarning" to keep the console clean
+ warnings.filterwarnings("ignore", category=FutureWarning)
+ warnings.filterwarnings("ignore", category=UserWarning)
+ logging.getLogger("transformers").setLevel(logging.ERROR)
+
+
+ # ==============================================================================
+ # SECTION 2: LOAD MODELS
+ # ==============================================================================
+
+ # --- 1. Load Image Captioning Models ---
+
+ # Model 1: BLIP (Base)
+ print("Loading Model 1 (BLIP)...")
+ captioner_model1 = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
+
+ # Model 2: ViT-GPT2 (With Tokenizer Fix)
+ print("Loading Model 2 (ViT-GPT2)...")
+ # Load the tokenizer manually to set the pad_token and fix the warning
+ vit_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+ vit_tokenizer.pad_token = vit_tokenizer.eos_token  # <--- THE FIX
+ captioner_model2 = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", tokenizer=vit_tokenizer)
+
+ # --- 2. Load NLP Analysis Models (Unit 4 Techniques) ---
+
+ # A. Zero-Shot Classifier (For Nuanced Vibe/Sentiment)
+ print("Loading Zero-Shot Classifier...")
+ classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33")
+
+ # B. Semantic Similarity (For Model Agreement)
+ print("Loading Sentence Transformer...")
+ similarity_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
+
+ # C. ROUGE Metric (For Accuracy vs Ground Truth)
+ print("Loading ROUGE Metric...")
+ rouge = evaluate.load("rouge")
+
+ # Define Nuanced Labels based on the image list
+ # These cover: Peaceful dog, Sad funeral, Happy kids, Angry man, Scared people, Fighting tigers
+ VIBE_LABELS = ["Peaceful/Calm", "Happy/Joy", "Sad/Sorrow", "Angry/Upset", "Fear/Scared", "Action/Violence"]
+
+ # ==============================================================================
+ # SECTION 3: ANALYSIS FUNCTIONS
+ # ==============================================================================
+
+ # --- Analysis Function ---
+ def analyze_image(image, ground_truth):
+
+     # -- A. Generate Captions --
+     res1 = captioner_model1(image)
+     cap1 = res1[0]['generated_text']
+
+     res2 = captioner_model2(image)
+     cap2 = res2[0]['generated_text']
+
+     # -- B. Analyze Vibe (Zero-Shot) --
+     # Model 1 Vibe
+     vibe1_result = classifier(cap1, VIBE_LABELS)
+     vibe1_label = vibe1_result['labels'][0]
+     vibe1_score = vibe1_result['scores'][0]
+
+     # Model 2 Vibe
+     vibe2_result = classifier(cap2, VIBE_LABELS)
+     vibe2_label = vibe2_result['labels'][0]
+     vibe2_score = vibe2_result['scores'][0]
+
+     # -- C. Calculate Statistics --
+
+     # 1. Semantic Similarity (Do the models agree?)
+     emb1 = similarity_model.encode(cap1, convert_to_tensor=True)
+     emb2 = similarity_model.encode(cap2, convert_to_tensor=True)
+     sim_score = util.pytorch_cos_sim(emb1, emb2).item()
+
+     # 2. ROUGE Scores (How accurate are they vs Ground Truth?)
+     rouge_output = "N/A (No Ground Truth provided)"
+     if ground_truth and ground_truth.strip() != "":
+         # Calculate scores
+         r1 = rouge.compute(predictions=[cap1], references=[ground_truth])
+         r2 = rouge.compute(predictions=[cap2], references=[ground_truth])
+
+         # Format the ROUGE output nicely
+         rouge_output = (
+             f"Model 1 ROUGE-L: {r1['rougeL']:.3f}\n"
+             f"Model 2 ROUGE-L: {r2['rougeL']:.3f}\n"
+             f"(Higher is better)"
          )

+     # -- D. Format Output Strings --
+     # Create clean, formatted strings for the large textboxes
+
+     out1 = (
+         f"CAPTION: {cap1}\n"
+         f"-----------------------------\n"
+         f"DETECTED VIBE: {vibe1_label}\n"
+         f"CONFIDENCE: {vibe1_score:.1%}"
      )

+     out2 = (
+         f"CAPTION: {cap2}\n"
+         f"-----------------------------\n"
+         f"DETECTED VIBE: {vibe2_label}\n"
+         f"CONFIDENCE: {vibe2_score:.1%}"
      )
+
+     stats = (
+         f"--- 1. MODEL AGREEMENT (Semantic Similarity) ---\n"
+         f"Score: {sim_score:.3f}\n"
+         f"(Scale: 0.0 = Different, 1.0 = Identical)\n\n"
+         f"--- 2. OBJECT IDENTIFICATION ACCURACY (ROUGE) ---\n"
+         f"Ground Truth: '{ground_truth}'\n"
+         f"{rouge_output}"
+     )
+
+     return out1, out2, stats
+
+ # ==============================================================================
+ # SECTION 4: GRADIO INTERFACE
+ # ==============================================================================
+
+ # Define Inputs
+ image_input = gr.Image(type="pil", label="Upload Image")
+ text_input = gr.Textbox(label="Ground Truth Description", placeholder="e.g. 'A peaceful dog on a beach'")
+
+ # Define Outputs with LARGER viewing areas (lines=5 or 10)
+ output_m1 = gr.Textbox(label="Model 1 (BLIP) Analysis", lines=4)
+ output_m2 = gr.Textbox(label="Model 2 (ViT-GPT2) Analysis", lines=4)
+ output_stats = gr.Textbox(label="Comparison Metrics & Statistics", lines=10)
+
+ # Create Interface
+ interface = gr.Interface(
+     fn=analyze_image,
+     inputs=[image_input, text_input],
+     outputs=[output_m1, output_m2, output_stats],
+     title="Multimodal AI: Nuanced Image Analysis",
+     description="This application uses two Image Captioning models (BLIP & ViT-GPT2) to identify objects, Zero-Shot Classification to detect emotional vibes (Happy, Sad, Angry, etc.), and calculates ROUGE/Similarity metrics.",
+     examples=[
+         ["images/1.png", "A peaceful dog on a sunny beach"],
+         ["images/2.png", "Sad men carrying a casket at a funeral"],
+         ["images/3.png", "Happy kids at a birthday party"],
+         ["images/4.png", "An angry man in a car"],
+         ["images/5.png", "Two people happy mountain biking"],
+         ["images/6.png", "A man upset about his food at a restaurant"],
+         ["images/7.png", "A couple happy at a restaurant"],
+         ["images/8.png", "A sad woman reading a book"],
+         ["images/9.png", "People scared at a movie"],
+         ["images/10.png", "Two tigers fighting"]
+     ]
+ )

  if __name__ == "__main__":
+     interface.launch()
 
requirements.txt CHANGED
@@ -1,11 +1,11 @@
- torch
- torchvision
  transformers
- timm
  gradio
- Pillow
- numpy
- scipy
- accelerate
- pycocotools
- exifread
 
+ # requirements.txt
+
  transformers
+ torch
  gradio
+ pillow
+ sentence-transformers
+ evaluate
+ rouge_score
+ absl-py
+ scikit-learn