ProfRom committed on
Commit 648030e · verified · 1 Parent(s): 13a9703

Poojary - Final Assignment submission

Files changed (1)
  1. app.py +95 -173
app.py CHANGED
@@ -1,177 +1,99 @@
- # ==============================================================================
- # Josh Guimond
- # Unit 8 Assignment: End-to-End AI Solution Implementation
- # ARIN 460
- # 12/03/2025
-
- # Description: This script implements a multimodal AI web app using Gradio to
- # run two image captioning models, a text “vibe” classifier, and NLP metrics on
- # uploaded images, allowing direct comparison of model captions to ground-truth
- # descriptions.
- # ==============================================================================
-
- # Video: https://youtu.be/pXCO00lK2UE
- # Space: https://huggingface.co/spaces/jguimond/assignment_8_v3
-
- # ==============================================================================
- # SECTION 1: SETUP & INSTALLATIONS
- # ==============================================================================
- # Install libraries
  import gradio as gr
- from transformers import pipeline, AutoTokenizer, AutoModelForImageTextToText
- from sentence_transformers import SentenceTransformer, util
- import evaluate
- import warnings
- import logging
-
- # Filter out the "FutureWarning" and "UserWarning" to keep the console clean
- warnings.filterwarnings("ignore", category=FutureWarning)
- warnings.filterwarnings("ignore", category=UserWarning)
- logging.getLogger("transformers").setLevel(logging.ERROR)
-
-
- # ==============================================================================
- # SECTION 2: LOAD MODELS
- # ==============================================================================
-
- # --- 1. Load Image Captioning Models ---
-
- # Model 1: BLIP (Base)
- print("Loading Model 1 (BLIP)...")
- captioner_model1 = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
-
- # Model 2: ViT-GPT2 (With Tokenizer Fix)
- print("Loading Model 2 (ViT-GPT2)...")
- # Load the tokenizer manually to set the pad_token and fix the warning
- vit_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
- vit_tokenizer.pad_token = vit_tokenizer.eos_token # <--- THE FIX
- captioner_model2 = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning", tokenizer=vit_tokenizer)
-
- # --- 2. Load NLP Analysis Models (Unit 4 Techniques) ---
-
- # A. Zero-Shot Classifier (For Nuanced Vibe/Sentiment)
- print("Loading Zero-Shot Classifier...")
- classifier = pipeline("zero-shot-classification", model="MoritzLaurer/deberta-v3-xsmall-zeroshot-v1.1-all-33")
-
- # B. Semantic Similarity (For Model Agreement)
- print("Loading Sentence Transformer...")
- similarity_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-
- # C. ROUGE Metric (For Accuracy vs Ground Truth)
- print("Loading ROUGE Metric...")
- rouge = evaluate.load("rouge")
-
- # Define Nuanced Labels based on the image list
- # These cover: Peaceful dog, Sad funeral, Happy kids, Angry man, Scared people, Fighting tigers
- VIBE_LABELS = ["Peaceful/Calm", "Happy/Joy", "Sad/Sorrow", "Angry/Upset", "Fear/Scared", "Action/Violence"]
-
- # ==============================================================================
- # SECTION 3: ANALYSIS FUNCTIONS
- # ==============================================================================
-
- # --- Analysis Function ---
- def analyze_image(image, ground_truth):
-
-     # -- A. Generate Captions --
-     res1 = captioner_model1(image)
-     cap1 = res1[0]['generated_text']
-
-     res2 = captioner_model2(image)
-     cap2 = res2[0]['generated_text']
-
-     # -- B. Analyze Vibe (Zero-Shot) --
-     # Model 1 Vibe
-     vibe1_result = classifier(cap1, VIBE_LABELS)
-     vibe1_label = vibe1_result['labels'][0]
-     vibe1_score = vibe1_result['scores'][0]
-
-     # Model 2 Vibe
-     vibe2_result = classifier(cap2, VIBE_LABELS)
-     vibe2_label = vibe2_result['labels'][0]
-     vibe2_score = vibe2_result['scores'][0]
-
-     # -- C. Calculate Statistics --
-
-     # 1. Semantic Similarity (Do the models agree?)
-     emb1 = similarity_model.encode(cap1, convert_to_tensor=True)
-     emb2 = similarity_model.encode(cap2, convert_to_tensor=True)
-     sim_score = util.pytorch_cos_sim(emb1, emb2).item()
-
-     # 2. ROUGE Scores (How accurate are they vs Ground Truth?)
-     rouge_output = "N/A (No Ground Truth provided)"
-     if ground_truth and ground_truth.strip() != "":
-         # Calculate scores
-         r1 = rouge.compute(predictions=[cap1], references=[ground_truth])
-         r2 = rouge.compute(predictions=[cap2], references=[ground_truth])
 
-         # Format the ROUGE output nicely
-         rouge_output = (
-             f"Model 1 ROUGE-L: {r1['rougeL']:.3f}\n"
-             f"Model 2 ROUGE-L: {r2['rougeL']:.3f}\n"
-             f"(Higher is better)"
-         )
-
-     # -- D. Format Output Strings --
-     # Create clean, formatted strings for the large textboxes
-
-     out1 = (
-         f"CAPTION: {cap1}\n"
-         f"-----------------------------\n"
-         f"DETECTED VIBE: {vibe1_label}\n"
-         f"CONFIDENCE: {vibe1_score:.1%}"
-     )
-
-     out2 = (
-         f"CAPTION: {cap2}\n"
-         f"-----------------------------\n"
-         f"DETECTED VIBE: {vibe2_label}\n"
-         f"CONFIDENCE: {vibe2_score:.1%}"
-     )
-
-     stats = (
-         f"--- 1. MODEL AGREEMENT (Semantic Similarity) ---\n"
-         f"Score: {sim_score:.3f}\n"
-         f"(Scale: 0.0 = Different, 1.0 = Identical)\n\n"
-         f"--- 2. OBJECT IDENTIFICATION ACCURACY (ROUGE) ---\n"
-         f"Ground Truth: '{ground_truth}'\n"
-         f"{rouge_output}"
-     )
-
-     return out1, out2, stats
-
- # ==============================================================================
- # SECTION 4: GRADIO INTERFACE
- # ==============================================================================
-
- # Define Inputs
- image_input = gr.Image(type="pil", label="Upload Image")
- text_input = gr.Textbox(label="Ground Truth Description", placeholder="e.g. 'A peaceful dog on a beach'")
-
- # Define Outputs with LARGER viewing areas (lines=5 or 10)
- output_m1 = gr.Textbox(label="Model 1 (BLIP) Analysis", lines=4)
- output_m2 = gr.Textbox(label="Model 2 (ViT-GPT2) Analysis", lines=4)
- output_stats = gr.Textbox(label="Comparison Metrics & Statistics", lines=10)
-
- # Create Interface
- interface = gr.Interface(
-     fn=analyze_image,
-     inputs=[image_input, text_input],
-     outputs=[output_m1, output_m2, output_stats],
-     title="Multimodal AI: Nuanced Image Analysis",
-     description="This application uses two Image Captioning models (BLIP & ViT-GPT2) to identify objects, Zero-Shot Classification to detect emotional vibes (Happy, Sad, Angry, etc.), and calculates ROUGE/Similarity metrics.",
-     examples=[
-         ["images/1.png", "A peaceful dog on a sunny beach"],
-         ["images/2.png", "Sad men carrying a casket at a funeral"],
-         ["images/3.png", "Happy kids at a birthday party"],
-         ["images/4.png", "An angry man in a car"],
-         ["images/5.png", "Two people happy mountain biking"],
-         ["images/6.png", "A man upset about his food at a restaurant"],
-         ["images/7.png", "A couple happy at a restaurant"],
-         ["images/8.png", "A sad woman reading a book"],
-         ["images/9.png", "People scared at a movie"],
-         ["images/10.png", "Two tigers fighting"]
-     ]
  )
 
- if __name__ == "__main__":
-     interface.launch()

  import gradio as gr
+ import torch
+ import os
+ import tempfile
+ from huggingface_hub import login
+ from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering, infer_device, PaliGemmaForConditionalGeneration
+ from accelerate import Accelerator
+
+ # login to Hugging Face using the HF_TOKEN environment variable
+ login(token=os.getenv('HF_TOKEN'))
+
+ # Set the device
+ device = infer_device()
+
+ # MODEL 1: BLIP-VQA
+ processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
+ model = AutoModelForVisualQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base").to(device)
+
+ # Define inference function for Model 1
+ def process_image(image, prompt):
+     # Keep inputs in the default dtype (float32) so they match the model weights loaded above
+     inputs = processor(image, text=prompt, return_tensors="pt").to(device)
+
+     try:
+         # Generate output from the model
+         output = model.generate(**inputs, max_new_tokens=10)
+
+         # Decode and return the output
+         decoded_output = processor.batch_decode(output, skip_special_tokens=True)[0].strip()
+
+         # remove prompt from output
+         if decoded_output.startswith(prompt):
+             return decoded_output[len(prompt):].strip()
+         return decoded_output
+     except Exception as e:
+         print(f"Error in Model 1: {e}")
+         return "An error occurred during processing for Model 1."
+
+
+ # MODEL 2: PaliGemma
+ # Load the processor from the same checkpoint as the model so the two stay consistent
+ processor2 = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224")
+ model2 = PaliGemmaForConditionalGeneration.from_pretrained(
+     "google/paligemma-3b-mix-224",
+     torch_dtype=torch.bfloat16
+ ).to(device)
+
+
+ # Define inference function for Model 2
+ def process_image2(image, prompt):
+     inputs2 = processor2(
+         text=prompt,
+         images=image,
+         return_tensors="pt"
+     ).to(device, model2.dtype)
+
+     try:
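+         # Generate an answer, then decode only the newly generated tokens (the slice drops the prompt tokens)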
+         output = model2.generate(**inputs2, max_new_tokens=10)
+         decoded_output = processor2.batch_decode(
+             output[:, inputs2["input_ids"].shape[1]:],
+             skip_special_tokens=True
+         )[0].strip()
+
+         return decoded_output
+     except Exception as e:
+         print(f"Error in Model 2: {e}")
+         return "An error occurred during processing for Model 2. Ensure your hardware supports bfloat16 or adjust the torch_dtype."
+
+
+ # GRADIO INTERFACE
+ inputs_model1 = [
+     gr.Image(type="pil"),
+     gr.Textbox(label="Prompt", placeholder="Enter your question")
+ ]
+ inputs_model2 = [
+     gr.Image(type="pil"),
+     gr.Textbox(label="Prompt", placeholder="Enter your question")
+ ]
+
+ outputs_model1 = gr.Textbox(label="Answer")
+ outputs_model2 = gr.Textbox(label="Answer")
+
+ # Create the Gradio apps for each model
+ model1_inf = gr.Interface(
+     fn=process_image,
+     inputs=inputs_model1,
+     outputs=outputs_model1,
+     title="Model 1: BLIP-VQA-Base",
+     description="Ask a question about the uploaded image using BLIP."
  )

+ model2_inf = gr.Interface(
+     fn=process_image2,
+     inputs=inputs_model2,
+     outputs=outputs_model2,
+     title="Model 2: PaliGemma",
+     description="Ask a question about the uploaded image using PaliGemma."
+ )
+
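+ # Combine both model apps into a single tabbed interface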
+ demo = gr.TabbedInterface([model1_inf, model2_inf], ["Model 1 (BLIP)", "Model 2 (PaliGemma)"])
+ demo.launch(share=True)
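
For a quick sanity check of the two inference functions outside the web UI, a few lines like the following could be appended temporarily to the new app.py, just before the demo.launch call. This is only a sketch: it assumes HF_TOKEN is set, access to the gated PaliGemma weights has been granted, and "example.jpg" is a placeholder for any local test image.

from PIL import Image

test_image = Image.open("example.jpg").convert("RGB")  # placeholder path, not part of the repo

print("BLIP-VQA:", process_image(test_image, "What is in the picture?"))
print("PaliGemma:", process_image2(test_image, "What is in the picture?"))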