ProfRom committed on
Commit
a2b25d5
·
verified ·
1 Parent(s): da5e151

Diaz - Final submission

Browse files
Files changed (1) hide show
  1. app.py +34 -209
app.py CHANGED
@@ -1,217 +1,42 @@
1
-
2
- import numpy as np
3
- import librosa
4
  import torch
 
5
  import gradio as gr
6
  from PIL import Image
7
- import requests
8
- from io import BytesIO
9
-
10
- from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
11
-
12
- # Device configuration
13
- torch_device = "cuda" if torch.cuda.is_available() else "cpu"
14
- pipeline_device = 0 if torch.cuda.is_available() else -1
15
-
16
- # ---------- LABEL DEFINITIONS ----------
17
-
18
- CANONICAL_LABELS = ["anger", "happiness", "neutral", "sadness"]
19
-
20
- TEXT_MODEL_LABEL_MAP = {
21
- "anger": "anger",
22
- "joy": "happiness",
23
- "neutral": "neutral",
24
- "sadness": "sadness",
25
- "disgust": None,
26
- "fear": None,
27
- "surprise": None
28
- }
29
-
30
- AUDIO_MODEL_LABEL_MAP = {
31
- "ang": "anger",
32
- "hap": "happiness",
33
- "neu": "neutral",
34
- "sad": "sadness",
35
- "anger": "anger",
36
- "happy": "happiness",
37
- "neutral": "neutral",
38
- "sadness": "sadness"
39
- }
40
-
41
- TEXT_WEIGHT = 0.40
42
- AUDIO_WEIGHT = 0.60
43
-
44
- # ---------- LOAD MODELS ----------
45
-
46
- text_classifier = pipeline(
47
- "text-classification",
48
- model="j-hartmann/emotion-english-distilroberta-base",
49
- top_k=None,
50
- device=pipeline_device
51
- )
52
-
53
- audio_classifier = pipeline(
54
- "audio-classification",
55
- model="superb/wav2vec2-base-superb-er",
56
- device=pipeline_device
57
- )
58
-
59
- image_classifier = pipeline(
60
- "image-classification",
61
- model="google/vit-base-patch16-224",
62
- device=pipeline_device
63
- )
64
-
65
- image_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
66
- image_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(torch_device)
67
-
68
- # ---------- HELPER FUNCTIONS ----------
69
-
70
- def initialize_score_dict():
71
- return {label: 0.0 for label in CANONICAL_LABELS}
72
-
73
- def normalize_text_label(label):
74
- return TEXT_MODEL_LABEL_MAP.get(str(label).lower(), None)
75
-
76
- def normalize_audio_label(label):
77
- return AUDIO_MODEL_LABEL_MAP.get(str(label).lower(), None)
78
-
79
- def format_top_predictions(predictions, top_k=3):
80
- return "\n".join([f"{p['label']} ({p['score']:.4f})" for p in predictions[:top_k]])
81
-
82
- # ---------- TEXT MODEL ----------
83
-
84
- def predict_text_emotion(transcript):
85
- if not transcript or transcript.strip() == "":
86
- return [], initialize_score_dict()
87
-
88
- preds = text_classifier(transcript)
89
-
90
- if isinstance(preds, list) and isinstance(preds[0], list):
91
- preds = preds[0]
92
-
93
- scores = initialize_score_dict()
94
- normalized = []
95
-
96
- for item in preds:
97
- mapped = normalize_text_label(item["label"])
98
- if mapped:
99
- scores[mapped] += item["score"]
100
- normalized.append({"label": mapped, "score": item["score"]})
101
-
102
- return sorted(normalized, key=lambda x: x["score"], reverse=True), scores
103
-
104
- # ---------- AUDIO MODEL ----------
105
-
106
- def predict_audio_emotion(audio):
107
- array = audio["array"]
108
- sr = audio["sampling_rate"]
109
-
110
- if sr != 16000:
111
- array = librosa.resample(array, orig_sr=sr, target_sr=16000)
112
- sr = 16000
113
-
114
- preds = audio_classifier({"array": array, "sampling_rate": sr}, top_k=4)
115
 
116
- scores = initialize_score_dict()
117
- normalized = []
 
118
 
119
- for item in preds:
120
- mapped = normalize_audio_label(item["label"])
121
- if mapped:
122
- scores[mapped] += item["score"]
123
- normalized.append({"label": mapped, "score": item["score"]})
124
 
125
- return sorted(normalized, key=lambda x: x["score"], reverse=True), scores
126
-
127
- # ---------- FUSION ----------
128
-
129
- def fuse_scores(text_scores, audio_scores):
130
- fused_scores = {}
131
-
132
- for label in CANONICAL_LABELS:
133
- fused_scores[label] = (
134
- TEXT_WEIGHT * text_scores.get(label, 0.0) +
135
- AUDIO_WEIGHT * audio_scores.get(label, 0.0)
136
- )
137
-
138
- best_label = max(fused_scores, key=fused_scores.get)
139
- return best_label, fused_scores[best_label]
140
-
141
- # ---------- IMAGE ----------
142
-
143
- def run_image(image):
144
  if image is None:
145
- return "No image.", "No classification.", "No evaluation."
146
-
147
- inputs = image_processor(images=image, return_tensors="pt").to(torch_device)
148
-
149
- with torch.no_grad():
150
- output = image_model.generate(**inputs, max_new_tokens=30)
151
-
152
- caption = image_processor.decode(output[0], skip_special_tokens=True)
153
-
154
- preds = image_classifier(image)[:3]
155
- classification = "\n".join([f"{p['label']} ({p['score']:.4f})" for p in preds])
156
-
157
- return caption, classification, "Completed"
158
-
159
- # ---------- MAIN MULTIMODAL ----------
160
-
161
- def run_audio_text(audio_input, transcript):
162
-
163
- if audio_input is None:
164
- return "No audio provided.", "", "", ""
165
-
166
- sr, audio_array = audio_input
167
-
168
- audio = {
169
- "array": np.asarray(audio_array, dtype=np.float32),
170
- "sampling_rate": int(sr)
171
- }
172
-
173
- text_preds, text_scores = predict_text_emotion(transcript)
174
- audio_preds, audio_scores = predict_audio_emotion(audio)
175
-
176
- fused_label, fused_score = fuse_scores(text_scores, audio_scores)
177
-
178
- return (
179
- transcript if transcript else "No transcript",
180
- format_top_predictions(text_preds),
181
- format_top_predictions(audio_preds),
182
- f"{fused_label.upper()} (confidence: {fused_score:.4f})"
183
- )
184
-
185
- # ---------- UI ----------
186
-
187
- with gr.Blocks() as demo:
188
-
189
- gr.Markdown("# Multimodal AI System")
190
-
191
- with gr.Tabs():
192
-
193
- with gr.Tab("Audio + Text"):
194
- audio = gr.Audio(type="numpy")
195
- text = gr.Textbox()
196
-
197
- out1 = gr.Textbox(label="Transcript")
198
- out2 = gr.Textbox(label="Text Prediction")
199
- out3 = gr.Textbox(label="Audio Prediction")
200
- out4 = gr.Textbox(label="Fused Result")
201
-
202
- btn = gr.Button("Run")
203
-
204
- btn.click(run_audio_text, [audio, text], [out1, out2, out3, out4])
205
-
206
- with gr.Tab("Image Analysis"):
207
- image = gr.Image(type="pil")
208
-
209
- cap = gr.Textbox(label="Caption")
210
- cls = gr.Textbox(label="Classification")
211
- eval = gr.Textbox(label="Status")
212
-
213
- btn2 = gr.Button("Run Image")
214
-
215
- btn2.click(run_image, image, [cap, cls, eval])
216
 
217
- demo.launch()
 
 
 
 
 
1
  import torch
2
+ from transformers import BlipProcessor, BlipForQuestionAnswering
3
  import gradio as gr
4
  from PIL import Image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
# Pick the compute device first: CUDA when torch reports it, otherwise CPU
# (the free Spaces tier is CPU, so the fallback keeps the app safe there).
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the BLIP VQA processor and model weights from the same checkpoint.
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# Place the model on the selected device so inputs moved there match it.
model.to(device)
 
 
13
 
14
def answer_question(image, question):
    """Answer a natural-language question about an image with BLIP VQA.

    Args:
        image: PIL image from the Gradio image input, or None when nothing
            was uploaded.
        question: Question text from the Gradio textbox; may be empty,
            whitespace-only, or None.

    Returns:
        The decoded answer string, or a short prompt asking the user for
        whichever input is missing.
    """
    if image is None:
        return "Please upload an image."

    # A whitespace-only question carries no content; treat it like an empty
    # one instead of sending it to the model.
    question = (question or "").strip()
    if not question:
        return "Please enter a question."

    # Preprocess the image and tokenize the question, then move the tensors
    # to the same device the model was placed on.
    inputs = processor(image, question, return_tensors="pt").to(device)

    # Generate answer token ids and decode the first returned sequence.
    output = model.generate(**inputs)
    return processor.decode(output[0], skip_special_tokens=True)
28
+
29
+ # Gradio Interface
30
# Gradio UI: an image upload and a question box feed answer_question, and
# the returned string is shown in a single answer textbox.
demo = gr.Interface(
    title="BLIP Visual Question Answering",
    description="Upload an image and ask a question about it using a multimodal AI model.",
    fn=answer_question,
    inputs=[
        gr.Image(type="pil", label="Upload an image"),
        gr.Textbox(label="Question", placeholder="Example: What is in this image?"),
    ],
    outputs=gr.Textbox(label="Answer"),
)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()