jatinsabari committed
Commit 5c4445f · verified · 1 parent: f03f067

Create app.py

Files changed (1): app.py (+371 lines)
app.py ADDED

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
from typing import Dict, Any

# Model configuration
MODEL_NAME = "google/gemma-2-2b-it"  # small enough to run responsively on Hugging Face hardware
# Note: the gemma-3n checkpoints may not be available on the Hub, so gemma-2-2b-it is used instead
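
# Gemma checkpoints on the Hugging Face Hub are gated, so the Space will likely
# need the model license accepted and an HF_TOKEN secret configured before
# from_pretrained() can download the weights.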

class AudioEmotionAnalyzer:
    def __init__(self, model_name: str = MODEL_NAME):
        self.model_name = model_name
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"🚀 Using device: {self.device}")

        # Load tokenizer and model
        print("📥 Loading tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        print("📥 Loading model...")
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            device_map="auto",
            trust_remote_code=True
        )

        # Add a padding token if the tokenizer doesn't define one
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        print("✅ Model loaded successfully!")

    def extract_audio_features(self, audio_path: str) -> Dict[str, Any]:
        """Extract comprehensive audio features for emotion analysis"""
        try:
            # Load audio file (resampled to 22.05 kHz, limited to 10 seconds)
            y, sr = librosa.load(audio_path, sr=22050, duration=10)

            features = {}

            # MFCC features (the most informative for speech emotion)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            features['mfcc_mean'] = np.mean(mfcc, axis=1).tolist()
            features['mfcc_std'] = np.std(mfcc, axis=1).tolist()

            # Spectral features
            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
            features['spectral_centroid_mean'] = float(np.mean(spectral_centroid))
            features['spectral_centroid_std'] = float(np.std(spectral_centroid))

            # Zero crossing rate
            zcr = librosa.feature.zero_crossing_rate(y)
            features['zcr_mean'] = float(np.mean(zcr))
            features['zcr_std'] = float(np.std(zcr))

            # RMS energy
            rms = librosa.feature.rms(y=y)
            features['rms_mean'] = float(np.mean(rms))
            features['rms_std'] = float(np.std(rms))

            # Pitch features
            pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
            features['pitch_mean'] = float(np.mean(pitches[pitches > 0])) if np.any(pitches > 0) else 0.0
            features['pitch_std'] = float(np.std(pitches[pitches > 0])) if np.any(pitches > 0) else 0.0

            # Tempo (recent librosa returns a one-element array, so unwrap it)
            tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
            features['tempo'] = float(np.atleast_1d(tempo)[0])

            # Duration in seconds
            features['duration'] = len(y) / sr

            print(f"✅ Extracted {len(features)} audio features")
            return features

        except Exception as e:
            print(f"❌ Error extracting audio features: {e}")
            return {}
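
    # features_to_text_description() below consumes this dict; it reads the
    # 'spectral_centroid_mean', 'rms_mean', 'zcr_mean', 'pitch_std', and 'tempo'
    # keys via .get(), so a partially filled dict is still safe to pass in.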

    def features_to_text_description(self, features: Dict[str, Any]) -> str:
        """Convert audio features into a descriptive text prompt"""
        description_parts = []

        # Spectral characteristics
        if features.get('spectral_centroid_mean', 0) > 2000:
            description_parts.append("high-frequency content")
        else:
            description_parts.append("low-frequency content")

        # Energy level
        rms_mean = features.get('rms_mean', 0)
        if rms_mean > 0.1:
            description_parts.append("high energy")
        elif rms_mean < 0.01:
            description_parts.append("low energy")
        else:
            description_parts.append("moderate energy")

        # Speaking rate, approximated via zero crossing rate
        zcr_mean = features.get('zcr_mean', 0)
        if zcr_mean > 0.1:
            description_parts.append("rapid speech")
        elif zcr_mean < 0.05:
            description_parts.append("slow speech")

        # Pitch variation
        pitch_std = features.get('pitch_std', 0)
        if pitch_std > 100:
            description_parts.append("variable pitch")
        else:
            description_parts.append("steady pitch")

        # Tempo
        tempo = features.get('tempo', 0)
        if tempo > 120:
            description_parts.append("fast tempo")
        elif tempo < 80:
            description_parts.append("slow tempo")

        return "This audio has: " + ", ".join(description_parts)
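
    # For a quiet, low-pitched clip this produces something like:
    #   "This audio has: low-frequency content, low energy, slow speech, steady pitch"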

    def analyze_emotion(self, audio_path: str) -> Dict[str, Any]:
        """Analyze emotion from an audio file using the Gemma model"""
        try:
            print(f"🎡 Analyzing audio: {audio_path}")

            # Extract audio features
            features = self.extract_audio_features(audio_path)
            if not features:
                return {"error": "Failed to extract audio features"}

            # Create feature description
            feature_description = self.features_to_text_description(features)

            # Build the emotion-analysis prompt
            prompt = f"""Analyze the emotional content of this audio based on its acoustic features.

Audio Characteristics: {feature_description}

Based on these acoustic properties, analyze the emotional content and provide:
1. Primary emotion (choose from: happy, sad, angry, fearful, disgusted, surprised, neutral, excited, calm, anxious)
2. Confidence level (0-100%)
3. Detailed reasoning based on the audio features
4. Secondary emotions if present

Format your response as:
Primary Emotion: [emotion]
Confidence: [percentage]%
Reasoning: [detailed explanation]
Secondary Emotions: [comma-separated list]

Analysis:"""

            print("🤖 Generating emotion analysis with Gemma...")

            # Tokenize input
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                max_length=1024,
                truncation=True,
                padding=True
            ).to(self.device)

            # Generate response
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=256,
                    temperature=0.7,
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=self.tokenizer.eos_token_id
                )

            # Decode the full sequence, then keep only the newly generated part
            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            generated_text = response[len(prompt):].strip()

            print(f"✅ Gemma response: {generated_text}")

            # Parse the response into structured fields
            return self.parse_emotion_response(generated_text, features)

        except Exception as e:
            print(f"❌ Error in emotion analysis: {e}")
            return {"error": f"Analysis failed: {str(e)}"}
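
    # A possible refinement (an assumption, not in this commit): gemma-2-2b-it is
    # instruction-tuned, so routing the same prompt through the tokenizer's chat
    # template may yield answers that follow the requested format more reliably:
    #
    #   messages = [{"role": "user", "content": prompt}]
    #   input_ids = self.tokenizer.apply_chat_template(
    #       messages, add_generation_prompt=True, return_tensors="pt"
    #   ).to(self.device)
    #   outputs = self.model.generate(input_ids, max_new_tokens=256)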

    def parse_emotion_response(self, response: str, features: Dict[str, Any]) -> Dict[str, Any]:
        """Parse Gemma's response to extract structured emotion data"""
        try:
            result = {
                "primary_emotion": "unknown",
                "confidence": 0,
                "reasoning": "",
                "secondary_emotions": [],
                "audio_features": features,
                "raw_response": response
            }

            for line in response.split('\n'):
                line = line.strip()
                if line.startswith('Primary Emotion:'):
                    result["primary_emotion"] = line.split(':', 1)[1].strip()
                elif line.startswith('Confidence:'):
                    conf_text = line.split(':', 1)[1].strip().replace('%', '')
                    try:
                        result["confidence"] = float(conf_text)
                    except ValueError:
                        result["confidence"] = 50
                elif line.startswith('Reasoning:'):
                    result["reasoning"] = line.split(':', 1)[1].strip()
                elif line.startswith('Secondary Emotions:'):
                    sec_emotions = line.split(':', 1)[1].strip()
                    result["secondary_emotions"] = [e.strip() for e in sec_emotions.split(',')]

            # If parsing failed, fall back to the raw response as the reasoning
            if not result["reasoning"]:
                result["reasoning"] = response

            return result

        except Exception as e:
            print(f"❌ Error parsing response: {e}")
            return {
                "primary_emotion": "unknown",
                "confidence": 0,
                "reasoning": response,
                "secondary_emotions": [],
                "audio_features": features,
                "raw_response": response,
                "error": f"Parsing error: {str(e)}"
            }

# Initialize the analyzer once at startup
print("🔄 Initializing Audio Emotion Analyzer...")
analyzer = AudioEmotionAnalyzer()

def process_audio(audio_path: str) -> Dict[str, Any]:
    """Gradio-compatible function to process audio"""
    if audio_path is None:
        return {"error": "No audio file provided"}

    try:
        return analyzer.analyze_emotion(audio_path)
    except Exception as e:
        return {"error": f"Processing error: {str(e)}"}

# Create Gradio interface
def create_interface():
    """Create the Gradio interface"""

    # Custom CSS for better styling
    css = """
    .emotion-result {
        padding: 20px;
        border-radius: 10px;
        margin: 10px 0;
    }
    .primary-emotion {
        font-size: 24px;
        font-weight: bold;
        margin: 10px 0;
    }
    .confidence-bar {
        height: 20px;
        background: linear-gradient(90deg, #ff6b6b, #4ecdc4);
        border-radius: 10px;
        margin: 10px 0;
    }
    """

    # Emotion color mapping
    emotion_colors = {
        "happy": "#4ecdc4",
        "sad": "#6c5ce7",
        "angry": "#ff6b6b",
        "fearful": "#a29bfe",
        "disgusted": "#00b894",
        "surprised": "#fdcb6e",
        "neutral": "#b2bec3",
        "excited": "#e17055",
        "calm": "#74b9ff",
        "anxious": "#fd79a8"
    }

    def process_audio_wrapper(audio_path):
        """Wrapper that formats the analysis result as HTML for Gradio"""
        result = process_audio(audio_path)

        if "error" in result:
            return f"❌ Error: {result['error']}"

        # Pull out the fields for the formatted output
        emotion = result.get("primary_emotion", "unknown")
        confidence = result.get("confidence", 0)
        reasoning = result.get("reasoning", "")
        secondary = result.get("secondary_emotions", [])

        color = emotion_colors.get(emotion.lower(), "#b2bec3")

        output = f"""
        <div class="emotion-result" style="border-left: 5px solid {color};">
            <div class="primary-emotion" style="color: {color};">
                🎭 {emotion.title()}
            </div>
            <div>
                <strong>Confidence:</strong> {confidence}%
            </div>
            <div class="confidence-bar" style="width: {confidence}%;"></div>
            <div>
                <strong>Reasoning:</strong> {reasoning}
            </div>
            {f"<div><strong>Secondary Emotions:</strong> {', '.join(secondary)}</div>" if secondary else ""}
        </div>
        """

        return output

    # Only offer example files that actually exist in the repo; a list
    # containing None entries is not a valid `examples` value for gr.Interface
    example_files = [
        [path] for path in ("examples/happy_sample.wav", "examples/sad_sample.wav")
        if os.path.exists(path)
    ]

    # Create interface
    interface = gr.Interface(
        fn=process_audio_wrapper,
        inputs=gr.Audio(
            sources=["upload", "microphone"],
            type="filepath",
            label="Upload Audio File or Record",
        ),
        outputs=gr.HTML(label="Emotion Analysis Result"),
        title="🎡 Audio Emotion Analyzer with Gemma",
        description="""
        Upload an audio file or record your voice to analyze emotional content using Google's Gemma model.
        The AI analyzes acoustic features such as pitch, energy, tempo, and spectral characteristics to detect emotions.
        """,
        examples=example_files or None,
        css=css
    )

    return interface

# Main execution
if __name__ == "__main__":
    print("🚀 Starting Audio Emotion Analyzer...")
    print(f"📊 Using model: {MODEL_NAME}")
    print("🎡 Supported formats: WAV, MP3, FLAC, etc.")

    # Create and launch the interface
    demo = create_interface()

    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        debug=True
    )
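
# Deployment note (an assumption about the environment, not part of the code
# above): device_map="auto" depends on the accelerate package, so the Space's
# requirements.txt would need gradio, librosa, torch, transformers, and
# accelerate. On Hugging Face Spaces the app URL is already public, so
# share=True is unnecessary there and Gradio typically ignores it with a warning.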