jatinsabari committed on
Commit a1009a6 · verified · 1 Parent(s): 5c4445f

Update app.py

Files changed (1)
  app.py +94 -222
app.py CHANGED
@@ -5,11 +5,21 @@ import torch
  from transformers import AutoTokenizer, AutoModelForCausalLM
  import tempfile
  import os
- from typing import List, Dict, Any

  # Model configuration
- MODEL_NAME = "google/gemma-2-2b-it" # Using Gemma 2B for better performance on Hugging Face
- # Note: gemma-3n model might not be available, using gemma-2-2b-it instead

  class AudioEmotionAnalyzer:
      def __init__(self, model_name: str = MODEL_NAME):
@@ -17,13 +27,18 @@ class AudioEmotionAnalyzer:
          self.device = "cuda" if torch.cuda.is_available() else "cpu"
          print(f"🚀 Using device: {self.device}")

-         # Load tokenizer and model
          print("📥 Loading tokenizer...")
-         self.tokenizer = AutoTokenizer.from_pretrained(model_name)

          print("📥 Loading model...")
          self.model = AutoModelForCausalLM.from_pretrained(
              model_name,
              torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
              device_map="auto",
              trust_remote_code=True
@@ -33,103 +48,66 @@ class AudioEmotionAnalyzer:
          if self.tokenizer.pad_token is None:
              self.tokenizer.pad_token = self.tokenizer.eos_token

-         print("✅ Model loaded successfully!")

      def extract_audio_features(self, audio_path: str) -> Dict[str, Any]:
-         """Extract comprehensive audio features for emotion analysis"""
          try:
-             # Load audio file
-             y, sr = librosa.load(audio_path, sr=22050, duration=10) # Limit to 10 seconds

-             # Extract various audio features
              features = {}

-             # MFCC features (most important for speech emotion)
              mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
              features['mfcc_mean'] = np.mean(mfcc, axis=1).tolist()
-             features['mfcc_std'] = np.std(mfcc, axis=1).tolist()

              # Spectral features
              spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
-             features['spectral_centroid_mean'] = float(np.mean(spectral_centroid))
-             features['spectral_centroid_std'] = float(np.std(spectral_centroid))

              # Zero crossing rate
              zcr = librosa.feature.zero_crossing_rate(y)
-             features['zcr_mean'] = float(np.mean(zcr))
-             features['zcr_std'] = float(np.std(zcr))

              # RMS energy
              rms = librosa.feature.rms(y=y)
-             features['rms_mean'] = float(np.mean(rms))
-             features['rms_std'] = float(np.std(rms))

-             # Pitch features
              pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
-             features['pitch_mean'] = float(np.mean(pitches[pitches > 0])) if np.any(pitches > 0) else 0.0
-             features['pitch_std'] = float(np.std(pitches[pitches > 0])) if np.any(pitches > 0) else 0.0
-
-             # Tempo
-             tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
-             features['tempo'] = float(tempo) if tempo else 0.0

-             # Duration
-             features['duration'] = len(y) / sr
-
-             print(f"✅ Extracted {len(features)} audio features")
              return features

          except Exception as e:
              print(f"❌ Error extracting audio features: {e}")
              return {}

-     def features_to_text_description(self, features: Dict[str, Any]) -> str:
-         """Convert audio features to a descriptive text prompt"""
-
-         # Create a descriptive prompt based on audio features
-         description_parts = []
-
-         # Analyze spectral characteristics
-         if features.get('spectral_centroid_mean', 0) > 2000:
-             description_parts.append("high-frequency content")
-         else:
-             description_parts.append("low-frequency content")
-
-         # Analyze energy levels
-         rms_mean = features.get('rms_mean', 0)
-         if rms_mean > 0.1:
-             description_parts.append("high energy")
-         elif rms_mean < 0.01:
-             description_parts.append("low energy")
-         else:
-             description_parts.append("moderate energy")
-
-         # Analyze speaking rate through zero crossing rate
-         zcr_mean = features.get('zcr_mean', 0)
-         if zcr_mean > 0.1:
-             description_parts.append("rapid speech")
-         elif zcr_mean < 0.05:
-             description_parts.append("slow speech")

-         # Analyze pitch variation
-         pitch_std = features.get('pitch_std', 0)
-         if pitch_std > 100:
-             description_parts.append("variable pitch")
-         else:
-             description_parts.append("steady pitch")
-
-         # Analyze tempo
-         tempo = features.get('tempo', 0)
-         if tempo > 120:
-             description_parts.append("fast tempo")
-         elif tempo < 80:
-             description_parts.append("slow tempo")

-         description = "This audio has: " + ", ".join(description_parts)
-         return description

      def analyze_emotion(self, audio_path: str) -> Dict[str, Any]:
-         """Analyze emotion from audio file using Gemma model"""
          try:
              print(f"🎵 Analyzing audio: {audio_path}")
@@ -138,27 +116,8 @@ class AudioEmotionAnalyzer:
              if not features:
                  return {"error": "Failed to extract audio features"}

-             # Create feature description
-             feature_description = self.features_to_text_description(features)
-
-             # Create comprehensive prompt for emotion analysis
-             prompt = f"""Analyze the emotional content of this audio based on its acoustic features.
-
- Audio Characteristics: {feature_description}
-
- Based on these acoustic properties, analyze the emotional content and provide:
- 1. Primary emotion (choose from: happy, sad, angry, fearful, disgusted, surprised, neutral, excited, calm, anxious)
- 2. Confidence level (0-100%)
- 3. Detailed reasoning based on the audio features
- 4. Secondary emotions if present
-
- Format your response as:
- Primary Emotion: [emotion]
- Confidence: [percentage]%
- Reasoning: [detailed explanation]
- Secondary Emotions: [comma-separated list]
-
- Analysis:"""

              print("🤖 Generating emotion analysis with Gemma...")
@@ -166,16 +125,15 @@ Analysis:"""
              inputs = self.tokenizer(
                  prompt,
                  return_tensors="pt",
-                 max_length=1024,
-                 truncation=True,
-                 padding=True
              ).to(self.device)

              # Generate response
              with torch.no_grad():
                  outputs = self.model.generate(
                      **inputs,
-                     max_new_tokens=256,
                      temperature=0.7,
                      do_sample=True,
                      top_p=0.9,
@@ -184,13 +142,10 @@ Analysis:"""
              # Decode response
              response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-             # Extract just the new generated part (after the prompt)
              generated_text = response[len(prompt):].strip()

              print(f"✅ Gemma response: {generated_text}")

-             # Parse the response
              return self.parse_emotion_response(generated_text, features)

          except Exception as e:
@@ -198,15 +153,13 @@ Analysis:"""
              return {"error": f"Analysis failed: {str(e)}"}

      def parse_emotion_response(self, response: str, features: Dict[str, Any]) -> Dict[str, Any]:
-         """Parse Gemma's response to extract structured emotion data"""
          try:
              result = {
                  "primary_emotion": "unknown",
-                 "confidence": 0,
-                 "reasoning": "",
-                 "secondary_emotions": [],
-                 "audio_features": features,
-                 "raw_response": response
              }

              lines = response.split('\n')
@@ -215,157 +168,76 @@ Analysis:"""
                  if line.startswith('Primary Emotion:'):
                      result["primary_emotion"] = line.split(':', 1)[1].strip()
                  elif line.startswith('Confidence:'):
-                     conf_text = line.split(':', 1)[1].strip().replace('%', '')
-                     try:
-                         result["confidence"] = float(conf_text)
-                     except:
-                         result["confidence"] = 50
                  elif line.startswith('Reasoning:'):
                      result["reasoning"] = line.split(':', 1)[1].strip()
-                 elif line.startswith('Secondary Emotions:'):
-                     sec_emotions = line.split(':', 1)[1].strip()
-                     result["secondary_emotions"] = [e.strip() for e in sec_emotions.split(',')]

-             # If parsing failed, use the raw response as reasoning
-             if not result["reasoning"]:
-                 result["reasoning"] = response
-
              return result

          except Exception as e:
-             print(f"❌ Error parsing response: {e}")
              return {
                  "primary_emotion": "unknown",
-                 "confidence": 0,
                  "reasoning": response,
-                 "secondary_emotions": [],
                  "audio_features": features,
-                 "raw_response": response,
                  "error": f"Parsing error: {str(e)}"
              }

  # Initialize the analyzer
- print("🔄 Initializing Audio Emotion Analyzer...")
  analyzer = AudioEmotionAnalyzer()

- def process_audio(audio_path: str) -> Dict[str, Any]:
      """Gradio-compatible function to process audio"""
      if audio_path is None:
-         return {"error": "No audio file provided"}

      try:
          result = analyzer.analyze_emotion(audio_path)
-         return result
-     except Exception as e:
-         return {"error": f"Processing error: {str(e)}"}
-
- # Create Gradio interface
- def create_interface():
-     """Create the Gradio interface"""
-
-     # Custom CSS for better styling
-     css = """
-     .emotion-result {
-         padding: 20px;
-         border-radius: 10px;
-         margin: 10px 0;
-     }
-     .primary-emotion {
-         font-size: 24px;
-         font-weight: bold;
-         margin: 10px 0;
-     }
-     .confidence-bar {
-         height: 20px;
-         background: linear-gradient(90deg, #ff6b6b, #4ecdc4);
-         border-radius: 10px;
-         margin: 10px 0;
-     }
-     """
-
-     # Emotion color mapping
-     emotion_colors = {
-         "happy": "#4ecdc4",
-         "sad": "#6c5ce7",
-         "angry": "#ff6b6b",
-         "fearful": "#a29bfe",
-         "disgusted": "#00b894",
-         "surprised": "#fdcb6e",
-         "neutral": "#b2bec3",
-         "excited": "#e17055",
-         "calm": "#74b9ff",
-         "anxious": "#fd79a8"
-     }
-
-     def process_audio_wrapper(audio_path):
-         """Wrapper function for Gradio"""
-         result = process_audio(audio_path)

          if "error" in result:
              return f"❌ Error: {result['error']}"

-         # Create formatted output
          emotion = result.get("primary_emotion", "unknown")
-         confidence = result.get("confidence", 0)
          reasoning = result.get("reasoning", "")
-         secondary = result.get("secondary_emotions", [])
-
-         color = emotion_colors.get(emotion.lower(), "#b2bec3")

          output = f"""
- <div class="emotion-result" style="border-left: 5px solid {color};">
-     <div class="primary-emotion" style="color: {color};">
-         🎭 {emotion.title()}
-     </div>
-     <div>
-         <strong>Confidence:</strong> {confidence}%
-     </div>
-     <div class="confidence-bar" style="width: {confidence}%;"></div>
-     <div>
-         <strong>Reasoning:</strong> {reasoning}
-     </div>
-     {f"<div><strong>Secondary Emotions:</strong> {', '.join(secondary)}</div>" if secondary else ""}
- </div>
          """

          return output
-
-     # Create interface
-     interface = gr.Interface(
-         fn=process_audio_wrapper,
-         inputs=gr.Audio(
-             sources=["upload", "microphone"],
-             type="filepath",
-             label="Upload Audio File or Record",
-         ),
-         outputs=gr.HTML(label="Emotion Analysis Result"),
-         title="🎵 Audio Emotion Analyzer with Gemma",
-         description="""
-         Upload an audio file or record your voice to analyze emotional content using Google's Gemma model.
-         The AI will analyze acoustic features like pitch, energy, tempo, and spectral characteristics to detect emotions.
-         """,
-         examples=[
-             ["examples/happy_sample.wav"] if os.path.exists("examples/happy_sample.wav") else None,
-             ["examples/sad_sample.wav"] if os.path.exists("examples/sad_sample.wav") else None,
-         ],
-         css=css
-     )
-
-     return interface

- # Main execution
  if __name__ == "__main__":
-     print("🚀 Starting Audio Emotion Analyzer...")
-     print(f"📊 Using model: {MODEL_NAME}")
-     print(f"🎵 Supported formats: WAV, MP3, FLAC, etc.")
-
-     # Create and launch interface
-     demo = create_interface()
-
-     # Launch with appropriate settings
      demo.launch(
          server_name="0.0.0.0",
          server_port=7860,
-         share=True,
-         debug=True
      )
  from transformers import AutoTokenizer, AutoModelForCausalLM
  import tempfile
  import os
+ from typing import Dict, Any
+ from huggingface_hub import login
+
+ # Your Hugging Face token - REPLACE WITH YOUR ACTUAL TOKEN
+ HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+
+ # Login to Hugging Face
+ try:
+     login(token=HF_TOKEN)
+     print("✅ Successfully authenticated with Hugging Face")
+ except Exception as e:
+     print(f"❌ Authentication failed: {e}")

  # Model configuration
+ MODEL_NAME = "google/gemma-2-2b-it"

  class AudioEmotionAnalyzer:
      def __init__(self, model_name: str = MODEL_NAME):

          self.device = "cuda" if torch.cuda.is_available() else "cpu"
          print(f"🚀 Using device: {self.device}")

+         # Load tokenizer and model with authentication
          print("📥 Loading tokenizer...")
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             model_name,
+             token=HF_TOKEN,
+             trust_remote_code=True
+         )

          print("📥 Loading model...")
          self.model = AutoModelForCausalLM.from_pretrained(
              model_name,
+             token=HF_TOKEN,
              torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
              device_map="auto",
              trust_remote_code=True

          if self.tokenizer.pad_token is None:
              self.tokenizer.pad_token = self.tokenizer.eos_token

+         print("✅ Gemma model loaded successfully!")

      def extract_audio_features(self, audio_path: str) -> Dict[str, Any]:
+         """Extract audio features for emotion analysis"""
          try:
+             y, sr = librosa.load(audio_path, sr=22050, duration=10)

              features = {}

+             # MFCC features
              mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
              features['mfcc_mean'] = np.mean(mfcc, axis=1).tolist()

              # Spectral features
              spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
+             features['spectral_centroid'] = float(np.mean(spectral_centroid))

              # Zero crossing rate
              zcr = librosa.feature.zero_crossing_rate(y)
+             features['zcr'] = float(np.mean(zcr))

              # RMS energy
              rms = librosa.feature.rms(y=y)
+             features['rms'] = float(np.mean(rms))

+             # Pitch
              pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
+             features['pitch'] = float(np.mean(pitches[pitches > 0])) if np.any(pitches > 0) else 0.0

+             print(f"✅ Extracted audio features")
              return features

          except Exception as e:
              print(f"❌ Error extracting audio features: {e}")
              return {}
+     def features_to_prompt(self, features: Dict[str, Any]) -> str:
+         """Convert audio features to a prompt for Gemma"""

+         prompt = f"""Analyze the emotional content of audio based on these acoustic features:
+
+ Audio Features:
+ - Spectral Centroid: {features.get('spectral_centroid', 0):.1f} Hz (brightness)
+ - Zero Crossing Rate: {features.get('zcr', 0):.3f} (speech rate)
+ - RMS Energy: {features.get('rms', 0):.3f} (loudness)
+ - Pitch: {features.get('pitch', 0):.1f} Hz
+
+ Based on these acoustic properties, determine the primary emotion from: happy, sad, angry, fearful, disgusted, surprised, neutral, excited, calm, anxious.
+
+ Provide analysis in this format:
+ Primary Emotion: [emotion]
+ Confidence: [high/medium/low]
+ Reasoning: [brief explanation based on features]
+
+ Analysis:"""

+         return prompt

      def analyze_emotion(self, audio_path: str) -> Dict[str, Any]:
+         """Analyze emotion from audio file using Gemma"""
          try:
              print(f"🎵 Analyzing audio: {audio_path}")

              if not features:
                  return {"error": "Failed to extract audio features"}

+             # Create prompt
+             prompt = self.features_to_prompt(features)

              print("🤖 Generating emotion analysis with Gemma...")

              inputs = self.tokenizer(
                  prompt,
                  return_tensors="pt",
+                 max_length=512,
+                 truncation=True
              ).to(self.device)

              # Generate response
              with torch.no_grad():
                  outputs = self.model.generate(
                      **inputs,
+                     max_new_tokens=150,
                      temperature=0.7,
                      do_sample=True,
                      top_p=0.9,

              # Decode response
              response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
              generated_text = response[len(prompt):].strip()

              print(f"✅ Gemma response: {generated_text}")

              return self.parse_emotion_response(generated_text, features)

          except Exception as e:
              return {"error": f"Analysis failed: {str(e)}"}

      def parse_emotion_response(self, response: str, features: Dict[str, Any]) -> Dict[str, Any]:
+         """Parse Gemma's response"""
          try:
              result = {
                  "primary_emotion": "unknown",
+                 "confidence": "unknown",
+                 "reasoning": response,
+                 "audio_features": features
              }

              lines = response.split('\n')
                  if line.startswith('Primary Emotion:'):
                      result["primary_emotion"] = line.split(':', 1)[1].strip()
                  elif line.startswith('Confidence:'):
+                     result["confidence"] = line.split(':', 1)[1].strip()
                  elif line.startswith('Reasoning:'):
                      result["reasoning"] = line.split(':', 1)[1].strip()

              return result

          except Exception as e:
              return {
                  "primary_emotion": "unknown",
+                 "confidence": "unknown",
                  "reasoning": response,
                  "audio_features": features,
                  "error": f"Parsing error: {str(e)}"
              }

  # Initialize the analyzer
+ print("🔄 Initializing Gemma Audio Emotion Analyzer...")
  analyzer = AudioEmotionAnalyzer()

+ def process_audio(audio_path: str) -> str:
      """Gradio-compatible function to process audio"""
      if audio_path is None:
+         return "❌ No audio file provided"

      try:
          result = analyzer.analyze_emotion(audio_path)

          if "error" in result:
              return f"❌ Error: {result['error']}"

+         # Format output
          emotion = result.get("primary_emotion", "unknown")
+         confidence = result.get("confidence", "unknown")
          reasoning = result.get("reasoning", "")

          output = f"""
+ 🎭 **Primary Emotion**: {emotion.title()}
+ 📊 **Confidence**: {confidence}
+ 💭 **Reasoning**: {reasoning}
+
+ 📈 **Audio Features Analyzed**:
+ - Spectral Brightness: {result['audio_features'].get('spectral_centroid', 0):.1f} Hz
+ - Speech Rate: {result['audio_features'].get('zcr', 0):.3f}
+ - Loudness: {result['audio_features'].get('rms', 0):.3f}
+ - Pitch: {result['audio_features'].get('pitch', 0):.1f} Hz
          """

          return output
+
+     except Exception as e:
+         return f"❌ Processing error: {str(e)}"
+
+ # Create Gradio interface
+ demo = gr.Interface(
+     fn=process_audio,
+     inputs=gr.Audio(
+         sources=["upload", "microphone"],
+         type="filepath",
+         label="Upload Audio File or Record"
+     ),
+     outputs=gr.Textbox(label="Emotion Analysis Result"),
+     title="🎵 Audio Emotion Analyzer with Google Gemma",
+     description="Upload audio or record to analyze emotions using Google's Gemma-2-2B model",
+     examples=[],
+ )

  if __name__ == "__main__":
+     print("🚀 Starting Gemma Audio Emotion Analyzer...")
      demo.launch(
          server_name="0.0.0.0",
          server_port=7860,
+         share=True
      )