jatinsabari committed
Commit 56a4063 · verified · parent: a1009a6

Update app.py

Files changed (1): app.py +192 -169
app.py CHANGED
@@ -1,194 +1,198 @@
  import gradio as gr
  import librosa
  import numpy as np
- import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
  import tempfile
  import os
  from typing import Dict, Any
- from huggingface_hub import login

- # Your Hugging Face token - REPLACE WITH YOUR ACTUAL TOKEN
- HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
-
- # Login to Hugging Face
- try:
-     login(token=HF_TOKEN)
-     print("✅ Successfully authenticated with Hugging Face")
- except Exception as e:
-     print(f"❌ Authentication failed: {e}")
-
- # Model configuration
- MODEL_NAME = "google/gemma-2-2b-it"
-
- class AudioEmotionAnalyzer:
-     def __init__(self, model_name: str = MODEL_NAME):
-         self.model_name = model_name
-         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-         print(f"🚀 Using device: {self.device}")
-
-         # Load tokenizer and model with authentication
-         print("📥 Loading tokenizer...")
-         self.tokenizer = AutoTokenizer.from_pretrained(
-             model_name,
-             token=HF_TOKEN,
-             trust_remote_code=True
-         )
-
-         print("📥 Loading model...")
-         self.model = AutoModelForCausalLM.from_pretrained(
-             model_name,
-             token=HF_TOKEN,
-             torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
-             device_map="auto",
-             trust_remote_code=True
-         )
-
-         # Add padding token if it doesn't exist
-         if self.tokenizer.pad_token is None:
-             self.tokenizer.pad_token = self.tokenizer.eos_token
-
-         print("✅ Gemma model loaded successfully!")

      def extract_audio_features(self, audio_path: str) -> Dict[str, Any]:
-         """Extract audio features for emotion analysis"""
          try:
-             y, sr = librosa.load(audio_path, sr=22050, duration=10)

              features = {}

-             # MFCC features
-             mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
-             features['mfcc_mean'] = np.mean(mfcc, axis=1).tolist()

-             # Spectral features
              spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
-             features['spectral_centroid'] = float(np.mean(spectral_centroid))

-             # Zero crossing rate
              zcr = librosa.feature.zero_crossing_rate(y)
-             features['zcr'] = float(np.mean(zcr))

-             # RMS energy
-             rms = librosa.feature.rms(y=y)
-             features['rms'] = float(np.mean(rms))

-             # Pitch
-             pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
-             features['pitch'] = float(np.mean(pitches[pitches > 0])) if np.any(pitches > 0) else 0.0

-             print(f"✅ Extracted audio features")
              return features

          except Exception as e:
-             print(f"❌ Error extracting audio features: {e}")
-             return {}

-     def features_to_prompt(self, features: Dict[str, Any]) -> str:
-         """Convert audio features to a prompt for Gemma"""

-         prompt = f"""Analyze the emotional content of audio based on these acoustic features:
-
- Audio Features:
- - Spectral Centroid: {features.get('spectral_centroid', 0):.1f} Hz (brightness)
- - Zero Crossing Rate: {features.get('zcr', 0):.3f} (speech rate)
- - RMS Energy: {features.get('rms', 0):.3f} (loudness)
- - Pitch: {features.get('pitch', 0):.1f} Hz
-
- Based on these acoustic properties, determine the primary emotion from: happy, sad, angry, fearful, disgusted, surprised, neutral, excited, calm, anxious.
-
- Provide analysis in this format:
- Primary Emotion: [emotion]
- Confidence: [high/medium/low]
- Reasoning: [brief explanation based on features]
-
- Analysis:"""

-         return prompt

      def analyze_emotion(self, audio_path: str) -> Dict[str, Any]:
-         """Analyze emotion from audio file using Gemma"""
          try:
-             print(f"🎵 Analyzing audio: {audio_path}")

-             # Extract audio features
              features = self.extract_audio_features(audio_path)
-             if not features:
-                 return {"error": "Failed to extract audio features"}
-
-             # Create prompt
-             prompt = self.features_to_prompt(features)
-
-             print("🤖 Generating emotion analysis with Gemma...")
-
-             # Tokenize input
-             inputs = self.tokenizer(
-                 prompt,
-                 return_tensors="pt",
-                 max_length=512,
-                 truncation=True
-             ).to(self.device)

-             # Generate response
-             with torch.no_grad():
-                 outputs = self.model.generate(
-                     **inputs,
-                     max_new_tokens=150,
-                     temperature=0.7,
-                     do_sample=True,
-                     top_p=0.9,
-                     pad_token_id=self.tokenizer.eos_token_id
-                 )
-
-             # Decode response
-             response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-             generated_text = response[len(prompt):].strip()
-
-             print(f"✅ Gemma response: {generated_text}")
-
-             return self.parse_emotion_response(generated_text, features)
-
-         except Exception as e:
-             print(f"❌ Error in emotion analysis: {e}")
-             return {"error": f"Analysis failed: {str(e)}"}
-
-     def parse_emotion_response(self, response: str, features: Dict[str, Any]) -> Dict[str, Any]:
-         """Parse Gemma's response"""
-         try:
-             result = {
-                 "primary_emotion": "unknown",
-                 "confidence": "unknown",
-                 "reasoning": response,
-                 "audio_features": features
-             }
-
-             lines = response.split('\n')
-             for line in lines:
-                 line = line.strip()
-                 if line.startswith('Primary Emotion:'):
-                     result["primary_emotion"] = line.split(':', 1)[1].strip()
-                 elif line.startswith('Confidence:'):
-                     result["confidence"] = line.split(':', 1)[1].strip()
-                 elif line.startswith('Reasoning:'):
-                     result["reasoning"] = line.split(':', 1)[1].strip()

              return result

          except Exception as e:
              return {
-                 "primary_emotion": "unknown",
-                 "confidence": "unknown",
-                 "reasoning": response,
-                 "audio_features": features,
-                 "error": f"Parsing error: {str(e)}"
              }

- # Initialize the analyzer
- print("🔄 Initializing Gemma Audio Emotion Analyzer...")
- analyzer = AudioEmotionAnalyzer()

  def process_audio(audio_path: str) -> str:
-     """Gradio-compatible function to process audio"""
      if audio_path is None:
          return "❌ No audio file provided"

@@ -198,22 +202,38 @@ def process_audio(audio_path: str) -> str:
      if "error" in result:
          return f"❌ Error: {result['error']}"

-     # Format output
-     emotion = result.get("primary_emotion", "unknown")
-     confidence = result.get("confidence", "unknown")
-     reasoning = result.get("reasoning", "")

-     output = f"""
- 🎭 **Primary Emotion**: {emotion.title()}
- 📊 **Confidence**: {confidence}
- 💭 **Reasoning**: {reasoning}
-
- 📈 **Audio Features Analyzed**:
- - Spectral Brightness: {result['audio_features'].get('spectral_centroid', 0):.1f} Hz
- - Speech Rate: {result['audio_features'].get('zcr', 0):.3f}
- - Loudness: {result['audio_features'].get('rms', 0):.3f}
- - Pitch: {result['audio_features'].get('pitch', 0):.1f} Hz
- """

      return output

@@ -226,16 +246,19 @@ demo = gr.Interface(
      inputs=gr.Audio(
          sources=["upload", "microphone"],
          type="filepath",
-         label="Upload Audio File or Record"
      ),
-     outputs=gr.Textbox(label="Emotion Analysis Result"),
-     title="🎵 Audio Emotion Analyzer with Google Gemma",
-     description="Upload audio or record to analyze emotions using Google's Gemma-2-2B model",
      examples=[],
  )

  if __name__ == "__main__":
-     print("🚀 Starting Gemma Audio Emotion Analyzer...")
      demo.launch(
          server_name="0.0.0.0",
          server_port=7860,
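The full rewritten app.py follows. Its core decision step, analyze_emotion_rules, scores each emotion as the fraction of that emotion's rule conditions found among the detected conditions, then picks the highest scorer. A standalone illustration of that arithmetic, with the condition sets copied from the listing below:

    # score(emotion) = matched conditions / total conditions for that emotion
    detected = {'bright_timbre', 'high_energy', 'high_pitch', 'fast_tempo', 'fast_speech'}
    happy = {'high_pitch', 'high_energy', 'fast_tempo', 'bright_timbre'}
    excited = {'very_high_energy', 'fast_tempo', 'bright_timbre', 'high_pitch'}
    print(len(detected & happy) / len(happy))      # 1.0 -> 'happy' wins
    print(len(detected & excited) / len(excited))  # 0.75

Note that several rule conditions (e.g. 'very_high_energy', 'harsh_timbre', 'tremolo_effect', and the pitch-variability labels) are never emitted by the detector, so emotions that rely on them can never reach a full score.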
  import gradio as gr
  import librosa
  import numpy as np
  import tempfile
  import os
  from typing import Dict, Any
+ import json

+ class FastAudioEmotionAnalyzer:
+     def __init__(self):
+         print("🚀 Initializing Fast Audio Emotion Analyzer...")
+
+         # Pre-defined emotion rules based on audio features (no model loading)
+         self.emotion_rules = {
+             'happy': {
+                 'conditions': ['high_pitch', 'high_energy', 'fast_tempo', 'bright_timbre'],
+                 'description': 'Characterized by high energy, bright tones, and fast pace'
+             },
+             'sad': {
+                 'conditions': ['low_pitch', 'low_energy', 'slow_tempo', 'dark_timbre'],
+                 'description': 'Characterized by low energy, slow pace, and dark tones'
+             },
+             'angry': {
+                 'conditions': ['high_energy', 'harsh_timbre', 'irregular_rhythm', 'high_pitch_variability'],
+                 'description': 'Characterized by high energy, harsh tones, and irregular patterns'
+             },
+             'fearful': {
+                 'conditions': ['high_pitch', 'irregular_energy', 'fast_tempo', 'tremolo_effect'],
+                 'description': 'Characterized by high pitch, irregular energy, and nervous tempo'
+             },
+             'neutral': {
+                 'conditions': ['medium_energy', 'medium_pitch', 'steady_tempo', 'balanced_timbre'],
+                 'description': 'Characterized by balanced features and steady patterns'
+             },
+             'excited': {
+                 'conditions': ['very_high_energy', 'fast_tempo', 'bright_timbre', 'high_pitch'],
+                 'description': 'Characterized by very high energy and fast, bright patterns'
+             },
+             'calm': {
+                 'conditions': ['low_energy', 'slow_tempo', 'smooth_timbre', 'low_pitch_variability'],
+                 'description': 'Characterized by low energy, smooth tones, and steady pace'
+             }
+         }
+
+         print("✅ Fast analyzer ready! (No heavy models to load)")

      def extract_audio_features(self, audio_path: str) -> Dict[str, Any]:
+         """Extract audio features quickly"""
          try:
+             # Load only the first 5 seconds for faster processing
+             y, sr = librosa.load(audio_path, sr=22050, duration=5)

              features = {}

+             # Basic MFCC (fast)
+             mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=5)  # Reduced from 13 to 5
+             features['mfcc_mean'] = float(np.mean(mfcc))

+             # Spectral centroid (brightness)
              spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
+             features['brightness'] = float(np.mean(spectral_centroid))
+
+             # RMS energy (loudness)
+             rms = librosa.feature.rms(y=y)
+             features['energy'] = float(np.mean(rms))

+             # Zero crossing rate (noisiness/speech rate)
              zcr = librosa.feature.zero_crossing_rate(y)
+             features['speech_rate'] = float(np.mean(zcr))

+             # Tempo (pace); newer librosa returns tempo as an array, so coerce defensively
+             tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
+             tempo = float(np.atleast_1d(tempo)[0])
+             features['tempo'] = tempo if tempo > 0 else 80.0

+             # Pitch mean
+             pitches = librosa.piptrack(y=y, sr=sr, fmin=50, fmax=500)[0]
+             pitches = pitches[pitches > 0]
+             features['pitch'] = float(np.mean(pitches)) if len(pitches) > 0 else 150.0

+             print("✅ Features extracted")
              return features

          except Exception as e:
+             print(f"❌ Feature extraction error: {e}")
+             # Return default features
+             return {
+                 'brightness': 1500.0,
+                 'energy': 0.05,
+                 'speech_rate': 0.1,
+                 'tempo': 100.0,
+                 'pitch': 200.0,
+                 'mfcc_mean': 0.0
+             }

+     def analyze_emotion_rules(self, features: Dict[str, Any]) -> Dict[str, Any]:
+         """Analyze emotion using rule-based system (very fast)"""

+         # Define feature thresholds
+         conditions = []
+
+         # Brightness conditions
+         if features['brightness'] > 2000:
+             conditions.append('bright_timbre')
+         elif features['brightness'] < 1000:
+             conditions.append('dark_timbre')
+         else:
+             conditions.append('balanced_timbre')
+
+         # Energy conditions
+         if features['energy'] > 0.1:
+             conditions.append('high_energy')
+         elif features['energy'] > 0.05:
+             conditions.append('medium_energy')
+         else:
+             conditions.append('low_energy')
+
+         # Pitch conditions
+         if features['pitch'] > 250:
+             conditions.append('high_pitch')
+         elif features['pitch'] < 150:
+             conditions.append('low_pitch')
+         else:
+             conditions.append('medium_pitch')
+
+         # Tempo conditions
+         if features['tempo'] > 140:
+             conditions.append('fast_tempo')
+         elif features['tempo'] < 90:
+             conditions.append('slow_tempo')
+         else:
+             conditions.append('steady_tempo')
+
+         # Speech rate conditions
+         if features['speech_rate'] > 0.15:
+             conditions.append('fast_speech')
+         elif features['speech_rate'] < 0.08:
+             conditions.append('slow_speech')
+         else:
+             conditions.append('normal_speech')

+         # Score each emotion based on matching conditions
+         emotion_scores = {}
+         for emotion, data in self.emotion_rules.items():
+             score = 0
+             for condition in data['conditions']:
+                 if condition in conditions:
+                     score += 1
+             emotion_scores[emotion] = score / len(data['conditions'])
+
+         # Get top emotion
+         top_emotion = max(emotion_scores, key=emotion_scores.get)
+         confidence = emotion_scores[top_emotion]
+
+         # Generate reasoning
+         reasoning = f"Audio shows {conditions[0]}, {conditions[1]}, {conditions[2]}. "
+         reasoning += f"Pattern matches {top_emotion} emotion ({self.emotion_rules[top_emotion]['description']})."
+
+         return {
+             'primary_emotion': top_emotion,
+             'confidence': confidence,
+             'reasoning': reasoning,
+             'all_scores': emotion_scores,
+             'detected_conditions': conditions
+         }

      def analyze_emotion(self, audio_path: str) -> Dict[str, Any]:
+         """Fast emotion analysis (usually < 2 seconds)"""
          try:
+             print(f"🎵 Fast analyzing: {os.path.basename(audio_path)}")

+             # Extract features (fast)
              features = self.extract_audio_features(audio_path)

+             # Rule-based analysis (instant)
+             result = self.analyze_emotion_rules(features)
+             result['audio_features'] = features

+             print(f"✅ Analysis complete: {result['primary_emotion']} ({result['confidence']:.1%})")
              return result

          except Exception as e:
+             print(f"❌ Analysis error: {e}")
              return {
+                 'primary_emotion': 'neutral',
+                 'confidence': 0.5,
+                 'reasoning': f'Analysis failed: {str(e)}',
+                 'error': str(e)
              }

+ # Initialize the fast analyzer
+ print("🔄 Initializing Fast Audio Emotion Analyzer...")
+ analyzer = FastAudioEmotionAnalyzer()

  def process_audio(audio_path: str) -> str:
+     """Gradio-compatible function"""
      if audio_path is None:
          return "❌ No audio file provided"

@@ -198,22 +202,38 @@ def process_audio(audio_path: str) -> str:
      if "error" in result:
          return f"❌ Error: {result['error']}"

+     # Format beautiful output
+     emotion = result['primary_emotion']
+     confidence = result['confidence']

+     # Emotion emojis
+     emotion_emojis = {
+         'happy': '😊',
+         'sad': '😢',
+         'angry': '😠',
+         'fearful': '😨',
+         'neutral': '😐',
+         'excited': '🤩',
+         'calm': '😌'
+     }

+     emoji = emotion_emojis.get(emotion, '🎭')
+
+     output = f"""
+ {emoji} **Primary Emotion**: {emotion.title()}
+ 📊 **Confidence**: {confidence:.1%}
+
+ 💭 **Reasoning**: {result['reasoning']}
+
+ 📈 **Audio Analysis**:
+ • Brightness: {result['audio_features']['brightness']:.0f} Hz
+ • Energy: {result['audio_features']['energy']:.3f}
+ • Pitch: {result['audio_features']['pitch']:.0f} Hz
+ • Tempo: {result['audio_features']['tempo']:.0f} BPM
+ • Speech Rate: {result['audio_features']['speech_rate']:.3f}
+
+ 🔍 **Detected Patterns**: {', '.join(result['detected_conditions'][:3])}
+ """

      return output

@@ -226,16 +246,19 @@ demo = gr.Interface(
      inputs=gr.Audio(
          sources=["upload", "microphone"],
          type="filepath",
+         label="Upload Audio File or Record",
+         max_length=30  # Limit to 30 seconds for faster processing
      ),
+     outputs=gr.Markdown(label="Emotion Analysis Result"),
+     title="🎵 Fast Audio Emotion Analyzer",
+     description="**Lightning-fast emotion detection from audio** ⚡ (processes in 1-2 seconds)",
      examples=[],
+     allow_flagging="never"
  )

  if __name__ == "__main__":
+     print("🚀 Starting Fast Audio Emotion Analyzer...")
+     print("⚡ Ready to process audio in seconds!")
      demo.launch(
          server_name="0.0.0.0",
          server_port=7860,
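For a quick smoke test of the rewritten analyzer, a minimal sketch; it assumes app.py's definitions are in scope (e.g. a Python session after importing the module) and that soundfile, a librosa dependency, is installed. The 440 Hz tone and 2-second duration are arbitrary choices:

    import tempfile
    import numpy as np
    import soundfile as sf

    sr = 22050
    t = np.linspace(0, 2.0, int(sr * 2.0), endpoint=False)
    tone = 0.2 * np.sin(2 * np.pi * 440.0 * t)  # quiet 440 Hz sine

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        wav_path = f.name
    sf.write(wav_path, tone, sr)
    print(process_audio(wav_path))  # formatted Markdown report from the analyzer

A steady pure tone exercises the whole pipeline, but the rule thresholds look tuned for speech, so treat the returned label as a plumbing check rather than a meaningful classification.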