EurekaPotato committed on
Commit
cedabd5
·
verified ·
1 Parent(s): 2110638

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. audio_features.py +421 -0
  2. emotion_features.py +411 -0
  3. handler.py +57 -266
  4. requirements.txt +5 -3
audio_features.py ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio Feature Extractor - IMPROVED VERSION
3
+ Extracts 14 voice features from audio to detect busy/distracted states.
4
+
5
+ KEY IMPROVEMENTS:
6
+ 1. HNR instead of SNR - Better for voice recordings (not affected by recording noise)
7
+ 2. Smarter noise classification using multiple spectral features
8
+ 3. Removed useless latency feature (t9_latency) from consideration
9
+ """
10
+
11
+ import numpy as np
12
+ import librosa
13
+ import soundfile as sf
14
+ from scipy import signal
15
+ from typing import Dict, Tuple, List
16
+ import noisereduce as nr
17
+ import torch
18
+ import warnings
19
+ from .emotion_features import EmotionFeatureExtractor
20
+
21
+ warnings.filterwarnings("ignore")
22
+
23
class AudioFeatureExtractor:
    """Extract 14 audio features for busy detection (Enhanced with Silero VAD)"""

    # Class-level caches: the heavy VAD and emotion models are loaded once
    # per process and shared by every extractor instance.
    _vad_model_cache = None
    _vad_utils_cache = None
    _emotion_extractor_cache = None

    def __init__(self, sample_rate: int = 16000, use_emotion: bool = True, config: Dict = None, emotion_models_dir: str = None):
        """Configure sample rates from `config` and load the Silero VAD and
        (optionally) the emotion extractor, reusing the class-level caches."""
        cfg = config or {}
        self.config = cfg
        self.sample_rate = cfg.get('audio_sample_rate', sample_rate)
        self.vad_sample_rate = cfg.get('vad_sample_rate', self.sample_rate)
        self.use_emotion = use_emotion and (not cfg.get('skip_emotion_features', False))
        self.skip_noise_reduction = bool(cfg.get('skip_noise_reduction', False))
        self.audio_duration_limit = cfg.get('audio_duration_limit', None)
        self.emotion_models_dir = emotion_models_dir

        cls = AudioFeatureExtractor

        print("Loading Silero VAD...")
        try:
            if cls._vad_model_cache is None:
                cls._vad_model_cache, cls._vad_utils_cache = torch.hub.load(
                    repo_or_dir='snakers4/silero-vad',
                    model='silero_vad',
                    force_reload=False,
                    trust_repo=True
                )
            self.vad_model = cls._vad_model_cache
            # First element of the Silero utils tuple is get_speech_timestamps.
            self.get_speech_timestamps = cls._vad_utils_cache[0]
            print("[OK] Silero VAD loaded (cached)")
        except Exception as e:
            print(f"[WARN] Failed to load Silero VAD: {e}. Fallback to energy VAD might be needed.")
            self.vad_model = None

        # Emotion model loading is best-effort: any failure disables the
        # emotion feature set instead of breaking construction.
        self.emotion_extractor = None
        if self.use_emotion:
            print("Loading Emotion CNN...")
            try:
                if cls._emotion_extractor_cache is None:
                    # Pass models dir to extractor
                    cls._emotion_extractor_cache = EmotionFeatureExtractor(models_dir=self.emotion_models_dir)
                self.emotion_extractor = cls._emotion_extractor_cache
                print("[OK] Emotion CNN loaded (cached)")
            except Exception as e:
                print(f"[WARN] Emotion features disabled: {e}")
                self.emotion_extractor = None
                self.use_emotion = False
70
+
71
def load_audio(self, audio_path: str) -> np.ndarray:
    """Load an audio file as a mono waveform at the configured sample rate.

    Honors ``self.audio_duration_limit`` (seconds) when set; returns only
    the samples (the sample rate is fixed by construction).
    """
    waveform, _sr = librosa.load(
        audio_path,
        sr=self.sample_rate,
        mono=True,
        duration=self.audio_duration_limit
    )
    return waveform
80
+
81
def extract_hnr(self, audio: np.ndarray) -> float:
    """
    V1: Harmonics-to-Noise Ratio (HNR) in dB — higher means a clearer voice.

    Computes a frame-wise autocorrelation HNR and returns the median over
    frames with sufficient energy and a credible pitch peak (75-400 Hz).
    When no frame qualifies, falls back to a spectral-flatness proxy.
    Values are clipped to [0, 30] dB; typical clear speech is 10-20 dB.
    Returns 15.0 (neutral) for too-short input or on failure.
    """
    if len(audio) == 0 or len(audio) < 2048:
        return 15.0  # Neutral default

    try:
        win, step = 2048, 512
        per_frame = []

        for start in range(0, len(audio) - win, step):
            frame = audio[start:start + win]

            # Skip near-silent frames.
            if np.sum(frame ** 2) < 0.001:
                continue

            # Normalized autocorrelation (lag >= 0 half only).
            ac = np.correlate(frame, frame, mode='full')
            ac = ac[len(ac) // 2:]
            if ac[0] <= 0:
                continue
            ac = ac / ac[0]

            # Search for the pitch peak between 75 Hz and 400 Hz.
            lag_lo = int(self.sample_rate / 400)  # Max 400 Hz
            lag_hi = int(self.sample_rate / 75)   # Min 75 Hz
            if lag_hi >= len(ac):
                continue

            peak = int(np.argmax(ac[lag_lo:lag_hi])) + lag_lo
            if peak > 0 and ac[peak] > 0.3:  # Minimum correlation threshold
                harmonic = ac[peak]
                aperiodic = 1 - harmonic
                if aperiodic > 0:
                    hnr_db = 10 * np.log10(harmonic / aperiodic)
                    per_frame.append(np.clip(hnr_db, 0, 30))

        if len(per_frame) > 0:
            # Median is more robust to outlier frames than the mean.
            return float(np.median(per_frame))

        # Fallback: invert spectral flatness onto an HNR-like 0-25 scale.
        flatness = np.mean(librosa.feature.spectral_flatness(y=audio))
        return float((1 - np.clip(flatness, 0, 1)) * 25)

    except Exception as e:
        print(f"HNR extraction failed: {e}")
        return 15.0  # Safe default
153
+
154
def classify_noise_type(self, audio: np.ndarray) -> Dict[str, float]:
    """
    V2: classify background noise as traffic/office/crowd/wind/clean.

    Scores each class with heuristics over several spectral descriptors
    (centroid, rolloff, ZCR, low/high-band energy, overall energy, and
    spectral contrast), then normalizes the scores into a probability
    distribution. Defaults to 'clean' for short/degenerate input or when
    no heuristic fires.
    """
    fallback = {'traffic': 0, 'office': 0, 'crowd': 0, 'wind': 0, 'clean': 1}
    if len(audio) < 512:
        return dict(fallback)

    try:
        spec = np.abs(librosa.stft(audio))
        if spec.shape[1] == 0:
            return dict(fallback)

        # Spectral descriptors used by the heuristics below.
        centroid = np.mean(librosa.feature.spectral_centroid(S=spec, sr=self.sample_rate))
        # NOTE(review): rolloff is computed but not used by any score below.
        rolloff = np.mean(librosa.feature.spectral_rolloff(S=spec, sr=self.sample_rate))
        zcr = np.mean(librosa.feature.zero_crossing_rate(audio))

        freqs = librosa.fft_frequencies(sr=self.sample_rate, n_fft=2048)
        low_band = freqs < 500
        high_band = freqs > 4000
        low_energy = np.mean(spec[low_band, :]) if np.any(low_band) else 0
        high_energy = np.mean(spec[high_band, :]) if np.any(high_band) else 0

        total_energy = np.mean(audio ** 2)
        contrast = np.mean(librosa.feature.spectral_contrast(S=spec, sr=self.sample_rate))

        scores = {label: 0.0 for label in fallback}

        # Traffic: low-frequency rumble with a dull, low-contrast spectrum.
        if low_energy > 0.002 and centroid < 2000 and contrast < 20:
            scores['traffic'] = low_energy * 100 + (2500 - centroid) / 1000
        # Office: mid-band hum (keyboards, HVAC) at moderate level.
        if 1500 < centroid < 3500 and 0.0005 < total_energy < 0.005:
            scores['office'] = (3500 - abs(centroid - 2500)) / 1000 + contrast / 30
        # Crowd: noisy, energetic, textured (speech babble).
        if zcr > 0.08 and total_energy > 0.003 and contrast > 15:
            scores['crowd'] = zcr * 10 + total_energy * 50
        # Wind: very noisy with high-frequency hiss and flat texture.
        if zcr > 0.12 and high_energy > 0.001 and contrast < 15:
            scores['wind'] = zcr * 8 + high_energy * 100
        # Clean: quiet, smooth, high-contrast signal (speech only).
        if total_energy < 0.005 and zcr < 0.08 and contrast > 20:
            scores['clean'] = (0.005 - total_energy) * 200 + contrast / 30

        # No heuristic fired convincingly -> assume clean.
        if max(scores.values()) < 0.1:
            scores['clean'] = 1.0

        # Normalize to probabilities.
        norm = sum(scores.values())
        if norm > 0:
            return {label: value / norm for label, value in scores.items()}
        scores['clean'] = 1.0
        return scores

    except Exception as e:
        print(f"Noise classification failed: {e}")
        return {'traffic': 0, 'office': 0, 'crowd': 0, 'wind': 0, 'clean': 1}
244
+
245
def extract_speech_rate(self, audio: np.ndarray, transcript: str) -> float:
    """V3: speech rate in words per second (0.0 for empty transcript or audio)."""
    if not transcript:
        return 0.0

    seconds = len(audio) / self.sample_rate
    if seconds == 0:
        return 0.0

    return len(transcript.split()) / seconds
257
+
258
def extract_pitch_features(self, audio: np.ndarray) -> Tuple[float, float]:
    """V4-V5: mean and std of F0 over voiced frames; (0.0, 0.0) on failure
    or when the clip is too short / entirely unvoiced."""
    try:
        if len(audio) < 2048:
            return 0.0, 0.0

        # pyin is more robust than plain yin and also yields a voicing mask.
        f0, voiced, _probs = librosa.pyin(
            audio,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=self.sample_rate
        )

        voiced_f0 = f0[voiced]
        if len(voiced_f0) == 0:
            return 0.0, 0.0

        return float(np.mean(voiced_f0)), float(np.std(voiced_f0))
    except Exception as e:
        print(f"Pitch extraction failed: {e}")
        return 0.0, 0.0
282
+
283
def extract_energy_features(self, audio: np.ndarray) -> Tuple[float, float]:
    """V6-V7: mean and std of the frame-wise RMS energy.

    Returns (0.0, 0.0) if RMS extraction fails (e.g. degenerate input).
    """
    try:
        rms = librosa.feature.rms(y=audio)[0]
        return float(np.mean(rms)), float(np.std(rms))
    # Fix: the original used a bare `except:`, which also swallows
    # SystemExit/KeyboardInterrupt; catch only real errors.
    except Exception:
        return 0.0, 0.0
290
+
291
def extract_pause_features(self, audio: np.ndarray) -> Tuple[float, float, int]:
    """
    V8-V10: pause ratio, average pause duration (s), and mid-pause count.

    Runs Silero VAD to find speech segments; gaps between consecutive
    segments are the pauses. Mid-pauses are gaps of 0.3-1.0 s. Returns
    zeros when the VAD model is unavailable, the clip is too short, or
    the VAD call fails.
    """
    if self.vad_model is None or len(audio) < 512:
        return 0.0, 0.0, 0

    # Resample for the VAD when it is configured for a different rate.
    if self.vad_sample_rate != self.sample_rate:
        try:
            audio = librosa.resample(audio, orig_sr=self.sample_rate, target_sr=self.vad_sample_rate)
        except Exception:
            pass  # best effort: keep the original-rate samples

    # Silero expects a float tensor.
    wav = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)

    try:
        segments = self.get_speech_timestamps(wav, self.vad_model, sampling_rate=self.vad_sample_rate)

        total = len(audio)
        if total == 0:
            return 0.0, 0.0, 0

        # Pause ratio: fraction of samples outside any speech segment.
        voiced = sum(seg['end'] - seg['start'] for seg in segments)
        pause_ratio = (total - voiced) / total

        # Positive inter-segment gaps, converted to seconds.
        gaps = [
            (nxt['start'] - cur['end']) / self.vad_sample_rate
            for cur, nxt in zip(segments, segments[1:])
            if nxt['start'] - cur['end'] > 0
        ]

        avg_gap = float(np.mean(gaps)) if gaps else 0.0
        mid_count = sum(1 for g in gaps if 0.3 <= g <= 1.0)

        return float(pause_ratio), float(avg_gap), int(mid_count)

    except Exception as e:
        print(f"VAD Error: {e}")
        return 0.0, 0.0, 0
341
+
342
def extract_all(self, audio: np.ndarray, transcript: str = "") -> Dict[str, float]:
    """Extract the full feature set: 14 base audio features plus the 3
    emotion features when the emotion extractor is enabled and loaded."""
    if audio.dtype != np.float32:
        audio = audio.astype(np.float32)

    feats: Dict[str, float] = {}

    # V1: HNR, published under the legacy 'snr' key for compatibility.
    feats['v1_snr'] = self.extract_hnr(audio)

    # V2: one probability per background-noise class.
    for label, prob in self.classify_noise_type(audio).items():
        feats[f'v2_noise_{label}'] = prob

    # V3: speech rate from the transcript.
    feats['v3_speech_rate'] = self.extract_speech_rate(audio, transcript)

    # V4-V5: pitch statistics.
    feats['v4_pitch_mean'], feats['v5_pitch_std'] = self.extract_pitch_features(audio)

    # V6-V7: energy statistics.
    feats['v6_energy_mean'], feats['v7_energy_std'] = self.extract_energy_features(audio)

    # V8-V10: pause statistics from the VAD.
    ratio, avg_gap, mid_count = self.extract_pause_features(audio)
    feats['v8_pause_ratio'] = ratio
    feats['v9_avg_pause_dur'] = avg_gap
    feats['v10_mid_pause_cnt'] = float(mid_count)

    # V11-V13: emotion features (zero-filled if extraction fails).
    if self.use_emotion and self.emotion_extractor is not None:
        try:
            feats.update(self.emotion_extractor.extract_all(audio, self.sample_rate))
        except Exception as e:
            print(f"⚠ Emotion features skipped: {e}")
            # Add zero values for compatibility
            feats['v11_emotion_stress'] = 0.0
            feats['v12_emotion_energy'] = 0.0
            feats['v13_emotion_valence'] = 0.0

    return feats
393
+
394
def extract_basic(self, audio: np.ndarray, transcript: str = "") -> Dict[str, float]:
    """
    Extract a minimal, low-cost feature subset for fast decisions:
    HNR, speech rate, energy statistics, and pause statistics.
    """
    if audio.dtype != np.float32:
        audio = audio.astype(np.float32)

    feats: Dict[str, float] = {
        'v1_snr': self.extract_hnr(audio),  # legacy key name kept for compatibility
        'v3_speech_rate': self.extract_speech_rate(audio, transcript),
    }

    feats['v6_energy_mean'], feats['v7_energy_std'] = self.extract_energy_features(audio)

    ratio, avg_gap, mid_count = self.extract_pause_features(audio)
    feats['v8_pause_ratio'] = ratio
    feats['v9_avg_pause_dur'] = avg_gap
    feats['v10_mid_pause_cnt'] = float(mid_count)

    return feats
416
+
417
+
418
def _demo() -> None:
    """Smoke test: construct an extractor and report readiness."""
    extractor = AudioFeatureExtractor()
    print("Audio Feature Extractor initialized successfully")
    print("Using HNR instead of SNR for better voice quality measurement")


if __name__ == "__main__":
    _demo()
emotion_features.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Emotion Feature Extractor - Using NeuroByte Models
3
+ Extracts emotion features from audio for busy detection.
4
+
5
+ Uses 3 pre-trained Keras models from NeuroByte-Consulting:
6
+ 1. CRNN (Convolutional Recurrent Neural Network) - Best for sequential patterns
7
+ 2. Mel Spectrogram CNN - Best for frequency patterns
8
+ 3. MFCC CNN - Best for speech characteristics
9
+
10
+ Each model outputs 7 emotion classes: angry, disgust, fear, happy, neutral, sad, surprise
11
+ """
12
+
13
+ import numpy as np
14
+ import librosa
15
+ import warnings
16
+ from typing import Dict, Optional
17
+ import os
18
+
19
+ warnings.filterwarnings("ignore")
20
+
21
+ try:
22
+ import tensorflow as tf
23
+ from tensorflow import keras
24
+ TENSORFLOW_AVAILABLE = True
25
+ except ImportError:
26
+ TENSORFLOW_AVAILABLE = False
27
+ print("[WARN] TensorFlow not available. Install with: pip install tensorflow")
28
+
29
+
30
class EmotionFeatureExtractor:
    """Extract emotion features using NeuroByte pre-trained models"""

    # Output classes shared by all three NeuroByte models.
    EMOTIONS = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

    def __init__(self, models_dir: str = None, use_ensemble: bool = True):
        """
        Initialize the emotion detector.

        Args:
            models_dir: Directory containing the .keras model files;
                        defaults to 'models' next to this script.
            use_ensemble: True averages every loaded model (more accurate);
                          False relies on the CRNN model alone (faster).
        """
        if models_dir is None:
            # Default to 'models' folder in same directory as this script
            models_dir = os.path.join(os.path.dirname(__file__), 'models')

        self.models_dir = models_dir
        self.use_ensemble = use_ensemble
        self.models = {}

        # Without TensorFlow only the acoustic-heuristic fallback is usable.
        if not TENSORFLOW_AVAILABLE:
            print("[WARN] TensorFlow not installed. Falling back to acoustic features.")
            self.use_tensorflow = False
            return

        self.use_tensorflow = True

        print(f"Loading NeuroByte emotion models from {models_dir}...")

        model_files = {
            'crnn': 'emotion_recognition_crnn.keras',
            'mel_spec': 'emotion_recognition_mel_spec.keras',
            'mfcc': 'emotion_recognition_mfcc.keras'
        }

        for name, fname in model_files.items():
            path = os.path.join(models_dir, fname)
            if not os.path.exists(path):
                print(f"[WARN] Model not found: {path}")
                continue
            try:
                self.models[name] = keras.models.load_model(path)
                print(f"[OK] Loaded {name} model")
            except Exception as e:
                print(f"[WARN] Failed to load {name}: {e}")

        # Fall back to acoustics when nothing could be loaded.
        if len(self.models) == 0:
            print("[WARN] No models loaded. Using acoustic features fallback.")
            self.use_tensorflow = False
        else:
            print(f"[OK] {len(self.models)} emotion model(s) loaded successfully")
89
+
90
def download_models(self):
    """
    Download the NeuroByte .keras models from Hugging Face into self.models_dir.

    Run this once to fetch the models:
        >>> extractor = EmotionFeatureExtractor()
        >>> extractor.download_models()

    Requires TensorFlow (to eventually load the models) and huggingface_hub.
    """
    if not TENSORFLOW_AVAILABLE:
        print("[WARN] TensorFlow required to download models")
        return

    try:
        from huggingface_hub import hf_hub_download

        os.makedirs(self.models_dir, exist_ok=True)

        repo_id = "neurobyte-org/speech-emotion-recognition"
        model_files = [
            'emotion_recognition_crnn.keras',
            'emotion_recognition_mel_spec.keras',
            'emotion_recognition_mfcc.keras'
        ]

        print(f"Downloading models from {repo_id}...")
        for filename in model_files:
            try:
                # Fix: the progress messages printed a literal "(unknown)"
                # placeholder; report the actual file being downloaded.
                print(f"  Downloading {filename}...")
                downloaded_path = hf_hub_download(
                    repo_id=repo_id,
                    filename=filename,
                    cache_dir=self.models_dir
                )

                # hf_hub_download stores files in its own cache layout; copy
                # the model to the flat location the loader expects.
                target_path = os.path.join(self.models_dir, filename)
                if downloaded_path != target_path:
                    import shutil
                    shutil.copy(downloaded_path, target_path)

                print(f"  [OK] {filename} downloaded")
            except Exception as e:
                print(f"  [WARN] Failed to download {filename}: {e}")

        print("[OK] Download complete! Reinitialize the extractor to load models.")

    except ImportError:
        print("[WARN] huggingface_hub not installed. Install with: pip install huggingface_hub")
138
+
139
def extract_mel_spectrogram(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
    """
    Compute the normalized mel-spectrogram input for the mel_spec/CRNN models.

    Returns an array of shape (216, 128, 1): 216 time frames (padded or
    truncated), 128 mel bands, one channel, values min-max scaled to [0, 1].
    """
    # The models expect 16 kHz audio.
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000

    mel = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        n_fft=2048,
        hop_length=512,
        n_mels=128,
        fmin=0,
        fmax=sr / 2
    )

    mel_db = librosa.power_to_db(mel, ref=np.max)

    # Min-max normalize to [0, 1]; epsilon avoids division by zero on silence.
    spec = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min() + 1e-8)

    # (freq, time) -> (time, freq, 1) for the CNN input.
    spec = np.expand_dims(spec.T, axis=-1)

    # Fix the time axis at 216 frames (~3 s at hop 512 / 16 kHz).
    target_length = 216
    if spec.shape[0] < target_length:
        # Pad with zeros
        spec = np.pad(spec, ((0, target_length - spec.shape[0]), (0, 0), (0, 0)), mode='constant')
    else:
        # Truncate
        spec = spec[:target_length, :, :]

    return spec
182
+
183
def extract_mfcc(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
    """
    Compute the normalized MFCC input for the mfcc model.

    Returns an array of shape (216, 40, 1): 216 time frames (padded or
    truncated), 40 coefficients, one channel, globally z-score normalized.
    """
    # The models expect 16 kHz audio.
    if sr != 16000:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
        sr = 16000

    coeffs = librosa.feature.mfcc(
        y=audio,
        sr=sr,
        n_mfcc=40,
        n_fft=2048,
        hop_length=512
    )

    # Global z-score normalization; epsilon guards constant signals.
    coeffs = (coeffs - coeffs.mean()) / (coeffs.std() + 1e-8)

    # (coef, time) -> (time, coef, 1) for the CNN input.
    coeffs = np.expand_dims(coeffs.T, axis=-1)

    # Fix the time axis at 216 frames.
    target_length = 216
    if coeffs.shape[0] < target_length:
        coeffs = np.pad(coeffs, ((0, target_length - coeffs.shape[0]), (0, 0), (0, 0)), mode='constant')
    else:
        coeffs = coeffs[:target_length, :, :]

    return coeffs
219
+
220
def predict_emotions(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]:
    """
    Predict a probability per emotion class using the loaded models.

    Ensemble mode averages every available model; otherwise only the CRNN
    runs. Falls back to the acoustic heuristics when no model is usable or
    the prediction fails.

    Returns:
        Dictionary mapping emotion labels to probabilities.
    """
    if not self.use_tensorflow or len(self.models) == 0:
        return self.extract_from_acoustics(audio, sr)

    try:
        outputs = []

        # CRNN model (if available)
        if 'crnn' in self.models:
            batch = np.expand_dims(self.extract_mel_spectrogram(audio, sr), axis=0)
            outputs.append(self.models['crnn'].predict(batch, verbose=0)[0])

        # Mel spectrogram model (ensemble only)
        if 'mel_spec' in self.models and self.use_ensemble:
            batch = np.expand_dims(self.extract_mel_spectrogram(audio, sr), axis=0)
            outputs.append(self.models['mel_spec'].predict(batch, verbose=0)[0])

        # MFCC model (ensemble only)
        if 'mfcc' in self.models and self.use_ensemble:
            batch = np.expand_dims(self.extract_mfcc(audio, sr), axis=0)
            outputs.append(self.models['mfcc'].predict(batch, verbose=0)[0])

        # Average the per-model outputs when more than one ran.
        combined = np.mean(outputs, axis=0) if len(outputs) > 1 else outputs[0]

        return {emotion: float(p) for emotion, p in zip(self.EMOTIONS, combined)}

    except Exception as e:
        print(f"⚠ Prediction failed: {e}")
        return self.extract_from_acoustics(audio, sr)
271
+
272
def extract_from_acoustics(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]:
    """
    Fallback: estimate emotion-like scores from raw acoustic features
    (no deep learning). Returns one probability per emotion label.

    Fix: the heuristics can yield negative raw scores (e.g. 'sad' goes
    negative when the spectral centroid exceeds 4 kHz), which previously
    produced negative "probabilities" or a non-positive normalizer. Scores
    are now floored at zero before normalization so the result is always a
    valid distribution; positive-score cases are unchanged.
    """
    uniform = {emotion: 1.0 / 7 for emotion in self.EMOTIONS}
    try:
        if len(audio) < 512:
            return uniform  # Too short to analyze

        # Energy statistics from the RMS envelope.
        rms = librosa.feature.rms(y=audio)[0]
        mean_energy = np.mean(rms)
        energy_std = np.std(rms)

        # Pitch statistics over voiced frames (YIN, 75-400 Hz).
        f0 = librosa.yin(audio, fmin=75, fmax=400, sr=sr)
        f0_voiced = f0[f0 > 0]
        pitch_mean = np.mean(f0_voiced) if len(f0_voiced) > 0 else 0
        pitch_std = np.std(f0_voiced) if len(f0_voiced) > 0 else 0

        zcr = np.mean(librosa.feature.zero_crossing_rate(audio))
        centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr))

        # Heuristic mapping of acoustics onto the 7 emotion classes.
        scores = {
            'angry': (energy_std * 10 + pitch_std / 50) / 2,
            'disgust': (pitch_mean / 300) * 0.3,
            'fear': (pitch_mean / 250 + zcr * 5) / 2,
            'happy': (centroid / 3000 + mean_energy * 5) / 2,
            'neutral': 0.3,  # Baseline
            'sad': (1 - centroid / 4000) * 0.5,
            'surprise': (energy_std * 8 + zcr * 3) / 2
        }

        # Floor at zero so negative heuristics cannot corrupt the distribution.
        scores = {k: max(float(v), 0.0) for k, v in scores.items()}

        total = sum(scores.values())
        if total <= 0:
            return uniform
        return {k: v / total for k, v in scores.items()}

    except Exception as e:
        print(f"⚠ Acoustic fallback failed: {e}")
        return {emotion: 1.0 / 7 for emotion in self.EMOTIONS}
314
+
315
def extract_all(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]:
    """
    Derive the three busy-detection emotion features from class probabilities.

    Returns:
        v11_emotion_stress: 0-1, weighted angry/fear/disgust
        v12_emotion_energy: 0-1, weighted happy/surprise/angry
        v13_emotion_valence: 0-1, positive minus negative affect, rescaled
    """
    if audio.dtype != np.float32:
        audio = audio.astype(np.float32)

    probs = self.predict_emotions(audio, sr)

    angry = probs.get('angry', 0.0)
    fear = probs.get('fear', 0.0)
    disgust = probs.get('disgust', 0.0)
    happy = probs.get('happy', 0.0)
    surprise = probs.get('surprise', 0.0)
    sad = probs.get('sad', 0.0)

    # Weighted combinations of the class probabilities.
    stress = angry * 0.5 + fear * 0.3 + disgust * 0.2
    energy = happy * 0.4 + surprise * 0.3 + angry * 0.3

    # Raw valence lives in roughly [-1.5, 1.5]; rescale the [-1, 1] core
    # to [0, 1] and clip the remainder.
    raw_valence = happy + surprise * 0.5 - sad - angry * 0.5
    valence = (raw_valence + 1.0) / 2.0

    return {
        'v11_emotion_stress': float(np.clip(stress, 0, 1)),
        'v12_emotion_energy': float(np.clip(energy, 0, 1)),
        'v13_emotion_valence': float(np.clip(valence, 0, 1))
    }
358
+
359
+
360
# Standalone test
def _run_demo() -> None:
    """Smoke test: run the extractor on synthetic stressed/calm audio clips."""
    import time

    print("Testing NeuroByte Emotion Feature Extractor...")

    # Initialize extractor
    extractor = EmotionFeatureExtractor(
        models_dir="models_cache/emotion_models",
        use_ensemble=True
    )

    # If models not found, point at the downloader and use the fallback.
    if not extractor.use_tensorflow or len(extractor.models) == 0:
        print("\nModels not found. Download them with:")
        print("  extractor.download_models()")
        print("\nUsing acoustic fallback for now...")

    # Three seconds of synthetic 16 kHz audio.
    duration = 3
    sr = 16000
    t = np.linspace(0, duration, sr * duration)

    # Test 1: Stressed voice (high pitch, varying)
    print("\n1. Testing with stressed audio:")
    audio_stressed = np.sin(2 * np.pi * 300 * t) + 0.5 * np.sin(2 * np.pi * 150 * t)
    audio_stressed += 0.2 * np.random.randn(len(audio_stressed))

    start = time.time()
    features_stressed = extractor.extract_all(audio_stressed, sr)
    print(f"   Time: {(time.time() - start)*1000:.0f}ms")
    print("   Features:")
    for k, v in features_stressed.items():
        print(f"      {k}: {v:.3f}")

    # Test 2: Calm voice (low pitch, steady)
    print("\n2. Testing with calm audio:")
    audio_calm = np.sin(2 * np.pi * 150 * t) * 0.3

    start = time.time()
    features_calm = extractor.extract_all(audio_calm, sr)
    print(f"   Time: {(time.time() - start)*1000:.0f}ms")
    print("   Features:")
    for k, v in features_calm.items():
        print(f"      {k}: {v:.3f}")

    # Fix: the completion banner contained mojibake ("βœ“", a misdecoded
    # UTF-8 check mark); restore the intended character, matching the
    # correctly-encoded "⚠" used elsewhere in this module.
    print("\n✓ Tests complete!")

    if extractor.use_tensorflow and len(extractor.models) > 0:
        print(f"\nUsing {len(extractor.models)} NeuroByte model(s)")
    else:
        print("\nUsing acoustic features fallback")


if __name__ == "__main__":
    _run_demo()
handler.py CHANGED
@@ -22,276 +22,56 @@ warnings.filterwarnings("ignore")
22
 
23
 
24
  # ──────────────────────────────────────────────────────────────────────── #
25
- # Constants & Defaults
26
  # ──────────────────────────────────────────────────────────────────────── #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- DEFAULT_AUDIO_FEATURES = {
29
- "v1_snr": 0.0,
30
- "v2_noise_traffic": 0.0, "v2_noise_office": 0.0,
31
- "v2_noise_crowd": 0.0, "v2_noise_wind": 0.0, "v2_noise_clean": 1.0,
32
- "v3_speech_rate": 0.0,
33
- "v4_pitch_mean": 0.0, "v5_pitch_std": 0.0,
34
- "v6_energy_mean": 0.0, "v7_energy_std": 0.0,
35
- "v8_pause_ratio": 0.0, "v9_avg_pause_dur": 0.0, "v10_mid_pause_cnt": 0,
36
- "v11_emotion_stress": 0.0, "v12_emotion_energy": 0.0, "v13_emotion_valence": 0.0,
37
- }
38
-
 
39
 
40
  # ──────────────────────────────────────────────────────────────────────── #
41
- # Emotion CNN (mirrors src/emotion_features.py EmotionCNN)
42
  # ──────────────────────────────────────────────────────────────────────── #
43
-
44
- class EmotionCNN:
45
- """Lightweight CNN for emotion embedding from spectrograms (MobileNetV3)."""
46
-
47
- def __init__(self):
48
- self.model = models.mobilenet_v3_small(pretrained=True)
49
- self.model.classifier = nn.Identity()
50
- self.model.eval()
51
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
52
- if self.device == "cuda":
53
- self.model = self.model.cuda()
54
-
55
- def audio_to_spectrogram(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
56
- mel_spec = librosa.feature.melspectrogram(
57
- y=audio, sr=sr, n_fft=512, hop_length=64, n_mels=128, fmin=0, fmax=sr / 2
58
- )
59
- mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
60
- mel_spec_db = np.clip(mel_spec_db, -80, 0)
61
- mel_spec_norm = (mel_spec_db + 80) / 80
62
-
63
- try:
64
- from skimage.transform import resize
65
- mel_resized = resize(mel_spec_norm, (224, 224), mode="constant")
66
- except ImportError:
67
- # Fallback: resizing with numpy interpolation (nearest neighbor for rows, linear for cols)
68
- target_h, target_w = 224, 224
69
- source_h, source_w = mel_spec_norm.shape
70
-
71
- if source_h > 0 and source_w > 0:
72
- # 1. Resize height (rows)
73
- row_indices = np.linspace(0, source_h - 1, target_h).astype(int)
74
- # Select rows (nearest neighbor)
75
- temp = mel_spec_norm[row_indices, :]
76
-
77
- # 2. Resize width (cols)
78
- mel_resized = np.zeros((target_h, target_w), dtype=mel_spec_norm.dtype)
79
- x_source = np.arange(source_w)
80
- x_target = np.linspace(0, source_w - 1, target_w)
81
-
82
- for i in range(target_h):
83
- mel_resized[i, :] = np.interp(x_target, x_source, temp[i, :])
84
  else:
85
- mel_resized = np.zeros((224, 224))
86
-
87
- try:
88
- from matplotlib import cm
89
- colormap = cm.get_cmap("jet")
90
- rgb = colormap(mel_resized)[:, :, :3]
91
- except (ImportError, Exception):
92
- # Fallback: stack grayscale into 3 channels
93
- rgb = np.stack([mel_resized] * 3, axis=-1)
94
-
95
- return np.transpose(rgb, (2, 0, 1)).astype(np.float32)
96
-
97
- def extract_embedding(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
98
- try:
99
- spec_rgb = self.audio_to_spectrogram(audio, sr)
100
- tensor = torch.from_numpy(spec_rgb).unsqueeze(0)
101
- if self.device == "cuda":
102
- tensor = tensor.cuda()
103
- with torch.no_grad():
104
- emb = self.model(tensor)
105
- return emb.cpu().numpy().flatten()
106
- except Exception as e:
107
- print(f"[WARN] EmotionCNN embedding extraction failed: {e}")
108
- return np.zeros(576) # MobileNetV3-small output size
109
 
110
 
111
- # ──────────────────────────────────────────────────────────────────────── #
112
- # Audio Feature Extractor (mirrors src/audio_features.py)
113
- # ──────────────────────────────────────────────────────────────────────── #
114
-
115
- class AudioFeatureExtractorEndpoint:
116
- """Stateless audio feature extraction for HF endpoint."""
117
-
118
- def __init__(self):
119
- self.sr = 16000
120
- self.emotion_cnn = EmotionCNN()
121
-
122
- # Load Silero VAD - optimized for CPU-only HF Spaces
123
- try:
124
- # Force CPU mode (HF Free Spaces don't have GPU)
125
- torch.set_num_threads(1)
126
-
127
- # Load from torch.hub (most reliable method)
128
- print("[INFO] Loading Silero VAD from torch.hub...")
129
- self.vad_model, self.vad_utils = torch.hub.load(
130
- repo_or_dir='snakers4/silero-vad',
131
- model='silero_vad',
132
- force_reload=False,
133
- trust_repo=True,
134
- verbose=False
135
- )
136
-
137
- # Force model to CPU
138
- self.vad_model = self.vad_model.cpu()
139
- self.vad_model.eval()
140
-
141
- # Extract the get_speech_timestamps utility
142
- self.get_speech_timestamps = self.vad_utils[0]
143
-
144
- print("✅ Silero VAD loaded successfully (CPU mode)")
145
-
146
- except Exception as e:
147
- print(f"⚠️ Silero VAD failed to load: {e}")
148
- print(f" Audio features will use fallback values for pause detection")
149
- self.vad_model = None
150
- self.get_speech_timestamps = None
151
-
152
- # -------- V1: SNR --------
153
- def extract_snr(self, audio: np.ndarray) -> float:
154
- if len(audio) == 0:
155
- return 0.0
156
- frame_length = min(2048, len(audio))
157
- frames = librosa.util.frame(audio, frame_length=frame_length, hop_length=frame_length // 2)
158
- frame_energy = np.sum(frames ** 2, axis=0)
159
- if len(frame_energy) < 2:
160
- return 0.0
161
- sorted_energy = np.sort(frame_energy)
162
- n_noise = max(1, len(sorted_energy) // 5)
163
- noise_floor = np.mean(sorted_energy[:n_noise])
164
- signal_power = np.mean(sorted_energy)
165
- if noise_floor <= 0:
166
- return 40.0
167
- snr = 10 * np.log10(signal_power / noise_floor + 1e-10)
168
- return float(np.clip(snr, -10, 40))
169
-
170
- # -------- V2: Noise classification --------
171
- def classify_noise_type(self, audio: np.ndarray) -> Dict[str, float]:
172
- if len(audio) < 2048:
173
- return {
174
- "v2_noise_traffic": 0.0, "v2_noise_office": 0.0,
175
- "v2_noise_crowd": 0.0, "v2_noise_wind": 0.0, "v2_noise_clean": 1.0,
176
- }
177
- spec = np.abs(librosa.stft(audio, n_fft=2048))
178
- freq_bins = librosa.fft_frequencies(sr=self.sr, n_fft=2048)
179
-
180
- low = np.mean(spec[(freq_bins >= 50) & (freq_bins <= 500)])
181
- mid = np.mean(spec[(freq_bins >= 500) & (freq_bins <= 2000)])
182
- high = np.mean(spec[(freq_bins >= 2000) & (freq_bins <= 6000)])
183
- total = low + mid + high + 1e-10
184
-
185
- low_r, mid_r, high_r = low / total, mid / total, high / total
186
- spectral_centroid = float(np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sr)))
187
- spectral_flatness = float(np.mean(librosa.feature.spectral_flatness(y=audio)))
188
-
189
- noise = {
190
- "v2_noise_traffic": float(np.clip(low_r * 2 - 0.3, 0, 1)),
191
- "v2_noise_office": float(np.clip(mid_r * 1.5 - 0.2, 0, 1) if spectral_flatness > 0.01 else 0),
192
- "v2_noise_crowd": float(np.clip(mid_r * 2 - 0.5, 0, 1) if spectral_centroid > 1500 else 0),
193
- "v2_noise_wind": float(np.clip(low_r * 3 - 0.8, 0, 1) if spectral_flatness > 0.1 else 0),
194
- }
195
- noise["v2_noise_clean"] = float(np.clip(1 - max(noise.values()), 0, 1))
196
- return noise
197
-
198
- # -------- V3: Speech rate --------
199
- def extract_speech_rate(self, audio: np.ndarray, transcript: str) -> float:
200
- if not transcript:
201
- return 0.0
202
- word_count = len(transcript.split())
203
- duration = len(audio) / self.sr
204
- if duration == 0:
205
- return 0.0
206
- return float(word_count / duration)
207
-
208
- # -------- V4-V5: Pitch --------
209
- def extract_pitch_features(self, audio: np.ndarray) -> Dict[str, float]:
210
- try:
211
- pitches, magnitudes = librosa.piptrack(y=audio, sr=self.sr)
212
- pitch_values = pitches[magnitudes > np.median(magnitudes)]
213
- pitch_values = pitch_values[pitch_values > 0]
214
- if len(pitch_values) == 0:
215
- return {"v4_pitch_mean": 0.0, "v5_pitch_std": 0.0}
216
- return {
217
- "v4_pitch_mean": float(np.mean(pitch_values)),
218
- "v5_pitch_std": float(np.std(pitch_values)),
219
- }
220
- except Exception:
221
- return {"v4_pitch_mean": 0.0, "v5_pitch_std": 0.0}
222
-
223
- # -------- V6-V7: Energy --------
224
- def extract_energy_features(self, audio: np.ndarray) -> Dict[str, float]:
225
- rms = librosa.feature.rms(y=audio)[0]
226
- return {"v6_energy_mean": float(np.mean(rms)), "v7_energy_std": float(np.std(rms))}
227
-
228
- # -------- V8-V10: Pause features (Silero VAD) --------
229
- def extract_pause_features(self, audio: np.ndarray) -> Dict[str, float]:
230
- defaults = {"v8_pause_ratio": 0.0, "v9_avg_pause_dur": 0.0, "v10_mid_pause_cnt": 0}
231
- if self.vad_model is None or len(audio) < self.sr:
232
- return defaults
233
- try:
234
- audio_tensor = torch.FloatTensor(audio)
235
- timestamps = self.get_speech_timestamps(audio_tensor, self.vad_model, sampling_rate=self.sr)
236
- if not timestamps:
237
- return {"v8_pause_ratio": 1.0, "v9_avg_pause_dur": len(audio) / self.sr, "v10_mid_pause_cnt": 0}
238
-
239
- total_speech = sum(t["end"] - t["start"] for t in timestamps)
240
- total_samples = len(audio)
241
- pause_ratio = 1.0 - (total_speech / total_samples)
242
-
243
- pauses = []
244
- for i in range(1, len(timestamps)):
245
- gap = (timestamps[i]["start"] - timestamps[i - 1]["end"]) / self.sr
246
- if gap > 0.1:
247
- pauses.append(gap)
248
-
249
- return {
250
- "v8_pause_ratio": float(np.clip(pause_ratio, 0, 1)),
251
- "v9_avg_pause_dur": float(np.mean(pauses)) if pauses else 0.0,
252
- "v10_mid_pause_cnt": len([p for p in pauses if 0.3 < p < 2.0]),
253
- }
254
- except Exception:
255
- return defaults
256
-
257
- # -------- V11-V13: Emotion features --------
258
- def extract_emotion_features(self, audio: np.ndarray) -> Dict[str, float]:
259
- try:
260
- embedding = self.emotion_cnn.extract_embedding(audio, self.sr)
261
- stress_indices = [0, 100, 200, 300, 400]
262
- stress_values = embedding[stress_indices]
263
- stress_score = float(np.clip(np.mean(np.abs(stress_values)), 0, 1))
264
- return {
265
- "v11_emotion_stress": stress_score,
266
- "v12_emotion_energy": float(np.mean(np.abs(embedding[500:600]))),
267
- "v13_emotion_valence": float(np.mean(embedding[700:800])),
268
- }
269
- except Exception:
270
- return {"v11_emotion_stress": 0.0, "v12_emotion_energy": 0.0, "v13_emotion_valence": 0.0}
271
-
272
- # -------- Main: extract all --------
273
- def extract_all(self, audio: np.ndarray, transcript: str = "") -> Dict[str, float]:
274
- features = {}
275
- features["v1_snr"] = self.extract_snr(audio)
276
- features.update(self.classify_noise_type(audio))
277
- features["v3_speech_rate"] = self.extract_speech_rate(audio, transcript)
278
- features.update(self.extract_pitch_features(audio))
279
- features.update(self.extract_energy_features(audio))
280
- features.update(self.extract_pause_features(audio))
281
- features.update(self.extract_emotion_features(audio))
282
-
283
- # Sanitize: replace NaN/Inf with 0.0 (prevents JSON serialization errors)
284
- for key, val in features.items():
285
- if isinstance(val, (float, np.floating)):
286
- if np.isnan(val) or np.isinf(val):
287
- features[key] = 0.0
288
- else:
289
- features[key] = float(val) # ensure native Python float
290
- elif isinstance(val, (int, np.integer)):
291
- features[key] = int(val)
292
-
293
- return features
294
-
295
 
296
  # ──────────────────────────────────────────────────────────────────────── #
297
  # FastAPI handler for deployment (HF Spaces / Cloud Run / Lambda)
@@ -323,7 +103,7 @@ async def global_exception_handler(request: Request, exc: Exception):
323
  content={**DEFAULT_AUDIO_FEATURES, "_error": str(exc), "_handler": "global"},
324
  )
325
 
326
- extractor = AudioFeatureExtractorEndpoint()
327
 
328
  # ──────────────────────────────────────────────────────────────────────── #
329
  # Constants & Defaults
@@ -345,7 +125,13 @@ async def root():
345
 
346
  @app.get("/health")
347
  async def health():
348
- return {"status": "healthy", "vad_loaded": extractor.vad_model is not None}
 
 
 
 
 
 
349
 
350
 
351
  @app.post("/extract-audio-features")
@@ -353,9 +139,13 @@ async def extract_audio_features(audio: UploadFile = File(...), transcript: str
353
  """Extract all 17 voice features from uploaded audio file."""
354
  try:
355
  audio_bytes = await audio.read()
 
356
  y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
 
 
357
  features = extractor.extract_all(y, transcript)
358
- return features
 
359
  except Exception as e:
360
  print(f"[ERROR] extract_audio_features: {e}")
361
  traceback.print_exc()
@@ -403,7 +193,7 @@ async def extract_audio_features_base64(data: AudioBase64Request):
403
 
404
  features = extractor.extract_all(y, transcript)
405
  print(f"[OK] Extracted {len(features)} audio features")
406
- return features
407
  except Exception as e:
408
  print(f"[ERROR] extract_audio_features_base64: {e}")
409
  traceback.print_exc()
@@ -416,3 +206,4 @@ if __name__ == "__main__":
416
  import os
417
  port = int(os.environ.get("PORT", 7860))
418
  uvicorn.run(app, host="0.0.0.0", port=port)
 
 
22
 
23
 
24
  # ──────────────────────────────────────────────────────────────────────── #
25
+ # Imports from standardized modules
26
  # ──────────────────────────────────────────────────────────────────────── #
27
+ try:
28
+ from audio_features import AudioFeatureExtractor
29
+ except ImportError:
30
+ # Fallback if running from a different context
31
+ import sys
32
+ sys.path.append('.')
33
+ from audio_features import AudioFeatureExtractor
34
+
35
+ # Initialize global extractor
36
+ # We use a global instance to cache models (VAD, Emotion)
37
+ print("[INFO] Initializing Global AudioFeatureExtractor...")
38
+ extractor = AudioFeatureExtractor(
39
+ sample_rate=16000,
40
+ use_emotion=True,
41
+ models_dir="models" # Dockerfile should place models here or download them
42
+ )
43
 
44
+ # Ensure models are downloaded/ready
45
+ if extractor.use_emotion and extractor.emotion_extractor:
46
+ print("[INFO] Checking for emotion models...")
47
+ # Trigger download if needed/possible
48
+ try:
49
+ if len(extractor.emotion_extractor.models) == 0:
50
+ print("[INFO] Models not found, attempting download...")
51
+ extractor.emotion_extractor.download_models()
52
+ # Re-init manually to load them
53
+ extractor.emotion_extractor.__init__(models_dir=extractor.emotion_extractor.models_dir)
54
+ except Exception as e:
55
+ print(f"[WARN] Failed to download emotion models: {e}")
56
 
57
  # ──────────────────────────────────────────────────────────────────────── #
58
+ # Helper to handle NaN/Inf for JSON
59
  # ──────────────────────────────────────────────────────────────────────── #
60
+ def sanitize_features(features: Dict[str, float]) -> Dict[str, float]:
61
+ sanitized = {}
62
+ for key, val in features.items():
63
+ if isinstance(val, (float, np.floating)):
64
+ if np.isnan(val) or np.isinf(val):
65
+ sanitized[key] = 0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  else:
67
+ sanitized[key] = float(val)
68
+ elif isinstance(val, (int, np.integer)):
69
+ sanitized[key] = int(val)
70
+ else:
71
+ sanitized[key] = val # keep string/other as is
72
+ return sanitized
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  # ──────────────────────────────────────────────────────────────────────── #
77
  # FastAPI handler for deployment (HF Spaces / Cloud Run / Lambda)
 
103
  content={**DEFAULT_AUDIO_FEATURES, "_error": str(exc), "_handler": "global"},
104
  )
105
 
106
+ # Extractor is already initialized globally above
107
 
108
  # ──────────────────────────────────────────────────────────────────────── #
109
  # Constants & Defaults
 
125
 
126
  @app.get("/health")
127
  async def health():
128
+ vad_status = extractor.vad_model is not None
129
+ emotion_status = extractor.emotion_extractor is not None if extractor.use_emotion else False
130
+ return {
131
+ "status": "healthy",
132
+ "vad_loaded": vad_status,
133
+ "emotion_loaded": emotion_status
134
+ }
135
 
136
 
137
  @app.post("/extract-audio-features")
 
139
  """Extract all 17 voice features from uploaded audio file."""
140
  try:
141
  audio_bytes = await audio.read()
142
+ # librosa.load returns (audio, sr)
143
  y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
144
+
145
+ # AudioFeatureExtractor.extract_all expects numpy array and optional transcript
146
  features = extractor.extract_all(y, transcript)
147
+
148
+ return sanitize_features(features)
149
  except Exception as e:
150
  print(f"[ERROR] extract_audio_features: {e}")
151
  traceback.print_exc()
 
193
 
194
  features = extractor.extract_all(y, transcript)
195
  print(f"[OK] Extracted {len(features)} audio features")
196
+ return sanitize_features(features)
197
  except Exception as e:
198
  print(f"[ERROR] extract_audio_features_base64: {e}")
199
  traceback.print_exc()
 
206
  import os
207
  port = int(os.environ.get("PORT", 7860))
208
  uvicorn.run(app, host="0.0.0.0", port=port)
209
+
requirements.txt CHANGED
@@ -4,15 +4,17 @@ soundfile==0.12.1
4
  numpy==1.24.3
5
  scipy==1.11.2
6
 
7
- # ML - CPU-only versions (for HF Free Spaces without GPU)
 
8
  --extra-index-url https://download.pytorch.org/whl/cpu
9
  torch==2.1.0+cpu
10
- torchvision==0.16.0+cpu
11
  torchaudio==2.1.0+cpu
12
 
 
 
 
13
  # API
14
  fastapi==0.95.2
15
  uvicorn==0.22.0
16
  python-multipart==0.0.6
17
  huggingface_hub>=0.19.0
18
- scikit-image>=0.21.0
 
4
  numpy==1.24.3
5
  scipy==1.11.2
6
 
7
+ # ML - CPU-only versions (HF Spaces friendly)
8
+ # Torch for Silero VAD
9
  --extra-index-url https://download.pytorch.org/whl/cpu
10
  torch==2.1.0+cpu
 
11
  torchaudio==2.1.0+cpu
12
 
13
+ # TensorFlow for Emotion Models
14
+ tensorflow-cpu==2.15.0
15
+
16
  # API
17
  fastapi==0.95.2
18
  uvicorn==0.22.0
19
  python-multipart==0.0.6
20
  huggingface_hub>=0.19.0