EurekaPotato committed on
Commit
563e76e
·
verified ·
1 Parent(s): c59ae07

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. audio_features.py +135 -55
audio_features.py CHANGED
@@ -23,8 +23,8 @@ except ImportError:
23
 
24
  warnings.filterwarnings("ignore")
25
 
26
- class AudioFeatureExtractor:
27
- """Extract 14 audio features for busy detection (Enhanced with Silero VAD)"""
28
 
29
  _vad_model_cache = None
30
  _vad_utils_cache = None
@@ -68,10 +68,52 @@ class AudioFeatureExtractor:
68
  print(f"[WARN] Emotion features disabled: {e}")
69
  self.emotion_extractor = None
70
  self.use_emotion = False
71
- else:
72
- self.emotion_extractor = None
73
-
74
- def load_audio(self, audio_path: str) -> np.ndarray:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  """Load and preprocess audio file"""
76
  audio, sr = librosa.load(
77
  audio_path,
@@ -79,9 +121,41 @@ class AudioFeatureExtractor:
79
  mono=True,
80
  duration=self.audio_duration_limit
81
  )
82
- return audio
83
-
84
- def extract_hnr(self, audio: np.ndarray) -> float:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  """
86
  V1: Harmonics-to-Noise Ratio (HNR)
87
  Measures voice quality - higher = clearer voice
@@ -154,7 +228,7 @@ class AudioFeatureExtractor:
154
  print(f"HNR extraction failed: {e}")
155
  return 15.0 # Safe default
156
 
157
- def classify_noise_type(self, audio: np.ndarray) -> Dict[str, float]:
158
  """
159
  V2: Background Noise Classification (one-hot encoded)
160
 
@@ -166,17 +240,27 @@ class AudioFeatureExtractor:
166
  - High frequency energy (hiss)
167
  - Spectral contrast (texture)
168
  """
169
- if len(audio) < 512:
170
- return {'traffic': 0, 'office': 0, 'crowd': 0, 'wind': 0, 'clean': 1}
171
-
172
- try:
173
- # Extract comprehensive spectral features
174
- S = np.abs(librosa.stft(audio))
175
- if S.shape[1] == 0:
176
- return {'traffic': 0, 'office': 0, 'crowd': 0, 'wind': 0, 'clean': 1}
177
-
178
- # Feature 1: Spectral Centroid (brightness)
179
- centroid = np.mean(librosa.feature.spectral_centroid(S=S, sr=self.sample_rate))
 
 
 
 
 
 
 
 
 
 
180
 
181
  # Feature 2: Spectral Rolloff (energy concentration)
182
  rolloff = np.mean(librosa.feature.spectral_rolloff(S=S, sr=self.sample_rate))
@@ -283,41 +367,37 @@ class AudioFeatureExtractor:
283
  print(f"Pitch extraction failed: {e}")
284
  return 0.0, 0.0
285
 
286
- def extract_energy_features(self, audio: np.ndarray) -> Tuple[float, float]:
287
- """V6-V7: Energy Mean and Std"""
288
- try:
289
- rms = librosa.feature.rms(y=audio)[0]
290
- return float(np.mean(rms)), float(np.std(rms))
291
- except:
292
- return 0.0, 0.0
 
 
 
 
 
 
293
 
294
- def extract_pause_features(self, audio: np.ndarray) -> Tuple[float, float, int]:
295
  """
296
  V8-V10: Pause Ratio, Average Pause Duration, Mid-Pause Count
297
  Uses Silero VAD
298
  """
299
- if self.vad_model is None or len(audio) < 512:
300
- return 0.0, 0.0, 0
301
-
302
- # Resample for VAD if configured
303
- if self.vad_sample_rate != self.sample_rate:
304
- try:
305
- audio = librosa.resample(audio, orig_sr=self.sample_rate, target_sr=self.vad_sample_rate)
306
- except Exception:
307
- pass
308
-
309
- # Silero expects Tensor
310
- wav = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
311
-
312
- try:
313
- speech_dict = self.get_speech_timestamps(wav, self.vad_model, sampling_rate=self.vad_sample_rate)
314
-
315
- # Calculate speech duration
316
- speech_samples = sum(seg['end'] - seg['start'] for seg in speech_dict)
317
- total_samples = len(audio)
318
-
319
- if total_samples == 0:
320
- return 0.0, 0.0, 0
321
 
322
  # Pause Ratio
323
  pause_samples = total_samples - speech_samples
@@ -329,7 +409,7 @@ class AudioFeatureExtractor:
329
  for i in range(len(speech_dict) - 1):
330
  gap = speech_dict[i+1]['start'] - speech_dict[i]['end']
331
  if gap > 0:
332
- gaps.append(gap / self.vad_sample_rate) # Convert to seconds
333
 
334
  avg_pause_dur = float(np.mean(gaps)) if gaps else 0.0
335
 
@@ -350,8 +430,8 @@ class AudioFeatureExtractor:
350
 
351
  features = {}
352
 
353
- # V1: HNR (IMPROVED from SNR)
354
- features['v1_snr'] = self.extract_hnr(audio) # Keep name for compatibility
355
 
356
  # V2: Noise classification (IMPROVED)
357
  noise_class = self.classify_noise_type(audio)
@@ -403,7 +483,7 @@ class AudioFeatureExtractor:
403
  audio = audio.astype(np.float32)
404
 
405
  features = {}
406
- features['v1_snr'] = self.extract_hnr(audio) # Keep name for compatibility
407
  features['v3_speech_rate'] = self.extract_speech_rate(audio, transcript)
408
 
409
  e_mean, e_std = self.extract_energy_features(audio)
 
23
 
24
  warnings.filterwarnings("ignore")
25
 
26
+ class AudioFeatureExtractor:
27
+ """Extract 14 audio features for busy detection (Enhanced with Silero VAD)"""
28
 
29
  _vad_model_cache = None
30
  _vad_utils_cache = None
 
68
  print(f"[WARN] Emotion features disabled: {e}")
69
  self.emotion_extractor = None
70
  self.use_emotion = False
71
+ else:
72
+ self.emotion_extractor = None
73
+
74
+ def _prepare_vad_audio(self, audio: np.ndarray) -> Tuple[np.ndarray, List[Dict]]:
75
+ """Prepare audio for VAD and return speech timestamps."""
76
+ if self.vad_model is None or len(audio) < 512:
77
+ return audio, []
78
+
79
+ audio_vad = audio
80
+ if self.vad_sample_rate != self.sample_rate:
81
+ try:
82
+ audio_vad = librosa.resample(audio, orig_sr=self.sample_rate, target_sr=self.vad_sample_rate)
83
+ except Exception:
84
+ audio_vad = audio
85
+
86
+ wav = torch.tensor(audio_vad, dtype=torch.float32).unsqueeze(0)
87
+
88
+ try:
89
+ speech_dict = self.get_speech_timestamps(wav, self.vad_model, sampling_rate=self.vad_sample_rate)
90
+ except Exception:
91
+ speech_dict = []
92
+
93
+ return audio_vad, speech_dict
94
+
95
+ def _split_speech_pause(self, audio: np.ndarray) -> Tuple[np.ndarray, np.ndarray, int]:
96
+ """Return speech audio, pause audio, and the sample rate used for VAD."""
97
+ if self.vad_model is None:
98
+ return audio, np.array([], dtype=audio.dtype), self.sample_rate
99
+
100
+ audio_vad, speech_dict = self._prepare_vad_audio(audio)
101
+
102
+ if not speech_dict:
103
+ return np.array([], dtype=audio_vad.dtype), audio_vad, self.vad_sample_rate
104
+
105
+ mask = np.zeros(len(audio_vad), dtype=bool)
106
+ for seg in speech_dict:
107
+ start = max(0, int(seg.get('start', 0)))
108
+ end = min(len(audio_vad), int(seg.get('end', 0)))
109
+ if end > start:
110
+ mask[start:end] = True
111
+
112
+ speech_audio = audio_vad[mask]
113
+ pause_audio = audio_vad[~mask]
114
+ return speech_audio, pause_audio, self.vad_sample_rate
115
+
116
+ def load_audio(self, audio_path: str) -> np.ndarray:
117
  """Load and preprocess audio file"""
118
  audio, sr = librosa.load(
119
  audio_path,
 
121
  mono=True,
122
  duration=self.audio_duration_limit
123
  )
124
+ return audio
125
+
126
def extract_snr(self, audio: np.ndarray) -> float:
    """
    V1: Signal-to-Noise Ratio (SNR) in dB, clipped to [-10, 40].

    Signal power is measured over VAD-detected speech samples only and
    noise power over pause samples only. Clips shorter than 2048 samples
    get a neutral default of 15.0.
    """
    # Too short to split speech/pause reliably — neutral default.
    if len(audio) < 2048:
        return 15.0  # Neutral default

    try:
        speech, pause, _ = self._split_speech_pause(audio)

        if len(speech) == 0:
            return 0.0

        signal_power = float(np.mean(speech ** 2))
        if signal_power <= 0:
            return 0.0

        # Tiny floor keeps the log well-defined when pauses are silent
        # or absent.
        noise_power = float(np.mean(pause ** 2)) if len(pause) > 0 else 1e-8
        if noise_power <= 0:
            noise_power = 1e-8

        snr_db = 10.0 * np.log10(signal_power / noise_power)
        return float(np.clip(snr_db, -10.0, 40.0))
    except Exception as e:
        print(f"SNR extraction failed: {e}")
        return 15.0
157
+
158
+ def extract_hnr(self, audio: np.ndarray) -> float:
159
  """
160
  V1: Harmonics-to-Noise Ratio (HNR)
161
  Measures voice quality - higher = clearer voice
 
228
  print(f"HNR extraction failed: {e}")
229
  return 15.0 # Safe default
230
 
231
+ def classify_noise_type(self, audio: np.ndarray) -> Dict[str, float]:
232
  """
233
  V2: Background Noise Classification (one-hot encoded)
234
 
 
240
  - High frequency energy (hiss)
241
  - Spectral contrast (texture)
242
  """
243
+ if len(audio) < 512:
244
+ return {'traffic': 0, 'office': 0, 'crowd': 0, 'wind': 0, 'clean': 1}
245
+
246
+ try:
247
+ # Extract comprehensive spectral features
248
+ S = np.abs(librosa.stft(audio))
249
+ if S.shape[1] == 0:
250
+ return {'traffic': 0, 'office': 0, 'crowd': 0, 'wind': 0, 'clean': 1}
251
+
252
+ # Feature 1: Spectral Centroid (brightness) - computed on pauses only
253
+ pause_audio = None
254
+ if self.vad_model is not None:
255
+ _, pause_audio, vad_sr = self._split_speech_pause(audio)
256
+ else:
257
+ vad_sr = self.sample_rate
258
+
259
+ if pause_audio is not None and len(pause_audio) >= 512:
260
+ S_pause = np.abs(librosa.stft(pause_audio))
261
+ centroid = np.mean(librosa.feature.spectral_centroid(S=S_pause, sr=vad_sr))
262
+ else:
263
+ centroid = np.mean(librosa.feature.spectral_centroid(S=S, sr=self.sample_rate))
264
 
265
  # Feature 2: Spectral Rolloff (energy concentration)
266
  rolloff = np.mean(librosa.feature.spectral_rolloff(S=S, sr=self.sample_rate))
 
367
  print(f"Pitch extraction failed: {e}")
368
  return 0.0, 0.0
369
 
370
def extract_energy_features(self, audio: np.ndarray) -> Tuple[float, float]:
    """
    V6-V7: Energy Mean and Std.

    Returns (mean RMS energy, coefficient of variation of RMS energy).
    The std is normalized by the mean so the feature does not depend on
    absolute recording level; both values fall back to 0.0 on failure.
    """
    try:
        rms = librosa.feature.rms(y=audio)[0]
        e_mean = float(np.mean(rms))
        e_std = float(np.std(rms))
        # Normalize std by mean (coefficient of variation); guard the
        # division for silent clips where mean energy is zero.
        e_std = e_std / e_mean if e_mean > 0 else 0.0
        return e_mean, e_std
    except Exception:
        # Was a bare `except:` — narrowed so SystemExit/KeyboardInterrupt
        # are not swallowed; extraction errors still degrade gracefully.
        return 0.0, 0.0
383
 
384
+ def extract_pause_features(self, audio: np.ndarray) -> Tuple[float, float, int]:
385
  """
386
  V8-V10: Pause Ratio, Average Pause Duration, Mid-Pause Count
387
  Uses Silero VAD
388
  """
389
+ if self.vad_model is None or len(audio) < 512:
390
+ return 0.0, 0.0, 0
391
+
392
+ try:
393
+ audio_vad, speech_dict = self._prepare_vad_audio(audio)
394
+
395
+ # Calculate speech duration
396
+ speech_samples = sum(seg['end'] - seg['start'] for seg in speech_dict)
397
+ total_samples = len(audio_vad)
398
+
399
+ if total_samples == 0:
400
+ return 0.0, 0.0, 0
 
 
 
 
 
 
 
 
 
 
401
 
402
  # Pause Ratio
403
  pause_samples = total_samples - speech_samples
 
409
  for i in range(len(speech_dict) - 1):
410
  gap = speech_dict[i+1]['start'] - speech_dict[i]['end']
411
  if gap > 0:
412
+ gaps.append(gap / self.vad_sample_rate) # Convert to seconds
413
 
414
  avg_pause_dur = float(np.mean(gaps)) if gaps else 0.0
415
 
 
430
 
431
  features = {}
432
 
433
+ # V1: SNR (speech-only signal vs pause-only noise)
434
+ features['v1_snr'] = self.extract_snr(audio)
435
 
436
  # V2: Noise classification (IMPROVED)
437
  noise_class = self.classify_noise_type(audio)
 
483
  audio = audio.astype(np.float32)
484
 
485
  features = {}
486
+ features['v1_snr'] = self.extract_snr(audio)
487
  features['v3_speech_rate'] = self.extract_speech_rate(audio, transcript)
488
 
489
  e_mean, e_std = self.extract_energy_features(audio)