Upload folder using huggingface_hub

audio_features.py  CHANGED  (+135 -55)
@@ -23,8 +23,8 @@ except ImportError:
 
 warnings.filterwarnings("ignore")
 
-class AudioFeatureExtractor:
-    """Extract 14 audio features for busy detection (Enhanced with Silero VAD)"""
+class AudioFeatureExtractor:
+    """Extract 14 audio features for busy detection (Enhanced with Silero VAD)"""
 
     _vad_model_cache = None
     _vad_utils_cache = None
@@ -68,10 +68,52 @@ class AudioFeatureExtractor:
             print(f"[WARN] Emotion features disabled: {e}")
             self.emotion_extractor = None
             self.use_emotion = False
-        else:
-            self.emotion_extractor = None
-
-    def load_audio(self, audio_path: str) -> np.ndarray:
+        else:
+            self.emotion_extractor = None
+
+    def _prepare_vad_audio(self, audio: np.ndarray) -> Tuple[np.ndarray, List[Dict]]:
+        """Prepare audio for VAD and return speech timestamps."""
+        if self.vad_model is None or len(audio) < 512:
+            return audio, []
+
+        audio_vad = audio
+        if self.vad_sample_rate != self.sample_rate:
+            try:
+                audio_vad = librosa.resample(audio, orig_sr=self.sample_rate, target_sr=self.vad_sample_rate)
+            except Exception:
+                audio_vad = audio
+
+        wav = torch.tensor(audio_vad, dtype=torch.float32).unsqueeze(0)
+
+        try:
+            speech_dict = self.get_speech_timestamps(wav, self.vad_model, sampling_rate=self.vad_sample_rate)
+        except Exception:
+            speech_dict = []
+
+        return audio_vad, speech_dict
+
+    def _split_speech_pause(self, audio: np.ndarray) -> Tuple[np.ndarray, np.ndarray, int]:
+        """Return speech audio, pause audio, and the sample rate used for VAD."""
+        if self.vad_model is None:
+            return audio, np.array([], dtype=audio.dtype), self.sample_rate
+
+        audio_vad, speech_dict = self._prepare_vad_audio(audio)
+
+        if not speech_dict:
+            return np.array([], dtype=audio_vad.dtype), audio_vad, self.vad_sample_rate
+
+        mask = np.zeros(len(audio_vad), dtype=bool)
+        for seg in speech_dict:
+            start = max(0, int(seg.get('start', 0)))
+            end = min(len(audio_vad), int(seg.get('end', 0)))
+            if end > start:
+                mask[start:end] = True
+
+        speech_audio = audio_vad[mask]
+        pause_audio = audio_vad[~mask]
+        return speech_audio, pause_audio, self.vad_sample_rate
+
+    def load_audio(self, audio_path: str) -> np.ndarray:
         """Load and preprocess audio file"""
         audio, sr = librosa.load(
             audio_path,
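The two new helpers centralize the Silero VAD handling that the old code did inline. A minimal standalone sketch of the same speech/pause split, assuming the snakers4/silero-vad torch.hub model and a 16 kHz mono clip (the placeholder signal is illustrative, not from this repo):

import numpy as np
import torch

# Load Silero VAD from torch.hub; utils[0] is get_speech_timestamps.
model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad')
get_speech_timestamps = utils[0]

sr = 16000
audio = np.random.randn(3 * sr).astype(np.float32) * 0.01  # placeholder signal

# Segments are dicts of sample offsets: [{'start': ..., 'end': ...}, ...]
segments = get_speech_timestamps(torch.from_numpy(audio), model, sampling_rate=sr)

# Boolean mask: True inside speech segments, False inside pauses.
mask = np.zeros(len(audio), dtype=bool)
for seg in segments:
    mask[seg['start']:seg['end']] = True

speech, pause = audio[mask], audio[~mask]
print(len(speech), len(pause))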
@@ -79,9 +121,41 @@
             mono=True,
             duration=self.audio_duration_limit
         )
-        return audio
-
-    def extract_hnr(self, audio: np.ndarray) -> float:
+        return audio
+
+    def extract_snr(self, audio: np.ndarray) -> float:
+        """
+        V1: Signal-to-Noise Ratio (SNR)
+        Signal power is calculated only during speech; noise power only during pauses.
+        """
+        if len(audio) < 2048:
+            return 15.0  # Neutral default
+
+        try:
+            speech_audio, pause_audio, _ = self._split_speech_pause(audio)
+
+            if len(speech_audio) == 0:
+                return 0.0
+
+            signal_power = float(np.mean(speech_audio ** 2))
+            if signal_power <= 0:
+                return 0.0
+
+            if len(pause_audio) > 0:
+                noise_power = float(np.mean(pause_audio ** 2))
+            else:
+                noise_power = 1e-8
+
+            if noise_power <= 0:
+                noise_power = 1e-8
+
+            snr_db = 10.0 * np.log10(signal_power / noise_power)
+            return float(np.clip(snr_db, -10.0, 40.0))
+        except Exception as e:
+            print(f"SNR extraction failed: {e}")
+            return 15.0
+
+    def extract_hnr(self, audio: np.ndarray) -> float:
         """
         V1: Harmonics-to-Noise Ratio (HNR)
         Measures voice quality - higher = clearer voice
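extract_snr reduces to the textbook ratio SNR_dB = 10 * log10(P_signal / P_noise), with speech frames supplying the signal power and pause frames the noise power. A quick numeric sanity check of the formula and the clip range (the power values below are made up):

import numpy as np

signal_power = 1e-2  # mean(speech_audio ** 2), illustrative value
noise_power = 1e-4   # mean(pause_audio ** 2), illustrative value

snr_db = 10.0 * np.log10(signal_power / noise_power)
print(snr_db)                                # 20.0 dB for a 100x power ratio
print(float(np.clip(snr_db, -10.0, 40.0)))  # unchanged, inside [-10, 40]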
@@ -154,7 +228,7 @@
             print(f"HNR extraction failed: {e}")
             return 15.0  # Safe default
 
-    def classify_noise_type(self, audio: np.ndarray) -> Dict[str, float]:
+    def classify_noise_type(self, audio: np.ndarray) -> Dict[str, float]:
         """
         V2: Background Noise Classification (one-hot encoded)
 
@@ -166,17 +240,27 @@
         - High frequency energy (hiss)
         - Spectral contrast (texture)
         """
-        if len(audio) < 512:
-            return {'traffic': 0, 'office': 0, 'crowd': 0, 'wind': 0, 'clean': 1}
-
-        try:
-            # Extract comprehensive spectral features
-            S = np.abs(librosa.stft(audio))
-            if S.shape[1] == 0:
-                return {'traffic': 0, 'office': 0, 'crowd': 0, 'wind': 0, 'clean': 1}
-
-            # Feature 1: Spectral Centroid (brightness)
-            centroid = np.mean(librosa.feature.spectral_centroid(S=S, sr=self.sample_rate))
+        if len(audio) < 512:
+            return {'traffic': 0, 'office': 0, 'crowd': 0, 'wind': 0, 'clean': 1}
+
+        try:
+            # Extract comprehensive spectral features
+            S = np.abs(librosa.stft(audio))
+            if S.shape[1] == 0:
+                return {'traffic': 0, 'office': 0, 'crowd': 0, 'wind': 0, 'clean': 1}
+
+            # Feature 1: Spectral Centroid (brightness) - computed on pauses only
+            pause_audio = None
+            if self.vad_model is not None:
+                _, pause_audio, vad_sr = self._split_speech_pause(audio)
+            else:
+                vad_sr = self.sample_rate
+
+            if pause_audio is not None and len(pause_audio) >= 512:
+                S_pause = np.abs(librosa.stft(pause_audio))
+                centroid = np.mean(librosa.feature.spectral_centroid(S=S_pause, sr=vad_sr))
+            else:
+                centroid = np.mean(librosa.feature.spectral_centroid(S=S, sr=self.sample_rate))
 
             # Feature 2: Spectral Rolloff (energy concentration)
             rolloff = np.mean(librosa.feature.spectral_rolloff(S=S, sr=self.sample_rate))
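Computing the centroid on pause-only audio matters because speech would otherwise dominate the spectrum; the pauses are where the background noise is exposed. A small sketch with synthetic signals (assumed 16 kHz) showing how the centroid separates a low rumble from broadband hiss:

import numpy as np
import librosa

sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
hum = np.sin(2 * np.pi * 120 * t).astype(np.float32)   # low rumble, traffic-like
hiss = np.random.randn(sr).astype(np.float32)          # broadband, wind/hiss-like

for name, noise in (('hum', hum), ('hiss', hiss)):
    S = np.abs(librosa.stft(noise))
    c = float(np.mean(librosa.feature.spectral_centroid(S=S, sr=sr)))
    print(name, round(c, 1))  # roughly 120 Hz for the hum, near sr/4 for the hiss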
@@ -283,41 +367,37 @@
             print(f"Pitch extraction failed: {e}")
             return 0.0, 0.0
 
-    def extract_energy_features(self, audio: np.ndarray) -> Tuple[float, float]:
-        """V6-V7: Energy Mean and Std"""
-        try:
-            rms = librosa.feature.rms(y=audio)[0]
+    def extract_energy_features(self, audio: np.ndarray) -> Tuple[float, float]:
+        """V6-V7: Energy Mean and Std"""
+        try:
+            rms = librosa.feature.rms(y=audio)[0]
+            e_mean = float(np.mean(rms))
+            e_std = float(np.std(rms))
+            if e_mean > 0:
+                e_std = e_std / e_mean
+            else:
+                e_std = 0.0
+            return e_mean, e_std
+        except Exception:
+            return 0.0, 0.0
 
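Dividing e_std by e_mean turns V7 into a coefficient of variation, which makes it insensitive to recording gain: the same clip at half volume gets the same value. A sketch with a synthetic signal (names and values illustrative):

import numpy as np
import librosa

audio = np.random.randn(16000).astype(np.float32) * 0.1
for gain in (1.0, 0.5):
    rms = librosa.feature.rms(y=audio * gain)[0]
    e_mean, e_std = float(np.mean(rms)), float(np.std(rms))
    print(gain, round(e_std / e_mean, 4))  # same ratio at both gains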
-    def extract_pause_features(self, audio: np.ndarray) -> Tuple[float, float, int]:
+    def extract_pause_features(self, audio: np.ndarray) -> Tuple[float, float, int]:
         """
         V8-V10: Pause Ratio, Average Pause Duration, Mid-Pause Count
         Uses Silero VAD
         """
-        if self.vad_model is None or len(audio) < 512:
-            return 0.0, 0.0, 0
-
-        try:
-            speech_dict = self.get_speech_timestamps(wav, self.vad_model, sampling_rate=self.vad_sample_rate)
-
-            # Calculate speech duration
-            speech_samples = sum(seg['end'] - seg['start'] for seg in speech_dict)
-            total_samples = len(audio)
-
-            if total_samples == 0:
-                return 0.0, 0.0, 0
+        if self.vad_model is None or len(audio) < 512:
+            return 0.0, 0.0, 0
+
+        try:
+            audio_vad, speech_dict = self._prepare_vad_audio(audio)
+
+            # Calculate speech duration
+            speech_samples = sum(seg['end'] - seg['start'] for seg in speech_dict)
+            total_samples = len(audio_vad)
+
+            if total_samples == 0:
+                return 0.0, 0.0, 0
 
             # Pause Ratio
             pause_samples = total_samples - speech_samples
@@ -329,7 +409,7 @@
             for i in range(len(speech_dict) - 1):
                 gap = speech_dict[i+1]['start'] - speech_dict[i]['end']
                 if gap > 0:
-                    gaps.append(gap / self.vad_sample_rate)  # Convert to seconds
+                    gaps.append(gap / self.vad_sample_rate)  # Convert to seconds
 
             avg_pause_dur = float(np.mean(gaps)) if gaps else 0.0
 
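With the resampled length as the denominator, the pause ratio and gap durations now live on the same 16 kHz clock as the VAD timestamps. The arithmetic on a hand-made segment list (illustrative numbers, not repo data):

import numpy as np

sr = 16000                 # VAD sample rate; timestamps are sample offsets
total = 10 * sr            # a 10 s clip
segments = [{'start': 0, 'end': 3 * sr},
            {'start': 5 * sr, 'end': 9 * sr}]

speech = sum(s['end'] - s['start'] for s in segments)
pause_ratio = (total - speech) / total
gaps = [(segments[i + 1]['start'] - segments[i]['end']) / sr
        for i in range(len(segments) - 1)]

print(pause_ratio)                            # 0.3
print(float(np.mean(gaps)) if gaps else 0.0)  # 2.0 s between the two segments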
@@ -350,8 +430,8 @@
 
         features = {}
 
-        # V1: SNR
-        features['v1_snr'] = self.extract_hnr(audio)
+        # V1: SNR (speech-only signal vs pause-only noise)
+        features['v1_snr'] = self.extract_snr(audio)
 
         # V2: Noise classification (IMPROVED)
         noise_class = self.classify_noise_type(audio)
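The classify_noise_type dict is already one-hot, so folding it into the flat feature vector is a dict comprehension away; the v2_noise_* key names below are illustrative, since the exact keys live in an unchanged part of the file:

noise_class = {'traffic': 0, 'office': 0, 'crowd': 0, 'wind': 0, 'clean': 1}

features = {}
features.update({f'v2_noise_{k}': v for k, v in noise_class.items()})
print(features)  # {'v2_noise_traffic': 0, ..., 'v2_noise_clean': 1}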
@@ -403,7 +483,7 @@
         audio = audio.astype(np.float32)
 
         features = {}
-        features['v1_snr'] = self.extract_hnr(audio)
+        features['v1_snr'] = self.extract_snr(audio)
         features['v3_speech_rate'] = self.extract_speech_rate(audio, transcript)
 
         e_mean, e_std = self.extract_energy_features(audio)
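End to end, the entry points touched by these hunks are exercised roughly like this; a sketch assuming the constructor needs no required arguments and with 'call.wav' standing in for a real file:

extractor = AudioFeatureExtractor()             # assumes no required ctor args
audio = extractor.load_audio('call.wav')        # 'call.wav' is a placeholder

print(extractor.extract_snr(audio))             # V1
print(extractor.extract_pause_features(audio))  # V8-V10 tuple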