Spaces:

divAIne
/

busy-module-audio

Sleeping

App Files Community

EurekaPotato committed on Feb 22

Commit

f4320c5

verified ·

1 Parent(s): 9d8ae5e

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

audio_features.py +1 -1
emotion_features.py +189 -114

audio_features.py CHANGED Viewed

@@ -16,7 +16,7 @@ from typing import Dict, Tuple, List
 import noisereduce as nr
 import torch
 import warnings
-from emotion_features import EmotionFeatureExtractor
 warnings.filterwarnings("ignore")

 import noisereduce as nr
 import torch
 import warnings
+from .emotion_features import EmotionFeatureExtractor
 warnings.filterwarnings("ignore")

emotion_features.py CHANGED Viewed

@@ -27,11 +27,20 @@ except ImportError:
     print("[WARN] TensorFlow not available. Install with: pip install tensorflow")
-class EmotionFeatureExtractor:
-    """Extract emotion features using NeuroByte pre-trained models"""
-    # Emotion labels from the models
-    EMOTIONS = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
     def __init__(self, models_dir: str = None, use_ensemble: bool = True):
         """
@@ -68,15 +77,16 @@ class EmotionFeatureExtractor:
         # Load models
         print(f"Loading NeuroByte emotion models from {models_dir}...")
-        for model_name, filename in model_files.items():
-            model_path = os.path.join(models_dir, filename)
             if os.path.exists(model_path):
-                try:
-                    self.models[model_name] = keras.models.load_model(model_path)
-                    print(f"[OK] Loaded {model_name} model")
-                except Exception as e:
-                    print(f"[WARN] Failed to load {model_name}: {e}")
             else:
                 print(f"[WARN] Model not found: {model_path}")
@@ -87,95 +97,149 @@ class EmotionFeatureExtractor:
         else:
             print(f"[OK] {len(self.models)} emotion model(s) loaded successfully")
-    # def download_models(self):
-    #     """
-    #     Download method removed. Models are now bundled with the application.
-    #     """
-    #     print("[INFO] Models should be present in the 'models' directory.")
-    def extract_mel_spectrogram(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
         """
-        Extract mel spectrogram for the mel_spec model
-        Returns shape: (128, time_steps, 1) for CNN input
         """
-        # Resample to 16kHz if needed
-        if sr != 16000:
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
-            sr = 16000
-        # Extract mel spectrogram
-        mel_spec = librosa.feature.melspectrogram(
-            y=audio,
-            sr=sr,
-            n_fft=2048,
-            hop_length=512,
-            n_mels=128,
-            fmin=0,
-            fmax=sr/2
-        )
         # Convert to dB
         mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
-        # Normalize to [0, 1]
-        mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min() + 1e-8)
-        # Add channel dimension and transpose to (time, freq, 1)
-        mel_spec_norm = mel_spec_norm.T
-        mel_spec_norm = np.expand_dims(mel_spec_norm, axis=-1)
-        # Pad or truncate to fixed length (e.g., 216 frames for ~3 seconds)
-        target_length = 216
-        if mel_spec_norm.shape[0] < target_length:
-            # Pad with zeros
-            pad_width = target_length - mel_spec_norm.shape[0]
-            mel_spec_norm = np.pad(mel_spec_norm, ((0, pad_width), (0, 0), (0, 0)), mode='constant')
-        else:
-            # Truncate
-            mel_spec_norm = mel_spec_norm[:target_length, :, :]
-        return mel_spec_norm
-    def extract_mfcc(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
-        """
-        Extract MFCC features for the mfcc model
-        Returns shape: (40, time_steps, 1) for CNN input
-        """
-        # Resample to 16kHz if needed
-        if sr != 16000:
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
-            sr = 16000
-        # Extract MFCCs
-        mfccs = librosa.feature.mfcc(
-            y=audio,
-            sr=sr,
-            n_mfcc=40,
-            n_fft=2048,
-            hop_length=512
-        )
-        # Normalize
-        mfccs = (mfccs - mfccs.mean()) / (mfccs.std() + 1e-8)
-        # Transpose and add channel dimension
-        mfccs = mfccs.T
-        mfccs = np.expand_dims(mfccs, axis=-1)
-        # Pad or truncate to fixed length
-        target_length = 216
-        if mfccs.shape[0] < target_length:
-            pad_width = target_length - mfccs.shape[0]
-            mfccs = np.pad(mfccs, ((0, pad_width), (0, 0), (0, 0)), mode='constant')
-        else:
-            mfccs = mfccs[:target_length, :, :]
-        return mfccs
-    def predict_emotions(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]:
         """
         Predict emotion probabilities using loaded models
@@ -188,29 +252,40 @@ class EmotionFeatureExtractor:
         try:
             predictions = []
-            # CRNN model (if available)
-            if 'crnn' in self.models:
-                mel_spec = self.extract_mel_spectrogram(audio, sr)
-                mel_spec_batch = np.expand_dims(mel_spec, axis=0)
-                pred_crnn = self.models['crnn'].predict(mel_spec_batch, verbose=0)[0]
-                predictions.append(pred_crnn)
-            # Mel Spectrogram model (if available)
-            if 'mel_spec' in self.models and self.use_ensemble:
-                mel_spec = self.extract_mel_spectrogram(audio, sr)
-                mel_spec_batch = np.expand_dims(mel_spec, axis=0)
-                pred_mel = self.models['mel_spec'].predict(mel_spec_batch, verbose=0)[0]
-                predictions.append(pred_mel)
-            # MFCC model (if available)
-            if 'mfcc' in self.models and self.use_ensemble:
-                mfcc = self.extract_mfcc(audio, sr)
-                mfcc_batch = np.expand_dims(mfcc, axis=0)
-                pred_mfcc = self.models['mfcc'].predict(mfcc_batch, verbose=0)[0]
-                predictions.append(pred_mfcc)
             # Average predictions if ensemble
             if len(predictions) > 1:
@@ -366,4 +441,4 @@ if __name__ == "__main__":
     if extractor.use_tensorflow and len(extractor.models) > 0:
         print(f"\nUsing {len(extractor.models)} NeuroByte model(s)")
     else:
-        print("\nUsing acoustic features fallback")

     print("[WARN] TensorFlow not available. Install with: pip install tensorflow")
+class EmotionFeatureExtractor:
+    """Extract emotion features using NeuroByte pre-trained models"""
+    # Emotion labels from the models
+    EMOTIONS = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
+    # Preprocessing parameters used during model training
+    MODEL_SAMPLE_RATE = 44100
+    MODEL_CLIP_DURATION = 4.0  # seconds
+    MODEL_N_FFT = 2048
+    MODEL_HOP_LENGTH = 512
+    MODEL_N_MELS = 128
+    MODEL_N_MFCC = 40
+    MODEL_TIME_FRAMES = 345
     def __init__(self, models_dir: str = None, use_ensemble: bool = True):
         """
         # Load models
         print(f"Loading NeuroByte emotion models from {models_dir}...")
+        for model_name, filename in model_files.items():
+            model_path = os.path.join(models_dir, filename)
             if os.path.exists(model_path):
+                try:
+                    model = keras.models.load_model(model_path)
+                    self.models[model_name] = model
+                    print(f"[OK] Loaded {model_name} model")
+                except Exception as e:
+                    print(f"[WARN] Failed to load {model_name}: {e}")
             else:
                 print(f"[WARN] Model not found: {model_path}")
         else:
             print(f"[OK] {len(self.models)} emotion model(s) loaded successfully")
+    def download_models(self):
         """
+        Download NeuroByte models from Hugging Face
+        Run this once to download the models:
+        >>> extractor = EmotionFeatureExtractor()
+        >>> extractor.download_models()
         """
+        if not TENSORFLOW_AVAILABLE:
+            print("[WARN] TensorFlow required to download models")
+            return
+        try:
+            from huggingface_hub import hf_hub_download
+            os.makedirs(self.models_dir, exist_ok=True)
+            repo_id = "neurobyte-org/speech-emotion-recognition"
+            model_files = [
+                'emotion_recognition_crnn.keras',
+                'emotion_recognition_mel_spec.keras',
+                'emotion_recognition_mfcc.keras'
+            ]
+            print(f"Downloading models from {repo_id}...")
+            for filename in model_files:
+                try:
+                    print(f"  Downloading {filename}...")
+                    downloaded_path = hf_hub_download(
+                        repo_id=repo_id,
+                        filename=filename,
+                        cache_dir=self.models_dir
+                    )
+                    # Copy to expected location
+                    target_path = os.path.join(self.models_dir, filename)
+                    if downloaded_path != target_path:
+                        import shutil
+                        shutil.copy(downloaded_path, target_path)
+                    print(f"  [OK] {filename} downloaded")
+                except Exception as e:
+                    print(f"  [WARN] Failed to download {filename}: {e}")
+            print("[OK] Download complete! Reinitialize the extractor to load models.")
+        except ImportError:
+            print("[WARN] huggingface_hub not installed. Install with: pip install huggingface_hub")
+    def extract_mel_spectrogram(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
+        """
+        Extract mel spectrogram for the mel_spec model
+        Returns shape: (128, 345, 1) for CNN input
+        """
+        # Resample to training sample rate if needed
+        if sr != self.MODEL_SAMPLE_RATE:
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.MODEL_SAMPLE_RATE)
+            sr = self.MODEL_SAMPLE_RATE
+        # Pad/trim to fixed duration
+        target_samples = int(self.MODEL_CLIP_DURATION * sr)
+        if len(audio) < target_samples:
+            audio = np.pad(audio, (0, target_samples - len(audio)), mode='constant')
+        else:
+            audio = audio[:target_samples]
+        # Extract mel spectrogram
+        mel_spec = librosa.feature.melspectrogram(
+            y=audio,
+            sr=sr,
+            n_fft=self.MODEL_N_FFT,
+            hop_length=self.MODEL_HOP_LENGTH,
+            n_mels=self.MODEL_N_MELS,
+            fmin=0,
+            fmax=sr/2
+        )
         # Convert to dB
         mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
+        # Normalize to [0, 1]
+        mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min() + 1e-8)
+        # Add channel dimension (freq, time, 1)
+        mel_spec_norm = np.expand_dims(mel_spec_norm, axis=-1)
+        # Pad or truncate to fixed time length
+        target_length = self.MODEL_TIME_FRAMES
+        if mel_spec_norm.shape[1] < target_length:
+            # Pad with zeros
+            pad_width = target_length - mel_spec_norm.shape[1]
+            mel_spec_norm = np.pad(mel_spec_norm, ((0, 0), (0, pad_width), (0, 0)), mode='constant')
+        else:
+            # Truncate
+            mel_spec_norm = mel_spec_norm[:, :target_length, :]
+        return mel_spec_norm
+    def extract_mfcc(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
+        """
+        Extract MFCC features for the mfcc model
+        Returns shape: (40, 345, 1) for CNN input
+        """
+        # Resample to training sample rate if needed
+        if sr != self.MODEL_SAMPLE_RATE:
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.MODEL_SAMPLE_RATE)
+            sr = self.MODEL_SAMPLE_RATE
+        # Pad/trim to fixed duration
+        target_samples = int(self.MODEL_CLIP_DURATION * sr)
+        if len(audio) < target_samples:
+            audio = np.pad(audio, (0, target_samples - len(audio)), mode='constant')
+        else:
+            audio = audio[:target_samples]
+        # Extract MFCCs
+        mfccs = librosa.feature.mfcc(
+            y=audio,
+            sr=sr,
+            n_mfcc=self.MODEL_N_MFCC,
+            n_fft=self.MODEL_N_FFT,
+            hop_length=self.MODEL_HOP_LENGTH
+        )
+        # Normalize
+        mfccs = (mfccs - mfccs.mean()) / (mfccs.std() + 1e-8)
+        # Add channel dimension (coeff, time, 1)
+        mfccs = np.expand_dims(mfccs, axis=-1)
+        # Pad or truncate to fixed length
+        target_length = self.MODEL_TIME_FRAMES
+        if mfccs.shape[1] < target_length:
+            pad_width = target_length - mfccs.shape[1]
+            mfccs = np.pad(mfccs, ((0, 0), (0, pad_width), (0, 0)), mode='constant')
+        else:
+            mfccs = mfccs[:, :target_length, :]
+        return mfccs
+    def predict_emotions(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]:
         """
         Predict emotion probabilities using loaded models
         try:
             predictions = []
+            def _predict_with_shape_guard(model, mel_spec_batch, mfcc_batch):
+                expected = model.input_shape
+                if expected is None or len(expected) < 4:
+                    return model.predict(mel_spec_batch, verbose=0)[0]
+                freq_bins = expected[1]
+                if freq_bins == self.MODEL_N_MELS:
+                    return model.predict(mel_spec_batch, verbose=0)[0]
+                if freq_bins == self.MODEL_N_MFCC:
+                    return model.predict(mfcc_batch, verbose=0)[0]
+                # Fallback: try mel then mfcc
+                try:
+                    return model.predict(mel_spec_batch, verbose=0)[0]
+                except Exception:
+                    return model.predict(mfcc_batch, verbose=0)[0]
+            mel_spec = self.extract_mel_spectrogram(audio, sr)
+            mel_spec_batch = np.expand_dims(mel_spec, axis=0)
+            mfcc = self.extract_mfcc(audio, sr)
+            mfcc_batch = np.expand_dims(mfcc, axis=0)
+            # CRNN model (if available)
+            if 'crnn' in self.models:
+                pred_crnn = _predict_with_shape_guard(self.models['crnn'], mel_spec_batch, mfcc_batch)
+                predictions.append(pred_crnn)
+            # Mel Spectrogram model (if available)
+            if 'mel_spec' in self.models and self.use_ensemble:
+                pred_mel = _predict_with_shape_guard(self.models['mel_spec'], mel_spec_batch, mfcc_batch)
+                predictions.append(pred_mel)
+            # MFCC model (if available)
+            if 'mfcc' in self.models and self.use_ensemble:
+                pred_mfcc = _predict_with_shape_guard(self.models['mfcc'], mel_spec_batch, mfcc_batch)
+                predictions.append(pred_mfcc)
             # Average predictions if ensemble
             if len(predictions) > 1:
     if extractor.use_tensorflow and len(extractor.models) > 0:
         print(f"\nUsing {len(extractor.models)} NeuroByte model(s)")
     else:
+        print("\nUsing acoustic features fallback")