EurekaPotato committed on
Commit
f4320c5
·
verified ·
1 Parent(s): 9d8ae5e

Upload folder using huggingface_hub

Browse files
Files changed (2)
  1. audio_features.py +1 -1
  2. emotion_features.py +189 -114
audio_features.py CHANGED
@@ -16,7 +16,7 @@ from typing import Dict, Tuple, List
16
  import noisereduce as nr
17
  import torch
18
  import warnings
19
- from emotion_features import EmotionFeatureExtractor
20
 
21
  warnings.filterwarnings("ignore")
22
 
 
16
  import noisereduce as nr
17
  import torch
18
  import warnings
19
+ from .emotion_features import EmotionFeatureExtractor
20
 
21
  warnings.filterwarnings("ignore")
22
 
emotion_features.py CHANGED
@@ -27,11 +27,20 @@ except ImportError:
27
  print("[WARN] TensorFlow not available. Install with: pip install tensorflow")
28
 
29
 
30
- class EmotionFeatureExtractor:
31
- """Extract emotion features using NeuroByte pre-trained models"""
32
-
33
- # Emotion labels from the models
34
- EMOTIONS = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
 
 
 
 
 
 
 
 
 
35
 
36
  def __init__(self, models_dir: str = None, use_ensemble: bool = True):
37
  """
@@ -68,15 +77,16 @@ class EmotionFeatureExtractor:
68
  # Load models
69
  print(f"Loading NeuroByte emotion models from {models_dir}...")
70
 
71
- for model_name, filename in model_files.items():
72
- model_path = os.path.join(models_dir, filename)
73
 
74
  if os.path.exists(model_path):
75
- try:
76
- self.models[model_name] = keras.models.load_model(model_path)
77
- print(f"[OK] Loaded {model_name} model")
78
- except Exception as e:
79
- print(f"[WARN] Failed to load {model_name}: {e}")
 
80
  else:
81
  print(f"[WARN] Model not found: {model_path}")
82
 
@@ -87,95 +97,149 @@ class EmotionFeatureExtractor:
87
  else:
88
  print(f"[OK] {len(self.models)} emotion model(s) loaded successfully")
89
 
90
- # def download_models(self):
91
- # """
92
- # Download method removed. Models are now bundled with the application.
93
- # """
94
- # print("[INFO] Models should be present in the 'models' directory.")
95
-
96
-
97
- def extract_mel_spectrogram(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
98
  """
99
- Extract mel spectrogram for the mel_spec model
100
 
101
- Returns shape: (128, time_steps, 1) for CNN input
 
 
102
  """
103
- # Resample to 16kHz if needed
104
- if sr != 16000:
105
- audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
106
- sr = 16000
107
 
108
- # Extract mel spectrogram
109
- mel_spec = librosa.feature.melspectrogram(
110
- y=audio,
111
- sr=sr,
112
- n_fft=2048,
113
- hop_length=512,
114
- n_mels=128,
115
- fmin=0,
116
- fmax=sr/2
117
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  # Convert to dB
120
  mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
121
 
122
- # Normalize to [0, 1]
123
- mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min() + 1e-8)
124
-
125
- # Add channel dimension and transpose to (time, freq, 1)
126
- mel_spec_norm = mel_spec_norm.T
127
- mel_spec_norm = np.expand_dims(mel_spec_norm, axis=-1)
128
-
129
- # Pad or truncate to fixed length (e.g., 216 frames for ~3 seconds)
130
- target_length = 216
131
- if mel_spec_norm.shape[0] < target_length:
132
- # Pad with zeros
133
- pad_width = target_length - mel_spec_norm.shape[0]
134
- mel_spec_norm = np.pad(mel_spec_norm, ((0, pad_width), (0, 0), (0, 0)), mode='constant')
135
- else:
136
- # Truncate
137
- mel_spec_norm = mel_spec_norm[:target_length, :, :]
138
-
139
- return mel_spec_norm
140
 
141
- def extract_mfcc(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
142
- """
143
- Extract MFCC features for the mfcc model
144
-
145
- Returns shape: (40, time_steps, 1) for CNN input
146
- """
147
- # Resample to 16kHz if needed
148
- if sr != 16000:
149
- audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
150
- sr = 16000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
- # Extract MFCCs
153
- mfccs = librosa.feature.mfcc(
154
- y=audio,
155
- sr=sr,
156
- n_mfcc=40,
157
- n_fft=2048,
158
- hop_length=512
159
- )
160
-
161
- # Normalize
162
- mfccs = (mfccs - mfccs.mean()) / (mfccs.std() + 1e-8)
163
-
164
- # Transpose and add channel dimension
165
- mfccs = mfccs.T
166
- mfccs = np.expand_dims(mfccs, axis=-1)
167
-
168
- # Pad or truncate to fixed length
169
- target_length = 216
170
- if mfccs.shape[0] < target_length:
171
- pad_width = target_length - mfccs.shape[0]
172
- mfccs = np.pad(mfccs, ((0, pad_width), (0, 0), (0, 0)), mode='constant')
173
- else:
174
- mfccs = mfccs[:target_length, :, :]
175
-
176
- return mfccs
177
 
178
- def predict_emotions(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]:
179
  """
180
  Predict emotion probabilities using loaded models
181
 
@@ -188,29 +252,40 @@ class EmotionFeatureExtractor:
188
  try:
189
  predictions = []
190
 
191
- # CRNN model (if available)
192
- if 'crnn' in self.models:
193
- mel_spec = self.extract_mel_spectrogram(audio, sr)
194
- mel_spec_batch = np.expand_dims(mel_spec, axis=0)
195
-
196
- pred_crnn = self.models['crnn'].predict(mel_spec_batch, verbose=0)[0]
197
- predictions.append(pred_crnn)
198
-
199
- # Mel Spectrogram model (if available)
200
- if 'mel_spec' in self.models and self.use_ensemble:
201
- mel_spec = self.extract_mel_spectrogram(audio, sr)
202
- mel_spec_batch = np.expand_dims(mel_spec, axis=0)
203
-
204
- pred_mel = self.models['mel_spec'].predict(mel_spec_batch, verbose=0)[0]
205
- predictions.append(pred_mel)
206
-
207
- # MFCC model (if available)
208
- if 'mfcc' in self.models and self.use_ensemble:
209
- mfcc = self.extract_mfcc(audio, sr)
210
- mfcc_batch = np.expand_dims(mfcc, axis=0)
211
-
212
- pred_mfcc = self.models['mfcc'].predict(mfcc_batch, verbose=0)[0]
213
- predictions.append(pred_mfcc)
 
 
 
 
 
 
 
 
 
 
 
214
 
215
  # Average predictions if ensemble
216
  if len(predictions) > 1:
@@ -366,4 +441,4 @@ if __name__ == "__main__":
366
  if extractor.use_tensorflow and len(extractor.models) > 0:
367
  print(f"\nUsing {len(extractor.models)} NeuroByte model(s)")
368
  else:
369
- print("\nUsing acoustic features fallback")
 
27
  print("[WARN] TensorFlow not available. Install with: pip install tensorflow")
28
 
29
 
30
+ class EmotionFeatureExtractor:
31
+ """Extract emotion features using NeuroByte pre-trained models"""
32
+
33
+ # Emotion labels from the models
34
+ EMOTIONS = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
35
+
36
+ # Preprocessing parameters used during model training
37
+ MODEL_SAMPLE_RATE = 44100
38
+ MODEL_CLIP_DURATION = 4.0 # seconds
39
+ MODEL_N_FFT = 2048
40
+ MODEL_HOP_LENGTH = 512
41
+ MODEL_N_MELS = 128
42
+ MODEL_N_MFCC = 40
43
+ MODEL_TIME_FRAMES = 345
44
 
45
  def __init__(self, models_dir: str = None, use_ensemble: bool = True):
46
  """
 
77
  # Load models
78
  print(f"Loading NeuroByte emotion models from {models_dir}...")
79
 
80
+ for model_name, filename in model_files.items():
81
+ model_path = os.path.join(models_dir, filename)
82
 
83
  if os.path.exists(model_path):
84
+ try:
85
+ model = keras.models.load_model(model_path)
86
+ self.models[model_name] = model
87
+ print(f"[OK] Loaded {model_name} model")
88
+ except Exception as e:
89
+ print(f"[WARN] Failed to load {model_name}: {e}")
90
  else:
91
  print(f"[WARN] Model not found: {model_path}")
92
 
 
97
  else:
98
  print(f"[OK] {len(self.models)} emotion model(s) loaded successfully")
99
 
100
+ def download_models(self):
 
 
 
 
 
 
 
101
  """
102
+ Download NeuroByte models from Hugging Face
103
 
104
+ Run this once to download the models:
105
+ >>> extractor = EmotionFeatureExtractor()
106
+ >>> extractor.download_models()
107
  """
108
+ if not TENSORFLOW_AVAILABLE:
109
+ print("[WARN] TensorFlow required to download models")
110
+ return
 
111
 
112
+ try:
113
+ from huggingface_hub import hf_hub_download
114
+
115
+ os.makedirs(self.models_dir, exist_ok=True)
116
+
117
+ repo_id = "neurobyte-org/speech-emotion-recognition"
118
+ model_files = [
119
+ 'emotion_recognition_crnn.keras',
120
+ 'emotion_recognition_mel_spec.keras',
121
+ 'emotion_recognition_mfcc.keras'
122
+ ]
123
+
124
+ print(f"Downloading models from {repo_id}...")
125
+ for filename in model_files:
126
+ try:
127
+ print(f" Downloading {filename}...")
128
+ downloaded_path = hf_hub_download(
129
+ repo_id=repo_id,
130
+ filename=filename,
131
+ cache_dir=self.models_dir
132
+ )
133
+
134
+ # Copy to expected location
135
+ target_path = os.path.join(self.models_dir, filename)
136
+ if downloaded_path != target_path:
137
+ import shutil
138
+ shutil.copy(downloaded_path, target_path)
139
+
140
+ print(f" [OK] {filename} downloaded")
141
+ except Exception as e:
142
+ print(f" [WARN] Failed to download {filename}: {e}")
143
+
144
+ print("[OK] Download complete! Reinitialize the extractor to load models.")
145
+
146
+ except ImportError:
147
+ print("[WARN] huggingface_hub not installed. Install with: pip install huggingface_hub")
148
+
149
+ def extract_mel_spectrogram(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
150
+ """
151
+ Extract mel spectrogram for the mel_spec model
152
+
153
+ Returns shape: (128, 345, 1) for CNN input
154
+ """
155
+ # Resample to training sample rate if needed
156
+ if sr != self.MODEL_SAMPLE_RATE:
157
+ audio = librosa.resample(audio, orig_sr=sr, target_sr=self.MODEL_SAMPLE_RATE)
158
+ sr = self.MODEL_SAMPLE_RATE
159
+
160
+ # Pad/trim to fixed duration
161
+ target_samples = int(self.MODEL_CLIP_DURATION * sr)
162
+ if len(audio) < target_samples:
163
+ audio = np.pad(audio, (0, target_samples - len(audio)), mode='constant')
164
+ else:
165
+ audio = audio[:target_samples]
166
+
167
+ # Extract mel spectrogram
168
+ mel_spec = librosa.feature.melspectrogram(
169
+ y=audio,
170
+ sr=sr,
171
+ n_fft=self.MODEL_N_FFT,
172
+ hop_length=self.MODEL_HOP_LENGTH,
173
+ n_mels=self.MODEL_N_MELS,
174
+ fmin=0,
175
+ fmax=sr/2
176
+ )
177
 
178
  # Convert to dB
179
  mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
180
 
181
+ # Normalize to [0, 1]
182
+ mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min() + 1e-8)
183
+
184
+ # Add channel dimension (freq, time, 1)
185
+ mel_spec_norm = np.expand_dims(mel_spec_norm, axis=-1)
186
+
187
+ # Pad or truncate to fixed time length
188
+ target_length = self.MODEL_TIME_FRAMES
189
+ if mel_spec_norm.shape[1] < target_length:
190
+ # Pad with zeros
191
+ pad_width = target_length - mel_spec_norm.shape[1]
192
+ mel_spec_norm = np.pad(mel_spec_norm, ((0, 0), (0, pad_width), (0, 0)), mode='constant')
193
+ else:
194
+ # Truncate
195
+ mel_spec_norm = mel_spec_norm[:, :target_length, :]
196
+
197
+ return mel_spec_norm
 
198
 
199
+ def extract_mfcc(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
200
+ """
201
+ Extract MFCC features for the mfcc model
202
+
203
+ Returns shape: (40, 345, 1) for CNN input
204
+ """
205
+ # Resample to training sample rate if needed
206
+ if sr != self.MODEL_SAMPLE_RATE:
207
+ audio = librosa.resample(audio, orig_sr=sr, target_sr=self.MODEL_SAMPLE_RATE)
208
+ sr = self.MODEL_SAMPLE_RATE
209
+
210
+ # Pad/trim to fixed duration
211
+ target_samples = int(self.MODEL_CLIP_DURATION * sr)
212
+ if len(audio) < target_samples:
213
+ audio = np.pad(audio, (0, target_samples - len(audio)), mode='constant')
214
+ else:
215
+ audio = audio[:target_samples]
216
+
217
+ # Extract MFCCs
218
+ mfccs = librosa.feature.mfcc(
219
+ y=audio,
220
+ sr=sr,
221
+ n_mfcc=self.MODEL_N_MFCC,
222
+ n_fft=self.MODEL_N_FFT,
223
+ hop_length=self.MODEL_HOP_LENGTH
224
+ )
225
 
226
+ # Normalize
227
+ mfccs = (mfccs - mfccs.mean()) / (mfccs.std() + 1e-8)
228
+
229
+ # Add channel dimension (coeff, time, 1)
230
+ mfccs = np.expand_dims(mfccs, axis=-1)
231
+
232
+ # Pad or truncate to fixed length
233
+ target_length = self.MODEL_TIME_FRAMES
234
+ if mfccs.shape[1] < target_length:
235
+ pad_width = target_length - mfccs.shape[1]
236
+ mfccs = np.pad(mfccs, ((0, 0), (0, pad_width), (0, 0)), mode='constant')
237
+ else:
238
+ mfccs = mfccs[:, :target_length, :]
239
+
240
+ return mfccs
 
 
 
 
 
 
 
 
 
 
241
 
242
+ def predict_emotions(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]:
243
  """
244
  Predict emotion probabilities using loaded models
245
 
 
252
  try:
253
  predictions = []
254
 
255
+ def _predict_with_shape_guard(model, mel_spec_batch, mfcc_batch):
256
+ expected = model.input_shape
257
+ if expected is None or len(expected) < 4:
258
+ return model.predict(mel_spec_batch, verbose=0)[0]
259
+ freq_bins = expected[1]
260
+ if freq_bins == self.MODEL_N_MELS:
261
+ return model.predict(mel_spec_batch, verbose=0)[0]
262
+ if freq_bins == self.MODEL_N_MFCC:
263
+ return model.predict(mfcc_batch, verbose=0)[0]
264
+ # Fallback: try mel then mfcc
265
+ try:
266
+ return model.predict(mel_spec_batch, verbose=0)[0]
267
+ except Exception:
268
+ return model.predict(mfcc_batch, verbose=0)[0]
269
+
270
+ mel_spec = self.extract_mel_spectrogram(audio, sr)
271
+ mel_spec_batch = np.expand_dims(mel_spec, axis=0)
272
+ mfcc = self.extract_mfcc(audio, sr)
273
+ mfcc_batch = np.expand_dims(mfcc, axis=0)
274
+
275
+ # CRNN model (if available)
276
+ if 'crnn' in self.models:
277
+ pred_crnn = _predict_with_shape_guard(self.models['crnn'], mel_spec_batch, mfcc_batch)
278
+ predictions.append(pred_crnn)
279
+
280
+ # Mel Spectrogram model (if available)
281
+ if 'mel_spec' in self.models and self.use_ensemble:
282
+ pred_mel = _predict_with_shape_guard(self.models['mel_spec'], mel_spec_batch, mfcc_batch)
283
+ predictions.append(pred_mel)
284
+
285
+ # MFCC model (if available)
286
+ if 'mfcc' in self.models and self.use_ensemble:
287
+ pred_mfcc = _predict_with_shape_guard(self.models['mfcc'], mel_spec_batch, mfcc_batch)
288
+ predictions.append(pred_mfcc)
289
 
290
  # Average predictions if ensemble
291
  if len(predictions) > 1:
 
441
  if extractor.use_tensorflow and len(extractor.models) > 0:
442
  print(f"\nUsing {len(extractor.models)} NeuroByte model(s)")
443
  else:
444
+ print("\nUsing acoustic features fallback")