""" Emotion Feature Extractor - Using NeuroByte Models Extracts emotion features from audio for busy detection. Uses 3 pre-trained Keras models from NeuroByte-Consulting: 1. CRNN (Convolutional Recurrent Neural Network) - Best for sequential patterns 2. Mel Spectrogram CNN - Best for frequency patterns 3. MFCC CNN - Best for speech characteristics Each model outputs 7 emotion classes: angry, disgust, fear, happy, neutral, sad, surprise """ import numpy as np import librosa import warnings from typing import Dict, Optional import os warnings.filterwarnings("ignore") try: import tensorflow as tf from tensorflow import keras TENSORFLOW_AVAILABLE = True except ImportError: TENSORFLOW_AVAILABLE = False print("[WARN] TensorFlow not available. Install with: pip install tensorflow") class EmotionFeatureExtractor: """Extract emotion features using NeuroByte pre-trained models""" # Emotion labels from the models EMOTIONS = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise'] # Preprocessing parameters used during model training MODEL_SAMPLE_RATE = 44100 MODEL_CLIP_DURATION = 4.0 # seconds MODEL_N_FFT = 2048 MODEL_HOP_LENGTH = 512 MODEL_N_MELS = 128 MODEL_N_MFCC = 40 MODEL_TIME_FRAMES = 345 def __init__(self, models_dir: str = None, use_ensemble: bool = True): """ Initialize emotion detector with NeuroByte models Args: models_dir: Directory containing the .keras model files. Defaults to 'models' relative to this file. use_ensemble: If True, average predictions from all 3 models (more accurate) If False, use only CRNN model (faster) """ if models_dir is None: # Default to 'models' folder in same directory as this script models_dir = os.path.join(os.path.dirname(__file__), 'models') self.models_dir = models_dir self.use_ensemble = use_ensemble self.models = {} if not TENSORFLOW_AVAILABLE: print("[WARN] TensorFlow not installed. Falling back to acoustic features.") self.use_tensorflow = False return self.use_tensorflow = True # Model file paths model_files = { 'crnn': 'emotion_recognition_crnn.keras', 'mel_spec': 'emotion_recognition_mel_spec.keras', 'mfcc': 'emotion_recognition_mfcc.keras' } # Load models print(f"Loading NeuroByte emotion models from {models_dir}...") for model_name, filename in model_files.items(): model_path = os.path.join(models_dir, filename) if os.path.exists(model_path): try: model = keras.models.load_model(model_path) self.models[model_name] = model print(f"[OK] Loaded {model_name} model") except Exception as e: print(f"[WARN] Failed to load {model_name}: {e}") else: print(f"[WARN] Model not found: {model_path}") # If no models loaded, fall back to acoustics if len(self.models) == 0: print("[WARN] No models loaded. Using acoustic features fallback.") self.use_tensorflow = False else: print(f"[OK] {len(self.models)} emotion model(s) loaded successfully") def download_models(self): """ Download NeuroByte models from Hugging Face Run this once to download the models: >>> extractor = EmotionFeatureExtractor() >>> extractor.download_models() """ if not TENSORFLOW_AVAILABLE: print("[WARN] TensorFlow required to download models") return try: from huggingface_hub import hf_hub_download os.makedirs(self.models_dir, exist_ok=True) repo_id = "neurobyte-org/speech-emotion-recognition" model_files = [ 'emotion_recognition_crnn.keras', 'emotion_recognition_mel_spec.keras', 'emotion_recognition_mfcc.keras' ] print(f"Downloading models from {repo_id}...") for filename in model_files: try: print(f" Downloading {filename}...") downloaded_path = hf_hub_download( repo_id=repo_id, filename=filename, cache_dir=self.models_dir ) # Copy to expected location target_path = os.path.join(self.models_dir, filename) if downloaded_path != target_path: import shutil shutil.copy(downloaded_path, target_path) print(f" [OK] {filename} downloaded") except Exception as e: print(f" [WARN] Failed to download {filename}: {e}") print("[OK] Download complete! Reinitialize the extractor to load models.") except ImportError: print("[WARN] huggingface_hub not installed. Install with: pip install huggingface_hub") def extract_mel_spectrogram(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray: """ Extract mel spectrogram for the mel_spec model Returns shape: (128, 345, 1) for CNN input """ # Resample to training sample rate if needed if sr != self.MODEL_SAMPLE_RATE: audio = librosa.resample(audio, orig_sr=sr, target_sr=self.MODEL_SAMPLE_RATE) sr = self.MODEL_SAMPLE_RATE # Pad/trim to fixed duration target_samples = int(self.MODEL_CLIP_DURATION * sr) if len(audio) < target_samples: audio = np.pad(audio, (0, target_samples - len(audio)), mode='constant') else: audio = audio[:target_samples] # Extract mel spectrogram mel_spec = librosa.feature.melspectrogram( y=audio, sr=sr, n_fft=self.MODEL_N_FFT, hop_length=self.MODEL_HOP_LENGTH, n_mels=self.MODEL_N_MELS, fmin=0, fmax=sr/2 ) # Convert to dB mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max) # Normalize to [0, 1] mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min() + 1e-8) # Add channel dimension (freq, time, 1) mel_spec_norm = np.expand_dims(mel_spec_norm, axis=-1) # Pad or truncate to fixed time length target_length = self.MODEL_TIME_FRAMES if mel_spec_norm.shape[1] < target_length: # Pad with zeros pad_width = target_length - mel_spec_norm.shape[1] mel_spec_norm = np.pad(mel_spec_norm, ((0, 0), (0, pad_width), (0, 0)), mode='constant') else: # Truncate mel_spec_norm = mel_spec_norm[:, :target_length, :] return mel_spec_norm def extract_mfcc(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray: """ Extract MFCC features for the mfcc model Returns shape: (40, 345, 1) for CNN input """ # Resample to training sample rate if needed if sr != self.MODEL_SAMPLE_RATE: audio = librosa.resample(audio, orig_sr=sr, target_sr=self.MODEL_SAMPLE_RATE) sr = self.MODEL_SAMPLE_RATE # Pad/trim to fixed duration target_samples = int(self.MODEL_CLIP_DURATION * sr) if len(audio) < target_samples: audio = np.pad(audio, (0, target_samples - len(audio)), mode='constant') else: audio = audio[:target_samples] # Extract MFCCs mfccs = librosa.feature.mfcc( y=audio, sr=sr, n_mfcc=self.MODEL_N_MFCC, n_fft=self.MODEL_N_FFT, hop_length=self.MODEL_HOP_LENGTH ) # Normalize mfccs = (mfccs - mfccs.mean()) / (mfccs.std() + 1e-8) # Add channel dimension (coeff, time, 1) mfccs = np.expand_dims(mfccs, axis=-1) # Pad or truncate to fixed length target_length = self.MODEL_TIME_FRAMES if mfccs.shape[1] < target_length: pad_width = target_length - mfccs.shape[1] mfccs = np.pad(mfccs, ((0, 0), (0, pad_width), (0, 0)), mode='constant') else: mfccs = mfccs[:, :target_length, :] return mfccs def predict_emotions(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]: """ Predict emotion probabilities using loaded models Returns: Dictionary with emotion labels as keys and probabilities as values """ if not self.use_tensorflow or len(self.models) == 0: return self.extract_from_acoustics(audio, sr) try: predictions = [] def _predict_with_shape_guard(model, mel_spec_batch, mfcc_batch): expected = model.input_shape if expected is None or len(expected) < 4: return model.predict(mel_spec_batch, verbose=0)[0] freq_bins = expected[1] if freq_bins == self.MODEL_N_MELS: return model.predict(mel_spec_batch, verbose=0)[0] if freq_bins == self.MODEL_N_MFCC: return model.predict(mfcc_batch, verbose=0)[0] # Fallback: try mel then mfcc try: return model.predict(mel_spec_batch, verbose=0)[0] except Exception: return model.predict(mfcc_batch, verbose=0)[0] mel_spec = self.extract_mel_spectrogram(audio, sr) mel_spec_batch = np.expand_dims(mel_spec, axis=0) mfcc = self.extract_mfcc(audio, sr) mfcc_batch = np.expand_dims(mfcc, axis=0) # CRNN model (if available) if 'crnn' in self.models: pred_crnn = _predict_with_shape_guard(self.models['crnn'], mel_spec_batch, mfcc_batch) predictions.append(pred_crnn) # Mel Spectrogram model (if available) if 'mel_spec' in self.models and self.use_ensemble: pred_mel = _predict_with_shape_guard(self.models['mel_spec'], mel_spec_batch, mfcc_batch) predictions.append(pred_mel) # MFCC model (if available) if 'mfcc' in self.models and self.use_ensemble: pred_mfcc = _predict_with_shape_guard(self.models['mfcc'], mel_spec_batch, mfcc_batch) predictions.append(pred_mfcc) # Average predictions if ensemble if len(predictions) > 1: avg_pred = np.mean(predictions, axis=0) else: avg_pred = predictions[0] # Convert to dictionary emotion_probs = {emotion: float(prob) for emotion, prob in zip(self.EMOTIONS, avg_pred)} return emotion_probs except Exception as e: print(f"⚠ Prediction failed: {e}") return self.extract_from_acoustics(audio, sr) def extract_from_acoustics(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]: """ Fallback: Extract emotion proxies from acoustic features Returns emotion-like scores without deep learning """ try: if len(audio) < 512: return {emotion: 1.0/7 for emotion in self.EMOTIONS} # Uniform distribution # Extract acoustic features rms = librosa.feature.rms(y=audio)[0] mean_energy = np.mean(rms) energy_std = np.std(rms) f0 = librosa.yin(audio, fmin=75, fmax=400, sr=sr) f0_voiced = f0[f0 > 0] pitch_mean = np.mean(f0_voiced) if len(f0_voiced) > 0 else 0 pitch_std = np.std(f0_voiced) if len(f0_voiced) > 0 else 0 zcr = np.mean(librosa.feature.zero_crossing_rate(audio)) centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr)) # Heuristic mapping to emotions scores = { 'angry': (energy_std * 10 + pitch_std / 50) / 2, 'disgust': (pitch_mean / 300) * 0.3, 'fear': (pitch_mean / 250 + zcr * 5) / 2, 'happy': (centroid / 3000 + mean_energy * 5) / 2, 'neutral': 0.3, # Baseline 'sad': (1 - centroid / 4000) * 0.5, 'surprise': (energy_std * 8 + zcr * 3) / 2 } # Normalize to sum to 1 total = sum(scores.values()) scores = {k: v / total for k, v in scores.items()} return scores except Exception as e: print(f"⚠ Acoustic fallback failed: {e}") return {emotion: 1.0/7 for emotion in self.EMOTIONS} def extract_all(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]: """ Extract emotion features for busy detection Returns: v11_emotion_stress: 0-1 (angry + fear + disgust) v12_emotion_energy: 0-1 (happy + surprise + angry) v13_emotion_valence: 0-1 (happy - sad - angry) """ if audio.dtype != np.float32: audio = audio.astype(np.float32) # Get emotion predictions emotion_probs = self.predict_emotions(audio, sr) # Map emotions to features stress = ( emotion_probs.get('angry', 0.0) * 0.5 + emotion_probs.get('fear', 0.0) * 0.3 + emotion_probs.get('disgust', 0.0) * 0.2 ) energy = ( emotion_probs.get('happy', 0.0) * 0.4 + emotion_probs.get('surprise', 0.0) * 0.3 + emotion_probs.get('angry', 0.0) * 0.3 ) valence = ( emotion_probs.get('happy', 0.0) + emotion_probs.get('surprise', 0.0) * 0.5 - emotion_probs.get('sad', 0.0) - emotion_probs.get('angry', 0.0) * 0.5 ) # Normalize valence to [0, 1] valence = (valence + 1.0) / 2.0 return { 'v11_emotion_stress': float(np.clip(stress, 0, 1)), 'v12_emotion_energy': float(np.clip(energy, 0, 1)), 'v13_emotion_valence': float(np.clip(valence, 0, 1)) } # Standalone test if __name__ == "__main__": import time print("Testing NeuroByte Emotion Feature Extractor...") # Initialize extractor extractor = EmotionFeatureExtractor( models_dir="models_cache/emotion_models", use_ensemble=True ) # If models not found, try to download if not extractor.use_tensorflow or len(extractor.models) == 0: print("\nModels not found. Download them with:") print(" extractor.download_models()") print("\nUsing acoustic fallback for now...") # Generate test audio duration = 3 sr = 16000 t = np.linspace(0, duration, sr * duration) # Test 1: Stressed voice (high pitch, varying) print("\n1. Testing with stressed audio:") audio_stressed = np.sin(2 * np.pi * 300 * t) + 0.5 * np.sin(2 * np.pi * 150 * t) audio_stressed += 0.2 * np.random.randn(len(audio_stressed)) start = time.time() features_stressed = extractor.extract_all(audio_stressed, sr) print(f" Time: {(time.time() - start)*1000:.0f}ms") print(" Features:") for k, v in features_stressed.items(): print(f" {k}: {v:.3f}") # Test 2: Calm voice (low pitch, steady) print("\n2. Testing with calm audio:") audio_calm = np.sin(2 * np.pi * 150 * t) * 0.3 start = time.time() features_calm = extractor.extract_all(audio_calm, sr) print(f" Time: {(time.time() - start)*1000:.0f}ms") print(" Features:") for k, v in features_calm.items(): print(f" {k}: {v:.3f}") print("\n✓ Tests complete!") if extractor.use_tensorflow and len(extractor.models) > 0: print(f"\nUsing {len(extractor.models)} NeuroByte model(s)") else: print("\nUsing acoustic features fallback")