Spaces:

divAIne
/

busy-module-audio

Running

App Files Files Community

EurekaPotato committed on Feb 19

Commit

cedabd5

verified ·

1 Parent(s): 2110638

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

audio_features.py +421 -0
emotion_features.py +411 -0
handler.py +57 -266
requirements.txt +5 -3

audio_features.py ADDED Viewed

	@@ -0,0 +1,421 @@

+"""
+Audio Feature Extractor - IMPROVED VERSION
+Extracts 14 voice features from audio to detect busy/distracted states.
+KEY IMPROVEMENTS:
+1. HNR instead of SNR - Better for voice recordings (not affected by recording noise)
+2. Smarter noise classification using multiple spectral features
+3. Removed useless latency feature (t9_latency) from consideration
+"""
+import numpy as np
+import librosa
+import soundfile as sf
+from scipy import signal
+from typing import Dict, Tuple, List
+import noisereduce as nr
+import torch
+import warnings
+from .emotion_features import EmotionFeatureExtractor
+warnings.filterwarnings("ignore")
+class AudioFeatureExtractor:
+    """Extract 14 audio features for busy detection (Enhanced with Silero VAD)"""
+    _vad_model_cache = None
+    _vad_utils_cache = None
+    _emotion_extractor_cache = None
+    def __init__(self, sample_rate: int = 16000, use_emotion: bool = True, config: Dict = None, emotion_models_dir: str = None):
+        self.config = config or {}
+        self.sample_rate = self.config.get('audio_sample_rate', sample_rate)
+        self.vad_sample_rate = self.config.get('vad_sample_rate', self.sample_rate)
+        self.use_emotion = use_emotion and (not self.config.get('skip_emotion_features', False))
+        self.skip_noise_reduction = bool(self.config.get('skip_noise_reduction', False))
+        self.audio_duration_limit = self.config.get('audio_duration_limit', None)
+        self.emotion_models_dir = emotion_models_dir
+        print("Loading Silero VAD...")
+        try:
+            if AudioFeatureExtractor._vad_model_cache is None:
+                AudioFeatureExtractor._vad_model_cache, AudioFeatureExtractor._vad_utils_cache = torch.hub.load(
+                    repo_or_dir='snakers4/silero-vad',
+                    model='silero_vad',
+                    force_reload=False,
+                    trust_repo=True
+                )
+            self.vad_model = AudioFeatureExtractor._vad_model_cache
+            utils = AudioFeatureExtractor._vad_utils_cache
+            self.get_speech_timestamps = utils[0]
+            print("[OK] Silero VAD loaded (cached)")
+        except Exception as e:
+            print(f"[WARN] Failed to load Silero VAD: {e}. Fallback to energy VAD might be needed.")
+            self.vad_model = None
+        if self.use_emotion:
+            print("Loading Emotion CNN...")
+            try:
+                if AudioFeatureExtractor._emotion_extractor_cache is None:
+                    # Pass models dir to extractor
+                    AudioFeatureExtractor._emotion_extractor_cache = EmotionFeatureExtractor(models_dir=self.emotion_models_dir)
+                self.emotion_extractor = AudioFeatureExtractor._emotion_extractor_cache
+                print("[OK] Emotion CNN loaded (cached)")
+            except Exception as e:
+                print(f"[WARN] Emotion features disabled: {e}")
+                self.emotion_extractor = None
+                self.use_emotion = False
+        else:
+            self.emotion_extractor = None
+    def load_audio(self, audio_path: str) -> np.ndarray:
+        """Load and preprocess audio file"""
+        audio, sr = librosa.load(
+            audio_path,
+            sr=self.sample_rate,
+            mono=True,
+            duration=self.audio_duration_limit
+        )
+        return audio
+    def extract_hnr(self, audio: np.ndarray) -> float:
+        """
+        V1: Harmonics-to-Noise Ratio (HNR)
+        Measures voice quality - higher = clearer voice
+        IMPROVEMENT: HNR is better than SNR for voice because:
+        - Not affected by recording equipment noise
+        - Focuses on harmonic structure of speech
+        - More robust to environmental noise
+        Range: 0-30 dB (typical: 10-20 dB for clear speech)
+        """
+        if len(audio) == 0 or len(audio) < 2048:
+            return 15.0  # Neutral default
+        try:
+            # Method 1: Autocorrelation-based HNR (most accurate)
+            frame_length = 2048
+            hop_length = 512
+            hnr_values = []
+            for i in range(0, len(audio) - frame_length, hop_length):
+                frame = audio[i:i+frame_length]
+                # Only process frames with enough energy
+                energy = np.sum(frame ** 2)
+                if energy < 0.001:
+                    continue
+                # Autocorrelation
+                autocorr = np.correlate(frame, frame, mode='full')
+                autocorr = autocorr[len(autocorr)//2:]
+                # Normalize
+                if autocorr[0] > 0:
+                    autocorr = autocorr / autocorr[0]
+                else:
+                    continue
+                # Find fundamental frequency peak (skip first 20 samples = ~1250 Hz max)
+                min_lag = int(self.sample_rate / 400)  # Max 400 Hz
+                max_lag = int(self.sample_rate / 75)   # Min 75 Hz
+                if max_lag >= len(autocorr):
+                    continue
+                peak_idx = np.argmax(autocorr[min_lag:max_lag]) + min_lag
+                if peak_idx > 0 and autocorr[peak_idx] > 0.3:  # Minimum correlation threshold
+                    # HNR calculation
+                    periodic_power = autocorr[peak_idx]
+                    aperiodic_power = 1 - periodic_power
+                    if aperiodic_power > 0:
+                        hnr_db = 10 * np.log10(periodic_power / aperiodic_power)
+                        # Clip to realistic range
+                        hnr_db = np.clip(hnr_db, 0, 30)
+                        hnr_values.append(hnr_db)
+            if len(hnr_values) > 0:
+                # Return median (more robust than mean)
+                return float(np.median(hnr_values))
+            # Method 2: Fallback using spectral flatness
+            flatness = np.mean(librosa.feature.spectral_flatness(y=audio))
+            # Convert to HNR-like scale (inverted)
+            hnr_proxy = (1 - np.clip(flatness, 0, 1)) * 25
+            return float(hnr_proxy)
+        except Exception as e:
+            print(f"HNR extraction failed: {e}")
+            return 15.0  # Safe default
+    def classify_noise_type(self, audio: np.ndarray) -> Dict[str, float]:
+        """
+        V2: Background Noise Classification (one-hot encoded)
+        IMPROVEMENT: Uses multiple spectral features for better accuracy:
+        - Spectral centroid (frequency brightness)
+        - Spectral rolloff (energy distribution)
+        - Zero crossing rate (noisiness)
+        - Low frequency energy (rumble)
+        - High frequency energy (hiss)
+        - Spectral contrast (texture)
+        """
+        if len(audio) < 512:
+            return {'traffic': 0, 'office': 0, 'crowd': 0, 'wind': 0, 'clean': 1}
+        try:
+            # Extract comprehensive spectral features
+            S = np.abs(librosa.stft(audio))
+            if S.shape[1] == 0:
+                return {'traffic': 0, 'office': 0, 'crowd': 0, 'wind': 0, 'clean': 1}
+            # Feature 1: Spectral Centroid (brightness)
+            centroid = np.mean(librosa.feature.spectral_centroid(S=S, sr=self.sample_rate))
+            # Feature 2: Spectral Rolloff (energy concentration)
+            rolloff = np.mean(librosa.feature.spectral_rolloff(S=S, sr=self.sample_rate))
+            # Feature 3: Zero Crossing Rate
+            zcr = np.mean(librosa.feature.zero_crossing_rate(audio))
+            # Feature 4: Low frequency energy (0-500 Hz)
+            freqs = librosa.fft_frequencies(sr=self.sample_rate, n_fft=2048)
+            low_freq_mask = freqs < 500
+            low_energy = np.mean(S[low_freq_mask, :]) if np.any(low_freq_mask) else 0
+            # Feature 5: High frequency energy (4000+ Hz)
+            high_freq_mask = freqs > 4000
+            high_energy = np.mean(S[high_freq_mask, :]) if np.any(high_freq_mask) else 0
+            # Feature 6: Overall energy
+            total_energy = np.mean(audio ** 2)
+            # Feature 7: Spectral contrast (texture measure)
+            contrast = np.mean(librosa.feature.spectral_contrast(S=S, sr=self.sample_rate))
+            # Score each noise type based on features
+            scores = {
+                'traffic': 0.0,
+                'office': 0.0,
+                'crowd': 0.0,
+                'wind': 0.0,
+                'clean': 0.0
+            }
+            # Traffic: Low frequency dominant + rumble + consistent
+            if low_energy > 0.002 and centroid < 2000 and contrast < 20:
+                scores['traffic'] = low_energy * 100 + (2500 - centroid) / 1000
+            # Office: Mid frequencies + keyboard clicks + air conditioning hum
+            if 1500 < centroid < 3500 and 0.0005 < total_energy < 0.005:
+                scores['office'] = (3500 - abs(centroid - 2500)) / 1000 + contrast / 30
+            # Crowd: High ZCR + varying spectrum + speech-like energy
+            if zcr > 0.08 and total_energy > 0.003 and contrast > 15:
+                scores['crowd'] = zcr * 10 + total_energy * 50
+            # Wind: Very high ZCR + high frequency energy + low contrast
+            if zcr > 0.12 and high_energy > 0.001 and contrast < 15:
+                scores['wind'] = zcr * 8 + high_energy * 100
+            # Clean: Low energy + low ZCR + high contrast (speech only)
+            if total_energy < 0.005 and zcr < 0.08 and contrast > 20:
+                scores['clean'] = (0.005 - total_energy) * 200 + contrast / 30
+            # If all scores are low, default to clean
+            if max(scores.values()) < 0.1:
+                scores['clean'] = 1.0
+            # Normalize to probabilities
+            total = sum(scores.values())
+            if total > 0:
+                scores = {k: v/total for k, v in scores.items()}
+            else:
+                scores['clean'] = 1.0
+            return scores
+        except Exception as e:
+            print(f"Noise classification failed: {e}")
+            return {'traffic': 0, 'office': 0, 'crowd': 0, 'wind': 0, 'clean': 1}
+    def extract_speech_rate(self, audio: np.ndarray, transcript: str) -> float:
+        """V3: Speech Rate (words per second)"""
+        if not transcript:
+            return 0.0
+        word_count = len(transcript.split())
+        duration = len(audio) / self.sample_rate
+        if duration == 0:
+            return 0.0
+        return word_count / duration
+    def extract_pitch_features(self, audio: np.ndarray) -> Tuple[float, float]:
+        """V4-V5: Pitch Mean and Std"""
+        try:
+            if len(audio) < 2048:
+                return 0.0, 0.0
+            # Use pyin (more robust than yin)
+            f0, voiced_flag, voiced_probs = librosa.pyin(
+                audio,
+                fmin=librosa.note_to_hz('C2'),
+                fmax=librosa.note_to_hz('C7'),
+                sr=self.sample_rate
+            )
+            # Only use voiced frames
+            f0_voiced = f0[voiced_flag]
+            if len(f0_voiced) == 0:
+                return 0.0, 0.0
+            return float(np.mean(f0_voiced)), float(np.std(f0_voiced))
+        except Exception as e:
+            print(f"Pitch extraction failed: {e}")
+            return 0.0, 0.0
+    def extract_energy_features(self, audio: np.ndarray) -> Tuple[float, float]:
+        """V6-V7: Energy Mean and Std"""
+        try:
+            rms = librosa.feature.rms(y=audio)[0]
+            return float(np.mean(rms)), float(np.std(rms))
+        except:
+            return 0.0, 0.0
+    def extract_pause_features(self, audio: np.ndarray) -> Tuple[float, float, int]:
+        """
+        V8-V10: Pause Ratio, Average Pause Duration, Mid-Pause Count
+        Uses Silero VAD
+        """
+        if self.vad_model is None or len(audio) < 512:
+            return 0.0, 0.0, 0
+        # Resample for VAD if configured
+        if self.vad_sample_rate != self.sample_rate:
+            try:
+                audio = librosa.resample(audio, orig_sr=self.sample_rate, target_sr=self.vad_sample_rate)
+            except Exception:
+                pass
+        # Silero expects Tensor
+        wav = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
+        try:
+            speech_dict = self.get_speech_timestamps(wav, self.vad_model, sampling_rate=self.vad_sample_rate)
+            # Calculate speech duration
+            speech_samples = sum(seg['end'] - seg['start'] for seg in speech_dict)
+            total_samples = len(audio)
+            if total_samples == 0:
+                return 0.0, 0.0, 0
+            # Pause Ratio
+            pause_samples = total_samples - speech_samples
+            pause_ratio = pause_samples / total_samples
+            # Calculate gaps between speech segments
+            gaps = []
+            if len(speech_dict) > 1:
+                for i in range(len(speech_dict) - 1):
+                    gap = speech_dict[i+1]['start'] - speech_dict[i]['end']
+                    if gap > 0:
+                        gaps.append(gap / self.vad_sample_rate)  # Convert to seconds
+            avg_pause_dur = float(np.mean(gaps)) if gaps else 0.0
+            # Mid-Pause Count (0.3s - 1.0s)
+            mid_pause_cnt = sum(1 for g in gaps if 0.3 <= g <= 1.0)
+            return float(pause_ratio), float(avg_pause_dur), int(mid_pause_cnt)
+        except Exception as e:
+            print(f"VAD Error: {e}")
+            return 0.0, 0.0, 0
+    def extract_all(self, audio: np.ndarray, transcript: str = "") -> Dict[str, float]:
+        """Extract all audio features (14 original + 3 emotion = 17 total)"""
+        if audio.dtype != np.float32:
+            audio = audio.astype(np.float32)
+        features = {}
+        # V1: HNR (IMPROVED from SNR)
+        features['v1_snr'] = self.extract_hnr(audio)  # Keep name for compatibility
+        # V2: Noise classification (IMPROVED)
+        noise_class = self.classify_noise_type(audio)
+        features['v2_noise_traffic'] = noise_class['traffic']
+        features['v2_noise_office'] = noise_class['office']
+        features['v2_noise_crowd'] = noise_class['crowd']
+        features['v2_noise_wind'] = noise_class['wind']
+        features['v2_noise_clean'] = noise_class['clean']
+        # V3: Speech rate
+        features['v3_speech_rate'] = self.extract_speech_rate(audio, transcript)
+        # V4-V5: Pitch
+        p_mean, p_std = self.extract_pitch_features(audio)
+        features['v4_pitch_mean'] = p_mean
+        features['v5_pitch_std'] = p_std
+        # V6-V7: Energy
+        e_mean, e_std = self.extract_energy_features(audio)
+        features['v6_energy_mean'] = e_mean
+        features['v7_energy_std'] = e_std
+        # V8-V10: Pause features
+        pause_ratio, avg_pause, mid_pause_cnt = self.extract_pause_features(audio)
+        features['v8_pause_ratio'] = pause_ratio
+        features['v9_avg_pause_dur'] = avg_pause
+        features['v10_mid_pause_cnt'] = float(mid_pause_cnt)
+        # V11-V13: Emotion features
+        if self.use_emotion and self.emotion_extractor is not None:
+            try:
+                emotion_features = self.emotion_extractor.extract_all(audio, self.sample_rate)
+                features.update(emotion_features)
+            except Exception as e:
+                print(f"⚠ Emotion features skipped: {e}")
+                # Add zero values for compatibility
+                features['v11_emotion_stress'] = 0.0
+                features['v12_emotion_energy'] = 0.0
+                features['v13_emotion_valence'] = 0.0
+        return features
+    def extract_basic(self, audio: np.ndarray, transcript: str = "") -> Dict[str, float]:
+        """
+        Extract a minimal set of audio features for fast decisions.
+        Uses only low-cost features.
+        """
+        if audio.dtype != np.float32:
+            audio = audio.astype(np.float32)
+        features = {}
+        features['v1_snr'] = self.extract_hnr(audio)  # Keep name for compatibility
+        features['v3_speech_rate'] = self.extract_speech_rate(audio, transcript)
+        e_mean, e_std = self.extract_energy_features(audio)
+        features['v6_energy_mean'] = e_mean
+        features['v7_energy_std'] = e_std
+        pause_ratio, avg_pause, mid_pause_cnt = self.extract_pause_features(audio)
+        features['v8_pause_ratio'] = pause_ratio
+        features['v9_avg_pause_dur'] = avg_pause
+        features['v10_mid_pause_cnt'] = float(mid_pause_cnt)
+        return features
+# Smoke test when run as a script: constructing the extractor with default
+# arguments runs the Silero VAD / emotion model loading performed in __init__.
+if __name__ == "__main__":
+    extractor = AudioFeatureExtractor()
+    print("Audio Feature Extractor initialized successfully")
+    print("Using HNR instead of SNR for better voice quality measurement")

emotion_features.py ADDED Viewed

	@@ -0,0 +1,411 @@

+"""
+Emotion Feature Extractor - Using NeuroByte Models
+Extracts emotion features from audio for busy detection.
+Uses 3 pre-trained Keras models from NeuroByte-Consulting:
+1. CRNN (Convolutional Recurrent Neural Network) - Best for sequential patterns
+2. Mel Spectrogram CNN - Best for frequency patterns
+3. MFCC CNN - Best for speech characteristics
+Each model outputs 7 emotion classes: angry, disgust, fear, happy, neutral, sad, surprise
+"""
+import numpy as np
+import librosa
+import warnings
+from typing import Dict, Optional
+import os
+warnings.filterwarnings("ignore")
+try:
+    import tensorflow as tf
+    from tensorflow import keras
+    TENSORFLOW_AVAILABLE = True
+except ImportError:
+    TENSORFLOW_AVAILABLE = False
+    print("[WARN] TensorFlow not available. Install with: pip install tensorflow")
+class EmotionFeatureExtractor:
+    """Extract emotion features using NeuroByte pre-trained models"""
+    # Emotion labels from the models
+    EMOTIONS = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
+    def __init__(self, models_dir: str = None, use_ensemble: bool = True):
+        """
+        Initialize emotion detector with NeuroByte models
+        Args:
+            models_dir: Directory containing the .keras model files.
+                       Defaults to 'models' relative to this file.
+            use_ensemble: If True, average predictions from all 3 models (more accurate)
+                         If False, use only CRNN model (faster)
+        """
+        if models_dir is None:
+            # Default to 'models' folder in same directory as this script
+            models_dir = os.path.join(os.path.dirname(__file__), 'models')
+        self.models_dir = models_dir
+        self.use_ensemble = use_ensemble
+        self.models = {}
+        if not TENSORFLOW_AVAILABLE:
+            print("[WARN] TensorFlow not installed. Falling back to acoustic features.")
+            self.use_tensorflow = False
+            return
+        self.use_tensorflow = True
+        # Model file paths
+        model_files = {
+            'crnn': 'emotion_recognition_crnn.keras',
+            'mel_spec': 'emotion_recognition_mel_spec.keras',
+            'mfcc': 'emotion_recognition_mfcc.keras'
+        }
+        # Load models
+        print(f"Loading NeuroByte emotion models from {models_dir}...")
+        for model_name, filename in model_files.items():
+            model_path = os.path.join(models_dir, filename)
+            if os.path.exists(model_path):
+                try:
+                    self.models[model_name] = keras.models.load_model(model_path)
+                    print(f"[OK] Loaded {model_name} model")
+                except Exception as e:
+                    print(f"[WARN] Failed to load {model_name}: {e}")
+            else:
+                print(f"[WARN] Model not found: {model_path}")
+        # If no models loaded, fall back to acoustics
+        if len(self.models) == 0:
+            print("[WARN] No models loaded. Using acoustic features fallback.")
+            self.use_tensorflow = False
+        else:
+            print(f"[OK] {len(self.models)} emotion model(s) loaded successfully")
+    def download_models(self):
+        """
+        Download NeuroByte models from Hugging Face
+        Run this once to download the models:
+        >>> extractor = EmotionFeatureExtractor()
+        >>> extractor.download_models()
+        """
+        if not TENSORFLOW_AVAILABLE:
+            print("[WARN] TensorFlow required to download models")
+            return
+        try:
+            from huggingface_hub import hf_hub_download
+            os.makedirs(self.models_dir, exist_ok=True)
+            repo_id = "neurobyte-org/speech-emotion-recognition"
+            model_files = [
+                'emotion_recognition_crnn.keras',
+                'emotion_recognition_mel_spec.keras',
+                'emotion_recognition_mfcc.keras'
+            ]
+            print(f"Downloading models from {repo_id}...")
+            for filename in model_files:
+                try:
+                    print(f"  Downloading {filename}...")
+                    downloaded_path = hf_hub_download(
+                        repo_id=repo_id,
+                        filename=filename,
+                        cache_dir=self.models_dir
+                    )
+                    # Copy to expected location
+                    target_path = os.path.join(self.models_dir, filename)
+                    if downloaded_path != target_path:
+                        import shutil
+                        shutil.copy(downloaded_path, target_path)
+                    print(f"  [OK] {filename} downloaded")
+                except Exception as e:
+                    print(f"  [WARN] Failed to download {filename}: {e}")
+            print("[OK] Download complete! Reinitialize the extractor to load models.")
+        except ImportError:
+            print("[WARN] huggingface_hub not installed. Install with: pip install huggingface_hub")
+    def extract_mel_spectrogram(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
+        """
+        Extract mel spectrogram for the mel_spec model
+        Returns shape: (128, time_steps, 1) for CNN input
+        """
+        # Resample to 16kHz if needed
+        if sr != 16000:
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+            sr = 16000
+        # Extract mel spectrogram
+        mel_spec = librosa.feature.melspectrogram(
+            y=audio,
+            sr=sr,
+            n_fft=2048,
+            hop_length=512,
+            n_mels=128,
+            fmin=0,
+            fmax=sr/2
+        )
+        # Convert to dB
+        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
+        # Normalize to [0, 1]
+        mel_spec_norm = (mel_spec_db - mel_spec_db.min()) / (mel_spec_db.max() - mel_spec_db.min() + 1e-8)
+        # Add channel dimension and transpose to (time, freq, 1)
+        mel_spec_norm = mel_spec_norm.T
+        mel_spec_norm = np.expand_dims(mel_spec_norm, axis=-1)
+        # Pad or truncate to fixed length (e.g., 216 frames for ~3 seconds)
+        target_length = 216
+        if mel_spec_norm.shape[0] < target_length:
+            # Pad with zeros
+            pad_width = target_length - mel_spec_norm.shape[0]
+            mel_spec_norm = np.pad(mel_spec_norm, ((0, pad_width), (0, 0), (0, 0)), mode='constant')
+        else:
+            # Truncate
+            mel_spec_norm = mel_spec_norm[:target_length, :, :]
+        return mel_spec_norm
+    def extract_mfcc(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
+        """
+        Extract MFCC features for the mfcc model
+        Returns shape: (40, time_steps, 1) for CNN input
+        """
+        # Resample to 16kHz if needed
+        if sr != 16000:
+            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
+            sr = 16000
+        # Extract MFCCs
+        mfccs = librosa.feature.mfcc(
+            y=audio,
+            sr=sr,
+            n_mfcc=40,
+            n_fft=2048,
+            hop_length=512
+        )
+        # Normalize
+        mfccs = (mfccs - mfccs.mean()) / (mfccs.std() + 1e-8)
+        # Transpose and add channel dimension
+        mfccs = mfccs.T
+        mfccs = np.expand_dims(mfccs, axis=-1)
+        # Pad or truncate to fixed length
+        target_length = 216
+        if mfccs.shape[0] < target_length:
+            pad_width = target_length - mfccs.shape[0]
+            mfccs = np.pad(mfccs, ((0, pad_width), (0, 0), (0, 0)), mode='constant')
+        else:
+            mfccs = mfccs[:target_length, :, :]
+        return mfccs
+    def predict_emotions(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]:
+        """
+        Predict emotion probabilities using loaded models
+        Returns:
+            Dictionary with emotion labels as keys and probabilities as values
+        """
+        if not self.use_tensorflow or len(self.models) == 0:
+            return self.extract_from_acoustics(audio, sr)
+        try:
+            predictions = []
+            # CRNN model (if available)
+            if 'crnn' in self.models:
+                mel_spec = self.extract_mel_spectrogram(audio, sr)
+                mel_spec_batch = np.expand_dims(mel_spec, axis=0)
+                pred_crnn = self.models['crnn'].predict(mel_spec_batch, verbose=0)[0]
+                predictions.append(pred_crnn)
+            # Mel Spectrogram model (if available)
+            if 'mel_spec' in self.models and self.use_ensemble:
+                mel_spec = self.extract_mel_spectrogram(audio, sr)
+                mel_spec_batch = np.expand_dims(mel_spec, axis=0)
+                pred_mel = self.models['mel_spec'].predict(mel_spec_batch, verbose=0)[0]
+                predictions.append(pred_mel)
+            # MFCC model (if available)
+            if 'mfcc' in self.models and self.use_ensemble:
+                mfcc = self.extract_mfcc(audio, sr)
+                mfcc_batch = np.expand_dims(mfcc, axis=0)
+                pred_mfcc = self.models['mfcc'].predict(mfcc_batch, verbose=0)[0]
+                predictions.append(pred_mfcc)
+            # Average predictions if ensemble
+            if len(predictions) > 1:
+                avg_pred = np.mean(predictions, axis=0)
+            else:
+                avg_pred = predictions[0]
+            # Convert to dictionary
+            emotion_probs = {emotion: float(prob) for emotion, prob in zip(self.EMOTIONS, avg_pred)}
+            return emotion_probs
+        except Exception as e:
+            print(f"⚠ Prediction failed: {e}")
+            return self.extract_from_acoustics(audio, sr)
+    def extract_from_acoustics(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]:
+        """
+        Fallback: Extract emotion proxies from acoustic features
+        Returns emotion-like scores without deep learning
+        """
+        try:
+            if len(audio) < 512:
+                return {emotion: 1.0/7 for emotion in self.EMOTIONS}  # Uniform distribution
+            # Extract acoustic features
+            rms = librosa.feature.rms(y=audio)[0]
+            mean_energy = np.mean(rms)
+            energy_std = np.std(rms)
+            f0 = librosa.yin(audio, fmin=75, fmax=400, sr=sr)
+            f0_voiced = f0[f0 > 0]
+            pitch_mean = np.mean(f0_voiced) if len(f0_voiced) > 0 else 0
+            pitch_std = np.std(f0_voiced) if len(f0_voiced) > 0 else 0
+            zcr = np.mean(librosa.feature.zero_crossing_rate(audio))
+            centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr))
+            # Heuristic mapping to emotions
+            scores = {
+                'angry': (energy_std * 10 + pitch_std / 50) / 2,
+                'disgust': (pitch_mean / 300) * 0.3,
+                'fear': (pitch_mean / 250 + zcr * 5) / 2,
+                'happy': (centroid / 3000 + mean_energy * 5) / 2,
+                'neutral': 0.3,  # Baseline
+                'sad': (1 - centroid / 4000) * 0.5,
+                'surprise': (energy_std * 8 + zcr * 3) / 2
+            }
+            # Normalize to sum to 1
+            total = sum(scores.values())
+            scores = {k: v / total for k, v in scores.items()}
+            return scores
+        except Exception as e:
+            print(f"⚠ Acoustic fallback failed: {e}")
+            return {emotion: 1.0/7 for emotion in self.EMOTIONS}
+    def extract_all(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]:
+        """
+        Extract emotion features for busy detection
+        Returns:
+            v11_emotion_stress: 0-1 (angry + fear + disgust)
+            v12_emotion_energy: 0-1 (happy + surprise + angry)
+            v13_emotion_valence: 0-1 (happy - sad - angry)
+        """
+        if audio.dtype != np.float32:
+            audio = audio.astype(np.float32)
+        # Get emotion predictions
+        emotion_probs = self.predict_emotions(audio, sr)
+        # Map emotions to features
+        stress = (
+            emotion_probs.get('angry', 0.0) * 0.5 +
+            emotion_probs.get('fear', 0.0) * 0.3 +
+            emotion_probs.get('disgust', 0.0) * 0.2
+        )
+        energy = (
+            emotion_probs.get('happy', 0.0) * 0.4 +
+            emotion_probs.get('surprise', 0.0) * 0.3 +
+            emotion_probs.get('angry', 0.0) * 0.3
+        )
+        valence = (
+            emotion_probs.get('happy', 0.0) +
+            emotion_probs.get('surprise', 0.0) * 0.5 -
+            emotion_probs.get('sad', 0.0) -
+            emotion_probs.get('angry', 0.0) * 0.5
+        )
+        # Normalize valence to [0, 1]
+        valence = (valence + 1.0) / 2.0
+        return {
+            'v11_emotion_stress': float(np.clip(stress, 0, 1)),
+            'v12_emotion_energy': float(np.clip(energy, 0, 1)),
+            'v13_emotion_valence': float(np.clip(valence, 0, 1))
+        }
+# Standalone test
+# Manual smoke test: runs extract_all on two synthetic signals and prints the
+# resulting features and timings. No assertions are made on the values.
+if __name__ == "__main__":
+    import time
+    print("Testing NeuroByte Emotion Feature Extractor...")
+    # Initialize extractor
+    extractor = EmotionFeatureExtractor(
+        models_dir="models_cache/emotion_models",
+        use_ensemble=True
+    )
+    # Models are not downloaded automatically; only print a hint on how to get them
+    if not extractor.use_tensorflow or len(extractor.models) == 0:
+        print("\nModels not found. Download them with:")
+        print("  extractor.download_models()")
+        print("\nUsing acoustic fallback for now...")
+    # Generate test audio: time axis for 3 seconds at 16 kHz
+    duration = 3
+    sr = 16000
+    t = np.linspace(0, duration, sr * duration)
+    # Test 1: "stressed" signal = 300 Hz + 150 Hz tones plus Gaussian noise
+    print("\n1. Testing with stressed audio:")
+    audio_stressed = np.sin(2 * np.pi * 300 * t) + 0.5 * np.sin(2 * np.pi * 150 * t)
+    audio_stressed += 0.2 * np.random.randn(len(audio_stressed))
+    start = time.time()
+    features_stressed = extractor.extract_all(audio_stressed, sr)
+    print(f"  Time: {(time.time() - start)*1000:.0f}ms")
+    print("  Features:")
+    for k, v in features_stressed.items():
+        print(f"    {k}: {v:.3f}")
+    # Test 2: "calm" signal = a single quiet 150 Hz tone, no noise
+    print("\n2. Testing with calm audio:")
+    audio_calm = np.sin(2 * np.pi * 150 * t) * 0.3
+    start = time.time()
+    features_calm = extractor.extract_all(audio_calm, sr)
+    print(f"  Time: {(time.time() - start)*1000:.0f}ms")
+    print("  Features:")
+    for k, v in features_calm.items():
+        print(f"    {k}: {v:.3f}")
+    print("\n✓ Tests complete!")
+    # Report which prediction path was exercised
+    if extractor.use_tensorflow and len(extractor.models) > 0:
+        print(f"\nUsing {len(extractor.models)} NeuroByte model(s)")
+    else:
+        print("\nUsing acoustic features fallback")

handler.py CHANGED Viewed

@@ -22,276 +22,56 @@ warnings.filterwarnings("ignore")
 # ──────────────────────────────────────────────────────────────────────── #
-# Constants & Defaults
 # ──────────────────────────────────────────────────────────────────────── #
-DEFAULT_AUDIO_FEATURES = {
-    "v1_snr": 0.0,
-    "v2_noise_traffic": 0.0, "v2_noise_office": 0.0,
-    "v2_noise_crowd": 0.0, "v2_noise_wind": 0.0, "v2_noise_clean": 1.0,
-    "v3_speech_rate": 0.0,
-    "v4_pitch_mean": 0.0, "v5_pitch_std": 0.0,
-    "v6_energy_mean": 0.0, "v7_energy_std": 0.0,
-    "v8_pause_ratio": 0.0, "v9_avg_pause_dur": 0.0, "v10_mid_pause_cnt": 0,
-    "v11_emotion_stress": 0.0, "v12_emotion_energy": 0.0, "v13_emotion_valence": 0.0,
-}
 # ──────────────────────────────────────────────────────────────────────── #
-# Emotion CNN (mirrors src/emotion_features.py EmotionCNN)
 # ──────────────────────────────────────────────────────────────────────── #
-class EmotionCNN:
-    """Lightweight CNN for emotion embedding from spectrograms (MobileNetV3)."""
-    def __init__(self):
-        self.model = models.mobilenet_v3_small(pretrained=True)
-        self.model.classifier = nn.Identity()
-        self.model.eval()
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        if self.device == "cuda":
-            self.model = self.model.cuda()
-    def audio_to_spectrogram(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
-        mel_spec = librosa.feature.melspectrogram(
-            y=audio, sr=sr, n_fft=512, hop_length=64, n_mels=128, fmin=0, fmax=sr / 2
-        )
-        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
-        mel_spec_db = np.clip(mel_spec_db, -80, 0)
-        mel_spec_norm = (mel_spec_db + 80) / 80
-        try:
-            from skimage.transform import resize
-            mel_resized = resize(mel_spec_norm, (224, 224), mode="constant")
-        except ImportError:
-            # Fallback: resizing with numpy interpolation (nearest neighbor for rows, linear for cols)
-            target_h, target_w = 224, 224
-            source_h, source_w = mel_spec_norm.shape
-            if source_h > 0 and source_w > 0:
-                # 1. Resize height (rows)
-                row_indices = np.linspace(0, source_h - 1, target_h).astype(int)
-                # Select rows (nearest neighbor)
-                temp = mel_spec_norm[row_indices, :]
-                # 2. Resize width (cols)
-                mel_resized = np.zeros((target_h, target_w), dtype=mel_spec_norm.dtype)
-                x_source = np.arange(source_w)
-                x_target = np.linspace(0, source_w - 1, target_w)
-                for i in range(target_h):
-                    mel_resized[i, :] = np.interp(x_target, x_source, temp[i, :])
             else:
-                mel_resized = np.zeros((224, 224))
-        try:
-            from matplotlib import cm
-            colormap = cm.get_cmap("jet")
-            rgb = colormap(mel_resized)[:, :, :3]
-        except (ImportError, Exception):
-            # Fallback: stack grayscale into 3 channels
-            rgb = np.stack([mel_resized] * 3, axis=-1)
-        return np.transpose(rgb, (2, 0, 1)).astype(np.float32)
-    def extract_embedding(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
-        try:
-            spec_rgb = self.audio_to_spectrogram(audio, sr)
-            tensor = torch.from_numpy(spec_rgb).unsqueeze(0)
-            if self.device == "cuda":
-                tensor = tensor.cuda()
-            with torch.no_grad():
-                emb = self.model(tensor)
-            return emb.cpu().numpy().flatten()
-        except Exception as e:
-            print(f"[WARN] EmotionCNN embedding extraction failed: {e}")
-            return np.zeros(576)  # MobileNetV3-small output size
-# ──────────────────────────────────────────────────────────────────────── #
-# Audio Feature Extractor (mirrors src/audio_features.py)
-# ──────────────────────────────────────────────────────────────────────── #
-class AudioFeatureExtractorEndpoint:
-    """Stateless audio feature extraction for HF endpoint."""
-    def __init__(self):
-        self.sr = 16000
-        self.emotion_cnn = EmotionCNN()
-        # Load Silero VAD - optimized for CPU-only HF Spaces
-        try:
-            # Force CPU mode (HF Free Spaces don't have GPU)
-            torch.set_num_threads(1)
-            # Load from torch.hub (most reliable method)
-            print("[INFO] Loading Silero VAD from torch.hub...")
-            self.vad_model, self.vad_utils = torch.hub.load(
-                repo_or_dir='snakers4/silero-vad',
-                model='silero_vad',
-                force_reload=False,
-                trust_repo=True,
-                verbose=False
-            )
-            # Force model to CPU
-            self.vad_model = self.vad_model.cpu()
-            self.vad_model.eval()
-            # Extract the get_speech_timestamps utility
-            self.get_speech_timestamps = self.vad_utils[0]
-            print("✅ Silero VAD loaded successfully (CPU mode)")
-        except Exception as e:
-            print(f"⚠️ Silero VAD failed to load: {e}")
-            print(f"   Audio features will use fallback values for pause detection")
-            self.vad_model = None
-            self.get_speech_timestamps = None
-    # -------- V1: SNR --------
-    def extract_snr(self, audio: np.ndarray) -> float:
-        if len(audio) == 0:
-            return 0.0
-        frame_length = min(2048, len(audio))
-        frames = librosa.util.frame(audio, frame_length=frame_length, hop_length=frame_length // 2)
-        frame_energy = np.sum(frames ** 2, axis=0)
-        if len(frame_energy) < 2:
-            return 0.0
-        sorted_energy = np.sort(frame_energy)
-        n_noise = max(1, len(sorted_energy) // 5)
-        noise_floor = np.mean(sorted_energy[:n_noise])
-        signal_power = np.mean(sorted_energy)
-        if noise_floor <= 0:
-            return 40.0
-        snr = 10 * np.log10(signal_power / noise_floor + 1e-10)
-        return float(np.clip(snr, -10, 40))
-    # -------- V2: Noise classification --------
-    def classify_noise_type(self, audio: np.ndarray) -> Dict[str, float]:
-        if len(audio) < 2048:
-            return {
-                "v2_noise_traffic": 0.0, "v2_noise_office": 0.0,
-                "v2_noise_crowd": 0.0, "v2_noise_wind": 0.0, "v2_noise_clean": 1.0,
-            }
-        spec = np.abs(librosa.stft(audio, n_fft=2048))
-        freq_bins = librosa.fft_frequencies(sr=self.sr, n_fft=2048)
-        low = np.mean(spec[(freq_bins >= 50) & (freq_bins <= 500)])
-        mid = np.mean(spec[(freq_bins >= 500) & (freq_bins <= 2000)])
-        high = np.mean(spec[(freq_bins >= 2000) & (freq_bins <= 6000)])
-        total = low + mid + high + 1e-10
-        low_r, mid_r, high_r = low / total, mid / total, high / total
-        spectral_centroid = float(np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sr)))
-        spectral_flatness = float(np.mean(librosa.feature.spectral_flatness(y=audio)))
-        noise = {
-            "v2_noise_traffic": float(np.clip(low_r * 2 - 0.3, 0, 1)),
-            "v2_noise_office": float(np.clip(mid_r * 1.5 - 0.2, 0, 1) if spectral_flatness > 0.01 else 0),
-            "v2_noise_crowd": float(np.clip(mid_r * 2 - 0.5, 0, 1) if spectral_centroid > 1500 else 0),
-            "v2_noise_wind": float(np.clip(low_r * 3 - 0.8, 0, 1) if spectral_flatness > 0.1 else 0),
-        }
-        noise["v2_noise_clean"] = float(np.clip(1 - max(noise.values()), 0, 1))
-        return noise
-    # -------- V3: Speech rate --------
-    def extract_speech_rate(self, audio: np.ndarray, transcript: str) -> float:
-        if not transcript:
-            return 0.0
-        word_count = len(transcript.split())
-        duration = len(audio) / self.sr
-        if duration == 0:
-            return 0.0
-        return float(word_count / duration)
-    # -------- V4-V5: Pitch --------
-    def extract_pitch_features(self, audio: np.ndarray) -> Dict[str, float]:
-        try:
-            pitches, magnitudes = librosa.piptrack(y=audio, sr=self.sr)
-            pitch_values = pitches[magnitudes > np.median(magnitudes)]
-            pitch_values = pitch_values[pitch_values > 0]
-            if len(pitch_values) == 0:
-                return {"v4_pitch_mean": 0.0, "v5_pitch_std": 0.0}
-            return {
-                "v4_pitch_mean": float(np.mean(pitch_values)),
-                "v5_pitch_std": float(np.std(pitch_values)),
-            }
-        except Exception:
-            return {"v4_pitch_mean": 0.0, "v5_pitch_std": 0.0}
-    # -------- V6-V7: Energy --------
-    def extract_energy_features(self, audio: np.ndarray) -> Dict[str, float]:
-        rms = librosa.feature.rms(y=audio)[0]
-        return {"v6_energy_mean": float(np.mean(rms)), "v7_energy_std": float(np.std(rms))}
-    # -------- V8-V10: Pause features (Silero VAD) --------
-    def extract_pause_features(self, audio: np.ndarray) -> Dict[str, float]:
-        defaults = {"v8_pause_ratio": 0.0, "v9_avg_pause_dur": 0.0, "v10_mid_pause_cnt": 0}
-        if self.vad_model is None or len(audio) < self.sr:
-            return defaults
-        try:
-            audio_tensor = torch.FloatTensor(audio)
-            timestamps = self.get_speech_timestamps(audio_tensor, self.vad_model, sampling_rate=self.sr)
-            if not timestamps:
-                return {"v8_pause_ratio": 1.0, "v9_avg_pause_dur": len(audio) / self.sr, "v10_mid_pause_cnt": 0}
-            total_speech = sum(t["end"] - t["start"] for t in timestamps)
-            total_samples = len(audio)
-            pause_ratio = 1.0 - (total_speech / total_samples)
-            pauses = []
-            for i in range(1, len(timestamps)):
-                gap = (timestamps[i]["start"] - timestamps[i - 1]["end"]) / self.sr
-                if gap > 0.1:
-                    pauses.append(gap)
-            return {
-                "v8_pause_ratio": float(np.clip(pause_ratio, 0, 1)),
-                "v9_avg_pause_dur": float(np.mean(pauses)) if pauses else 0.0,
-                "v10_mid_pause_cnt": len([p for p in pauses if 0.3 < p < 2.0]),
-            }
-        except Exception:
-            return defaults
-    # -------- V11-V13: Emotion features --------
-    def extract_emotion_features(self, audio: np.ndarray) -> Dict[str, float]:
-        try:
-            embedding = self.emotion_cnn.extract_embedding(audio, self.sr)
-            stress_indices = [0, 100, 200, 300, 400]
-            stress_values = embedding[stress_indices]
-            stress_score = float(np.clip(np.mean(np.abs(stress_values)), 0, 1))
-            return {
-                "v11_emotion_stress": stress_score,
-                "v12_emotion_energy": float(np.mean(np.abs(embedding[500:600]))),
-                "v13_emotion_valence": float(np.mean(embedding[700:800])),
-            }
-        except Exception:
-            return {"v11_emotion_stress": 0.0, "v12_emotion_energy": 0.0, "v13_emotion_valence": 0.0}
-    # -------- Main: extract all --------
-    def extract_all(self, audio: np.ndarray, transcript: str = "") -> Dict[str, float]:
-        features = {}
-        features["v1_snr"] = self.extract_snr(audio)
-        features.update(self.classify_noise_type(audio))
-        features["v3_speech_rate"] = self.extract_speech_rate(audio, transcript)
-        features.update(self.extract_pitch_features(audio))
-        features.update(self.extract_energy_features(audio))
-        features.update(self.extract_pause_features(audio))
-        features.update(self.extract_emotion_features(audio))
-        # Sanitize: replace NaN/Inf with 0.0 (prevents JSON serialization errors)
-        for key, val in features.items():
-            if isinstance(val, (float, np.floating)):
-                if np.isnan(val) or np.isinf(val):
-                    features[key] = 0.0
-                else:
-                    features[key] = float(val)  # ensure native Python float
-            elif isinstance(val, (int, np.integer)):
-                features[key] = int(val)
-        return features
 # ──────────────────────────────────────────────────────────────────────── #
 # FastAPI handler for deployment (HF Spaces / Cloud Run / Lambda)
@@ -323,7 +103,7 @@ async def global_exception_handler(request: Request, exc: Exception):
         content={**DEFAULT_AUDIO_FEATURES, "_error": str(exc), "_handler": "global"},
     )
-extractor = AudioFeatureExtractorEndpoint()
 # ──────────────────────────────────────────────────────────────────────── #
 # Constants & Defaults
@@ -345,7 +125,13 @@ async def root():
 @app.get("/health")
 async def health():
-    return {"status": "healthy", "vad_loaded": extractor.vad_model is not None}
 @app.post("/extract-audio-features")
@@ -353,9 +139,13 @@ async def extract_audio_features(audio: UploadFile = File(...), transcript: str
     """Extract all 17 voice features from uploaded audio file."""
     try:
         audio_bytes = await audio.read()
         y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
         features = extractor.extract_all(y, transcript)
-        return features
     except Exception as e:
         print(f"[ERROR] extract_audio_features: {e}")
         traceback.print_exc()
@@ -403,7 +193,7 @@ async def extract_audio_features_base64(data: AudioBase64Request):
         features = extractor.extract_all(y, transcript)
         print(f"[OK] Extracted {len(features)} audio features")
-        return features
     except Exception as e:
         print(f"[ERROR] extract_audio_features_base64: {e}")
         traceback.print_exc()
@@ -416,3 +206,4 @@ if __name__ == "__main__":
     import os
     port = int(os.environ.get("PORT", 7860))
     uvicorn.run(app, host="0.0.0.0", port=port)

 # ──────────────────────────────────────────────────────────────────────── #
+# Imports from standardized modules
 # ──────────────────────────────────────────────────────────────────────── #
+try:
+    from audio_features import AudioFeatureExtractor
+except ImportError:
+    # Fallback if running from a different context: make the folder that
+    # contains this file importable. '.' is kept as well because it was the
+    # original fallback, but it only refers to the current working directory.
+    # NOTE(review): audio_features.py does `from .emotion_features import ...`;
+    # a relative import fails when the module is loaded top-level like this.
+    # Confirm the module is importable in the deployed layout.
+    import os
+    import sys
+    for _candidate in (os.path.dirname(os.path.abspath(__file__)), '.'):
+        if _candidate not in sys.path:
+            sys.path.append(_candidate)
+    from audio_features import AudioFeatureExtractor
+# Initialize global extractor
+# We use a global instance to cache models (VAD, Emotion)
+print("[INFO] Initializing Global AudioFeatureExtractor...")
+extractor = AudioFeatureExtractor(
+    sample_rate=16000,
+    use_emotion=True,
+    # AudioFeatureExtractor.__init__ names this parameter `emotion_models_dir`;
+    # passing `models_dir` raises TypeError before the app can start.
+    emotion_models_dir="models"  # Dockerfile should place models here or download them
+)
+# Ensure models are downloaded/ready
+if extractor.use_emotion and extractor.emotion_extractor:
+    print("[INFO] Checking for emotion models...")
+    # Trigger download if needed/possible
+    try:
+        if len(extractor.emotion_extractor.models) == 0:
+            print("[INFO] Models not found, attempting download...")
+            extractor.emotion_extractor.download_models()
+            # Re-init manually to load them
+            extractor.emotion_extractor.__init__(models_dir=extractor.emotion_extractor.models_dir)
+    except Exception as e:
+        # Best-effort: a failed download is logged and startup continues.
+        print(f"[WARN] Failed to download emotion models: {e}")
 # ──────────────────────────────────────────────────────────────────────── #
+# Helper to handle NaN/Inf for JSON
 # ──────────────────────────────────────────────────────────────────────── #
+def sanitize_features(features: Dict[str, float]) -> Dict[str, float]:
+    """Return a copy of *features* with values converted for JSON output.
+
+    Float values (Python or NumPy) that are NaN or infinite become 0.0; all
+    other floats become native ``float``. Integer values (Python or NumPy)
+    become native ``int``. Any other value is copied through unchanged.
+    """
+    def _convert(value):
+        if isinstance(value, (float, np.floating)):
+            # Not finite covers exactly NaN, +inf and -inf.
+            return float(value) if np.isfinite(value) else 0.0
+        if isinstance(value, (int, np.integer)):
+            return int(value)
+        return value  # keep string/other as is
+    return {name: _convert(value) for name, value in features.items()}
 # ──────────────────────────────────────────────────────────────────────── #
 # FastAPI handler for deployment (HF Spaces / Cloud Run / Lambda)
         content={**DEFAULT_AUDIO_FEATURES, "_error": str(exc), "_handler": "global"},
     )
+# Extractor is already initialized globally above
 # ──────────────────────────────────────────────────────────────────────── #
 # Constants & Defaults
 @app.get("/health")
 async def health():
+    # Health probe: always reports "healthy" and adds two flags saying whether
+    # the VAD model and the emotion extractor are present on the global extractor.
+    vad_ready = extractor.vad_model is not None
+    # The emotion flag stays False whenever emotion extraction is switched off.
+    emotion_ready = False
+    if extractor.use_emotion:
+        emotion_ready = extractor.emotion_extractor is not None
+    return {
+        "status": "healthy",
+        "vad_loaded": vad_ready,
+        "emotion_loaded": emotion_ready
+    }
 @app.post("/extract-audio-features")
     """Extract all 17 voice features from uploaded audio file."""
     try:
         audio_bytes = await audio.read()
+        # librosa.load returns (audio, sr)
         y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
+        # AudioFeatureExtractor.extract_all expects numpy array and optional transcript
         features = extractor.extract_all(y, transcript)
+        return sanitize_features(features)
     except Exception as e:
         print(f"[ERROR] extract_audio_features: {e}")
         traceback.print_exc()
         features = extractor.extract_all(y, transcript)
         print(f"[OK] Extracted {len(features)} audio features")
+        return sanitize_features(features)
     except Exception as e:
         print(f"[ERROR] extract_audio_features_base64: {e}")
         traceback.print_exc()
     import os
     port = int(os.environ.get("PORT", 7860))
     uvicorn.run(app, host="0.0.0.0", port=port)

requirements.txt CHANGED Viewed

@@ -4,15 +4,17 @@ soundfile==0.12.1
 numpy==1.24.3
 scipy==1.11.2
-# ML - CPU-only versions (for HF Free Spaces without GPU)
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch==2.1.0+cpu
-torchvision==0.16.0+cpu
 torchaudio==2.1.0+cpu
 # API
 fastapi==0.95.2
 uvicorn==0.22.0
 python-multipart==0.0.6
 huggingface_hub>=0.19.0
-scikit-image>=0.21.0

 numpy==1.24.3
 scipy==1.11.2
+# ML - CPU-only versions (HF Spaces friendly)
+# Torch for Silero VAD
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch==2.1.0+cpu
 torchaudio==2.1.0+cpu
+# TensorFlow for Emotion Models
+tensorflow-cpu==2.15.0
 # API
 fastapi==0.95.2
 uvicorn==0.22.0
 python-multipart==0.0.6
 huggingface_hub>=0.19.0