Spaces:
Running
Running
Upload folder using huggingface_hub
Browse files- audio_features.py +421 -0
- emotion_features.py +411 -0
- handler.py +57 -266
- requirements.txt +5 -3
audio_features.py
ADDED
|
@@ -0,0 +1,421 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Audio Feature Extractor - IMPROVED VERSION
|
| 3 |
+
Extracts 14 voice features from audio to detect busy/distracted states.
|
| 4 |
+
|
| 5 |
+
KEY IMPROVEMENTS:
|
| 6 |
+
1. HNR instead of SNR - Better for voice recordings (not affected by recording noise)
|
| 7 |
+
2. Smarter noise classification using multiple spectral features
|
| 8 |
+
3. Removed useless latency feature (t9_latency) from consideration
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import numpy as np
|
| 12 |
+
import librosa
|
| 13 |
+
import soundfile as sf
|
| 14 |
+
from scipy import signal
|
| 15 |
+
from typing import Dict, Tuple, List
|
| 16 |
+
import noisereduce as nr
|
| 17 |
+
import torch
|
| 18 |
+
import warnings
|
| 19 |
+
from .emotion_features import EmotionFeatureExtractor
|
| 20 |
+
|
| 21 |
+
warnings.filterwarnings("ignore")
|
| 22 |
+
|
| 23 |
+
class AudioFeatureExtractor:
|
| 24 |
+
"""Extract 14 audio features for busy detection (Enhanced with Silero VAD)"""
|
| 25 |
+
|
| 26 |
+
_vad_model_cache = None
|
| 27 |
+
_vad_utils_cache = None
|
| 28 |
+
_emotion_extractor_cache = None
|
| 29 |
+
|
| 30 |
+
def __init__(self, sample_rate: int = 16000, use_emotion: bool = True, config: Dict = None, emotion_models_dir: str = None):
    """Configure the extractor and load the process-wide cached models.

    Args:
        sample_rate: Sampling rate used when ``config`` has no
            ``'audio_sample_rate'`` entry.
        use_emotion: Request emotion features; overridden to False when
            ``config['skip_emotion_features']`` is truthy or the emotion
            extractor cannot be created.
        config: Optional settings dict. Keys read here:
            ``'audio_sample_rate'``, ``'vad_sample_rate'``,
            ``'skip_emotion_features'``, ``'skip_noise_reduction'``,
            ``'audio_duration_limit'``.
        emotion_models_dir: Forwarded to ``EmotionFeatureExtractor`` as
            ``models_dir``.

    Loading failures are reported with ``print`` and never raised; the
    affected component is disabled instead.
    """
    cls = AudioFeatureExtractor

    self.config = config or {}
    self.sample_rate = self.config.get('audio_sample_rate', sample_rate)
    self.vad_sample_rate = self.config.get('vad_sample_rate', self.sample_rate)
    self.use_emotion = use_emotion and (not self.config.get('skip_emotion_features', False))
    self.skip_noise_reduction = bool(self.config.get('skip_noise_reduction', False))
    self.audio_duration_limit = self.config.get('audio_duration_limit', None)
    self.emotion_models_dir = emotion_models_dir

    # Define both VAD attributes up front so they always exist, even when
    # loading fails below.
    self.vad_model = None
    self.get_speech_timestamps = None

    print("Loading Silero VAD...")
    try:
        if cls._vad_model_cache is None:
            # Loaded once per process and shared through class attributes.
            cls._vad_model_cache, cls._vad_utils_cache = torch.hub.load(
                repo_or_dir='snakers4/silero-vad',
                model='silero_vad',
                force_reload=False,
                trust_repo=True
            )
        self.vad_model = cls._vad_model_cache
        # First element of the utils tuple is used as the timestamp function.
        self.get_speech_timestamps = cls._vad_utils_cache[0]
        print("[OK] Silero VAD loaded (cached)")
    except Exception as e:
        print(f"[WARN] Failed to load Silero VAD: {e}. Fallback to energy VAD might be needed.")
        self.vad_model = None
        self.get_speech_timestamps = None

    self.emotion_extractor = None
    if self.use_emotion:
        print("Loading Emotion CNN...")
        try:
            if cls._emotion_extractor_cache is None:
                # Pass models dir to extractor
                cls._emotion_extractor_cache = EmotionFeatureExtractor(models_dir=self.emotion_models_dir)
            self.emotion_extractor = cls._emotion_extractor_cache
            print("[OK] Emotion CNN loaded (cached)")
        except Exception as e:
            print(f"[WARN] Emotion features disabled: {e}")
            self.emotion_extractor = None
            self.use_emotion = False
|
| 70 |
+
|
| 71 |
+
def load_audio(self, audio_path: str) -> np.ndarray:
    """Read an audio file as a mono waveform at ``self.sample_rate``.

    At most ``self.audio_duration_limit`` seconds are read; ``None`` means
    no limit is passed to ``librosa.load``.
    """
    load_options = {
        'sr': self.sample_rate,
        'mono': True,
        'duration': self.audio_duration_limit,
    }
    # librosa.load returns (samples, rate); only the samples are needed.
    waveform, _ = librosa.load(audio_path, **load_options)
    return waveform
|
| 80 |
+
|
| 81 |
+
def extract_hnr(self, audio: np.ndarray) -> float:
    """
    V1: Harmonics-to-Noise Ratio (HNR), in dB.

    Measures voice quality - higher = clearer voice.

    Method:
        1. Slide a 2048-sample frame over the audio with a 512-sample hop.
        2. For every frame with enough energy, take the normalised
           autocorrelation and find its peak ``r`` in the lag range that
           corresponds to a 75-400 Hz fundamental.
        3. If ``r > 0.3`` the frame contributes
           ``10 * log10(r / (1 - r))`` clipped to [0, 30] dB.
        4. Return the median over contributing frames.
        5. If no frame contributes, fall back to a proxy derived from
           spectral flatness: ``(1 - flatness) * 25``.

    Args:
        audio: 1-D waveform sampled at ``self.sample_rate``.

    Returns:
        HNR estimate as a Python float. ``15.0`` is returned as a neutral
        default when the audio is shorter than one frame or when any
        error occurs.
    """
    frame_length = 2048
    hop_length = 512

    if len(audio) < frame_length:
        return 15.0  # Neutral default: less than one analysis frame

    try:
        # Lag bounds are frame-independent, so compute them once.
        min_lag = int(self.sample_rate / 400)  # Max 400 Hz
        max_lag = int(self.sample_rate / 75)   # Min 75 Hz

        hnr_values = []

        # "+ 1" makes the last full frame reachable; without it audio of
        # exactly ``frame_length`` samples yields no frame at all.
        for start in range(0, len(audio) - frame_length + 1, hop_length):
            frame = audio[start:start + frame_length]

            # Only process frames with enough energy
            if np.sum(frame ** 2) < 0.001:
                continue

            # One-sided autocorrelation (lags 0 .. frame_length - 1)
            autocorr = np.correlate(frame, frame, mode='full')
            autocorr = autocorr[len(autocorr) // 2:]

            # Normalise so that lag 0 equals 1
            if autocorr[0] <= 0:
                continue
            autocorr = autocorr / autocorr[0]

            if max_lag >= len(autocorr):
                continue

            peak_idx = np.argmax(autocorr[min_lag:max_lag]) + min_lag

            # Minimum correlation threshold
            if peak_idx > 0 and autocorr[peak_idx] > 0.3:
                periodic_power = autocorr[peak_idx]
                aperiodic_power = 1 - periodic_power

                if aperiodic_power > 0:
                    hnr_db = 10 * np.log10(periodic_power / aperiodic_power)
                    # Clip to realistic range
                    hnr_values.append(np.clip(hnr_db, 0, 30))

        if hnr_values:
            # Median is more robust to outlier frames than the mean
            return float(np.median(hnr_values))

        # Fallback: spectral flatness converted to an HNR-like scale (inverted)
        flatness = np.mean(librosa.feature.spectral_flatness(y=audio))
        hnr_proxy = (1 - np.clip(flatness, 0, 1)) * 25
        return float(hnr_proxy)

    except Exception as e:
        print(f"HNR extraction failed: {e}")
        return 15.0  # Safe default
|
| 153 |
+
|
| 154 |
+
def classify_noise_type(self, audio: np.ndarray) -> Dict[str, float]:
    """
    V2: Background Noise Classification.

    Scores five noise categories with hand-tuned rules over spectral
    features and normalises the scores so they sum to 1. The result is a
    soft distribution, not a strict one-hot vector; it is one-hot
    (``clean == 1.0``) only on the default paths.

    Features used:
        - Spectral centroid (frequency brightness)
        - Spectral rolloff (computed, not used by any rule)
        - Zero crossing rate (noisiness)
        - Low frequency energy, below 500 Hz (rumble)
        - High frequency energy, above 4000 Hz (hiss)
        - Overall energy (mean squared amplitude)
        - Spectral contrast (texture)

    Args:
        audio: 1-D waveform sampled at ``self.sample_rate``.

    Returns:
        Dict with keys ``'traffic'``, ``'office'``, ``'crowd'``, ``'wind'``,
        ``'clean'`` mapping to Python floats. Audio shorter than 512
        samples, an empty spectrogram, or any error yields the
        "clean only" default.
    """
    labels = ('traffic', 'office', 'crowd', 'wind', 'clean')

    def clean_only() -> Dict[str, float]:
        # Default answer: everything attributed to 'clean'.
        return {label: (1.0 if label == 'clean' else 0.0) for label in labels}

    if len(audio) < 512:
        return clean_only()

    try:
        # Magnitude spectrogram shared by the spectral features below
        S = np.abs(librosa.stft(audio))
        if S.shape[1] == 0:
            return clean_only()

        # Feature 1: Spectral Centroid (brightness)
        centroid = np.mean(librosa.feature.spectral_centroid(S=S, sr=self.sample_rate))

        # Feature 2: Spectral Rolloff (energy concentration); kept for
        # parity with the original feature list, no rule reads it.
        rolloff = np.mean(librosa.feature.spectral_rolloff(S=S, sr=self.sample_rate))

        # Feature 3: Zero Crossing Rate
        zcr = np.mean(librosa.feature.zero_crossing_rate(audio))

        # Features 4-5: band energies. n_fft=2048 is assumed to match the
        # default FFT size of the librosa.stft call above.
        freqs = librosa.fft_frequencies(sr=self.sample_rate, n_fft=2048)
        low_freq_mask = freqs < 500
        low_energy = np.mean(S[low_freq_mask, :]) if np.any(low_freq_mask) else 0
        high_freq_mask = freqs > 4000
        high_energy = np.mean(S[high_freq_mask, :]) if np.any(high_freq_mask) else 0

        # Feature 6: Overall energy
        total_energy = np.mean(audio ** 2)

        # Feature 7: Spectral contrast (texture measure)
        contrast = np.mean(librosa.feature.spectral_contrast(S=S, sr=self.sample_rate))

        scores = dict.fromkeys(labels, 0.0)

        # Traffic: Low frequency dominant + rumble + consistent
        if low_energy > 0.002 and centroid < 2000 and contrast < 20:
            scores['traffic'] = low_energy * 100 + (2500 - centroid) / 1000

        # Office: Mid frequencies + keyboard clicks + air conditioning hum
        if 1500 < centroid < 3500 and 0.0005 < total_energy < 0.005:
            scores['office'] = (3500 - abs(centroid - 2500)) / 1000 + contrast / 30

        # Crowd: High ZCR + varying spectrum + speech-like energy
        if zcr > 0.08 and total_energy > 0.003 and contrast > 15:
            scores['crowd'] = zcr * 10 + total_energy * 50

        # Wind: Very high ZCR + high frequency energy + low contrast
        if zcr > 0.12 and high_energy > 0.001 and contrast < 15:
            scores['wind'] = zcr * 8 + high_energy * 100

        # Clean: Low energy + low ZCR + high contrast (speech only)
        if total_energy < 0.005 and zcr < 0.08 and contrast > 20:
            scores['clean'] = (0.005 - total_energy) * 200 + contrast / 30

        # If all scores are low, default to clean
        if max(scores.values()) < 0.1:
            scores['clean'] = 1.0

        # Normalise; float() strips numpy scalar types from the result
        total = sum(scores.values())
        if total > 0:
            return {label: float(value / total) for label, value in scores.items()}
        return clean_only()

    except Exception as e:
        print(f"Noise classification failed: {e}")
        return clean_only()
|
| 244 |
+
|
| 245 |
+
def extract_speech_rate(self, audio: np.ndarray, transcript: str) -> float:
    """V3: Speech Rate - whitespace-separated words per second of audio.

    Returns 0.0 when the transcript is empty or the audio has no samples.
    """
    if transcript:
        seconds = len(audio) / self.sample_rate
        if seconds != 0:
            return len(transcript.split()) / seconds
    return 0.0
|
| 257 |
+
|
| 258 |
+
def extract_pitch_features(self, audio: np.ndarray) -> Tuple[float, float]:
    """V4-V5: mean and standard deviation of the voiced fundamental frequency.

    Pitch is tracked with ``librosa.pyin`` between the notes C2 and C7 and
    only frames flagged as voiced are used. ``(0.0, 0.0)`` is returned for
    audio shorter than 2048 samples, when no frame is voiced, or on error.
    """
    no_pitch = (0.0, 0.0)
    try:
        if len(audio) < 2048:
            return no_pitch

        # pyin also yields per-frame voicing probabilities; unused here
        f0, voiced_flag, _ = librosa.pyin(
            audio,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=self.sample_rate
        )

        voiced_f0 = f0[voiced_flag]
        if len(voiced_f0) > 0:
            return float(np.mean(voiced_f0)), float(np.std(voiced_f0))
        return no_pitch
    except Exception as e:
        print(f"Pitch extraction failed: {e}")
        return no_pitch
|
| 282 |
+
|
| 283 |
+
def extract_energy_features(self, audio: np.ndarray) -> Tuple[float, float]:
    """V6-V7: mean and standard deviation of the frame-wise RMS energy.

    Returns:
        ``(rms_mean, rms_std)`` as Python floats; ``(0.0, 0.0)`` for empty
        audio or when RMS extraction fails (the failure is logged).
    """
    try:
        if len(audio) == 0:
            return 0.0, 0.0
        rms = librosa.feature.rms(y=audio)[0]
        return float(np.mean(rms)), float(np.std(rms))
    except Exception as e:
        # Was a bare ``except:``, which also trapped KeyboardInterrupt and
        # SystemExit and hid the failure reason.
        print(f"Energy extraction failed: {e}")
        return 0.0, 0.0
|
| 290 |
+
|
| 291 |
+
def extract_pause_features(self, audio: np.ndarray) -> Tuple[float, float, int]:
    """
    V8-V10: Pause Ratio, Average Pause Duration, Mid-Pause Count.

    Uses the Silero VAD timestamp function stored on the instance.

    Args:
        audio: 1-D waveform sampled at ``self.sample_rate``; resampled to
            ``self.vad_sample_rate`` first when the two differ.

    Returns:
        ``(pause_ratio, avg_pause_dur, mid_pause_cnt)`` where
        - pause_ratio is the fraction of samples outside detected speech,
        - avg_pause_dur is the mean gap between consecutive speech
          segments in seconds (0.0 when there is no gap),
        - mid_pause_cnt counts gaps between 0.3 s and 1.0 s inclusive.
        ``(0.0, 0.0, 0)`` is returned when VAD is unavailable, the audio
        is shorter than 512 samples, or resampling / VAD fails.
    """
    no_pauses = (0.0, 0.0, 0)

    if self.vad_model is None or len(audio) < 512:
        return no_pauses

    # Resample for VAD if configured
    if self.vad_sample_rate != self.sample_rate:
        try:
            audio = librosa.resample(audio, orig_sr=self.sample_rate, target_sr=self.vad_sample_rate)
        except Exception as e:
            # Running VAD on audio at the wrong rate would give silently
            # wrong timings, so report the failure and return the defaults.
            print(f"VAD Error: resampling to {self.vad_sample_rate} Hz failed: {e}")
            return no_pauses

    try:
        # Silero expects a Tensor
        wav = torch.tensor(audio, dtype=torch.float32).unsqueeze(0)
        segments = self.get_speech_timestamps(wav, self.vad_model, sampling_rate=self.vad_sample_rate)

        total_samples = len(audio)
        if total_samples == 0:
            return no_pauses

        # Segment boundaries are treated as sample indices
        speech_samples = sum(seg['end'] - seg['start'] for seg in segments)
        pause_ratio = (total_samples - speech_samples) / total_samples

        # Positive gaps between consecutive speech segments, in seconds
        gaps = [
            (following['start'] - current['end']) / self.vad_sample_rate
            for current, following in zip(segments, segments[1:])
            if following['start'] - current['end'] > 0
        ]

        avg_pause_dur = float(np.mean(gaps)) if gaps else 0.0

        # Mid-Pause Count (0.3s - 1.0s)
        mid_pause_cnt = sum(1 for gap in gaps if 0.3 <= gap <= 1.0)

        return float(pause_ratio), float(avg_pause_dur), int(mid_pause_cnt)

    except Exception as e:
        print(f"VAD Error: {e}")
        return no_pauses
|
| 341 |
+
|
| 342 |
+
def extract_all(self, audio: np.ndarray, transcript: str = "") -> Dict[str, float]:
    """Extract the full audio feature set.

    Always produces the 14 base entries (v1, five v2 noise scores, v3-v10).
    When emotion extraction is enabled, the entries returned by the emotion
    extractor are merged in; if that extractor raises, ``v11_emotion_stress``,
    ``v12_emotion_energy`` and ``v13_emotion_valence`` are set to 0.0
    instead. With emotion extraction disabled no emotion keys are added.

    Args:
        audio: 1-D waveform at ``self.sample_rate``; converted to float32.
        transcript: Optional transcript used for the speech-rate feature.

    Returns:
        Dict mapping feature name to value.
    """
    # Accepts any array-like; a float32 ndarray passes through uncopied.
    audio = np.asarray(audio, dtype=np.float32)

    features = {}

    # V1: HNR (IMPROVED from SNR)
    features['v1_snr'] = self.extract_hnr(audio)  # Keep name for compatibility

    # V2: Noise classification (IMPROVED)
    noise_class = self.classify_noise_type(audio)
    for label in ('traffic', 'office', 'crowd', 'wind', 'clean'):
        features[f'v2_noise_{label}'] = noise_class[label]

    # V3: Speech rate
    features['v3_speech_rate'] = self.extract_speech_rate(audio, transcript)

    # V4-V5: Pitch
    features['v4_pitch_mean'], features['v5_pitch_std'] = self.extract_pitch_features(audio)

    # V6-V7: Energy
    features['v6_energy_mean'], features['v7_energy_std'] = self.extract_energy_features(audio)

    # V8-V10: Pause features
    pause_ratio, avg_pause, mid_pause_cnt = self.extract_pause_features(audio)
    features['v8_pause_ratio'] = pause_ratio
    features['v9_avg_pause_dur'] = avg_pause
    features['v10_mid_pause_cnt'] = float(mid_pause_cnt)

    # V11-V13: Emotion features
    if self.use_emotion and self.emotion_extractor is not None:
        try:
            features.update(self.emotion_extractor.extract_all(audio, self.sample_rate))
        except Exception as e:
            # Message prefix was a mis-encoded symbol; use the same
            # "[WARN]" tag as the other log lines in this file.
            print(f"[WARN] Emotion features skipped: {e}")
            # Add zero values for compatibility
            features['v11_emotion_stress'] = 0.0
            features['v12_emotion_energy'] = 0.0
            features['v13_emotion_valence'] = 0.0

    return features
|
| 393 |
+
|
| 394 |
+
def extract_basic(self, audio: np.ndarray, transcript: str = "") -> Dict[str, float]:
    """
    Compute the reduced feature set used for fast decisions.

    Only HNR, speech rate, energy statistics and pause statistics are
    computed; noise classification, pitch and emotion features are left out.
    The audio is converted to float32 before any extractor runs.
    """
    samples = audio if audio.dtype == np.float32 else audio.astype(np.float32)

    hnr = self.extract_hnr(samples)
    words_per_second = self.extract_speech_rate(samples, transcript)
    energy_mean, energy_std = self.extract_energy_features(samples)
    ratio, mean_gap, mid_gaps = self.extract_pause_features(samples)

    return {
        'v1_snr': hnr,  # historical key name; the value is an HNR estimate
        'v3_speech_rate': words_per_second,
        'v6_energy_mean': energy_mean,
        'v7_energy_std': energy_std,
        'v8_pause_ratio': ratio,
        'v9_avg_pause_dur': mean_gap,
        'v10_mid_pause_cnt': float(mid_gaps),
    }
|
| 416 |
+
|
| 417 |
+
|
| 418 |
+
if __name__ == "__main__":
    # Smoke check: constructing the extractor triggers model loading.
    smoke_extractor = AudioFeatureExtractor()
    for status_line in (
        "Audio Feature Extractor initialized successfully",
        "Using HNR instead of SNR for better voice quality measurement",
    ):
        print(status_line)
|
emotion_features.py
ADDED
|
@@ -0,0 +1,411 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Emotion Feature Extractor - Using NeuroByte Models
|
| 3 |
+
Extracts emotion features from audio for busy detection.
|
| 4 |
+
|
| 5 |
+
Uses 3 pre-trained Keras models from NeuroByte-Consulting:
|
| 6 |
+
1. CRNN (Convolutional Recurrent Neural Network) - Best for sequential patterns
|
| 7 |
+
2. Mel Spectrogram CNN - Best for frequency patterns
|
| 8 |
+
3. MFCC CNN - Best for speech characteristics
|
| 9 |
+
|
| 10 |
+
Each model outputs 7 emotion classes: angry, disgust, fear, happy, neutral, sad, surprise
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import numpy as np
|
| 14 |
+
import librosa
|
| 15 |
+
import warnings
|
| 16 |
+
from typing import Dict, Optional
|
| 17 |
+
import os
|
| 18 |
+
|
| 19 |
+
warnings.filterwarnings("ignore")
|
| 20 |
+
|
| 21 |
+
try:
|
| 22 |
+
import tensorflow as tf
|
| 23 |
+
from tensorflow import keras
|
| 24 |
+
TENSORFLOW_AVAILABLE = True
|
| 25 |
+
except ImportError:
|
| 26 |
+
TENSORFLOW_AVAILABLE = False
|
| 27 |
+
print("[WARN] TensorFlow not available. Install with: pip install tensorflow")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class EmotionFeatureExtractor:
|
| 31 |
+
"""Extract emotion features using NeuroByte pre-trained models"""
|
| 32 |
+
|
| 33 |
+
# Emotion labels from the models
|
| 34 |
+
EMOTIONS = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
|
| 35 |
+
|
| 36 |
+
def __init__(self, models_dir: str = None, use_ensemble: bool = True):
    """
    Set up the emotion detector and load whichever NeuroByte models exist.

    Args:
        models_dir: Directory containing the .keras model files.
                   Defaults to 'models' relative to this file.
        use_ensemble: Stored on the instance for use at prediction time.
                     All model files found are loaded regardless of
                     this flag.

    If TensorFlow is missing, or none of the three model files could be
    loaded, ``self.use_tensorflow`` ends up False and ``self.models`` is
    empty.
    """
    if models_dir is None:
        # Default to 'models' folder in same directory as this script
        models_dir = os.path.join(os.path.dirname(__file__), 'models')

    self.models_dir = models_dir
    self.use_ensemble = use_ensemble
    self.models = {}
    self.use_tensorflow = TENSORFLOW_AVAILABLE

    if not TENSORFLOW_AVAILABLE:
        print("[WARN] TensorFlow not installed. Falling back to acoustic features.")
        return

    # (model key, file name) pairs, in loading order
    expected_models = (
        ('crnn', 'emotion_recognition_crnn.keras'),
        ('mel_spec', 'emotion_recognition_mel_spec.keras'),
        ('mfcc', 'emotion_recognition_mfcc.keras'),
    )

    print(f"Loading NeuroByte emotion models from {models_dir}...")

    for model_name, filename in expected_models:
        model_path = os.path.join(models_dir, filename)

        if not os.path.exists(model_path):
            print(f"[WARN] Model not found: {model_path}")
            continue

        try:
            self.models[model_name] = keras.models.load_model(model_path)
            print(f"[OK] Loaded {model_name} model")
        except Exception as e:
            print(f"[WARN] Failed to load {model_name}: {e}")

    if self.models:
        print(f"[OK] {len(self.models)} emotion model(s) loaded successfully")
    else:
        # If no models loaded, fall back to acoustics
        print("[WARN] No models loaded. Using acoustic features fallback.")
        self.use_tensorflow = False
|
| 89 |
+
|
| 90 |
+
def download_models(self):
    """
    Download the NeuroByte models from Hugging Face into ``self.models_dir``.

    Run this once to download the models:
    >>> extractor = EmotionFeatureExtractor()
    >>> extractor.download_models()

    Each file is fetched with ``hf_hub_download`` (using ``self.models_dir``
    as cache directory) and then copied to ``<models_dir>/<filename>``,
    the location the constructor loads from. Failures are reported per
    file and summarised at the end; nothing is raised for a failed
    download. Returns None; does nothing when TensorFlow or
    ``huggingface_hub`` is unavailable.
    """
    if not TENSORFLOW_AVAILABLE:
        print("[WARN] TensorFlow required to download models")
        return

    # Keep the try narrow so only a missing huggingface_hub is reported
    # as such.
    try:
        from huggingface_hub import hf_hub_download
    except ImportError:
        print("[WARN] huggingface_hub not installed. Install with: pip install huggingface_hub")
        return

    import shutil

    os.makedirs(self.models_dir, exist_ok=True)

    repo_id = "neurobyte-org/speech-emotion-recognition"
    model_files = [
        'emotion_recognition_crnn.keras',
        'emotion_recognition_mel_spec.keras',
        'emotion_recognition_mfcc.keras'
    ]

    print(f"Downloading models from {repo_id}...")
    failed = []
    for filename in model_files:
        try:
            print(f"  Downloading {filename}...")
            downloaded_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                cache_dir=self.models_dir
            )

            # Copy to expected location
            target_path = os.path.join(self.models_dir, filename)
            if downloaded_path != target_path:
                shutil.copy(downloaded_path, target_path)

            print(f"  [OK] {filename} downloaded")
        except Exception as e:
            failed.append(filename)
            print(f"  [WARN] Failed to download {filename}: {e}")

    # Only claim success when every file actually arrived.
    if failed:
        print(f"[WARN] {len(failed)} of {len(model_files)} model file(s) could not be downloaded: {', '.join(failed)}")
    else:
        print("[OK] Download complete! Reinitialize the extractor to load models.")
|
| 138 |
+
|
| 139 |
+
def extract_mel_spectrogram(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
    """
    Build the fixed-size log-mel input used by the spectrogram models.

    Steps: resample to 16 kHz if needed, compute a 128-band mel power
    spectrogram (n_fft=2048, hop=512), convert to dB relative to the
    maximum, min-max scale to [0, 1], then zero-pad or truncate along time.

    Returns shape: (216, 128, 1), i.e. (time frames, mel bands, channel).
    216 frames at a 512-sample hop cover about 6.9 s of 16 kHz audio.
    """
    model_rate = 16000
    n_frames = 216

    if sr != model_rate:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=model_rate)
        sr = model_rate

    power = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        n_fft=2048,
        hop_length=512,
        n_mels=128,
        fmin=0,
        fmax=sr/2
    )
    log_mel = librosa.power_to_db(power, ref=np.max)

    # Min-max scaling; the epsilon guards against a constant spectrogram
    scaled = (log_mel - log_mel.min()) / (log_mel.max() - log_mel.min() + 1e-8)

    # (mel, time) -> (time, mel, 1)
    frames = np.expand_dims(scaled.T, axis=-1)

    missing = n_frames - frames.shape[0]
    if missing > 0:
        # Too short: append zero frames at the end
        return np.pad(frames, ((0, missing), (0, 0), (0, 0)), mode='constant')
    # Long enough: keep the first n_frames frames
    return frames[:n_frames, :, :]
|
| 182 |
+
|
| 183 |
+
def extract_mfcc(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
    """
    Build the fixed-size MFCC input used by the mfcc model.

    Steps: resample to 16 kHz if needed, compute 40 MFCCs (n_fft=2048,
    hop=512), standardise with the global mean and standard deviation,
    then zero-pad or truncate along time.

    Returns shape: (216, 40, 1), i.e. (time frames, coefficients, channel).
    """
    model_rate = 16000
    n_frames = 216

    if sr != model_rate:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=model_rate)
        sr = model_rate

    coeffs = librosa.feature.mfcc(
        y=audio,
        sr=sr,
        n_mfcc=40,
        n_fft=2048,
        hop_length=512
    )

    # Zero-mean / unit-variance over the whole matrix; epsilon avoids 0/0
    coeffs = (coeffs - coeffs.mean()) / (coeffs.std() + 1e-8)

    # (coefficient, time) -> (time, coefficient, 1)
    frames = np.expand_dims(coeffs.T, axis=-1)

    missing = n_frames - frames.shape[0]
    if missing > 0:
        return np.pad(frames, ((0, missing), (0, 0), (0, 0)), mode='constant')
    return frames[:n_frames, :, :]
|
| 219 |
+
|
| 220 |
+
def predict_emotions(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]:
    """
    Predict emotion probabilities using loaded models.

    The 'crnn' model is always used when loaded; the 'mel_spec' and 'mfcc'
    models are only used when ``self.use_ensemble`` is set. When more than
    one model produces a prediction the predictions are averaged.

    Falls back to ``self.extract_from_acoustics`` when TensorFlow is not in
    use, when no models are loaded, when none of the loaded models is
    eligible, or when prediction raises.

    Args:
        audio: Audio samples.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        Dictionary with emotion labels as keys and probabilities as values
    """
    if not self.use_tensorflow or len(self.models) == 0:
        return self.extract_from_acoustics(audio, sr)

    try:
        predictions = []

        use_crnn = 'crnn' in self.models
        use_mel = 'mel_spec' in self.models and self.use_ensemble

        # Both mel-based models consume the same input, so build it once.
        if use_crnn or use_mel:
            mel_spec = self.extract_mel_spectrogram(audio, sr)
            mel_spec_batch = np.expand_dims(mel_spec, axis=0)

            # CRNN model (if available)
            if use_crnn:
                predictions.append(
                    self.models['crnn'].predict(mel_spec_batch, verbose=0)[0]
                )

            # Mel Spectrogram model (if available)
            if use_mel:
                predictions.append(
                    self.models['mel_spec'].predict(mel_spec_batch, verbose=0)[0]
                )

        # MFCC model (if available)
        if 'mfcc' in self.models and self.use_ensemble:
            mfcc = self.extract_mfcc(audio, sr)
            mfcc_batch = np.expand_dims(mfcc, axis=0)
            predictions.append(
                self.models['mfcc'].predict(mfcc_batch, verbose=0)[0]
            )

        if predictions:
            # Average predictions if ensemble
            if len(predictions) > 1:
                avg_pred = np.mean(predictions, axis=0)
            else:
                avg_pred = predictions[0]

            # Convert to dictionary
            return {
                emotion: float(prob)
                for emotion, prob in zip(self.EMOTIONS, avg_pred)
            }

    except Exception as e:
        # Best-effort: any model failure degrades to the acoustic heuristic.
        print(f"[WARN] Prediction failed: {e}")

    # No eligible model produced a prediction (or prediction failed).
    return self.extract_from_acoustics(audio, sr)
|
| 271 |
+
|
| 272 |
+
def extract_from_acoustics(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]:
    """
    Fallback: Extract emotion proxies from acoustic features.

    Returns emotion-like scores without deep learning. The scores are
    hand-tuned heuristics over RMS energy, YIN pitch, zero-crossing rate
    and spectral centroid, clamped at zero and normalized to sum to 1.

    A uniform distribution over ``self.EMOTIONS`` is returned when the
    audio is shorter than 512 samples, when the scores cannot be
    normalized, or when feature extraction raises.

    Args:
        audio: Audio samples.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        Dictionary mapping emotion label to a score in [0, 1].
    """
    # Sized from the label list rather than a hard-coded 7.
    uniform = {emotion: 1.0 / len(self.EMOTIONS) for emotion in self.EMOTIONS}

    try:
        if len(audio) < 512:
            return uniform  # Uniform distribution

        # Extract acoustic features
        rms = librosa.feature.rms(y=audio)[0]
        mean_energy = np.mean(rms)
        energy_std = np.std(rms)

        f0 = librosa.yin(audio, fmin=75, fmax=400, sr=sr)
        f0_voiced = f0[f0 > 0]
        has_pitch = len(f0_voiced) > 0
        pitch_mean = np.mean(f0_voiced) if has_pitch else 0
        pitch_std = np.std(f0_voiced) if has_pitch else 0

        zcr = np.mean(librosa.feature.zero_crossing_rate(audio))
        centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr))

        # Heuristic mapping to emotions
        raw_scores = {
            'angry': (energy_std * 10 + pitch_std / 50) / 2,
            'disgust': (pitch_mean / 300) * 0.3,
            'fear': (pitch_mean / 250 + zcr * 5) / 2,
            'happy': (centroid / 3000 + mean_energy * 5) / 2,
            'neutral': 0.3,  # Baseline
            'sad': (1 - centroid / 4000) * 0.5,
            'surprise': (energy_std * 8 + zcr * 3) / 2
        }

        # 'sad' turns negative once the centroid exceeds 4000 Hz; clamp so
        # the normalized result never contains negative "probabilities".
        scores = {k: max(float(v), 0.0) for k, v in raw_scores.items()}

        # Normalize to sum to 1
        total = sum(scores.values())
        if not np.isfinite(total) or total <= 0:
            return uniform
        return {k: v / total for k, v in scores.items()}

    except Exception as e:
        # Best-effort fallback of the fallback: never raise to the caller.
        print(f"[WARN] Acoustic fallback failed: {e}")
        return uniform
|
| 314 |
+
|
| 315 |
+
def extract_all(self, audio: np.ndarray, sr: int = 16000) -> Dict[str, float]:
    """
    Extract emotion features for busy detection.

    The audio is converted to float32, passed to ``self.predict_emotions``
    and the resulting probabilities are combined into three scores, each
    clipped to [0, 1]. Missing emotion labels count as 0.0.

    Args:
        audio: Audio samples (ndarray or any array-like of numbers).
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        v11_emotion_stress: 0.5*angry + 0.3*fear + 0.2*disgust
        v12_emotion_energy: 0.4*happy + 0.3*surprise + 0.3*angry
        v13_emotion_valence: (happy + 0.5*surprise - sad - 0.5*angry + 1) / 2
    """
    # asarray also accepts plain sequences, which have no ``.dtype``;
    # float32 arrays pass through without a copy.
    audio = np.asarray(audio, dtype=np.float32)

    # Get emotion predictions
    emotion_probs = self.predict_emotions(audio, sr)
    angry = emotion_probs.get('angry', 0.0)
    happy = emotion_probs.get('happy', 0.0)
    surprise = emotion_probs.get('surprise', 0.0)

    # Map emotions to features
    stress = (
        angry * 0.5 +
        emotion_probs.get('fear', 0.0) * 0.3 +
        emotion_probs.get('disgust', 0.0) * 0.2
    )

    energy = (
        happy * 0.4 +
        surprise * 0.3 +
        angry * 0.3
    )

    valence = (
        happy +
        surprise * 0.5 -
        emotion_probs.get('sad', 0.0) -
        angry * 0.5
    )

    # Normalize valence from [-1, 1] to [0, 1]
    valence = (valence + 1.0) / 2.0

    return {
        'v11_emotion_stress': float(np.clip(stress, 0, 1)),
        'v12_emotion_energy': float(np.clip(energy, 0, 1)),
        'v13_emotion_valence': float(np.clip(valence, 0, 1))
    }
|
| 358 |
+
|
| 359 |
+
|
| 360 |
+
# Standalone test
|
| 361 |
+
if __name__ == "__main__":
    import time

    print("Testing NeuroByte Emotion Feature Extractor...")

    # Initialize extractor
    extractor = EmotionFeatureExtractor(
        models_dir="models_cache/emotion_models",
        use_ensemble=True
    )

    # If no models were loaded, only tell the user how to fetch them; this
    # script does not download anything itself.
    if not extractor.use_tensorflow or len(extractor.models) == 0:
        print("\nModels not found. Download them with:")
        print(" extractor.download_models()")
        print("\nUsing acoustic fallback for now...")

    # Generate test audio
    duration = 3
    sr = 16000
    t = np.linspace(0, duration, sr * duration)

    # Test 1: Stressed voice (high pitch, varying) - two tones plus noise
    audio_stressed = np.sin(2 * np.pi * 300 * t) + 0.5 * np.sin(2 * np.pi * 150 * t)
    audio_stressed += 0.2 * np.random.randn(len(audio_stressed))

    # Test 2: Calm voice (low pitch, steady) - a single quiet tone
    audio_calm = np.sin(2 * np.pi * 150 * t) * 0.3

    test_cases = [
        ("\n1. Testing with stressed audio:", audio_stressed),
        ("\n2. Testing with calm audio:", audio_calm),
    ]

    # Same measure-and-report routine for every synthetic signal.
    for heading, samples in test_cases:
        print(heading)
        start = time.perf_counter()
        features = extractor.extract_all(samples, sr)
        print(f" Time: {(time.perf_counter() - start)*1000:.0f}ms")
        print(" Features:")
        for k, v in features.items():
            print(f" {k}: {v:.3f}")

    print("\n[OK] Tests complete!")

    if extractor.use_tensorflow and len(extractor.models) > 0:
        print(f"\nUsing {len(extractor.models)} NeuroByte model(s)")
    else:
        print("\nUsing acoustic features fallback")
|
handler.py
CHANGED
|
@@ -22,276 +22,56 @@ warnings.filterwarnings("ignore")
|
|
| 22 |
|
| 23 |
|
| 24 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ #
|
| 25 |
-
#
|
| 26 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ #
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
"
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
|
|
|
| 39 |
|
| 40 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ #
|
| 41 |
-
#
|
| 42 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ #
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
self.model.classifier = nn.Identity()
|
| 50 |
-
self.model.eval()
|
| 51 |
-
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 52 |
-
if self.device == "cuda":
|
| 53 |
-
self.model = self.model.cuda()
|
| 54 |
-
|
| 55 |
-
def audio_to_spectrogram(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
|
| 56 |
-
mel_spec = librosa.feature.melspectrogram(
|
| 57 |
-
y=audio, sr=sr, n_fft=512, hop_length=64, n_mels=128, fmin=0, fmax=sr / 2
|
| 58 |
-
)
|
| 59 |
-
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
|
| 60 |
-
mel_spec_db = np.clip(mel_spec_db, -80, 0)
|
| 61 |
-
mel_spec_norm = (mel_spec_db + 80) / 80
|
| 62 |
-
|
| 63 |
-
try:
|
| 64 |
-
from skimage.transform import resize
|
| 65 |
-
mel_resized = resize(mel_spec_norm, (224, 224), mode="constant")
|
| 66 |
-
except ImportError:
|
| 67 |
-
# Fallback: resizing with numpy interpolation (nearest neighbor for rows, linear for cols)
|
| 68 |
-
target_h, target_w = 224, 224
|
| 69 |
-
source_h, source_w = mel_spec_norm.shape
|
| 70 |
-
|
| 71 |
-
if source_h > 0 and source_w > 0:
|
| 72 |
-
# 1. Resize height (rows)
|
| 73 |
-
row_indices = np.linspace(0, source_h - 1, target_h).astype(int)
|
| 74 |
-
# Select rows (nearest neighbor)
|
| 75 |
-
temp = mel_spec_norm[row_indices, :]
|
| 76 |
-
|
| 77 |
-
# 2. Resize width (cols)
|
| 78 |
-
mel_resized = np.zeros((target_h, target_w), dtype=mel_spec_norm.dtype)
|
| 79 |
-
x_source = np.arange(source_w)
|
| 80 |
-
x_target = np.linspace(0, source_w - 1, target_w)
|
| 81 |
-
|
| 82 |
-
for i in range(target_h):
|
| 83 |
-
mel_resized[i, :] = np.interp(x_target, x_source, temp[i, :])
|
| 84 |
else:
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
except (ImportError, Exception):
|
| 92 |
-
# Fallback: stack grayscale into 3 channels
|
| 93 |
-
rgb = np.stack([mel_resized] * 3, axis=-1)
|
| 94 |
-
|
| 95 |
-
return np.transpose(rgb, (2, 0, 1)).astype(np.float32)
|
| 96 |
-
|
| 97 |
-
def extract_embedding(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
    """
    Run the spectrogram of ``audio`` through ``self.model`` and return the
    flattened output as a NumPy vector.

    Args:
        audio: Audio samples.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        1-D embedding. On any failure a warning is printed and a
        576-element float32 zero vector is returned instead of raising.
    """
    try:
        spec_rgb = self.audio_to_spectrogram(audio, sr)
        # Add the batch dimension and move the input to the model's device.
        batch = torch.from_numpy(spec_rgb).unsqueeze(0).to(self.device)
        with torch.no_grad():
            emb = self.model(batch)
        return emb.cpu().numpy().flatten()
    except Exception as e:
        print(f"[WARN] EmotionCNN embedding extraction failed: {e}")
        # MobileNetV3-small output size; float32 to match the float32
        # spectrogram that is fed to the model on the normal path.
        return np.zeros(576, dtype=np.float32)
|
| 109 |
|
| 110 |
|
| 111 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ #
|
| 112 |
-
# Audio Feature Extractor (mirrors src/audio_features.py)
|
| 113 |
-
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ #
|
| 114 |
-
|
| 115 |
-
class AudioFeatureExtractorEndpoint:
    """Stateless audio feature extraction for HF endpoint.

    Produces the v1..v13 voice features: SNR, noise-type scores, speech
    rate, pitch, energy, pause statistics (Silero VAD) and emotion scores
    derived from an ``EmotionCNN`` embedding. Audio is assumed to be
    sampled at ``self.sr`` (16 kHz).
    """

    def __init__(self):
        self.sr = 16000
        self.emotion_cnn = EmotionCNN()

        # Set up-front so the attributes exist even when loading fails.
        self.vad_model = None
        self.vad_utils = None
        self.get_speech_timestamps = None

        # Load Silero VAD - optimized for CPU-only HF Spaces
        try:
            # Force CPU mode (HF Free Spaces don't have GPU)
            torch.set_num_threads(1)

            # Load from torch.hub (most reliable method)
            print("[INFO] Loading Silero VAD from torch.hub...")
            vad_model, vad_utils = torch.hub.load(
                repo_or_dir='snakers4/silero-vad',
                model='silero_vad',
                force_reload=False,
                trust_repo=True,
                verbose=False
            )

            # Force model to CPU
            vad_model = vad_model.cpu()
            vad_model.eval()

            self.vad_model = vad_model
            self.vad_utils = vad_utils
            # Extract the get_speech_timestamps utility
            self.get_speech_timestamps = vad_utils[0]

            print("[OK] Silero VAD loaded successfully (CPU mode)")

        except Exception as e:
            # Best-effort: pause features degrade to defaults without VAD.
            print(f"[WARN] Silero VAD failed to load: {e}")
            print("       Audio features will use fallback values for pause detection")
            self.vad_model = None
            self.get_speech_timestamps = None

    # -------- V1: SNR --------
    def extract_snr(self, audio: np.ndarray) -> float:
        """Estimate SNR in dB (clipped to [-10, 40]) from frame energies.

        The quietest 20% of frames are treated as the noise floor.
        Returns 0.0 when there is too little audio to form two frames.
        """
        # A single sample would give hop_length == 0 below, which
        # librosa.util.frame rejects.
        if len(audio) < 2:
            return 0.0
        frame_length = min(2048, len(audio))
        frames = librosa.util.frame(audio, frame_length=frame_length, hop_length=frame_length // 2)
        frame_energy = np.sum(frames ** 2, axis=0)
        if len(frame_energy) < 2:
            return 0.0
        sorted_energy = np.sort(frame_energy)
        n_noise = max(1, len(sorted_energy) // 5)
        noise_floor = np.mean(sorted_energy[:n_noise])
        signal_power = np.mean(sorted_energy)
        if noise_floor <= 0:
            # Perfectly silent noise floor: report the maximum SNR.
            return 40.0
        snr = 10 * np.log10(signal_power / noise_floor + 1e-10)
        return float(np.clip(snr, -10, 40))

    # -------- V2: Noise classification --------
    def classify_noise_type(self, audio: np.ndarray) -> Dict[str, float]:
        """Score traffic/office/crowd/wind/clean noise from band energy ratios.

        Audio shorter than one 2048-sample FFT window is reported as clean.
        """
        if len(audio) < 2048:
            return {
                "v2_noise_traffic": 0.0, "v2_noise_office": 0.0,
                "v2_noise_crowd": 0.0, "v2_noise_wind": 0.0, "v2_noise_clean": 1.0,
            }
        spec = np.abs(librosa.stft(audio, n_fft=2048))
        freq_bins = librosa.fft_frequencies(sr=self.sr, n_fft=2048)

        # Mean magnitude in low / mid / high frequency bands (Hz).
        low = np.mean(spec[(freq_bins >= 50) & (freq_bins <= 500)])
        mid = np.mean(spec[(freq_bins >= 500) & (freq_bins <= 2000)])
        high = np.mean(spec[(freq_bins >= 2000) & (freq_bins <= 6000)])
        total = low + mid + high + 1e-10

        low_r, mid_r, high_r = low / total, mid / total, high / total
        spectral_centroid = float(np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sr)))
        spectral_flatness = float(np.mean(librosa.feature.spectral_flatness(y=audio)))

        noise = {
            "v2_noise_traffic": float(np.clip(low_r * 2 - 0.3, 0, 1)),
            "v2_noise_office": float(np.clip(mid_r * 1.5 - 0.2, 0, 1) if spectral_flatness > 0.01 else 0),
            "v2_noise_crowd": float(np.clip(mid_r * 2 - 0.5, 0, 1) if spectral_centroid > 1500 else 0),
            "v2_noise_wind": float(np.clip(low_r * 3 - 0.8, 0, 1) if spectral_flatness > 0.1 else 0),
        }
        # "Clean" is whatever is left after the strongest noise score.
        noise["v2_noise_clean"] = float(np.clip(1 - max(noise.values()), 0, 1))
        return noise

    # -------- V3: Speech rate --------
    def extract_speech_rate(self, audio: np.ndarray, transcript: str) -> float:
        """Return words per second; 0.0 for an empty transcript or empty audio."""
        if not transcript:
            return 0.0
        word_count = len(transcript.split())
        duration = len(audio) / self.sr
        if duration == 0:
            return 0.0
        return float(word_count / duration)

    # -------- V4-V5: Pitch --------
    def extract_pitch_features(self, audio: np.ndarray) -> Dict[str, float]:
        """Return mean/std of piptrack pitches above the median magnitude.

        Both values are 0.0 when no pitch is found or extraction fails.
        """
        no_pitch = {"v4_pitch_mean": 0.0, "v5_pitch_std": 0.0}
        try:
            pitches, magnitudes = librosa.piptrack(y=audio, sr=self.sr)
            pitch_values = pitches[magnitudes > np.median(magnitudes)]
            pitch_values = pitch_values[pitch_values > 0]
            if len(pitch_values) == 0:
                return no_pitch
            return {
                "v4_pitch_mean": float(np.mean(pitch_values)),
                "v5_pitch_std": float(np.std(pitch_values)),
            }
        except Exception:
            return no_pitch

    # -------- V6-V7: Energy --------
    def extract_energy_features(self, audio: np.ndarray) -> Dict[str, float]:
        """Return mean and standard deviation of the frame-wise RMS energy."""
        rms = librosa.feature.rms(y=audio)[0]
        return {"v6_energy_mean": float(np.mean(rms)), "v7_energy_std": float(np.std(rms))}

    # -------- V8-V10: Pause features (Silero VAD) --------
    def extract_pause_features(self, audio: np.ndarray) -> Dict[str, float]:
        """Derive pause ratio, mean pause duration and mid-pause count from VAD.

        Defaults (all zero) are returned when VAD is unavailable, the audio
        is shorter than one second, or VAD raises.
        """
        defaults = {"v8_pause_ratio": 0.0, "v9_avg_pause_dur": 0.0, "v10_mid_pause_cnt": 0}
        if self.vad_model is None or len(audio) < self.sr:
            return defaults
        try:
            audio_tensor = torch.FloatTensor(audio)
            timestamps = self.get_speech_timestamps(audio_tensor, self.vad_model, sampling_rate=self.sr)
            if not timestamps:
                # No speech at all: the whole clip is one pause.
                return {"v8_pause_ratio": 1.0, "v9_avg_pause_dur": len(audio) / self.sr, "v10_mid_pause_cnt": 0}

            total_speech = sum(t["end"] - t["start"] for t in timestamps)
            total_samples = len(audio)
            pause_ratio = 1.0 - (total_speech / total_samples)

            # Gaps between consecutive speech segments, in seconds;
            # gaps of 0.1 s or less are not counted as pauses.
            pauses = []
            for prev, cur in zip(timestamps, timestamps[1:]):
                gap = (cur["start"] - prev["end"]) / self.sr
                if gap > 0.1:
                    pauses.append(gap)

            return {
                "v8_pause_ratio": float(np.clip(pause_ratio, 0, 1)),
                "v9_avg_pause_dur": float(np.mean(pauses)) if pauses else 0.0,
                "v10_mid_pause_cnt": len([p for p in pauses if 0.3 < p < 2.0]),
            }
        except Exception:
            return defaults

    # -------- V11-V13: Emotion features --------
    def extract_emotion_features(self, audio: np.ndarray) -> Dict[str, float]:
        """Map fixed positions of the EmotionCNN embedding to three scores.

        All three scores are 0.0 when embedding extraction or indexing fails.
        """
        try:
            embedding = self.emotion_cnn.extract_embedding(audio, self.sr)
            stress_indices = [0, 100, 200, 300, 400]
            stress_values = embedding[stress_indices]
            stress_score = float(np.clip(np.mean(np.abs(stress_values)), 0, 1))

            # With the 576-element fallback-sized embedding the [700:800]
            # slice is empty and np.mean would yield NaN; report 0.0
            # instead, which is what extract_all's sanitizer produced.
            energy_part = embedding[500:600]
            valence_part = embedding[700:800]
            return {
                "v11_emotion_stress": stress_score,
                "v12_emotion_energy": float(np.mean(np.abs(energy_part))) if energy_part.size else 0.0,
                "v13_emotion_valence": float(np.mean(valence_part)) if valence_part.size else 0.0,
            }
        except Exception:
            return {"v11_emotion_stress": 0.0, "v12_emotion_energy": 0.0, "v13_emotion_valence": 0.0}

    # -------- Main: extract all --------
    def extract_all(self, audio: np.ndarray, transcript: str = "") -> Dict[str, float]:
        """Run every extractor and return one flat, JSON-safe feature dict."""
        features = {}
        features["v1_snr"] = self.extract_snr(audio)
        features.update(self.classify_noise_type(audio))
        features["v3_speech_rate"] = self.extract_speech_rate(audio, transcript)
        features.update(self.extract_pitch_features(audio))
        features.update(self.extract_energy_features(audio))
        features.update(self.extract_pause_features(audio))
        features.update(self.extract_emotion_features(audio))

        # Sanitize: replace NaN/Inf with 0.0 (prevents JSON serialization errors)
        # and convert NumPy scalars to native Python numbers.
        for key, val in features.items():
            if isinstance(val, (float, np.floating)):
                features[key] = float(val) if np.isfinite(val) else 0.0
            elif isinstance(val, (int, np.integer)):
                features[key] = int(val)

        return features
|
| 294 |
-
|
| 295 |
|
| 296 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ #
|
| 297 |
# FastAPI handler for deployment (HF Spaces / Cloud Run / Lambda)
|
|
@@ -323,7 +103,7 @@ async def global_exception_handler(request: Request, exc: Exception):
|
|
| 323 |
content={**DEFAULT_AUDIO_FEATURES, "_error": str(exc), "_handler": "global"},
|
| 324 |
)
|
| 325 |
|
| 326 |
-
|
| 327 |
|
| 328 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ #
|
| 329 |
# Constants & Defaults
|
|
@@ -345,7 +125,13 @@ async def root():
|
|
| 345 |
|
| 346 |
@app.get("/health")
|
| 347 |
async def health():
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
|
| 350 |
|
| 351 |
@app.post("/extract-audio-features")
|
|
@@ -353,9 +139,13 @@ async def extract_audio_features(audio: UploadFile = File(...), transcript: str
|
|
| 353 |
"""Extract all 17 voice features from uploaded audio file."""
|
| 354 |
try:
|
| 355 |
audio_bytes = await audio.read()
|
|
|
|
| 356 |
y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
|
|
|
|
|
|
|
| 357 |
features = extractor.extract_all(y, transcript)
|
| 358 |
-
|
|
|
|
| 359 |
except Exception as e:
|
| 360 |
print(f"[ERROR] extract_audio_features: {e}")
|
| 361 |
traceback.print_exc()
|
|
@@ -403,7 +193,7 @@ async def extract_audio_features_base64(data: AudioBase64Request):
|
|
| 403 |
|
| 404 |
features = extractor.extract_all(y, transcript)
|
| 405 |
print(f"[OK] Extracted {len(features)} audio features")
|
| 406 |
-
return features
|
| 407 |
except Exception as e:
|
| 408 |
print(f"[ERROR] extract_audio_features_base64: {e}")
|
| 409 |
traceback.print_exc()
|
|
@@ -416,3 +206,4 @@ if __name__ == "__main__":
|
|
| 416 |
import os
|
| 417 |
port = int(os.environ.get("PORT", 7860))
|
| 418 |
uvicorn.run(app, host="0.0.0.0", port=port)
|
|
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ #
|
| 25 |
+
# Imports from standardized modules
|
| 26 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ #
|
| 27 |
+
try:
    from audio_features import AudioFeatureExtractor
except ImportError:
    # Fallback if running from a different context: make the current
    # working directory importable and retry.
    import sys
    sys.path.append('.')
    from audio_features import AudioFeatureExtractor

# Initialize global extractor
# We use a global instance to cache models (VAD, Emotion)
print("[INFO] Initializing Global AudioFeatureExtractor...")
extractor = AudioFeatureExtractor(
    sample_rate=16000,
    use_emotion=True,
    models_dir="models"  # Dockerfile should place models here or download them
)

# Ensure models are downloaded/ready
if extractor.use_emotion and extractor.emotion_extractor:
    print("[INFO] Checking for emotion models...")
    # Trigger download if needed/possible
    try:
        emotion_extractor = extractor.emotion_extractor
        if len(emotion_extractor.models) == 0:
            print("[INFO] Models not found, attempting download...")
            emotion_extractor.download_models()
            # Re-run the constructor so the freshly downloaded models get
            # loaded. Carry over the ensemble setting when the instance
            # exposes one, so the reload does not silently reset it to the
            # constructor default.
            reinit_kwargs = {"models_dir": emotion_extractor.models_dir}
            if hasattr(emotion_extractor, "use_ensemble"):
                reinit_kwargs["use_ensemble"] = emotion_extractor.use_ensemble
            emotion_extractor.__init__(**reinit_kwargs)
    except Exception as e:
        # Best-effort: the service still starts without downloaded models.
        print(f"[WARN] Failed to download emotion models: {e}")
|
| 56 |
|
| 57 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ #
|
| 58 |
+
# Helper to handle NaN/Inf for JSON
|
| 59 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ #
|
| 60 |
+
def sanitize_features(features: Dict[str, float]) -> Dict[str, float]:
    """
    Return a copy of ``features`` that is safe to serialize as JSON.

    - NaN / +-Inf floats become 0.0.
    - NumPy floating and integer scalars become native ``float`` / ``int``.
    - Booleans (Python or NumPy) stay booleans instead of turning into 0/1.
    - Anything else (strings, ...) is passed through unchanged.

    Args:
        features: Mapping of feature name to value.

    Returns:
        A new dict; the input mapping is not modified.
    """
    sanitized = {}
    for key, val in features.items():
        # bool is a subclass of int, so it must be checked first; np.bool_
        # is neither int nor float and json cannot serialize it as-is.
        if isinstance(val, (bool, np.bool_)):
            sanitized[key] = bool(val)
        elif isinstance(val, (float, np.floating)):
            sanitized[key] = float(val) if np.isfinite(val) else 0.0
        elif isinstance(val, (int, np.integer)):
            sanitized[key] = int(val)
        else:
            sanitized[key] = val  # keep string/other as is
    return sanitized
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ #
|
| 77 |
# FastAPI handler for deployment (HF Spaces / Cloud Run / Lambda)
|
|
|
|
| 103 |
content={**DEFAULT_AUDIO_FEATURES, "_error": str(exc), "_handler": "global"},
|
| 104 |
)
|
| 105 |
|
| 106 |
+
# Extractor is already initialized globally above
|
| 107 |
|
| 108 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ #
|
| 109 |
# Constants & Defaults
|
|
|
|
| 125 |
|
| 126 |
@app.get("/health")
async def health():
    """Report liveness and whether the VAD and emotion extractors exist."""
    vad_loaded = extractor.vad_model is not None

    # The emotion extractor only counts as loaded when emotion is enabled.
    if extractor.use_emotion:
        emotion_loaded = extractor.emotion_extractor is not None
    else:
        emotion_loaded = False

    return {
        "status": "healthy",
        "vad_loaded": vad_loaded,
        "emotion_loaded": emotion_loaded
    }
|
| 135 |
|
| 136 |
|
| 137 |
@app.post("/extract-audio-features")
|
|
|
|
| 139 |
"""Extract all 17 voice features from uploaded audio file."""
|
| 140 |
try:
|
| 141 |
audio_bytes = await audio.read()
|
| 142 |
+
# librosa.load returns (audio, sr)
|
| 143 |
y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
|
| 144 |
+
|
| 145 |
+
# AudioFeatureExtractor.extract_all expects numpy array and optional transcript
|
| 146 |
features = extractor.extract_all(y, transcript)
|
| 147 |
+
|
| 148 |
+
return sanitize_features(features)
|
| 149 |
except Exception as e:
|
| 150 |
print(f"[ERROR] extract_audio_features: {e}")
|
| 151 |
traceback.print_exc()
|
|
|
|
| 193 |
|
| 194 |
features = extractor.extract_all(y, transcript)
|
| 195 |
print(f"[OK] Extracted {len(features)} audio features")
|
| 196 |
+
return sanitize_features(features)
|
| 197 |
except Exception as e:
|
| 198 |
print(f"[ERROR] extract_audio_features_base64: {e}")
|
| 199 |
traceback.print_exc()
|
|
|
|
| 206 |
import os
|
| 207 |
port = int(os.environ.get("PORT", 7860))
|
| 208 |
uvicorn.run(app, host="0.0.0.0", port=port)
|
| 209 |
+
|
requirements.txt
CHANGED
|
@@ -4,15 +4,17 @@ soundfile==0.12.1
|
|
| 4 |
numpy==1.24.3
|
| 5 |
scipy==1.11.2
|
| 6 |
|
| 7 |
-
# ML - CPU-only versions (
|
|
|
|
| 8 |
--extra-index-url https://download.pytorch.org/whl/cpu
|
| 9 |
torch==2.1.0+cpu
|
| 10 |
-
torchvision==0.16.0+cpu
|
| 11 |
torchaudio==2.1.0+cpu
|
| 12 |
|
|
|
|
|
|
|
|
|
|
| 13 |
# API
|
| 14 |
fastapi==0.95.2
|
| 15 |
uvicorn==0.22.0
|
| 16 |
python-multipart==0.0.6
|
| 17 |
huggingface_hub>=0.19.0
|
| 18 |
-
scikit-image>=0.21.0
|
|
|
|
| 4 |
numpy==1.24.3
|
| 5 |
scipy==1.11.2
|
| 6 |
|
| 7 |
+
# ML - CPU-only versions (HF Spaces friendly)
|
| 8 |
+
# Torch for Silero VAD
|
| 9 |
--extra-index-url https://download.pytorch.org/whl/cpu
|
| 10 |
torch==2.1.0+cpu
|
|
|
|
| 11 |
torchaudio==2.1.0+cpu
|
| 12 |
|
| 13 |
+
# TensorFlow for Emotion Models
|
| 14 |
+
tensorflow-cpu==2.15.0
|
| 15 |
+
|
| 16 |
# API
|
| 17 |
fastapi==0.95.2
|
| 18 |
uvicorn==0.22.0
|
| 19 |
python-multipart==0.0.6
|
| 20 |
huggingface_hub>=0.19.0
|
|
|