EurekaPotato committed on
Commit
55e88f4
·
verified ·
1 Parent(s): 3a6257a

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. Dockerfile +18 -0
  2. README.md +29 -10
  3. handler.py +274 -0
  4. requirements.txt +16 -0
Dockerfile ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

WORKDIR /app

# System dependencies for audio processing:
# libsndfile1 is required by the soundfile package; ffmpeg lets librosa
# decode non-WAV formats. Clearing the apt lists keeps the layer small.
RUN apt-get update && apt-get install -y \
    libsndfile1 \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying source so this (slow) layer
# is cached across code-only changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY handler.py .

# 7860 matches the README's `app_port` (HF Spaces convention).
EXPOSE 7860

CMD ["python", "handler.py"]
README.md CHANGED
@@ -1,10 +1,29 @@
1
- ---
2
- title: Busy Module Audio
3
- emoji: 🐨
4
- colorFrom: yellow
5
- colorTo: yellow
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Busy Module Audio Features
3
+ emoji: 🎀
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ ---
10
+
11
+ # Audio Feature Extraction API
12
+
13
+ Extracts 17 voice features from audio: SNR, noise classification, speech rate, pitch, energy, pause analysis, and emotion features.
14
+
15
+ ## API
16
+
17
+ **POST** `/extract-audio-features-base64`
18
+ ```json
19
+ {
20
+ "audio_base64": "<base64-encoded-wav>",
21
+ "transcript": "I'm driving right now"
22
+ }
23
+ ```
24
+
25
+ **POST** `/extract-audio-features` (multipart form)
26
+ - `audio`: audio file upload
27
+ - `transcript`: text transcript
28
+
29
+ **GET** `/health`
handler.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio Feature Extraction β€” Hugging Face Inference Endpoint Handler
3
+
4
+ Extracts all 17 voice features from uploaded audio:
5
+ v1_snr, v2_noise_* (5), v3_speech_rate, v4/v5_pitch, v6/v7_energy,
6
+ v8/v9/v10_pause, v11/v12/v13_emotion
7
+
8
+ Derived from: src/audio_features.py, src/emotion_features.py
9
+ """
10
+
11
+ import io
12
+ import numpy as np
13
+ import librosa
14
+ from scipy import signal as scipy_signal
15
+ from typing import Dict
16
+ import torch
17
+ import torch.nn as nn
18
+ from torchvision import models
19
+ import warnings
20
+
21
+ warnings.filterwarnings("ignore")
22
+
23
+
24
+ # ──────────────────────────────────────────────────────────────────────── #
25
+ # Emotion CNN (mirrors src/emotion_features.py EmotionCNN)
26
+ # ──────────────────────────────────────────────────────────────────────── #
27
+
28
class EmotionCNN:
    """Lightweight CNN for emotion embedding from spectrograms (MobileNetV3).

    An ImageNet-pretrained MobileNetV3-Small backbone is used as a frozen
    feature extractor: the classifier head is replaced with Identity so the
    forward pass returns the pooled embedding vector.
    """

    def __init__(self):
        # `weights=` is the current torchvision API; `pretrained=True` has
        # been deprecated since torchvision 0.13. DEFAULT resolves to the
        # same ImageNet weights `pretrained=True` loaded.
        self.model = models.mobilenet_v3_small(
            weights=models.MobileNet_V3_Small_Weights.DEFAULT
        )
        self.model.classifier = nn.Identity()
        self.model.eval()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if self.device == "cuda":
            self.model = self.model.cuda()

    def audio_to_spectrogram(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
        """Render a mono waveform as a (3, 224, 224) float32 RGB mel spectrogram.

        The mel spectrogram is clipped to [-80, 0] dB, normalized to [0, 1],
        resized to the CNN's 224x224 input, and colorized with the "jet"
        colormap to produce 3 channels.
        """
        mel_spec = librosa.feature.melspectrogram(
            y=audio, sr=sr, n_fft=512, hop_length=64, n_mels=128, fmin=0, fmax=sr / 2
        )
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        mel_spec_db = np.clip(mel_spec_db, -80, 0)
        mel_spec_norm = (mel_spec_db + 80) / 80  # map [-80, 0] dB -> [0, 1]

        # Lazy imports keep module import light; only needed on this path.
        from skimage.transform import resize
        mel_resized = resize(mel_spec_norm, (224, 224), mode="constant")

        # `matplotlib.colormaps[...]` replaces `cm.get_cmap`, which was
        # deprecated in matplotlib 3.7 and removed in 3.9.
        from matplotlib import colormaps
        rgb = colormaps["jet"](mel_resized)[:, :, :3]  # drop alpha channel
        return np.transpose(rgb, (2, 0, 1)).astype(np.float32)

    def extract_embedding(self, audio: np.ndarray, sr: int = 16000) -> np.ndarray:
        """Return the flattened CNN embedding for one waveform (no grad)."""
        spec_rgb = self.audio_to_spectrogram(audio, sr)
        tensor = torch.from_numpy(spec_rgb).unsqueeze(0)
        if self.device == "cuda":
            tensor = tensor.cuda()
        with torch.no_grad():
            emb = self.model(tensor)
        return emb.cpu().numpy().flatten()
63
+
64
+
65
+ # ──────────────────────────────────────────────────────────────────────── #
66
+ # Audio Feature Extractor (mirrors src/audio_features.py)
67
+ # ──────────────────────────────────────────────────────────────────────── #
68
+
69
class AudioFeatureExtractorEndpoint:
    """Stateless audio feature extraction for HF endpoint.

    Computes the 17 voice features from a 16 kHz mono waveform and an
    optional transcript:

      v1_snr, v2_noise_* (5), v3_speech_rate, v4/v5_pitch, v6/v7_energy,
      v8/v9/v10_pause, v11/v12/v13_emotion

    Heavy models (emotion CNN, Silero VAD) are loaded once in __init__.
    """

    def __init__(self):
        # Every extractor below assumes 16 kHz mono input.
        self.sr = 16000
        self.emotion_cnn = EmotionCNN()

        # Load Silero VAD via torch.hub. On failure (no network, hub change)
        # pause features degrade to defaults instead of crashing startup.
        try:
            self.vad_model, self.vad_utils = torch.hub.load(
                repo_or_dir="snakers4/silero-vad", model="silero_vad", trust_repo=True
            )
            # utils tuple order: (get_speech_timestamps, save_audio, read_audio, ...)
            self.get_speech_timestamps = self.vad_utils[0]
            print("✓ Silero VAD loaded")
        except Exception as e:
            print(f"⚠ Silero VAD failed: {e}")
            self.vad_model = None

    # -------- V1: SNR --------
    def extract_snr(self, audio: np.ndarray) -> float:
        """Estimate SNR (dB): mean frame energy over the quietest 20% of
        frames, clipped to [-10, 40].

        Fix: guard len(audio) < 2 — a 1-sample input made frame_length 1 and
        hop_length 0, which librosa.util.frame rejects with an uncaught
        ValueError.
        """
        if len(audio) < 2:
            return 0.0
        frame_length = min(2048, len(audio))
        frames = librosa.util.frame(audio, frame_length=frame_length, hop_length=frame_length // 2)
        frame_energy = np.sum(frames ** 2, axis=0)
        if len(frame_energy) < 2:
            return 0.0
        sorted_energy = np.sort(frame_energy)
        n_noise = max(1, len(sorted_energy) // 5)  # quietest 20% ~ noise floor
        noise_floor = np.mean(sorted_energy[:n_noise])
        signal_power = np.mean(sorted_energy)
        if noise_floor <= 0:
            return 40.0  # digital silence in noise frames -> maximally clean
        snr = 10 * np.log10(signal_power / noise_floor + 1e-10)
        return float(np.clip(snr, -10, 40))

    # -------- V2: Noise classification --------
    def classify_noise_type(self, audio: np.ndarray) -> Dict[str, float]:
        """Heuristic per-type noise scores in [0, 1]; 'clean' is the
        complement of the strongest noise class."""
        if len(audio) < 2048:
            # Too short for a stable spectrum; report clean.
            return {
                "v2_noise_traffic": 0.0, "v2_noise_office": 0.0,
                "v2_noise_crowd": 0.0, "v2_noise_wind": 0.0, "v2_noise_clean": 1.0,
            }
        spec = np.abs(librosa.stft(audio, n_fft=2048))
        freq_bins = librosa.fft_frequencies(sr=self.sr, n_fft=2048)

        # Average magnitude in low/mid/high bands.
        low = np.mean(spec[(freq_bins >= 50) & (freq_bins <= 500)])
        mid = np.mean(spec[(freq_bins >= 500) & (freq_bins <= 2000)])
        high = np.mean(spec[(freq_bins >= 2000) & (freq_bins <= 6000)])
        total = low + mid + high + 1e-10

        low_r, mid_r = low / total, mid / total  # high ratio was unused
        spectral_centroid = float(np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sr)))
        spectral_flatness = float(np.mean(librosa.feature.spectral_flatness(y=audio)))

        # Hand-tuned linear thresholds per noise type.
        noise = {
            "v2_noise_traffic": float(np.clip(low_r * 2 - 0.3, 0, 1)),
            "v2_noise_office": float(np.clip(mid_r * 1.5 - 0.2, 0, 1) if spectral_flatness > 0.01 else 0),
            "v2_noise_crowd": float(np.clip(mid_r * 2 - 0.5, 0, 1) if spectral_centroid > 1500 else 0),
            "v2_noise_wind": float(np.clip(low_r * 3 - 0.8, 0, 1) if spectral_flatness > 0.1 else 0),
        }
        noise["v2_noise_clean"] = float(np.clip(1 - max(noise.values()), 0, 1))
        return noise

    # -------- V3: Speech rate --------
    def extract_speech_rate(self, audio: np.ndarray, transcript: str) -> float:
        """Words per second: transcript word count / audio duration."""
        if not transcript:
            return 0.0
        word_count = len(transcript.split())
        duration = len(audio) / self.sr
        if duration == 0:
            return 0.0
        return float(word_count / duration)

    # -------- V4-V5: Pitch --------
    def extract_pitch_features(self, audio: np.ndarray) -> Dict[str, float]:
        """Mean/std of piptrack pitch candidates stronger than the median
        magnitude; zeros if nothing voiced or piptrack fails."""
        try:
            pitches, magnitudes = librosa.piptrack(y=audio, sr=self.sr)
            pitch_values = pitches[magnitudes > np.median(magnitudes)]
            pitch_values = pitch_values[pitch_values > 0]  # drop unvoiced bins
            if len(pitch_values) == 0:
                return {"v4_pitch_mean": 0.0, "v5_pitch_std": 0.0}
            return {
                "v4_pitch_mean": float(np.mean(pitch_values)),
                "v5_pitch_std": float(np.std(pitch_values)),
            }
        except Exception:
            return {"v4_pitch_mean": 0.0, "v5_pitch_std": 0.0}

    # -------- V6-V7: Energy --------
    def extract_energy_features(self, audio: np.ndarray) -> Dict[str, float]:
        """Mean/std of frame RMS energy."""
        rms = librosa.feature.rms(y=audio)[0]
        return {"v6_energy_mean": float(np.mean(rms)), "v7_energy_std": float(np.std(rms))}

    # -------- V8-V10: Pause features (Silero VAD) --------
    def extract_pause_features(self, audio: np.ndarray) -> Dict[str, float]:
        """Pause ratio, mean pause duration (s), and count of 0.3-2.0 s
        mid-length pauses.

        Returns defaults when VAD is unavailable or audio is under 1 second.
        """
        defaults = {"v8_pause_ratio": 0.0, "v9_avg_pause_dur": 0.0, "v10_mid_pause_cnt": 0}
        if self.vad_model is None or len(audio) < self.sr:
            return defaults
        try:
            audio_tensor = torch.FloatTensor(audio)
            timestamps = self.get_speech_timestamps(audio_tensor, self.vad_model, sampling_rate=self.sr)
            if not timestamps:
                # No speech at all: the entire clip counts as one long pause.
                return {"v8_pause_ratio": 1.0, "v9_avg_pause_dur": len(audio) / self.sr, "v10_mid_pause_cnt": 0}

            total_speech = sum(t["end"] - t["start"] for t in timestamps)  # in samples
            pause_ratio = 1.0 - (total_speech / len(audio))

            # Inter-segment gaps longer than 100 ms count as pauses.
            pauses = []
            for i in range(1, len(timestamps)):
                gap = (timestamps[i]["start"] - timestamps[i - 1]["end"]) / self.sr
                if gap > 0.1:
                    pauses.append(gap)

            return {
                "v8_pause_ratio": float(np.clip(pause_ratio, 0, 1)),
                "v9_avg_pause_dur": float(np.mean(pauses)) if pauses else 0.0,
                "v10_mid_pause_cnt": len([p for p in pauses if 0.3 < p < 2.0]),
            }
        except Exception:
            return defaults

    # -------- V11-V13: Emotion features --------
    def extract_emotion_features(self, audio: np.ndarray) -> Dict[str, float]:
        """Crude stress/energy/valence scores read off fixed slices of the
        emotion CNN embedding.

        Fix: MobileNetV3-Small's pooled embedding is 576-dim, so the original
        fixed range 700:800 selected an EMPTY slice; np.mean of an empty slice
        is NaN (silent here because warnings are suppressed), and NaN leaked
        into the JSON response. Out-of-range indices/slices now yield 0.0.
        """
        zeros = {"v11_emotion_stress": 0.0, "v12_emotion_energy": 0.0, "v13_emotion_valence": 0.0}
        try:
            embedding = self.emotion_cnn.extract_embedding(audio, self.sr)
            dim = embedding.shape[0]

            def _slice_mean(values: np.ndarray) -> float:
                # Empty slices contribute 0.0 instead of NaN.
                return float(np.mean(values)) if values.size else 0.0

            stress_indices = [i for i in (0, 100, 200, 300, 400) if i < dim]
            stress_values = embedding[stress_indices]
            return {
                "v11_emotion_stress": float(np.clip(_slice_mean(np.abs(stress_values)), 0, 1)),
                "v12_emotion_energy": _slice_mean(np.abs(embedding[500:600])),
                "v13_emotion_valence": _slice_mean(embedding[700:800]),
            }
        except Exception:
            return zeros

    # -------- Main: extract all --------
    def extract_all(self, audio: np.ndarray, transcript: str = "") -> Dict[str, float]:
        """Run every extractor and merge results into one flat feature dict."""
        features: Dict[str, float] = {}
        features["v1_snr"] = self.extract_snr(audio)
        features.update(self.classify_noise_type(audio))
        features["v3_speech_rate"] = self.extract_speech_rate(audio, transcript)
        features.update(self.extract_pitch_features(audio))
        features.update(self.extract_energy_features(audio))
        features.update(self.extract_pause_features(audio))
        features.update(self.extract_emotion_features(audio))
        return features
218
+
219
+
220
# ──────────────────────────────────────────────────────────────────────── #
# FastAPI handler for deployment (HF Spaces / Cloud Run / Lambda)
# ──────────────────────────────────────────────────────────────────────── #

from fastapi import FastAPI, File, UploadFile, Form
from fastapi.middleware.cors import CORSMiddleware
import base64

app = FastAPI(title="Audio Feature Extraction API", version="1.0.0")
# Wide-open CORS so browser/serverless clients can call cross-origin.
# NOTE(review): wildcard origins combined with allow_credentials=True —
# confirm credentialed requests are actually needed here.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"], allow_credentials=True,
    allow_methods=["*"], allow_headers=["*"],
)

# Single shared extractor: models (emotion CNN, Silero VAD) load once at
# process startup rather than per request.
extractor = AudioFeatureExtractorEndpoint()
236
+
237
+
238
+ @app.get("/health")
239
+ async def health():
240
+ return {"status": "healthy", "vad_loaded": extractor.vad_model is not None}
241
+
242
+
243
+ @app.post("/extract-audio-features")
244
+ async def extract_audio_features(audio: UploadFile = File(...), transcript: str = Form("")):
245
+ """Extract all 17 voice features from uploaded audio file."""
246
+ audio_bytes = await audio.read()
247
+ y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
248
+ features = extractor.extract_all(y, transcript)
249
+ return features
250
+
251
+
252
+ @app.post("/extract-audio-features-base64")
253
+ async def extract_audio_features_base64(data: dict):
254
+ """Extract features from base64-encoded audio (for Vercel serverless calls)."""
255
+ import soundfile as sf
256
+
257
+ audio_b64 = data.get("audio_base64", "")
258
+ transcript = data.get("transcript", "")
259
+
260
+ audio_bytes = base64.b64decode(audio_b64)
261
+ y, sr = sf.read(io.BytesIO(audio_bytes))
262
+ if len(y.shape) > 1:
263
+ y = np.mean(y, axis=1)
264
+ if sr != 16000:
265
+ y = librosa.resample(y, orig_sr=sr, target_sr=16000)
266
+ y = y.astype(np.float32)
267
+
268
+ features = extractor.extract_all(y, transcript)
269
+ return features
270
+
271
+
272
+ if __name__ == "__main__":
273
+ import uvicorn
274
+ uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Core audio
librosa==0.10.1
soundfile==0.12.1
numpy==1.24.3
scipy==1.11.2

# ML
# NOTE(review): Silero VAD is fetched via torch.hub at runtime; its utils
# may import torchaudio — verify the container works without it, or pin it.
torch==2.1.0
torchvision==0.16.0
scikit-image==0.22.0
matplotlib==3.8.2

# API
fastapi==0.95.2
uvicorn==0.22.0
python-multipart==0.0.6