EurekaPotato committed on
Commit
dde584b
·
verified ·
1 Parent(s): 43b81f4

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. README.md +38 -39
  2. handler.py +90 -29
README.md CHANGED
@@ -1,43 +1,42 @@
1
- ---
2
  title: Busy Module Audio Features
3
- emoji: 🎤
4
- colorFrom: indigo
5
- colorTo: purple
6
- sdk: docker
7
- app_port: 7860
8
- pinned: false
9
- ---
10
-
11
  # Busy Module Audio Features
12
 
13
  ## Audio Feature Extraction API
14
-
15
- Extracts 17 voice features from audio: SNR, noise classification, speech rate, pitch, energy, pause analysis, and emotion features.
16
-
17
- ## API
18
-
19
- **POST** `/extract-audio-features-base64`
20
- ```json
21
- {
22
- "audio_base64": "<base64-encoded-wav>",
23
- "transcript": "I'm driving right now"
24
- }
25
- ```
26
-
27
- **POST** `/extract-audio-features` (multipart form)
28
- - `audio`: audio file upload
29
- - `transcript`: text transcript
30
-
31
- **POST** `/extract-audio-features` (multipart form)
32
- - `audio`: audio file upload
33
- - `transcript`: text transcript
34
-
35
- **GET** `/health`
36
-
37
- ## Authentication
38
-
39
- This Space requires access to private models. You must add your Hugging Face token as a secret:
40
- 1. Go to **Settings** -> **Variables and secrets**.
41
- 2. Click **New secret**.
42
- 3. Name: `HF_TOKEN`
43
- 4. Value: Your Hugging Face Access Token (with read permissions).
 
1
+ ---
2
  title: Busy Module Audio Features
3
+ emoji: "🎤"
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ ---
10
+
11
  # Busy Module Audio Features
12
 
13
  ## Audio Feature Extraction API
14
+
15
+ This Space extracts 17 voice features from audio, including SNR, noise classification, speech rate, pitch, energy, pause analysis, and emotion features.
16
+
17
+ ## API
18
+
19
+ **POST** `/extract-audio-features-base64`
20
+
21
+ ```json
22
+ {
23
+ "audio_base64": "<base64-encoded-wav>",
24
+ "transcript": "I'm driving right now"
25
+ }
26
+ ```
27
+
28
+ **POST** `/extract-audio-features` (multipart form)
29
+
30
+ - `audio`: audio file upload
31
+ - `transcript`: text transcript
32
+
33
+ **GET** `/health`
34
+
35
+ ## Authentication
36
+
37
+ This Space requires access to private models. Add your Hugging Face token as a secret:
38
+
39
+ 1. Go to **Settings** -> **Variables and secrets**.
40
+ 2. Click **New secret**.
41
+ 3. Name it `HF_TOKEN`.
42
+ 4. Set the value to a Hugging Face access token with read permissions.
 
handler.py CHANGED
@@ -8,10 +8,12 @@ Extracts all 17 voice features from uploaded audio:
8
  Derived from: src/audio_features.py, src/emotion_features.py
9
  """
10
 
11
- import io
12
- import numpy as np
13
- import librosa
14
- from scipy import signal as scipy_signal
 
 
15
  from typing import Dict
16
  import torch
17
  import torch.nn as nn
@@ -129,9 +131,72 @@ DEFAULT_AUDIO_FEATURES = {
129
  "v13_emotion_valence": 0.0,
130
  }
131
 
132
- class AudioBase64Request(BaseModel):
133
- audio_base64: str = ""
134
- transcript: str = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
 
137
  @app.get("/")
@@ -173,15 +238,14 @@ async def extract_audio_features(audio: UploadFile = File(...), transcript: str
173
 
174
 
175
  @app.post("/extract-audio-features-base64")
176
- async def extract_audio_features_base64(data: AudioBase64Request):
177
- """Extract features from base64-encoded audio (for Vercel serverless calls)."""
178
- import soundfile as sf
179
-
180
- audio_b64 = data.audio_base64
181
- transcript = data.transcript
182
-
183
- # Handle empty / missing audio — return default features
184
- if not audio_b64 or len(audio_b64) < 100:
185
  print("[INFO] Empty or too-short audio_base64, returning defaults")
186
  return {**DEFAULT_AUDIO_FEATURES}
187
 
@@ -189,19 +253,16 @@ async def extract_audio_features_base64(data: AudioBase64Request):
189
  # Strip data URL prefix if present (e.g. "data:audio/wav;base64,...")
190
  if "," in audio_b64[:80]:
191
  audio_b64 = audio_b64.split(",", 1)[1]
192
-
193
- audio_bytes = base64.b64decode(audio_b64)
194
- print(f"[INFO] Decoded {len(audio_bytes)} bytes of audio")
195
-
196
- # Try soundfile first, fall back to librosa
197
- try:
198
- y, sr = sf.read(io.BytesIO(audio_bytes))
199
- except Exception as sf_err:
200
- print(f"[WARN] soundfile failed ({sf_err}), trying librosa...")
201
- y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
202
-
203
- if hasattr(y, 'shape') and len(y.shape) > 1:
204
- y = np.mean(y, axis=1)
205
  y = np.asarray(y, dtype=np.float32)
206
  if sr != 16000:
207
  y = librosa.resample(y, orig_sr=sr, target_sr=16000)
 
8
  Derived from: src/audio_features.py, src/emotion_features.py
9
  """
10
 
11
+ import io
12
+ import os
13
+ import tempfile
14
+ import numpy as np
15
+ import librosa
16
+ from scipy import signal as scipy_signal
17
  from typing import Dict
18
  import torch
19
  import torch.nn as nn
 
131
  "v13_emotion_valence": 0.0,
132
  }
133
 
134
class AudioBase64Request(BaseModel):
    """Request payload for the /extract-audio-features-base64 endpoint.

    Every field defaults to an empty string so the endpoint can degrade
    gracefully (returning default features) rather than rejecting the call
    with a validation error.
    """

    # Base64-encoded audio; may carry a data-URL prefix ("data:audio/...;base64,").
    audio_base64: str = ""
    # Text transcript of the audio, forwarded to downstream feature extraction.
    transcript: str = ""
    # Optional MIME type hint, e.g. "audio/webm;codecs=opus".
    mime_type: str = ""
138
+
139
+
140
def infer_audio_extension(audio_bytes: bytes, mime_type: str = "") -> str:
    """Guess a file extension for raw audio bytes.

    The declared MIME type is consulted first (parameters such as
    ";codecs=opus" are stripped); if it is unknown, the payload's magic
    bytes are sniffed.

    Args:
        audio_bytes: Raw decoded audio bytes; may be empty.
        mime_type: Optional MIME type hint, e.g. "audio/webm;codecs=opus".

    Returns:
        A lowercase extension including the dot (e.g. ".wav"), or ".bin"
        when the format cannot be determined.
    """
    normalized = (mime_type or "").lower().split(";")[0].strip()
    mime_map = {
        "audio/webm": ".webm",
        "audio/ogg": ".ogg",
        "audio/wav": ".wav",
        "audio/x-wav": ".wav",
        "audio/wave": ".wav",  # alias some browsers/tools send for WAV
        "audio/mpeg": ".mp3",
        "audio/mp3": ".mp3",
        "audio/mp4": ".m4a",
        "audio/x-m4a": ".m4a",
        "audio/aac": ".aac",
        "audio/flac": ".flac",
        "audio/x-flac": ".flac",  # legacy FLAC alias
    }
    if normalized in mime_map:
        return mime_map[normalized]

    # Magic-byte sniffing; fixed prefixes are checked before heuristics.
    if audio_bytes.startswith(b"RIFF"):
        return ".wav"
    if audio_bytes.startswith(b"OggS"):
        return ".ogg"
    if audio_bytes.startswith(b"\x1A\x45\xDF\xA3"):
        # EBML header (Matroska/WebM container); browser recordings use WebM.
        return ".webm"
    if audio_bytes.startswith(b"fLaC"):
        return ".flac"
    if audio_bytes[4:8] == b"ftyp":
        # ISO BMFF (MP4 family); slicing is safe even for short payloads.
        return ".m4a"
    if audio_bytes.startswith(b"ID3") or (
        len(audio_bytes) > 1
        and audio_bytes[0] == 0xFF
        and (audio_bytes[1] & 0xE0) == 0xE0
    ):
        # ID3 tag or a bare MPEG audio frame sync (11 set bits).
        return ".mp3"

    return ".bin"
171
+
172
+
173
def decode_audio_bytes(audio_bytes: bytes, mime_type: str = ""):
    """Decode raw audio bytes into a ``(samples, sample_rate)`` pair.

    Decoders are tried in order:
      1. soundfile on an in-memory buffer (fast path for WAV/FLAC/OGG),
      2. librosa on the same buffer (resampled to 16 kHz mono),
      3. librosa on a temp file whose suffix is inferred from the bytes /
         MIME hint, so suffix-driven backends can engage.

    The temp file is always removed, even if the final attempt raises.
    """
    import soundfile as sf

    try:
        y, sr = sf.read(io.BytesIO(audio_bytes))
    except Exception as sf_err:
        print(f"[WARN] soundfile failed ({sf_err}), trying librosa from buffer...")
    else:
        return y, sr

    try:
        y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
    except Exception as librosa_err:
        print(f"[WARN] librosa buffer decode failed ({librosa_err}), trying temp file...")
    else:
        return y, sr

    suffix = infer_audio_extension(audio_bytes, mime_type)
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_file.write(audio_bytes)
            temp_path = temp_file.name
        return librosa.load(temp_path, sr=16000, mono=True)
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
200
 
201
 
202
  @app.get("/")
 
238
 
239
 
240
  @app.post("/extract-audio-features-base64")
241
+ async def extract_audio_features_base64(data: AudioBase64Request):
242
+ """Extract features from base64-encoded audio (for Vercel serverless calls)."""
243
+ audio_b64 = data.audio_base64
244
+ transcript = data.transcript
245
+ mime_type = data.mime_type
246
+
247
+ # Handle empty / missing audio — return default features
248
+ if not audio_b64 or len(audio_b64) < 100:
 
249
  print("[INFO] Empty or too-short audio_base64, returning defaults")
250
  return {**DEFAULT_AUDIO_FEATURES}
251
 
 
253
  # Strip data URL prefix if present (e.g. "data:audio/wav;base64,...")
254
  if "," in audio_b64[:80]:
255
  audio_b64 = audio_b64.split(",", 1)[1]
256
+
257
+ audio_bytes = base64.b64decode(audio_b64)
258
+ print(f"[INFO] Decoded {len(audio_bytes)} bytes of audio")
259
+ if mime_type:
260
+ print(f"[INFO] MIME type hint: {mime_type}")
261
+
262
+ y, sr = decode_audio_bytes(audio_bytes, mime_type)
263
+
264
+ if hasattr(y, 'shape') and len(y.shape) > 1:
265
+ y = np.mean(y, axis=1)
 
 
 
266
  y = np.asarray(y, dtype=np.float32)
267
  if sr != 16000:
268
  y = librosa.resample(y, orig_sr=sr, target_sr=16000)