EurekaPotato committed on
Commit
dde584b
·
verified ·
1 Parent(s): 43b81f4

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. README.md +38 -39
  2. handler.py +90 -29
README.md CHANGED
@@ -1,43 +1,42 @@
1
- ---
2
  title: Busy Module Audio Features
3
- emoji: 🎤
4
- colorFrom: indigo
5
- colorTo: purple
6
- sdk: docker
7
- app_port: 7860
8
- pinned: false
9
- ---
10
-
11
  # Busy Module Audio Features
12
 
13
  ## Audio Feature Extraction API
14
-
15
- Extracts 17 voice features from audio: SNR, noise classification, speech rate, pitch, energy, pause analysis, and emotion features.
16
-
17
- ## API
18
-
19
- **POST** `/extract-audio-features-base64`
20
- ```json
21
- {
22
- "audio_base64": "<base64-encoded-wav>",
23
- "transcript": "I'm driving right now"
24
- }
25
- ```
26
-
27
- **POST** `/extract-audio-features` (multipart form)
28
- - `audio`: audio file upload
29
- - `transcript`: text transcript
30
-
31
- **POST** `/extract-audio-features` (multipart form)
32
- - `audio`: audio file upload
33
- - `transcript`: text transcript
34
-
35
- **GET** `/health`
36
-
37
- ## Authentication
38
-
39
- This Space requires access to private models. You must add your Hugging Face token as a secret:
40
- 1. Go to **Settings** -> **Variables and secrets**.
41
- 2. Click **New secret**.
42
- 3. Name: `HF_TOKEN`
43
- 4. Value: Your Hugging Face Access Token (with read permissions).
 
1
+ ---
2
  title: Busy Module Audio Features
3
+ emoji: "🎤"
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ ---
10
+
11
  # Busy Module Audio Features
12
 
13
  ## Audio Feature Extraction API
14
+
15
+ This Space extracts 17 voice features from audio, including SNR, noise classification, speech rate, pitch, energy, pause analysis, and emotion features.
16
+
17
+ ## API
18
+
19
+ **POST** `/extract-audio-features-base64`
20
+
21
+ ```json
22
+ {
23
+ "audio_base64": "<base64-encoded-wav>",
24
+ "transcript": "I'm driving right now"
25
+ }
26
+ ```
27
+
28
+ **POST** `/extract-audio-features` (multipart form)
29
+
30
+ - `audio`: audio file upload
31
+ - `transcript`: text transcript
32
+
33
+ **GET** `/health`
34
+
35
+ ## Authentication
36
+
37
+ This Space requires access to private models. Add your Hugging Face token as a secret:
38
+
39
+ 1. Go to **Settings** -> **Variables and secrets**.
40
+ 2. Click **New secret**.
41
+ 3. Name it `HF_TOKEN`.
42
+ 4. Set the value to a Hugging Face access token with read permissions.
 
handler.py CHANGED
@@ -8,10 +8,12 @@ Extracts all 17 voice features from uploaded audio:
8
  Derived from: src/audio_features.py, src/emotion_features.py
9
  """
10
 
11
- import io
12
- import numpy as np
13
- import librosa
14
- from scipy import signal as scipy_signal
 
 
15
  from typing import Dict
16
  import torch
17
  import torch.nn as nn
@@ -129,9 +131,72 @@ DEFAULT_AUDIO_FEATURES = {
129
  "v13_emotion_valence": 0.0,
130
  }
131
 
132
- class AudioBase64Request(BaseModel):
133
- audio_base64: str = ""
134
- transcript: str = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
 
136
 
137
  @app.get("/")
@@ -173,15 +238,14 @@ async def extract_audio_features(audio: UploadFile = File(...), transcript: str
173
 
174
 
175
  @app.post("/extract-audio-features-base64")
176
- async def extract_audio_features_base64(data: AudioBase64Request):
177
- """Extract features from base64-encoded audio (for Vercel serverless calls)."""
178
- import soundfile as sf
179
-
180
- audio_b64 = data.audio_base64
181
- transcript = data.transcript
182
-
183
- # Handle empty / missing audio — return default features
184
- if not audio_b64 or len(audio_b64) < 100:
185
  print("[INFO] Empty or too-short audio_base64, returning defaults")
186
  return {**DEFAULT_AUDIO_FEATURES}
187
 
@@ -189,19 +253,16 @@ async def extract_audio_features_base64(data: AudioBase64Request):
189
  # Strip data URL prefix if present (e.g. "data:audio/wav;base64,...")
190
  if "," in audio_b64[:80]:
191
  audio_b64 = audio_b64.split(",", 1)[1]
192
-
193
- audio_bytes = base64.b64decode(audio_b64)
194
- print(f"[INFO] Decoded {len(audio_bytes)} bytes of audio")
195
-
196
- # Try soundfile first, fall back to librosa
197
- try:
198
- y, sr = sf.read(io.BytesIO(audio_bytes))
199
- except Exception as sf_err:
200
- print(f"[WARN] soundfile failed ({sf_err}), trying librosa...")
201
- y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
202
-
203
- if hasattr(y, 'shape') and len(y.shape) > 1:
204
- y = np.mean(y, axis=1)
205
  y = np.asarray(y, dtype=np.float32)
206
  if sr != 16000:
207
  y = librosa.resample(y, orig_sr=sr, target_sr=16000)
 
8
  Derived from: src/audio_features.py, src/emotion_features.py
9
  """
10
 
11
+ import io
12
+ import os
13
+ import tempfile
14
+ import numpy as np
15
+ import librosa
16
+ from scipy import signal as scipy_signal
17
  from typing import Dict
18
  import torch
19
  import torch.nn as nn
 
131
  "v13_emotion_valence": 0.0,
132
  }
133
 
134
class AudioBase64Request(BaseModel):
    """Request payload for the /extract-audio-features-base64 endpoint.

    Every field defaults to an empty string so the endpoint can degrade
    gracefully (returning default features) rather than rejecting the call
    with a validation error.
    """

    # Base64-encoded audio; may carry a data-URL prefix ("data:audio/...;base64,").
    audio_base64: str = ""
    # Text transcript of the audio, forwarded to downstream feature extraction.
    transcript: str = ""
    # Optional MIME type hint, e.g. "audio/webm;codecs=opus".
    mime_type: str = ""
138
+
139
+
140
def infer_audio_extension(audio_bytes: bytes, mime_type: str = "") -> str:
    """Guess a file extension for raw audio bytes.

    The declared MIME type is consulted first (parameters such as
    ";codecs=opus" are stripped); if it is unknown, the payload's magic
    bytes are sniffed.

    Args:
        audio_bytes: Raw decoded audio bytes; may be empty.
        mime_type: Optional MIME type hint, e.g. "audio/webm;codecs=opus".

    Returns:
        A lowercase extension including the dot (e.g. ".wav"), or ".bin"
        when the format cannot be determined.
    """
    normalized = (mime_type or "").lower().split(";")[0].strip()
    mime_map = {
        "audio/webm": ".webm",
        "audio/ogg": ".ogg",
        "audio/wav": ".wav",
        "audio/x-wav": ".wav",
        "audio/wave": ".wav",  # alias some browsers/tools send for WAV
        "audio/mpeg": ".mp3",
        "audio/mp3": ".mp3",
        "audio/mp4": ".m4a",
        "audio/x-m4a": ".m4a",
        "audio/aac": ".aac",
        "audio/flac": ".flac",
        "audio/x-flac": ".flac",  # legacy FLAC alias
    }
    if normalized in mime_map:
        return mime_map[normalized]

    # Magic-byte sniffing; fixed prefixes are checked before heuristics.
    if audio_bytes.startswith(b"RIFF"):
        return ".wav"
    if audio_bytes.startswith(b"OggS"):
        return ".ogg"
    if audio_bytes.startswith(b"\x1A\x45\xDF\xA3"):
        # EBML header (Matroska/WebM container); browser recordings use WebM.
        return ".webm"
    if audio_bytes.startswith(b"fLaC"):
        return ".flac"
    if audio_bytes[4:8] == b"ftyp":
        # ISO BMFF (MP4 family); slicing is safe even for short payloads.
        return ".m4a"
    if audio_bytes.startswith(b"ID3") or (
        len(audio_bytes) > 1
        and audio_bytes[0] == 0xFF
        and (audio_bytes[1] & 0xE0) == 0xE0
    ):
        # ID3 tag or a bare MPEG audio frame sync (11 set bits).
        return ".mp3"

    return ".bin"
171
+
172
+
173
def decode_audio_bytes(audio_bytes: bytes, mime_type: str = ""):
    """Decode raw audio bytes into a ``(samples, sample_rate)`` pair.

    Decoders are tried in order:
      1. soundfile on an in-memory buffer (fast path for WAV/FLAC/OGG),
      2. librosa on the same buffer (resampled to 16 kHz mono),
      3. librosa on a temp file whose suffix is inferred from the bytes /
         MIME hint, so suffix-driven backends can engage.

    The temp file is always removed, even if the final attempt raises.
    """
    import soundfile as sf

    try:
        y, sr = sf.read(io.BytesIO(audio_bytes))
    except Exception as sf_err:
        print(f"[WARN] soundfile failed ({sf_err}), trying librosa from buffer...")
    else:
        return y, sr

    try:
        y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
    except Exception as librosa_err:
        print(f"[WARN] librosa buffer decode failed ({librosa_err}), trying temp file...")
    else:
        return y, sr

    suffix = infer_audio_extension(audio_bytes, mime_type)
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
            temp_file.write(audio_bytes)
            temp_path = temp_file.name
        return librosa.load(temp_path, sr=16000, mono=True)
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)
200
 
201
 
202
  @app.get("/")
 
238
 
239
 
240
  @app.post("/extract-audio-features-base64")
241
+ async def extract_audio_features_base64(data: AudioBase64Request):
242
+ """Extract features from base64-encoded audio (for Vercel serverless calls)."""
243
+ audio_b64 = data.audio_base64
244
+ transcript = data.transcript
245
+ mime_type = data.mime_type
246
+
247
+ # Handle empty / missing audio — return default features
248
+ if not audio_b64 or len(audio_b64) < 100:
 
249
  print("[INFO] Empty or too-short audio_base64, returning defaults")
250
  return {**DEFAULT_AUDIO_FEATURES}
251
 
 
253
  # Strip data URL prefix if present (e.g. "data:audio/wav;base64,...")
254
  if "," in audio_b64[:80]:
255
  audio_b64 = audio_b64.split(",", 1)[1]
256
+
257
+ audio_bytes = base64.b64decode(audio_b64)
258
+ print(f"[INFO] Decoded {len(audio_bytes)} bytes of audio")
259
+ if mime_type:
260
+ print(f"[INFO] MIME type hint: {mime_type}")
261
+
262
+ y, sr = decode_audio_bytes(audio_bytes, mime_type)
263
+
264
+ if hasattr(y, 'shape') and len(y.shape) > 1:
265
+ y = np.mean(y, axis=1)
 
 
 
266
  y = np.asarray(y, dtype=np.float32)
267
  if sr != 16000:
268
  y = librosa.resample(y, orig_sr=sr, target_sr=16000)