""" Audio Feature Extraction — Hugging Face Inference Endpoint Handler Extracts all 17 voice features from uploaded audio: v1_snr, v2_noise_* (5), v3_speech_rate, v4/v5_pitch, v6/v7_energy, v8/v9/v10_pause, v11/v12/v13_emotion Derived from: src/audio_features.py, src/emotion_features.py """ import io import numpy as np import librosa from scipy import signal as scipy_signal from typing import Dict import torch import torch.nn as nn from torchvision import models import warnings warnings.filterwarnings("ignore") # ──────────────────────────────────────────────────────────────────────── # # Imports from standardized modules # ──────────────────────────────────────────────────────────────────────── # try: from audio_features import AudioFeatureExtractor except ImportError: # Fallback if running from a different context import sys sys.path.append('.') from audio_features import AudioFeatureExtractor # Initialize global extractor # We use a global instance to cache models (VAD, Emotion) print("[INFO] Initializing Global AudioFeatureExtractor...") extractor = AudioFeatureExtractor( sample_rate=16000, use_emotion=True, emotion_models_dir="/app/models" # Absolute path in Docker container ) # Ensure models are downloaded/ready if extractor.use_emotion and extractor.emotion_extractor: print("[INFO] Checking for emotion models...") # Trigger download if needed/possible try: if len(extractor.emotion_extractor.models) == 0: print("[INFO] Models not found, attempting download...") extractor.emotion_extractor.download_models() # Re-init manually to load them extractor.emotion_extractor.__init__(models_dir=extractor.emotion_extractor.models_dir) except Exception as e: print(f"[WARN] Failed to download emotion models: {e}") # ──────────────────────────────────────────────────────────────────────── # # Helper to handle NaN/Inf for JSON # ──────────────────────────────────────────────────────────────────────── # def sanitize_features(features: Dict[str, float]) -> Dict[str, float]: 
sanitized = {} for key, val in features.items(): if isinstance(val, (float, np.floating)): if np.isnan(val) or np.isinf(val): sanitized[key] = 0.0 else: sanitized[key] = float(val) elif isinstance(val, (int, np.integer)): sanitized[key] = int(val) else: sanitized[key] = val # keep string/other as is return sanitized # ──────────────────────────────────────────────────────────────────────── # # FastAPI handler for deployment (HF Spaces / Cloud Run / Lambda) # ──────────────────────────────────────────────────────────────────────── # from fastapi import FastAPI, File, UploadFile, Form, Request from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse from pydantic import BaseModel from typing import Optional import base64 import traceback app = FastAPI(title="Audio Feature Extraction API", version="1.0.0") app.add_middleware( CORSMiddleware, allow_origins=["*"], allow_credentials=True, allow_methods=["*"], allow_headers=["*"], ) @app.exception_handler(Exception) async def global_exception_handler(request: Request, exc: Exception): """Catch any unhandled exceptions and return defaults instead of 500.""" print(f"[GLOBAL ERROR] {request.url}: {exc}") traceback.print_exc() return JSONResponse( status_code=200, content={**DEFAULT_AUDIO_FEATURES, "_error": str(exc), "_handler": "global"}, ) # Extractor is already initialized globally above # ──────────────────────────────────────────────────────────────────────── # # Constants & Defaults # ──────────────────────────────────────────────────────────────────────── # DEFAULT_AUDIO_FEATURES = { "v1_snr": 0.0, "v2_noise_traffic": 0.0, "v2_noise_office": 0.0, "v2_noise_crowd": 0.0, "v2_noise_wind": 0.0, "v2_noise_clean": 1.0, "v3_speech_rate": 0.0, "v4_pitch_mean": 0.0, "v5_pitch_std": 0.0, "v6_energy_mean": 0.0, "v7_energy_std": 0.0, "v8_pause_ratio": 0.0, "v9_avg_pause_dur": 0.0, "v10_mid_pause_cnt": 0.0, "v11_emotion_stress": 0.0, "v12_emotion_energy": 0.0, "v13_emotion_valence": 0.0, } 
class AudioBase64Request(BaseModel):
    """Request body for /extract-audio-features-base64."""
    audio_base64: str = ""  # raw or data-URL-prefixed base64-encoded audio
    transcript: str = ""    # optional transcript (used for speech-rate features)


def _decode_audio_bytes(audio_bytes: bytes):
    """Decode raw audio bytes to a mono float32 waveform at 16 kHz.

    Tries soundfile first, falling back to librosa for container formats
    soundfile cannot read. Multi-channel audio is down-mixed to mono and
    anything not already at 16 kHz is resampled, since the extractor was
    initialized with sample_rate=16000.

    Raises whatever the underlying decoders raise if both fail; callers
    are expected to catch and return DEFAULT_AUDIO_FEATURES.
    """
    import soundfile as sf  # local import: only needed on the decode path

    try:
        y, sr = sf.read(io.BytesIO(audio_bytes))
    except Exception as sf_err:
        print(f"[WARN] soundfile failed ({sf_err}), trying librosa...")
        y, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)

    # Down-mix multi-channel (samples, channels) audio to mono.
    if hasattr(y, 'shape') and len(y.shape) > 1:
        y = np.mean(y, axis=1)
    y = np.asarray(y, dtype=np.float32)

    if sr != 16000:
        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
        y = y.astype(np.float32)
    return y


@app.get("/")
async def root():
    """Service metadata and endpoint listing."""
    return {
        "service": "Audio Feature Extraction API",
        "version": "1.0.0",
        "endpoints": ["/health", "/extract-audio-features", "/extract-audio-features-base64"],
    }


@app.get("/health")
async def health():
    """Report whether the VAD and emotion models are loaded."""
    vad_status = extractor.vad_model is not None
    emotion_status = extractor.emotion_extractor is not None if extractor.use_emotion else False
    return {
        "status": "healthy",
        "vad_loaded": vad_status,
        "emotion_loaded": emotion_status
    }


@app.post("/extract-audio-features")
async def extract_audio_features(audio: UploadFile = File(...), transcript: str = Form("")):
    """Extract all 17 voice features from uploaded audio file."""
    try:
        audio_bytes = await audio.read()
        # Use the same robust decode path as the base64 endpoint
        # (soundfile first, librosa fallback, mono down-mix, 16 kHz).
        y = _decode_audio_bytes(audio_bytes)
        # AudioFeatureExtractor.extract_all expects numpy array and optional transcript
        features = extractor.extract_all(y, transcript)
        return sanitize_features(features)
    except Exception as e:
        print(f"[ERROR] extract_audio_features: {e}")
        traceback.print_exc()
        # Defaults instead of a 500 so callers never need error handling.
        return {**DEFAULT_AUDIO_FEATURES, "_error": str(e)}


@app.post("/extract-audio-features-base64")
async def extract_audio_features_base64(data: AudioBase64Request):
    """Extract features from base64-encoded audio (for Vercel serverless calls)."""
    audio_b64 = data.audio_base64
    transcript = data.transcript

    # Handle empty / missing audio — return default features
    if not audio_b64 or len(audio_b64) < 100:
        print("[INFO] Empty or too-short audio_base64, returning defaults")
        return {**DEFAULT_AUDIO_FEATURES}

    try:
        # Strip data URL prefix if present (e.g. "data:audio/wav;base64,...").
        # Only the head is scanned so a stray comma deep in the payload
        # cannot trigger a bogus split.
        if "," in audio_b64[:80]:
            audio_b64 = audio_b64.split(",", 1)[1]

        audio_bytes = base64.b64decode(audio_b64)
        print(f"[INFO] Decoded {len(audio_bytes)} bytes of audio")

        y = _decode_audio_bytes(audio_bytes)
        if len(y) < 100:
            print("[WARN] Audio too short after decode, returning defaults")
            return {**DEFAULT_AUDIO_FEATURES}

        features = extractor.extract_all(y, transcript)
        print(f"[OK] Extracted {len(features)} audio features")
        return sanitize_features(features)
    except Exception as e:
        print(f"[ERROR] extract_audio_features_base64: {e}")
        traceback.print_exc()
        # Return defaults rather than 500
        return {**DEFAULT_AUDIO_FEATURES, "_error": str(e)}


if __name__ == "__main__":
    import os
    import uvicorn

    # PORT is injected by most hosting platforms; 7860 is the HF Spaces default.
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=port)