# handler.py — uploaded by sanoramyun8 via huggingface_hub (commit 683d6e8, verified)
"""
Custom Handler for Speaker Embedding Extraction
Using SpeechBrain ECAPA-TDNN model for HuggingFace Inference Endpoints
"""
from typing import Dict, List, Any
import torch
import torchaudio
import io
import numpy as np
class EndpointHandler:
    """
    HuggingFace Inference Endpoint handler for speaker embedding extraction.

    Wraps SpeechBrain's ECAPA-TDNN speaker-verification model
    (speechbrain/spkrec-ecapa-voxceleb) and returns an L2-normalized
    192-dimensional embedding vector for the supplied audio.
    """

    def __init__(self, path: str = ""):
        """
        Load the SpeechBrain ECAPA-TDNN model onto CPU.

        Args:
            path: Model directory provided by HuggingFace; falls back to a
                /tmp cache dir when empty.
        """
        # Imported lazily so the module can be inspected without speechbrain installed.
        from speechbrain.inference.speaker import EncoderClassifier

        self.model = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir=path if path else "/tmp/spkrec-ecapa-voxceleb",
            run_opts={"device": "cpu"},
        )
        # Target rate expected by the ECAPA-TDNN model.
        self.sample_rate = 16000
        print("[SpeakerEmbedding] Model loaded successfully")

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process one inference request and return the speaker embedding.

        Args:
            data: Request dict with key "inputs" holding either raw audio
                bytes or a base64-encoded audio string. ("parameters" is
                accepted but currently unused.)

        Returns:
            On success: {"embedding": list[float], "dimension": int, "model": str}.
            On failure: {"error": str} — errors are reported in-band rather
            than raised, per the endpoint contract.
        """
        try:
            inputs = data.get("inputs")
            if inputs is None:
                return {"error": "No audio input provided"}

            # Accept raw bytes or a base64-encoded string.
            if isinstance(inputs, bytes):
                audio_bytes = inputs
            elif isinstance(inputs, str):
                import base64
                audio_bytes = base64.b64decode(inputs)
            else:
                return {"error": f"Unsupported input type: {type(inputs)}"}

            # Decode the audio container/codec from the in-memory buffer.
            waveform, sample_rate = torchaudio.load(io.BytesIO(audio_bytes))

            # Downmix to mono BEFORE resampling: channel averaging and
            # resampling are both linear so the result is identical, but
            # this resamples one channel instead of several.
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)

            if sample_rate != self.sample_rate:
                resampler = torchaudio.transforms.Resample(
                    orig_freq=sample_rate,
                    new_freq=self.sample_rate,
                )
                waveform = resampler(resampler.to(waveform.dtype) and waveform) if False else resampler(waveform)

            # (1, time) acts as a batch of one for encode_batch.
            with torch.no_grad():
                embedding = self.model.encode_batch(waveform)
            embedding = embedding.squeeze().cpu().numpy()

            # L2-normalize, guarding against a zero-norm vector (e.g. from
            # degenerate input) — an unconditional divide would emit NaNs.
            norm = np.linalg.norm(embedding)
            if norm > 0:
                embedding = embedding / norm

            return {
                "embedding": embedding.tolist(),
                "dimension": len(embedding),
                "model": "speechbrain/spkrec-ecapa-voxceleb",
            }
        except Exception as e:
            # Endpoint boundary: surface any failure as an in-band error payload.
            return {"error": str(e)}