|
|
"""
|
|
|
Custom Handler for Speaker Embedding Extraction
|
|
|
Using SpeechBrain ECAPA-TDNN model for HuggingFace Inference Endpoints
|
|
|
"""
|
|
|
|
|
|
import io
from typing import Any, Dict, List, Optional

import numpy as np
import torch
import torchaudio
|
|
|
|
|
|
|
|
|
class EndpointHandler:
    """
    HuggingFace Inference Endpoint Handler for Speaker Embedding

    Extracts speaker embeddings using SpeechBrain's ECAPA-TDNN model.
    Returns 192-dimensional embedding vectors for speaker verification.
    """

    def __init__(self, path: str = ""):
        """
        Initialize the handler by loading the SpeechBrain model.

        Args:
            path: Path to the model directory (provided by HuggingFace)
        """
        # Imported lazily so module import does not require speechbrain
        # (matches the original's deferred-import behavior).
        from speechbrain.inference.speaker import EncoderClassifier

        # Use the endpoint's GPU when one is available; fall back to CPU.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        self.model = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir=path if path else "/tmp/spkrec-ecapa-voxceleb",
            run_opts={"device": device},
        )
        # ECAPA-TDNN expects 16 kHz input; all audio is resampled to this rate.
        self.sample_rate = 16000
        print("[SpeakerEmbedding] Model loaded successfully")

    @staticmethod
    def _decode_input(inputs: Any) -> Optional[bytes]:
        """
        Normalize the endpoint payload to raw audio bytes.

        Args:
            inputs: Raw audio bytes, or a base64-encoded audio string.

        Returns:
            Audio bytes, or None when the input type is unsupported.
        """
        if isinstance(inputs, bytes):
            return inputs
        if isinstance(inputs, str):
            import base64
            return base64.b64decode(inputs)
        return None

    def _prepare_waveform(self, audio_bytes: bytes) -> torch.Tensor:
        """
        Decode audio bytes into a mono waveform at the model's sample rate.

        Args:
            audio_bytes: Encoded audio (any format torchaudio can decode).

        Returns:
            Tensor of shape (1, num_samples) at 16 kHz.

        Raises:
            RuntimeError: If torchaudio cannot decode the bytes.
        """
        waveform, sample_rate = torchaudio.load(io.BytesIO(audio_bytes))

        # Resample only when the container's rate differs from the model's.
        if sample_rate != self.sample_rate:
            resampler = torchaudio.transforms.Resample(
                orig_freq=sample_rate,
                new_freq=self.sample_rate,
            )
            waveform = resampler(waveform)

        # Downmix multi-channel audio to mono by averaging channels.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        return waveform

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process audio input and return speaker embedding.

        Args:
            data: Dictionary containing:
                - "inputs": Audio bytes or base64 encoded audio
                - "parameters": Optional parameters

        Returns:
            Dictionary with embedding vector, its dimension, and the model
            identifier; or {"error": ...} on any failure.
        """
        try:
            inputs = data.get("inputs")
            if inputs is None:
                return {"error": "No audio input provided"}

            audio_bytes = self._decode_input(inputs)
            if audio_bytes is None:
                return {"error": f"Unsupported input type: {type(inputs)}"}

            waveform = self._prepare_waveform(audio_bytes)

            # Inference only; no gradients needed.
            with torch.no_grad():
                embedding = self.model.encode_batch(waveform)
            embedding = embedding.squeeze().cpu().numpy()

            # L2-normalize so cosine similarity reduces to a dot product.
            # Guard against a zero vector (e.g. digital silence), which
            # would otherwise yield NaNs from 0/0.
            norm = np.linalg.norm(embedding)
            if norm == 0:
                return {"error": "Audio produced a zero embedding (silent input?)"}
            embedding = embedding / norm

            return {
                "embedding": embedding.tolist(),
                "dimension": len(embedding),
                "model": "speechbrain/spkrec-ecapa-voxceleb",
            }

        except Exception as e:
            # Endpoint boundary: surface failures as a JSON error payload
            # instead of letting the worker raise a 500.
            return {"error": str(e)}
|
|
|
|