""" Custom Handler for Speaker Embedding Extraction Using SpeechBrain ECAPA-TDNN model for HuggingFace Inference Endpoints """ from typing import Dict, List, Any import torch import torchaudio import io import numpy as np class EndpointHandler: """ HuggingFace Inference Endpoint Handler for Speaker Embedding Extracts speaker embeddings using SpeechBrain's ECAPA-TDNN model. Returns 192-dimensional embedding vectors for speaker verification. """ def __init__(self, path: str = ""): """ Initialize the handler by loading the SpeechBrain model. Args: path: Path to the model directory (provided by HuggingFace) """ from speechbrain.inference.speaker import EncoderClassifier # Load ECAPA-TDNN model from SpeechBrain self.model = EncoderClassifier.from_hparams( source="speechbrain/spkrec-ecapa-voxceleb", savedir=path if path else "/tmp/spkrec-ecapa-voxceleb", run_opts={"device": "cpu"} ) self.sample_rate = 16000 print("[SpeakerEmbedding] Model loaded successfully") def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: """ Process audio input and return speaker embedding. Args: data: Dictionary containing: - "inputs": Audio bytes or base64 encoded audio - "parameters": Optional parameters Returns: Dictionary with embedding vector """ try: # Get audio data from request inputs = data.get("inputs") if inputs is None: return {"error": "No audio input provided"} # Handle different input formats if isinstance(inputs, bytes): audio_bytes = inputs elif isinstance(inputs, str): # Base64 encoded import base64 audio_bytes = base64.b64decode(inputs) else: return {"error": f"Unsupported input type: {type(inputs)}"} # Load audio from bytes audio_buffer = io.BytesIO(audio_bytes) waveform, sample_rate = torchaudio.load(audio_buffer) # Resample if necessary if sample_rate != self.sample_rate: resampler = torchaudio.transforms.Resample( orig_freq=sample_rate, new_freq=self.sample_rate ) waveform = resampler(waveform) # Convert to mono if stereo if waveform.shape[0] > 1: waveform = torch.mean(waveform, dim=0, keepdim=True) # Extract embedding with torch.no_grad(): embedding = self.model.encode_batch(waveform) embedding = embedding.squeeze().cpu().numpy() # Normalize embedding embedding = embedding / np.linalg.norm(embedding) return { "embedding": embedding.tolist(), "dimension": len(embedding), "model": "speechbrain/spkrec-ecapa-voxceleb" } except Exception as e: return {"error": str(e)}