sanoramyun8 committed on
Commit
683d6e8
·
verified ·
1 Parent(s): f2be6c2

Upload handler.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. handler.py +100 -0
handler.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Custom Handler for Speaker Embedding Extraction
3
+ Using SpeechBrain ECAPA-TDNN model for HuggingFace Inference Endpoints
4
+ """
5
+
6
+ from typing import Dict, List, Any
7
+ import torch
8
+ import torchaudio
9
+ import io
10
+ import numpy as np
11
+
12
+
13
class EndpointHandler:
    """
    HuggingFace Inference Endpoint handler for speaker embedding.

    Loads SpeechBrain's ECAPA-TDNN speaker-recognition model once at
    construction time. Each call decodes one audio payload, converts it to
    mono at ``self.sample_rate`` (16 kHz) and returns the L2-normalised
    speaker embedding (192-dimensional for this model).
    """

    # Hub identifier of the pretrained model; also echoed back in responses.
    MODEL_SOURCE = "speechbrain/spkrec-ecapa-voxceleb"

    def __init__(self, path: str = ""):
        """
        Initialize the handler by loading the SpeechBrain model.

        Args:
            path: Path to the model directory (provided by HuggingFace).
                When empty, the model files are stored under
                ``/tmp/spkrec-ecapa-voxceleb`` instead.
        """
        # Imported lazily so that merely importing this module does not
        # require SpeechBrain to be installed.
        from speechbrain.inference.speaker import EncoderClassifier

        self.model = EncoderClassifier.from_hparams(
            source=self.MODEL_SOURCE,
            savedir=path if path else "/tmp/spkrec-ecapa-voxceleb",
            run_opts={"device": "cpu"},
        )

        # Sample rate every input is converted to before encoding.
        self.sample_rate = 16000
        print("[SpeakerEmbedding] Model loaded successfully")

    @staticmethod
    def _decode_inputs(inputs: Any):
        """Return the raw audio bytes for *inputs*, or None if unsupported.

        Accepts bytes-like objects as-is and strings as base64, optionally
        wrapped in a ``data:<mime>;base64,<payload>`` URI.
        """
        if isinstance(inputs, (bytes, bytearray, memoryview)):
            return bytes(inputs)
        if isinstance(inputs, str):
            import base64

            payload = inputs.strip()
            # b64decode silently drops non-alphabet characters, so a data-URI
            # header would otherwise be decoded into leading garbage bytes.
            if payload.startswith("data:") and "," in payload:
                payload = payload.split(",", 1)[1]
            return base64.b64decode(payload)
        return None

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Process audio input and return speaker embedding.

        Args:
            data: Dictionary containing:
                - "inputs": Audio bytes or base64 encoded audio
                - "parameters": Optional parameters (currently unused)

        Returns:
            On success, a dictionary with "embedding" (list of floats,
            unit L2 norm), "dimension" (its length) and "model".
            On any failure, a dictionary with a single "error" message;
            exceptions are never propagated to the caller.
        """
        try:
            inputs = data.get("inputs")
            if inputs is None:
                return {"error": "No audio input provided"}

            audio_bytes = self._decode_inputs(inputs)
            if audio_bytes is None:
                return {"error": f"Unsupported input type: {type(inputs)}"}
            if not audio_bytes:
                return {"error": "Empty audio input"}

            # Decode the container/codec from an in-memory file object.
            waveform, sample_rate = torchaudio.load(io.BytesIO(audio_bytes))
            if waveform.numel() == 0:
                return {"error": "Audio contains no samples"}

            # Down-mix first: averaging and resampling are both linear, so
            # the result is the same but only one channel gets resampled.
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)

            if sample_rate != self.sample_rate:
                resampler = torchaudio.transforms.Resample(
                    orig_freq=sample_rate,
                    new_freq=self.sample_rate,
                )
                waveform = resampler(waveform)

            with torch.no_grad():
                embedding = self.model.encode_batch(waveform)
                embedding = embedding.squeeze().cpu().numpy()

            # Guard the normalisation: a zero or non-finite norm would turn
            # the whole vector into NaN/inf, which is not valid JSON and is
            # useless for similarity scoring.
            norm = float(np.linalg.norm(embedding))
            if not np.isfinite(norm) or norm == 0.0:
                return {"error": "Could not compute a valid embedding from the audio"}
            embedding = embedding / norm

            return {
                "embedding": embedding.tolist(),
                "dimension": len(embedding),
                "model": self.MODEL_SOURCE,
            }

        except Exception as e:
            # Endpoint boundary: report the failure in the response body
            # rather than letting the request crash.
            return {"error": str(e)}