Spaces:
Sleeping
Sleeping
File size: 1,622 Bytes
36e0dea | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | import torch
import torchaudio
from speechbrain.lobes.models.ECAPA_TDNN import ECAPA_TDNN
from core.config import MODEL_DIR, DEVICE, N_MELS
from audio.preprocessing import waveform_to_mel
import numpy as np
class ECAPAENCODER:
    """Speaker-embedding encoder wrapping a SpeechBrain ``ECAPA_TDNN`` model.

    The network is built with ``N_MELS`` input features and a 192-unit
    output layer, moved to ``DEVICE``, initialised from the checkpoint at
    ``MODEL_DIR`` and switched to eval mode.
    """

    def __init__(self):
        """Build the network and load its weights from ``MODEL_DIR``.

        The checkpoint must be a mapping that contains the key
        ``'ecapa_state_dict'``.
        """
        self.ecapa = ECAPA_TDNN(
            input_size=N_MELS,
            lin_neurons=192,
            channels=[512, 512, 512],
            kernel_sizes=[5, 3, 3],
            dilations=[1, 2, 3],
        ).to(DEVICE)
        # SECURITY NOTE(review): torch.load may unpickle arbitrary objects
        # (depending on the installed PyTorch version's `weights_only`
        # default); only point MODEL_DIR at trusted checkpoint files.
        checkpoint = torch.load(MODEL_DIR, map_location=DEVICE)
        self.ecapa.load_state_dict(checkpoint['ecapa_state_dict'])
        self.ecapa.eval()

    @staticmethod
    def _as_1d_waveform(waveform):
        """Return *waveform* as a 1-D tensor, accepting ``[T]`` or ``[1, T]``.

        Raises:
            ValueError: if the input is neither ``[T]`` nor ``[1, T]``.
        """
        if waveform.dim() == 2 and waveform.shape[0] == 1:
            waveform = waveform.squeeze(0)
        if waveform.dim() != 1:
            raise ValueError(f"Expected waveform [T], got {waveform.shape}")
        return waveform

    @staticmethod
    def _embedding_to_vector(emb):
        """Convert a model output to a NumPy array, dropping singleton axes.

        SpeechBrain's ECAPA_TDNN returns ``[batch, 1, lin_neurons]``, so a
        single ``squeeze(0)`` would leave a stray leading axis. All leading
        axes are collapsed into one batch axis first; that axis is then
        removed when it has size 1, giving ``[lin_neurons]`` for one
        utterance.
        """
        rows = emb.reshape(-1, emb.shape[-1])  # [batch, dim]
        return rows.squeeze(0).cpu().numpy()

    @torch.no_grad()
    def encode(self, waveform):
        """Compute the speaker embedding of a single utterance.

        Args:
            waveform: Tensor of shape ``[T]`` (``[1, T]`` is also accepted).

        Returns:
            np.ndarray of shape ``[192]``.

        Raises:
            ValueError: if *waveform* is neither ``[T]`` nor ``[1, T]``.
        """
        waveform = self._as_1d_waveform(waveform)
        waveform = waveform.float().to(DEVICE)

        mel = waveform_to_mel(waveform)
        # NOTE(review): SpeechBrain's ECAPA_TDNN consumes
        # [batch, time, features]; confirm that waveform_to_mel already
        # returns that layout (an earlier comment here said
        # [1, n_mels, frames]).

        # ---- critical debug line (keep this while testing) ----
        print("ECAPA INPUT SHAPE:", mel.shape)

        emb = self.ecapa(mel)
        return self._embedding_to_vector(emb)
|