|
|
import torch |
|
|
import torchaudio |
|
|
import numpy as np |
|
|
from decoder_base import AcousticModel |
|
|
|
|
|
class InferencePipeline():
    """End-to-end soft-VC voice conversion pipeline.

    Chains three pretrained components (all on CPU):
      HuBERT-soft content encoder -> acoustic model -> HiFi-GAN vocoder,
    converting an input utterance into the fixed target speaker's voice.
    """

    def __init__(self,
                 ckpt_path='model-best.pt',
                 spk_emb_path='content/vctk/spk_emb/p226/p226_322_mic1.npy'):
        """Load the pretrained models and the target speaker embedding.

        Args:
            ckpt_path: path to the acoustic-model checkpoint; the state dict
                is stored under the 'acoustic-model' key.
            spk_emb_path: .npy file holding the target speaker's embedding
                vector (defaults to VCTK speaker p226).
        """
        # Content encoder: HuBERT-soft from the soft-vc release.
        self.hubert = torch.hub.load(
            "bshall/hubert:main", "hubert_soft", trust_repo=True)

        # Acoustic model: maps (units, speaker embedding) -> mel spectrogram.
        self.model = AcousticModel()
        cp = torch.load(ckpt_path, map_location=torch.device('cpu'))
        self.model.load_state_dict(cp['acoustic-model'])

        # Vocoder: HiFi-GAN trained on soft-VC mel spectrograms.
        self.hifigan = torch.hub.load(
            "bshall/hifigan:main", "hifigan_hubert_soft",
            trust_repo=True, map_location=torch.device('cpu'))

        # Target speaker embedding, unsqueezed to (1, D) for a batch of one.
        self.trg_spk_emb = torch.from_numpy(np.load(spk_emb_path)).unsqueeze(0)

    def voice_conversion(self, audio_file_path):
        """Convert the utterance at *audio_file_path* to the target speaker.

        Args:
            audio_file_path: path to the source audio file.

        Returns:
            Path of the written 16 kHz WAV file ("output.wav").
        """
        self.model.eval()

        # BUG FIX: hubert.units() expects a (1, 1, T) waveform tensor, not a
        # file path. Load the audio, resample to the 16 kHz rate the models
        # were trained on, and add the batch dimension.
        source, sr = torchaudio.load(audio_file_path)
        source = torchaudio.functional.resample(source, sr, 16000)
        source = source.unsqueeze(0)

        with torch.inference_mode():
            units = self.hubert.units(source)

            # generate() returns (batch, frames, mels); the vocoder expects
            # (batch, mels, frames), hence the transpose.
            mel = self.model.generate(units, self.trg_spk_emb).transpose(1, 2)

            target = self.hifigan(mel)

        output_audio_path = "output.wav"
        # BUG FIX: HiFi-GAN emits (batch, channels, time); torchaudio.save
        # requires a 2-D (channels, time) tensor, so drop the batch dim.
        torchaudio.save(output_audio_path, target.squeeze(0).cpu(),
                        sample_rate=16000)

        return output_audio_path
|
|
|
|
|
|