Spaces:
Runtime error
Runtime error
| import torch | |
| import torchaudio | |
| import numpy as np | |
| from decoder_base import AcousticModel | |
class InferencePipeline:
    """Voice-conversion pipeline: content encoder -> acoustic model -> vocoder.

    The pipeline loads a HuBERT-soft content encoder and a HiFi-GAN vocoder
    through ``torch.hub``, an ``AcousticModel`` decoder from a local
    checkpoint, and a target speaker embedding from a ``.npy`` file.
    ``voice_conversion`` then converts one audio file and writes the result
    as a WAV file.
    """

    # Rate the source audio is resampled to and the output file is written at.
    SAMPLE_RATE = 16000

    def __init__(self, ckpt_path='model-best.pt', spk_emb_path='merkel.npy'):
        """Load the three models and the target speaker embedding.

        Args:
            ckpt_path: Path to the acoustic-model checkpoint; the state dict
                is read from its ``'acoustic-model'`` entry.
            spk_emb_path: Path to the ``.npy`` file holding the target
                speaker embedding.
        """
        cpu = torch.device('cpu')

        # download hubert content encoder
        self.hubert = torch.hub.load(
            "bshall/hubert:main", "hubert_soft", trust_repo=True
        )

        # initialize decoder with checkpoint
        self.model = AcousticModel()
        cp = torch.load(ckpt_path, map_location=cpu)
        self.model.load_state_dict(cp['acoustic-model'])

        # download vocoder
        self.hifigan = torch.hub.load(
            "bshall/hifigan:main",
            "hifigan_hubert_soft",
            trust_repo=True,
            map_location=cpu,
        )

        # load target speaker embedding and add a leading batch dimension
        self.trg_spk_emb = torch.from_numpy(np.load(spk_emb_path)).unsqueeze(0)

    @staticmethod
    def _to_mono(waveform):
        """Average the channels of a ``(channels, frames)`` waveform.

        A single-channel waveform is returned unchanged, so mono input takes
        exactly the same path it did before downmixing was introduced.
        """
        if waveform.size(0) == 1:
            return waveform
        return waveform.mean(dim=0, keepdim=True)

    def voice_conversion(self, audio_data, output_path="output.wav"):
        """Convert the given audio to the target speaker and save it.

        Args:
            audio_data: Path of the source audio file, or a tuple whose
                first element is that path.
            output_path: Where the converted WAV file is written.

        Returns:
            The path of the written output file.
        """
        # Extract the file path from the tuple
        # NOTE(review): if a caller passes Gradio numpy-style audio, the tuple
        # is (sample_rate, data) and index 0 is not a path - confirm callers
        # always supply a file path here.
        audio_path = audio_data[0] if isinstance(audio_data, tuple) else audio_data
        print(f"Loading audio from: {audio_path}")

        # load source audio; torchaudio.load yields (channels, frames)
        source, sr = torchaudio.load(audio_path)
        # Downmix first: after unsqueeze(0) a stereo file would otherwise
        # reach the encoder as (1, 2, frames) instead of (1, 1, frames).
        source = self._to_mono(source)
        source = torchaudio.functional.resample(source, sr, self.SAMPLE_RATE)
        source = source.unsqueeze(0)

        # run inference
        self.model.eval()
        with torch.inference_mode():
            # Extract speech units
            units = self.hubert.units(source)
            # Generate target spectrogram
            mel = self.model.generate(units, self.trg_spk_emb).transpose(1, 2)
            # Generate audio waveform
            target = self.hifigan(mel)

        # Drop the batch dimension and write the waveform to disk
        torchaudio.save(output_path, target.squeeze(0), self.SAMPLE_RATE)
        return output_path