| | import os
|
| | import torch
|
| | import librosa
|
| | import soundfile as sf
|
| | from pathlib import Path
|
| |
|
| | from transformers import WavLMModel
|
| | from freevc.utils import load_checkpoint, get_hparams_from_file
|
| | from freevc.models import SynthesizerTrn
|
| |
|
| |
|
| | from speaker_encoder.voice_encoder import SpeakerEncoder
|
| |
|
| |
|
def get_freevc_models(path='freevc', speaker_path='../pre_ckpts/spk_encoder/pretrained.pt', device='cuda'):
    """Load the FreeVC synthesizer, the WavLM content encoder and the speaker encoder.

    Args:
        path: Root directory of the FreeVC package; the config is read from
            ``<path>/configs/freevc.json``. (Previously ignored in favor of a
            hard-coded path; the default reproduces the old behavior exactly.)
        speaker_path: Path to a speaker-encoder checkpoint.
            NOTE(review): this parameter was ignored by the original code, which
            hard-coded 'speaker_encoder/ckpt/pretrained_bak_5805000.pt'. That
            hard-coded checkpoint is kept to preserve behavior — confirm which
            checkpoint is actually intended before wiring this parameter in.
        device: Torch device string used for all three models.

    Returns:
        Tuple ``(freevc, cmodel, smodel, hps)``: the synthesizer, the WavLM
        model, the speaker encoder, and the parsed hyper-parameters.
    """
    # Build the config path from `path` instead of hard-coding "freevc/...";
    # identical result for the default argument.
    hps = get_hparams_from_file(os.path.join(path, "configs", "freevc.json"))
    freevc = SynthesizerTrn(
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        **hps.model).to(device)
    freevc.eval()
    load_checkpoint("../prepare_freevc/ckpts/freevc.pth", freevc, None)

    # WavLM content encoder (downloads/loads pretrained weights).
    cmodel = WavLMModel.from_pretrained("microsoft/wavlm-large").to(device)
    cmodel.eval()

    # Plain string literal — the original used an f-string with no placeholders.
    smodel = SpeakerEncoder("speaker_encoder/ckpt/pretrained_bak_5805000.pt", device)

    return freevc, cmodel, smodel, hps
|
| |
|
| |
|
@torch.no_grad()
def convert(freevc, content, speaker, sr=24000):
    """Run voice conversion and return the waveform as a numpy array.

    Args:
        freevc: Synthesizer exposing ``.infer(content, g=speaker)`` and
            returning a tensor indexable as ``out[0][0]`` (batch 0, channel 0).
        content: Content features (e.g. WavLM hidden states, transposed).
        speaker: Speaker embedding tensor, passed to ``infer`` as ``g``.
        sr: Sample rate reported alongside the output. Defaults to 24000,
            matching the previously hard-coded value, so existing callers
            are unaffected.

    Returns:
        Tuple ``(audio, sr)``: 1-D float32 numpy waveform and its sample rate.
    """
    audio = freevc.infer(content, g=speaker)
    # Select batch 0 / channel 0; use .detach() instead of the deprecated
    # .data attribute (we are inside no_grad anyway).
    audio = audio[0][0].detach().cpu().float().numpy()
    return audio, sr
|
| |
|
| |
|
if __name__ == '__main__':
    freevc_24, cmodel, smodel, hps = get_freevc_models()

    tgt = 'p226_002.wav'  # reference utterance for the target speaker
    src = 'p225_001.wav'  # source utterance whose content is converted
    device = 'cuda'

    # Target speaker embedding from the silence-trimmed reference audio.
    wav_tgt, _ = librosa.load(tgt, sr=hps.data.sampling_rate)
    wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
    g_tgt = smodel.embed_utterance(wav_tgt)
    g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(device)

    # Content features from the source utterance. Run under no_grad: the
    # original built an autograd graph through WavLM for no reason, wasting
    # memory. The output is already on `device` (wav_src is), so the extra
    # .to(device) was redundant and is dropped.
    wav_src, _ = librosa.load(src, sr=hps.data.sampling_rate)
    wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(device)
    with torch.no_grad():
        content = cmodel(wav_src).last_hidden_state.transpose(1, 2)

    output, sr = convert(freevc_24, content, g_tgt)

    sf.write('output.wav', output, sr)