import ast
import json
import os

import gradio as gr
import librosa
import numpy as np
import torch

import stream
import utils
from mel_processing import spectrogram_torch, create_wav_header
from models import SynthesizerTrn
from text.symbols import symbols
|
# Speaker-embedding extractor backed by an ONNX model; used below (in vc) to
# find the enrolled speaker closest to the incoming voice.
# NOTE(review): lang='chs' presumably selects a Chinese model — confirm in `stream`.
inference = stream.Inference(onnx_path='./ckpt/model.onnx', lang='chs')
|
def cs(a, b):
    """Cosine similarity between each row of ``a`` and the vector ``b``.

    Args:
        a: 2-D array of shape (n, d) — one embedding per row.
        b: 1-D array of length d (any shape that flattens to d).

    Returns:
        Array of shape (1, n) with the cosine similarity of every row of
        ``a`` against ``b``.
    """
    column = b.reshape(-1, 1)
    # Row-wise dot products, transposed to a (1, n) row vector.
    scores = np.dot(a, column).T
    scale = np.linalg.norm(a, axis=1) * np.linalg.norm(b)
    return scores / scale
|
|
|
# Inference device; the cuda branches below activate when this is set to 'cuda'.
devices = "cpu"

# Audio/model hyper-parameters for the AISHELL-3 base configuration.
hps = utils.get_hparams_from_file("./configs/aishell3_base.json")
|
|
|
|
|
# Build the synthesizer network used for voice conversion.
# NOTE(review): n_speakers=175 is hard-coded — presumably it matches the
# checkpoint's speaker table size; confirm against G_000.pth.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=175,
    **hps.model)
if devices == 'cuda':
    net_g = net_g.cuda()
# Switch to eval mode (disables dropout and other train-only behavior).
_ = net_g.eval()

# Load generator weights; optimizer arg is None (inference only).
_ = utils.load_checkpoint("./ckpt/G_000.pth", net_g, None)
|
|
|
|
|
|
|
# Load enrolled speaker embeddings: one per line, formatted "<id>|<list>".
# ast.literal_eval replaces the original eval() — it parses only Python
# literals, so a malicious or corrupted embedding file cannot execute code.
speaker_embedding = []
with open('speaker_embedding.txt', 'r', encoding='utf-8') as f:
    for line in f:
        speaker_embedding.append(ast.literal_eval(line.split('|')[1]))
# (n_speakers, embedding_dim) matrix used for cosine-similarity matching in vc().
speaker_embedding = np.array(speaker_embedding)
|
|
|
def vc(mic, tag_s, tt):
    """Convert the voice in ``mic`` to a fixed target speaker.

    Args:
        mic: Gradio audio tuple ``(sample_rate, int16 ndarray)``.
        tag_s: ``'Male'`` or ``'Female'`` — mapped to a hard-coded target
            speaker id in the model's speaker table.
        tt: target-speaker audio input from the UI; currently unused by the
            conversion itself (kept for interface compatibility).

    Returns:
        Tuple ``(22050, int16 ndarray)`` — the converted audio plus a short
        random silence tail.
    """
    # Map the UI choice onto a fixed speaker id.
    tag_s = 41 if tag_s == 'Male' else 112
    sr, data = mic

    # Normalize int16 PCM to [-1, 1]; resample to the model rate if needed.
    data = data.astype(np.float32) / 32767.0
    if sr != 22050:
        # Keyword arguments are required: positional orig_sr/target_sr were
        # removed in librosa 0.10 (the original positional call now raises).
        data = librosa.resample(data, orig_sr=sr, target_sr=22050)
    contents = torch.FloatTensor(data.astype(np.float32))
    audio_norm = contents.unsqueeze(0)
    temp_speaker_embedding = inference.extract_embedding_wav(audio_norm)

    # Pick the enrolled speaker most similar to the input voice.
    dist = cs(speaker_embedding, temp_speaker_embedding).reshape(-1)
    sid = int(np.argmax(dist))
    print('最相似的sid为 %d' % sid)
    spec = spectrogram_torch(audio_norm, 1024,
                             22050, 256, 1024,
                             center=False)
    with torch.no_grad():
        spec_lengths = torch.LongTensor([spec.shape[2]])
        # NOTE(review): +1 presumably offsets for a reserved speaker id 0 in
        # the model's speaker table — confirm against training setup.
        sid_src = torch.LongTensor([sid + 1])
        sid_tgt = torch.LongTensor([tag_s])
        if devices == 'cuda':
            spec = spec.cuda()
            spec_lengths = spec_lengths.cuda()
            sid_src = sid_src.cuda()
            sid_tgt = sid_tgt.cuda()

        audio = net_g.voice_conversion(spec, spec_lengths, sid_src, sid_tgt=sid_tgt)[0][
            0, 0].data.cpu().float().numpy()
    # Append a short random silence tail (0.12–0.35 s) and return int16 PCM.
    space_time = np.zeros(int(np.random.uniform(0.12, 0.35) * 22050), dtype=np.int16)
    audio = audio * 32767.0
    audio = np.concatenate((audio, space_time))
    audio = audio.astype(np.short)
    return 22050, audio
|
|
|
|
|
# Gradio UI: source audio + target-gender dropdown + (unused) target audio in,
# converted audio out. Examples are built from a compact (source, gender,
# reference) table instead of repeating os.path.join per entry.
demo = gr.Interface(
    fn=vc,
    inputs=[
        gr.Audio(label='Source Speaker'),
        gr.components.Dropdown(label="Target Speaker", choices=['Male', 'Female']),
        gr.Audio(label='Target Speaker Audio'),
    ],
    outputs=gr.Audio(label="Output"),
    cache_examples=False,
    examples=[
        [os.path.join(os.path.dirname(__file__), src),
         gender,
         os.path.join(os.path.dirname(__file__), ref)]
        for src, gender, ref in [
            ("audio/AISHELL-3-SSB1863-0001.wav", 'Male', "audio/source_man.wav"),
            ("audio/AISHELL3-SSB0122-0001.wav", 'Male', "audio/source_man.wav"),
            ("audio/AISHELL-3-SSB1863-0001.wav", 'Female', "audio/source_female.wav"),
            ("audio/AISHELL3-SSB0122-0001.wav", 'Female', "audio/source_female.wav"),
            ("audio/baker-000001.wav", 'Female', "audio/source_female.wav"),
            ("audio/LJSpeech-001-0001.wav", 'Male', "audio/source_man.wav"),
        ]
    ],
    title='Empathy-VC',
    description="Note: This space is running on CPU, inference times will be higher."
)

# Bind to all interfaces so the demo is reachable from outside the container.
demo.launch(server_name='0.0.0.0')
|
|
|