# EmpathyVC / app.py
import json
import librosa
import torch
import numpy as np
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from mel_processing import spectrogram_torch
import gradio as gr
import stream
import os
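# Speaker-embedding extractor (ONNX); used below to find the reference speaker
# closest to the input voice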
inference = stream.Inference(onnx_path='./ckpt/model.onnx', lang='chs')
def cs(a, b):
    """Row-wise cosine similarity between matrix `a` (N, D) and vector `b`; returns shape (1, N)."""
    return np.dot(a, b.reshape(-1, 1)).T / (np.linalg.norm(a, axis=1) * np.linalg.norm(b))
devices = "cpu"  # change to "cuda" to run on GPU
# Load the configuration file
hps = utils.get_hparams_from_file("./configs/aishell3_base.json")
# Load the model
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=175,  # number of speaker embeddings expected by the checkpoint
    **hps.model)
if devices == 'cuda':
    net_g = net_g.cuda()
_ = net_g.eval()
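# Restore the generator weights; the optimizer argument is None since this is inference only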
_ = utils.load_checkpoint("./ckpt/G_000.pth", net_g, None)
# _ = utils.load_checkpoint("./ckpt/G_910000.pth", net_g, None)
# Load the reference speaker embeddings
speaker_embedding = []
with open('speaker_embedding.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # Each line is "<speaker>|<embedding>"; the embedding is assumed to be a
        # JSON-style list of floats
        speaker_embedding.append(json.loads(line.split('|')[1]))
speaker_embedding = np.array(speaker_embedding)
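# speaker_embedding is now a (num_reference_speakers, embedding_dim) matrix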
def vc(mic, tag_s, tt):  # `tt` (the target speaker audio input) is currently unused
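    """Convert the voice in `mic` (a (sample_rate, int16 array) tuple from gr.Audio)
    to the selected target speaker; returns (sample_rate, int16 array)."""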
    # Map the dropdown choice to a hard-coded target speaker id
    if tag_s == 'Male':
        tag_s = 41
    else:
        tag_s = 112
    sr, data = mic
    # `data` is an int16 numpy array; normalize to [-1, 1] and resample to 22.05 kHz
    if sr != 22050:
        data = librosa.resample(data.astype(np.float32) / 32767.0,
                                orig_sr=sr, target_sr=22050)
    else:
        data = data.astype(np.float32) / 32767.0
    audio_norm = torch.FloatTensor(data).unsqueeze(0)
    temp_speaker_embedding = inference.extract_embedding_wav(audio_norm)
    # Cosine similarity against every reference embedding; the closest match
    # becomes the source speaker id
    dist = cs(speaker_embedding, temp_speaker_embedding).reshape(-1)
    sid = int(np.argmax(dist))
    print('Most similar speaker id: %d' % sid)
    # Linear spectrogram; the STFT parameters should match hps.data
    # (n_fft=1024, sr=22050, hop=256, win=1024)
    spec = spectrogram_torch(audio_norm, 1024,
                             22050, 256, 1024,
                             center=False)
    with torch.no_grad():
        spec_lengths = torch.LongTensor([spec.shape[2]])
        # The +1 offset: speaker ids in the checkpoint appear to be 1-indexed
        sid_src = torch.LongTensor([sid + 1])
        sid_tgt = torch.LongTensor([tag_s])
        if devices == 'cuda':
            spec = spec.cuda()
            spec_lengths = spec_lengths.cuda()
            sid_src = sid_src.cuda()
            sid_tgt = sid_tgt.cuda()
        audio = net_g.voice_conversion(spec, spec_lengths, sid_src, sid_tgt=sid_tgt)[0][
            0, 0].data.cpu().float().numpy()
    # Append a short random silence (0.12-0.35 s) so the clip does not end abruptly
    pad = np.zeros(int(np.random.uniform(0.12, 0.35) * 22050), dtype=np.float32)
    audio = np.concatenate((audio * 32767.0, pad))
    # Clip before the int16 cast so loud samples cannot wrap around
    audio = np.clip(audio, -32768, 32767).astype(np.int16)
    return 22050, audio
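# A minimal local smoke test, bypassing the UI (hypothetical file path; assumes a
# mono int16 wav and the soundfile package):
#   import soundfile as sf
#   wav, sr = sf.read('audio/source_man.wav', dtype='int16')
#   out_sr, out = vc((sr, wav), 'Female', None)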
audio_dir = os.path.join(os.path.dirname(__file__), "audio")

demo = gr.Interface(
    fn=vc,
    inputs=[
        gr.Audio(label='Source Speaker'),
        gr.Dropdown(label="Target Speaker", choices=['Male', 'Female']),
        gr.Audio(label='Target Speaker Audio')
    ],
    outputs=gr.Audio(label="Output"),
    cache_examples=False,
    examples=[
        [os.path.join(audio_dir, "AISHELL-3-SSB1863-0001.wav"), 'Male',
         os.path.join(audio_dir, "source_man.wav")],
        [os.path.join(audio_dir, "AISHELL3-SSB0122-0001.wav"), 'Male',
         os.path.join(audio_dir, "source_man.wav")],
        [os.path.join(audio_dir, "AISHELL-3-SSB1863-0001.wav"), 'Female',
         os.path.join(audio_dir, "source_female.wav")],
        [os.path.join(audio_dir, "AISHELL3-SSB0122-0001.wav"), 'Female',
         os.path.join(audio_dir, "source_female.wav")],
        [os.path.join(audio_dir, "baker-000001.wav"), 'Female',
         os.path.join(audio_dir, "source_female.wav")],
        [os.path.join(audio_dir, "LJSpeech-001-0001.wav"), 'Male',
         os.path.join(audio_dir, "source_man.wav")],
    ],
    title='Empathy-VC',
    description="Note: this Space runs on CPU, so inference times will be higher."
)

if __name__ == "__main__":
    demo.launch(server_name='0.0.0.0')