Spaces:
Runtime error
Runtime error
feature: Support off-key analysis
Browse files- app.py +36 -19
- inference/infer_tool.py +45 -1
app.py
CHANGED
|
@@ -39,7 +39,9 @@ class YukieGradio:
|
|
| 39 |
在使用此模型前请阅读[AI雪绘Yukie模型使用协议](https://huggingface.co/spaces/yukie/yukie-sovits3/edit/main/terms.md)
|
| 40 |
|
| 41 |
# start!
|
| 42 |
-
上传一段**纯人声**干音(60s以内),
|
|
|
|
|
|
|
| 43 |
|
| 44 |
**请使用无bgm,无混响的人声来进行生成推理,否则效果可能会较差**
|
| 45 |
""")
|
|
@@ -47,6 +49,7 @@ class YukieGradio:
|
|
| 47 |
"唱歌特化", "杂谈特化"], value="唱歌特化", interactive=True)
|
| 48 |
self.dev = gr.Dropdown(label="设备(云端一般请勿切换,使用默认值即可)", choices=[
|
| 49 |
"cuda", "cpu"], value="cpu", interactive=True)
|
|
|
|
| 50 |
self.inAudio = gr.Audio(label="上传音频")
|
| 51 |
self.needLogmmse = gr.Checkbox(label="是否使用自带降噪")
|
| 52 |
self.slice_db = gr.Slider(label="切片阈值(较嘈杂时-30,保留呼吸声时-50,一般默认-40)",
|
|
@@ -54,39 +57,45 @@ class YukieGradio:
|
|
| 54 |
self.vcTransform = gr.Number(
|
| 55 |
label="升降调(整数,可以正负,半音数量,升高八度就是12)", value=0)
|
| 56 |
self.vcSubmit = gr.Button("转换", variant="primary")
|
| 57 |
-
self.outVcText = gr.Textbox(
|
|
|
|
| 58 |
self.outAudio = gr.Audio(
|
| 59 |
source="upload", type="numpy", label="Output Audio")
|
|
|
|
|
|
|
| 60 |
gr.Markdown(value="""
|
| 61 |
## 注意
|
| 62 |
如果要在本地使用该demo,请使用 `git lfs clone https://huggingface.co/spaces/yukie/yukie-sovits3`克隆该仓库([简单教程](https://huggingface.co/spaces/yukie/yukie-sovits3/edit/main/local.md))
|
| 63 |
""")
|
| 64 |
-
self.vcSubmit.click(infer, inputs=[self.inAudio, self.vcTransform, self.slice_db, self.needLogmmse, self.sid, self.dev], outputs=[
|
| 65 |
-
self.outVcText, self.outAudio])
|
| 66 |
|
| 67 |
|
| 68 |
-
def infer(inAudio, transform, slice_db, lm, sid, dev):
|
| 69 |
if inAudio != None:
|
| 70 |
-
sampling_rate,
|
| 71 |
else:
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
print("start inference")
|
| 75 |
start_time = time.time()
|
| 76 |
# 预处理,重编码
|
| 77 |
-
|
| 78 |
-
if len(
|
| 79 |
-
|
| 80 |
if sampling_rate != 32000:
|
| 81 |
-
|
| 82 |
-
|
| 83 |
if lm:
|
| 84 |
-
|
| 85 |
|
| 86 |
-
|
| 87 |
-
soundfile.write(
|
| 88 |
-
chunks = slicer.cut(
|
| 89 |
-
audio_data, audio_sr = slicer.chunks2audio(
|
| 90 |
|
| 91 |
audio = []
|
| 92 |
sid = sid_map[sid]
|
|
@@ -108,8 +117,16 @@ def infer(inAudio, transform, slice_db, lm, sid, dev):
|
|
| 108 |
audio.extend(list(_audio))
|
| 109 |
audio = (np.array(audio) * 32768.0).astype('int16')
|
| 110 |
used_time = time.time() - start_time
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
|
| 115 |
if __name__ == "__main__":
|
|
|
|
| 39 |
在使用此模型前请阅读[AI雪绘Yukie模型使用协议](https://huggingface.co/spaces/yukie/yukie-sovits3/edit/main/terms.md)
|
| 40 |
|
| 41 |
# start!
|
| 42 |
+
上传一段**纯人声**干音(推荐60s以内),或者直接使用网站录音(二者只能选其一,优先使用上传音频)
|
| 43 |
+
|
| 44 |
+
然后点击提交即可开始推理!
|
| 45 |
|
| 46 |
**请使用无bgm,无混响的人声来进行生成推理,否则效果可能会较差**
|
| 47 |
""")
|
|
|
|
| 49 |
"唱歌特化", "杂谈特化"], value="唱歌特化", interactive=True)
|
| 50 |
self.dev = gr.Dropdown(label="设备(云端一般请勿切换,使用默认值即可)", choices=[
|
| 51 |
"cuda", "cpu"], value="cpu", interactive=True)
|
| 52 |
+
self.inMic = gr.Microphone(label="录音")
|
| 53 |
self.inAudio = gr.Audio(label="上传音频")
|
| 54 |
self.needLogmmse = gr.Checkbox(label="是否使用自带降噪")
|
| 55 |
self.slice_db = gr.Slider(label="切片阈值(较嘈杂时-30,保留呼吸声时-50,一般默认-40)",
|
|
|
|
| 57 |
self.vcTransform = gr.Number(
|
| 58 |
label="升降调(整数,可以正负,半音数量,升高八度就是12)", value=0)
|
| 59 |
self.vcSubmit = gr.Button("转换", variant="primary")
|
| 60 |
+
self.outVcText = gr.Textbox(
|
| 61 |
+
label="音高平均偏差半音数量,体现转换音频的跑调情况(一般小于0.5)")
|
| 62 |
self.outAudio = gr.Audio(
|
| 63 |
source="upload", type="numpy", label="Output Audio")
|
| 64 |
+
self.f0_image = gr.Image(
|
| 65 |
+
label="f0曲线,蓝色为输入音高,橙色为合成音频的音高(代码有误差)")
|
| 66 |
gr.Markdown(value="""
|
| 67 |
## 注意
|
| 68 |
如果要在本地使用该demo,请使用 `git lfs clone https://huggingface.co/spaces/yukie/yukie-sovits3`克隆该仓库([简单教程](https://huggingface.co/spaces/yukie/yukie-sovits3/edit/main/local.md))
|
| 69 |
""")
|
| 70 |
+
self.vcSubmit.click(infer, inputs=[self.inMic, self.inAudio, self.vcTransform, self.slice_db, self.needLogmmse, self.sid, self.dev], outputs=[
|
| 71 |
+
self.outVcText, self.outAudio, self.f0_image])
|
| 72 |
|
| 73 |
|
| 74 |
+
def infer(inMic, inAudio, transform, slice_db, lm, sid, dev):
|
| 75 |
if inAudio != None:
|
| 76 |
+
sampling_rate, inaudio = inAudio
|
| 77 |
else:
|
| 78 |
+
if inMic != None:
|
| 79 |
+
sampling_rate, inaudio = inMic
|
| 80 |
+
else:
|
| 81 |
+
return "请上传一段音频后再次尝试", None
|
| 82 |
|
| 83 |
print("start inference")
|
| 84 |
start_time = time.time()
|
| 85 |
# 预处理,重编码
|
| 86 |
+
inaudio = (inaudio / np.iinfo(inaudio.dtype).max).astype(np.float32)
|
| 87 |
+
if len(inaudio.shape) > 1:
|
| 88 |
+
inaudio = librosa.to_mono(inaudio.transpose(1, 0))
|
| 89 |
if sampling_rate != 32000:
|
| 90 |
+
inaudio = librosa.resample(
|
| 91 |
+
inaudio, orig_sr=sampling_rate, target_sr=32000)
|
| 92 |
if lm:
|
| 93 |
+
inaudio = logmmse(inaudio, 32000)
|
| 94 |
|
| 95 |
+
ori_wav_path = "tmp_ori.wav"
|
| 96 |
+
soundfile.write(ori_wav_path, inaudio, 32000, format="wav")
|
| 97 |
+
chunks = slicer.cut(ori_wav_path, db_thresh=slice_db)
|
| 98 |
+
audio_data, audio_sr = slicer.chunks2audio(ori_wav_path, chunks)
|
| 99 |
|
| 100 |
audio = []
|
| 101 |
sid = sid_map[sid]
|
|
|
|
| 117 |
audio.extend(list(_audio))
|
| 118 |
audio = (np.array(audio) * 32768.0).astype('int16')
|
| 119 |
used_time = time.time() - start_time
|
| 120 |
+
|
| 121 |
+
out_wav_path = "tmp.wav"
|
| 122 |
+
soundfile.write(out_wav_path, audio, 32000, format="wav")
|
| 123 |
+
|
| 124 |
+
mistake, var = svc_model.calc_error(ori_wav_path, out_wav_path, transform)
|
| 125 |
+
out_picture = svc_model.f0_plt(ori_wav_path, out_wav_path, transform)
|
| 126 |
+
out_str = ("Success! total use time:{}s\n半音偏差:{}\n半音方差:{}".format(
|
| 127 |
+
used_time, mistake, var))
|
| 128 |
+
|
| 129 |
+
return out_str, (32000, audio), gr.Image.update("temp.jpg")
|
| 130 |
|
| 131 |
|
| 132 |
if __name__ == "__main__":
|
inference/infer_tool.py
CHANGED
|
@@ -13,10 +13,13 @@ import parselmouth
|
|
| 13 |
import soundfile
|
| 14 |
import torch
|
| 15 |
import torchaudio
|
|
|
|
| 16 |
|
| 17 |
from hubert import hubert_model
|
| 18 |
import utils
|
|
|
|
| 19 |
from models import SynthesizerTrn
|
|
|
|
| 20 |
|
| 21 |
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
| 22 |
|
|
@@ -190,7 +193,7 @@ class Svc(object):
|
|
| 190 |
self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
|
| 191 |
**self.hps_ms.model)
|
| 192 |
_ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
|
| 193 |
-
if "half" in self.net_g_path and self.dev == "cuda":
|
| 194 |
_ = self.net_g_ms.half().eval().to(self.dev)
|
| 195 |
else:
|
| 196 |
_ = self.net_g_ms.eval().to(self.dev)
|
|
@@ -234,6 +237,47 @@ class Svc(object):
|
|
| 234 |
print("vits use time:{}".format(use_time))
|
| 235 |
return audio, audio.shape[-1]
|
| 236 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
# class SvcONNXInferModel(object):
|
| 239 |
# def __init__(self, hubert_onnx, vits_onnx, config_path):
|
|
|
|
| 13 |
import soundfile
|
| 14 |
import torch
|
| 15 |
import torchaudio
|
| 16 |
+
import pyworld
|
| 17 |
|
| 18 |
from hubert import hubert_model
|
| 19 |
import utils
|
| 20 |
+
# from preprocess_hubert_f0 import compute_f0
|
| 21 |
from models import SynthesizerTrn
|
| 22 |
+
import matplotlib.pyplot as plt
|
| 23 |
|
| 24 |
logging.getLogger('matplotlib').setLevel(logging.WARNING)
|
| 25 |
|
|
|
|
| 193 |
self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
|
| 194 |
**self.hps_ms.model)
|
| 195 |
_ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
|
| 196 |
+
if "half" in self.net_g_path and self.dev == torch.device("cuda"):
|
| 197 |
_ = self.net_g_ms.half().eval().to(self.dev)
|
| 198 |
else:
|
| 199 |
_ = self.net_g_ms.eval().to(self.dev)
|
|
|
|
| 237 |
print("vits use time:{}".format(use_time))
|
| 238 |
return audio, audio.shape[-1]
|
| 239 |
|
| 240 |
+
def f0_plt(self, in_path, out_path, tran):
    """Plot the input and output pitch curves and save the figure to disk.

    The pitch of ``in_path`` is extracted with ``tran`` applied and the pitch
    of ``out_path`` with no transposition, so both curves are drawn on the
    same scale. The figure is written to ``temp.jpg`` in the current working
    directory, overwriting any previous plot.

    Args:
        in_path: Path of the source audio file.
        out_path: Path of the converted audio file.
        tran: Transposition forwarded to ``get_unit_pitch`` for the input
            (presumably semitones -- confirm against ``get_unit_pitch``).

    Returns:
        The path of the saved image (``"temp.jpg"``).
    """
    image_path = "temp.jpg"
    # get_unit_pitch returns a pair; only its second element is plotted here.
    _, input_pitch = self.get_unit_pitch(in_path, tran)
    _, output_pitch = self.get_unit_pitch(out_path, 0)
    # Clear the current pyplot figure so curves from an earlier call do not
    # pile up in the new image.
    plt.clf()
    plt.plot(plt_pitch(input_pitch), color="#66ccff")
    plt.plot(plt_pitch(output_pitch), color="orange")
    plt.savefig(image_path)
    return image_path
|
| 247 |
+
|
| 248 |
+
def calc_error(self, in_path, out_path, tran):
    """Measure how far the converted audio's pitch drifts from the target.

    F0 is computed for both files with ``compute_f0``. For every frame index
    where both f0 values are positive, the absolute difference between
    ``f0_to_pitch(output)`` and ``f0_to_pitch(input) + tran`` is collected.

    Args:
        in_path: Path of the source audio file.
        out_path: Path of the converted audio file.
        tran: Offset added to the input pitch before comparison
            (presumably semitones -- ``f0_to_pitch`` is defined elsewhere).

    Returns:
        A ``(mistake, var_take)`` pair, both rounded to two decimals:
        ``mistake`` is the mean absolute deviation and ``var_take`` is the
        sample standard deviation (``ddof=1``) of the deviations. ``(0, 0)``
        is returned when the input is empty or more than 90% of its frames
        have an f0 of zero. ``var_take`` is ``0.0`` when fewer than two
        frames could be compared, because the sample standard deviation is
        undefined there and would otherwise come back as NaN.
    """
    input_pitch = compute_f0(in_path)
    output_pitch = compute_f0(out_path)

    # Nothing meaningful to compare for empty or almost fully unvoiced input;
    # the length check also avoids a division by zero in the ratio below.
    if len(input_pitch) == 0 or np.sum(input_pitch == 0) / len(input_pitch) > 0.9:
        return 0, 0

    # zip stops at the shorter sequence, matching the min(len, len) bound.
    deviations = [
        abs(f0_to_pitch(f0_out) - (f0_to_pitch(f0_in) + tran))
        for f0_in, f0_out in zip(input_pitch, output_pitch)
        if f0_in > 0 and f0_out > 0
    ]

    if not deviations:
        return 0.0, 0.0

    mistake = round(float(sum(deviations) / len(deviations)), 2)
    if len(deviations) > 1:
        var_take = round(float(np.std(deviations, ddof=1)), 2)
    else:
        # A single sample has no spread; np.std(..., ddof=1) would give NaN.
        var_take = 0.0
    return mistake, var_take
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def compute_f0(path):
    """Extract the f0 contour of an audio file.

    The file is loaded with ``librosa.load`` at 32 kHz, analysed with
    ``pyworld.dio`` (``f0_ceil=800``, frame period equal to 320 samples) and
    refined with ``pyworld.stonemask``.

    Args:
        path: Path of the audio file to analyse.

    Returns:
        The refined f0 array returned by ``pyworld.stonemask``, rounded to
        one decimal place.
    """
    x, sr = librosa.load(path, sr=32000)
    # Sanity check: the frame period below is derived from this rate.
    assert sr == 32000
    # pyworld expects double precision; convert once and reuse for both calls.
    signal = x.astype(np.double)
    f0, t = pyworld.dio(
        signal,
        fs=sr,
        f0_ceil=800,
        frame_period=1000 * 320 / sr,
    )
    f0 = pyworld.stonemask(signal, f0, t, sr)
    # Vectorised rounding replaces the per-element Python loop.
    return np.round(f0, 1)
|
| 281 |
|
| 282 |
# class SvcONNXInferModel(object):
|
| 283 |
# def __init__(self, hubert_onnx, vits_onnx, config_path):
|