yukie committed on
Commit
3653122
·
1 Parent(s): f4828a8

feature: Support off-key analysis

Browse files
Files changed (2) hide show
  1. app.py +36 -19
  2. inference/infer_tool.py +45 -1
app.py CHANGED
@@ -39,7 +39,9 @@ class YukieGradio:
39
  在使用此模型前请阅读[AI雪绘Yukie模型使用协议](https://huggingface.co/spaces/yukie/yukie-sovits3/edit/main/terms.md)
40
 
41
  # start!
42
- 上传一段**纯人声**干音(60s以内),然后点击提交即可开始推理!
 
 
43
 
44
  **请使用无bgm,无混响的人声来进行生成推理,否则效果可能会较差**
45
  """)
@@ -47,6 +49,7 @@ class YukieGradio:
47
  "唱歌特化", "杂谈特化"], value="唱歌特化", interactive=True)
48
  self.dev = gr.Dropdown(label="设备(云端一般请勿切换,使用默认值即可)", choices=[
49
  "cuda", "cpu"], value="cpu", interactive=True)
 
50
  self.inAudio = gr.Audio(label="上传音频")
51
  self.needLogmmse = gr.Checkbox(label="是否使用自带降噪")
52
  self.slice_db = gr.Slider(label="切片阈值(较嘈杂时-30,保留呼吸声时-50,一般默认-40)",
@@ -54,39 +57,45 @@ class YukieGradio:
54
  self.vcTransform = gr.Number(
55
  label="升降调(整数,可以正负,半音数量,升高八度就是12)", value=0)
56
  self.vcSubmit = gr.Button("转换", variant="primary")
57
- self.outVcText = gr.Textbox(label="Output Message")
 
58
  self.outAudio = gr.Audio(
59
  source="upload", type="numpy", label="Output Audio")
 
 
60
  gr.Markdown(value="""
61
  ## 注意
62
  如果要在本地使用该demo,请使用 `git lfs clone https://huggingface.co/spaces/yukie/yukie-sovits3`克隆该仓库([简单教程](https://huggingface.co/spaces/yukie/yukie-sovits3/edit/main/local.md))
63
  """)
64
- self.vcSubmit.click(infer, inputs=[self.inAudio, self.vcTransform, self.slice_db, self.needLogmmse, self.sid, self.dev], outputs=[
65
- self.outVcText, self.outAudio])
66
 
67
 
68
- def infer(inAudio, transform, slice_db, lm, sid, dev):
69
  if inAudio != None:
70
- sampling_rate, audio = inAudio
71
  else:
72
- return "请上传一段音频后再次尝试", None
 
 
 
73
 
74
  print("start inference")
75
  start_time = time.time()
76
  # 预处理,重编码
77
- audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
78
- if len(audio.shape) > 1:
79
- audio = librosa.to_mono(audio.transpose(1, 0))
80
  if sampling_rate != 32000:
81
- audio = librosa.resample(
82
- audio, orig_sr=sampling_rate, target_sr=32000)
83
  if lm:
84
- audio = logmmse(audio, 32000)
85
 
86
- out_wav_path = "tmp.wav"
87
- soundfile.write(out_wav_path, audio, 32000, format="wav")
88
- chunks = slicer.cut(out_wav_path, db_thresh=slice_db)
89
- audio_data, audio_sr = slicer.chunks2audio(out_wav_path, chunks)
90
 
91
  audio = []
92
  sid = sid_map[sid]
@@ -108,8 +117,16 @@ def infer(inAudio, transform, slice_db, lm, sid, dev):
108
  audio.extend(list(_audio))
109
  audio = (np.array(audio) * 32768.0).astype('int16')
110
  used_time = time.time() - start_time
111
- out_str = ("Success! total use time:{}s".format(used_time))
112
- return out_str, (32000, audio)
 
 
 
 
 
 
 
 
113
 
114
 
115
  if __name__ == "__main__":
 
39
  在使用此模型前请阅读[AI雪绘Yukie模型使用协议](https://huggingface.co/spaces/yukie/yukie-sovits3/edit/main/terms.md)
40
 
41
  # start!
42
+ 上传一段**纯人声**干音(推荐60s以内),或者直接使用网站录音(二者只能选其一,优先使用上传音频)
43
+
44
+ 然后点击提交即可开始推理!
45
 
46
  **请使用无bgm,无混响的人声来进行生成推理,否则效果可能会较差**
47
  """)
 
49
  "唱歌特化", "杂谈特化"], value="唱歌特化", interactive=True)
50
  self.dev = gr.Dropdown(label="设备(云端一般请勿切换,使用默认值即可)", choices=[
51
  "cuda", "cpu"], value="cpu", interactive=True)
52
+ self.inMic = gr.Microphone(label="录音")
53
  self.inAudio = gr.Audio(label="上传音频")
54
  self.needLogmmse = gr.Checkbox(label="是否使用自带降噪")
55
  self.slice_db = gr.Slider(label="切片阈值(较嘈杂时-30,保留呼吸声时-50,一般默认-40)",
 
57
  self.vcTransform = gr.Number(
58
  label="升降调(整数,可以正负,半音数量,升高八度就是12)", value=0)
59
  self.vcSubmit = gr.Button("转换", variant="primary")
60
+ self.outVcText = gr.Textbox(
61
+ label="音高平均偏差半音数量,体现转换音频的跑调情况(一般小于0.5)")
62
  self.outAudio = gr.Audio(
63
  source="upload", type="numpy", label="Output Audio")
64
+ self.f0_image = gr.Image(
65
+ label="f0曲线,蓝色为输入音高,橙色为合成音频的音高(代码有误差)")
66
  gr.Markdown(value="""
67
  ## 注意
68
  如果要在本地使用该demo,请使用 `git lfs clone https://huggingface.co/spaces/yukie/yukie-sovits3`克隆该仓库([简单教程](https://huggingface.co/spaces/yukie/yukie-sovits3/edit/main/local.md))
69
  """)
70
+ self.vcSubmit.click(infer, inputs=[self.inMic, self.inAudio, self.vcTransform, self.slice_db, self.needLogmmse, self.sid, self.dev], outputs=[
71
+ self.outVcText, self.outAudio, self.f0_image])
72
 
73
 
74
+ def infer(inMic, inAudio, transform, slice_db, lm, sid, dev):
75
  if inAudio != None:
76
+ sampling_rate, inaudio = inAudio
77
  else:
78
+ if inMic != None:
79
+ sampling_rate, inaudio = inMic
80
+ else:
81
+ return "请上传一段音频后再次尝试", None
82
 
83
  print("start inference")
84
  start_time = time.time()
85
  # 预处理,重编码
86
+ inaudio = (inaudio / np.iinfo(inaudio.dtype).max).astype(np.float32)
87
+ if len(inaudio.shape) > 1:
88
+ inaudio = librosa.to_mono(inaudio.transpose(1, 0))
89
  if sampling_rate != 32000:
90
+ inaudio = librosa.resample(
91
+ inaudio, orig_sr=sampling_rate, target_sr=32000)
92
  if lm:
93
+ inaudio = logmmse(inaudio, 32000)
94
 
95
+ ori_wav_path = "tmp_ori.wav"
96
+ soundfile.write(ori_wav_path, inaudio, 32000, format="wav")
97
+ chunks = slicer.cut(ori_wav_path, db_thresh=slice_db)
98
+ audio_data, audio_sr = slicer.chunks2audio(ori_wav_path, chunks)
99
 
100
  audio = []
101
  sid = sid_map[sid]
 
117
  audio.extend(list(_audio))
118
  audio = (np.array(audio) * 32768.0).astype('int16')
119
  used_time = time.time() - start_time
120
+
121
+ out_wav_path = "tmp.wav"
122
+ soundfile.write(out_wav_path, audio, 32000, format="wav")
123
+
124
+ mistake, var = svc_model.calc_error(ori_wav_path, out_wav_path, transform)
125
+ out_picture = svc_model.f0_plt(ori_wav_path, out_wav_path, transform)
126
+ out_str = ("Success! total use time:{}s\n半音偏差:{}\n半音方差:{}".format(
127
+ used_time, mistake, var))
128
+
129
+ return out_str, (32000, audio), gr.Image.update("temp.jpg")
130
 
131
 
132
  if __name__ == "__main__":
inference/infer_tool.py CHANGED
@@ -13,10 +13,13 @@ import parselmouth
13
  import soundfile
14
  import torch
15
  import torchaudio
 
16
 
17
  from hubert import hubert_model
18
  import utils
 
19
  from models import SynthesizerTrn
 
20
 
21
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
22
 
@@ -190,7 +193,7 @@ class Svc(object):
190
  self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
191
  **self.hps_ms.model)
192
  _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
193
- if "half" in self.net_g_path and self.dev == "cuda":
194
  _ = self.net_g_ms.half().eval().to(self.dev)
195
  else:
196
  _ = self.net_g_ms.eval().to(self.dev)
@@ -234,6 +237,47 @@ class Svc(object):
234
  print("vits use time:{}".format(use_time))
235
  return audio, audio.shape[-1]
236
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
 
238
  # class SvcONNXInferModel(object):
239
  # def __init__(self, hubert_onnx, vits_onnx, config_path):
 
13
  import soundfile
14
  import torch
15
  import torchaudio
16
+ import pyworld
17
 
18
  from hubert import hubert_model
19
  import utils
20
+ # from preprocess_hubert_f0 import compute_f0
21
  from models import SynthesizerTrn
22
+ import matplotlib.pyplot as plt
23
 
24
  logging.getLogger('matplotlib').setLevel(logging.WARNING)
25
 
 
193
  self.hps_ms.train.segment_size // self.hps_ms.data.hop_length,
194
  **self.hps_ms.model)
195
  _ = utils.load_checkpoint(self.net_g_path, self.net_g_ms, None)
196
+ if "half" in self.net_g_path and self.dev == torch.device("cuda"):
197
  _ = self.net_g_ms.half().eval().to(self.dev)
198
  else:
199
  _ = self.net_g_ms.eval().to(self.dev)
 
237
  print("vits use time:{}".format(use_time))
238
  return audio, audio.shape[-1]
239
 
240
+ def f0_plt(self, in_path, out_path, tran):
241
+ s1, input_pitch = self.get_unit_pitch(in_path, tran)
242
+ s2, output_pitch = self.get_unit_pitch(out_path, 0)
243
+ plt.clf()
244
+ plt.plot(plt_pitch(input_pitch), color="#66ccff")
245
+ plt.plot(plt_pitch(output_pitch), color="orange")
246
+ plt.savefig("temp.jpg")
247
+
248
+ def calc_error(self, in_path, out_path, tran):
249
+ input_pitch = compute_f0(in_path)
250
+ output_pitch = compute_f0(out_path)
251
+ sum_y = []
252
+ if np.sum(input_pitch == 0) / len(input_pitch) > 0.9:
253
+ mistake, var_take = 0, 0
254
+ else:
255
+ for i in range(min(len(input_pitch), len(output_pitch))):
256
+ if input_pitch[i] > 0 and output_pitch[i] > 0:
257
+ sum_y.append(
258
+ abs(f0_to_pitch(output_pitch[i]) - (f0_to_pitch(input_pitch[i]) + tran)))
259
+ num_y = 0
260
+ for x in sum_y:
261
+ num_y += x
262
+ len_y = len(sum_y) if len(sum_y) else 1
263
+ mistake = round(float(num_y / len_y), 2)
264
+ var_take = round(float(np.std(sum_y, ddof=1)), 2)
265
+ return mistake, var_take
266
+
267
+
268
def compute_f0(path):
    """Extract the frame-wise fundamental frequency (f0) of an audio file.

    Uses pyworld's DIO estimator refined by stonemask, with a hop of 320
    samples (10 ms at 32 kHz) so frames line up with the model's hop length.

    Args:
        path: audio file path; the audio is resampled to 32 kHz on load.

    Returns:
        1-D numpy float array of f0 values in Hz (0.0 for unvoiced frames),
        rounded to one decimal place.
    """
    x, sr = librosa.load(path, sr=32000)
    # librosa.load already resampled to the requested rate; cheap sanity check.
    assert sr == 32000
    wav64 = x.astype(np.double)  # pyworld requires float64; convert once
    f0, t = pyworld.dio(
        wav64,
        fs=sr,
        f0_ceil=800,  # above typical vocal range; rejects harmonic octaves
        frame_period=1000 * 320 / sr,  # ms per frame == 320-sample hop
    )
    # Use sr instead of the original magic literal 32000 (same value).
    f0 = pyworld.stonemask(wav64, f0, t, sr)
    # Vectorized replacement for the original per-element rounding loop.
    return np.round(f0, 1)
281
 
282
  # class SvcONNXInferModel(object):
283
  # def __init__(self, hubert_onnx, vits_onnx, config_path):