Spaces:
Runtime error
Runtime error
| import io | |
| import gradio as gr | |
| import librosa | |
| import numpy as np | |
| import soundfile | |
| from inference import slicer | |
| from inference.infer_tool import Svc | |
| import logging | |
| from logmmse import logmmse | |
| from typing import Tuple | |
| import time | |
# Only let WARNING and above through from the "numba" logger.
logging.getLogger("numba").setLevel(logging.WARNING)

# Model configuration shared by both checkpoints.
config_name = "configs/config.json"

# One checkpoint per voice style.
model_sing = "logs/32k/sing1.pth"
model_talk = "logs/32k/talk1.pth"

# Maps the label shown in the voice dropdown to the internal style key.
sid_map = {
    "唱歌特化": "sing",
    "杂谈特化": "talk",
}
class YukieGradio:
    """Gradio Blocks page for the voice-conversion demo.

    Constructing an instance builds the whole page and stores it in
    ``self.UI``. Every input and output widget is kept as an instance
    attribute, and the submit button is wired to the module-level ``infer``
    function.
    """

    # Introductory text rendered at the top of the "Basic" tab.
    _INTRO_MD = """
# 前言
* 本demo基于[sovits 3.0 32khz版本](https://github.com/innnky/so-vits-svc)训练的
* 由于雪宝唱歌和说话声线差异较大,一个模型包括说话和唱歌两部分声线会使得工作量较大(声纹识别等),故而选择将这两部分分开训练;目前这一版中,杂谈声线数据较差,仍需后续更新
# 声明
在使用此模型前请阅读[AI雪绘Yukie模型使用协议](https://huggingface.co/spaces/yukie/yukie-sovits3/edit/main/terms.md)
# start!
上传一段**纯人声**干音(推荐60s以内),或者直接使用网站录音(二者只能选其一,优先使用上传音频)
然后点击提交即可开始推理!
**请使用无bgm,无混响的人声来进行生成推理,否则效果可能会较差**
"""

    # Footer note rendered below the outputs.
    _LOCAL_USAGE_MD = """
## 注意
如果要在本地使用该demo,请使用 `git lfs clone https://huggingface.co/spaces/yukie/yukie-sovits3`克隆该仓库([简单教程](https://huggingface.co/spaces/yukie/yukie-sovits3/edit/main/local.md))
"""

    def __init__(self):
        self.UI = gr.Blocks()
        # Contexts are entered left to right, so the tab container and the
        # tab itself are still created inside the Blocks context.
        with self.UI, gr.Tabs(), gr.TabItem("Basic"):
            gr.Markdown(value=self._INTRO_MD)

            # --- inputs ---
            self.sid = gr.Dropdown(
                label="音色",
                choices=["唱歌特化", "杂谈特化"],
                value="唱歌特化",
                interactive=True,
            )
            self.dev = gr.Dropdown(
                label="设备(云端一般请勿切换,使用默认值即可)",
                choices=["cuda", "cpu"],
                value="cpu",
                interactive=True,
            )
            self.inMic = gr.Microphone(label="录音")
            self.inAudio = gr.Audio(label="上传音频")
            self.needLogmmse = gr.Checkbox(label="是否使用自带降噪")
            self.slice_db = gr.Slider(
                label="切片阈值(较嘈杂时-30,保留呼吸声时-50,一般默认-40)",
                maximum=32767,
                minimum=-32768,
                step=0.1,
                value=-40,
            )
            self.vcTransform = gr.Number(
                label="升降调(整数,可以正负,半音数量,升高八度就是12)",
                value=0,
            )
            self.vcSubmit = gr.Button("转换", variant="primary")

            # --- outputs ---
            self.outVcText = gr.Textbox(
                label="音高平均偏差半音数量,体现转换音频的跑调情况(一般小于0.5)",
            )
            self.outAudio = gr.Audio(
                source="upload",
                type="numpy",
                label="Output Audio",
            )
            self.f0_image = gr.Image(
                label="f0曲线,蓝色为输入音高,橙色为合成音频的音高(代码有误差)",
            )

            gr.Markdown(value=self._LOCAL_USAGE_MD)

            # The order of these lists must match infer's parameter order
            # and the order of the values it returns.
            handler_inputs = [
                self.inMic,
                self.inAudio,
                self.vcTransform,
                self.slice_db,
                self.needLogmmse,
                self.sid,
                self.dev,
            ]
            handler_outputs = [self.outVcText, self.outAudio, self.f0_image]
            self.vcSubmit.click(
                infer, inputs=handler_inputs, outputs=handler_outputs)
def infer(inMic, inAudio, transform, slice_db, lm, sid, dev):
    """Run voice conversion on an uploaded or recorded clip.

    Args:
        inMic: ``(sampling_rate, samples)`` from the microphone widget, or
            ``None`` when nothing was recorded.
        inAudio: ``(sampling_rate, samples)`` from the upload widget, or
            ``None``. Takes precedence over ``inMic`` when both are given.
        transform: Pitch-shift value forwarded unchanged to the model.
        slice_db: Threshold forwarded to ``slicer.cut`` as ``db_thresh``.
        lm: When truthy, the resampled signal is passed through ``logmmse``.
        sid: Label chosen in the voice dropdown; must be a key of ``sid_map``.
        dev: Device string forwarded to ``Svc``.

    Returns:
        A 3-tuple matching the three outputs of the submit handler: a status
        message, ``(32000, int16_samples)`` (or ``None``), and an image
        update (or ``None``).
    """
    source = inAudio if inAudio is not None else inMic
    if source is None:
        # Three outputs are wired to this handler, so three values must be
        # returned even on the error path.
        return "请上传一段音频后再次尝试", None, None
    sampling_rate, inaudio = source

    print("start inference")
    start_time = time.time()

    # Pre-processing: re-encode to float32 mono at 32 kHz.
    if np.issubdtype(inaudio.dtype, np.integer):
        inaudio = (inaudio / np.iinfo(inaudio.dtype).max).astype(np.float32)
    else:
        # np.iinfo rejects float dtypes. Float input is assumed to already be
        # normalised to [-1, 1] — TODO confirm against the widget's output.
        inaudio = inaudio.astype(np.float32)
    if inaudio.ndim > 1:
        inaudio = librosa.to_mono(inaudio.transpose(1, 0))
    if sampling_rate != 32000:
        inaudio = librosa.resample(
            inaudio, orig_sr=sampling_rate, target_sr=32000)
    if lm:
        inaudio = logmmse(inaudio, 32000)

    # NOTE(review): fixed file names are shared by every request, so
    # concurrent requests would overwrite each other's files.
    ori_wav_path = "tmp_ori.wav"
    soundfile.write(ori_wav_path, inaudio, 32000, format="wav")
    chunks = slicer.cut(ori_wav_path, db_thresh=slice_db)
    audio_data, audio_sr = slicer.chunks2audio(ori_wav_path, chunks)

    model_path = model_sing if sid_map[sid] == "sing" else model_talk
    svc_model = Svc(model_path, config_name, dev=dev)

    pieces = []
    for slice_tag, data in audio_data:
        if slice_tag:
            # Tagged slices are not converted; they are replaced by zeros of
            # the equivalent length at the model's sample rate.
            length = int(
                np.ceil(len(data) / audio_sr * svc_model.target_sample))
            pieces.append(np.zeros(length))
            continue
        raw_path = io.BytesIO()
        soundfile.write(raw_path, data, audio_sr, format="wav")
        raw_path.seek(0)
        out_audio, _ = svc_model.infer("yukie", transform, raw_path)
        pieces.append(out_audio.cpu().numpy())

    samples = np.concatenate(pieces) if pieces else np.zeros(0)
    # Clip before the cast: a sample at or above +1.0 scales to 32768, which
    # would otherwise wrap around to a negative int16 value.
    audio = np.clip(samples * 32768.0, -32768, 32767).astype("int16")
    used_time = time.time() - start_time

    out_wav_path = "tmp.wav"
    soundfile.write(out_wav_path, audio, 32000, format="wav")
    mistake, var = svc_model.calc_error(ori_wav_path, out_wav_path, transform)
    # NOTE(review): the return value is unused and the image below is read
    # from "temp.jpg"; presumably f0_plt writes that file — confirm.
    svc_model.f0_plt(ori_wav_path, out_wav_path, transform)
    out_str = ("Success! total use time:{}s\n半音偏差:{}\n半音方差:{}".format(
        used_time, mistake, var))
    return out_str, (32000, audio), gr.Image.update("temp.jpg")
# Script entry point: build the page, then start the Gradio server.
if __name__ == "__main__":
    app = YukieGradio()
    blocks = app.UI
    blocks.launch()