| import re |
| import os |
| import warnings |
|
|
| import gradio as gr |
| import torch |
|
|
| import numpy as np |
|
|
| |
| import os |
| import torch |
| import librosa |
| import numpy as np |
| import soundfile as sf |
| import pyworld as pw |
| import parselmouth |
| from ast import literal_eval |
| from slicer import Slicer |
| from ddsp.vocoder import load_model, F0_Extractor, Volume_Extractor, Units_Encoder |
| from ddsp.core import upsample |
| from enhancer import Enhancer |
| from tqdm import tqdm |
|
|
| import zipfile |
| import urllib.request |
|
|
model_folder = "/models/"
os.makedirs("pretrain/hubert/", exist_ok=True)


def _download_if_missing(url, file_path):
    """Download *url* to *file_path* unless the file already exists."""
    # Fix: this download-if-missing pattern was duplicated inline for both
    # checkpoints; factored into one helper.
    if not os.path.exists(file_path):
        urllib.request.urlretrieve(url, file_path)


# HuBERT-soft units encoder checkpoint
_download_if_missing(
    'https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt',
    'pretrain/hubert/hubert-soft-0d54a1f4.pt')

# NSF-HiFiGAN vocoder / enhancer checkpoint (zip)
file_path = 'pretrain/nsf_hifigan_20221211.zip'
_download_if_missing(
    'https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-v1/nsf_hifigan_20221211.zip',
    file_path)

# NOTE(review): the zip is re-extracted on every launch even when already
# unpacked — wasteful but harmless, behavior kept as-is.
with zipfile.ZipFile(file_path, 'r') as zip_ref:
    zip_ref.extractall('./pretrain/')

dl = os.listdir('./pretrain/')
print(dl)
|
|
|
|
def list_model():
    """Return the ``.pt`` model filenames found in ``model_folder``.

    Side effect: updates the global ``pth_path`` — the original logic keeps
    the longest (ties: last-seen) filename as the default model selection.

    Returns:
        list[str]: model filenames (not full paths) ending in ``.pt``.
    """
    global pth_path
    res = []
    # Fix: local was named `dir`, shadowing the builtin.
    model_dir = os.getcwd() + model_folder
    for f in os.listdir(model_dir):
        if f.endswith(".pt"):
            res.append(f)
            if len(f) >= len(pth_path):
                pth_path = f
    print(res)
    return res
|
|
|
|
# Default model name; list_model() may replace it with the longest .pt found.
pth_path = "model_best.pt"
models = list_model()
print("pth_path:" + pth_path)

# Sample audio files offered in the UI examples widget.
examples = ["samples/" + name for name in os.listdir("samples")]
|
|
|
|
def svc_main(input_path, model_path, key=0, enhance=False, enhancer_adaptive_key=0,
             pitch_extractor='crepe', f0_min='50', f0_max='1100', threhold='-60',
             spk_mix_dict='None', spk_id='1', read_head=0, keys=None):
    """Convert the voice in ``input_path`` using DDSP model ``model_path``.

    Args:
        input_path: path to the source audio file.
        model_path: model filename resolved under ``os.getcwd() + model_folder``.
        key: pitch transposition in semitones; used only when ``keys`` is empty.
        enhance: if True, run the NSF-HiFiGAN enhancer on each segment.
        enhancer_adaptive_key: adaptive-key option forwarded to the enhancer.
        pitch_extractor: F0 extractor name ('parselmouth'/'dio'/'harvest'/'crepe').
        f0_min, f0_max, threhold, spk_id: string-typed UI values, parsed here.
            (``threhold`` keeps its original misspelled name for caller compat.)
        spk_mix_dict: literal-eval'd string; a dict enables mix-speaker mode,
            'None' selects single-speaker mode via ``spk_id``.
        read_head: when > 0, convert only the first ``read_head`` seconds.
        keys: list of transpositions to render; defaults to ``[key]``.

    Returns:
        list[str]: paths of the rendered output wav files, one per distinct key.
    """
    # Bug fix: the original signature used a mutable default (keys=[]) that was
    # mutated below, so keys accumulated across calls sharing the default and
    # callers' lists could be modified. Copy defensively instead.
    keys = list(keys) if keys is not None else []
    print('input: ' + input_path)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # load the DDSP model and its training args
    model, args = load_model(
        os.getcwd() + model_folder + model_path, device=device)

    # load input; optionally trim leading silence and keep only the head
    if read_head > 0:
        audio, sample_rate = librosa.load(
            input_path, sr=None, duration=(40 + read_head))
        audio, _ = librosa.effects.trim(audio, top_db=20)
        audio = audio[0:int(sample_rate * read_head)]
    else:
        audio, sample_rate = librosa.load(input_path, sr=None)

    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio)

    duration = librosa.get_duration(y=audio, sr=sample_rate)
    print("duration:", duration)
    # hop size in input samples per model frame
    hop_size = args.data.block_size * sample_rate / args.data.sampling_rate

    # extract volume envelope and build a smoothed, upsampled silence mask
    print('Extracting the volume envelope of the input audio...')
    volume_extractor = Volume_Extractor(hop_size)
    volume = volume_extractor.extract(audio)
    mask = (volume > 10 ** (float(threhold) / 20)).astype('float')
    mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
    # 9-frame max filter widens voiced regions so onsets aren't clipped
    mask = np.array([np.max(mask[n: n + 9]) for n in range(len(mask) - 8)])
    mask = torch.from_numpy(mask).float().to(device).unsqueeze(-1).unsqueeze(0)
    mask = upsample(mask, args.data.block_size).squeeze(-1)
    volume = torch.from_numpy(volume).float().to(
        device).unsqueeze(-1).unsqueeze(0)

    # load units encoder (e.g. HuBERT-soft, per the model's training config)
    units_encoder = Units_Encoder(
        args.data.encoder,
        args.data.encoder_ckpt,
        args.data.encoder_sample_rate,
        args.data.encoder_hop_size,
        device=device)

    # load enhancer only when requested
    if enhance:
        print('Enhancer type: ' + args.enhancer.type)
        enhancer = Enhancer(args.enhancer.type,
                            args.enhancer.ckpt, device=device)

    # parse speaker options; a dict enables mix-speaker mode
    spk_mix_dict = literal_eval(spk_mix_dict)
    if spk_mix_dict is not None:
        print('Mix-speaker mode')
    else:
        print('Speaker ID: ' + str(int(spk_id)))
    spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(device)

    # extract F0 once; per-key transposition is applied in the loop below
    print('Pitch extractor type: ' + pitch_extractor)
    pitch_extractor = F0_Extractor(
        pitch_extractor,
        sample_rate,
        hop_size,
        float(f0_min),
        float(f0_max))
    print('Extracting the pitch curve of the input audio...')
    f1 = pitch_extractor.extract(audio, uv_interp=True, device=device)
    f1 = torch.from_numpy(f1).float().to(device).unsqueeze(-1).unsqueeze(0)

    # render one output file per requested key
    output_paths = []
    if len(keys) < 1:
        keys.append(key)
    print("keys:", keys)
    for key in keys:
        # 'h' suffix marks head-only previews
        if read_head > 0:
            output_path = input_path + '.key' + str(key) + 'h.wav'
        else:
            output_path = input_path + '.key' + str(key) + '.wav'

        # duplicate keys map to the same path — skip re-rendering
        if output_path in output_paths:
            continue
        f0 = f1 * 2 ** (float(key) / 12)

        result = np.zeros(0)
        current_length = 0
        segments = split(audio, sample_rate, hop_size)
        print('Cut the input audio into ' + str(len(segments)) + ' slices')
        with torch.no_grad():
            for segment in tqdm(segments):
                start_frame = segment[0]
                seg_input = torch.from_numpy(
                    segment[1]).float().unsqueeze(0).to(device)
                seg_units = units_encoder.encode(
                    seg_input, sample_rate, hop_size)

                seg_f0 = f0[:, start_frame: start_frame + seg_units.size(1), :]
                seg_volume = volume[:,
                                    start_frame: start_frame + seg_units.size(1), :]

                seg_output, _, (s_h, s_n) = model(
                    seg_units, seg_f0, seg_volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict)
                seg_output *= mask[:, start_frame * args.data.block_size: (
                    start_frame + seg_units.size(1)) * args.data.block_size]

                if enhance:
                    seg_output, output_sample_rate = enhancer.enhance(
                        seg_output,
                        args.data.sampling_rate,
                        seg_f0,
                        args.data.block_size,
                        adaptive_key=float(enhancer_adaptive_key))
                else:
                    output_sample_rate = args.data.sampling_rate

                seg_output = seg_output.squeeze().cpu().numpy()

                # pad with silence up to this segment's start, or cross-fade
                # into the previous segment when they overlap
                silent_length = round(start_frame * args.data.block_size *
                                      output_sample_rate / args.data.sampling_rate) - current_length
                if silent_length >= 0:
                    result = np.append(result, np.zeros(silent_length))
                    result = np.append(result, seg_output)
                else:
                    result = cross_fade(result, seg_output,
                                        current_length + silent_length)
                current_length = current_length + \
                    silent_length + len(seg_output)
        sf.write(output_path, result, output_sample_rate)
        output_paths.append(output_path)
        print("finish:", output_path)
    return output_paths
|
|
|
|
def split(audio, sample_rate, hop_size, db_thresh=-40, min_len=5000):
    """Slice ``audio`` on silence and return a list of (start_frame, samples).

    Frame indices are measured in model frames of ``hop_size`` input samples;
    zero-length chunks reported by the slicer are dropped.
    """
    slicer = Slicer(
        sr=sample_rate,
        threshold=db_thresh,
        min_length=min_len)
    segments = []
    for chunk in dict(slicer.slice(audio)).values():
        bounds = chunk["split_time"].split(",")
        # skip empty chunks (identical start/end markers)
        if bounds[0] == bounds[1]:
            continue
        start_frame = int(int(bounds[0]) // hop_size)
        end_frame = int(int(bounds[1]) // hop_size)
        if end_frame <= start_frame:
            continue
        segments.append((
            start_frame,
            audio[int(start_frame * hop_size): int(end_frame * hop_size)]))
    return segments
|
|
|
|
def cross_fade(a: np.ndarray, b: np.ndarray, idx: int):
    """Concatenate ``a`` and ``b`` with a linear cross-fade.

    ``b`` starts at sample ``idx`` of ``a``; the overlapping region
    ``a[idx:]`` / ``b[:len(a)-idx]`` is blended with a linear ramp.
    """
    fade_len = a.shape[0] - idx
    ramp = np.linspace(0, 1.0, num=fade_len, endpoint=True)
    blended = (1 - ramp) * a[idx:] + ramp * b[:fade_len]
    return np.concatenate((a[:idx], blended, b[fade_len:]))
|
|
|
|
def svc(input_path, model_path, key=0, enhance=False, enhancer_adaptive_key=0,
        pitch_extractor='crepe', f0_min='50', f0_max='1100', threhold='-60',
        spk_mix_dict='None', spk_id='1', read_head=0, keys=None):
    """Single-output wrapper around :func:`svc_main` for the main UI tab.

    Returns the first rendered output path, or ``""`` when nothing was rendered.
    """
    # Bug fix: the original default keys=[] was a shared mutable list that
    # svc_main mutated, so keys accumulated across calls. Pass a fresh list.
    result = svc_main(input_path, model_path, key, enhance, enhancer_adaptive_key,
                      pitch_extractor, f0_min, f0_max, threhold, spk_mix_dict,
                      spk_id, read_head, keys if keys is not None else [])
    if len(result) > 0:
        return result[0]
    return ""
|
|
|
|
def svc_head(input_path, model_path, key=0, enhance=False, enhancer_adaptive_key=0,
             pitch_extractor='crepe', read_head=30, key_center=0, key_step=1):
    """Render 5 head-only previews at keys center-2*step .. center+2*step.

    Returns five ``gr.Audio.update`` values, one per preview key, labelled
    with the key they were rendered at.
    """
    center = float(key_center)
    preview_keys = [step * key_step + center for step in range(-2, 3)]
    rendered = svc_main(input_path, model_path, key, enhance,
                        enhancer_adaptive_key, pitch_extractor,
                        read_head=read_head, keys=preview_keys)
    return tuple(
        gr.Audio.update(label="Key " + str(preview_keys[i]), value=rendered[i])
        for i in range(5))
|
|
|
|
# --- Gradio UI: two tabs (full conversion / 5-key head preview) -------------
app = gr.Blocks()
with app:
    with gr.Tabs():
        # Tab 1: convert the whole file at a single key.
        with gr.TabItem("DDSP-SVC"):
            with gr.Row():
                with gr.Column():
                    svc_input = gr.Audio(
                        type="filepath", label="Input")
                    with gr.Row():
                        # model dropdown ("模型" = "model"), defaults to the
                        # pth_path chosen by list_model()
                        svc_model = gr.Dropdown(
                            choices=models, label="模型", value=pth_path, visible=True)
                        svc_key = gr.Number(value=0, label="Key")
                        svc_pe = gr.Dropdown(choices=[
                            "parselmouth", "dio", "harvest", "crepe"], value='parselmouth', label='Pitch extractor')
                        svc_enhance = gr.Checkbox(value=False, label='Enhance')
                        svc_eak = gr.Number(
                            value=0, label='Enhancer adaptive key')
                with gr.Column():
                    svc_output = gr.Audio(type="filepath", label="Output")
                    # "转换" = "Convert"
                    svc_submit = gr.Button("转换", variant="primary")
            svc_submit.click(
                svc, inputs=[svc_input, svc_model, svc_key, svc_enhance, svc_eak, svc_pe], outputs=svc_output)

        # Tab 2: preview only the head of the file at 5 keys around a center.
        with gr.TabItem("DDSP-TEST"):
            # "Infer the head of the audio file with 5 key values for preview"
            gr.Markdown("使用5个key参数推理音频文件的头部,完成试听")
            with gr.Row():
                with gr.Column():
                    svc_input2 = gr.Audio(
                        type="filepath", label="Input")
                    svc_model2 = gr.Dropdown(
                        choices=models, label="模型", value=pth_path, visible=True)
                    # "处理文件头部(秒)" = "seconds of file head to process"
                    svc_read_head = gr.Number(value=20, label="处理文件头部(秒)")

                    with gr.Row():
                        svc_key2 = gr.Dropdown(value='0', label="Key center", choices=[
                            '-9', '-8', '-7', '-6', '-5', '-4', '-3', '-2', '-1', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
                        svc_key_step = gr.Number(value=1, label="Key step")
                        svc_pe2 = gr.Dropdown(choices=[
                            "parselmouth", "dio", "harvest", "crepe"], value='crepe', label='Pitch extractor')
                    with gr.Row():
                        svc_enhance2 = gr.Checkbox(
                            value=False, label='Enhance')
                        svc_eak2 = gr.Number(
                            value=0, label='Enhancer adaptive key')
                with gr.Column():
                    svc_output_0 = gr.Audio(type="filepath", label="Output -2")
                    svc_output_1 = gr.Audio(type="filepath", label="Output -1")
                    svc_output_2 = gr.Audio(type="filepath", label="Output 0")
                    svc_output_3 = gr.Audio(type="filepath", label="Output 1")
                    svc_output_4 = gr.Audio(type="filepath", label="Output 2")
                    svc_submit2 = gr.Button("转换", variant="primary")
            # NOTE(review): this handler reuses `svc_key` from the FIRST tab
            # instead of a key widget from this tab — looks unintentional,
            # though harmless: svc_head ignores `key` because it always passes
            # an explicit 5-key list to svc_main. Verify before changing.
            svc_submit2.click(
                svc_head, inputs=[svc_input2, svc_model2, svc_key, svc_enhance2, svc_eak2, svc_pe2, svc_read_head, svc_key2, svc_key_step], outputs=[svc_output_0, svc_output_1, svc_output_2, svc_output_3, svc_output_4])

    # Clickable example files from the samples/ folder (tab 1 only).
    gr.Examples(examples, svc_input, svc_output, svc)

    # License / usage notice (CC-BY-NC models, MIT code; AI-generated audio
    # must be declared; no commercial or illegal use).
    gr.HTML("""
    <div style="text-align:center">
    模型采用 CC-BY-NC协议,代码采用 MIT协议
    <br/>
    仅供学习交流,不可用于商业或非法用途
    <br/>
    使用本项目模型直接或间接生成的音频,必须声明由AI技术或DDSP-SVC技术合成
    </div>
    """)
app.launch(debug=True)
|
|