# DDSP / app.py — Hugging Face Space (user: baibaibai, commit 9e94534 "Update app.py")
import re
import os
import warnings
import gradio as gr
import torch
import numpy as np
# import main as svc
import os
import torch
import librosa
import numpy as np
import soundfile as sf
import pyworld as pw
import parselmouth
from ast import literal_eval
from slicer import Slicer
from ddsp.vocoder import load_model, F0_Extractor, Volume_Extractor, Units_Encoder
from ddsp.core import upsample
from enhancer import Enhancer
from tqdm import tqdm
import zipfile
import urllib.request
# Folder (relative to the working directory) that holds the DDSP checkpoints.
# NOTE(review): it is concatenated as os.getcwd() + model_folder below, not
# joined with os.path.join, so the leading and trailing "/" are required.
model_folder = "/models/"
os.makedirs("pretrain/hubert/", exist_ok=True)
# Download the HuBERT-soft units-encoder checkpoint if it is not cached yet.
url = 'https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt'
file_path = 'pretrain/hubert/hubert-soft-0d54a1f4.pt'
if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)
# Download the NSF-HiFiGAN vocoder/enhancer archive if it is not cached yet.
url = 'https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-v1/nsf_hifigan_20221211.zip'
file_path = 'pretrain/nsf_hifigan_20221211.zip'
if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)
# Open the archive that needs extracting (translated from: 打开需要解压的压缩文件)
# NOTE(review): extraction runs on every launch, even when already extracted;
# this is harmless (files are overwritten) but slows startup slightly.
with zipfile.ZipFile(file_path, 'r') as zip_ref:
    # Extract all files into ./pretrain/ (translated from: 解压所有文件到当前目录)
    zip_ref.extractall('./pretrain/')
dl = os.listdir('./pretrain/')
print(dl)
def list_model():
    """Scan the on-disk model folder for DDSP checkpoints (``*.pt``).

    Side effect: updates the module-level ``pth_path`` with the checkpoint
    whose file name is longest (used as the UI's default selection).

    Returns:
        list[str]: the ``.pt`` file names found in ``os.getcwd() + model_folder``.
    """
    global pth_path
    model_dir = os.getcwd() + model_folder  # renamed: don't shadow builtin dir()
    res = []
    for fname in os.listdir(model_dir):
        if fname.endswith(".pt"):
            res.append(fname)
            # Default-selection heuristic: prefer the longest file name.
            # NOTE(review): ">=" means ties are won by the last file listed —
            # confirm this is intended.
            if len(fname) >= len(pth_path):
                pth_path = fname
    print(res)
    return res
# Fallback checkpoint name; list_model() replaces it with the longest-named
# .pt file it finds, which becomes the dropdown default.
pth_path = "model_best.pt"
models = list_model()
print("pth_path:" + pth_path)
# Bundled demo clips for the gr.Examples widget.
examples = ["samples/" + name for name in os.listdir("samples")]
def svc_main(input_path, model_path, key=0, enhance=False, enhancer_adaptive_key=0,
             pitch_extractor='crepe', f0_min='50', f0_max='1100', threhold='-60',
             spk_mix_dict='None', spk_id='1', read_head=0, keys=None):
    """Run DDSP-SVC voice conversion on ``input_path`` for one or more keys.

    Args:
        input_path: path to the source audio file.
        model_path: checkpoint file name inside ``os.getcwd() + model_folder``.
        key: semitone shift used when ``keys`` is empty/None.
        enhance: apply the NSF-HiFiGAN enhancer to each segment.
        enhancer_adaptive_key: adaptive-key value forwarded to the enhancer.
        pitch_extractor: f0 extractor name ('parselmouth'|'dio'|'harvest'|'crepe').
        f0_min / f0_max: pitch search range in Hz (strings from the UI).
        threhold: volume-gate threshold in dB (string from the UI; misspelled
            name kept for caller compatibility).
        spk_mix_dict: ``repr`` of a speaker-mix dict, or 'None' for one speaker.
        spk_id: speaker id (string from the UI).
        read_head: if > 0, only convert the first ``read_head`` seconds
            (preview mode; output file names get an 'h' suffix).
        keys: optional list of semitone shifts; one output file per distinct key.
            Defaults to ``[key]``.

    Returns:
        list[str]: paths of the wav files written, one per distinct key.
    """
    print('input: ' + input_path)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # load ddsp model
    model, args = load_model(
        os.getcwd() + model_folder + model_path, device=device)
    # load input; in preview mode trim edge silence first, then keep the head
    if read_head > 0:
        audio, sample_rate = librosa.load(
            input_path, sr=None, duration=(40 + read_head))
        audio, _ = librosa.effects.trim(audio, top_db=20)
        audio = audio[0:int(sample_rate * read_head)]
    else:
        audio, sample_rate = librosa.load(input_path, sr=None)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio)
    duration = librosa.get_duration(y=audio, sr=sample_rate)
    print("duration:", duration)
    # model frame hop expressed in input-rate samples
    hop_size = args.data.block_size * sample_rate / args.data.sampling_rate
    # extract volume envelope and derive a smoothed silence mask from it
    print('Extracting the volume envelope of the input audio...')
    volume_extractor = Volume_Extractor(hop_size)
    volume = volume_extractor.extract(audio)
    mask = (volume > 10 ** (float(threhold) / 20)).astype('float')
    # dilate the gate by 4 frames each side (9-frame running max)
    mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
    mask = np.array([np.max(mask[n: n + 9]) for n in range(len(mask) - 8)])
    mask = torch.from_numpy(mask).float().to(device).unsqueeze(-1).unsqueeze(0)
    mask = upsample(mask, args.data.block_size).squeeze(-1)
    volume = torch.from_numpy(volume).float().to(
        device).unsqueeze(-1).unsqueeze(0)
    # load units encoder
    units_encoder = Units_Encoder(
        args.data.encoder,
        args.data.encoder_ckpt,
        args.data.encoder_sample_rate,
        args.data.encoder_hop_size,
        device=device)
    # load enhancer only when requested
    if enhance:
        print('Enhancer type: ' + args.enhancer.type)
        enhancer = Enhancer(args.enhancer.type,
                            args.enhancer.ckpt, device=device)
    # speaker id or mix-speaker dictionary
    spk_mix_dict = literal_eval(spk_mix_dict)
    if spk_mix_dict is not None:
        print('Mix-speaker mode')
    else:
        print('Speaker ID: ' + str(int(spk_id)))
    spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(device)
    # extract f0 once; per-key curves are scaled from it below
    # (local renamed from `pitch_extractor` to avoid shadowing the parameter)
    print('Pitch extractor type: ' + pitch_extractor)
    f0_extractor = F0_Extractor(
        pitch_extractor,
        sample_rate,
        hop_size,
        float(f0_min),
        float(f0_max))
    print('Extracting the pitch curve of the input audio...')
    f1 = f0_extractor.extract(audio, uv_interp=True, device=device)
    f1 = torch.from_numpy(f1).float().to(device).unsqueeze(-1).unsqueeze(0)
    # key change: copy the list so neither a caller's list nor a shared
    # default is mutated (the original `keys=[]` default accumulated keys
    # across successive calls)
    keys = list(keys) if keys else [key]
    print("keys:", keys)
    output_paths = []
    for key in keys:
        if read_head > 0:
            output_path = input_path + '.key' + str(key) + 'h.wav'
        else:
            output_path = input_path + '.key' + str(key) + '.wav'
        if output_path in output_paths:
            continue  # duplicate key maps to the same file name; skip it
        f0 = f1 * 2 ** (float(key) / 12)
        # forward and save the output
        result = np.zeros(0)
        current_length = 0
        segments = split(audio, sample_rate, hop_size)
        print('Cut the input audio into ' + str(len(segments)) + ' slices')
        with torch.no_grad():
            for segment in tqdm(segments):
                start_frame = segment[0]
                seg_input = torch.from_numpy(
                    segment[1]).float().unsqueeze(0).to(device)
                seg_units = units_encoder.encode(
                    seg_input, sample_rate, hop_size)
                seg_f0 = f0[:, start_frame: start_frame + seg_units.size(1), :]
                seg_volume = volume[:,
                                    start_frame: start_frame + seg_units.size(1), :]
                seg_output, _, (s_h, s_n) = model(
                    seg_units, seg_f0, seg_volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict)
                # silence-gate the synthesized segment
                seg_output *= mask[:, start_frame * args.data.block_size: (
                    start_frame + seg_units.size(1)) * args.data.block_size]
                if enhance:
                    seg_output, output_sample_rate = enhancer.enhance(
                        seg_output,
                        args.data.sampling_rate,
                        seg_f0,
                        args.data.block_size,
                        adaptive_key=float(enhancer_adaptive_key))
                else:
                    output_sample_rate = args.data.sampling_rate
                seg_output = seg_output.squeeze().cpu().numpy()
                # pad with silence up to the segment start, or cross-fade when
                # this segment overlaps the audio already assembled
                silent_length = round(start_frame * args.data.block_size *
                                      output_sample_rate / args.data.sampling_rate) - current_length
                if silent_length >= 0:
                    result = np.append(result, np.zeros(silent_length))
                    result = np.append(result, seg_output)
                else:
                    result = cross_fade(result, seg_output,
                                        current_length + silent_length)
                current_length = current_length + \
                    silent_length + len(seg_output)
        sf.write(output_path, result, output_sample_rate)
        output_paths.append(output_path)
        print("finish:", output_path)
    return output_paths
def split(audio, sample_rate, hop_size, db_thresh=-40, min_len=5000):
    """Cut audio into voiced chunks at silence boundaries.

    Returns a list of ``(start_frame, samples)`` tuples, where the frame
    index is expressed in ``hop_size`` units so it aligns with the f0 and
    volume feature streams computed elsewhere.
    """
    slicer = Slicer(
        sr=sample_rate,
        threshold=db_thresh,
        min_length=min_len)
    segments = []
    for chunk in dict(slicer.slice(audio)).values():
        marks = chunk["split_time"].split(",")
        if marks[0] == marks[1]:
            continue  # zero-length slice
        first = int(int(marks[0]) // hop_size)
        last = int(int(marks[1]) // hop_size)
        if last > first:
            segments.append(
                (first, audio[int(first * hop_size): int(last * hop_size)]))
    return segments
def cross_fade(a: np.ndarray, b: np.ndarray, idx: int):
    """Join ``a`` and ``b`` with a linear cross-fade starting at ``idx``.

    The first ``idx`` samples come from ``a`` alone; the overlap region
    (``a[idx:]`` against the start of ``b``) is blended with a linear ramp,
    and the remainder of ``b`` is appended unchanged.
    """
    fade_len = a.shape[0] - idx
    out = np.zeros(idx + b.shape[0])
    out[:idx] = a[:idx]
    ramp = np.linspace(0, 1.0, num=fade_len, endpoint=True)
    out[idx: idx + fade_len] = a[idx:] * (1 - ramp) + b[:fade_len] * ramp
    out[idx + fade_len:] = b[fade_len:]
    return out
def svc(input_path, model_path, key=0, enhance=False, enhancer_adaptive_key=0, pitch_extractor='crepe', f0_min='50', f0_max='1100', threhold='-60', spk_mix_dict='None', spk_id='1', read_head=0, keys=None):
    """Gradio handler: run svc_main and return the first output file path.

    Parameters mirror svc_main. Returns "" when conversion produced no output.

    The ``keys`` default is None instead of a literal [] — svc_main appends
    to the list it receives, so a shared mutable default would accumulate
    keys across successive UI calls. A fresh list is passed each time.
    """
    result = svc_main(input_path, model_path, key, enhance, enhancer_adaptive_key,
                      pitch_extractor, f0_min, f0_max, threhold, spk_mix_dict,
                      spk_id, read_head, [] if keys is None else keys)
    if len(result) > 0:
        return result[0]
    return ""
def svc_head(input_path, model_path, key=0, enhance=False, enhancer_adaptive_key=0, pitch_extractor='crepe', read_head=30, key_center=0, key_step=1):
    """Preview handler: convert the head of the input at five keys.

    Builds keys key_center + s*key_step for s in -2..2, runs svc_main in
    preview mode, and returns five gr.Audio updates (one per key).
    """
    key_center = float(key_center)
    keys = [step * key_step + key_center for step in range(-2, 3)]
    result = svc_main(input_path, model_path, key, enhance,
                      enhancer_adaptive_key, pitch_extractor, read_head=read_head, keys=keys)
    updates = [
        gr.Audio.update(label="Key " + str(keys[i]), value=result[i])
        for i in range(5)
    ]
    return updates[0], updates[1], updates[2], updates[3], updates[4]
# ---- Gradio UI -----------------------------------------------------------
app = gr.Blocks()
with app:
    with gr.Tabs():
        # Tab 1: convert the whole file at a single key.
        with gr.TabItem("DDSP-SVC"):
            with gr.Row():
                with gr.Column():
                    svc_input = gr.Audio(
                        type="filepath", label="Input")
                    with gr.Row():
                        svc_model = gr.Dropdown(
                            choices=models, label="模型", value=pth_path, visible=True)
                        svc_key = gr.Number(value=0, label="Key")
                        svc_pe = gr.Dropdown(choices=[
                            "parselmouth", "dio", "harvest", "crepe"], value='parselmouth', label='Pitch extractor')
                        svc_enhance = gr.Checkbox(value=False, label='Enhance')
                        svc_eak = gr.Number(
                            value=0, label='Enhancer adaptive key')
                with gr.Column():
                    svc_output = gr.Audio(type="filepath", label="Output")
                    svc_submit = gr.Button("转换", variant="primary")
            svc_submit.click(
                svc, inputs=[svc_input, svc_model, svc_key, svc_enhance, svc_eak, svc_pe], outputs=svc_output)
        # Tab 2: preview mode — converts only the head of the file at
        # five keys centered on "Key center".
        with gr.TabItem("DDSP-TEST"):
            gr.Markdown("使用5个key参数推理音频文件的头部,完成试听")
            with gr.Row():
                with gr.Column():
                    svc_input2 = gr.Audio(
                        type="filepath", label="Input")
                    svc_model2 = gr.Dropdown(
                        choices=models, label="模型", value=pth_path, visible=True)
                    svc_read_head = gr.Number(value=20, label="处理文件头部(秒)")
                    with gr.Row():
                        svc_key2 = gr.Dropdown(value='0', label="Key center", choices=[
                            '-9', '-8', '-7', '-6', '-5', '-4', '-3', '-2', '-1', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
                        svc_key_step = gr.Number(value=1, label="Key step")
                        svc_pe2 = gr.Dropdown(choices=[
                            "parselmouth", "dio", "harvest", "crepe"], value='crepe', label='Pitch extractor')
                    with gr.Row():
                        svc_enhance2 = gr.Checkbox(
                            value=False, label='Enhance')
                        svc_eak2 = gr.Number(
                            value=0, label='Enhancer adaptive key')
                with gr.Column():
                    svc_output_0 = gr.Audio(type="filepath", label="Output -2")
                    svc_output_1 = gr.Audio(type="filepath", label="Output -1")
                    svc_output_2 = gr.Audio(type="filepath", label="Output 0")
                    svc_output_3 = gr.Audio(type="filepath", label="Output 1")
                    svc_output_4 = gr.Audio(type="filepath", label="Output 2")
                    svc_submit2 = gr.Button("转换", variant="primary")
            # NOTE(review): the third input below is svc_key (tab 1's key box),
            # not svc_key2. svc_head ignores its `key` argument because `keys`
            # is always non-empty, so this is currently harmless — but svc_key2
            # was probably intended. Confirm before changing the wiring.
            svc_submit2.click(
                svc_head, inputs=[svc_input2, svc_model2, svc_key, svc_enhance2, svc_eak2, svc_pe2, svc_read_head, svc_key2, svc_key_step], outputs=[svc_output_0, svc_output_1, svc_output_2, svc_output_3, svc_output_4])
        gr.Examples(examples, svc_input, svc_output, svc)
    # Footer: license / usage disclaimer (runtime HTML string — unchanged).
    gr.HTML("""
<div style="text-align:center">
模型采用 CC-BY-NC协议,代码采用 MIT协议
<br/>
仅供学习交流,不可用于商业或非法用途
<br/>
使用本项目模型直接或间接生成的音频,必须声明由AI技术或DDSP-SVC技术合成
</div>
""")
app.launch(debug=True)