# DDSP / app.py — Hugging Face Space (user: baibaibai, commit 9e94534 "Update app.py")
import re
import os
import warnings
import gradio as gr
import torch
import numpy as np
# import main as svc
import os
import torch
import librosa
import numpy as np
import soundfile as sf
import pyworld as pw
import parselmouth
from ast import literal_eval
from slicer import Slicer
from ddsp.vocoder import load_model, F0_Extractor, Volume_Extractor, Units_Encoder
from ddsp.core import upsample
from enhancer import Enhancer
from tqdm import tqdm
import zipfile
import urllib.request
# Folder (relative to the working directory) that holds the DDSP checkpoints.
# NOTE(review): it is concatenated as os.getcwd() + model_folder below, not
# joined with os.path.join, so the leading and trailing "/" are required.
model_folder = "/models/"
os.makedirs("pretrain/hubert/", exist_ok=True)
# Download the HuBERT-soft units-encoder checkpoint if it is not cached yet.
url = 'https://github.com/bshall/hubert/releases/download/v0.1/hubert-soft-0d54a1f4.pt'
file_path = 'pretrain/hubert/hubert-soft-0d54a1f4.pt'
if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)
# Download the NSF-HiFiGAN vocoder/enhancer archive if it is not cached yet.
url = 'https://github.com/openvpi/vocoders/releases/download/nsf-hifigan-v1/nsf_hifigan_20221211.zip'
file_path = 'pretrain/nsf_hifigan_20221211.zip'
if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)
# Open the archive that needs extracting (translated from: 打开需要解压的压缩文件)
# NOTE(review): extraction runs on every launch, even when already extracted;
# this is harmless (files are overwritten) but slows startup slightly.
with zipfile.ZipFile(file_path, 'r') as zip_ref:
    # Extract all files into ./pretrain/ (translated from: 解压所有文件到当前目录)
    zip_ref.extractall('./pretrain/')
dl = os.listdir('./pretrain/')
print(dl)
def list_model():
    """Scan the on-disk model folder for DDSP checkpoints (``*.pt``).

    Side effect: updates the module-level ``pth_path`` with the checkpoint
    whose file name is longest (used as the UI's default selection).

    Returns:
        list[str]: the ``.pt`` file names found in ``os.getcwd() + model_folder``.
    """
    global pth_path
    model_dir = os.getcwd() + model_folder  # renamed: don't shadow builtin dir()
    res = []
    for fname in os.listdir(model_dir):
        if fname.endswith(".pt"):
            res.append(fname)
            # Default-selection heuristic: prefer the longest file name.
            # NOTE(review): ">=" means ties are won by the last file listed —
            # confirm this is intended.
            if len(fname) >= len(pth_path):
                pth_path = fname
    print(res)
    return res
# Fallback checkpoint name; list_model() replaces it with the longest-named
# .pt file it finds, which becomes the dropdown default.
pth_path = "model_best.pt"
models = list_model()
print("pth_path:" + pth_path)
# Bundled demo clips for the gr.Examples widget.
examples = ["samples/" + name for name in os.listdir("samples")]
def svc_main(input_path, model_path, key=0, enhance=False, enhancer_adaptive_key=0,
             pitch_extractor='crepe', f0_min='50', f0_max='1100', threhold='-60',
             spk_mix_dict='None', spk_id='1', read_head=0, keys=None):
    """Run DDSP-SVC voice conversion on ``input_path`` for one or more keys.

    Args:
        input_path: path to the source audio file.
        model_path: checkpoint file name inside ``os.getcwd() + model_folder``.
        key: semitone shift used when ``keys`` is empty/None.
        enhance: apply the NSF-HiFiGAN enhancer to each segment.
        enhancer_adaptive_key: adaptive-key value forwarded to the enhancer.
        pitch_extractor: f0 extractor name ('parselmouth'|'dio'|'harvest'|'crepe').
        f0_min / f0_max: pitch search range in Hz (strings from the UI).
        threhold: volume-gate threshold in dB (string from the UI; misspelled
            name kept for caller compatibility).
        spk_mix_dict: ``repr`` of a speaker-mix dict, or 'None' for one speaker.
        spk_id: speaker id (string from the UI).
        read_head: if > 0, only convert the first ``read_head`` seconds
            (preview mode; output file names get an 'h' suffix).
        keys: optional list of semitone shifts; one output file per distinct key.
            Defaults to ``[key]``.

    Returns:
        list[str]: paths of the wav files written, one per distinct key.
    """
    print('input: ' + input_path)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # load ddsp model
    model, args = load_model(
        os.getcwd() + model_folder + model_path, device=device)
    # load input; in preview mode trim edge silence first, then keep the head
    if read_head > 0:
        audio, sample_rate = librosa.load(
            input_path, sr=None, duration=(40 + read_head))
        audio, _ = librosa.effects.trim(audio, top_db=20)
        audio = audio[0:int(sample_rate * read_head)]
    else:
        audio, sample_rate = librosa.load(input_path, sr=None)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio)
    duration = librosa.get_duration(y=audio, sr=sample_rate)
    print("duration:", duration)
    # model frame hop expressed in input-rate samples
    hop_size = args.data.block_size * sample_rate / args.data.sampling_rate
    # extract volume envelope and derive a smoothed silence mask from it
    print('Extracting the volume envelope of the input audio...')
    volume_extractor = Volume_Extractor(hop_size)
    volume = volume_extractor.extract(audio)
    mask = (volume > 10 ** (float(threhold) / 20)).astype('float')
    # dilate the gate by 4 frames each side (9-frame running max)
    mask = np.pad(mask, (4, 4), constant_values=(mask[0], mask[-1]))
    mask = np.array([np.max(mask[n: n + 9]) for n in range(len(mask) - 8)])
    mask = torch.from_numpy(mask).float().to(device).unsqueeze(-1).unsqueeze(0)
    mask = upsample(mask, args.data.block_size).squeeze(-1)
    volume = torch.from_numpy(volume).float().to(
        device).unsqueeze(-1).unsqueeze(0)
    # load units encoder
    units_encoder = Units_Encoder(
        args.data.encoder,
        args.data.encoder_ckpt,
        args.data.encoder_sample_rate,
        args.data.encoder_hop_size,
        device=device)
    # load enhancer only when requested
    if enhance:
        print('Enhancer type: ' + args.enhancer.type)
        enhancer = Enhancer(args.enhancer.type,
                            args.enhancer.ckpt, device=device)
    # speaker id or mix-speaker dictionary
    spk_mix_dict = literal_eval(spk_mix_dict)
    if spk_mix_dict is not None:
        print('Mix-speaker mode')
    else:
        print('Speaker ID: ' + str(int(spk_id)))
    spk_id = torch.LongTensor(np.array([[int(spk_id)]])).to(device)
    # extract f0 once; per-key curves are scaled from it below
    # (local renamed from `pitch_extractor` to avoid shadowing the parameter)
    print('Pitch extractor type: ' + pitch_extractor)
    f0_extractor = F0_Extractor(
        pitch_extractor,
        sample_rate,
        hop_size,
        float(f0_min),
        float(f0_max))
    print('Extracting the pitch curve of the input audio...')
    f1 = f0_extractor.extract(audio, uv_interp=True, device=device)
    f1 = torch.from_numpy(f1).float().to(device).unsqueeze(-1).unsqueeze(0)
    # key change: copy the list so neither a caller's list nor a shared
    # default is mutated (the original `keys=[]` default accumulated keys
    # across successive calls)
    keys = list(keys) if keys else [key]
    print("keys:", keys)
    output_paths = []
    for key in keys:
        if read_head > 0:
            output_path = input_path + '.key' + str(key) + 'h.wav'
        else:
            output_path = input_path + '.key' + str(key) + '.wav'
        if output_path in output_paths:
            continue  # duplicate key maps to the same file name; skip it
        f0 = f1 * 2 ** (float(key) / 12)
        # forward and save the output
        result = np.zeros(0)
        current_length = 0
        segments = split(audio, sample_rate, hop_size)
        print('Cut the input audio into ' + str(len(segments)) + ' slices')
        with torch.no_grad():
            for segment in tqdm(segments):
                start_frame = segment[0]
                seg_input = torch.from_numpy(
                    segment[1]).float().unsqueeze(0).to(device)
                seg_units = units_encoder.encode(
                    seg_input, sample_rate, hop_size)
                seg_f0 = f0[:, start_frame: start_frame + seg_units.size(1), :]
                seg_volume = volume[:,
                                    start_frame: start_frame + seg_units.size(1), :]
                seg_output, _, (s_h, s_n) = model(
                    seg_units, seg_f0, seg_volume, spk_id=spk_id, spk_mix_dict=spk_mix_dict)
                # silence-gate the synthesized segment
                seg_output *= mask[:, start_frame * args.data.block_size: (
                    start_frame + seg_units.size(1)) * args.data.block_size]
                if enhance:
                    seg_output, output_sample_rate = enhancer.enhance(
                        seg_output,
                        args.data.sampling_rate,
                        seg_f0,
                        args.data.block_size,
                        adaptive_key=float(enhancer_adaptive_key))
                else:
                    output_sample_rate = args.data.sampling_rate
                seg_output = seg_output.squeeze().cpu().numpy()
                # pad with silence up to the segment start, or cross-fade when
                # this segment overlaps the audio already assembled
                silent_length = round(start_frame * args.data.block_size *
                                      output_sample_rate / args.data.sampling_rate) - current_length
                if silent_length >= 0:
                    result = np.append(result, np.zeros(silent_length))
                    result = np.append(result, seg_output)
                else:
                    result = cross_fade(result, seg_output,
                                        current_length + silent_length)
                current_length = current_length + \
                    silent_length + len(seg_output)
        sf.write(output_path, result, output_sample_rate)
        output_paths.append(output_path)
        print("finish:", output_path)
    return output_paths
def split(audio, sample_rate, hop_size, db_thresh=-40, min_len=5000):
    """Cut audio into voiced chunks at silence boundaries.

    Returns a list of ``(start_frame, samples)`` tuples, where the frame
    index is expressed in ``hop_size`` units so it aligns with the f0 and
    volume feature streams computed elsewhere.
    """
    slicer = Slicer(
        sr=sample_rate,
        threshold=db_thresh,
        min_length=min_len)
    segments = []
    for chunk in dict(slicer.slice(audio)).values():
        marks = chunk["split_time"].split(",")
        if marks[0] == marks[1]:
            continue  # zero-length slice
        first = int(int(marks[0]) // hop_size)
        last = int(int(marks[1]) // hop_size)
        if last > first:
            segments.append(
                (first, audio[int(first * hop_size): int(last * hop_size)]))
    return segments
def cross_fade(a: np.ndarray, b: np.ndarray, idx: int):
    """Join ``a`` and ``b`` with a linear cross-fade starting at ``idx``.

    The first ``idx`` samples come from ``a`` alone; the overlap region
    (``a[idx:]`` against the start of ``b``) is blended with a linear ramp,
    and the remainder of ``b`` is appended unchanged.
    """
    fade_len = a.shape[0] - idx
    out = np.zeros(idx + b.shape[0])
    out[:idx] = a[:idx]
    ramp = np.linspace(0, 1.0, num=fade_len, endpoint=True)
    out[idx: idx + fade_len] = a[idx:] * (1 - ramp) + b[:fade_len] * ramp
    out[idx + fade_len:] = b[fade_len:]
    return out
def svc(input_path, model_path, key=0, enhance=False, enhancer_adaptive_key=0, pitch_extractor='crepe', f0_min='50', f0_max='1100', threhold='-60', spk_mix_dict='None', spk_id='1', read_head=0, keys=None):
    """Gradio handler: run svc_main and return the first output file path.

    Parameters mirror svc_main. Returns "" when conversion produced no output.

    The ``keys`` default is None instead of a literal [] — svc_main appends
    to the list it receives, so a shared mutable default would accumulate
    keys across successive UI calls. A fresh list is passed each time.
    """
    result = svc_main(input_path, model_path, key, enhance, enhancer_adaptive_key,
                      pitch_extractor, f0_min, f0_max, threhold, spk_mix_dict,
                      spk_id, read_head, [] if keys is None else keys)
    if len(result) > 0:
        return result[0]
    return ""
def svc_head(input_path, model_path, key=0, enhance=False, enhancer_adaptive_key=0, pitch_extractor='crepe', read_head=30, key_center=0, key_step=1):
    """Preview handler: convert the head of the input at five keys.

    Builds keys key_center + s*key_step for s in -2..2, runs svc_main in
    preview mode, and returns five gr.Audio updates (one per key).
    """
    key_center = float(key_center)
    keys = [step * key_step + key_center for step in range(-2, 3)]
    result = svc_main(input_path, model_path, key, enhance,
                      enhancer_adaptive_key, pitch_extractor, read_head=read_head, keys=keys)
    updates = [
        gr.Audio.update(label="Key " + str(keys[i]), value=result[i])
        for i in range(5)
    ]
    return updates[0], updates[1], updates[2], updates[3], updates[4]
# ---- Gradio UI -----------------------------------------------------------
app = gr.Blocks()
with app:
    with gr.Tabs():
        # Tab 1: convert the whole file at a single key.
        with gr.TabItem("DDSP-SVC"):
            with gr.Row():
                with gr.Column():
                    svc_input = gr.Audio(
                        type="filepath", label="Input")
                    with gr.Row():
                        svc_model = gr.Dropdown(
                            choices=models, label="模型", value=pth_path, visible=True)
                        svc_key = gr.Number(value=0, label="Key")
                        svc_pe = gr.Dropdown(choices=[
                            "parselmouth", "dio", "harvest", "crepe"], value='parselmouth', label='Pitch extractor')
                        svc_enhance = gr.Checkbox(value=False, label='Enhance')
                        svc_eak = gr.Number(
                            value=0, label='Enhancer adaptive key')
                with gr.Column():
                    svc_output = gr.Audio(type="filepath", label="Output")
                    svc_submit = gr.Button("转换", variant="primary")
            svc_submit.click(
                svc, inputs=[svc_input, svc_model, svc_key, svc_enhance, svc_eak, svc_pe], outputs=svc_output)
        # Tab 2: preview mode — converts only the head of the file at
        # five keys centered on "Key center".
        with gr.TabItem("DDSP-TEST"):
            gr.Markdown("使用5个key参数推理音频文件的头部,完成试听")
            with gr.Row():
                with gr.Column():
                    svc_input2 = gr.Audio(
                        type="filepath", label="Input")
                    svc_model2 = gr.Dropdown(
                        choices=models, label="模型", value=pth_path, visible=True)
                    svc_read_head = gr.Number(value=20, label="处理文件头部(秒)")
                    with gr.Row():
                        svc_key2 = gr.Dropdown(value='0', label="Key center", choices=[
                            '-9', '-8', '-7', '-6', '-5', '-4', '-3', '-2', '-1', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])
                        svc_key_step = gr.Number(value=1, label="Key step")
                        svc_pe2 = gr.Dropdown(choices=[
                            "parselmouth", "dio", "harvest", "crepe"], value='crepe', label='Pitch extractor')
                    with gr.Row():
                        svc_enhance2 = gr.Checkbox(
                            value=False, label='Enhance')
                        svc_eak2 = gr.Number(
                            value=0, label='Enhancer adaptive key')
                with gr.Column():
                    svc_output_0 = gr.Audio(type="filepath", label="Output -2")
                    svc_output_1 = gr.Audio(type="filepath", label="Output -1")
                    svc_output_2 = gr.Audio(type="filepath", label="Output 0")
                    svc_output_3 = gr.Audio(type="filepath", label="Output 1")
                    svc_output_4 = gr.Audio(type="filepath", label="Output 2")
                    svc_submit2 = gr.Button("转换", variant="primary")
            # NOTE(review): the third input below is svc_key (tab 1's key box),
            # not svc_key2. svc_head ignores its `key` argument because `keys`
            # is always non-empty, so this is currently harmless — but svc_key2
            # was probably intended. Confirm before changing the wiring.
            svc_submit2.click(
                svc_head, inputs=[svc_input2, svc_model2, svc_key, svc_enhance2, svc_eak2, svc_pe2, svc_read_head, svc_key2, svc_key_step], outputs=[svc_output_0, svc_output_1, svc_output_2, svc_output_3, svc_output_4])
        gr.Examples(examples, svc_input, svc_output, svc)
    # Footer: license / usage disclaimer (runtime HTML string — unchanged).
    gr.HTML("""
<div style="text-align:center">
模型采用 CC-BY-NC协议,代码采用 MIT协议
<br/>
仅供学习交流,不可用于商业或非法用途
<br/>
使用本项目模型直接或间接生成的音频,必须声明由AI技术或DDSP-SVC技术合成
</div>
""")
app.launch(debug=True)