import io
import os
import tempfile
# os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt")
import gradio as gr
import gradio.processing_utils as gr_pu
import librosa
import numpy as np
from inference.infer_tool import Svc
import logging
import re
import json
import subprocess
import edge_tts
import asyncio
from scipy.io import wavfile
import torch
import time
import traceback
from itertools import chain
from utils import mix_model
import base64
from io import BytesIO
import soundfile as sf
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('multipart').setLevel(logging.WARNING)
model = None
spk = None
debug = False
cuda = {}
if torch.cuda.is_available():
for i in range(torch.cuda.device_count()):
device_name = torch.cuda.get_device_properties(i).name
cuda[f"CUDA:{i} {device_name}"] = f"cuda:{i}"
def upload_mix_append_file(files, sfiles):
    try:
        if sfiles is None:
            file_paths = [file.name for file in files]
        else:
            file_paths = [file.name for file in chain(files, sfiles)]
        # Default every uploaded model to a mixing weight of 100
        p = {file: 100 for file in file_paths}
        return file_paths, mix_model_output1.update(value=json.dumps(p, indent=2))
    except Exception as e:
        if debug: traceback.print_exc()
        raise gr.Error(str(e))
def mix_submit_click(js, mode):
    try:
        assert js.lstrip() != ""
        modes = {"Convex combination": 0, "Linear combination": 1}
        mode = modes[mode]
        data = json.loads(js)
        data = list(data.items())
        model_path, mix_rate = zip(*data)
        path = mix_model(model_path, mix_rate, mode)
        return f"Success: the mixed model was saved to {path}"
    except Exception as e:
        if debug: traceback.print_exc()
        raise gr.Error(str(e))
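# The textbox above is expected to hold JSON mapping model paths to mixing
# weights, e.g. (paths are illustrative, not from this repo):
#   {"logs/44k/G_a.pth": 100, "logs/44k/G_b.pth": 50}
# "Convex combination" softmaxes the weights so they sum to 1; "Linear
# combination" uses them as-is.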
def update_mix_info(files):
    try:
        if files is None:
            return mix_model_output1.update(value="")
        p = {file.name: 100 for file in files}
        return mix_model_output1.update(value=json.dumps(p, indent=2))
    except Exception as e:
        if debug: traceback.print_exc()
        raise gr.Error(str(e))
def modelAnalysis(model_path, config_path, cluster_model_path, device, enhance):
    global model
    try:
        device = cuda[device] if "CUDA" in device else device
        model = Svc(model_path, config_path,
                    device=device if device != "Auto" else None,
                    cluster_model_path=cluster_model_path.name if cluster_model_path is not None else "",
                    nsf_hifigan_enhance=enhance)
        spks = list(model.spk2id.keys())
        device_name = torch.cuda.get_device_properties(model.dev).name if "cuda" in str(model.dev) else str(model.dev)
        msg = f"Successfully loaded the model onto device {device_name}\n"
        if cluster_model_path is None:
            msg += "No clustering model loaded\n"
        else:
            msg += f"Clustering model {cluster_model_path.name} loaded successfully\n"
        msg += "Speakers available in the current model:\n"
        for i in spks:
            msg += i + " "
        return sid.update(choices=spks, value=spks[0]), msg
    except Exception as e:
        if debug: traceback.print_exc()
        raise gr.Error(str(e))
def modelUnload():
    global model
    if model is None:
        return sid.update(choices=[], value=""), "No model is loaded, nothing to unload!"
    else:
        model.unload_model()
        model = None
        torch.cuda.empty_cache()
        return sid.update(choices=[], value=""), "Model unloaded!"
def vc_fn(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold):
    global model
    try:
        if input_audio is None:
            raise gr.Error("You need to upload audio")
        if model is None:
            raise gr.Error("You need to specify the model")
        sampling_rate, audio = input_audio
        # Normalize integer PCM to float32 in [-1, 1]
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        # Write to a unique temporary wav; prefer the RAM-backed /dev/shm on
        # Linux to avoid disk I/O, falling back to the default temp directory.
        # A unique name also avoids collisions between concurrent requests.
        temp_dir = "/dev/shm" if os.path.isdir("/dev/shm") else None
        with tempfile.NamedTemporaryFile(suffix=".wav", dir=temp_dir, delete=False) as tmp:
            temp_path = tmp.name
        sf.write(temp_path, audio, sampling_rate, format="wav")
        _audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold)
        # Clean up the temporary file to free the RAM-backed storage
        os.remove(temp_path)
        return "The inference was successful! Listen to the processed audio below.", (model.target_sample, _audio)
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(str(e))
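# gr.Audio in numpy mode delivers (sample_rate, integer PCM array), which
# vc_fn converts to mono float32 before slicing. An illustrative call
# (speaker name and values are assumptions):
#   vc_fn("speaker0", (44100, pcm), 0, False, 0.0, -40, 0.4, 0.5, 0, 0, 0.75,
#         "pm", 0, 0.05)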
def tts_func(_text, _rate, _voice):
    # Use edge-tts to convert text into audio
    voice = "zh-CN-YunxiNeural"  # male
    if _voice == "female":
        voice = "zh-CN-XiaoyiNeural"  # female, higher pitch
    output_file = _text[0:10] + ".wav"
    # Alternative async API:
    # communicate = edge_tts.Communicate(_text, voice)
    # await communicate.save(output_file)
    if _rate >= 0:
        ratestr = "+{:.0%}".format(_rate)
    else:
        ratestr = "{:.0%}".format(_rate)  # the minus sign is kept by the format
    # Pass arguments as a list (no shell) so text containing spaces or shell
    # metacharacters cannot break or inject into the command line
    p = subprocess.Popen(
        ["edge-tts",
         "--text", _text,
         "--write-media", output_file,
         "--voice", voice,
         "--rate=" + ratestr],
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE)
    p.wait()
    return output_file
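# Illustrative example: tts_func("Hello there", 0.2, "female") shells out to
#   edge-tts --text "Hello there" --write-media "Hello ther.wav" \
#            --voice zh-CN-XiaoyiNeural --rate=+20%
# and returns the output filename (the first 10 characters of the text).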
def text_clear(text):
return re.sub(r"[\n\,\(\) ]", "", text)
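# Example: text_clear("Hello, world\n(test)") returns "Helloworldtest";
# newlines, commas, parentheses, and spaces are stripped so that edge-tts
# receives one clean token.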
def vc_fn2(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, text2tts, tts_rate, tts_voice, f0_predictor, enhancer_adaptive_key, cr_threshold):
    # Use edge-tts to convert text into audio
    text2tts = text_clear(text2tts)
    output_file = tts_func(text2tts, tts_rate, tts_voice)
    # Resample to the model's expected 44.1 kHz
    sr2 = 44100
    wav, sr = librosa.load(output_file)
    wav2 = librosa.resample(wav, orig_sr=sr, target_sr=sr2)
    save_path2 = text2tts[0:10] + "_44k" + ".wav"
    wavfile.write(save_path2, sr2,
                  (wav2 * np.iinfo(np.int16).max).astype(np.int16))
    # Read the resampled audio back and run it through the normal
    # audio-to-audio conversion path
    sample_rate, data = gr_pu.audio_from_file(save_path2)
    vc_input = (sample_rate, data)
    a, b = vc_fn(sid, vc_input, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold)
    # Remove both intermediate files
    os.remove(output_file)
    os.remove(save_path2)
    return a, b
def debug_change(val):
    # Receive the checkbox's new value from the change event; reading
    # debug_button.value here would only ever return the initial value
    global debug
    debug = val
with gr.Blocks(
theme=gr.themes.Base(
primary_hue = gr.themes.colors.green,
font=["Source Sans Pro", "Arial", "sans-serif"],
font_mono=['JetBrains mono', "Consolas", 'Courier New']
),
) as app:
with gr.Tabs():
with gr.TabItem("reasoning"):
gr.Markdown(value="""
                So-vits-svc 4.0 inference WebUI
""")
with gr.Row(variant="panel"):
with gr.Column():
gr.Markdown(value="""
<font size=2> Model settings</font>
""")
                    model_path = gr.Text(label="Select model file", value="logs/44k/G_709600.pth")
                    config_path = gr.Text(label="Select config file", value="configs/config.json")
                    cluster_model_path = gr.File(label="Select the clustering model file (optional; leave empty to skip clustering)")
                    device = gr.Dropdown(label="Inference device; Auto selects between CPU and GPU automatically", choices=["Auto", *cuda.keys(), "CPU"], value="Auto")
                    enhance = gr.Checkbox(label="Use NSF_HIFIGAN enhancement. This improves sound quality for some models trained on small datasets, but degrades well-trained models. Off by default.", value=False)
with gr.Column():
gr.Markdown(value="""
<font size=3>After all files on the left are selected (all file modules display download), click "Load Model" to analyze:</font>
""")
model_load_button = gr.Button(value="Load model", variant="primary")
model_unload_button = gr.Button(value="Unload model", variant="primary")
                    sid = gr.Dropdown(label="Timbre (speaker)")
sid_output = gr.Textbox(label="Output Message")
with gr.Row(variant="panel"):
with gr.Column():
gr.Markdown(value="""
<font size=2> Inference settings</font>
""")
                    auto_f0 = gr.Checkbox(label="Automatic f0 prediction. Works best together with the clustering model, but disables the pitch-shift control (speech conversion only; enabling this for singing causes extreme off-key results)", value=False)
                    f0_predictor = gr.Dropdown(label="Select the F0 predictor: crepe, pm, dio, or harvest; the default is pm (note: crepe applies a mean filter to the raw F0)", choices=["pm", "dio", "harvest", "crepe"], value="pm")
                    vc_transform = gr.Number(label="Pitch shift (integer, positive or negative, in semitones; +12 raises one octave)", value=0)
                    cluster_ratio = gr.Number(label="Clustering model mixing ratio, between 0 and 1; 0 disables clustering. Clustering improves timbre similarity but reduces articulation clarity (around 0.5 is recommended if used)", value=0)
                    slice_db = gr.Number(label="Slicing threshold (dB)", value=-40)
                    noise_scale = gr.Number(label="noise_scale; best left at the default, it affects sound quality in hard-to-predict ways", value=0.4)
with gr.Column():
                    pad_seconds = gr.Number(label="Seconds of silence padded around the inference audio. For unknown reasons, artifacts appear at the start and end; a short silent pad makes them disappear.", value=0.5)
                    cl_num = gr.Number(label="Automatic audio slicing; 0 disables slicing. Unit: seconds (s)", value=0)
                    lg_num = gr.Number(label="Cross-fade length between slices. Adjust this if vocals sound incoherent after automatic slicing; otherwise keep the default 0. Note that this setting slows inference. Unit: seconds (s)", value=0)
                    lgr_num = gr.Number(label="Proportion of the cross-fade region kept from each slice after the head and tail are discarded, in the range (0, 1]", value=0.75)
                    enhancer_adaptive_key = gr.Number(label="Adapt the enhancer to a higher pitch range (in semitones) | default 0", value=0)
                    cr_threshold = gr.Number(label="F0 filter threshold, only effective with crepe. Range 0-1; lowering it reduces the chance of off-key notes but increases muting.", value=0.05)
with gr.Tabs():
with gr.TabItem("audio to audio"):
vc_input3 = gr.Audio(label="Select audio")
vc_submit = gr.Button("audio conversion", variant="primary")
with gr.TabItem("Text to audio"):
text2tts=gr.Textbox(label="Enter the text you want to translate here. Note that it is recommended to turn on F0 prediction when using this function, otherwise it will be very strange.")
tts_rate = gr.Number(label="tts speaking speed", value=0)
tts_voice = gr.Radio(label="gender",choices=["male","female"], value="male")
vc_submit2 = gr.Button("text conversion", variant="primary")
with gr.Row():
with gr.Column():
vc_output1 = gr.Textbox(label="Output Message")
with gr.Column():
vc_output2 = gr.Audio(label="Output Audio", interactive=False)
with gr.TabItem("Gadget/Lab Features"):
gr.Markdown(value="""
<font size=2> So-vits-svc 4.0 Gadget/Lab Features</font>
""")
with gr.Tabs():
with gr.TabItem("static sound fusion"):
gr.Markdown(value="""
<font size=2> Introduction: This function can synthesize multiple sound models into one sound model (convex combination or linear combination of multiple model parameters), thereby creating sound lines that do not exist in reality.
Notice:
1. This function only supports single-speaker models
2. If you forcibly use a multi-speaker model, you need to ensure that the number of speakers in multiple models is the same so that sounds under the same SpaekerID can be mixed.
3. Ensure that the model fields in the config.json of all models to be mixed are the same
4. The output hybrid model can use any config.json of the model to be synthesized, but the clustering model cannot be used.
5. When uploading models in batches, it is best to put the models into a folder, select them and upload them together.
6. The recommended size for adjusting the mixing ratio is between 0-100. It can also be adjusted to other numbers, but unknown effects will occur in linear combination mode.
7. After the mixing is completed, the file will be saved in the project root directory with the file name output.pth
8. The convex combination mode will perform Softmax on the mixing ratio so that the mixing ratio adds up to 1, while the linear combination mode will not
</font>
""")
                    mix_model_path = gr.Files(label="Select the model files to mix")
                    mix_model_upload_button = gr.UploadButton("Select/append model files to mix", file_count="multiple", variant="primary")
                    mix_model_output1 = gr.Textbox(
                        label="Mixing weight adjustment, unit: %",
                        interactive=True
                    )
                    mix_mode = gr.Radio(choices=["Convex combination", "Linear combination"], label="Fusion mode", value="Convex combination", interactive=True)
                    mix_submit = gr.Button("Start voice fusion", variant="primary")
                    mix_model_output2 = gr.Textbox(
                        label="Output Message"
                    )
                    mix_model_path.change(update_mix_info, [mix_model_path], [mix_model_output1])
mix_model_upload_button.upload(upload_mix_append_file, [mix_model_upload_button,mix_model_path], [mix_model_path,mix_model_output1])
mix_submit.click(mix_submit_click, [mix_model_output1,mix_mode], [mix_model_output2])
with gr.Tabs():
with gr.Row(variant="panel"):
with gr.Column():
gr.Markdown(value="""
<font size=2> WebUI settings</font>
""")
                debug_button = gr.Checkbox(label="Debug mode. Enable this if you need to report a bug to the community; the console will then print the full error traceback.", value=debug)
    vc_submit.click(vc_fn, [sid, vc_input3, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold], [vc_output1, vc_output2])
    vc_submit2.click(vc_fn2, [sid, vc_input3, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, text2tts, tts_rate, tts_voice, f0_predictor, enhancer_adaptive_key, cr_threshold], [vc_output1, vc_output2])
    debug_button.change(debug_change, [debug_button], [])
    model_load_button.click(modelAnalysis, [model_path, config_path, cluster_model_path, device, enhance], [sid, sid_output])
    model_unload_button.click(modelUnload, [], [sid, sid_output])
app.launch(debug=True)