import io
import os
import tempfile
# os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt")
import gradio as gr
import gradio.processing_utils as gr_pu
import librosa
import numpy as np
from inference.infer_tool import Svc
import logging
import re
import json
import subprocess
import edge_tts
import asyncio
from scipy.io import wavfile
import torch
import time
import traceback
from itertools import chain
from utils import mix_model
import base64
from io import BytesIO
import soundfile as sf
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('multipart').setLevel(logging.WARNING)
model = None
spk = None
debug = False
cuda = {}
if torch.cuda.is_available():
for i in range(torch.cuda.device_count()):
device_name = torch.cuda.get_device_properties(i).name
cuda[f"CUDA:{i} {device_name}"] = f"cuda:{i}"
def upload_mix_append_file(files, sfiles):
    try:
        if sfiles is None:
            file_paths = [file.name for file in files]
        else:
            file_paths = [file.name for file in chain(files, sfiles)]
        # Default every uploaded model to a mixing weight of 100
        p = {file: 100 for file in file_paths}
        return file_paths, mix_model_output1.update(value=json.dumps(p, indent=2))
    except Exception as e:
        if debug: traceback.print_exc()
        raise gr.Error(str(e))
def mix_submit_click(js, mode):
    try:
        assert js.lstrip() != ""
        modes = {"Convex combination": 0, "Linear combination": 1}
        mode = modes[mode]
        data = json.loads(js)
        data = list(data.items())
        model_path, mix_rate = zip(*data)
        path = mix_model(model_path, mix_rate, mode)
        return f"Success: the mixed model was saved to {path}"
    except Exception as e:
        if debug: traceback.print_exc()
        raise gr.Error(str(e))
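# The textbox above is expected to hold JSON mapping model paths to mixing
# weights, e.g. (paths are illustrative, not from this repo):
#   {"logs/44k/G_a.pth": 100, "logs/44k/G_b.pth": 50}
# "Convex combination" softmaxes the weights so they sum to 1; "Linear
# combination" uses them as-is.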
def update_mix_info(files):
    try:
        if files is None:
            return mix_model_output1.update(value="")
        p = {file.name: 100 for file in files}
        return mix_model_output1.update(value=json.dumps(p, indent=2))
    except Exception as e:
        if debug: traceback.print_exc()
        raise gr.Error(str(e))
def modelAnalysis(model_path, config_path, cluster_model_path, device, enhance):
    global model
    try:
        device = cuda[device] if "CUDA" in device else device
        model = Svc(model_path, config_path,
                    device=device if device != "Auto" else None,
                    cluster_model_path=cluster_model_path.name if cluster_model_path is not None else "",
                    nsf_hifigan_enhance=enhance)
        spks = list(model.spk2id.keys())
        device_name = torch.cuda.get_device_properties(model.dev).name if "cuda" in str(model.dev) else str(model.dev)
        msg = f"Successfully loaded the model onto device {device_name}\n"
        if cluster_model_path is None:
            msg += "No clustering model loaded\n"
        else:
            msg += f"Clustering model {cluster_model_path.name} loaded successfully\n"
        msg += "Speakers available in the current model:\n"
        for i in spks:
            msg += i + " "
        return sid.update(choices=spks, value=spks[0]), msg
    except Exception as e:
        if debug: traceback.print_exc()
        raise gr.Error(str(e))
def modelUnload():
    global model
    if model is None:
        return sid.update(choices=[], value=""), "No model is loaded, nothing to unload!"
    else:
        model.unload_model()
        model = None
        torch.cuda.empty_cache()
        return sid.update(choices=[], value=""), "Model unloaded!"
def vc_fn(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold):
    global model
    try:
        if input_audio is None:
            raise gr.Error("You need to upload audio")
        if model is None:
            raise gr.Error("You need to specify the model")
        sampling_rate, audio = input_audio
        # Normalize integer PCM to float32 in [-1, 1]
        audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        # Write to a unique temporary wav; prefer the RAM-backed /dev/shm on
        # Linux to avoid disk I/O, falling back to the default temp directory.
        # A unique name also avoids collisions between concurrent requests.
        temp_dir = "/dev/shm" if os.path.isdir("/dev/shm") else None
        with tempfile.NamedTemporaryFile(suffix=".wav", dir=temp_dir, delete=False) as tmp:
            temp_path = tmp.name
        sf.write(temp_path, audio, sampling_rate, format="wav")
        _audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold)
        # Clean up the temporary file to free the RAM-backed storage
        os.remove(temp_path)
        return "The inference was successful! Listen to the processed audio below.", (model.target_sample, _audio)
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(str(e))
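# gr.Audio in numpy mode delivers (sample_rate, integer PCM array), which
# vc_fn converts to mono float32 before slicing. An illustrative call
# (speaker name and values are assumptions):
#   vc_fn("speaker0", (44100, pcm), 0, False, 0.0, -40, 0.4, 0.5, 0, 0, 0.75,
#         "pm", 0, 0.05)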
def tts_func(_text, _rate, _voice):
    # Use edge-tts to convert text into audio
    voice = "zh-CN-YunxiNeural"  # male
    if _voice == "female":
        voice = "zh-CN-XiaoyiNeural"  # female, higher pitch
    output_file = _text[0:10] + ".wav"
    # Alternative async API:
    # communicate = edge_tts.Communicate(_text, voice)
    # await communicate.save(output_file)
    if _rate >= 0:
        ratestr = "+{:.0%}".format(_rate)
    else:
        ratestr = "{:.0%}".format(_rate)  # the minus sign is kept by the format
    # Pass arguments as a list (no shell) so text containing spaces or shell
    # metacharacters cannot break or inject into the command line
    p = subprocess.Popen(
        ["edge-tts",
         "--text", _text,
         "--write-media", output_file,
         "--voice", voice,
         "--rate=" + ratestr],
        stdout=subprocess.PIPE,
        stdin=subprocess.PIPE)
    p.wait()
    return output_file
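# Illustrative example: tts_func("Hello there", 0.2, "female") shells out to
#   edge-tts --text "Hello there" --write-media "Hello ther.wav" \
#            --voice zh-CN-XiaoyiNeural --rate=+20%
# and returns the output filename (the first 10 characters of the text).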
def text_clear(text):
return re.sub(r"[\n\,\(\) ]", "", text)
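# Example: text_clear("Hello, world\n(test)") returns "Helloworldtest";
# newlines, commas, parentheses, and spaces are stripped so that edge-tts
# receives one clean token.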
def vc_fn2(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, text2tts, tts_rate, tts_voice, f0_predictor, enhancer_adaptive_key, cr_threshold):
    # Use edge-tts to convert text into audio
    text2tts = text_clear(text2tts)
    output_file = tts_func(text2tts, tts_rate, tts_voice)
    # Resample to the model's expected 44.1 kHz
    sr2 = 44100
    wav, sr = librosa.load(output_file)
    wav2 = librosa.resample(wav, orig_sr=sr, target_sr=sr2)
    save_path2 = text2tts[0:10] + "_44k" + ".wav"
    wavfile.write(save_path2, sr2,
                  (wav2 * np.iinfo(np.int16).max).astype(np.int16))
    # Read the resampled audio back and run it through the normal
    # audio-to-audio conversion path
    sample_rate, data = gr_pu.audio_from_file(save_path2)
    vc_input = (sample_rate, data)
    a, b = vc_fn(sid, vc_input, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold)
    # Remove both intermediate files
    os.remove(output_file)
    os.remove(save_path2)
    return a, b
def debug_change(val):
    # Receive the checkbox's new value from the change event; reading
    # debug_button.value here would only ever return the initial value
    global debug
    debug = val
with gr.Blocks(
theme=gr.themes.Base(
primary_hue = gr.themes.colors.green,
font=["Source Sans Pro", "Arial", "sans-serif"],
font_mono=['JetBrains mono', "Consolas", 'Courier New']
),
) as app:
with gr.Tabs():
with gr.TabItem("reasoning"):
gr.Markdown(value="""
                So-vits-svc 4.0 inference WebUI
""")
with gr.Row(variant="panel"):
with gr.Column():
gr.Markdown(value="""
<font size=2> Model settings</font>
""")
                    model_path = gr.Text(label="Select model file", value="logs/44k/G_709600.pth")
                    config_path = gr.Text(label="Select config file", value="configs/config.json")
                    cluster_model_path = gr.File(label="Select the clustering model file (optional; leave empty to skip clustering)")
                    device = gr.Dropdown(label="Inference device; Auto selects between CPU and GPU automatically", choices=["Auto", *cuda.keys(), "CPU"], value="Auto")
                    enhance = gr.Checkbox(label="Use NSF_HIFIGAN enhancement. This improves sound quality for some models trained on small datasets, but degrades well-trained models. Off by default.", value=False)
with gr.Column():
gr.Markdown(value="""
<font size=3>After all files on the left are selected (all file modules display download), click "Load Model" to analyze:</font>
""")
model_load_button = gr.Button(value="Load model", variant="primary")
model_unload_button = gr.Button(value="Unload model", variant="primary")
                    sid = gr.Dropdown(label="Timbre (speaker)")
sid_output = gr.Textbox(label="Output Message")
with gr.Row(variant="panel"):
with gr.Column():
gr.Markdown(value="""
<font size=2> Inference settings</font>
""")
                    auto_f0 = gr.Checkbox(label="Automatic f0 prediction. Works best together with the clustering model, but disables the pitch-shift control (speech conversion only; enabling this for singing causes extreme off-key results)", value=False)
                    f0_predictor = gr.Dropdown(label="Select the F0 predictor: crepe, pm, dio, or harvest; the default is pm (note: crepe applies a mean filter to the raw F0)", choices=["pm", "dio", "harvest", "crepe"], value="pm")
                    vc_transform = gr.Number(label="Pitch shift (integer, positive or negative, in semitones; +12 raises one octave)", value=0)
                    cluster_ratio = gr.Number(label="Clustering model mixing ratio, between 0 and 1; 0 disables clustering. Clustering improves timbre similarity but reduces articulation clarity (around 0.5 is recommended if used)", value=0)
                    slice_db = gr.Number(label="Slicing threshold (dB)", value=-40)
                    noise_scale = gr.Number(label="noise_scale; best left at the default, it affects sound quality in hard-to-predict ways", value=0.4)
with gr.Column():
                    pad_seconds = gr.Number(label="Seconds of silence padded around the inference audio. For unknown reasons, artifacts appear at the start and end; a short silent pad makes them disappear.", value=0.5)
                    cl_num = gr.Number(label="Automatic audio slicing; 0 disables slicing. Unit: seconds (s)", value=0)
                    lg_num = gr.Number(label="Cross-fade length between slices. Adjust this if vocals sound incoherent after automatic slicing; otherwise keep the default 0. Note that this setting slows inference. Unit: seconds (s)", value=0)
                    lgr_num = gr.Number(label="Proportion of the cross-fade region kept from each slice after the head and tail are discarded, in the range (0, 1]", value=0.75)
                    enhancer_adaptive_key = gr.Number(label="Adapt the enhancer to a higher pitch range (in semitones) | default 0", value=0)
                    cr_threshold = gr.Number(label="F0 filter threshold, only effective with crepe. Range 0-1; lowering it reduces the chance of off-key notes but increases muting.", value=0.05)
with gr.Tabs():
with gr.TabItem("audio to audio"):
vc_input3 = gr.Audio(label="Select audio")
vc_submit = gr.Button("audio conversion", variant="primary")
with gr.TabItem("Text to audio"):
text2tts=gr.Textbox(label="Enter the text you want to translate here. Note that it is recommended to turn on F0 prediction when using this function, otherwise it will be very strange.")
tts_rate = gr.Number(label="tts speaking speed", value=0)
tts_voice = gr.Radio(label="gender",choices=["male","female"], value="male")
vc_submit2 = gr.Button("text conversion", variant="primary")
with gr.Row():
with gr.Column():
vc_output1 = gr.Textbox(label="Output Message")
with gr.Column():
vc_output2 = gr.Audio(label="Output Audio", interactive=False)
with gr.TabItem("Gadget/Lab Features"):
gr.Markdown(value="""
<font size=2> So-vits-svc 4.0 Gadget/Lab Features</font>
""")
with gr.Tabs():
with gr.TabItem("static sound fusion"):
gr.Markdown(value="""
<font size=2> Introduction: This function can synthesize multiple sound models into one sound model (convex combination or linear combination of multiple model parameters), thereby creating sound lines that do not exist in reality.
Notice:
1. This function only supports single-speaker models
2. If you forcibly use a multi-speaker model, you need to ensure that the number of speakers in multiple models is the same so that sounds under the same SpaekerID can be mixed.
3. Ensure that the model fields in the config.json of all models to be mixed are the same
4. The output hybrid model can use any config.json of the model to be synthesized, but the clustering model cannot be used.
5. When uploading models in batches, it is best to put the models into a folder, select them and upload them together.
6. The recommended size for adjusting the mixing ratio is between 0-100. It can also be adjusted to other numbers, but unknown effects will occur in linear combination mode.
7. After the mixing is completed, the file will be saved in the project root directory with the file name output.pth
8. The convex combination mode will perform Softmax on the mixing ratio so that the mixing ratio adds up to 1, while the linear combination mode will not
</font>
""")
                    mix_model_path = gr.Files(label="Select the model files to mix")
                    mix_model_upload_button = gr.UploadButton("Select/append model files to mix", file_count="multiple", variant="primary")
                    mix_model_output1 = gr.Textbox(
                        label="Mixing weight adjustment, unit: %",
                        interactive=True
                    )
                    mix_mode = gr.Radio(choices=["Convex combination", "Linear combination"], label="Fusion mode", value="Convex combination", interactive=True)
                    mix_submit = gr.Button("Start voice fusion", variant="primary")
                    mix_model_output2 = gr.Textbox(
                        label="Output Message"
                    )
                    mix_model_path.change(update_mix_info, [mix_model_path], [mix_model_output1])
mix_model_upload_button.upload(upload_mix_append_file, [mix_model_upload_button,mix_model_path], [mix_model_path,mix_model_output1])
mix_submit.click(mix_submit_click, [mix_model_output1,mix_mode], [mix_model_output2])
with gr.Tabs():
with gr.Row(variant="panel"):
with gr.Column():
gr.Markdown(value="""
<font size=2> WebUI settings</font>
""")
                debug_button = gr.Checkbox(label="Debug mode. Enable this if you need to report a bug to the community; the console will then print the full error traceback.", value=debug)
    vc_submit.click(vc_fn, [sid, vc_input3, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold], [vc_output1, vc_output2])
    vc_submit2.click(vc_fn2, [sid, vc_input3, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, text2tts, tts_rate, tts_voice, f0_predictor, enhancer_adaptive_key, cr_threshold], [vc_output1, vc_output2])
    debug_button.change(debug_change, [debug_button], [])
    model_load_button.click(modelAnalysis, [model_path, config_path, cluster_model_path, device, enhance], [sid, sid_output])
    model_unload_button.click(modelUnload, [], [sid, sid_output])
app.launch(debug=True)