Spaces:
Sleeping
Sleeping
| import io | |
| import os | |
| import tempfile | |
| # os.system("wget -P cvec/ https://huggingface.co/spaces/innnky/nanami/resolve/main/checkpoint_best_legacy_500.pt") | |
| import gradio as gr | |
| import gradio.processing_utils as gr_pu | |
| import librosa | |
| import numpy as np | |
| import soundfile | |
| from inference.infer_tool import Svc | |
| import logging | |
| import re | |
| import json | |
| import subprocess | |
| import edge_tts | |
| import asyncio | |
| from scipy.io import wavfile | |
| import librosa | |
| import torch | |
| import time | |
| import traceback | |
| from itertools import chain | |
| from utils import mix_model | |
| import base64 | |
| from io import BytesIO | |
| import soundfile as sf | |
# Raise the threshold of chatty third-party loggers to WARNING so their
# debug/info output does not flood the console.
for _noisy_logger in ("numba", "markdown_it", "urllib3", "matplotlib", "multipart"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
# Shared state for the callbacks below:
#   model - the loaded Svc instance; set by modelAnalysis, reset by modelUnload.
#   spk   - not referenced anywhere else in the visible part of this file.
#   debug - when true, callbacks print the full traceback before raising gr.Error.
model, spk, debug = None, None, False

# Maps the device labels offered in the UI dropdown ("CUDA:<index> <name>")
# to the matching torch device string ("cuda:<index>").
cuda = {}
for i in range(torch.cuda.device_count() if torch.cuda.is_available() else 0):
    device_name = torch.cuda.get_device_properties(i).name
    cuda.update({f"CUDA:{i} {device_name}": f"cuda:{i}"})
def upload_mix_append_file(files, sfiles):
    """Merge newly uploaded model files with the ones already selected.

    Args:
        files: File objects delivered by the upload button; each exposes ``.name``.
        sfiles: File objects currently held by the file list, or ``None`` when
            nothing has been selected yet.

    Returns:
        A tuple ``(file_paths, textbox_update)``: the combined list of file
        paths (new uploads first) and an update for the mixing-ratio textbox
        containing a JSON object that maps every path to the default weight 100.

    Raises:
        gr.Error: If collecting the paths or building the update fails.
    """
    try:
        # Identity check: None is a singleton and `==` may be overridden.
        selected = files if sfiles is None else chain(files, sfiles)
        file_paths = [file.name for file in selected]
        ratios = {path: 100 for path in file_paths}
        return file_paths, mix_model_output1.update(value=json.dumps(ratios, indent=2))
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def mix_submit_click(js, mode):
    """Mix several models according to the ratios given as JSON text.

    Args:
        js: JSON object (as text) mapping each model path to its mixing weight.
        mode: Either "Convex combination" or "linear combination".

    Returns:
        A status message containing the path returned by ``mix_model``.

    Raises:
        gr.Error: If the input is empty, is not valid JSON, names an unknown
            mode, or if mixing itself fails.
    """
    try:
        # Validate explicitly: an `assert` disappears under `python -O` and
        # would otherwise surface as an error with no message at all.
        if js is None or js.strip() == "":
            raise ValueError("The mixing ratio configuration is empty")
        modes = {"Convex combination": 0, "linear combination": 1}
        mode = modes[mode]
        data = json.loads(js)
        if not data:
            # zip(*[]) would fail below with an opaque unpacking error.
            raise ValueError("The mixing ratio configuration lists no model files")
        # Split {path: rate, ...} into two parallel tuples.
        model_path, mix_rate = zip(*data.items())
        path = mix_model(model_path, mix_rate, mode)
        return f"Success, the file was saved in{path}"
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def updata_mix_info(files):
    """Refresh the mixing-ratio textbox after the selected file list changed.

    Args:
        files: File objects currently selected (each exposes ``.name``), or
            ``None`` when the list has been cleared.

    Returns:
        An update for the mixing-ratio textbox: empty text when no files are
        selected, otherwise a JSON object mapping every file path to the
        default weight 100.

    Raises:
        gr.Error: If building the update fails.
    """
    try:
        if files is None:
            return mix_model_output1.update(value="")
        ratios = {file.name: 100 for file in files}
        return mix_model_output1.update(value=json.dumps(ratios, indent=2))
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def modelAnalysis(model_path, config_path, cluster_model_path, device, enhance):
    """Load the voice-conversion model and list the speakers it provides.

    Args:
        model_path: Value of the "Select model file" field.
        config_path: Value of the "Select profile" field.
        cluster_model_path: Uploaded clustering model (exposes ``.name``), or
            ``None`` when no clustering model was chosen.
        device: Device label from the dropdown: "Auto", "CPU" or one of the
            keys of the module-level ``cuda`` mapping.
        enhance: Whether to enable the NSF_HIFIGAN enhancer.

    Returns:
        A tuple ``(speaker_dropdown_update, message)``.

    Raises:
        gr.Error: If the model cannot be loaded.
    """
    global model
    try:
        if "CUDA" in device:
            device = cuda[device]
        elif device == "Auto":
            # None lets Svc choose the device itself.
            device = None
        else:
            # torch device strings are lowercase ("cpu"); the UI label is "CPU".
            # NOTE(review): assumes Svc hands this string to torch - confirm.
            device = device.lower()
        cluster_path = cluster_model_path.name if cluster_model_path is not None else ""
        model = Svc(model_path, config_path, device=device, cluster_model_path=cluster_path, nsf_hifigan_enhance=enhance)
        spks = list(model.spk2id.keys())
        device_name = torch.cuda.get_device_properties(model.dev).name if "cuda" in str(model.dev) else str(model.dev)
        msg = f"Successfully loaded the model to the device{device_name}superior\n"
        if cluster_model_path is None:
            msg += "Clustering model not loaded\n"
        else:
            msg += f"clustering model{cluster_model_path.name}Loading successfully\n"
        msg += "Available sounds for the current model:\n"
        msg += "".join(f"{name} " for name in spks)
        # Guard against a model without speakers instead of failing on spks[0].
        default_speaker = spks[0] if spks else None
        return sid.update(choices=spks, value=default_speaker), msg
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def modelUnload():
    """Release the loaded model, if any, and clear the speaker dropdown.

    Returns:
        A tuple ``(speaker_dropdown_update, message)``; the message tells
        whether a model was actually unloaded.
    """
    global model
    if model is not None:
        model.unload_model()
        model = None
        torch.cuda.empty_cache()
        status = "Model unloading completed!"
    else:
        status = "No models need to be uninstalled!"
    return sid.update(choices=[], value=""), status
| # def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold): | |
| # global model | |
| # try: | |
| # if input_audio is None: | |
| # raise gr.Error("You need to upload audio") | |
| # if model is None: | |
| # raise gr.Error("You need to specify the model") | |
| # sampling_rate, audio = input_audio | |
| # # print(audio.shape,sampling_rate) | |
| # audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32) | |
| # if len(audio.shape) > 1: | |
| # audio = librosa.to_mono(audio.transpose(1, 0)) | |
| # temp_path = "temp.wav" | |
| # soundfile.write(temp_path, audio, sampling_rate, format="wav") | |
| # _audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold) | |
| # model.clear_empty() | |
| # os.remove(temp_path) | |
| # #Build the path to save the file and save it to the results folder | |
| # try: | |
| # timestamp = str(int(time.time())) | |
| # filename = sid + "_" + timestamp + ".wav" | |
| # output_file = os.path.join("./results", filename) | |
| # soundfile.write(output_file, _audio, model.target_sample, format="wav") | |
| # return f"The inference is successful and the audio file is saved as results/{filename}", (model.target_sample, _audio) | |
| # except Exception as e: | |
| # if debug: traceback.print_exc() | |
| # return f"File saving failed, please save manually", (model.target_sample, _audio) | |
| # except Exception as e: | |
| # if debug: traceback.print_exc() | |
| # raise gr.Error(e) | |
def vc_fn(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold):
    """Convert the given audio with the currently loaded model.

    Args:
        sid: Speaker to convert to.
        input_audio: ``(sampling_rate, samples)`` tuple where ``samples`` is a
            numpy array, or ``None`` when nothing was uploaded.
        vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale,
        pad_seconds, cl_num, lg_num, lgr_num, f0_predictor,
        enhancer_adaptive_key, cr_threshold: Inference settings passed
            through unchanged to ``model.slice_inference``.

    Returns:
        A tuple ``(message, (sample_rate, audio))`` with the converted audio.

    Raises:
        gr.Error: If no audio or model is available, or if inference fails.
    """
    temp_path = None
    try:
        if input_audio is None:
            raise gr.Error("You need to upload audio")
        if model is None:
            raise gr.Error("You need to specify the model")
        sampling_rate, audio = input_audio
        if np.issubdtype(audio.dtype, np.integer):
            # Scale integer PCM by the dtype's maximum value.
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        else:
            # np.iinfo rejects float dtypes; float samples are used as they are.
            audio = audio.astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        # Use the RAM-based filesystem when it exists. A unique file per call
        # keeps concurrent requests from overwriting each other's input.
        shm_dir = "/dev/shm" if os.path.isdir("/dev/shm") else None
        fd, temp_path = tempfile.mkstemp(suffix=".wav", dir=shm_dir)
        os.close(fd)
        sf.write(temp_path, audio, sampling_rate, format="wav")
        _audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold)
        return "The inference was successful! Listen to the processed audio below.", (model.target_sample, _audio)
    except gr.Error:
        # Already a user-facing error; re-wrapping it would garble the message.
        raise
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(str(e))
    finally:
        # Remove the temporary file on success and on failure alike.
        if temp_path is not None and os.path.exists(temp_path):
            os.remove(temp_path)
def tts_func(_text, _rate, _voice):
    """Synthesize speech for ``_text`` with the ``edge-tts`` command line tool.

    Args:
        _text: Text to speak.
        _rate: Relative speaking rate as a fraction (0.5 means +50%, -0.25
            means -25%); ``None`` is treated as 0.
        _voice: "female" (or the legacy value "女") selects the female voice;
            anything else selects the male voice.

    Returns:
        Path of the audio file written by ``edge-tts``. The caller is
        responsible for deleting it.

    Raises:
        subprocess.CalledProcessError: If ``edge-tts`` exits with an error.
        OSError: If ``edge-tts`` cannot be started.
    """
    # The UI radio offers "male"/"female"; "女" is still accepted so existing
    # callers keep working.
    if _voice in ("女", "female"):
        voice = "zh-CN-XiaoyiNeural"  # female
    else:
        voice = "zh-CN-YunxiNeural"  # male
    # Always emit an explicit sign, e.g. "+50%", "+0%", "-25%".
    ratestr = "{:+.0%}".format(0 if _rate is None else _rate)
    # Write to a unique temp file rather than a name derived from user text,
    # which may contain path separators or collide between requests.
    fd, output_file = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        # Argument list with no shell: user text can no longer inject
        # commands. The `--opt=value` form keeps a value starting with "-"
        # from being parsed as an option.
        subprocess.run(
            [
                "edge-tts",
                f"--text={_text}",
                f"--write-media={output_file}",
                f"--voice={voice}",
                f"--rate={ratestr}",
            ],
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            check=True,
        )
    except BaseException:
        # Do not leave an empty placeholder file behind on failure.
        if os.path.exists(output_file):
            os.remove(output_file)
        raise
    return output_file
def text_clear(text):
    """Return *text* with newlines, commas, parentheses and spaces removed."""
    unwanted = re.compile(r"[\n\,\(\) ]")
    return unwanted.sub("", text)
def vc_fn2(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, text2tts, tts_rate, tts_voice, f0_predictor, enhancer_adaptive_key, cr_threshold):
    """Speak ``text2tts`` with edge-tts and convert the result with ``vc_fn``.

    Args:
        sid: Speaker to convert to.
        input_audio: Unused; present only because the UI wiring passes it.
        text2tts: Text to synthesize.
        tts_rate: Relative speaking rate handed to ``tts_func``.
        tts_voice: Voice selection handed to ``tts_func``.
        vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale,
        pad_seconds, cl_num, lg_num, lgr_num, f0_predictor,
        enhancer_adaptive_key, cr_threshold: Passed through to ``vc_fn``.

    Returns:
        Whatever ``vc_fn`` returns: ``(message, (sample_rate, audio))``.

    Raises:
        gr.Error: If no text was entered or the conversion fails.
    """
    text2tts = text_clear(text2tts or "")
    if not text2tts:
        raise gr.Error("You need to enter text")
    # Use edge-tts to convert text into audio
    output_file = tts_func(text2tts, tts_rate, tts_voice)
    save_path2 = None
    try:
        # Adjust sampling rate
        sr2 = 44100
        wav, sr = librosa.load(output_file)
        wav2 = librosa.resample(wav, orig_sr=sr, target_sr=sr2)
        # Unique temp file instead of a name derived from the user's text.
        fd, save_path2 = tempfile.mkstemp(suffix="_44k.wav")
        os.close(fd)
        # Clip first so samples outside [-1, 1] saturate instead of wrapping
        # around when cast to int16.
        pcm = (np.clip(wav2, -1.0, 1.0) * np.iinfo(np.int16).max).astype(np.int16)
        wavfile.write(save_path2, sr2, pcm)
        # Read audio
        sample_rate, data = gr_pu.audio_from_file(save_path2)
        vc_input = (sample_rate, data)
        return vc_fn(sid, vc_input, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold)
    finally:
        # Clean up both intermediate files even when a step above failed.
        for path in (output_file, save_path2):
            if path is not None and os.path.exists(path):
                os.remove(path)
def debug_change(enabled=None):
    """Update the module-level ``debug`` flag from the debug checkbox.

    Args:
        enabled: Current checkbox state as delivered by the change event.
            When omitted, falls back to ``debug_button.value`` as before.
            NOTE(review): that attribute appears to hold the configured
            initial value rather than the live state - confirm against the
            installed gradio version.
    """
    global debug
    debug = debug_button.value if enabled is None else bool(enabled)


# Build the web UI. The nesting below is reconstructed from the order of the
# `with` statements and component definitions.
with gr.Blocks(
    theme=gr.themes.Base(
        primary_hue=gr.themes.colors.green,
        font=["Source Sans Pro", "Arial", "sans-serif"],
        font_mono=['JetBrains mono', "Consolas", 'Courier New'],
    ),
) as app:
    with gr.Tabs():
        with gr.TabItem("reasoning"):
            gr.Markdown(value="\nSo-vits-svc 4.0 reasoning webui\n")
            with gr.Row(variant="panel"):
                with gr.Column():
                    gr.Markdown(value="\n<font size=2> Model settings</font>\n")
                    model_path = gr.Text(label="Select model file", value="logs/44k/G_709600.pth")
                    config_path = gr.Text(label="Select profile", value="configs/config.json")
                    # Startup diagnostics: these print the component objects,
                    # not the path values entered by the user.
                    print('model_path', model_path)
                    print('config_path', config_path)
                    cluster_model_path = gr.File(label="Select the clustering model file (if not selected, you can leave it unselected)")
                    device = gr.Dropdown(label="Inference device, the default is to automatically select the CPU and GPU", choices=["Auto", *cuda.keys(), "CPU"], value="Auto")
                    enhance = gr.Checkbox(label="Whether to use NSF_HIFIGAN enhancement. This option has a certain sound quality enhancement effect on some models with small training sets, but has a negative effect on well-trained models. It is turned off by default.", value=False)
                with gr.Column():
                    gr.Markdown(value='\n<font size=3>After all files on the left are selected (all file modules display download), click "Load Model" to analyze:</font>\n')
                    model_load_button = gr.Button(value="Load model", variant="primary")
                    model_unload_button = gr.Button(value="Unload model", variant="primary")
                    sid = gr.Dropdown(label="timbre (speaker)")
                    sid_output = gr.Textbox(label="Output Message")
            with gr.Row(variant="panel"):
                with gr.Column():
                    gr.Markdown(value="\n<font size=2> Inference settings</font>\n")
                    auto_f0 = gr.Checkbox(label="Automatic f0 prediction, combined with the clustering model f0 prediction effect is better, but it will cause the pitch change function to fail (only for voice conversion, singing if this option is checked will cause extreme out-of-tune)", value=False)
                    f0_predictor = gr.Dropdown(label="Select F0 predictor, you can choose crepe, pm, dio, harvest, the default is pm (note: crepe uses the mean filter for the original F0)", choices=["pm", "dio", "harvest", "crepe"], value="pm")
                    vc_transform = gr.Number(label="Pitch change (integer, can be positive or negative, number of semitones, rising an octave is 12)", value=0)
                    cluster_ratio = gr.Number(label="Clustering model mixing ratio, between 0 and 1, 0 means clustering is not enabled. Using the clustering model can improve the timbre similarity, but it will lead to a decrease in pronunciation (if used, it is recommended to be around 0.5)", value=0)
                    slice_db = gr.Number(label="slice threshold", value=-40)
                    noise_scale = gr.Number(label="noise_scale It is recommended not to move, as it will affect the sound quality and metaphysical parameters.", value=0.4)
                with gr.Column():
                    pad_seconds = gr.Number(label="Infer the audio pad seconds. Due to unknown reasons, there will be abnormal sound at the beginning and end. It will not appear after a short silent section of the pad.", value=0.5)
                    cl_num = gr.Number(label="Audio is automatically sliced, 0 means no slicing, the unit is seconds (s)", value=0)
                    lg_num = gr.Number(label="The cross-fade length of the audio slices at both ends. If the vocals are incoherent after automatic slicing, you can adjust this value. If it is coherent, it is recommended to use the default value 0. Note that this setting will affect the inference speed. The unit is seconds/s.", value=0)
                    lgr_num = gr.Number(label="After automatic audio slicing, the head and tail of each slice need to be discarded. This parameter sets the proportion of intersection length retention, ranging from 0-1, left open and right closed", value=0.75)
                    enhancer_adaptive_key = gr.Number(label="Adapt the enhancer to a higher range (in semitones) | Default is 0", value=0)
                    cr_threshold = gr.Number(label="F0 filter threshold, only effective when crepe is started. The value range is from 0-1. Lowering this value can reduce the probability of out-of-tune, but it will increase the mute sound.", value=0.05)
            with gr.Tabs():
                with gr.TabItem("audio to audio"):
                    vc_input3 = gr.Audio(label="Select audio")
                    vc_submit = gr.Button("audio conversion", variant="primary")
                with gr.TabItem("Text to audio"):
                    text2tts = gr.Textbox(label="Enter the text you want to translate here. Note that it is recommended to turn on F0 prediction when using this function, otherwise it will be very strange.")
                    tts_rate = gr.Number(label="tts speaking speed", value=0)
                    tts_voice = gr.Radio(label="gender", choices=["male", "female"], value="male")
                    vc_submit2 = gr.Button("text conversion", variant="primary")
            with gr.Row():
                with gr.Column():
                    vc_output1 = gr.Textbox(label="Output Message")
                with gr.Column():
                    vc_output2 = gr.Audio(label="Output Audio", interactive=False)
        with gr.TabItem("Gadget/Lab Features"):
            gr.Markdown(value="\n<font size=2> So-vits-svc 4.0 Gadget/Lab Features</font>\n")
            with gr.Tabs():
                with gr.TabItem("static sound fusion"):
                    gr.Markdown(value=(
                        "\n"
                        "<font size=2> Introduction: This function can synthesize multiple sound models into one sound model (convex combination or linear combination of multiple model parameters), thereby creating sound lines that do not exist in reality.\n"
                        "Notice:\n"
                        "1. This function only supports single-speaker models\n"
                        "2. If you forcibly use a multi-speaker model, you need to ensure that the number of speakers in multiple models is the same so that sounds under the same SpaekerID can be mixed.\n"
                        "3. Ensure that the model fields in the config.json of all models to be mixed are the same\n"
                        "4. The output hybrid model can use any config.json of the model to be synthesized, but the clustering model cannot be used.\n"
                        "5. When uploading models in batches, it is best to put the models into a folder, select them and upload them together.\n"
                        "6. The recommended size for adjusting the mixing ratio is between 0-100. It can also be adjusted to other numbers, but unknown effects will occur in linear combination mode.\n"
                        "7. After the mixing is completed, the file will be saved in the project root directory with the file name output.pth\n"
                        "8. The convex combination mode will perform Softmax on the mixing ratio so that the mixing ratio adds up to 1, while the linear combination mode will not\n"
                        "</font>\n"
                    ))
                    mix_model_path = gr.Files(label="Select the required hybrid model file")
                    mix_model_upload_button = gr.UploadButton("Select/Append requires hybrid model files", file_count="multiple", variant="primary")
                    mix_model_output1 = gr.Textbox(
                        label="Mixing ratio adjustment, unit/%",
                        interactive=True,
                    )
                    mix_mode = gr.Radio(choices=["Convex combination", "linear combination"], label="Fusion mode", value="Convex combination", interactive=True)
                    mix_submit = gr.Button("Voice fusion starts", variant="primary")
                    mix_model_output2 = gr.Textbox(
                        label="Output Message",
                    )
                    mix_model_path.change(updata_mix_info, [mix_model_path], [mix_model_output1])
                    mix_model_upload_button.upload(upload_mix_append_file, [mix_model_upload_button, mix_model_path], [mix_model_path, mix_model_output1])
                    mix_submit.click(mix_submit_click, [mix_model_output1, mix_mode], [mix_model_output2])
    with gr.Tabs():
        with gr.Row(variant="panel"):
            with gr.Column():
                gr.Markdown(value="\n<font size=2> WebUI settings</font>\n")
                debug_button = gr.Checkbox(label="Debug mode, if you need to turn it on to report bugs to the community, the console can display specific error prompts after turning it on.", value=debug)

    # Event wiring. The second output of the audio-to-audio button is an extra
    # Audio component created right here, separate from vc_output2.
    vc_submit.click(vc_fn, [sid, vc_input3, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold], [vc_output1, gr.components.Audio(type="numpy", label="Processed Audio")])
    vc_submit2.click(vc_fn2, [sid, vc_input3, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, text2tts, tts_rate, tts_voice, f0_predictor, enhancer_adaptive_key, cr_threshold], [vc_output1, vc_output2])
    # Pass the checkbox itself as input so the handler receives its current state.
    debug_button.change(debug_change, [debug_button], [])
    model_load_button.click(modelAnalysis, [model_path, config_path, cluster_model_path, device, enhance], [sid, sid_output])
    model_unload_button.click(modelUnload, [], [sid, sid_output])

app.launch(debug=True)