Spaces:
Runtime error
Runtime error
| import multiprocessing | |
| import os | |
| import re | |
| import torch | |
| import glob | |
| import gradio as gr | |
| import librosa | |
| import numpy as np | |
| import soundfile as sf | |
| from inference.infer_tool import Svc | |
| import logging | |
| import json | |
| import yaml | |
| import time | |
| import subprocess | |
| import shutil | |
| import utils | |
| import datetime | |
| import traceback | |
| from utils import mix_model | |
| from onnxexport.model_onnx import SynthesizerTrn | |
| from itertools import chain | |
| from compress_model import removeOptimizer | |
| from auto_slicer import AutoSlicer | |
# Keep noisy third-party loggers quiet: only warnings and above are shown.
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)

# Filesystem layout used by the functions in this file.
workdir = "logs/44k"                   # So-VITS checkpoints and cluster/index models
diff_workdir = "logs/44k/diffusion"    # diffusion checkpoints
config_dir = "configs/"                # JSON / YAML configuration files
raw_path = "dataset_raw"               # raw dataset, one sub-folder per speaker
raw_wavs_path = "raw"
models_backup_path = 'models_backup'   # previous runs are moved here before a new training
root_dir = "checkpoints"               # per-model folders used by the ONNX export

# Mutable module state.
debug = False          # when true, handlers print tracebacks before raising gr.Error
sovits_params = {}     # filled by get_default_settings()
diff_params = {}       # filled by get_default_settings()
loaded = None          # set by Newload_model_func() to the cluster name it loaded
def debug_change():
    """Copy ``debug_button.value`` into the module-level ``debug`` flag."""
    global debug
    # NOTE(review): debug_button is defined outside this part of the file --
    # presumably a Gradio checkbox; confirm that .value reflects the live state.
    debug = debug_button.value
def get_default_settings():
    """Load the default training parameters from ``settings.yaml``.

    Stores the ``sovits_params`` and ``diff_params`` sections in the
    module-level variables of the same name.

    Returns:
        The tuple ``(sovits_params, diff_params)``.
    """
    global sovits_params, diff_params
    with open("settings.yaml", 'r') as settings_file:
        settings = yaml.safe_load(settings_file)
    sovits_params = settings['sovits_params']
    diff_params = settings['diff_params']
    return sovits_params, diff_params
def save_default_settings(log_interval, eval_interval, keep_ckpts, batch_size, learning_rate, fp16_run, all_in_mem, num_workers, cache_all_data, cache_device, amp_dtype, diff_batch_size, diff_lr, diff_interval_log, diff_interval_val, diff_force_save):
    """Persist the given values as the new defaults in ``settings.yaml``.

    The file is read, its ``sovits_params`` and ``diff_params`` sections are
    updated with the (type-coerced) arguments, and the result is written back
    without re-sorting keys.

    Returns:
        A confirmation message.
    """
    yaml_path = "settings.yaml"
    with open(yaml_path, 'r') as settings_file:
        settings = yaml.safe_load(settings_file)

    settings['sovits_params'].update({
        'log_interval': int(log_interval),
        'eval_interval': int(eval_interval),
        'keep_ckpts': int(keep_ckpts),
        'batch_size': int(batch_size),
        'learning_rate': float(learning_rate),
        'fp16_run': fp16_run,
        'all_in_mem': all_in_mem,
    })
    settings['diff_params'].update({
        'num_workers': int(num_workers),
        'cache_all_data': cache_all_data,
        'cache_device': str(cache_device),
        'amp_dtype': str(amp_dtype),
        'diff_batch_size': int(diff_batch_size),
        'diff_lr': float(diff_lr),
        'diff_interval_log': int(diff_interval_log),
        'diff_interval_val': int(diff_interval_val),
        'diff_force_save': int(diff_force_save),
    })

    with open(yaml_path, 'w') as settings_file:
        yaml.safe_dump(settings, settings_file, default_flow_style=False, sort_keys=False)
    return "成功保存默认配置"
def get_model_info(choice_ckpt):
    """Name the speech encoder of a checkpoint from its speaker-embedding shape.

    Args:
        choice_ckpt: Checkpoint file name inside ``workdir``.

    Returns:
        The encoder label matching the second dimension of ``emb_g.weight``,
        or a message when the weight is missing or the size is not recognised.
    """
    checkpoint_path = os.path.join(workdir, choice_ckpt)
    # Always load on the CPU.
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    embedding = checkpoint["model"].get("emb_g.weight")
    if embedding is None:
        return "所选模型缺少emb_g.weight,你可能选择了一个底模"
    _rows, width = embedding.size()
    encoder_by_width = {
        768: "Vec768-Layer12",
        256: "Vec256-Layer9 / HubertSoft",
        1024: "Whisper-PPG",
    }
    return encoder_by_width.get(width, "不受支持的模型")
def load_json_encoder(config_choice):
    """Return the ``speech_encoder`` declared in a JSON config file.

    Args:
        config_choice: Config file name inside ``config_dir``.

    Returns:
        The encoder name as a string, or an explanatory message when the
        config has no ``model.speech_encoder`` entry.
    """
    # Join the two components properly instead of concatenating them, so the
    # path stays valid even if config_dir loses its trailing slash.
    config_file = os.path.join(config_dir, config_choice)
    with open(config_file, 'r') as f:
        config = json.load(f)
    try:
        return str(config["model"]["speech_encoder"])
    except Exception as e:
        # A KeyError naming "speech_encoder" means the key itself is absent.
        if "speech_encoder" in str(e):
            return "你的配置文件似乎是未作兼容的旧版,请根据文档指示对你的配置文件进行修改"
        return f"出错了: {e}"
def load_model_func(ckpt_name, cluster_name, config_name, enhance, diff_model_name, diff_config_name, only_diffusion, encoder, using_device):
    """Build the global ``Svc`` model from the selected files.

    Args:
        ckpt_name: Checkpoint file name inside ``workdir``.
        cluster_name: Cluster / feature-index file name inside ``workdir``
            (``"no_clu"`` when none is selected).
        config_name: JSON config file name inside ``config_dir``.
        enhance: Passed through to ``Svc``.
        diff_model_name: Diffusion model file name inside ``diff_workdir``
            (``"no_diff"`` disables shallow diffusion).
        diff_config_name: Diffusion config name, or ``"no_diff_config"`` to
            use ``configs/diffusion.yaml``.
        only_diffusion: Passed through to ``Svc``.
        encoder: Encoder label; ``"Whisper-PPG"`` forces a 25 second clip.
        using_device: Device string, or ``"Auto"`` to let ``Svc`` decide.

    Returns:
        ``(status message, speaker dropdown update, clip seconds)``.
    """
    global model
    config_path = os.path.join(config_dir, config_name)
    if diff_config_name != "no_diff_config":
        diff_config_path = os.path.join(config_dir, diff_config_name)
    else:
        diff_config_path = "configs/diffusion.yaml"

    with open(config_path, 'r') as f:
        config = json.load(f)
    # One tolerant lookup: a config without speakers yields an empty list and
    # the placeholder choice instead of a KeyError.
    spk_dict = config.get("spk") or {}
    spk_choice = next(iter(spk_dict), "未检测到音色")

    ckpt_path = os.path.join(workdir, ckpt_name)
    # A .pkl file selects feature retrieval rather than a k-means cluster model.
    fr = os.path.splitext(cluster_name)[1] == ".pkl"
    cluster_path = os.path.join(workdir, cluster_name)
    diff_model_path = os.path.join(diff_workdir, diff_model_name)
    shallow_diffusion = diff_model_name != "no_diff"
    use_spk_mix = False
    device = None if using_device == "Auto" else using_device

    model = Svc(ckpt_path,
                config_path,
                device,
                cluster_path,
                enhance,
                diff_model_path,
                diff_config_path,
                shallow_diffusion,
                only_diffusion,
                use_spk_mix,
                fr)

    spk_list = list(spk_dict)
    # Whisper must be sliced into 25 second clips.
    clip = 25 if encoder == "Whisper-PPG" else 0
    if "cuda" in str(model.dev):
        device_name = torch.cuda.get_device_properties(model.dev).name
    else:
        device_name = str(model.dev)
    index_or_kmeans = "特征索引" if fr else "聚类模型"
    clu_load = "未加载" if cluster_name == "no_clu" else cluster_name
    diff_load = "未加载" if diff_model_name == "no_diff" else diff_model_name
    output_msg = f"模型被成功加载到了{device_name}上\n{index_or_kmeans}:{clu_load}\n扩散模型:{diff_load}"
    return output_msg, gr.Dropdown.update(choices=spk_list, value=spk_choice), clip
def Newload_model_func(ckpt_name, cluster_name, config_name2, enhance2, diff_model_name2, diff_config_name2, only_diffusion2, encoder2, using_device2):
    """Build the global ``Svc`` model, reading most settings from objects' ``.value``.

    Unlike ``load_model_func`` this returns nothing; it stores the model in
    the module-level ``model`` and records the cluster name in ``loaded``.

    Args:
        ckpt_name: Checkpoint file name inside ``workdir``.
        cluster_name: Cluster / feature-index file name inside ``workdir``.
        config_name2, enhance2, diff_model_name2, diff_config_name2,
        only_diffusion2, using_device2: Objects whose ``.value`` attribute
            holds the corresponding setting.
        encoder2: Unused; kept so existing callers keep working.
    """
    global model, loaded
    config_name = config_name2.value
    enhance = enhance2.value
    diff_model_name = diff_model_name2.value
    diff_config_name = diff_config_name2.value
    only_diffusion = only_diffusion2.value
    using_device = using_device2.value

    config_path = os.path.join(config_dir, config_name)
    if diff_config_name != "no_diff_config":
        diff_config_path = os.path.join(config_dir, diff_config_name)
    else:
        diff_config_path = "configs/diffusion.yaml"

    ckpt_path = os.path.join(workdir, ckpt_name)
    # A .pkl file selects feature retrieval rather than a k-means cluster model.
    fr = os.path.splitext(cluster_name)[1] == ".pkl"
    cluster_path = os.path.join(workdir, cluster_name)
    diff_model_path = os.path.join(diff_workdir, diff_model_name)
    shallow_diffusion = diff_model_name != "no_diff"
    use_spk_mix = False
    device = None if using_device == "Auto" else using_device

    model = Svc(ckpt_path,
                config_path,
                device,
                cluster_path,
                enhance,
                diff_model_path,
                diff_config_path,
                shallow_diffusion,
                only_diffusion,
                use_spk_mix,
                fr)
    loaded = cluster_name
def get_file_options(directory, extension):
    """List the entries of ``directory`` whose names end with ``extension``.

    Args:
        directory: Folder to scan (not recursive).
        extension: Required name suffix, e.g. ``".pth"``.

    Returns:
        Matching entry names in ``os.listdir`` order; an empty list when the
        directory does not exist, so option lists can be built before any
        model or config folder has been created.
    """
    if not os.path.isdir(directory):
        return []
    return [name for name in os.listdir(directory) if name.endswith(extension)]
def load_options():
    """Collect the file choices offered by the model-selection widgets.

    Returns:
        ``(ckpt_list, config_list, cluster_list, diff_list, diff_config_list)``.
        Checkpoints starting with ``D_`` are left out; the cluster and
        diffusion lists start with the ``no_clu`` / ``no_diff`` placeholders.
    """
    generator_ckpts = [name for name in get_file_options(workdir, ".pth") if not name.startswith("D_")]
    json_configs = get_file_options(config_dir, ".json")
    # Cluster models (.pt) and feature-retrieval indexes (.pkl).
    cluster_models = ["no_clu"]
    cluster_models += get_file_options(workdir, ".pt")
    cluster_models += get_file_options(workdir, ".pkl")
    diffusion_models = ["no_diff"] + get_file_options(diff_workdir, ".pt")
    yaml_configs = get_file_options(config_dir, ".yaml")
    return generator_ckpts, json_configs, cluster_models, diffusion_models, yaml_configs
def refresh_options():
    """Re-scan the model folders and return updates for the five choice widgets.

    Returns:
        A 5-tuple of ``.update(choices=...)`` results for ``choice_ckpt``,
        ``config_choice``, ``cluster_choice``, ``diff_choice`` and
        ``diff_config_choice``, in that order.
    """
    ckpts, configs, clusters, diffs, diff_configs = load_options()
    ckpt_update = choice_ckpt.update(choices=ckpts)
    config_update = config_choice.update(choices=configs)
    cluster_update = cluster_choice.update(choices=clusters)
    diff_update = diff_choice.update(choices=diffs)
    diff_config_update = diff_config_choice.update(choices=diff_configs)
    return ckpt_update, config_update, cluster_update, diff_update, diff_config_update
def vc_infer(sid, input_audio, input_audio_path, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment):
    """Run the loaded model on one audio file and save the result under ``results/``.

    Args:
        sid: Speaker name; also embedded in the output file name.
        input_audio: Sample array of the input (normalised locally only).
        input_audio_path: Path handed to ``model.slice_inference``.
        Remaining arguments are forwarded to ``model.slice_inference``.

    Returns:
        Path of the written ``.wav`` file.
    """
    # Integer PCM is scaled by the dtype maximum and multi-channel input is
    # mixed down. The converted array is not passed on: inference reads the
    # file at input_audio_path.
    if np.issubdtype(input_audio.dtype, np.integer):
        full_scale = np.iinfo(input_audio.dtype).max
        input_audio = (input_audio / full_scale).astype(np.float32)
    if len(input_audio.shape) > 1:
        input_audio = librosa.to_mono(input_audio.transpose(1, 0))

    converted = model.slice_inference(
        input_audio_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0,
        noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor,
        enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix,
        second_encoding, loudness_envelope_adjustment,
    )
    model.clear_empty()

    if not os.path.exists("results"):
        os.makedirs("results")
    stem = os.path.splitext(os.path.basename(input_audio_path))[0]
    timestamp = str(int(time.time()))
    output_file_name = "_".join([stem, sid, timestamp]) + ".wav"
    output_file_path = os.path.join("results", output_file_name)
    sf.write(output_file_path, converted, model.target_sample, format="wav")
    return output_file_path
def vc_fn(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment):
    """Convert a single uploaded audio clip with the loaded model.

    Args:
        sid: Target speaker.
        input_audio: ``(sampling_rate, samples)`` pair, or ``None``.
        Remaining arguments are forwarded to ``vc_infer``.

    Returns:
        ``(status message, output path or None)``.

    Raises:
        gr.Error: Wrapping any exception raised during conversion.
    """
    global model
    try:
        if input_audio is None:
            return "You need to upload an audio", None
        if model is None:
            return "You need to upload an model", None
        sampling_rate, audio = input_audio
        temp_path = "temp.wav"
        try:
            sf.write(temp_path, audio, sampling_rate, format="wav")
            output_file_path = vc_infer(sid, audio, temp_path, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
        finally:
            # Remove the scratch file even when inference fails, so a failed
            # run does not leave temp.wav behind.
            if os.path.exists(temp_path):
                os.remove(temp_path)
        return "Success", output_file_path
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def vc_batch_fn(sid, input_audio_files, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment):
    """Convert every uploaded file in turn; results go to ``results/``.

    Args:
        sid: Target speaker.
        input_audio_files: Uploaded file objects exposing a ``.name`` path.
        Remaining arguments are forwarded to ``vc_infer``.

    Returns:
        A status message.

    Raises:
        gr.Error: Wrapping any exception raised during conversion.
    """
    global model
    try:
        if not input_audio_files:
            return "You need to upload at least one audio file"
        if model is None:
            return "You need to upload a model"
        for uploaded in input_audio_files:
            source_path = uploaded.name
            samples, _rate = sf.read(source_path)
            vc_infer(sid, samples, source_path, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
        return "批量推理完成,音频已经被保存到results文件夹"
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def tts_fn(_text, _speaker, sid, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment):
    """Synthesise ``_text`` with ``tts.py`` and convert the speech with the model.

    Args:
        _text: Text to speak.
        _speaker: Voice passed to ``tts.py``.
        sid: Target speaker for the conversion.
        Remaining arguments are forwarded to ``vc_infer``.

    Returns:
        ``(status message, output path or None)``.

    Raises:
        gr.Error: Wrapping any exception raised along the way.
    """
    global model
    try:
        # Check for a model first so no TTS work is done when conversion
        # cannot follow.
        if model is None:
            return "You need to upload a model", None
        subprocess.run([r"python", "tts.py", _text, _speaker])
        # Resample tts.wav to 44.1 kHz. The target rate is kept in its own
        # variable so the rate returned by librosa.load cannot overwrite it.
        target_sr = 44100
        y, orig_sr = librosa.load("tts.wav", sr=None)
        resampled_y = librosa.resample(y, orig_sr=orig_sr, target_sr=target_sr)
        sf.write("tts.wav", resampled_y, target_sr, subtype="PCM_16")
        input_audio = "tts.wav"
        audio, sampling_rate = sf.read(input_audio)
        output_file_path = vc_infer(sid, audio, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
        return "Success", output_file_path
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def load_raw_dirs():
    """Validate the raw dataset and report the speaker folders found in it.

    Files directly inside ``raw_path`` are ignored; only files in sub-folders
    are checked.

    Returns:
        An error string listing offending files (bad names first, then
        non-wav files), otherwise the result of ``raw_dirs_list.update``
        carrying either the speaker folder names or a "not found" message.
    """
    # Check file names (extension excluded) against the allowed character set.
    name_pattern = re.compile(r'^[a-zA-Z0-9_@#$%^&()_+\-=\s\.]*$')
    illegal_files = []
    for root, _dirs, files in os.walk(raw_path):
        if root == raw_path:
            continue
        illegal_files.extend(
            file for file in files
            if not name_pattern.match(os.path.splitext(file)[0])
        )
    if illegal_files:
        return f"数据集文件名只能包含数字、字母、下划线,以下文件不符合要求,请改名后再试:{illegal_files}"

    # Make sure every dataset file is a .wav file.
    for root, _dirs, files in os.walk(raw_path):
        if root == raw_path:
            continue
        illegal_files.extend(file for file in files if not file.lower().endswith('.wav'))
    if illegal_files:
        return f"以下文件为非wav格式文件,请删除后再试:{illegal_files}"

    with os.scandir(raw_path) as entries:
        spk_dirs = [entry.name for entry in entries if entry.is_dir()]
    if spk_dirs:
        return raw_dirs_list.update(value=spk_dirs)
    return raw_dirs_list.update(value="未找到数据集,请检查dataset_raw文件夹")
def dataset_preprocess(encoder, f0_predictor, use_diff, vol_aug, skip_loudnorm, num_processes):
    """Run the three preprocessing scripts and stream their console output.

    Generator: every yield is ``(log text so far, None)``; the final yield
    carries a ``gr.Textbox.update`` holding the ``spk`` entry of
    ``configs/config.json``.

    Args:
        encoder: Value for ``--speech_encoder``.
        f0_predictor: Value for ``--f0_predictor``.
        use_diff: Adds ``--use_diff`` when true.
        vol_aug: Adds ``--vol_aug`` when true.
        skip_loudnorm: Adds ``--skip_loudnorm`` when true.
        num_processes: Value for ``--num_processes``.
    """
    diff_arg = "--use_diff" if use_diff else ""
    vol_aug_arg = "--vol_aug" if vol_aug else ""
    skip_loudnorm_arg = "--skip_loudnorm" if skip_loudnorm else ""
    preprocess_commands = [
        f"python resample.py {skip_loudnorm_arg}",
        f"python preprocess_flist_config.py --speech_encoder {encoder} {vol_aug_arg}",
        f"python preprocess_hubert_f0.py --num_processes {num_processes} --f0_predictor {f0_predictor} {diff_arg}",
    ]
    accumulated_output = ""

    # Clear the previous dataset: delete every sub-folder of dataset/44k.
    for entry in os.listdir("dataset/44k"):
        dataset_dir = "dataset/44k/" + str(entry)
        if not os.path.isdir(dataset_dir):
            continue
        shutil.rmtree(dataset_dir)
        accumulated_output += f"Deleting previous dataset: {entry}\n"

    for command in preprocess_commands:
        try:
            process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, text=True)
            accumulated_output += f"Command: {command}, Using Encoder: {encoder}, Using f0 Predictor: {f0_predictor}\n"
            yield accumulated_output, None
            progress_line = None
            for line in process.stdout:
                # Progress-bar lines replace one another instead of piling up
                # in the log.
                if "it/s" in line or "s/it" in line:
                    progress_line = line
                else:
                    accumulated_output += line
                if progress_line is None:
                    shown = accumulated_output
                else:
                    shown = accumulated_output + progress_line
                yield shown, None
            process.communicate()
        except subprocess.CalledProcessError as e:
            accumulated_output += f"Error: {e.output}\n"
            yield accumulated_output, None
        # Keep the last progress line in the permanent log.
        if progress_line is not None:
            accumulated_output += progress_line
        accumulated_output += '-' * 50 + '\n'
        yield accumulated_output, None

    with open("configs/config.json", 'r') as f:
        config = json.load(f)
    yield accumulated_output, gr.Textbox.update(value=config.get('spk', None))
def regenerate_config(encoder, vol_aug):
    """Re-run ``preprocess_flist_config.py`` and return its console output.

    Args:
        encoder: Value for ``--speech_encoder``.
        vol_aug: Adds ``--vol_aug`` when true.

    Returns:
        The captured output followed by a success line, or by an error line
        when the command could not be started or exited with a non-zero
        status.
    """
    vol_aug_arg = "--vol_aug" if vol_aug else ""
    cmd = r"python preprocess_flist_config.py --speech_encoder %s %s" % (encoder, vol_aug_arg)
    output = ""
    try:
        # The context manager closes the pipe and waits for the process, so
        # returncode is set once the block exits.
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, text=True) as process:
            for line in process.stdout:
                output += line
    except OSError as e:
        return output + f"Error: {e}\n"
    if process.returncode != 0:
        # Do not claim success when the script failed.
        return output + f"Error: command exited with status {process.returncode}\n"
    return output + "Regenerate config file successfully."
def clear_output():
    """Return a textbox update that replaces the content with a "cleared" notice."""
    cleared_notice = "Cleared!>_<"
    return gr.Textbox.update(value=cleared_notice)
def read_config(config_path):
    """Parse the JSON file at ``config_path`` and return the decoded object."""
    with open(config_path, 'r') as handle:
        return json.load(handle)
def config_fn(log_interval, eval_interval, keep_ckpts, batch_size, lr, fp16_run, all_in_mem, diff_num_workers, diff_cache_all_data, diff_batch_size, diff_lr, diff_interval_log, diff_interval_val, diff_cache_device, diff_amp_dtype, diff_force_save):
    """Write the training settings into the So-VITS and diffusion configs.

    Updates the ``train`` section of ``configs/config.json`` first, then the
    ``train`` section of ``configs/diffusion.yaml``.

    Returns:
        A confirmation message.
    """
    config_origin = "configs/config.json"
    diff_config = "configs/diffusion.yaml"

    # So-VITS settings (JSON).
    config_data = read_config(config_origin)
    config_data['train'].update({
        'log_interval': int(log_interval),
        'eval_interval': int(eval_interval),
        'keep_ckpts': int(keep_ckpts),
        'batch_size': int(batch_size),
        'learning_rate': float(lr),
        'fp16_run': fp16_run,
        'all_in_mem': all_in_mem,
    })
    with open(config_origin, 'w') as json_out:
        json.dump(config_data, json_out, indent=4)

    # Diffusion settings (YAML).
    with open(diff_config, 'r') as yaml_in:
        diff_config_data = yaml.safe_load(yaml_in)
    diff_config_data['train'].update({
        'num_workers': int(diff_num_workers),
        'cache_all_data': diff_cache_all_data,
        'batch_size': int(diff_batch_size),
        'lr': float(diff_lr),
        'interval_log': int(diff_interval_log),
        'interval_val': int(diff_interval_val),
        'cache_device': str(diff_cache_device),
        'amp_dtype': str(diff_amp_dtype),
        'interval_force_save': int(diff_force_save),
    })
    with open(diff_config, 'w') as yaml_out:
        yaml.safe_dump(diff_config_data, yaml_out, default_flow_style=False, sort_keys=False)
    return "配置文件写入完成"
def check_dataset(dataset_path):
    """Check that a preprocessed dataset exists under ``dataset_path``.

    Returns:
        A warning string when the folder is empty or when no ``.npy`` / ``.pt``
        file exists anywhere below it; ``None`` when the dataset looks usable.
    """
    if not os.listdir(dataset_path):
        return "数据集不存在,请检查dataset文件夹"
    has_feature_file = any(
        name.endswith('.npy') or name.endswith('.pt')
        for _root, _dirs, names in os.walk(dataset_path)
        for name in names
    )
    if not has_feature_file:
        return "数据集中未检测到f0和hubert文件,可能是预处理未完成"
    return None
def training(gpu_selection, encoder):
    """Start a fresh So-VITS training run in a new console window.

    Existing contents of ``workdir`` (except ``diffusion``) are moved to a
    timestamped folder under ``models_backup_path``, the pre-trained
    ``D_0.pth`` / ``G_0.pth`` for ``encoder`` are copied in, and
    ``train.py`` is launched through ``cmd``.

    Args:
        gpu_selection: Value assigned to ``CUDA_VISIBLE_DEVICES``.
        encoder: Encoder key selecting the pre-trained model folder.

    Returns:
        A status or warning message.
    """
    config_data = read_config("configs/config.json")
    vol_emb = config_data["model"]["vol_embedding"]
    dataset_warn = check_dataset("dataset/44k")
    if dataset_warn is not None:
        return dataset_warn

    # Pre-trained model folder per encoder; each holds D_0.pth and G_0.pth.
    pretrained_dirs = {
        "vec256l9": "pre_trained_model",
        "vec768l12": "pre_trained_model/768l12/vol_emb" if vol_emb else "pre_trained_model/768l12",
        "hubertsoft": "pre_trained_model/hubertsoft",
        "whisper-ppg": "pre_trained_model/whisper-ppg",
        "cnhubertlarge": "pre_trained_model/cnhubertlarge",
        "dphubert": "pre_trained_model/dphubert",
        "whisper-ppg-large": "pre_trained_model/whisper-ppg-large",
    }
    if encoder not in pretrained_dirs:
        return "未知编码器"
    pretrained_dir = pretrained_dirs[encoder]
    d_0_path = os.path.join(pretrained_dir, "D_0.pth")
    g_0_path = os.path.join(pretrained_dir, "G_0.pth")

    timestamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')
    new_backup_folder = os.path.join(models_backup_path, str(timestamp))
    # Back up everything unless the work dir holds only the diffusion folder.
    if os.listdir(workdir) != ['diffusion']:
        os.makedirs(new_backup_folder, exist_ok=True)
        for entry in os.listdir(workdir):
            if entry == "diffusion":
                continue
            shutil.move(os.path.join(workdir, entry), os.path.join(new_backup_folder, entry))

    shutil.copy(d_0_path, os.path.join(workdir, "D_0.pth"))
    shutil.copy(g_0_path, os.path.join(workdir, "G_0.pth"))
    cmd = r"set CUDA_VISIBLE_DEVICES=%s && python train.py -c configs/config.json -m 44k" % (gpu_selection)
    subprocess.Popen(["cmd", "/c", "start", "cmd", "/k", cmd])
    return "已经在新的终端窗口开始训练,请监看终端窗口的训练日志。在终端中按Ctrl+C可暂停训练。"
def continue_training(gpu_selection, encoder):
    """Resume So-VITS training in a new console window.

    Args:
        gpu_selection: Value assigned to ``CUDA_VISIBLE_DEVICES``.
        encoder: Must be non-empty; otherwise a prompt message is returned.

    Returns:
        A status or warning message.
    """
    dataset_warn = check_dataset("dataset/44k")
    if dataset_warn is not None:
        return dataset_warn
    if encoder == "":
        return "请先选择预处理对应的编码器"
    # Resuming requires at least one generator checkpoint in the work dir.
    has_generator_ckpt = any(
        name.startswith('G_') and name.endswith('.pth')
        for name in os.listdir(workdir)
    )
    if not has_generator_ckpt:
        return "你还没有已开始的训练"
    cmd = r"set CUDA_VISIBLE_DEVICES=%s && python train.py -c configs/config.json -m 44k" % (gpu_selection)
    subprocess.Popen(["cmd", "/c", "start", "cmd", "/k", cmd])
    return "已经在新的终端窗口开始训练,请监看终端窗口的训练日志。在终端中按Ctrl+C可暂停训练。"
def kmeans_training(kmeans_gpu):
    """Launch ``cluster/train_cluster.py`` in a new console window.

    Args:
        kmeans_gpu: When true, ``--gpu`` is appended to the command.

    Returns:
        A status or warning message.
    """
    if not os.listdir(r"dataset/44k"):
        return "数据集不存在,请检查dataset文件夹"
    if kmeans_gpu:
        cmd = r"python cluster/train_cluster.py --gpu"
    else:
        cmd = r"python cluster/train_cluster.py"
    subprocess.Popen(["cmd", "/c", "start", "cmd", "/k", cmd])
    return "已经在新的终端窗口开始训练,训练聚类模型不会输出日志,CPU训练一般需要5-10分钟左右"
def index_training():
    """Launch ``train_index.py`` in a new console window.

    Returns:
        A status message, or a warning when ``dataset/44k`` is empty.
    """
    if not os.listdir(r"dataset/44k"):
        return "数据集不存在,请检查dataset文件夹"
    launch = ["cmd", "/c", "start", "cmd", "/k", r"python train_index.py -c configs/config.json"]
    subprocess.Popen(launch)
    return "已经在新的终端窗口开始训练"
def diff_training(encoder):
    """Start a fresh diffusion training run in a new console window.

    The encoder is validated *before* anything is touched on disk. Only then
    are the current contents of ``diff_workdir`` moved to a timestamped
    backup folder and the matching pre-trained ``model_0.pt`` copied in.

    Args:
        encoder: ``"vec768l12"`` or ``"hubertsoft"``; other values return a
            message without modifying any files.

    Returns:
        A status or warning message.
    """
    if not os.listdir(r"dataset/44k"):
        return "数据集不存在,请检查dataset文件夹"
    if encoder == "vec256l9" or encoder == "whisper-ppg":
        return "你所选的编码器暂时不支持训练扩散模型"
    pretrained_models = {
        "vec768l12": "pre_trained_model/diffusion/768l12/model_0.pt",
        "hubertsoft": "pre_trained_model/diffusion/hubertsoft/model_0.pt",
    }
    if encoder not in pretrained_models:
        return "请先选择编码器"

    # Back up existing diffusion checkpoints only once the run is known to
    # go ahead, so a rejected encoder never empties the work dir.
    timestamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M')
    new_backup_folder = os.path.join(models_backup_path, "diffusion", str(timestamp))
    existing = os.listdir(diff_workdir)
    if existing:
        os.makedirs(new_backup_folder, exist_ok=True)
        for file in existing:
            shutil.move(os.path.join(diff_workdir, file), os.path.join(new_backup_folder, file))

    shutil.copy(pretrained_models[encoder], os.path.join(diff_workdir, "model_0.pt"))
    subprocess.Popen(["cmd", "/c", "start", "cmd", "/k", r"python train_diff.py -c configs/diffusion.yaml"])
    return "已经在新的终端窗口开始训练,请监看终端窗口的训练日志。在终端中按Ctrl+C可暂停训练。"
def diff_continue_training(encoder):
    """Resume diffusion training in a new console window.

    Args:
        encoder: Must be non-empty; otherwise a prompt message is returned.

    Returns:
        A status or warning message.
    """
    if not os.listdir(r"dataset/44k"):
        return "数据集不存在,请检查dataset文件夹"
    if encoder == "":
        return "请先选择预处理对应的编码器"
    # Resuming requires at least one .pt checkpoint in the diffusion work dir.
    if not any(name.endswith('.pt') for name in os.listdir(diff_workdir)):
        return "你还没有已开始的训练"
    launch = ["cmd", "/c", "start", "cmd", "/k", r"python train_diff.py -c configs/diffusion.yaml"]
    subprocess.Popen(launch)
    return "已经在新的终端窗口开始训练,请监看终端窗口的训练日志。在终端中按Ctrl+C可暂停训练。"
def upload_mix_append_file(files, sfiles):
    """Merge newly uploaded model files with the already selected ones.

    Args:
        files: Newly uploaded file objects exposing ``.name``.
        sfiles: Previously selected file objects, or ``None``.

    Returns:
        ``(file paths, update for mix_model_output1)`` where the update holds
        a JSON object mapping every path to a default weight of 100.

    Raises:
        gr.Error: Wrapping any exception raised while building the list.
    """
    try:
        # Identity comparison: None is a singleton.
        if sfiles is None:
            file_paths = [file.name for file in files]
        else:
            file_paths = [file.name for file in chain(files, sfiles)]
        p = {file: 100 for file in file_paths}
        return file_paths, mix_model_output1.update(value=json.dumps(p, indent=2))
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def mix_submit_click(js, mode):
    """Mix several models according to a JSON ``{path: weight}`` mapping.

    Args:
        js: JSON text mapping model paths to mix rates.
        mode: ``"凸组合"`` or ``"线性组合"``, translated to 0 / 1 for
            ``mix_model``.

    Returns:
        A message containing the path returned by ``mix_model``.

    Raises:
        gr.Error: Wrapping any failure, including an empty ``js``.
    """
    try:
        # Explicit check instead of assert: asserts vanish under -O and
        # produce an empty error message.
        if js.lstrip() == "":
            raise ValueError("mix configuration is empty")
        modes = {"凸组合": 0, "线性组合": 1}
        mode = modes[mode]
        data = list(json.loads(js).items())
        model_path, mix_rate = zip(*data)
        path = mix_model(model_path, mix_rate, mode)
        return f"成功,文件被保存在了{path}"
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def updata_mix_info(files):
    """Refresh the mix-weights textbox after the file selection changed.

    Args:
        files: Selected file objects exposing ``.name``, or ``None``.

    Returns:
        An update for ``mix_model_output1``: empty text when nothing is
        selected, otherwise JSON mapping each file name to 100.

    Raises:
        gr.Error: Wrapping any exception raised while building the update.
    """
    try:
        # Identity comparison: None is a singleton.
        if files is None:
            return mix_model_output1.update(value="")
        p = {file.name: 100 for file in files}
        return mix_model_output1.update(value=json.dumps(p, indent=2))
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
def pth_identify():
    """Verify that every folder under ``root_dir`` holds one ``.pth`` and one ``.json``.

    Returns:
        A message: an error when ``root_dir`` or model folders are missing or
        a folder has the wrong file counts, otherwise the list of recognised
        model folders.
    """
    if not os.path.exists(root_dir):
        return f"未找到{root_dir}文件夹,请先创建一个{root_dir}文件夹并按第一步流程操作"
    model_dirs = [
        entry for entry in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, entry))
    ]
    if not model_dirs:
        return f"未在{root_dir}文件夹中找到模型文件夹,请确保每个模型和配置文件都被放置在单独的文件夹中"

    valid_model_dirs = []
    for path in model_dirs:
        pth_count = len(glob.glob(f"{root_dir}/{path}/*.pth"))
        json_count = len(glob.glob(f"{root_dir}/{path}/*.json"))
        if not (pth_count == 1 and json_count == 1):
            return f"错误: 在{root_dir}/{path}中找到了{pth_count}个.pth文件和{json_count}个.json文件。应当确保每个文件夹内有且只有一个.pth文件和.json文件"
        valid_model_dirs.append(path)
    return f"成功识别了{len(valid_model_dirs)}个模型:{valid_model_dirs}"
def onnx_export():
    """Export every model folder under ``root_dir`` to ONNX.

    For each folder the first ``.pth`` and ``.json`` are loaded, a
    ``SynthesizerTrn`` is built from the config and traced with random dummy
    inputs; the ``.onnx`` file is written next to the checkpoint.

    Returns:
        A success message, or an error message containing the exception text.
    """
    model_dirs = [d for d in os.listdir(root_dir) if os.path.isdir(os.path.join(root_dir, d))]
    # Leading dimension of the dummy hidden-unit tensor per gin_channels value.
    leading_dim_by_channels = {"256": 1, "768": 192}
    try:
        for path in model_dirs:
            pth_files = glob.glob(f"{root_dir}/{path}/*.pth")
            json_files = glob.glob(f"{root_dir}/{path}/*.json")
            model_file = pth_files[0]
            json_file = json_files[0]
            with open(json_file, 'r') as config_file:
                config_data = json.load(config_file)
            channels = config_data["model"]["gin_channels"]
            # Fail with a clear message instead of an unbound (or stale,
            # from the previous folder) para1 for other channel counts.
            para1 = leading_dim_by_channels.get(str(channels))
            if para1 is None:
                raise ValueError(f"unsupported gin_channels: {channels}")
            device = torch.device("cpu")
            hps = utils.get_hparams_from_file(json_file)
            SVCVITS = SynthesizerTrn(
                hps.data.filter_length // 2 + 1,
                hps.train.segment_size // hps.data.hop_length,
                **hps.model)
            _ = utils.load_checkpoint(model_file, SVCVITS, None)
            _ = SVCVITS.eval().to(device)
            for i in SVCVITS.parameters():
                i.requires_grad = False
            n_frame = 10
            test_hidden_unit = torch.rand(para1, n_frame, channels)
            test_pitch = torch.rand(1, n_frame)
            # Frame indices 0..n_frame-1 with a leading batch axis.
            test_mel2ph = torch.arange(0, n_frame, dtype=torch.int64)[None]
            test_uv = torch.ones(1, n_frame, dtype=torch.float32)
            test_noise = torch.randn(1, 192, n_frame)
            test_sid = torch.LongTensor([0])
            input_names = ["c", "f0", "mel2ph", "uv", "noise", "sid"]
            output_names = ["audio", ]
            onnx_file = os.path.splitext(model_file)[0] + ".onnx"
            torch.onnx.export(SVCVITS,
                              (
                                  test_hidden_unit.to(device),
                                  test_pitch.to(device),
                                  test_mel2ph.to(device),
                                  test_uv.to(device),
                                  test_noise.to(device),
                                  test_sid.to(device)
                              ),
                              onnx_file,
                              dynamic_axes={
                                  "c": [0, 1],
                                  "f0": [1],
                                  "mel2ph": [1],
                                  "uv": [1],
                                  "noise": [2],
                              },
                              do_constant_folding=False,
                              opset_version=16,
                              verbose=False,
                              input_names=input_names,
                              output_names=output_names)
        return "转换成功,模型被保存在了checkpoints下的对应目录"
    except Exception as e:
        if debug:
            traceback.print_exc()
        return "转换错误:"+str(e)
def load_raw_audio(audio_path):
    """List the ``.wav`` files (case-insensitive) directly inside ``audio_path``.

    Returns:
        ``(message, file names)``; the second item is ``None`` when the path
        is not a directory or contains no wav file.
    """
    if not os.path.isdir(audio_path):
        return "请输入正确的目录", None
    wav_files = [name for name in os.listdir(audio_path) if name.lower().endswith('.wav')]
    if wav_files:
        return "成功加载", wav_files
    return "未在目录中找到.wav音频文件", None
def slicer_fn(input_dir, output_dir, process_method, max_sec, min_sec):
    """Slice every wav file of ``input_dir`` into ``output_dir`` and summarise.

    Args:
        input_dir: Folder with source ``.wav`` files.
        output_dir: Destination folder (created when missing).
        process_method: ``"丢弃"`` deletes slices shorter than ``min_sec``;
            ``"将过短音频整合为长音频"`` calls ``slicer.merge_short`` instead.
        max_sec: Passed to ``AutoSlicer.auto_slice`` / ``merge_short``.
        min_sec: Duration threshold for short slices.

    Returns:
        A summary message, or a prompt when ``output_dir`` is empty.
    """
    if output_dir == "":
        return "请先选择输出的文件夹"
    slicer = AutoSlicer()
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    source_wavs = [name for name in os.listdir(input_dir) if name.lower().endswith(".wav")]
    for filename in source_wavs:
        slicer.auto_slice(filename, input_dir, output_dir, max_sec)

    if process_method == "丢弃":
        for filename in os.listdir(output_dir):
            if not filename.endswith(".wav"):
                continue
            filepath = os.path.join(output_dir, filename)
            audio, sr = librosa.load(filepath, sr=None, mono=False)
            if librosa.get_duration(y=audio, sr=sr) < min_sec:
                os.remove(filepath)
    elif process_method == "将过短音频整合为长音频":
        slicer.merge_short(output_dir, max_sec, min_sec)

    file_count, max_duration, min_duration, orig_duration, final_duration = slicer.slice_count(input_dir, output_dir)
    # Break the total duration into hours / minutes / seconds for the summary.
    hrs = int(final_duration / 3600)
    mins = int((final_duration % 3600) / 60)
    sec = format(float(final_duration % 60), '.2f')
    rate = format(100 * (final_duration / orig_duration), '.2f')
    return f"成功将音频切分为{file_count}条片段,其中最长{max_duration}秒,最短{min_duration}秒,切片后的音频总时长{hrs:02d}小时{mins:02d}分{sec}秒,为原始音频时长的{rate}%"
def model_compression(_model):
    """Write a ``*_compressed`` copy of a checkpoint via ``removeOptimizer``.

    Args:
        _model: Checkpoint file name inside ``workdir``; empty string means
            nothing was selected.

    Returns:
        A message with the output path, or a prompt to pick a model.
    """
    if _model == "":
        return "请先选择要压缩的模型"
    stem, suffix = os.path.splitext(_model)
    source_path = os.path.join(workdir, _model)
    output_path = os.path.join(workdir, f"{stem}_compressed{suffix}")
    removeOptimizer(source_path, output_path)
    return f"模型已成功被保存在了{output_path}"
# Read the available checkpoint / config / cluster / diffusion files.
ckpt_list, config_list, cluster_list, diff_list, diff_config_list = load_options()

# Read GPU info: collect "<index>\t<name>" for every GPU whose name matches
# one of the accepted model markers.
ngpu = torch.cuda.device_count()
gpu_infos = []
if_gpu_ok = False
if torch.cuda.is_available() and ngpu != 0:
    for i in range(ngpu):
        gpu_name = torch.cuda.get_device_name(i)
        if "MX" in gpu_name:
            continue
        # Markers cover e.g. A10, A100, V100, A40, P40, M40, K80.
        if (
            any(marker in gpu_name for marker in ("10", "16", "20", "30", "40", "70", "80", "90", "M4", "P4", "T4"))
            or any(marker in gpu_name.upper() for marker in ("A50", "TITAN"))
        ):
            if_gpu_ok = True  # at least one usable NVIDIA card
            gpu_infos.append("%s\t%s" % (i, gpu_name))
gpu_info = "\n".join(gpu_infos) if if_gpu_ok and len(gpu_infos) > 0 else "很遗憾您这没有能用的显卡来支持您训练"
# Take the whole index before the tab, not just its first character, so
# GPU indices of 10 and above are kept intact.
gpus = "-".join(info.split("\t")[0] for info in gpu_infos)

# Read the default training parameters.
sovits_params, diff_params = get_default_settings()
app = gr.Blocks()
def Newget_model_info(choice_ckpt2):
    """Variant of ``get_model_info`` that returns a hidden ``gr.Textbox``.

    Args:
        choice_ckpt2: Checkpoint name; converted with ``str()`` before use.

    Returns:
        A plain message string when ``emb_g.weight`` is missing, otherwise an
        invisible ``gr.Textbox`` whose value is the encoder label matching the
        second dimension of the embedding.
    """
    checkpoint_path = os.path.join(workdir, str(choice_ckpt2))
    # Always load on the CPU.
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    embedding = checkpoint["model"].get("emb_g.weight")
    if embedding is None:
        return "所选模型缺少emb_g.weight,你可能选择了一个底模"
    _rows, width = embedding.size()
    encoder_by_width = {
        768: "Vec768-Layer12",
        256: "Vec256-Layer9 / HubertSoft",
        1024: "Whisper-PPG",
    }
    label = encoder_by_width.get(width, "不受支持的模型")
    return gr.Textbox(visible=False, value=label)
| # Declare the UI. Components are rendered in the order they are created here. | |
| with app: | |
| # Page header. | |
| gr.Markdown(value=""" | |
| ### So-VITS-SVC 4.1-Stable | |
| 修改自原项目及bilibili@麦哲云 | |
| 仅供个人娱乐和非商业用途,禁止用于血腥、暴力、性相关、政治相关内容 | |
| weiui来自:bilibili@羽毛布団,交流③群:416656175 | |
| 镜像作者:bilibili@kiss丿冷鸟鸟,交流群:829974025 | |
| """) | |
| with gr.Tabs(): | |
| with gr.TabItem("FC"): | |
| # The model/config/diffusion pickers of the original UI are commented out | |
| # below and replaced by invisible components holding fixed values. | |
| #with gr.Row(): | |
| # choice_ckpt = gr.Dropdown(label="模型选择", choices=ckpt_list, value="no_model") | |
| # model_branch = gr.Textbox(label="模型编码器", placeholder="请先选择模型", interactive=False) | |
| #choice_ckpt = gr.Dropdown(value="G_388000.pth", visible=False) | |
| #with gr.Row(): | |
| # config_choice = gr.Dropdown(label="配置文件", choices=config_list, value="no_config") | |
| # config_info = gr.Textbox(label="配置文件编码器", placeholder="请选择配置文件") | |
| # Hidden: config file fixed to config.json. | |
| config_choice = gr.Dropdown(value="config.json", visible=False) | |
| #gr.Markdown(value="""**请检查模型和配置文件的编码器是否匹配**""") | |
| #with gr.Row(): | |
| # diff_choice = gr.Dropdown(label="(可选)选择扩散模型", choices=diff_list, value="no_diff", interactive=True) | |
| # diff_config_choice = gr.Dropdown(label="扩散模型配置文件", choices=diff_config_list, value="no_diff_config", interactive=True) | |
| # Hidden: diffusion model and its config fixed to the "no_diff" placeholders. | |
| diff_choice = gr.Dropdown(value="no_diff", visible=False) | |
| diff_config_choice = gr.Dropdown(value="no_diff_config", visible=False) | |
| # The cluster / feature-retrieval model is the only model option left visible. | |
| with gr.Row(): | |
| cluster_choice = gr.Dropdown(label="(可选)选择聚类模型/特征检索模型", choices=cluster_list, value="no_clu") | |
| with gr.Row(): | |
| enhance = gr.Checkbox(label="是否使用NSF_HIFIGAN增强,该选项对部分训练集少的模型有一定的音质增强效果,但是对训练好的模型有反面效果,默认关闭", value=False) | |
| #only_diffusion = gr.Checkbox(label="是否使用全扩散推理,开启后将不使用So-VITS模型,仅使用扩散模型进行完整扩散推理,默认关闭", value=False) | |
| # Hidden: diffusion-only inference fixed to off. | |
| only_diffusion = gr.Checkbox(value=False, visible=False) | |
| #using_device = gr.Dropdown(label="推理设备,默认为自动选择", choices=["Auto","cuda","cpu"], value="Auto") | |
| # Hidden: inference device fixed to 'Auto'. | |
| using_device = gr.Dropdown(value='Auto', visible=False) | |
| #refresh = gr.Button("刷新选项") | |
| #loadckpt = gr.Button("加载模型", variant="primary") | |
| #with gr.Row(): | |
| # model_message = gr.Textbox(label="Output Message") | |
| # sid = gr.Dropdown(label="So-VITS说话人", value="speaker0") | |
| # Hidden: speaker id fixed to "1056". | |
| sid = gr.Dropdown(value="1056", visible=False) | |
| #choice_ckpt.change(get_model_info, [choice_ckpt], [model_branch]) | |
| # Runs once while the UI is being built, for the fixed checkpoint G_388000.pth; | |
| # the result is either a hidden Textbox or a plain error string. | |
| model_branch = Newget_model_info("G_388000.pth") | |
| #config_choice.change(load_json_encoder, [config_choice], [config_info]) | |
| #refresh.click(refresh_options,[],[choice_ckpt,config_choice,cluster_choice,diff_choice,diff_config_choice]) | |
| gr.Markdown(value=""" | |
| 请稍等片刻,模型加载大约需要10秒。后续操作不需要重新加载模型 | |
| """) | |
| # Input sources: single audio upload, batch upload, or text for TTS. | |
| with gr.Tabs(): | |
| with gr.TabItem("单个音频上传"): | |
| vc_input3 = gr.Audio(label="单个音频上传") | |
| with gr.TabItem("批量音频上传"): | |
| vc_batch_files = gr.Files(label="批量音频上传", file_types=["audio"], file_count="multiple") | |
| with gr.TabItem("文字转语音(实验性)"): | |
| gr.Markdown(""" | |
| 文字转语音(TTS)说明:使用edge_tts服务生成音频,并转换为So-VITS模型音色。可以在输入文字中使用标点符号简单控制情绪 | |
| zh-CN-XiaoyiNeural:中文女声 | |
| zh-CN-YunxiNeural: 中文男声 | |
| ja-JP-NanamiNeural:日文女声 | |
| ja-JP-KeitaNeural:日文男声 | |
| zh-CN-liaoning-XiaobeiNeural:东北话女声 | |
| zh-CN-shaanxi-XiaoniNeural: 陕西话女声 | |
| zh-HK-HiuMaanNeural: 粤语女声 | |
| zh-HK-WanLungNeural: 粤语男声 | |
| """) | |
| with gr.Row(): | |
| text_input = gr.Textbox(label = "在此输入需要转译的文字(建议打开自动f0预测)",) | |
| tts_spk = gr.Dropdown(label = "选择原始音频音色(来自微软TTS)", choices=["zh-CN-XiaoyiNeural", "zh-CN-YunxiNeural", "zh-CN-liaoning-XiaobeiNeural", "zh-CN-shaanxi-XiaoniNeural", "zh-HK-HiuMaanNeural", "zh-HK-WanLungNeural", "ja-JP-NanamiNeural", "ja-JP-KeitaNeural"], value = "zh-CN-XiaoyiNeural") | |
| #with gr.Row(): | |
| # tts_rate = gr.Slider(label = "TTS语音变速(倍速)", minimum = 0, maximum = 3, value = 1) | |
| # tts_volume = gr.Slider(label = "TTS语音音量(相对值)", minimum = 0, maximum = 1.5, value = 1) | |
| # Conversion parameters; these components are the inputs of the submit handlers. | |
| with gr.Row(): | |
| auto_f0 = gr.Checkbox(label="自动f0预测,配合聚类模型f0预测效果更好,会导致变调功能失效(仅限转换语音,歌声不要勾选此项会跑调)", value=False) | |
| f0_predictor = gr.Radio(label="f0预测器选择(如遇哑音可以更换f0预测器解决,crepe为原F0使用均值滤波器)", choices=["pm","crepe","harvest","dio"], value="pm") | |
| cr_threshold = gr.Number(label="F0过滤阈值,只有使用crepe时有效. 数值范围从0-1. 降低该值可减少跑调概率,但会增加哑音", value=0.05) | |
| with gr.Row(): | |
| vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0) | |
| cluster_ratio = gr.Number(label="聚类模型/特征检索混合比例,0-1之间,默认为0不启用聚类或特征检索,能提升音色相似度,但会导致咬字下降", value=0) | |
| k_step = gr.Slider(label="浅扩散步数,只有使用了扩散模型才有效,步数越大越接近扩散模型的结果", value=100, minimum = 1, maximum = 1000) | |
| with gr.Row(): | |
| enhancer_adaptive_key = gr.Number(label="使NSF-HIFIGAN增强器适应更高的音域(单位为半音数)|默认为0", value=0,interactive=True) | |
| slice_db = gr.Number(label="切片阈值", value=-50) | |
| cl_num = gr.Number(label="音频自动切片,0为按默认方式切片,单位为秒/s,爆显存可以设置此处强制切片", value=0) | |
| # Advanced settings, collapsed by default. | |
| with gr.Accordion("高级设置(一般不需要动)", open=False): | |
| noise_scale = gr.Number(label="noise_scale 建议不要动,会影响音质,玄学参数", value=0.4) | |
| pad_seconds = gr.Number(label="推理音频pad秒数,由于未知原因开头结尾会有异响,pad一小段静音段后就不会出现", value=0.5) | |
| lg_num = gr.Number(label="两端音频切片的交叉淡入长度,如果自动切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,注意,该设置会影响推理速度,单位为秒/s", value=1) | |
| lgr_num = gr.Number(label="自动音频切片后,需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例,范围0-1,左开右闭", value=0.75,interactive=True) | |
| second_encoding = gr.Checkbox(label = "二次编码,浅扩散前会对原始音频进行二次编码,玄学选项,效果时好时差,默认关闭", value=False) | |
| loudness_envelope_adjustment = gr.Number(label="输入源响度包络替换输出响度包络融合比例,越靠近1越使用输出响度包络", value = 0) | |
| use_spk_mix = gr.Checkbox(label="动态声线融合,暂时没做完", value=False, interactive=False) | |
| # One submit button per input source, followed by the shared outputs. | |
| with gr.Row(): | |
| vc_submit = gr.Button("音频转换", variant="primary") | |
| vc_batch_submit = gr.Button("批量转换", variant="primary") | |
| vc_tts_submit = gr.Button("文本转语音", variant="primary") | |
| vc_output1 = gr.Textbox(label="Output Message") | |
| vc_output2 = gr.Audio(label="Output Audio") | |
def Newvc_fn(sid, input_audio, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment, clus2):
    """Convert one uploaded audio clip, (re)loading the model when needed.

    The model is loaded through ``Newload_model_func`` whenever the requested
    cluster model ``clus2`` differs from the one recorded in the ``loaded``
    global. The audio is written to a temporary wav file, handed to
    ``vc_infer`` and the temporary file is removed again.

    Args:
        sid: Speaker id forwarded to ``vc_infer``.
        input_audio: ``(sampling_rate, samples)`` pair, or ``None`` when
            nothing was uploaded.
        clus2: Selected cluster / feature-retrieval model; also the key that
            decides whether the model has to be (re)loaded.
        (all remaining parameters are forwarded unchanged to ``vc_infer``)

    Returns:
        ``(message, output_file_path)``; the path is ``None`` when the audio
        or the model is missing.

    Raises:
        gr.Error: Wraps any exception raised while writing or converting.
    """
    global model, loaded
    if loaded != clus2:
        # NOTE(review): config_choice, enhance, diff_choice, ... are the
        # module-level Gradio components themselves, not per-request values;
        # confirm that Newload_model_func expects components.
        Newload_model_func("G_388000.pth", clus2, config_choice, enhance, diff_choice, diff_config_choice, only_diffusion, model_branch, using_device)
        loaded = clus2
    try:
        if input_audio is None:
            return "You need to upload an audio", None
        if model is None:
            return "You need to upload an model", None
        sampling_rate, audio = input_audio
        # NOTE(review): the file name is fixed, so simultaneous conversions
        # would share it. It is kept unchanged because vc_infer receives the
        # path and may derive names from it - confirm before making it unique.
        temp_path = "temp.wav"
        try:
            sf.write(temp_path, audio, sampling_rate, format="wav")
            output_file_path = vc_infer(sid, audio, temp_path, vc_transform, auto_f0, cluster_ratio, slice_db, noise_scale, pad_seconds, cl_num, lg_num, lgr_num, f0_predictor, enhancer_adaptive_key, cr_threshold, k_step, use_spk_mix, second_encoding, loudness_envelope_adjustment)
        finally:
            # Remove the temporary file even when writing or inference fails,
            # so a failed request does not leave temp.wav behind.
            if os.path.exists(temp_path):
                os.remove(temp_path)
        return "Success", output_file_path
    except Exception as e:
        if debug:
            traceback.print_exc()
        raise gr.Error(e)
| # Wire the three submit buttons. Only the single-audio handler (Newvc_fn) also | |
| # receives cluster_choice; the batch and TTS handlers get no cluster input. | |
| #loadckpt.click(load_model_func,[choice_ckpt,cluster_choice,config_choice,enhance,diff_choice,diff_config_choice,only_diffusion,model_branch,using_device],[model_message, sid, cl_num]) | |
| vc_submit.click(Newvc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment,cluster_choice], [vc_output1, vc_output2]) | |
| vc_batch_submit.click(vc_batch_fn, [sid, vc_batch_files, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1]) | |
| vc_tts_submit.click(tts_fn, [text_input, tts_spk, sid, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,f0_predictor,enhancer_adaptive_key,cr_threshold,k_step,use_spk_mix,second_encoding,loudness_envelope_adjustment], [vc_output1, vc_output2]) | |
| ''' | |
| with gr.TabItem("训练"): | |
| gr.Markdown(value="""请将数据集文件夹放置在dataset_raw文件夹下,确认放置正确后点击下方获取数据集名称""") | |
| raw_dirs_list=gr.Textbox(label="Raw dataset directory(s):") | |
| get_raw_dirs=gr.Button("识别数据集", variant="primary") | |
| gr.Markdown(value="""确认数据集正确识别后请选择训练使用的特征编码器和f0预测器,**如果要训练扩散模型,请选择Vec768l12或hubertsoft,并确保So-VITS和扩散模型使用同一个编码器**""") | |
| with gr.Row(): | |
| gr.Markdown(value="""**vec256l9**: ContentVec(256Layer9),旧版本叫v1,So-VITS-SVC 4.0的基础版本,**暂不支持扩散模型** | |
| **vec768l12**: 特征输入更换为ContentVec的第12层Transformer输出,模型理论上会更加还原训练集音色 | |
| **hubertsoft**: So-VITS-SVC 3.0使用的编码器,咬字更为准确,但可能存在多说话人音色泄露问题 | |
| **whisper-ppg**: 来自OpenAI,咬字最为准确,但和Hubertsoft一样存在多说话人音色泄露,且显存占用和训练时间有明显增加。**暂不支持扩散模型** | |
| """) | |
| gr.Markdown(value="""**crepe**: 抗噪能力最强,但预处理速度慢(不过如果你的显卡很强的话速度会很快) | |
| **pm**: 预处理速度快,但抗噪能力较弱 | |
| **dio**: 先前版本预处理默认使用的f0预测器 | |
| **harvest**: 有一定抗噪能力,预处理显存占用友好,速度比较慢 | |
| """) | |
| with gr.Row(): | |
| branch_selection = gr.Radio(label="选择训练使用的编码器", choices=["vec256l9","vec768l12","hubertsoft","whisper-ppg"], value="vec768l12", interactive=True) | |
| f0_predictor_selection = gr.Radio(label="选择训练使用的f0预测器", choices=["crepe","pm","dio","harvest"], value="crepe", interactive=True) | |
| use_diff = gr.Checkbox(label="是否使用浅扩散模型,如要训练浅扩散模型请勾选此项", value=True) | |
| vol_aug=gr.Checkbox(label="是否启用响度嵌入和音量增强,启用后可以根据输入源控制输出响度,但对数据集质量的要求更高。**仅支持vec768l12编码器**", value=False) | |
| with gr.Row(): | |
| skip_loudnorm = gr.Checkbox(label="是否跳过响度匹配,如果你已经用音频处理软件做过响度匹配,请勾选此处") | |
| num_processes = gr.Slider(label="预处理使用的CPU线程数,可以大幅加快预处理速度,但线程数过大容易爆显存,建议12G显存设置为2", minimum=1, maximum=multiprocessing.cpu_count(), value=1, step=1) | |
| with gr.Row(): | |
| raw_preprocess=gr.Button("数据预处理", variant="primary") | |
| regenerate_config_btn=gr.Button("重新生成配置文件", variant="primary") | |
| preprocess_output=gr.Textbox(label="预处理输出信息,完成后请检查一下是否有报错信息,如无则可以进行下一步", max_lines=999) | |
| clear_preprocess_output=gr.Button("清空输出信息") | |
| with gr.Group(): | |
| gr.Markdown(value="""填写训练设置和超参数""") | |
| with gr.Row(): | |
| gr.Textbox(label="当前使用显卡信息", value=gpu_info) | |
| gpu_selection=gr.Textbox(label="多卡用户请指定希望训练使用的显卡ID(0,1,2...)", value=gpus, interactive=True) | |
| with gr.Row(): | |
| log_interval=gr.Textbox(label="每隔多少步(steps)生成一次评估日志", value=sovits_params['log_interval']) | |
| eval_interval=gr.Textbox(label="每隔多少步(steps)验证并保存一次模型", value=sovits_params['eval_interval']) | |
| keep_ckpts=gr.Textbox(label="仅保留最新的X个模型,超出该数字的旧模型会被删除。设置为0则永不删除", value=sovits_params['keep_ckpts']) | |
| with gr.Row(): | |
| batch_size=gr.Textbox(label="批量大小,每步取多少条数据进行训练,大batch有助于训练但显著增加显存占用。6G显存建议设定为4", value=sovits_params['batch_size']) | |
| lr=gr.Textbox(label="学习率,一般不用动,批量大小较大时可以适当增大学习率,但强烈不建议超过0.0002,有炸炉风险", value=sovits_params['learning_rate']) | |
| fp16_run=gr.Checkbox(label="是否使用fp16混合精度训练,fp16训练可能降低显存占用和训练时间,但对模型质量的影响尚未查证", value=sovits_params['fp16_run']) | |
| all_in_mem=gr.Checkbox(label="是否加载所有数据集到内存中,硬盘IO过于低下、同时内存容量远大于数据集体积时可以启用,能显著加快训练速度", value=sovits_params['all_in_mem']) | |
| with gr.Row(): | |
| gr.Markdown("请检查右侧的说话人列表是否和你要训练的目标说话人一致,确认无误后点击写入配置文件,然后就可以开始训练了") | |
| speakers=gr.Textbox(label="说话人列表") | |
| with gr.Accordion(label = "扩散模型配置(训练扩散模型需要写入此处)", open=True): | |
| with gr.Row(): | |
| diff_num_workers = gr.Number(label="num_workers, 如果你的电脑配置较高,可以将这里设置为0加快训练速度", value=diff_params['num_workers']) | |
| diff_cache_all_data = gr.Checkbox(label="是否缓存数据,启用后可以加快训练速度,关闭后可以节省显存或内存,但会减慢训练速度", value=diff_params['cache_all_data']) | |
| diff_cache_device = gr.Radio(label="若启用缓存数据,使用显存(cuda)还是内存(cpu)缓存,如果显卡显存充足,选择cuda以加快训练速度", choices=["cuda","cpu"], value=diff_params['cache_device']) | |
| diff_amp_dtype = gr.Radio(label="训练数据类型,fp16可能会有更快的训练速度,前提是你的显卡支持", choices=["fp32","fp16"], value=diff_params['amp_dtype']) | |
| with gr.Row(): | |
| diff_batch_size = gr.Number(label="批量大小(batch_size),根据显卡显存设置,小显存适当降低该项,6G显存可以设定为48,但该数值不要超过数据集总数量的1/4", value=diff_params['diff_batch_size']) | |
| diff_lr = gr.Number(label="学习率(一般不需要动)", value=diff_params['diff_lr']) | |
| diff_interval_log = gr.Number(label="每隔多少步(steps)生成一次评估日志", value = diff_params['diff_interval_log']) | |
| diff_interval_val = gr.Number(label="每隔多少步(steps)验证并保存一次模型,如果你的批量大小较大,可以适当减少这里的数字,但不建议设置为1000以下", value=diff_params['diff_interval_val']) | |
| diff_force_save = gr.Number(label="每隔多少步强制保留模型,只有该步数的倍数保存的模型会被保留,其余会被删除。设置为与验证步数相同的值则每个模型都会被保留", value=diff_params['diff_force_save']) | |
| with gr.Row(): | |
| save_params=gr.Button("将当前设置保存为默认设置", variant="primary") | |
| write_config=gr.Button("写入配置文件", variant="primary") | |
| write_config_output=gr.Textbox(label="输出信息") | |
| gr.Markdown(value="""**点击从头开始训练**将会自动将已有的训练进度保存到models_backup文件夹,并自动装载预训练模型。 | |
| **继续上一次的训练进度**将从上一个保存模型的进度继续训练。继续训练进度无需重新预处理和写入配置文件。 | |
| 关于扩散、聚类和特征检索的详细说明请看[此处](https://www.yuque.com/umoubuton/ueupp5/kmui02dszo5zrqkz)。 | |
| """) | |
| with gr.Row(): | |
| with gr.Column(): | |
| start_training=gr.Button("从头开始训练", variant="primary") | |
| training_output=gr.Textbox(label="训练输出信息") | |
| with gr.Column(): | |
| continue_training_btn=gr.Button("继续上一次的训练进度", variant="primary") | |
| continue_training_output=gr.Textbox(label="训练输出信息") | |
| with gr.Row(): | |
| with gr.Column(): | |
| diff_training_btn=gr.Button("从头训练扩散模型", variant="primary") | |
| diff_training_output=gr.Textbox(label="训练输出信息") | |
| with gr.Column(): | |
| diff_continue_training_btn=gr.Button("继续训练扩散模型", variant="primary") | |
| diff_continue_training_output=gr.Textbox(label="训练输出信息") | |
| with gr.Accordion(label = "聚类、特征检索训练", open=False): | |
| with gr.Row(): | |
| with gr.Column(): | |
| kmeans_button=gr.Button("训练聚类模型", variant="primary") | |
| kmeans_gpu = gr.Checkbox(label="使用GPU训练", value=True) | |
| kmeans_output=gr.Textbox(label="训练输出信息") | |
| with gr.Column(): | |
| index_button=gr.Button("训练特征检索模型", variant="primary") | |
| index_output=gr.Textbox(label="训练输出信息") | |
| ''' | |
| # Tools tab: model mixing, ONNX export, audio slicing and model compression. | |
| with gr.TabItem("小工具/实验室特性"): | |
| gr.Markdown(value=""" | |
| ### So-vits-svc 4.1 小工具/实验室特性 | |
| 提供了一些有趣或实用的小工具,可以自行探索 | |
| """) | |
| with gr.Tabs(): | |
| with gr.TabItem("静态声线融合"): | |
| gr.Markdown(value=""" | |
| <font size=2> 介绍:该功能可以将多个声音模型合成为一个声音模型(多个模型参数的凸组合或线性组合),从而制造出现实中不存在的声线 | |
| 注意: | |
| 1.该功能仅支持单说话人的模型 | |
| 2.如果强行使用多说话人模型,需要保证多个模型的说话人数量相同,这样可以混合同一个SpaekerID下的声音 | |
| 3.保证所有待混合模型的config.json中的model字段是相同的 | |
| 4.输出的混合模型可以使用待合成模型的任意一个config.json,但聚类模型将不能使用 | |
| 5.批量上传模型的时候最好把模型放到一个文件夹选中后一起上传 | |
| 6.混合比例调整建议大小在0-100之间,也可以调为其他数字,但在线性组合模式下会出现未知的效果 | |
| 7.混合完毕后,文件将会保存在项目根目录中,文件名为output.pth | |
| 8.凸组合模式会将混合比例执行Softmax使混合比例相加为1,而线性组合模式不会 | |
| </font> | |
| """) | |
| # Model files to mix, the editable mixing ratios, the mode and the result message. | |
| mix_model_path = gr.Files(label="选择需要混合模型文件") | |
| mix_model_upload_button = gr.UploadButton("选择/追加需要混合模型文件", file_count="multiple") | |
| mix_model_output1 = gr.Textbox( | |
| label="混合比例调整,单位/%", | |
| interactive = True | |
| ) | |
| mix_mode = gr.Radio(choices=["凸组合", "线性组合"], label="融合模式",value="凸组合",interactive = True) | |
| mix_submit = gr.Button("声线融合启动", variant="primary") | |
| mix_model_output2 = gr.Textbox( | |
| label="Output Message" | |
| ) | |
| with gr.TabItem("onnx转换"): | |
| gr.Markdown(value=""" | |
| 提供了将.pth模型(批量)转换为.onnx模型的功能 | |
| 源项目本身自带转换的功能,但不支持批量,操作也不够简单,这个工具可以支持在WebUI中以可视化的操作方式批量转换.onnx模型 | |
| 有人可能会问,转.onnx模型有什么作用呢?相信我,如果你问出了这个问题,说明这个工具你应该用不上 | |
| ### Step 1: | |
| 在整合包根目录下新建一个"checkpoints"文件夹,将pth模型和对应的json配置文件按目录分别放置到checkpoints文件夹下 | |
| 看起来应该像这样: | |
| checkpoints | |
| ├───xxxx | |
| │ ├───xxxx.pth | |
| │ └───xxxx.json | |
| ├───xxxx | |
| │ ├───xxxx.pth | |
| │ └───xxxx.json | |
| └───…… | |
| """) | |
| pth_dir_msg = gr.Textbox(label="识别待转换模型", placeholder="请将模型和配置文件按上述说明放置在正确位置") | |
| pth_dir_identify_btn = gr.Button("识别", variant="primary") | |
| gr.Markdown(value=""" | |
| ### Step 2: | |
| 识别正确后点击下方开始转换,转换一个模型可能需要一分钟甚至更久 | |
| """) | |
| pth2onnx_btn = gr.Button("开始转换", variant="primary") | |
| pth2onnx_msg = gr.Textbox(label="输出信息") | |
| with gr.TabItem("智能音频切片"): | |
| gr.Markdown(value=""" | |
| 该工具可以实现对音频的切片,无需调整参数即可完成符合要求的数据集制作。 | |
| 数据集要求的音频切片约在2-15秒内,用传统的Slicer-GUI切片工具需要精准调参和二次切片才能符合要求,该工具省去了上述繁琐的操作,只要上传原始音频即可一键制作数据集。 | |
| """) | |
| with gr.Row(): | |
| raw_audio_path = gr.Textbox(label="原始音频文件夹", placeholder="包含所有待切片音频的文件夹,示例: D:\干声\speakers") | |
| load_raw_audio_btn = gr.Button("加载原始音频", variant = "primary") | |
| load_raw_audio_output = gr.Textbox(label = "输出信息") | |
| raw_audio_dataset = gr.Textbox(label = "音频列表", value = "") | |
| slicer_output_dir = gr.Textbox(label = "输出目录", placeholder = "选择输出目录") | |
| with gr.Row(): | |
| process_method = gr.Radio(label = "对过短音频的处理方式", choices = ["丢弃","将过短音频整合为长音频"], value = "丢弃") | |
| max_sec = gr.Number(label = "切片的最长秒数", value = 15) | |
| min_sec = gr.Number(label = "切片的最短秒数", value = 2) | |
| slicer_btn = gr.Button("开始切片", variant = "primary") | |
| slicer_output_msg = gr.Textbox(label = "输出信息") | |
| # Event wiring for the mixing, ONNX export and slicing tools declared above. | |
| mix_model_path.change(updata_mix_info,[mix_model_path],[mix_model_output1]) | |
| mix_model_upload_button.upload(upload_mix_append_file, [mix_model_upload_button,mix_model_path], [mix_model_path,mix_model_output1]) | |
| mix_submit.click(mix_submit_click, [mix_model_output1,mix_mode], [mix_model_output2]) | |
| pth_dir_identify_btn.click(pth_identify, [], [pth_dir_msg]) | |
| pth2onnx_btn.click(onnx_export, [], [pth2onnx_msg]) | |
| load_raw_audio_btn.click(load_raw_audio, [raw_audio_path], [load_raw_audio_output, raw_audio_dataset]) | |
| slicer_btn.click(slicer_fn, [raw_audio_path, slicer_output_dir, process_method, max_sec, min_sec], [slicer_output_msg]) | |
| with gr.TabItem("模型压缩工具"): | |
| gr.Markdown(value=""" | |
| 该工具可以实现对模型的体积压缩,在**不影响模型推理功能**的情况下,将原本约600M的So-VITS模型压缩至约200M, 大大减少了硬盘的压力。 | |
| **注意:压缩后的模型将无法继续训练,请在确认封炉后再压缩。** | |
| 将模型文件放置在logs/44k下,然后选择需要压缩的模型 | |
| """) | |
| # The choices come from ckpt_list, which is read once at start-up. | |
| model_to_compress = gr.Dropdown(label="模型选择", choices=ckpt_list, value="") | |
| compress_model_btn = gr.Button("压缩模型", variant="primary") | |
| compress_model_output = gr.Textbox(label="输出信息", value="") | |
| compress_model_btn.click(model_compression, [model_to_compress], [compress_model_output]) | |
| """ | |
| get_raw_dirs.click(load_raw_dirs,[],[raw_dirs_list]) | |
| raw_preprocess.click(dataset_preprocess,[branch_selection, f0_predictor_selection, use_diff, vol_aug, skip_loudnorm, num_processes],[preprocess_output, speakers]) | |
| regenerate_config_btn.click(regenerate_config,[branch_selection, vol_aug],[preprocess_output]) | |
| clear_preprocess_output.click(clear_output,[],[preprocess_output]) | |
| save_params.click(save_default_settings, [log_interval,eval_interval,keep_ckpts,batch_size,lr,fp16_run,all_in_mem,diff_num_workers,diff_cache_all_data,diff_cache_device,diff_amp_dtype,diff_batch_size,diff_lr,diff_interval_log,diff_interval_val,diff_force_save], [write_config_output]) | |
| write_config.click(config_fn,[log_interval, eval_interval, keep_ckpts, batch_size, lr, fp16_run, all_in_mem, diff_num_workers, diff_cache_all_data, diff_batch_size, diff_lr, diff_interval_log, diff_interval_val, diff_cache_device, diff_amp_dtype, diff_force_save],[write_config_output]) | |
| start_training.click(training,[gpu_selection, branch_selection],[training_output]) | |
| diff_training_btn.click(diff_training,[branch_selection],[diff_training_output]) | |
| continue_training_btn.click(continue_training,[gpu_selection, branch_selection],[continue_training_output]) | |
| diff_continue_training_btn.click(diff_continue_training,[branch_selection],[diff_continue_training_output]) | |
| kmeans_button.click(kmeans_training,[kmeans_gpu],[kmeans_output]) | |
| index_button.click(index_training, [], [index_output]) | |
| """ | |
| # WebUI settings panel with the debug toggle. | |
| with gr.Tabs(): | |
| with gr.Row(variant="panel"): | |
| with gr.Column(): | |
| gr.Markdown(value=""" | |
| <font size=2> WebUI设置</font> | |
| """) | |
| debug_button = gr.Checkbox(label="Debug模式,反馈BUG需要打开,打开后控制台可以显示具体错误提示", value=debug) | |
| # The handler is registered without inputs; debug_change reads debug_button.value itself. | |
| # NOTE(review): that attribute may only hold the initial value - confirm the toggle has any effect. | |
| debug_button.change(debug_change,[],[]) | |
| # Enable the request queue and start the server. | |
| # NOTE(review): `concurrency_count` appears to exist only in Gradio 3.x queue(); verify against the installed version. | |
| app.queue(concurrency_count=1022, max_size=2044).launch() | |