# inference_main.py — so-vits-svc (sovits4) command-line inference entry point.
# (Provenance: VITS-TEST repo, uploaded by kushan1988, "Upload 231 files", commit 0b65cde.)
import io
import logging
import os
import time
from pathlib import Path
from spkmix import spk_mix_map
import librosa
import matplotlib.pyplot as plt
import numpy as np
import soundfile
from inference import infer_tool
from inference import slicer
from inference.infer_tool import Svc
# Silence numba's verbose JIT-compilation debug output during inference.
logging.getLogger('numba').setLevel(logging.WARNING)
# Persistent cache of previously computed audio slice points, keyed by file;
# read_temp loads (or creates) the JSON store used by the slicer.
chunks_dict = infer_tool.read_temp("inference/chunks_temp.json")
def main():
    """Command-line entry point for sovits4 (so-vits-svc) voice conversion.

    Parses CLI arguments, loads the Svc model (optionally with clustering /
    feature retrieval / shallow diffusion / NSF-HiFiGAN enhancement), then for
    each input wav and each target speaker runs sliced inference and writes the
    converted audio to ``<name>_<key>_<spk>..._<mode>.<fmt>`` in the current
    directory.
    """
    import argparse
    parser = argparse.ArgumentParser(description='sovits4 inference')
    # The part that must be set
    parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_37600.pth", help='model path')
    parser.add_argument('-c', '--config_path', type=str, default="logs/44k/config.json", help='configuration file path')
    parser.add_argument('-cl', '--clip', type=float, default=0, help='Forced audio slice, default 0 is automatic slice, unit is second/s')
    parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["test.wav"], help='List of wav file names, placed in the raw folder')
    parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='Pitch adjustment, support positive and negative (semitone)')
    parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['buyizi'], help='synthetic target speaker name')
    # optional part
    parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False, help='The voice conversion automatically predicts the pitch, do not turn on this when converting the singing voice, it will seriously go out of tune')
    parser.add_argument('-cm', '--cluster_model_path', type=str, default="logs/44k/kmeans_10000.pt", help='Clustering model or feature retrieval index path, if no clustering or feature retrieval is trained, fill it in casually')
    parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='Clustering scheme or feature retrieval ratio, range 0-1, if no clustering model or feature retrieval is trained, the default is 0')
    parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='The cross-fade length of two audio slices. If the human voice is incoherent after forcing the slice, you can adjust this value. If it is coherent, it is recommended to use the default value of 0, and the unit is second')
    parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='Select the F0 predictor, you can choose crepe, pm, dio, harvest, the default is pm (note: crepe uses the mean filter for the original F0)')
    parser.add_argument('-eh', '--enhance', action='store_true', default=False, help='Whether to use the NSF_HIFIGAN enhancer, this option has a certain sound quality enhancement effect on some models with a small training set, but has a negative effect on the trained model, and it is disabled by default')
    parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False, help='Whether to use shallow diffusion. After using it, it can solve some electronic audio problems. It is disabled by default. When this option is enabled, the NSF_HIFIGAN enhancer will be disabled')
    parser.add_argument('-usm', '--use_spk_mix', action='store_true', default=False, help='Whether to use role fusion')
    parser.add_argument('-lea', '--loudness_envelope_adjustment', type=float, default=1, help='The input source loudness envelope replaces the output loudness envelope fusion ratio, the closer to 1, the more the output loudness envelope is used')
    parser.add_argument('-fr', '--feature_retrieval', action='store_true', default=False, help='Whether to use feature retrieval, if the clustering model is used, it will be disabled, and the cm and cr parameters will become the index path and mixing ratio of feature retrieval')
    # Shallow Diffusion Settings
    parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt", help='Diffusion Model Path')
    parser.add_argument('-dc', '--diffusion_config_path', type=str, default="logs/44k/diffusion/config.yaml", help='Diffusion model configuration file path')
    parser.add_argument('-ks', '--k_step', type=int, default=100, help='The number of diffusion steps, the larger the result is closer to the diffusion model, the default is 100')
    parser.add_argument('-se', '--second_encoding', action='store_true', default=False, help='Secondary encoding, the original audio will be encoded twice before shallow diffusion, metaphysical option, sometimes the effect is good, sometimes the effect is poor')
    parser.add_argument('-od', '--only_diffusion', action='store_true', default=False, help='Pure diffusion mode, this mode will not load the sovits model, reasoning with the diffusion model')
    # non-moving part
    parser.add_argument('-sd', '--slice_db', type=int, default=-40, help='Default -40, noisy audio can be -30, dry sound can keep breathing -50')
    parser.add_argument('-d', '--device', type=str, default=None, help='Inference device, if None is to automatically select cpu and gpu')
    parser.add_argument('-ns', '--noice_scale', type=float, default=0.4, help='The noise level will affect the articulation and sound quality, which is more metaphysical')
    parser.add_argument('-p', '--pad_seconds', type=float, default=0.5, help='Inferring the number of seconds of the audio pad, there will be abnormal noise at the beginning and end due to unknown reasons, and the pad will not appear after a short period of silence')
    parser.add_argument('-wf', '--wav_format', type=str, default='wav', help='audio output format')
    parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75, help='After automatic audio slicing, the head and tail of each slice need to be discarded. This parameter sets the ratio of cross length retention, range 0-1, left open and right closed')
    parser.add_argument('-eak', '--enhancer_adaptive_key', type=int, default=0, help='Adapt the enhancer to a higher register (in semitones) | default 0')
    parser.add_argument('-ft', '--f0_filter_threshold', type=float, default=0.05,help='F0 filter threshold, only valid when using crepe. The value ranges from 0-1. Lowering this value can reduce the probability of out-of-tune, but it will increase mute')
    args = parser.parse_args()

    # Unpack frequently-used options into locals for readability below.
    clean_names = args.clean_names
    trans = args.trans
    spk_list = args.spk_list
    slice_db = args.slice_db
    wav_format = args.wav_format
    auto_predict_f0 = args.auto_predict_f0
    cluster_infer_ratio = args.cluster_infer_ratio
    noice_scale = args.noice_scale
    pad_seconds = args.pad_seconds
    clip = args.clip
    lg = args.linear_gradient
    lgr = args.linear_gradient_retain
    f0p = args.f0_predictor
    enhance = args.enhance
    enhancer_adaptive_key = args.enhancer_adaptive_key
    cr_threshold = args.f0_filter_threshold
    diffusion_model_path = args.diffusion_model_path
    diffusion_config_path = args.diffusion_config_path
    k_step = args.k_step
    only_diffusion = args.only_diffusion
    shallow_diffusion = args.shallow_diffusion
    use_spk_mix = args.use_spk_mix
    second_encoding = args.second_encoding
    loudness_envelope_adjustment = args.loudness_envelope_adjustment

    print("Rsn "+wav_format)
    svc_model = Svc(args.model_path,
                    args.config_path,
                    args.device,
                    args.cluster_model_path,
                    enhance,
                    diffusion_model_path,
                    diffusion_config_path,
                    shallow_diffusion,
                    only_diffusion,
                    use_spk_mix,
                    args.feature_retrieval)
    print("Rsn svc_model = ")
    infer_tool.mkdir(["raw", "results"])

    # Role fusion requires at least two speakers in the mix map; otherwise fall
    # back to per-speaker inference.
    if len(spk_mix_map) <= 1:
        use_spk_mix = False
    if use_spk_mix:
        spk_list = [spk_mix_map]

    # Pad the pitch-shift list so every input file has a matching transposition.
    infer_tool.fill_a_to_b(trans, clean_names)
    for clean_name, tran in zip(clean_names, trans):
        # NOTE(review): inputs are read from dataset_raw/America rather than the
        # "raw" folder mentioned in the --clean_names help text — confirm intended.
        # BUGFIX: the original literal "dataset_raw\America" contained the invalid
        # escape sequence "\A" and a Windows-only separator; join the path
        # components portably instead.
        raw_audio_path = os.path.join(os.curdir, "dataset_raw", "America", clean_name)
        # BUGFIX: the original check `"." not in raw_audio_path` was always False
        # (os.curdir is "."), so the ".wav" fallback never fired. Test the actual
        # file extension instead.
        if not os.path.splitext(raw_audio_path)[1]:
            raw_audio_path += ".wav"
        print("Rsn2 "+raw_audio_path)
        # Normalize the input to the wav format expected by the slicer/model.
        infer_tool.format_wav(raw_audio_path)

        for spk in spk_list:
            kwarg = {
                "raw_audio_path": raw_audio_path,
                "spk": spk,
                "tran": tran,
                "slice_db": slice_db,
                "cluster_infer_ratio": cluster_infer_ratio,
                "auto_predict_f0": auto_predict_f0,
                "noice_scale": noice_scale,
                "pad_seconds": pad_seconds,
                "clip_seconds": clip,
                "lg_num": lg,
                "lgr_num": lgr,
                "f0_predictor": f0p,
                "enhancer_adaptive_key": enhancer_adaptive_key,
                "cr_threshold": cr_threshold,
                "k_step": k_step,
                "use_spk_mix": use_spk_mix,
                "second_encoding": second_encoding,
                "loudness_envelope_adjustment": loudness_envelope_adjustment
            }
            audio = svc_model.slice_inference(**kwarg)

            # Build an output name encoding the key shift, speaker, clustering
            # ratio and inference mode (sovits / sovits+diffusion / pure diffusion).
            key = "auto" if auto_predict_f0 else f"{tran}key"
            cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
            isdiffusion = "sovits"
            if shallow_diffusion:
                isdiffusion = "sovdiff"
            if only_diffusion:
                isdiffusion = "diff"
            if use_spk_mix:
                spk = "spk_mix"
            res_path = f'{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}.{wav_format}'
            soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)
            # Release cached tensors/buffers between runs to keep memory bounded.
            svc_model.clear_empty()
# Standard script guard: run inference only when executed directly, not on import.
if __name__ == '__main__':
    main()