# inference_main.py — so-vits-svc (sovits4) command-line inference entry point.
# (Provenance: VITS-TEST repo, uploaded by kushan1988, "Upload 231 files", commit 0b65cde.)
import io
import logging
import os
import time
from pathlib import Path
from spkmix import spk_mix_map
import librosa
import matplotlib.pyplot as plt
import numpy as np
import soundfile
from inference import infer_tool
from inference import slicer
from inference.infer_tool import Svc
# Silence numba's verbose JIT-compilation debug output during inference.
logging.getLogger('numba').setLevel(logging.WARNING)
# Persistent cache of previously computed audio slice points, keyed by file;
# read_temp loads (or creates) the JSON store used by the slicer.
chunks_dict = infer_tool.read_temp("inference/chunks_temp.json")
def main():
    """Command-line entry point for sovits4 (so-vits-svc) voice conversion.

    Parses CLI arguments, loads the Svc model (optionally with clustering /
    feature retrieval / shallow diffusion / NSF-HiFiGAN enhancement), then for
    each input wav and each target speaker runs sliced inference and writes the
    converted audio to ``<name>_<key>_<spk>..._<mode>.<fmt>`` in the current
    directory.
    """
    import argparse
    parser = argparse.ArgumentParser(description='sovits4 inference')
    # The part that must be set
    parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_37600.pth", help='model path')
    parser.add_argument('-c', '--config_path', type=str, default="logs/44k/config.json", help='configuration file path')
    parser.add_argument('-cl', '--clip', type=float, default=0, help='Forced audio slice, default 0 is automatic slice, unit is second/s')
    parser.add_argument('-n', '--clean_names', type=str, nargs='+', default=["test.wav"], help='List of wav file names, placed in the raw folder')
    parser.add_argument('-t', '--trans', type=int, nargs='+', default=[0], help='Pitch adjustment, support positive and negative (semitone)')
    parser.add_argument('-s', '--spk_list', type=str, nargs='+', default=['buyizi'], help='synthetic target speaker name')
    # optional part
    parser.add_argument('-a', '--auto_predict_f0', action='store_true', default=False, help='The voice conversion automatically predicts the pitch, do not turn on this when converting the singing voice, it will seriously go out of tune')
    parser.add_argument('-cm', '--cluster_model_path', type=str, default="logs/44k/kmeans_10000.pt", help='Clustering model or feature retrieval index path, if no clustering or feature retrieval is trained, fill it in casually')
    parser.add_argument('-cr', '--cluster_infer_ratio', type=float, default=0, help='Clustering scheme or feature retrieval ratio, range 0-1, if no clustering model or feature retrieval is trained, the default is 0')
    parser.add_argument('-lg', '--linear_gradient', type=float, default=0, help='The cross-fade length of two audio slices. If the human voice is incoherent after forcing the slice, you can adjust this value. If it is coherent, it is recommended to use the default value of 0, and the unit is second')
    parser.add_argument('-f0p', '--f0_predictor', type=str, default="pm", help='Select the F0 predictor, you can choose crepe, pm, dio, harvest, the default is pm (note: crepe uses the mean filter for the original F0)')
    parser.add_argument('-eh', '--enhance', action='store_true', default=False, help='Whether to use the NSF_HIFIGAN enhancer, this option has a certain sound quality enhancement effect on some models with a small training set, but has a negative effect on the trained model, and it is disabled by default')
    parser.add_argument('-shd', '--shallow_diffusion', action='store_true', default=False, help='Whether to use shallow diffusion. After using it, it can solve some electronic audio problems. It is disabled by default. When this option is enabled, the NSF_HIFIGAN enhancer will be disabled')
    parser.add_argument('-usm', '--use_spk_mix', action='store_true', default=False, help='Whether to use role fusion')
    parser.add_argument('-lea', '--loudness_envelope_adjustment', type=float, default=1, help='The input source loudness envelope replaces the output loudness envelope fusion ratio, the closer to 1, the more the output loudness envelope is used')
    parser.add_argument('-fr', '--feature_retrieval', action='store_true', default=False, help='Whether to use feature retrieval, if the clustering model is used, it will be disabled, and the cm and cr parameters will become the index path and mixing ratio of feature retrieval')
    # Shallow Diffusion Settings
    parser.add_argument('-dm', '--diffusion_model_path', type=str, default="logs/44k/diffusion/model_0.pt", help='Diffusion Model Path')
    parser.add_argument('-dc', '--diffusion_config_path', type=str, default="logs/44k/diffusion/config.yaml", help='Diffusion model configuration file path')
    parser.add_argument('-ks', '--k_step', type=int, default=100, help='The number of diffusion steps, the larger the result is closer to the diffusion model, the default is 100')
    parser.add_argument('-se', '--second_encoding', action='store_true', default=False, help='Secondary encoding, the original audio will be encoded twice before shallow diffusion, metaphysical option, sometimes the effect is good, sometimes the effect is poor')
    parser.add_argument('-od', '--only_diffusion', action='store_true', default=False, help='Pure diffusion mode, this mode will not load the sovits model, reasoning with the diffusion model')
    # non-moving part
    parser.add_argument('-sd', '--slice_db', type=int, default=-40, help='Default -40, noisy audio can be -30, dry sound can keep breathing -50')
    parser.add_argument('-d', '--device', type=str, default=None, help='Inference device, if None is to automatically select cpu and gpu')
    parser.add_argument('-ns', '--noice_scale', type=float, default=0.4, help='The noise level will affect the articulation and sound quality, which is more metaphysical')
    parser.add_argument('-p', '--pad_seconds', type=float, default=0.5, help='Inferring the number of seconds of the audio pad, there will be abnormal noise at the beginning and end due to unknown reasons, and the pad will not appear after a short period of silence')
    parser.add_argument('-wf', '--wav_format', type=str, default='wav', help='audio output format')
    parser.add_argument('-lgr', '--linear_gradient_retain', type=float, default=0.75, help='After automatic audio slicing, the head and tail of each slice need to be discarded. This parameter sets the ratio of cross length retention, range 0-1, left open and right closed')
    parser.add_argument('-eak', '--enhancer_adaptive_key', type=int, default=0, help='Adapt the enhancer to a higher register (in semitones) | default 0')
    parser.add_argument('-ft', '--f0_filter_threshold', type=float, default=0.05,help='F0 filter threshold, only valid when using crepe. The value ranges from 0-1. Lowering this value can reduce the probability of out-of-tune, but it will increase mute')
    args = parser.parse_args()

    # Unpack frequently-used options into locals for readability below.
    clean_names = args.clean_names
    trans = args.trans
    spk_list = args.spk_list
    slice_db = args.slice_db
    wav_format = args.wav_format
    auto_predict_f0 = args.auto_predict_f0
    cluster_infer_ratio = args.cluster_infer_ratio
    noice_scale = args.noice_scale
    pad_seconds = args.pad_seconds
    clip = args.clip
    lg = args.linear_gradient
    lgr = args.linear_gradient_retain
    f0p = args.f0_predictor
    enhance = args.enhance
    enhancer_adaptive_key = args.enhancer_adaptive_key
    cr_threshold = args.f0_filter_threshold
    diffusion_model_path = args.diffusion_model_path
    diffusion_config_path = args.diffusion_config_path
    k_step = args.k_step
    only_diffusion = args.only_diffusion
    shallow_diffusion = args.shallow_diffusion
    use_spk_mix = args.use_spk_mix
    second_encoding = args.second_encoding
    loudness_envelope_adjustment = args.loudness_envelope_adjustment

    print("Rsn "+wav_format)
    svc_model = Svc(args.model_path,
                    args.config_path,
                    args.device,
                    args.cluster_model_path,
                    enhance,
                    diffusion_model_path,
                    diffusion_config_path,
                    shallow_diffusion,
                    only_diffusion,
                    use_spk_mix,
                    args.feature_retrieval)
    print("Rsn svc_model = ")
    infer_tool.mkdir(["raw", "results"])

    # Role fusion requires at least two speakers in the mix map; otherwise fall
    # back to per-speaker inference.
    if len(spk_mix_map) <= 1:
        use_spk_mix = False
    if use_spk_mix:
        spk_list = [spk_mix_map]

    # Pad the pitch-shift list so every input file has a matching transposition.
    infer_tool.fill_a_to_b(trans, clean_names)
    for clean_name, tran in zip(clean_names, trans):
        # NOTE(review): inputs are read from dataset_raw/America rather than the
        # "raw" folder mentioned in the --clean_names help text — confirm intended.
        # BUGFIX: the original literal "dataset_raw\America" contained the invalid
        # escape sequence "\A" and a Windows-only separator; join the path
        # components portably instead.
        raw_audio_path = os.path.join(os.curdir, "dataset_raw", "America", clean_name)
        # BUGFIX: the original check `"." not in raw_audio_path` was always False
        # (os.curdir is "."), so the ".wav" fallback never fired. Test the actual
        # file extension instead.
        if not os.path.splitext(raw_audio_path)[1]:
            raw_audio_path += ".wav"
        print("Rsn2 "+raw_audio_path)
        # Normalize the input to the wav format expected by the slicer/model.
        infer_tool.format_wav(raw_audio_path)

        for spk in spk_list:
            kwarg = {
                "raw_audio_path": raw_audio_path,
                "spk": spk,
                "tran": tran,
                "slice_db": slice_db,
                "cluster_infer_ratio": cluster_infer_ratio,
                "auto_predict_f0": auto_predict_f0,
                "noice_scale": noice_scale,
                "pad_seconds": pad_seconds,
                "clip_seconds": clip,
                "lg_num": lg,
                "lgr_num": lgr,
                "f0_predictor": f0p,
                "enhancer_adaptive_key": enhancer_adaptive_key,
                "cr_threshold": cr_threshold,
                "k_step": k_step,
                "use_spk_mix": use_spk_mix,
                "second_encoding": second_encoding,
                "loudness_envelope_adjustment": loudness_envelope_adjustment
            }
            audio = svc_model.slice_inference(**kwarg)

            # Build an output name encoding the key shift, speaker, clustering
            # ratio and inference mode (sovits / sovits+diffusion / pure diffusion).
            key = "auto" if auto_predict_f0 else f"{tran}key"
            cluster_name = "" if cluster_infer_ratio == 0 else f"_{cluster_infer_ratio}"
            isdiffusion = "sovits"
            if shallow_diffusion:
                isdiffusion = "sovdiff"
            if only_diffusion:
                isdiffusion = "diff"
            if use_spk_mix:
                spk = "spk_mix"
            res_path = f'{clean_name}_{key}_{spk}{cluster_name}_{isdiffusion}.{wav_format}'
            soundfile.write(res_path, audio, svc_model.target_sample, format=wav_format)
            # Release cached tensors/buffers between runs to keep memory bounded.
            svc_model.clear_empty()
# Standard script guard: run inference only when executed directly, not on import.
if __name__ == '__main__':
    main()