|
|
import os
|
|
|
import sys
|
|
|
import json
|
|
|
import argparse
|
|
|
import subprocess
|
|
|
from functools import lru_cache
|
|
|
from distutils.util import strtobool
|
|
|
|
|
|
now_dir = os.getcwd()
|
|
|
sys.path.append(now_dir)
|
|
|
|
|
|
current_script_directory = os.path.dirname(os.path.realpath(__file__))
|
|
|
logs_path = os.path.join(current_script_directory, "logs")
|
|
|
|
|
|
from rvc.lib.tools.prerequisites_download import prequisites_download_pipeline
|
|
|
from rvc.train.process.model_blender import model_blender
|
|
|
from rvc.train.process.model_information import model_information
|
|
|
from rvc.lib.tools.analyzer import analyze_audio
|
|
|
from rvc.lib.tools.launch_tensorboard import launch_tensorboard_pipeline
|
|
|
from rvc.lib.tools.model_download import model_download_pipeline
|
|
|
|
|
|
python = sys.executable
|
|
|
|
|
|
|
|
|
|
|
|
@lru_cache(maxsize=1)
|
|
|
def load_voices_data():
|
|
|
with open(
|
|
|
os.path.join("rvc", "lib", "tools", "tts_voices.json"), "r", encoding="utf-8"
|
|
|
) as file:
|
|
|
return json.load(file)
|
|
|
|
|
|
|
|
|
voices_data = load_voices_data()
|
|
|
locales = list({voice["ShortName"] for voice in voices_data})
|
|
|
|
|
|
|
|
|
@lru_cache(maxsize=None)
|
|
|
def import_voice_converter():
|
|
|
from rvc.infer.infer import VoiceConverter
|
|
|
|
|
|
return VoiceConverter()
|
|
|
|
|
|
|
|
|
@lru_cache(maxsize=1)
|
|
|
def get_config():
|
|
|
from rvc.configs.config import Config
|
|
|
|
|
|
return Config()
|
|
|
|
|
|
|
|
|
|
|
|
def run_infer_script(
|
|
|
pitch: int,
|
|
|
index_rate: float,
|
|
|
volume_envelope: int,
|
|
|
protect: float,
|
|
|
hop_length: int,
|
|
|
f0_method: str,
|
|
|
input_path: str,
|
|
|
output_path: str,
|
|
|
pth_path: str,
|
|
|
index_path: str,
|
|
|
split_audio: bool,
|
|
|
f0_autotune: bool,
|
|
|
f0_autotune_strength: float,
|
|
|
clean_audio: bool,
|
|
|
clean_strength: float,
|
|
|
export_format: str,
|
|
|
f0_file: str,
|
|
|
embedder_model: str,
|
|
|
embedder_model_custom: str = None,
|
|
|
formant_shifting: bool = False,
|
|
|
formant_qfrency: float = 1.0,
|
|
|
formant_timbre: float = 1.0,
|
|
|
post_process: bool = False,
|
|
|
reverb: bool = False,
|
|
|
pitch_shift: bool = False,
|
|
|
limiter: bool = False,
|
|
|
gain: bool = False,
|
|
|
distortion: bool = False,
|
|
|
chorus: bool = False,
|
|
|
bitcrush: bool = False,
|
|
|
clipping: bool = False,
|
|
|
compressor: bool = False,
|
|
|
delay: bool = False,
|
|
|
reverb_room_size: float = 0.5,
|
|
|
reverb_damping: float = 0.5,
|
|
|
reverb_wet_gain: float = 0.5,
|
|
|
reverb_dry_gain: float = 0.5,
|
|
|
reverb_width: float = 0.5,
|
|
|
reverb_freeze_mode: float = 0.5,
|
|
|
pitch_shift_semitones: float = 0.0,
|
|
|
limiter_threshold: float = -6,
|
|
|
limiter_release_time: float = 0.01,
|
|
|
gain_db: float = 0.0,
|
|
|
distortion_gain: float = 25,
|
|
|
chorus_rate: float = 1.0,
|
|
|
chorus_depth: float = 0.25,
|
|
|
chorus_center_delay: float = 7,
|
|
|
chorus_feedback: float = 0.0,
|
|
|
chorus_mix: float = 0.5,
|
|
|
bitcrush_bit_depth: int = 8,
|
|
|
clipping_threshold: float = -6,
|
|
|
compressor_threshold: float = 0,
|
|
|
compressor_ratio: float = 1,
|
|
|
compressor_attack: float = 1.0,
|
|
|
compressor_release: float = 100,
|
|
|
delay_seconds: float = 0.5,
|
|
|
delay_feedback: float = 0.0,
|
|
|
delay_mix: float = 0.5,
|
|
|
sid: int = 0,
|
|
|
):
|
|
|
kwargs = {
|
|
|
"audio_input_path": input_path,
|
|
|
"audio_output_path": output_path,
|
|
|
"model_path": pth_path,
|
|
|
"index_path": index_path,
|
|
|
"pitch": pitch,
|
|
|
"index_rate": index_rate,
|
|
|
"volume_envelope": volume_envelope,
|
|
|
"protect": protect,
|
|
|
"hop_length": hop_length,
|
|
|
"f0_method": f0_method,
|
|
|
"pth_path": pth_path,
|
|
|
"index_path": index_path,
|
|
|
"split_audio": split_audio,
|
|
|
"f0_autotune": f0_autotune,
|
|
|
"f0_autotune_strength": f0_autotune_strength,
|
|
|
"clean_audio": clean_audio,
|
|
|
"clean_strength": clean_strength,
|
|
|
"export_format": export_format,
|
|
|
"f0_file": f0_file,
|
|
|
"embedder_model": embedder_model,
|
|
|
"embedder_model_custom": embedder_model_custom,
|
|
|
"post_process": post_process,
|
|
|
"formant_shifting": formant_shifting,
|
|
|
"formant_qfrency": formant_qfrency,
|
|
|
"formant_timbre": formant_timbre,
|
|
|
"reverb": reverb,
|
|
|
"pitch_shift": pitch_shift,
|
|
|
"limiter": limiter,
|
|
|
"gain": gain,
|
|
|
"distortion": distortion,
|
|
|
"chorus": chorus,
|
|
|
"bitcrush": bitcrush,
|
|
|
"clipping": clipping,
|
|
|
"compressor": compressor,
|
|
|
"delay": delay,
|
|
|
"reverb_room_size": reverb_room_size,
|
|
|
"reverb_damping": reverb_damping,
|
|
|
"reverb_wet_level": reverb_wet_gain,
|
|
|
"reverb_dry_level": reverb_dry_gain,
|
|
|
"reverb_width": reverb_width,
|
|
|
"reverb_freeze_mode": reverb_freeze_mode,
|
|
|
"pitch_shift_semitones": pitch_shift_semitones,
|
|
|
"limiter_threshold": limiter_threshold,
|
|
|
"limiter_release": limiter_release_time,
|
|
|
"gain_db": gain_db,
|
|
|
"distortion_gain": distortion_gain,
|
|
|
"chorus_rate": chorus_rate,
|
|
|
"chorus_depth": chorus_depth,
|
|
|
"chorus_delay": chorus_center_delay,
|
|
|
"chorus_feedback": chorus_feedback,
|
|
|
"chorus_mix": chorus_mix,
|
|
|
"bitcrush_bit_depth": bitcrush_bit_depth,
|
|
|
"clipping_threshold": clipping_threshold,
|
|
|
"compressor_threshold": compressor_threshold,
|
|
|
"compressor_ratio": compressor_ratio,
|
|
|
"compressor_attack": compressor_attack,
|
|
|
"compressor_release": compressor_release,
|
|
|
"delay_seconds": delay_seconds,
|
|
|
"delay_feedback": delay_feedback,
|
|
|
"delay_mix": delay_mix,
|
|
|
"sid": sid,
|
|
|
}
|
|
|
infer_pipeline = import_voice_converter()
|
|
|
infer_pipeline.convert_audio(
|
|
|
**kwargs,
|
|
|
)
|
|
|
return f"File {input_path} inferred successfully.", output_path.replace(
|
|
|
".wav", f".{export_format.lower()}"
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def run_batch_infer_script(
|
|
|
pitch: int,
|
|
|
index_rate: float,
|
|
|
volume_envelope: int,
|
|
|
protect: float,
|
|
|
hop_length: int,
|
|
|
f0_method: str,
|
|
|
input_folder: str,
|
|
|
output_folder: str,
|
|
|
pth_path: str,
|
|
|
index_path: str,
|
|
|
split_audio: bool,
|
|
|
f0_autotune: bool,
|
|
|
f0_autotune_strength: float,
|
|
|
clean_audio: bool,
|
|
|
clean_strength: float,
|
|
|
export_format: str,
|
|
|
f0_file: str,
|
|
|
embedder_model: str,
|
|
|
embedder_model_custom: str = None,
|
|
|
formant_shifting: bool = False,
|
|
|
formant_qfrency: float = 1.0,
|
|
|
formant_timbre: float = 1.0,
|
|
|
post_process: bool = False,
|
|
|
reverb: bool = False,
|
|
|
pitch_shift: bool = False,
|
|
|
limiter: bool = False,
|
|
|
gain: bool = False,
|
|
|
distortion: bool = False,
|
|
|
chorus: bool = False,
|
|
|
bitcrush: bool = False,
|
|
|
clipping: bool = False,
|
|
|
compressor: bool = False,
|
|
|
delay: bool = False,
|
|
|
reverb_room_size: float = 0.5,
|
|
|
reverb_damping: float = 0.5,
|
|
|
reverb_wet_gain: float = 0.5,
|
|
|
reverb_dry_gain: float = 0.5,
|
|
|
reverb_width: float = 0.5,
|
|
|
reverb_freeze_mode: float = 0.5,
|
|
|
pitch_shift_semitones: float = 0.0,
|
|
|
limiter_threshold: float = -6,
|
|
|
limiter_release_time: float = 0.01,
|
|
|
gain_db: float = 0.0,
|
|
|
distortion_gain: float = 25,
|
|
|
chorus_rate: float = 1.0,
|
|
|
chorus_depth: float = 0.25,
|
|
|
chorus_center_delay: float = 7,
|
|
|
chorus_feedback: float = 0.0,
|
|
|
chorus_mix: float = 0.5,
|
|
|
bitcrush_bit_depth: int = 8,
|
|
|
clipping_threshold: float = -6,
|
|
|
compressor_threshold: float = 0,
|
|
|
compressor_ratio: float = 1,
|
|
|
compressor_attack: float = 1.0,
|
|
|
compressor_release: float = 100,
|
|
|
delay_seconds: float = 0.5,
|
|
|
delay_feedback: float = 0.0,
|
|
|
delay_mix: float = 0.5,
|
|
|
sid: int = 0,
|
|
|
):
|
|
|
kwargs = {
|
|
|
"audio_input_paths": input_folder,
|
|
|
"audio_output_path": output_folder,
|
|
|
"model_path": pth_path,
|
|
|
"index_path": index_path,
|
|
|
"pitch": pitch,
|
|
|
"index_rate": index_rate,
|
|
|
"volume_envelope": volume_envelope,
|
|
|
"protect": protect,
|
|
|
"hop_length": hop_length,
|
|
|
"f0_method": f0_method,
|
|
|
"pth_path": pth_path,
|
|
|
"index_path": index_path,
|
|
|
"split_audio": split_audio,
|
|
|
"f0_autotune": f0_autotune,
|
|
|
"f0_autotune_strength": f0_autotune_strength,
|
|
|
"clean_audio": clean_audio,
|
|
|
"clean_strength": clean_strength,
|
|
|
"export_format": export_format,
|
|
|
"f0_file": f0_file,
|
|
|
"embedder_model": embedder_model,
|
|
|
"embedder_model_custom": embedder_model_custom,
|
|
|
"post_process": post_process,
|
|
|
"formant_shifting": formant_shifting,
|
|
|
"formant_qfrency": formant_qfrency,
|
|
|
"formant_timbre": formant_timbre,
|
|
|
"reverb": reverb,
|
|
|
"pitch_shift": pitch_shift,
|
|
|
"limiter": limiter,
|
|
|
"gain": gain,
|
|
|
"distortion": distortion,
|
|
|
"chorus": chorus,
|
|
|
"bitcrush": bitcrush,
|
|
|
"clipping": clipping,
|
|
|
"compressor": compressor,
|
|
|
"delay": delay,
|
|
|
"reverb_room_size": reverb_room_size,
|
|
|
"reverb_damping": reverb_damping,
|
|
|
"reverb_wet_level": reverb_wet_gain,
|
|
|
"reverb_dry_level": reverb_dry_gain,
|
|
|
"reverb_width": reverb_width,
|
|
|
"reverb_freeze_mode": reverb_freeze_mode,
|
|
|
"pitch_shift_semitones": pitch_shift_semitones,
|
|
|
"limiter_threshold": limiter_threshold,
|
|
|
"limiter_release": limiter_release_time,
|
|
|
"gain_db": gain_db,
|
|
|
"distortion_gain": distortion_gain,
|
|
|
"chorus_rate": chorus_rate,
|
|
|
"chorus_depth": chorus_depth,
|
|
|
"chorus_delay": chorus_center_delay,
|
|
|
"chorus_feedback": chorus_feedback,
|
|
|
"chorus_mix": chorus_mix,
|
|
|
"bitcrush_bit_depth": bitcrush_bit_depth,
|
|
|
"clipping_threshold": clipping_threshold,
|
|
|
"compressor_threshold": compressor_threshold,
|
|
|
"compressor_ratio": compressor_ratio,
|
|
|
"compressor_attack": compressor_attack,
|
|
|
"compressor_release": compressor_release,
|
|
|
"delay_seconds": delay_seconds,
|
|
|
"delay_feedback": delay_feedback,
|
|
|
"delay_mix": delay_mix,
|
|
|
"sid": sid,
|
|
|
}
|
|
|
infer_pipeline = import_voice_converter()
|
|
|
infer_pipeline.convert_audio_batch(
|
|
|
**kwargs,
|
|
|
)
|
|
|
|
|
|
return f"Files from {input_folder} inferred successfully."
|
|
|
|
|
|
|
|
|
|
|
|
def run_tts_script(
|
|
|
tts_file: str,
|
|
|
tts_text: str,
|
|
|
tts_voice: str,
|
|
|
tts_rate: int,
|
|
|
pitch: int,
|
|
|
index_rate: float,
|
|
|
volume_envelope: int,
|
|
|
protect: float,
|
|
|
hop_length: int,
|
|
|
f0_method: str,
|
|
|
output_tts_path: str,
|
|
|
output_rvc_path: str,
|
|
|
pth_path: str,
|
|
|
index_path: str,
|
|
|
split_audio: bool,
|
|
|
f0_autotune: bool,
|
|
|
f0_autotune_strength: float,
|
|
|
clean_audio: bool,
|
|
|
clean_strength: float,
|
|
|
export_format: str,
|
|
|
f0_file: str,
|
|
|
embedder_model: str,
|
|
|
embedder_model_custom: str = None,
|
|
|
sid: int = 0,
|
|
|
):
|
|
|
|
|
|
tts_script_path = os.path.join("rvc", "lib", "tools", "tts.py")
|
|
|
|
|
|
if os.path.exists(output_tts_path) and os.path.abspath(output_tts_path).startswith(
|
|
|
os.path.abspath("assets")
|
|
|
):
|
|
|
os.remove(output_tts_path)
|
|
|
|
|
|
command_tts = [
|
|
|
*map(
|
|
|
str,
|
|
|
[
|
|
|
python,
|
|
|
tts_script_path,
|
|
|
tts_file,
|
|
|
tts_text,
|
|
|
tts_voice,
|
|
|
tts_rate,
|
|
|
output_tts_path,
|
|
|
],
|
|
|
),
|
|
|
]
|
|
|
subprocess.run(command_tts)
|
|
|
infer_pipeline = import_voice_converter()
|
|
|
infer_pipeline.convert_audio(
|
|
|
pitch=pitch,
|
|
|
index_rate=index_rate,
|
|
|
volume_envelope=volume_envelope,
|
|
|
protect=protect,
|
|
|
hop_length=hop_length,
|
|
|
f0_method=f0_method,
|
|
|
audio_input_path=output_tts_path,
|
|
|
audio_output_path=output_rvc_path,
|
|
|
model_path=pth_path,
|
|
|
index_path=index_path,
|
|
|
split_audio=split_audio,
|
|
|
f0_autotune=f0_autotune,
|
|
|
f0_autotune_strength=f0_autotune_strength,
|
|
|
clean_audio=clean_audio,
|
|
|
clean_strength=clean_strength,
|
|
|
export_format=export_format,
|
|
|
f0_file=f0_file,
|
|
|
embedder_model=embedder_model,
|
|
|
embedder_model_custom=embedder_model_custom,
|
|
|
sid=sid,
|
|
|
formant_shifting=None,
|
|
|
formant_qfrency=None,
|
|
|
formant_timbre=None,
|
|
|
post_process=None,
|
|
|
reverb=None,
|
|
|
pitch_shift=None,
|
|
|
limiter=None,
|
|
|
gain=None,
|
|
|
distortion=None,
|
|
|
chorus=None,
|
|
|
bitcrush=None,
|
|
|
clipping=None,
|
|
|
compressor=None,
|
|
|
delay=None,
|
|
|
sliders=None,
|
|
|
)
|
|
|
|
|
|
return f"Text {tts_text} synthesized successfully.", output_rvc_path.replace(
|
|
|
".wav", f".{export_format.lower()}"
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def run_preprocess_script(
|
|
|
model_name: str,
|
|
|
dataset_path: str,
|
|
|
sample_rate: int,
|
|
|
cpu_cores: int,
|
|
|
cut_preprocess: str,
|
|
|
process_effects: bool,
|
|
|
noise_reduction: bool,
|
|
|
clean_strength: float,
|
|
|
chunk_len: float,
|
|
|
overlap_len: float,
|
|
|
):
|
|
|
preprocess_script_path = os.path.join("rvc", "train", "preprocess", "preprocess.py")
|
|
|
command = [
|
|
|
python,
|
|
|
preprocess_script_path,
|
|
|
*map(
|
|
|
str,
|
|
|
[
|
|
|
os.path.join(logs_path, model_name),
|
|
|
dataset_path,
|
|
|
sample_rate,
|
|
|
cpu_cores,
|
|
|
cut_preprocess,
|
|
|
process_effects,
|
|
|
noise_reduction,
|
|
|
clean_strength,
|
|
|
chunk_len,
|
|
|
overlap_len,
|
|
|
],
|
|
|
),
|
|
|
]
|
|
|
subprocess.run(command)
|
|
|
return f"Model {model_name} preprocessed successfully."
|
|
|
|
|
|
|
|
|
|
|
|
def run_extract_script(
|
|
|
model_name: str,
|
|
|
f0_method: str,
|
|
|
hop_length: int,
|
|
|
cpu_cores: int,
|
|
|
gpu: int,
|
|
|
sample_rate: int,
|
|
|
embedder_model: str,
|
|
|
embedder_model_custom: str = None,
|
|
|
include_mutes: int = 2,
|
|
|
):
|
|
|
|
|
|
model_path = os.path.join(logs_path, model_name)
|
|
|
extract = os.path.join("rvc", "train", "extract", "extract.py")
|
|
|
|
|
|
command_1 = [
|
|
|
python,
|
|
|
extract,
|
|
|
*map(
|
|
|
str,
|
|
|
[
|
|
|
model_path,
|
|
|
f0_method,
|
|
|
hop_length,
|
|
|
cpu_cores,
|
|
|
gpu,
|
|
|
sample_rate,
|
|
|
embedder_model,
|
|
|
embedder_model_custom,
|
|
|
include_mutes,
|
|
|
],
|
|
|
),
|
|
|
]
|
|
|
|
|
|
subprocess.run(command_1)
|
|
|
|
|
|
return f"Model {model_name} extracted successfully."
|
|
|
|
|
|
|
|
|
|
|
|
def run_train_script(
|
|
|
model_name: str,
|
|
|
save_every_epoch: int,
|
|
|
save_only_latest: bool,
|
|
|
save_every_weights: bool,
|
|
|
total_epoch: int,
|
|
|
sample_rate: int,
|
|
|
batch_size: int,
|
|
|
gpu: int,
|
|
|
overtraining_detector: bool,
|
|
|
overtraining_threshold: int,
|
|
|
pretrained: bool,
|
|
|
cleanup: bool,
|
|
|
index_algorithm: str = "Auto",
|
|
|
cache_data_in_gpu: bool = False,
|
|
|
custom_pretrained: bool = False,
|
|
|
g_pretrained_path: str = None,
|
|
|
d_pretrained_path: str = None,
|
|
|
vocoder: str = "HiFi-GAN",
|
|
|
checkpointing: bool = False,
|
|
|
):
|
|
|
|
|
|
if pretrained == True:
|
|
|
from rvc.lib.tools.pretrained_selector import pretrained_selector
|
|
|
|
|
|
if custom_pretrained == False:
|
|
|
pg, pd = pretrained_selector(str(vocoder), int(sample_rate))
|
|
|
else:
|
|
|
if g_pretrained_path is None or d_pretrained_path is None:
|
|
|
raise ValueError(
|
|
|
"Please provide the path to the pretrained G and D models."
|
|
|
)
|
|
|
pg, pd = g_pretrained_path, d_pretrained_path
|
|
|
else:
|
|
|
pg, pd = "", ""
|
|
|
|
|
|
train_script_path = os.path.join("rvc", "train", "train.py")
|
|
|
command = [
|
|
|
python,
|
|
|
train_script_path,
|
|
|
*map(
|
|
|
str,
|
|
|
[
|
|
|
model_name,
|
|
|
save_every_epoch,
|
|
|
total_epoch,
|
|
|
pg,
|
|
|
pd,
|
|
|
gpu,
|
|
|
batch_size,
|
|
|
sample_rate,
|
|
|
save_only_latest,
|
|
|
save_every_weights,
|
|
|
cache_data_in_gpu,
|
|
|
overtraining_detector,
|
|
|
overtraining_threshold,
|
|
|
cleanup,
|
|
|
vocoder,
|
|
|
checkpointing,
|
|
|
],
|
|
|
),
|
|
|
]
|
|
|
subprocess.run(command)
|
|
|
run_index_script(model_name, index_algorithm)
|
|
|
return f"Model {model_name} trained successfully."
|
|
|
|
|
|
|
|
|
|
|
|
def run_index_script(model_name: str, index_algorithm: str):
|
|
|
index_script_path = os.path.join("rvc", "train", "process", "extract_index.py")
|
|
|
command = [
|
|
|
python,
|
|
|
index_script_path,
|
|
|
os.path.join(logs_path, model_name),
|
|
|
index_algorithm,
|
|
|
]
|
|
|
|
|
|
subprocess.run(command)
|
|
|
return f"Index file for {model_name} generated successfully."
|
|
|
|
|
|
|
|
|
|
|
|
def run_model_information_script(pth_path: str):
|
|
|
print(model_information(pth_path))
|
|
|
return model_information(pth_path)
|
|
|
|
|
|
|
|
|
|
|
|
def run_model_blender_script(
|
|
|
model_name: str, pth_path_1: str, pth_path_2: str, ratio: float
|
|
|
):
|
|
|
message, model_blended = model_blender(model_name, pth_path_1, pth_path_2, ratio)
|
|
|
return message, model_blended
|
|
|
|
|
|
|
|
|
|
|
|
def run_tensorboard_script():
|
|
|
launch_tensorboard_pipeline()
|
|
|
|
|
|
|
|
|
|
|
|
def run_download_script(model_link: str):
|
|
|
model_download_pipeline(model_link)
|
|
|
return f"Model downloaded successfully."
|
|
|
|
|
|
|
|
|
|
|
|
def run_prerequisites_script(
|
|
|
pretraineds_hifigan: bool,
|
|
|
models: bool,
|
|
|
exe: bool,
|
|
|
):
|
|
|
prequisites_download_pipeline(
|
|
|
pretraineds_hifigan,
|
|
|
models,
|
|
|
exe,
|
|
|
)
|
|
|
return "Prerequisites installed successfully."
|
|
|
|
|
|
|
|
|
|
|
|
def run_audio_analyzer_script(
|
|
|
input_path: str, save_plot_path: str = "logs/audio_analysis.png"
|
|
|
):
|
|
|
audio_info, plot_path = analyze_audio(input_path, save_plot_path)
|
|
|
print(
|
|
|
f"Audio info of {input_path}: {audio_info}",
|
|
|
f"Audio file {input_path} analyzed successfully. Plot saved at: {plot_path}",
|
|
|
)
|
|
|
return audio_info, plot_path
|
|
|
|
|
|
|
|
|
|
|
|
def parse_arguments():
|
|
|
parser = argparse.ArgumentParser(
|
|
|
description="Run the main.py script with specific parameters."
|
|
|
)
|
|
|
subparsers = parser.add_subparsers(
|
|
|
title="subcommands", dest="mode", help="Choose a mode"
|
|
|
)
|
|
|
|
|
|
|
|
|
infer_parser = subparsers.add_parser("infer", help="Run inference")
|
|
|
pitch_description = (
|
|
|
"Set the pitch of the audio. Higher values result in a higher pitch."
|
|
|
)
|
|
|
infer_parser.add_argument(
|
|
|
"--pitch",
|
|
|
type=int,
|
|
|
help=pitch_description,
|
|
|
choices=range(-24, 25),
|
|
|
default=0,
|
|
|
)
|
|
|
index_rate_description = "Control the influence of the index file on the output. Higher values mean stronger influence. Lower values can help reduce artifacts but may result in less accurate voice cloning."
|
|
|
infer_parser.add_argument(
|
|
|
"--index_rate",
|
|
|
type=float,
|
|
|
help=index_rate_description,
|
|
|
choices=[i / 100.0 for i in range(0, 101)],
|
|
|
default=0.3,
|
|
|
)
|
|
|
volume_envelope_description = "Control the blending of the output's volume envelope. A value of 1 means the output envelope is fully used."
|
|
|
infer_parser.add_argument(
|
|
|
"--volume_envelope",
|
|
|
type=float,
|
|
|
help=volume_envelope_description,
|
|
|
choices=[i / 100.0 for i in range(0, 101)],
|
|
|
default=1,
|
|
|
)
|
|
|
protect_description = "Protect consonants and breathing sounds from artifacts. A value of 0.5 offers the strongest protection, while lower values may reduce the protection level but potentially mitigate the indexing effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--protect",
|
|
|
type=float,
|
|
|
help=protect_description,
|
|
|
choices=[i / 1000.0 for i in range(0, 501)],
|
|
|
default=0.33,
|
|
|
)
|
|
|
hop_length_description = "Only applicable for the Crepe pitch extraction method. Determines the time it takes for the system to react to a significant pitch change. Smaller values require more processing time but can lead to better pitch accuracy."
|
|
|
infer_parser.add_argument(
|
|
|
"--hop_length",
|
|
|
type=int,
|
|
|
help=hop_length_description,
|
|
|
choices=range(1, 513),
|
|
|
default=128,
|
|
|
)
|
|
|
f0_method_description = "Choose the pitch extraction algorithm for the conversion. 'rmvpe' is the default and generally recommended."
|
|
|
infer_parser.add_argument(
|
|
|
"--f0_method",
|
|
|
type=str,
|
|
|
help=f0_method_description,
|
|
|
choices=[
|
|
|
"crepe",
|
|
|
"crepe-tiny",
|
|
|
"rmvpe",
|
|
|
"fcpe",
|
|
|
"hybrid[crepe+rmvpe]",
|
|
|
"hybrid[crepe+fcpe]",
|
|
|
"hybrid[rmvpe+fcpe]",
|
|
|
"hybrid[crepe+rmvpe+fcpe]",
|
|
|
],
|
|
|
default="rmvpe",
|
|
|
)
|
|
|
infer_parser.add_argument(
|
|
|
"--input_path",
|
|
|
type=str,
|
|
|
help="Full path to the input audio file.",
|
|
|
required=True,
|
|
|
)
|
|
|
infer_parser.add_argument(
|
|
|
"--output_path",
|
|
|
type=str,
|
|
|
help="Full path to the output audio file.",
|
|
|
required=True,
|
|
|
)
|
|
|
pth_path_description = "Full path to the RVC model file (.pth)."
|
|
|
infer_parser.add_argument(
|
|
|
"--pth_path", type=str, help=pth_path_description, required=True
|
|
|
)
|
|
|
index_path_description = "Full path to the index file (.index)."
|
|
|
infer_parser.add_argument(
|
|
|
"--index_path", type=str, help=index_path_description, required=True
|
|
|
)
|
|
|
split_audio_description = "Split the audio into smaller segments before inference. This can improve the quality of the output for longer audio files."
|
|
|
infer_parser.add_argument(
|
|
|
"--split_audio",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=split_audio_description,
|
|
|
default=False,
|
|
|
)
|
|
|
f0_autotune_description = "Apply a light autotune to the inferred audio. Particularly useful for singing voice conversions."
|
|
|
infer_parser.add_argument(
|
|
|
"--f0_autotune",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=f0_autotune_description,
|
|
|
default=False,
|
|
|
)
|
|
|
f0_autotune_strength_description = "Set the autotune strength - the more you increase it the more it will snap to the chromatic grid."
|
|
|
infer_parser.add_argument(
|
|
|
"--f0_autotune_strength",
|
|
|
type=float,
|
|
|
help=f0_autotune_strength_description,
|
|
|
choices=[(i / 10) for i in range(11)],
|
|
|
default=1.0,
|
|
|
)
|
|
|
clean_audio_description = "Clean the output audio using noise reduction algorithms. Recommended for speech conversions."
|
|
|
infer_parser.add_argument(
|
|
|
"--clean_audio",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=clean_audio_description,
|
|
|
default=False,
|
|
|
)
|
|
|
clean_strength_description = "Adjust the intensity of the audio cleaning process. Higher values result in stronger cleaning, but may lead to a more compressed sound."
|
|
|
infer_parser.add_argument(
|
|
|
"--clean_strength",
|
|
|
type=float,
|
|
|
help=clean_strength_description,
|
|
|
choices=[(i / 10) for i in range(11)],
|
|
|
default=0.7,
|
|
|
)
|
|
|
export_format_description = "Select the desired output audio format."
|
|
|
infer_parser.add_argument(
|
|
|
"--export_format",
|
|
|
type=str,
|
|
|
help=export_format_description,
|
|
|
choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
|
|
|
default="WAV",
|
|
|
)
|
|
|
embedder_model_description = (
|
|
|
"Choose the model used for generating speaker embeddings."
|
|
|
)
|
|
|
infer_parser.add_argument(
|
|
|
"--embedder_model",
|
|
|
type=str,
|
|
|
help=embedder_model_description,
|
|
|
choices=[
|
|
|
"contentvec",
|
|
|
"chinese-hubert-base",
|
|
|
"japanese-hubert-base",
|
|
|
"korean-hubert-base",
|
|
|
"custom",
|
|
|
],
|
|
|
default="contentvec",
|
|
|
)
|
|
|
embedder_model_custom_description = "Specify the path to a custom model for speaker embedding. Only applicable if 'embedder_model' is set to 'custom'."
|
|
|
infer_parser.add_argument(
|
|
|
"--embedder_model_custom",
|
|
|
type=str,
|
|
|
help=embedder_model_custom_description,
|
|
|
default=None,
|
|
|
)
|
|
|
f0_file_description = "Full path to an external F0 file (.f0). This allows you to use pre-computed pitch values for the input audio."
|
|
|
infer_parser.add_argument(
|
|
|
"--f0_file",
|
|
|
type=str,
|
|
|
help=f0_file_description,
|
|
|
default=None,
|
|
|
)
|
|
|
formant_shifting_description = "Apply formant shifting to the input audio. This can help adjust the timbre of the voice."
|
|
|
infer_parser.add_argument(
|
|
|
"--formant_shifting",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=formant_shifting_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
formant_qfrency_description = "Control the frequency of the formant shifting effect. Higher values result in a more pronounced effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--formant_qfrency",
|
|
|
type=float,
|
|
|
help=formant_qfrency_description,
|
|
|
default=1.0,
|
|
|
required=False,
|
|
|
)
|
|
|
formant_timbre_description = "Control the timbre of the formant shifting effect. Higher values result in a more pronounced effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--formant_timbre",
|
|
|
type=float,
|
|
|
help=formant_timbre_description,
|
|
|
default=1.0,
|
|
|
required=False,
|
|
|
)
|
|
|
sid_description = "Speaker ID for multi-speaker models."
|
|
|
infer_parser.add_argument(
|
|
|
"--sid",
|
|
|
type=int,
|
|
|
help=sid_description,
|
|
|
default=0,
|
|
|
required=False,
|
|
|
)
|
|
|
post_process_description = "Apply post-processing effects to the output audio."
|
|
|
infer_parser.add_argument(
|
|
|
"--post_process",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=post_process_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
reverb_description = "Apply reverb effect to the output audio."
|
|
|
infer_parser.add_argument(
|
|
|
"--reverb",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=reverb_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
pitch_shift_description = "Apply pitch shifting effect to the output audio."
|
|
|
infer_parser.add_argument(
|
|
|
"--pitch_shift",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=pitch_shift_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
limiter_description = "Apply limiter effect to the output audio."
|
|
|
infer_parser.add_argument(
|
|
|
"--limiter",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=limiter_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
gain_description = "Apply gain effect to the output audio."
|
|
|
infer_parser.add_argument(
|
|
|
"--gain",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=gain_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
distortion_description = "Apply distortion effect to the output audio."
|
|
|
infer_parser.add_argument(
|
|
|
"--distortion",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=distortion_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
chorus_description = "Apply chorus effect to the output audio."
|
|
|
infer_parser.add_argument(
|
|
|
"--chorus",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=chorus_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
bitcrush_description = "Apply bitcrush effect to the output audio."
|
|
|
infer_parser.add_argument(
|
|
|
"--bitcrush",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=bitcrush_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
clipping_description = "Apply clipping effect to the output audio."
|
|
|
infer_parser.add_argument(
|
|
|
"--clipping",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=clipping_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
compressor_description = "Apply compressor effect to the output audio."
|
|
|
infer_parser.add_argument(
|
|
|
"--compressor",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=compressor_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
delay_description = "Apply delay effect to the output audio."
|
|
|
infer_parser.add_argument(
|
|
|
"--delay",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=delay_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
reverb_room_size_description = "Control the room size of the reverb effect. Higher values result in a larger room size."
|
|
|
infer_parser.add_argument(
|
|
|
"--reverb_room_size",
|
|
|
type=float,
|
|
|
help=reverb_room_size_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
reverb_damping_description = "Control the damping of the reverb effect. Higher values result in a more damped sound."
|
|
|
infer_parser.add_argument(
|
|
|
"--reverb_damping",
|
|
|
type=float,
|
|
|
help=reverb_damping_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
reverb_wet_gain_description = "Control the wet gain of the reverb effect. Higher values result in a stronger reverb effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--reverb_wet_gain",
|
|
|
type=float,
|
|
|
help=reverb_wet_gain_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
reverb_dry_gain_description = "Control the dry gain of the reverb effect. Higher values result in a stronger dry signal."
|
|
|
infer_parser.add_argument(
|
|
|
"--reverb_dry_gain",
|
|
|
type=float,
|
|
|
help=reverb_dry_gain_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
reverb_width_description = "Control the stereo width of the reverb effect. Higher values result in a wider stereo image."
|
|
|
infer_parser.add_argument(
|
|
|
"--reverb_width",
|
|
|
type=float,
|
|
|
help=reverb_width_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
reverb_freeze_mode_description = "Control the freeze mode of the reverb effect. Higher values result in a stronger freeze effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--reverb_freeze_mode",
|
|
|
type=float,
|
|
|
help=reverb_freeze_mode_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
pitch_shift_semitones_description = "Control the pitch shift in semitones. Positive values increase the pitch, while negative values decrease it."
|
|
|
infer_parser.add_argument(
|
|
|
"--pitch_shift_semitones",
|
|
|
type=float,
|
|
|
help=pitch_shift_semitones_description,
|
|
|
default=0.0,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
limiter_threshold_description = "Control the threshold of the limiter effect. Higher values result in a stronger limiting effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--limiter_threshold",
|
|
|
type=float,
|
|
|
help=limiter_threshold_description,
|
|
|
default=-6,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
limiter_release_time_description = "Control the release time of the limiter effect. Higher values result in a longer release time."
|
|
|
infer_parser.add_argument(
|
|
|
"--limiter_release_time",
|
|
|
type=float,
|
|
|
help=limiter_release_time_description,
|
|
|
default=0.01,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
gain_db_description = "Control the gain in decibels. Positive values increase the gain, while negative values decrease it."
|
|
|
infer_parser.add_argument(
|
|
|
"--gain_db",
|
|
|
type=float,
|
|
|
help=gain_db_description,
|
|
|
default=0.0,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
distortion_gain_description = "Control the gain of the distortion effect. Higher values result in a stronger distortion effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--distortion_gain",
|
|
|
type=float,
|
|
|
help=distortion_gain_description,
|
|
|
default=25,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
chorus_rate_description = "Control the rate of the chorus effect. Higher values result in a faster chorus effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--chorus_rate",
|
|
|
type=float,
|
|
|
help=chorus_rate_description,
|
|
|
default=1.0,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
chorus_depth_description = "Control the depth of the chorus effect. Higher values result in a stronger chorus effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--chorus_depth",
|
|
|
type=float,
|
|
|
help=chorus_depth_description,
|
|
|
default=0.25,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
chorus_center_delay_description = "Control the center delay of the chorus effect. Higher values result in a longer center delay."
|
|
|
infer_parser.add_argument(
|
|
|
"--chorus_center_delay",
|
|
|
type=float,
|
|
|
help=chorus_center_delay_description,
|
|
|
default=7,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
chorus_feedback_description = "Control the feedback of the chorus effect. Higher values result in a stronger feedback effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--chorus_feedback",
|
|
|
type=float,
|
|
|
help=chorus_feedback_description,
|
|
|
default=0.0,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
chorus_mix_description = "Control the mix of the chorus effect. Higher values result in a stronger chorus effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--chorus_mix",
|
|
|
type=float,
|
|
|
help=chorus_mix_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
bitcrush_bit_depth_description = "Control the bit depth of the bitcrush effect. Higher values result in a stronger bitcrush effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--bitcrush_bit_depth",
|
|
|
type=int,
|
|
|
help=bitcrush_bit_depth_description,
|
|
|
default=8,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
clipping_threshold_description = "Control the threshold of the clipping effect. Higher values result in a stronger clipping effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--clipping_threshold",
|
|
|
type=float,
|
|
|
help=clipping_threshold_description,
|
|
|
default=-6,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
compressor_threshold_description = "Control the threshold of the compressor effect. Higher values result in a stronger compressor effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--compressor_threshold",
|
|
|
type=float,
|
|
|
help=compressor_threshold_description,
|
|
|
default=0,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
compressor_ratio_description = "Control the ratio of the compressor effect. Higher values result in a stronger compressor effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--compressor_ratio",
|
|
|
type=float,
|
|
|
help=compressor_ratio_description,
|
|
|
default=1,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
compressor_attack_description = "Control the attack of the compressor effect. Higher values result in a stronger compressor effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--compressor_attack",
|
|
|
type=float,
|
|
|
help=compressor_attack_description,
|
|
|
default=1.0,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
compressor_release_description = "Control the release of the compressor effect. Higher values result in a stronger compressor effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--compressor_release",
|
|
|
type=float,
|
|
|
help=compressor_release_description,
|
|
|
default=100,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
delay_seconds_description = "Control the delay time in seconds. Higher values result in a longer delay time."
|
|
|
infer_parser.add_argument(
|
|
|
"--delay_seconds",
|
|
|
type=float,
|
|
|
help=delay_seconds_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
delay_feedback_description = "Control the feedback of the delay effect. Higher values result in a stronger feedback effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--delay_feedback",
|
|
|
type=float,
|
|
|
help=delay_feedback_description,
|
|
|
default=0.0,
|
|
|
required=False,
|
|
|
)
|
|
|
delay_mix_description = "Control the mix of the delay effect. Higher values result in a stronger delay effect."
|
|
|
infer_parser.add_argument(
|
|
|
"--delay_mix",
|
|
|
type=float,
|
|
|
help=delay_mix_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
|
|
|
batch_infer_parser = subparsers.add_parser(
|
|
|
"batch_infer",
|
|
|
help="Run batch inference",
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--pitch",
|
|
|
type=int,
|
|
|
help=pitch_description,
|
|
|
choices=range(-24, 25),
|
|
|
default=0,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--index_rate",
|
|
|
type=float,
|
|
|
help=index_rate_description,
|
|
|
choices=[i / 100.0 for i in range(0, 101)],
|
|
|
default=0.3,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--volume_envelope",
|
|
|
type=float,
|
|
|
help=volume_envelope_description,
|
|
|
choices=[i / 100.0 for i in range(0, 101)],
|
|
|
default=1,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--protect",
|
|
|
type=float,
|
|
|
help=protect_description,
|
|
|
choices=[i / 1000.0 for i in range(0, 501)],
|
|
|
default=0.33,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--hop_length",
|
|
|
type=int,
|
|
|
help=hop_length_description,
|
|
|
choices=range(1, 513),
|
|
|
default=128,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--f0_method",
|
|
|
type=str,
|
|
|
help=f0_method_description,
|
|
|
choices=[
|
|
|
"crepe",
|
|
|
"crepe-tiny",
|
|
|
"rmvpe",
|
|
|
"fcpe",
|
|
|
"hybrid[crepe+rmvpe]",
|
|
|
"hybrid[crepe+fcpe]",
|
|
|
"hybrid[rmvpe+fcpe]",
|
|
|
"hybrid[crepe+rmvpe+fcpe]",
|
|
|
],
|
|
|
default="rmvpe",
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--input_folder",
|
|
|
type=str,
|
|
|
help="Path to the folder containing input audio files.",
|
|
|
required=True,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--output_folder",
|
|
|
type=str,
|
|
|
help="Path to the folder for saving output audio files.",
|
|
|
required=True,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--pth_path", type=str, help=pth_path_description, required=True
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--index_path", type=str, help=index_path_description, required=True
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--split_audio",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=split_audio_description,
|
|
|
default=False,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--f0_autotune",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=f0_autotune_description,
|
|
|
default=False,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--f0_autotune_strength",
|
|
|
type=float,
|
|
|
help=clean_strength_description,
|
|
|
choices=[(i / 10) for i in range(11)],
|
|
|
default=1.0,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--clean_audio",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=clean_audio_description,
|
|
|
default=False,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--clean_strength",
|
|
|
type=float,
|
|
|
help=clean_strength_description,
|
|
|
choices=[(i / 10) for i in range(11)],
|
|
|
default=0.7,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--export_format",
|
|
|
type=str,
|
|
|
help=export_format_description,
|
|
|
choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
|
|
|
default="WAV",
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--embedder_model",
|
|
|
type=str,
|
|
|
help=embedder_model_description,
|
|
|
choices=[
|
|
|
"contentvec",
|
|
|
"chinese-hubert-base",
|
|
|
"japanese-hubert-base",
|
|
|
"korean-hubert-base",
|
|
|
"custom",
|
|
|
],
|
|
|
default="contentvec",
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--embedder_model_custom",
|
|
|
type=str,
|
|
|
help=embedder_model_custom_description,
|
|
|
default=None,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--f0_file",
|
|
|
type=str,
|
|
|
help=f0_file_description,
|
|
|
default=None,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--formant_shifting",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=formant_shifting_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--formant_qfrency",
|
|
|
type=float,
|
|
|
help=formant_qfrency_description,
|
|
|
default=1.0,
|
|
|
required=False,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--formant_timbre",
|
|
|
type=float,
|
|
|
help=formant_timbre_description,
|
|
|
default=1.0,
|
|
|
required=False,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--sid",
|
|
|
type=int,
|
|
|
help=sid_description,
|
|
|
default=0,
|
|
|
required=False,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--post_process",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=post_process_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--reverb",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=reverb_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--pitch_shift",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=pitch_shift_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--limiter",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=limiter_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--gain",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=gain_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--distortion",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=distortion_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--chorus",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=chorus_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--bitcrush",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=bitcrush_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--clipping",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=clipping_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--compressor",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=compressor_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--delay",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=delay_description,
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--reverb_room_size",
|
|
|
type=float,
|
|
|
help=reverb_room_size_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--reverb_damping",
|
|
|
type=float,
|
|
|
help=reverb_damping_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--reverb_wet_gain",
|
|
|
type=float,
|
|
|
help=reverb_wet_gain_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--reverb_dry_gain",
|
|
|
type=float,
|
|
|
help=reverb_dry_gain_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--reverb_width",
|
|
|
type=float,
|
|
|
help=reverb_width_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--reverb_freeze_mode",
|
|
|
type=float,
|
|
|
help=reverb_freeze_mode_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--pitch_shift_semitones",
|
|
|
type=float,
|
|
|
help=pitch_shift_semitones_description,
|
|
|
default=0.0,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--limiter_threshold",
|
|
|
type=float,
|
|
|
help=limiter_threshold_description,
|
|
|
default=-6,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--limiter_release_time",
|
|
|
type=float,
|
|
|
help=limiter_release_time_description,
|
|
|
default=0.01,
|
|
|
required=False,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--gain_db",
|
|
|
type=float,
|
|
|
help=gain_db_description,
|
|
|
default=0.0,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--distortion_gain",
|
|
|
type=float,
|
|
|
help=distortion_gain_description,
|
|
|
default=25,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--chorus_rate",
|
|
|
type=float,
|
|
|
help=chorus_rate_description,
|
|
|
default=1.0,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--chorus_depth",
|
|
|
type=float,
|
|
|
help=chorus_depth_description,
|
|
|
default=0.25,
|
|
|
required=False,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--chorus_center_delay",
|
|
|
type=float,
|
|
|
help=chorus_center_delay_description,
|
|
|
default=7,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--chorus_feedback",
|
|
|
type=float,
|
|
|
help=chorus_feedback_description,
|
|
|
default=0.0,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--chorus_mix",
|
|
|
type=float,
|
|
|
help=chorus_mix_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--bitcrush_bit_depth",
|
|
|
type=int,
|
|
|
help=bitcrush_bit_depth_description,
|
|
|
default=8,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--clipping_threshold",
|
|
|
type=float,
|
|
|
help=clipping_threshold_description,
|
|
|
default=-6,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--compressor_threshold",
|
|
|
type=float,
|
|
|
help=compressor_threshold_description,
|
|
|
default=0,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--compressor_ratio",
|
|
|
type=float,
|
|
|
help=compressor_ratio_description,
|
|
|
default=1,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--compressor_attack",
|
|
|
type=float,
|
|
|
help=compressor_attack_description,
|
|
|
default=1.0,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--compressor_release",
|
|
|
type=float,
|
|
|
help=compressor_release_description,
|
|
|
default=100,
|
|
|
required=False,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--delay_seconds",
|
|
|
type=float,
|
|
|
help=delay_seconds_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--delay_feedback",
|
|
|
type=float,
|
|
|
help=delay_feedback_description,
|
|
|
default=0.0,
|
|
|
required=False,
|
|
|
)
|
|
|
batch_infer_parser.add_argument(
|
|
|
"--delay_mix",
|
|
|
type=float,
|
|
|
help=delay_mix_description,
|
|
|
default=0.5,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
|
|
|
tts_parser = subparsers.add_parser("tts", help="Run TTS inference")
|
|
|
tts_parser.add_argument(
|
|
|
"--tts_file", type=str, help="File with a text to be synthesized", required=True
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--tts_text", type=str, help="Text to be synthesized", required=True
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--tts_voice",
|
|
|
type=str,
|
|
|
help="Voice to be used for TTS synthesis.",
|
|
|
choices=locales,
|
|
|
required=True,
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--tts_rate",
|
|
|
type=int,
|
|
|
help="Control the speaking rate of the TTS. Values range from -100 (slower) to 100 (faster).",
|
|
|
choices=range(-100, 101),
|
|
|
default=0,
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--pitch",
|
|
|
type=int,
|
|
|
help=pitch_description,
|
|
|
choices=range(-24, 25),
|
|
|
default=0,
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--index_rate",
|
|
|
type=float,
|
|
|
help=index_rate_description,
|
|
|
choices=[(i / 10) for i in range(11)],
|
|
|
default=0.3,
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--volume_envelope",
|
|
|
type=float,
|
|
|
help=volume_envelope_description,
|
|
|
choices=[(i / 10) for i in range(11)],
|
|
|
default=1,
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--protect",
|
|
|
type=float,
|
|
|
help=protect_description,
|
|
|
choices=[(i / 10) for i in range(6)],
|
|
|
default=0.33,
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--hop_length",
|
|
|
type=int,
|
|
|
help=hop_length_description,
|
|
|
choices=range(1, 513),
|
|
|
default=128,
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--f0_method",
|
|
|
type=str,
|
|
|
help=f0_method_description,
|
|
|
choices=[
|
|
|
"crepe",
|
|
|
"crepe-tiny",
|
|
|
"rmvpe+",
|
|
|
"fcpe",
|
|
|
"hybrid[crepe+rmvpe]",
|
|
|
"hybrid[crepe+fcpe]",
|
|
|
"hybrid[rmvpe+fcpe]",
|
|
|
"hybrid[crepe+rmvpe+fcpe]",
|
|
|
],
|
|
|
default="rmvpe+",
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--output_tts_path",
|
|
|
type=str,
|
|
|
help="Full path to save the synthesized TTS audio.",
|
|
|
required=True,
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--output_rvc_path",
|
|
|
type=str,
|
|
|
help="Full path to save the voice-converted audio using the synthesized TTS.",
|
|
|
required=True,
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--pth_path", type=str, help=pth_path_description, required=True
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--index_path", type=str, help=index_path_description, required=True
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--split_audio",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=split_audio_description,
|
|
|
default=False,
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--f0_autotune",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=f0_autotune_description,
|
|
|
default=False,
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--f0_autotune_strength",
|
|
|
type=float,
|
|
|
help=clean_strength_description,
|
|
|
choices=[(i / 10) for i in range(11)],
|
|
|
default=1.0,
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--clean_audio",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help=clean_audio_description,
|
|
|
default=False,
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--clean_strength",
|
|
|
type=float,
|
|
|
help=clean_strength_description,
|
|
|
choices=[(i / 10) for i in range(11)],
|
|
|
default=0.7,
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--export_format",
|
|
|
type=str,
|
|
|
help=export_format_description,
|
|
|
choices=["WAV", "MP3", "FLAC", "OGG", "M4A"],
|
|
|
default="WAV",
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--embedder_model",
|
|
|
type=str,
|
|
|
help=embedder_model_description,
|
|
|
choices=[
|
|
|
"contentvec",
|
|
|
"chinese-hubert-base",
|
|
|
"japanese-hubert-base",
|
|
|
"korean-hubert-base",
|
|
|
"custom",
|
|
|
],
|
|
|
default="contentvec",
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--embedder_model_custom",
|
|
|
type=str,
|
|
|
help=embedder_model_custom_description,
|
|
|
default=None,
|
|
|
)
|
|
|
tts_parser.add_argument(
|
|
|
"--f0_file",
|
|
|
type=str,
|
|
|
help=f0_file_description,
|
|
|
default=None,
|
|
|
)
|
|
|
|
|
|
|
|
|
preprocess_parser = subparsers.add_parser(
|
|
|
"preprocess", help="Preprocess a dataset for training."
|
|
|
)
|
|
|
preprocess_parser.add_argument(
|
|
|
"--model_name", type=str, help="Name of the model to be trained.", required=True
|
|
|
)
|
|
|
preprocess_parser.add_argument(
|
|
|
"--dataset_path", type=str, help="Path to the dataset directory.", required=True
|
|
|
)
|
|
|
preprocess_parser.add_argument(
|
|
|
"--sample_rate",
|
|
|
type=int,
|
|
|
help="Target sampling rate for the audio data.",
|
|
|
choices=[32000, 40000, 48000],
|
|
|
required=True,
|
|
|
)
|
|
|
preprocess_parser.add_argument(
|
|
|
"--cpu_cores",
|
|
|
type=int,
|
|
|
help="Number of CPU cores to use for preprocessing.",
|
|
|
choices=range(1, 65),
|
|
|
)
|
|
|
preprocess_parser.add_argument(
|
|
|
"--cut_preprocess",
|
|
|
type=str,
|
|
|
choices=["Skip", "Simple", "Automatic"],
|
|
|
help="Cut the dataset into smaller segments for faster preprocessing.",
|
|
|
default="Automatic",
|
|
|
required=True,
|
|
|
)
|
|
|
preprocess_parser.add_argument(
|
|
|
"--process_effects",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help="Disable all filters during preprocessing.",
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
preprocess_parser.add_argument(
|
|
|
"--noise_reduction",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help="Enable noise reduction during preprocessing.",
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
preprocess_parser.add_argument(
|
|
|
"--noise_reduction_strength",
|
|
|
type=float,
|
|
|
help="Strength of the noise reduction filter.",
|
|
|
choices=[(i / 10) for i in range(11)],
|
|
|
default=0.7,
|
|
|
required=False,
|
|
|
)
|
|
|
preprocess_parser.add_argument(
|
|
|
"--chunk_len",
|
|
|
type=float,
|
|
|
help="Chunk length.",
|
|
|
choices=[i * 0.5 for i in range(1, 11)],
|
|
|
default=3.0,
|
|
|
required=False,
|
|
|
)
|
|
|
preprocess_parser.add_argument(
|
|
|
"--overlap_len",
|
|
|
type=float,
|
|
|
help="Overlap length.",
|
|
|
choices=[0.0, 0.1, 0.2, 0.3, 0.4],
|
|
|
default=0.3,
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
|
|
|
extract_parser = subparsers.add_parser(
|
|
|
"extract", help="Extract features from a dataset."
|
|
|
)
|
|
|
extract_parser.add_argument(
|
|
|
"--model_name", type=str, help="Name of the model.", required=True
|
|
|
)
|
|
|
extract_parser.add_argument(
|
|
|
"--f0_method",
|
|
|
type=str,
|
|
|
help="Pitch extraction method to use.",
|
|
|
choices=[
|
|
|
"crepe",
|
|
|
"crepe-tiny",
|
|
|
"rmvpe",
|
|
|
],
|
|
|
default="rmvpe",
|
|
|
)
|
|
|
extract_parser.add_argument(
|
|
|
"--hop_length",
|
|
|
type=int,
|
|
|
help="Hop length for feature extraction. Only applicable for Crepe pitch extraction.",
|
|
|
choices=range(1, 513),
|
|
|
default=128,
|
|
|
)
|
|
|
extract_parser.add_argument(
|
|
|
"--cpu_cores",
|
|
|
type=int,
|
|
|
help="Number of CPU cores to use for feature extraction (optional).",
|
|
|
choices=range(1, 65),
|
|
|
default=None,
|
|
|
)
|
|
|
extract_parser.add_argument(
|
|
|
"--gpu",
|
|
|
type=str,
|
|
|
help="GPU device to use for feature extraction (optional).",
|
|
|
default="-",
|
|
|
)
|
|
|
extract_parser.add_argument(
|
|
|
"--sample_rate",
|
|
|
type=int,
|
|
|
help="Target sampling rate for the audio data.",
|
|
|
choices=[32000, 40000, 44100, 48000],
|
|
|
required=True,
|
|
|
)
|
|
|
extract_parser.add_argument(
|
|
|
"--embedder_model",
|
|
|
type=str,
|
|
|
help=embedder_model_description,
|
|
|
choices=[
|
|
|
"contentvec",
|
|
|
"chinese-hubert-base",
|
|
|
"japanese-hubert-base",
|
|
|
"korean-hubert-base",
|
|
|
"custom",
|
|
|
],
|
|
|
default="contentvec",
|
|
|
)
|
|
|
extract_parser.add_argument(
|
|
|
"--embedder_model_custom",
|
|
|
type=str,
|
|
|
help=embedder_model_custom_description,
|
|
|
default=None,
|
|
|
)
|
|
|
extract_parser.add_argument(
|
|
|
"--include_mutes",
|
|
|
type=int,
|
|
|
help="Number of silent files to include.",
|
|
|
choices=range(0, 11),
|
|
|
default=2,
|
|
|
required=True,
|
|
|
)
|
|
|
|
|
|
|
|
|
train_parser = subparsers.add_parser("train", help="Train an RVC model.")
|
|
|
train_parser.add_argument(
|
|
|
"--model_name", type=str, help="Name of the model to be trained.", required=True
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--vocoder",
|
|
|
type=str,
|
|
|
help="Vocoder name",
|
|
|
choices=["HiFi-GAN", "MRF HiFi-GAN", "RefineGAN"],
|
|
|
default="HiFi-GAN",
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--checkpointing",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help="Enables memory-efficient training.",
|
|
|
default=False,
|
|
|
required=False,
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--save_every_epoch",
|
|
|
type=int,
|
|
|
help="Save the model every specified number of epochs.",
|
|
|
choices=range(1, 10001),
|
|
|
required=True,
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--save_only_latest",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help="Save only the latest model checkpoint.",
|
|
|
default=False,
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--save_every_weights",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help="Save model weights every epoch.",
|
|
|
default=True,
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--total_epoch",
|
|
|
type=int,
|
|
|
help="Total number of epochs to train for.",
|
|
|
choices=range(1, 10001),
|
|
|
default=1000,
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--sample_rate",
|
|
|
type=int,
|
|
|
help="Sampling rate of the training data.",
|
|
|
choices=[32000, 40000, 48000],
|
|
|
required=True,
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--batch_size",
|
|
|
type=int,
|
|
|
help="Batch size for training.",
|
|
|
choices=range(1, 51),
|
|
|
default=8,
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--gpu",
|
|
|
type=str,
|
|
|
help="GPU device to use for training (e.g., '0').",
|
|
|
default="0",
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--pretrained",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help="Use a pretrained model for initialization.",
|
|
|
default=True,
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--custom_pretrained",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help="Use a custom pretrained model.",
|
|
|
default=False,
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--g_pretrained_path",
|
|
|
type=str,
|
|
|
nargs="?",
|
|
|
default=None,
|
|
|
help="Path to the pretrained generator model file.",
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--d_pretrained_path",
|
|
|
type=str,
|
|
|
nargs="?",
|
|
|
default=None,
|
|
|
help="Path to the pretrained discriminator model file.",
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--overtraining_detector",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help="Enable overtraining detection.",
|
|
|
default=False,
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--overtraining_threshold",
|
|
|
type=int,
|
|
|
help="Threshold for overtraining detection.",
|
|
|
choices=range(1, 101),
|
|
|
default=50,
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--cleanup",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help="Cleanup previous training attempt.",
|
|
|
default=False,
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--cache_data_in_gpu",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
help="Cache training data in GPU memory.",
|
|
|
default=False,
|
|
|
)
|
|
|
train_parser.add_argument(
|
|
|
"--index_algorithm",
|
|
|
type=str,
|
|
|
choices=["Auto", "Faiss", "KMeans"],
|
|
|
help="Choose the method for generating the index file.",
|
|
|
default="Auto",
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
|
|
|
index_parser = subparsers.add_parser(
|
|
|
"index", help="Generate an index file for an RVC model."
|
|
|
)
|
|
|
index_parser.add_argument(
|
|
|
"--model_name", type=str, help="Name of the model.", required=True
|
|
|
)
|
|
|
index_parser.add_argument(
|
|
|
"--index_algorithm",
|
|
|
type=str,
|
|
|
choices=["Auto", "Faiss", "KMeans"],
|
|
|
help="Choose the method for generating the index file.",
|
|
|
default="Auto",
|
|
|
required=False,
|
|
|
)
|
|
|
|
|
|
|
|
|
model_information_parser = subparsers.add_parser(
|
|
|
"model_information", help="Display information about a trained model."
|
|
|
)
|
|
|
model_information_parser.add_argument(
|
|
|
"--pth_path", type=str, help="Path to the .pth model file.", required=True
|
|
|
)
|
|
|
|
|
|
|
|
|
model_blender_parser = subparsers.add_parser(
|
|
|
"model_blender", help="Fuse two RVC models together."
|
|
|
)
|
|
|
model_blender_parser.add_argument(
|
|
|
"--model_name", type=str, help="Name of the new fused model.", required=True
|
|
|
)
|
|
|
model_blender_parser.add_argument(
|
|
|
"--pth_path_1",
|
|
|
type=str,
|
|
|
help="Path to the first .pth model file.",
|
|
|
required=True,
|
|
|
)
|
|
|
model_blender_parser.add_argument(
|
|
|
"--pth_path_2",
|
|
|
type=str,
|
|
|
help="Path to the second .pth model file.",
|
|
|
required=True,
|
|
|
)
|
|
|
model_blender_parser.add_argument(
|
|
|
"--ratio",
|
|
|
type=float,
|
|
|
help="Ratio for blending the two models (0.0 to 1.0).",
|
|
|
choices=[(i / 10) for i in range(11)],
|
|
|
default=0.5,
|
|
|
)
|
|
|
|
|
|
|
|
|
subparsers.add_parser(
|
|
|
"tensorboard", help="Launch TensorBoard for monitoring training progress."
|
|
|
)
|
|
|
|
|
|
|
|
|
download_parser = subparsers.add_parser(
|
|
|
"download", help="Download a model from a provided link."
|
|
|
)
|
|
|
download_parser.add_argument(
|
|
|
"--model_link", type=str, help="Direct link to the model file.", required=True
|
|
|
)
|
|
|
|
|
|
|
|
|
prerequisites_parser = subparsers.add_parser(
|
|
|
"prerequisites", help="Install prerequisites for RVC."
|
|
|
)
|
|
|
prerequisites_parser.add_argument(
|
|
|
"--pretraineds_hifigan",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
default=True,
|
|
|
help="Download pretrained models for RVC v2.",
|
|
|
)
|
|
|
prerequisites_parser.add_argument(
|
|
|
"--models",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
default=True,
|
|
|
help="Download additional models.",
|
|
|
)
|
|
|
prerequisites_parser.add_argument(
|
|
|
"--exe",
|
|
|
type=lambda x: bool(strtobool(x)),
|
|
|
choices=[True, False],
|
|
|
default=True,
|
|
|
help="Download required executables.",
|
|
|
)
|
|
|
|
|
|
|
|
|
audio_analyzer = subparsers.add_parser(
|
|
|
"audio_analyzer", help="Analyze an audio file."
|
|
|
)
|
|
|
audio_analyzer.add_argument(
|
|
|
"--input_path", type=str, help="Path to the input audio file.", required=True
|
|
|
)
|
|
|
|
|
|
return parser.parse_args()
|
|
|
|
|
|
|
|
|
def main():
|
|
|
if len(sys.argv) == 1:
|
|
|
print("Please run the script with '-h' for more information.")
|
|
|
sys.exit(1)
|
|
|
|
|
|
args = parse_arguments()
|
|
|
|
|
|
try:
|
|
|
if args.mode == "infer":
|
|
|
run_infer_script(
|
|
|
pitch=args.pitch,
|
|
|
index_rate=args.index_rate,
|
|
|
volume_envelope=args.volume_envelope,
|
|
|
protect=args.protect,
|
|
|
hop_length=args.hop_length,
|
|
|
f0_method=args.f0_method,
|
|
|
input_path=args.input_path,
|
|
|
output_path=args.output_path,
|
|
|
pth_path=args.pth_path,
|
|
|
index_path=args.index_path,
|
|
|
split_audio=args.split_audio,
|
|
|
f0_autotune=args.f0_autotune,
|
|
|
f0_autotune_strength=args.f0_autotune_strength,
|
|
|
clean_audio=args.clean_audio,
|
|
|
clean_strength=args.clean_strength,
|
|
|
export_format=args.export_format,
|
|
|
embedder_model=args.embedder_model,
|
|
|
embedder_model_custom=args.embedder_model_custom,
|
|
|
f0_file=args.f0_file,
|
|
|
formant_shifting=args.formant_shifting,
|
|
|
formant_qfrency=args.formant_qfrency,
|
|
|
formant_timbre=args.formant_timbre,
|
|
|
sid=args.sid,
|
|
|
post_process=args.post_process,
|
|
|
reverb=args.reverb,
|
|
|
pitch_shift=args.pitch_shift,
|
|
|
limiter=args.limiter,
|
|
|
gain=args.gain,
|
|
|
distortion=args.distortion,
|
|
|
chorus=args.chorus,
|
|
|
bitcrush=args.bitcrush,
|
|
|
clipping=args.clipping,
|
|
|
compressor=args.compressor,
|
|
|
delay=args.delay,
|
|
|
reverb_room_size=args.reverb_room_size,
|
|
|
reverb_damping=args.reverb_damping,
|
|
|
reverb_wet_gain=args.reverb_wet_gain,
|
|
|
reverb_dry_gain=args.reverb_dry_gain,
|
|
|
reverb_width=args.reverb_width,
|
|
|
reverb_freeze_mode=args.reverb_freeze_mode,
|
|
|
pitch_shift_semitones=args.pitch_shift_semitones,
|
|
|
limiter_threshold=args.limiter_threshold,
|
|
|
limiter_release_time=args.limiter_release_time,
|
|
|
gain_db=args.gain_db,
|
|
|
distortion_gain=args.distortion_gain,
|
|
|
chorus_rate=args.chorus_rate,
|
|
|
chorus_depth=args.chorus_depth,
|
|
|
chorus_center_delay=args.chorus_center_delay,
|
|
|
chorus_feedback=args.chorus_feedback,
|
|
|
chorus_mix=args.chorus_mix,
|
|
|
bitcrush_bit_depth=args.bitcrush_bit_depth,
|
|
|
clipping_threshold=args.clipping_threshold,
|
|
|
compressor_threshold=args.compressor_threshold,
|
|
|
compressor_ratio=args.compressor_ratio,
|
|
|
compressor_attack=args.compressor_attack,
|
|
|
compressor_release=args.compressor_release,
|
|
|
delay_seconds=args.delay_seconds,
|
|
|
delay_feedback=args.delay_feedback,
|
|
|
delay_mix=args.delay_mix,
|
|
|
)
|
|
|
elif args.mode == "batch_infer":
|
|
|
run_batch_infer_script(
|
|
|
pitch=args.pitch,
|
|
|
index_rate=args.index_rate,
|
|
|
volume_envelope=args.volume_envelope,
|
|
|
protect=args.protect,
|
|
|
hop_length=args.hop_length,
|
|
|
f0_method=args.f0_method,
|
|
|
input_folder=args.input_folder,
|
|
|
output_folder=args.output_folder,
|
|
|
pth_path=args.pth_path,
|
|
|
index_path=args.index_path,
|
|
|
split_audio=args.split_audio,
|
|
|
f0_autotune=args.f0_autotune,
|
|
|
f0_autotune_strength=args.f0_autotune_strength,
|
|
|
clean_audio=args.clean_audio,
|
|
|
clean_strength=args.clean_strength,
|
|
|
export_format=args.export_format,
|
|
|
embedder_model=args.embedder_model,
|
|
|
embedder_model_custom=args.embedder_model_custom,
|
|
|
f0_file=args.f0_file,
|
|
|
formant_shifting=args.formant_shifting,
|
|
|
formant_qfrency=args.formant_qfrency,
|
|
|
formant_timbre=args.formant_timbre,
|
|
|
sid=args.sid,
|
|
|
post_process=args.post_process,
|
|
|
reverb=args.reverb,
|
|
|
pitch_shift=args.pitch_shift,
|
|
|
limiter=args.limiter,
|
|
|
gain=args.gain,
|
|
|
distortion=args.distortion,
|
|
|
chorus=args.chorus,
|
|
|
bitcrush=args.bitcrush,
|
|
|
clipping=args.clipping,
|
|
|
compressor=args.compressor,
|
|
|
delay=args.delay,
|
|
|
reverb_room_size=args.reverb_room_size,
|
|
|
reverb_damping=args.reverb_damping,
|
|
|
reverb_wet_gain=args.reverb_wet_gain,
|
|
|
reverb_dry_gain=args.reverb_dry_gain,
|
|
|
reverb_width=args.reverb_width,
|
|
|
reverb_freeze_mode=args.reverb_freeze_mode,
|
|
|
pitch_shift_semitones=args.pitch_shift_semitones,
|
|
|
limiter_threshold=args.limiter_threshold,
|
|
|
limiter_release_time=args.limiter_release_time,
|
|
|
gain_db=args.gain_db,
|
|
|
distortion_gain=args.distortion_gain,
|
|
|
chorus_rate=args.chorus_rate,
|
|
|
chorus_depth=args.chorus_depth,
|
|
|
chorus_center_delay=args.chorus_center_delay,
|
|
|
chorus_feedback=args.chorus_feedback,
|
|
|
chorus_mix=args.chorus_mix,
|
|
|
bitcrush_bit_depth=args.bitcrush_bit_depth,
|
|
|
clipping_threshold=args.clipping_threshold,
|
|
|
compressor_threshold=args.compressor_threshold,
|
|
|
compressor_ratio=args.compressor_ratio,
|
|
|
compressor_attack=args.compressor_attack,
|
|
|
compressor_release=args.compressor_release,
|
|
|
delay_seconds=args.delay_seconds,
|
|
|
delay_feedback=args.delay_feedback,
|
|
|
delay_mix=args.delay_mix,
|
|
|
)
|
|
|
elif args.mode == "tts":
|
|
|
run_tts_script(
|
|
|
tts_file=args.tts_file,
|
|
|
tts_text=args.tts_text,
|
|
|
tts_voice=args.tts_voice,
|
|
|
tts_rate=args.tts_rate,
|
|
|
pitch=args.pitch,
|
|
|
index_rate=args.index_rate,
|
|
|
volume_envelope=args.volume_envelope,
|
|
|
protect=args.protect,
|
|
|
hop_length=args.hop_length,
|
|
|
f0_method=args.f0_method,
|
|
|
output_tts_path=args.output_tts_path,
|
|
|
output_rvc_path=args.output_rvc_path,
|
|
|
pth_path=args.pth_path,
|
|
|
index_path=args.index_path,
|
|
|
split_audio=args.split_audio,
|
|
|
f0_autotune=args.f0_autotune,
|
|
|
f0_autotune_strength=args.f0_autotune_strength,
|
|
|
clean_audio=args.clean_audio,
|
|
|
clean_strength=args.clean_strength,
|
|
|
export_format=args.export_format,
|
|
|
embedder_model=args.embedder_model,
|
|
|
embedder_model_custom=args.embedder_model_custom,
|
|
|
f0_file=args.f0_file,
|
|
|
)
|
|
|
elif args.mode == "preprocess":
|
|
|
run_preprocess_script(
|
|
|
model_name=args.model_name,
|
|
|
dataset_path=args.dataset_path,
|
|
|
sample_rate=args.sample_rate,
|
|
|
cpu_cores=args.cpu_cores,
|
|
|
cut_preprocess=args.cut_preprocess,
|
|
|
process_effects=args.process_effects,
|
|
|
noise_reduction=args.noise_reduction,
|
|
|
clean_strength=args.noise_reduction_strength,
|
|
|
chunk_len=args.chunk_len,
|
|
|
overlap_len=args.overlap_len,
|
|
|
)
|
|
|
elif args.mode == "extract":
|
|
|
run_extract_script(
|
|
|
model_name=args.model_name,
|
|
|
f0_method=args.f0_method,
|
|
|
hop_length=args.hop_length,
|
|
|
cpu_cores=args.cpu_cores,
|
|
|
gpu=args.gpu,
|
|
|
sample_rate=args.sample_rate,
|
|
|
embedder_model=args.embedder_model,
|
|
|
embedder_model_custom=args.embedder_model_custom,
|
|
|
include_mutes=args.include_mutes,
|
|
|
)
|
|
|
elif args.mode == "train":
|
|
|
run_train_script(
|
|
|
model_name=args.model_name,
|
|
|
save_every_epoch=args.save_every_epoch,
|
|
|
save_only_latest=args.save_only_latest,
|
|
|
save_every_weights=args.save_every_weights,
|
|
|
total_epoch=args.total_epoch,
|
|
|
sample_rate=args.sample_rate,
|
|
|
batch_size=args.batch_size,
|
|
|
gpu=args.gpu,
|
|
|
overtraining_detector=args.overtraining_detector,
|
|
|
overtraining_threshold=args.overtraining_threshold,
|
|
|
pretrained=args.pretrained,
|
|
|
custom_pretrained=args.custom_pretrained,
|
|
|
cleanup=args.cleanup,
|
|
|
index_algorithm=args.index_algorithm,
|
|
|
cache_data_in_gpu=args.cache_data_in_gpu,
|
|
|
g_pretrained_path=args.g_pretrained_path,
|
|
|
d_pretrained_path=args.d_pretrained_path,
|
|
|
vocoder=args.vocoder,
|
|
|
checkpointing=args.checkpointing,
|
|
|
)
|
|
|
elif args.mode == "index":
|
|
|
run_index_script(
|
|
|
model_name=args.model_name,
|
|
|
index_algorithm=args.index_algorithm,
|
|
|
)
|
|
|
elif args.mode == "model_information":
|
|
|
run_model_information_script(
|
|
|
pth_path=args.pth_path,
|
|
|
)
|
|
|
elif args.mode == "model_blender":
|
|
|
run_model_blender_script(
|
|
|
model_name=args.model_name,
|
|
|
pth_path_1=args.pth_path_1,
|
|
|
pth_path_2=args.pth_path_2,
|
|
|
ratio=args.ratio,
|
|
|
)
|
|
|
elif args.mode == "tensorboard":
|
|
|
run_tensorboard_script()
|
|
|
elif args.mode == "download":
|
|
|
run_download_script(
|
|
|
model_link=args.model_link,
|
|
|
)
|
|
|
elif args.mode == "prerequisites":
|
|
|
run_prerequisites_script(
|
|
|
pretraineds_hifigan=args.pretraineds_hifigan,
|
|
|
models=args.models,
|
|
|
exe=args.exe,
|
|
|
)
|
|
|
elif args.mode == "audio_analyzer":
|
|
|
run_audio_analyzer_script(
|
|
|
input_path=args.input_path,
|
|
|
)
|
|
|
except Exception as error:
|
|
|
print(f"An error occurred during execution: {error}")
|
|
|
|
|
|
import traceback
|
|
|
|
|
|
traceback.print_exc()
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
main() |