from __future__ import annotations import os import sys from enum import IntEnum, StrEnum, auto from functools import cached_property from pathlib import Path from typing import TYPE_CHECKING, Annotated, Any, TypedDict import gradio as gr import typer from huggingface_hub import snapshot_download from pydantic import BaseModel from ultimate_rvc.common import AUDIO_DIR, MODELS_DIR, TEMP_DIR from ultimate_rvc.core.generate.song_cover import get_named_song_dirs from ultimate_rvc.core.generate.speech import get_edge_tts_voice_names from ultimate_rvc.core.manage.audio import ( get_audio_datasets, get_named_audio_datasets, get_saved_output_audio, get_saved_speech_audio, ) from ultimate_rvc.core.manage.config import get_config_names, load_config from ultimate_rvc.core.manage.models import ( get_custom_embedder_model_names, get_custom_pretrained_model_names, get_training_model_names, get_voice_model_names, ) from ultimate_rvc.web.common import ( initialize_dropdowns, exception_harness, render_transfer_component, setup_transfer_event, toggle_visibility, toggle_visible_component, update_dropdowns, update_output_name, update_value, ) from ultimate_rvc.web.config.component import ( AnyComponentConfig, AudioConfig, CheckboxConfig, ComponentConfig, DropdownConfig, NumberConfig, RadioConfig, SliderConfig, TextboxConfig, ) from ultimate_rvc.web.config.tab import ( SongGenerationConfig, SpeechGenerationConfig, TrainingConfig, ) from ultimate_rvc.web.tabs.generate.speech.multi_step_generation import ( render as render_speech_multi_step_tab, ) from ultimate_rvc.web.tabs.generate.speech.one_click_generation import ( render as render_speech_one_click_tab, ) from ultimate_rvc.web.tabs.manage.audio import render as render_audio_tab from ultimate_rvc.web.tabs.manage.models import render as render_models_tab from ultimate_rvc.web.tabs.manage.settings import render as render_settings_tab if TYPE_CHECKING: import gradio as gr from typing import TYPE_CHECKING from functools import partial 
import gradio as gr from ultimate_rvc.core.common import ( INTERMEDIATE_AUDIO_BASE_DIR, OUTPUT_AUDIO_DIR, copy_file_safe, display_progress, get_file_hash, json_dump, json_load, validate_model, validate_url, ) from ultimate_rvc.core.exceptions import ( Entity, InvalidLocationError, Location, NotFoundError, NotProvidedError, UIMessage, YoutubeUrlError, ) from ultimate_rvc.core.generate.common import ( convert, get_unique_base_path, mix_audio, validate_audio_dir_exists, validate_audio_file_exists, wavify, ) from ultimate_rvc.core.generate.song_cover import ( get_named_song_dirs, get_song_cover_name, mix_song, pitch_shift, postprocess, retrieve_song, separate_audio, get_named_song_dirs, get_song_cover_name, run_pipeline, ) from ultimate_rvc.core.generate.typing_extra import ( EffectedVocalsMetaData, FileMetaData, MixedAudioType, PitchShiftMetaData, RVCAudioMetaData, SeparatedAudioMetaData, ) from ultimate_rvc.core.manage.audio import get_saved_output_audio from ultimate_rvc.typing_extra import EmbedderModel from ultimate_rvc.web.common import ( PROGRESS_BAR, exception_harness, toggle_intermediate_audio, toggle_visibility, toggle_visible_component, update_dropdowns, update_output_name, update_value, ) from ultimate_rvc.web.typing_extra import ConcurrencyId type StrPath = str | PathLike[str] type Json = Mapping[str, Json] | Sequence[Json] | str | int | float | bool | None class SegmentSize(IntEnum): """Enumeration of segment sizes for audio separation.""" SEG_64 = 64 SEG_128 = 128 SEG_256 = 256 SEG_512 = 512 SEG_1024 = 1024 SEG_2048 = 2048 SEG_4096 = 4096 class F0Method(StrEnum): """Enumeration of pitch extraction methods.""" RMVPE = "rmvpe" CREPE = "crepe" CREPE_TINY = "crepe-tiny" FCPE = "fcpe" class RVCContentType(StrEnum): """Enumeration of valid content to convert with RVC.""" VOCALS = "vocals" VOICE = "voice" SPEECH = "speech" AUDIO = "audio" class SampleRate(IntEnum): """Enumeration of supported audio sample rates.""" HZ_16000 = 16000 HZ_44100 = 44100 HZ_48000 = 
48000 HZ_96000 = 96000 HZ_192000 = 192000 class AudioExt(StrEnum): """Enumeration of supported audio file formats.""" MP3 = "mp3" WAV = "wav" FLAC = "flac" OGG = "ogg" class DeviceType(StrEnum): """Enumeration of device types for training voice models.""" AUTOMATIC = "Automatic" CPU = "CPU" GPU = "GPU" class TrainingSampleRate(StrEnum): """Enumeration of sample rates for training voice models.""" HZ_32K = "32000" HZ_40K = "40000" HZ_48K = "48000" class PretrainedSampleRate(StrEnum): """Enumeration of valid sample rates for pretrained models.""" HZ_32K = "32k" HZ_40K = "40k" HZ_44K = "44k" HZ_48K = "48k" class TrainingF0Method(StrEnum): """Enumeration of pitch extraction methods for training.""" RMVPE = "rmvpe" CREPE = "crepe" CREPE_TINY = "crepe-tiny" class AudioSplitMethod(StrEnum): """ Enumeration of methods to use for splitting audio files during dataset preprocessing. """ SKIP = "Skip" SIMPLE = "Simple" AUTOMATIC = "Automatic" class Vocoder(StrEnum): """Enumeration of vocoders for training voice models.""" HIFI_GAN = "HiFi-GAN" MRF_HIFI_GAN = "MRF HiFi-GAN" REFINE_GAN = "RefineGAN" class IndexAlgorithm(StrEnum): """Enumeration of indexing algorithms for training voice models.""" AUTO = "Auto" FAISS = "Faiss" KMEANS = "KMeans" class PretrainedType(StrEnum): """ Enumeration of the possible types of pretrained models to finetune voice models on. 
""" NONE = "None" DEFAULT = "Default" CUSTOM = "Custom" class ConcurrencyId(StrEnum): """Enumeration of possible concurrency identifiers.""" GPU = auto() class SongSourceType(StrEnum): """The type of source providing the song to generate a cover of.""" PATH = "Local or HTTP filepath" LOCAL_FILE = "Local file" CACHED_SONG = "Cached song" class SpeechSourceType(StrEnum): """The type of source providing the text to generate speech from.""" TEXT = "Text" LOCAL_FILE = "Local file" class SongTransferOption(StrEnum): """Enumeration of possible song transfer options.""" STEP_1_AUDIO = "Step 1: stem splitting" STEP_2_VOCALS = "Step 2: vocal conversion" STEP_3_VOCALS = "Step 3: vocal effect" STEP_4_INSTRUMENTALS = "Step 4: instrumentals" STEP_4_BACKUP_VOCALS = "Step 4: backup vocals" STEP_5_MAIN_VOCALS = "Step 5: main vocals" STEP_5_INSTRUMENTALS = "Step 5: instrumentals" STEP_5_BACKUP_VOCALS = "Step 5: backup vocals" class SpeechTransferOption(StrEnum): """Enumeration of possible speech transfer options.""" STEP_2_SPEECH = "Step 2: speech conversion" STEP_3_SPEECH = "Step 3: speech effect" class ComponentVisibilityKwArgs(TypedDict, total=False): """ Keyword arguments for setting component visibility. Attributes ---------- visible : bool Whether the component should be visible. value : Any The value of the component. """ visible: bool value: Any class UpdateDropdownKwArgs(TypedDict, total=False): """ Keyword arguments for updating a dropdown component. Attributes ---------- choices : DropdownChoices The updated choices for the dropdown component. value : DropdownValue The updated value for the dropdown component. """ choices: DropdownChoices value: DropdownValue class TextBoxKwArgs(TypedDict, total=False): """ Keyword arguments for updating a textbox component. Attributes ---------- value : str | None The updated value for the textbox component. placeholder : str | None The updated placeholder for the textbox component. 
""" value: str | None placeholder: str | None class UpdateAudioKwArgs(TypedDict, total=False): """ Keyword arguments for updating an audio component. Attributes ---------- value : str | None The updated value for the audio component. """ value: str | None class DatasetType(StrEnum): """The type of dataset to train a voice model.""" NEW_DATASET = "Create new dataset" EXISTING_DATASET = "Use existing dataset" class EmbedderModel(StrEnum): """Enumeration of audio embedding models.""" CONTENTVEC = "contentvec" CRUSTY = "Crusty" CUSTOM = "custom" class SeparationModel(StrEnum): """Enumeration of audio separation models.""" UVR_MDX_NET_VOC_FT = "UVR-MDX-NET-Voc_FT.onnx" UVR_MDX_NET_KARA_2 = "UVR_MDXNET_KARA_2.onnx" REVERB_HQ_BY_FOXJOY = "Reverb_HQ_By_FoxJoy.onnx" UVR_MDX_NET_Inst_HQ_3 = "UVR-MDX-NET-Inst_HQ_3.onnx" Kim_Vocal_1 = "Kim_Vocal_1.onnx" Kim_Vocal_2 = "Kim_Vocal_2.onnx" Kim_Inst = "Kim_Inst.onnx" UVR_MDX_NET_Inst_HQ_4 = "UVR-MDX-NET-Inst_HQ_4.onnx" kuielab_a_vocals = "kuielab_a_vocals.onnx" kuielab_b_vocals = "kuielab_b_vocals.onnx" kuielab_a_drums = "kuielab_a_drums.onnx" kuielab_b_drums = "kuielab_b_drums.onnx" kuielab_a_bass = "kuielab_a_bass.onnx" kuielab_b_bass = "kuielab_b_bass.onnx" kuielab_a_other = "kuielab_a_other.onnx" kuielab_b_other = "kuielab_b_other.onnx" MDX23C_8KFFT_InstVoc_HQ_2 = "MDX23C-8KFFT-InstVoc_HQ_2.ckpt" UVR_DeNoise = "UVR-DeNoise.pth" UVR_DeEcho_DeReverb = "UVR-DeEcho-DeReverb.pth" now_dir = os.getcwd() sys.path.append(now_dir) models_dir = "models" dump_path = os.path.join(now_dir, models_dir) repo_id = "lainlives/voice" hf_token = os.environ.get("HF_TOKEN") snapshot_download(repo_id=repo_id, local_dir=dump_path, token=hf_token) # if __name__ == "__main__": # start_app(share=False, ssr_mode = True) config_name = "default" # os.environ.get("URVC_CONFIG") cookiefile = os.environ.get("YT_COOKIEFILE") """ Module defining models for representing configuration settings for UI tabs. 
"""


class SongIntermediateAudioConfig(BaseModel):
    """
    Configuration settings for intermediate audio components in the
    one-click song generation tab.

    Attributes
    ----------
    song : AudioConfig
        Configuration settings for the input song audio component.
    vocals : AudioConfig
        Configuration settings for the vocals audio component.
    instrumentals : AudioConfig
        Configuration settings for the instrumentals audio component.
    main_vocals : AudioConfig
        Configuration settings for the main vocals audio component.
    backup_vocals : AudioConfig
        Configuration settings for the backup vocals audio component.
    main_vocals_dereverbed : AudioConfig
        Configuration settings for the main vocals de-reverbed audio
        component.
    main_vocals_reverb : AudioConfig
        Configuration settings for the main vocals reverb audio
        component.
    converted_vocals : AudioConfig
        Configuration settings for the converted vocals audio component.
    postprocessed_vocals : AudioConfig
        Configuration settings for the postprocessed vocals audio
        component.
    instrumentals_shifted : AudioConfig
        Configuration settings for the shifted instrumentals audio
        component.
    backup_vocals_shifted : AudioConfig
        Configuration settings for the shifted backup vocals audio
        component.
    all : list[gr.Audio]
        List of instances of all intermediate audio components.

    """

    song: AudioConfig = AudioConfig.intermediate(label="Song")
    vocals: AudioConfig = AudioConfig.intermediate(label="Vocals")
    instrumentals: AudioConfig = AudioConfig.intermediate(
        label="Instrumentals",
    )
    main_vocals: AudioConfig = AudioConfig.intermediate(
        label="Main vocals",
    )
    backup_vocals: AudioConfig = AudioConfig.intermediate(
        label="Backup vocals",
    )
    main_vocals_dereverbed: AudioConfig = AudioConfig.intermediate(
        label="De-reverbed main vocals",
    )
    main_vocals_reverb: AudioConfig = AudioConfig.intermediate(
        label="Main vocals with reverb",
    )
    converted_vocals: AudioConfig = AudioConfig.intermediate(
        label="Converted vocals",
    )
    postprocessed_vocals: AudioConfig = AudioConfig.intermediate(
        label="Postprocessed vocals",
    )
    instrumentals_shifted: AudioConfig = AudioConfig.intermediate(
        label="Pitch-shifted instrumentals",
    )
    backup_vocals_shifted: AudioConfig = AudioConfig.intermediate(
        label="Pitch-shifted backup vocals",
    )

    @property
    def all(self) -> list[gr.Audio]:
        """
        Retrieve instances of all intermediate audio components in the
        one-click song generation tab.

        Returns
        -------
        list[gr.Audio]
            List of instances of all intermediate audio components in
            the one-click song generation tab.

        """
        # NOTE we are using self.__annotations__ to get the fields in
        # the order they are defined in the class
        return [getattr(self, field).instance for field in self.__annotations__]


class OneClickSongGenerationConfig(SongGenerationConfig):
    """
    Configuration settings for the one-click song generation tab.

    Attributes
    ----------
    n_octaves : SliderConfig
        Configuration settings for an octave pitch shift slider
        component.
    n_semitones : SliderConfig
        Configuration settings for a semitone pitch shift slider
        component.
    show_intermediate_audio : CheckboxConfig
        Configuration settings for a show intermediate audio checkbox
        component.
    intermediate_audio : SongIntermediateAudioConfig
        Configuration settings for intermediate audio components.

    See Also
    --------
    SongGenerationConfig
        Parent model defining common component configuration settings
        for song generation tabs.

    """

    n_octaves: SliderConfig = SliderConfig.octave_shift(
        label="Vocal pitch shift",
        info=(
            "The number of octaves to shift the pitch of the converted vocals by. Use 1"
            " for male-to-female and -1 for vice-versa."
        ),
    )
    n_semitones: SliderConfig = SliderConfig.semitone_shift(
        label="Overall pitch shift",
        info=(
            "The number of semi-tones to shift the pitch of the converted vocals,"
            " instrumentals and backup vocals by."
        ),
    )
    show_intermediate_audio: CheckboxConfig = CheckboxConfig(
        label="Show intermediate audio",
        info="Show intermediate audio tracks produced during song cover generation.",
        value=False,
        exclude_value=True,
    )
    intermediate_audio: SongIntermediateAudioConfig = SongIntermediateAudioConfig()


class SongInputAudioConfig(BaseModel):
    """
    Configuration settings for input audio components in the multi-step
    song generation tab.

    Attributes
    ----------
    audio : AudioConfig
        Configuration settings for the input audio component.
    vocals : AudioConfig
        Configuration settings for the vocals audio component.
    converted_vocals : AudioConfig
        Configuration settings for the converted vocals audio component.
    instrumentals : AudioConfig
        Configuration settings for the instrumentals audio component.
    backup_vocals : AudioConfig
        Configuration settings for the backup vocals audio component.
    main_vocals : AudioConfig
        Configuration settings for the main vocals audio component.
    shifted_instrumentals : AudioConfig
        Configuration settings for the shifted instrumentals audio
        component.
    shifted_backup_vocals : AudioConfig
        Configuration settings for the shifted backup vocals audio
        component.
    all : list[AudioConfig]
        List of configuration settings for all input audio components
        in the multi-step song generation tab.

    """

    audio: AudioConfig = AudioConfig.input(label="Audio")
    vocals: AudioConfig = AudioConfig.input(label="Vocals")
    # NOTE(review): converted_vocals, shifted_instrumentals and
    # shifted_backup_vocals reuse the labels of other fields ("Vocals",
    # "Instrumentals", "Backup vocals"). This may be intentional (the same
    # label shown at different pipeline steps) -- confirm it is not a
    # copy-paste slip.
    converted_vocals: AudioConfig = AudioConfig.input(label="Vocals")
    instrumentals: AudioConfig = AudioConfig.input(label="Instrumentals")
    backup_vocals: AudioConfig = AudioConfig.input(label="Backup vocals")
    main_vocals: AudioConfig = AudioConfig.input(label="Main vocals")
    shifted_instrumentals: AudioConfig = AudioConfig.input(label="Instrumentals")
    shifted_backup_vocals: AudioConfig = AudioConfig.input(label="Backup vocals")

    @property
    def all(self) -> list[AudioConfig]:
        """
        Retrieve configuration settings for all input audio components
        in the multi-step song generation tab.

        Returns
        -------
        list[AudioConfig]
            List of configuration settings for all input audio
            components in the multi-step song generation tab.

        """
        return [getattr(self, field) for field in self.__annotations__]


class SongDirsConfig(BaseModel):
    """
    Configuration settings for song directory components in the
    multi-step song generation tab.

    Attributes
    ----------
    separate_audio : DropdownConfig
        Configuration settings for the song directory component for
        separating audio.
    convert_vocals : DropdownConfig
        Configuration settings for the song directory component for
        converting vocals.
    postprocess_vocals : DropdownConfig
        Configuration settings for the song directory component for
        postprocessing vocals.
    pitch_shift_background : DropdownConfig
        Configuration settings for the song directory component for
        pitch-shifting background audio.
    mix : DropdownConfig
        Configuration settings for the song directory component for
        mixing audio.
    all : list[gr.Dropdown]
        List of instances of all song directory components in the
        multi-step song generation tab.

    """

    separate_audio: DropdownConfig = DropdownConfig.song_dir()
    convert_vocals: DropdownConfig = DropdownConfig.song_dir()
    postprocess_vocals: DropdownConfig = DropdownConfig.song_dir()
    pitch_shift_background: DropdownConfig = DropdownConfig.song_dir()
    mix: DropdownConfig = DropdownConfig.song_dir()

    @property
    def all(self) -> list[gr.Dropdown]:
        """
        Retrieve instances of all song directory components in the
        multi-step song generation tab.

        Returns
        -------
        list[gr.Dropdown]
            List of instances of all song directory components in the
            multi-step song generation tab.

        """
        return [getattr(self, field).instance for field in self.__annotations__]


class MultiStepSongGenerationConfig(SongGenerationConfig):
    """
    Configuration settings for multi-step song generation tab.

    Attributes
    ----------
    separation_model : DropdownConfig
        Configuration settings for a separation model dropdown
        component.
    segment_size : RadioConfig
        Configuration settings for a segment size radio component.
    n_octaves : SliderConfig
        Configuration settings for an octave pitch shift slider
        component.
    n_semitones : SliderConfig
        Configuration settings for a semitone pitch shift slider
        component.
    n_semitones_instrumentals : SliderConfig
        Configuration settings for an instrumentals pitch shift slider
        component.
    n_semitones_backup_vocals : SliderConfig
        Configuration settings for a backup vocals pitch shift slider
        component.
    input_audio : SongInputAudioConfig
        Configuration settings for input audio components.
    song_dirs : SongDirsConfig
        Configuration settings for song directory components.

    See Also
    --------
    SongGenerationConfig
        Parent model defining common component configuration settings
        for song generation tabs.

    """

    separation_model: DropdownConfig = DropdownConfig(
        label="Separation model",
        info="The model to use for audio separation.",
        value=SeparationModel.MDX23C_8KFFT_InstVoc_HQ_2,
        choices=list(SeparationModel),
    )
    segment_size: RadioConfig = RadioConfig(
        label="Segment size",
        info=(
            "The size of the segments into which the audio is split. Using a larger"
            " size consumes more resources, but may give better results."
        ),
        value=SegmentSize.SEG_2048,
        choices=list(SegmentSize),
    )
    n_octaves: SliderConfig = SliderConfig.octave_shift(
        label="Pitch shift (octaves)",
        info=(
            "The number of octaves to pitch-shift the converted voice by. Use 1 for"
            " male-to-female and -1 for vice-versa."
        ),
    )
    n_semitones: SliderConfig = SliderConfig.semitone_shift(
        label="Pitch shift (semi-tones)",
        info=(
            "The number of semi-tones to pitch-shift the converted vocals by. Altering"
            " this slightly reduces sound quality."
        ),
    )
    n_semitones_instrumentals: SliderConfig = SliderConfig.semitone_shift(
        label="Instrumental pitch shift",
        info="The number of semi-tones to pitch-shift the instrumentals by.",
    )
    n_semitones_backup_vocals: SliderConfig = SliderConfig.semitone_shift(
        label="Backup vocal pitch shift",
        info="The number of semi-tones to pitch-shift the backup vocals by.",
    )
    input_audio: SongInputAudioConfig = SongInputAudioConfig()
    song_dirs: SongDirsConfig = SongDirsConfig()


class SpeechIntermediateAudioConfig(BaseModel):
    """
    Configuration settings for intermediate audio components in the
    one-click speech generation tab.

    Attributes
    ----------
    speech : AudioConfig
        Configuration settings for the input speech audio component.
    converted_speech : AudioConfig
        Configuration settings for the converted speech audio component.
    all : list[gr.Audio]
        List of instances of all intermediate audio components in the
        speech generation tab.

    """

    speech: AudioConfig = AudioConfig.intermediate(label="Speech")
    converted_speech: AudioConfig = AudioConfig.intermediate(label="Converted speech")

    @property
    def all(self) -> list[gr.Audio]:
        """
        Retrieve instances of all intermediate audio components in the
        speech generation tab.

        Returns
        -------
        list[gr.Audio]
            List of instances of all intermediate audio components in
            the speech generation tab.

        """
        return [getattr(self, field).instance for field in self.__annotations__]


class OneClickSpeechGenerationConfig(SpeechGenerationConfig):
    """
    Configuration settings for one-click speech generation tab.

    Attributes
    ----------
    intermediate_audio : SpeechIntermediateAudioConfig
        Configuration settings for intermediate audio components.
    show_intermediate_audio : CheckboxConfig
        Configuration settings for a show intermediate audio checkbox
        component.

    See Also
    --------
    SpeechGenerationConfig
        Parent model defining common component configuration settings
        for speech generation tabs.

    """

    intermediate_audio: SpeechIntermediateAudioConfig = SpeechIntermediateAudioConfig()
    show_intermediate_audio: CheckboxConfig = CheckboxConfig(
        label="Show intermediate audio",
        info="Show intermediate audio tracks produced during speech generation.",
        value=False,
        exclude_value=True,
    )


class SpeechInputAudioConfig(BaseModel):
    """
    Configuration settings for input audio components in the multi-step
    speech generation tab.

    Attributes
    ----------
    speech : AudioConfig
        Configuration settings for the input speech audio component.
    converted_speech : AudioConfig
        Configuration settings for the converted speech audio component.
    all : list[AudioConfig]
        List of configuration settings for all input audio components
        in the multi-step speech generation tab.

    """

    # NOTE(review): labels are passed positionally here but by keyword
    # everywhere else in this module -- presumably AudioConfig.input takes the
    # label as its first parameter; confirm.
    speech: AudioConfig = AudioConfig.input("Speech")
    converted_speech: AudioConfig = AudioConfig.input("Converted speech")

    @property
    def all(self) -> list[AudioConfig]:
        """
        Retrieve configuration settings for all input audio components
        in the multi-step speech generation tab.

        Returns
        -------
        list[AudioConfig]
            List of configuration settings for all input audio
            components in the multi-step speech generation tab.

        """
        return [getattr(self, field) for field in self.__annotations__]


class MultiStepSpeechGenerationConfig(SpeechGenerationConfig):
    """
    Configuration settings for the multi-step speech generation tab.

    Attributes
    ----------
    input_audio : SpeechInputAudioConfig
        Configuration settings for input audio components.

    See Also
    --------
    SpeechGenerationConfig
        Parent model defining common component configuration settings
        for speech generation tabs.

    """

    input_audio: SpeechInputAudioConfig = SpeechInputAudioConfig()


# NOTE(review): at this point TrainingConfig refers to the class imported from
# ultimate_rvc.web.config.tab; a local class of the same name is defined
# further below and shadows the import for later code -- confirm which base
# class is intended here.
class MultiStepTrainingConfig(TrainingConfig):
    """Configuration settings for multi-step training tab."""


class ModelManagementConfig(BaseModel):
    """
    Configuration settings for model management tab.

    Attributes
    ----------
    voices : DropdownConfig
        Configuration settings for delete voice models dropdown
        component.
    embedders : DropdownConfig
        Configuration settings for delete embedder models dropdown
        component.
    pretraineds : DropdownConfig
        Configuration settings for delete pretrained models dropdown
        component.
    traineds : DropdownConfig
        Configuration settings for delete training models dropdown
        component.
    dummy_checkbox : CheckboxConfig
        Configuration settings for a dummy checkbox component.

    """

    voices: DropdownConfig = DropdownConfig.multi_delete(
        label="Voice models",
        info="Select one or more voice models to delete.",
    )
    embedders: DropdownConfig = DropdownConfig.multi_delete(
        label="Custom embedder models",
        info="Select one or more embedder models to delete.",
    )
    pretraineds: DropdownConfig = DropdownConfig.multi_delete(
        label="Custom pretrained models",
        info="Select one or more pretrained models to delete.",
    )
    traineds: DropdownConfig = DropdownConfig.multi_delete(
        label="Training models",
        info="Select one or more training models to delete.",
    )
    dummy_checkbox: CheckboxConfig = CheckboxConfig(
        value=False,
        visible=False,
        exclude_value=True,
    )


class AudioManagementConfig(BaseModel):
    """
    Configuration settings for audio management tab.

    Attributes
    ----------
    intermediate : DropdownConfig
        Configuration settings for delete intermediate audio files
        dropdown component
    speech : DropdownConfig
        Configuration settings for delete speech audio files dropdown
        component.
    output : DropdownConfig
        Configuration settings for delete output audio files dropdown
        component.
    dataset : DropdownConfig
        Configuration settings for delete dataset audio files dropdown
        component.
    dummy_checkbox : CheckboxConfig
        Configuration settings for a dummy checkbox component.

    """

    intermediate: DropdownConfig = DropdownConfig.multi_delete(
        label="Song directories",
        info=(
            "Select one or more song directories containing intermediate audio files to"
            " delete."
        ),
    )
    speech: DropdownConfig = DropdownConfig.multi_delete(
        label="Speech audio files",
        info="Select one or more speech audio files to delete.",
    )
    output: DropdownConfig = DropdownConfig.multi_delete(
        label="Output audio files",
        info="Select one or more output audio files to delete.",
    )
    dataset: DropdownConfig = DropdownConfig.multi_delete(
        label="Dataset audio files",
        info="Select one or more datasets containing audio files to delete.",
    )
    dummy_checkbox: CheckboxConfig = CheckboxConfig(
        value=False,
        visible=False,
        exclude_value=True,
    )


class SettingsManagementConfig(BaseModel):
    """
    Configuration settings for settings management tab.

    Attributes
    ----------
    load_config_name : DropdownConfig
        Configuration settings for a load-configuration-name dropdown
        component.
    delete_config_names : DropdownConfig
        Configuration settings for a delete-configuration-names
        dropdown component.
    dummy_checkbox : CheckboxConfig
        Configuration settings for a dummy checkbox component.

    """

    load_config_name: DropdownConfig = DropdownConfig(
        label="Configuration name",
        info="The name of a configuration to load UI settings from",
        value=None,
        render=False,
        exclude_value=True,
    )
    delete_config_names: DropdownConfig = DropdownConfig.multi_delete(
        label="Configuration names",
        info="Select the name of one or more configurations to delete",
    )
    dummy_checkbox: CheckboxConfig = CheckboxConfig(
        value=False,
        visible=False,
        exclude_value=True,
    )


class TotalSongGenerationConfig(BaseModel):
    """
    All configuration settings for song generation tabs.

    Attributes
    ----------
    one_click : OneClickSongGenerationConfig
        Configuration settings for the one-click song generation tab.
    multi_step : MultiStepSongGenerationConfig
        Configuration settings for the multi-step song generation tab.

    """

    one_click: OneClickSongGenerationConfig = OneClickSongGenerationConfig()
    multi_step: MultiStepSongGenerationConfig = MultiStepSongGenerationConfig()


class TotalSpeechGenerationConfig(BaseModel):
    """
    All configuration settings for speech generation tabs.

    Attributes
    ----------
    one_click : OneClickSpeechGenerationConfig
        Configuration settings for the one-click speech generation tab.
    multi_step : MultiStepSpeechGenerationConfig
        Configuration settings for the multi-step speech generation tab.

    """

    one_click: OneClickSpeechGenerationConfig = OneClickSpeechGenerationConfig()
    multi_step: MultiStepSpeechGenerationConfig = MultiStepSpeechGenerationConfig()


class TotalTrainingConfig(BaseModel):
    """
    All configuration settings for training tabs.

    Attributes
    ----------
    multi_step : MultiStepTrainingConfig
        Configuration settings for the multi-step training tab.

    """

    multi_step: MultiStepTrainingConfig = MultiStepTrainingConfig()


class TotalManagementConfig(BaseModel):
    """
    All configuration settings for management tabs.

    Attributes
    ----------
    model : ModelManagementConfig
        Configuration settings for the model management tab.
    audio : AudioManagementConfig
        Configuration settings for the audio management tab.
    settings : SettingsManagementConfig
        Configuration settings for the settings management tab.

    """

    model: ModelManagementConfig = ModelManagementConfig()
    audio: AudioManagementConfig = AudioManagementConfig()
    settings: SettingsManagementConfig = SettingsManagementConfig()


class TotalConfig(BaseModel):
    """
    All configuration settings for the Ultimate RVC app.

    Attributes
    ----------
    song : TotalSongGenerationConfig
        Configuration settings for song generation tabs.
    speech : TotalSpeechGenerationConfig
        Configuration settings for speech generation tabs.
    training : TotalTrainingConfig
        Configuration settings for training tabs.
    management : TotalManagementConfig
        Configuration settings for management tabs.

    """

    song: TotalSongGenerationConfig = TotalSongGenerationConfig()
    speech: TotalSpeechGenerationConfig = TotalSpeechGenerationConfig()
    training: TotalTrainingConfig = TotalTrainingConfig()
    management: TotalManagementConfig = TotalManagementConfig()

    @cached_property
    def all(self) -> list[AnyComponentConfig]:
        """
        Recursively collect those component configuration models nested
        within the current model instance, which have values that are
        not excluded.

        Returns
        -------
        list[AnyComponentConfig]
            A list of component configuration models found within the
            current model instance, which have values that are not
            excluded.

        """

        # Depth-first walk: pydantic models iterate as (name, value)
        # pairs, so nested config models are recursed into and leaf
        # component configs are collected unless their value is excluded.
        def _collect(model: BaseModel) -> list[AnyComponentConfig]:
            component_configs: list[Any] = []
            for _, value in model:
                if isinstance(value, ComponentConfig):
                    if not value.exclude_value:
                        component_configs.append(value)
                elif isinstance(value, BaseModel):
                    component_configs.extend(_collect(value))
            return component_configs

        return _collect(self)


class BaseTabConfig(BaseModel):
    """
    Base model defining common component configuration settings for UI
    tabs.

    Attributes
    ----------
    embedder_model : DropdownConfig
        Configuration settings for an embedder model dropdown
        component.
    custom_embedder_model : DropdownConfig
        Configuration settings for a custom embedder model dropdown
        component.

    """

    embedder_model: DropdownConfig = DropdownConfig(
        label="Embedder model",
        info="The model to use for generating speaker embeddings.",
        value=EmbedderModel.CONTENTVEC,
        choices=list(EmbedderModel),
        exclude_value=True,
    )
    custom_embedder_model: DropdownConfig = DropdownConfig(
        label="Custom embedder model",
        info="Select a custom embedder model from the dropdown.",
        value=None,
        visible=False,
        render=False,
        exclude_value=True,
    )


# NOTE(review): this local TrainingConfig shadows the TrainingConfig imported
# from ultimate_rvc.web.config.tab; its class body continues beyond this chunk
# of the file, so the docstring below is left open here on purpose.
class TrainingConfig(BaseTabConfig):
    """
    Common component configuration settings for training tabs.

    Attributes
    ----------
    dataset_type : DropdownConfig
        Configuration settings for a dataset type dropdown component.
    dataset : DropdownConfig
        Configuration settings for a dataset dropdown component.
    dataset_name : TextboxConfig
        Configuration settings for a dataset name textbox component.
    preprocess_model : DropdownConfig
        Configuration settings for a model name dropdown component for
        audio preprocessing.
    sample_rate : DropdownConfig
        Configuration settings for a sample rate dropdown component.
    filter_audio : CheckboxConfig
        Configuration settings for a filter audio checkbox component.
clean_audio : CheckboxConfig Configuration settings for a clean audio checkbox component. clean_strength : SliderConfig Configuration settings for a clean strength slider component. split_method : DropdownConfig Configuration settings for an audio splitting method dropdown component. chunk_len : SliderConfig Configuration settings for a chunk length slider component. overlap_len : SliderConfig Configuration settings for an overlap length slider component. preprocess_cores : SliderConfig Configuration settings for a CPU cores slider component for preprocessing. extract_model : DropdownConfig Configuration settings for a model name dropdown component for feature extraction. f0_method : DropdownConfig Configuration settings for an F0 method dropdown component. hop_length : SliderConfig Configuration settings for a hop length slider component. include_mutes : SliderConfig Configuration settings for an include mutes slider component. extract_cores : SliderConfig Configuration settings for a CPU cores slider component for feature extraction. extraction_acceleration : HardwareAccelerationConfig Configuration settings for a hardware acceleration component for feature extraction. extraction_gpus : DropdownConfig Configuration settings for a GPU dropdown compoennt for feature extraction. train_model : DropdownConfig Configuration settings for a model name dropdown component for training. num_epochs : SliderConfig Configuration settings for a number of epochs slider component. batch_size : SliderConfig Configuration settings for a batch size slider component. detect_overtraining : CheckboxConfig Configuration settings for a detect overtraining checkbox component. overtraining_threshold : SliderConfig Configuration settings for an overtraining threshold slider component. vocoder : DropdownConfig Configuration settings for a vocoder dropdown component. index_algorithm : DropdownConfig Configuration settings for an index algorithm dropdown component. 
pretrained_type : DropdownConfig Configuration settings for a pretrained model type dropdown component. custom_pretrained_model : DropdownConfig Configuration settings for a custom pretrained model dropdown component. save_interval : SliderConfig Configuration settings for a save-interval slider component. save_all_checkpoints : CheckboxConfig Configuration settings for a save-all-checkpoints checkbox component. save_all_weights : CheckboxConfig Configuration settings for a save-all-weights checkbox component. clear_saved_data : CheckboxConfig Configuration settings for a clear-saved-data checkbox component. upload_model : CheckboxConfig Configuration settings for an upload voice model checkbox component. upload_name : TextboxConfig Configuration settings for an upload name textbox component. training_acceleration : HardwareAccelerationConfig Configuration settings for a hardware acceleration component for training. training_gpus : DropdownConfig Configuration settings for a GPU dropdown component for training. preload_dataset : CheckboxConfig Configuration settings for a preload dataset checkbox component. reduce_memory_usage : CheckboxConfig Configuration settings for a reduce-memory-usage checkbox component. See Also -------- BaseTabConfig Parent model defining common component configuration settings for UI tabs. """ dataset_type: DropdownConfig = DropdownConfig( label="Dataset type", info="Select the type of dataset to preprocess.", value=DatasetType.NEW_DATASET, choices=list(DatasetType), exclude_value=True, ) dataset: DropdownConfig = DropdownConfig( label="Dataset path", info=( "The path to an existing dataset. Either select a path to a previously" " created dataset or provide a path to an external dataset." ), value=None, allow_custom_value=True, visible=False, render=False, exclude_value=True, ) dataset_name: TextboxConfig = TextboxConfig( label="Dataset name", info=( "The name of the new dataset. 
If the dataset already exists, the provided" " audio files will be added to it." ), value="My dataset", exclude_value=True, ) preprocess_model: DropdownConfig = DropdownConfig( label="Model name", info=( "Name of the model to preprocess the given dataset for. Either select an" " existing model from the dropdown or provide the name of a new model." ), value="My model", allow_custom_value=True, render=False, exclude_value=True, ) sample_rate: DropdownConfig = DropdownConfig( label="Sample rate", info="Target sample rate for the audio files in the provided dataset.", value=TrainingSampleRate.HZ_40K, choices=list(TrainingSampleRate), ) filter_audio: CheckboxConfig = CheckboxConfig( label="Filter audio", info=( "Whether to remove low-frequency sounds from the audio files in the" " provided dataset by applying a high-pass butterworth filter.

" ), value=True, ) clean_audio: CheckboxConfig = CheckboxConfig( label="Clean audio", info=( "Whether to clean the audio files in the provided dataset using noise" " reduction algorithms.


" ), value=False, exclude_value=True, ) clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False) split_method: DropdownConfig = DropdownConfig( label="Audio splitting method", info=( "The method to use for splitting the audio files in the provided dataset." " Use the `Skip` method to skip splitting if the audio files are already" " split. Use the `Simple` method if excessive silence has already been" " removed from the audio files. Use the `Automatic` method for automatic" " silence detection and splitting around it." ), value=AudioSplitMethod.AUTOMATIC, choices=list(AudioSplitMethod), exclude_value=True, ) chunk_len: SliderConfig = SliderConfig( label="Chunk length", info="Length of split audio chunks.", value=3.0, minimum=0.5, maximum=5.0, step=0.1, visible=False, ) overlap_len: SliderConfig = SliderConfig( label="Overlap length", info="Length of overlap between split audio chunks.", value=0.3, minimum=0.0, maximum=0.4, step=0.1, visible=False, ) preprocess_cores: SliderConfig = SliderConfig.cpu_cores() extract_model: DropdownConfig = DropdownConfig( label="Model name", info=( "Name of the model with an associated preprocessed dataset to extract" " training features from. When a new dataset is preprocessed, its" " associated model is selected by default." ), value=None, render=False, exclude_value=True, ) f0_method: DropdownConfig = DropdownConfig( label="F0 method", info="The method to use for extracting pitch features.", value=TrainingF0Method.RMVPE, choices=list(TrainingF0Method), exclude_value=True, ) hop_length: SliderConfig = SliderConfig.hop_length( label="Hop length", info="The hop length to use for extracting pitch features.

", visible=False, ) include_mutes: SliderConfig = SliderConfig( label="Include mutes", info=( "The number of mute audio files to include in the generated training file" " list. Adding silent files enables the training model to handle pure" " silence in inferred audio files. If the preprocessed audio dataset" " already contains segments of pure silence, set this to 0." ), value=0, minimum=0, maximum=10, step=1, ) extraction_cores: SliderConfig = SliderConfig.cpu_cores() extraction_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration() extraction_gpus: DropdownConfig = DropdownConfig.gpu() train_model: DropdownConfig = DropdownConfig( label="Model name", info=( "Name of the model to train. When training features are extracted for a new" " model, its name is selected by default." ), value=None, render=False, exclude_value=True, ) num_epochs: SliderConfig = SliderConfig( label="Number of epochs", info=( "The number of epochs to train the voice model. A higher number can improve" " voice model performance but may lead to overtraining." ), value=500, minimum=1, maximum=5000, step=1, ) batch_size: SliderConfig = SliderConfig( label="Batch size", info=( "The number of samples in each training batch. It is advisable to align" " this value with the available VRAM of your GPU." ), value=16, minimum=1, maximum=128, step=1, ) detect_overtraining: CheckboxConfig = CheckboxConfig( label="Detect overtraining", info=( "Whether to detect overtraining to prevent the voice model from learning" " the training data too well and losing the ability to generalize to new" " data." ), value=True, exclude_value=True, ) overtraining_threshold: SliderConfig = SliderConfig( label="Overtraining threshold", info=( "The maximum number of epochs to continue training without any observed" " improvement in voice model performance." 
), value=500, minimum=1, maximum=1000, visible=False, ) vocoder: DropdownConfig = DropdownConfig( label="Vocoder", info=( "The vocoder to use for audio synthesis during training. HiFi-GAN provides" " basic audio fidelity, while RefineGAN provides the highest audio" " fidelity." ), value=Vocoder.HIFI_GAN, choices=list(Vocoder), ) index_algorithm: DropdownConfig = DropdownConfig( label="Index algorithm", info=( "The method to use for generating an index file for the trained voice" " model. `KMeans` is particularly useful for large datasets." ), value=IndexAlgorithm.AUTO, choices=list(IndexAlgorithm), ) pretrained_type: DropdownConfig = DropdownConfig( label="Pretrained model type", info=( "The type of pretrained model to finetune the voice model on. `None` will" " train the voice model from scratch, while `Default` will use a pretrained" " model tailored to the specific voice model architecture. `Custom` will" " use a custom pretrained that you provide." ), value=PretrainedType.DEFAULT, choices=list(PretrainedType), exclude_value=True, ) custom_pretrained_model: DropdownConfig = DropdownConfig( label="Custom pretrained model", info="Select a custom pretrained model to finetune from the dropdown.", value=None, visible=False, render=False, exclude_value=True, ) save_interval: SliderConfig = SliderConfig( label="Save interval", info=( "The epoch interval at which to to save voice model weights and" " checkpoints. The best model weights are always saved regardless of this" " setting." ), value=10, minimum=1, maximum=100, step=1, ) save_all_checkpoints: CheckboxConfig = CheckboxConfig( label="Save all checkpoints", info=( "Whether to save a unique checkpoint at each save interval. If not enabled," " only the latest checkpoint will be saved at each interval." ), value=True, ) save_all_weights: CheckboxConfig = CheckboxConfig( label="Save all weights", info=( "Whether to save unique voice model weights at each save interval. 
If not" " enabled, only the best voice model weights will be saved." ), value=True, ) clear_saved_data: CheckboxConfig = CheckboxConfig( label="Clear saved data", info=( "Whether to delete any existing training data associated with the voice" " model before training commences. Enable this setting only if you are" " training a new voice model from scratch or restarting training." ), value=False, ) upload_model: CheckboxConfig = CheckboxConfig( label="Upload voice model", info=( "Whether to automatically upload the trained voice model so that it can be" " used for generation tasks within the Ultimate RVC app." ), value=False, exclude_value=True, ) upload_name: TextboxConfig = TextboxConfig( label="Upload name", info="The name to give the uploaded voice model.", value=None, visible=False, exclude_value=True, ) training_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration() training_gpus: DropdownConfig = DropdownConfig.gpu() preload_dataset: CheckboxConfig = CheckboxConfig( label="Preload dataset", info=( "Whether to preload all training data into GPU memory. This can improve" " training speed but requires a lot of VRAM.

" ), value=True, ) reduce_memory_usage: CheckboxConfig = CheckboxConfig( label="Reduce memory usage", info=( "Whether to reduce VRAM usage at the cost of slower training speed by" " enabling activation checkpointing. This is useful for GPUs with limited" " memory (e.g., <6GB VRAM) or when training with a batch size larger than" " what your GPU can normally accommodate." ), value=False, ) class GenerationConfig(BaseTabConfig): """ Common component configuration settings for generation tabs. voice_model : DropdownConfig Configuration settings for a voice model dropdown component. f0_methods : DropdownConfig Configuration settings for a pitch extraction algorithms dropdown component. index_rate : SliderConfig Configuration settings for an index rate slider component. rms_mix_rate : SliderConfig Configuration settings for a RMS mix rate slider component. protect_rate : SliderConfig Configuration settings for a protect rate slider component. split_voice : CheckboxConfig Configuration settings for a split voice checkbox component. autotune_voice: CheckboxConfig Configuration settings for an autotune voice checkbox component. autotune_strength: SliderConfig Configuration settings for an autotune strength slider component. sid : NumberConfig Configuration settings for a speaker ID number component. output_sr : DropdownConfig Configuration settings for an output sample rate dropdown component. output_format : DropdownConfig Configuration settings for an output format dropdown component. output_name : TextboxConfig Configuration settings for an output name textbox component. See Also -------- BaseTabConfig Parent model defining common component configuration settings for UI tabs. 
""" voice_model: DropdownConfig = DropdownConfig( label="Voice model", info="Select a model to use for voice conversion.", value=None, render=False, exclude_value=True, ) f0_methods: DropdownConfig = DropdownConfig( label="Pitch extraction algorithm(s)", info=( "If more than one method is selected, then the median of the pitch values" " extracted by each method is used. RMVPE is recommended for most cases and" " is the default when no method is selected." ), value=[F0Method.RMVPE], choices=list(F0Method), multiselect=True, ) index_rate: SliderConfig = SliderConfig( label="Index rate", info=( "Increase to bias the conversion towards the accent of the voice model." " Decrease to potentially reduce artifacts coming from the voice" " model.


" ), value=0.3, minimum=0.0, maximum=1.0, ) rms_mix_rate: SliderConfig = SliderConfig( label="RMS mix rate", info=( "How much to mimic the loudness (0) of the input voice or a fixed loudness" " (1). A value of 1 is recommended for most cases.

" ), value=1.0, minimum=0.0, maximum=1.0, ) protect_rate: SliderConfig = SliderConfig( label="Protect rate", info=( "Controls the extent to which consonants and breathing sounds are protected" " from artifacts. A higher value offers more protection but may worsen the" " indexing effect.

" ), value=0.33, minimum=0.0, maximum=0.5, ) hop_length: SliderConfig = SliderConfig.hop_length( label="Hop length", info=( "How often the CREPE-based pitch extraction method checks for pitch changes" " measured in milliseconds. Lower values lead to longer conversion times" " and a higher risk of voice cracks, but better pitch accuracy." ), visible=True, ) split_voice: CheckboxConfig = CheckboxConfig( label="Split input voice", info=( "Whether to split the input voice track into smaller segments before" " converting it. This can improve output quality for longer voice tracks." ), value=False, ) autotune_voice: CheckboxConfig = CheckboxConfig( label="Autotune converted voice", info="Whether to apply autotune to the converted voice.

", value=False, exclude_value=True, ) autotune_strength: SliderConfig = SliderConfig( label="Autotune intensity", info=( "Higher values result in stronger snapping to the chromatic grid and" " artifacting." ), value=1.0, minimum=0.0, maximum=1.0, visible=False, ) sid: NumberConfig = NumberConfig( label="Speaker ID", info="Speaker ID for multi-speaker-models.", value=0, precision=0, ) output_sr: DropdownConfig = DropdownConfig( label="Output sample rate", info="The sample rate of the mixed output track.", value=SampleRate.HZ_44100, choices=list(SampleRate), ) output_format: DropdownConfig = DropdownConfig( label="Output format", info="The audio format of the mixed output track.", value=AudioExt.MP3, choices=list(AudioExt), ) output_name: TextboxConfig = TextboxConfig( label="Output name", info="If no name is provided, a suitable name will be generated automatically.", value=None, placeholder="Ultimate RVC output", exclude_value=True, ) class SongGenerationConfig(GenerationConfig): """ Common component configuration settings for song generation tabs. Attributes ---------- source_type : DropdownConfig Configuration settings for a source type dropdown component. source : TextboxConfig Configuration settings for an input source textbox component. cached_song : DropdownConfig Configuration settings for a cached song dropdown component. clean_strength : SliderConfig Configuration settings for a clean strength slider component. clean_voice : CheckboxConfig Configuration settings for a clean voice checkbox component. room_size : SliderConfig Configuration settings for a room size slider component. wet_level : SliderConfig Configuration settings for a wetness level slider component. dry_level : SliderConfig Configuration settings for a dryness level slider component. damping : SliderConfig Configuration settings for a damping level slider component. main_gain : SliderConfig Configuration settings for a main gain slider component. 
inst_gain : SliderConfig Configuration settings for an instrumentals gain slider component. backup_gain : SliderConfig Configuration settings for a backup vocals gain slider component. See Also -------- GenerationConfig Parent model defining common component configuration settings for song generation tabs. """ source_type: DropdownConfig = DropdownConfig( label="Source type", info="The type of source to retrieve a song from.", value=SongSourceType.LOCAL_FILE, choices=list(SongSourceType), type="index", exclude_value=True, ) source: TextboxConfig = TextboxConfig( label="Source", info="Local (to the server) filepath or http link. Youtube probably wont work but most other sites still do.", value=None, exclude_value=True, ) cached_song: DropdownConfig = DropdownConfig( label="Source", info="Select a song from the list of cached songs.", value=None, visible=False, render=False, exclude_value=True, ) clean_voice: CheckboxConfig = CheckboxConfig( label="Clean converted voice", info=( "Whether to clean the converted voice using noise reduction" " algorithms.

" ), value=False, exclude_value=True, ) clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False) room_size: SliderConfig = SliderConfig( label="Room size", info=( "Size of the room which reverb effect simulates. Increase for longer reverb" " time." ), value=0.15, minimum=0.0, maximum=1.0, ) wet_level: SliderConfig = SliderConfig( label="Wetness level", info="Loudness of converted vocals with reverb effect applied.", value=0.2, minimum=0.0, maximum=1.0, ) dry_level: SliderConfig = SliderConfig( label="Dryness level", info="Loudness of converted vocals without reverb effect applied.", value=0.8, minimum=0.0, maximum=1.0, ) damping: SliderConfig = SliderConfig( label="Damping level", info="Absorption of high frequencies in reverb effect.", value=0.7, minimum=0.0, maximum=1.0, ) main_gain: SliderConfig = SliderConfig.gain( label="Main gain", info="The gain to apply to the main vocals.", ) inst_gain: SliderConfig = SliderConfig.gain( label="Instrumentals gain", info="The gain to apply to the instrumentals.", ) backup_gain: SliderConfig = SliderConfig.gain( label="Backup gain", info="The gain to apply to the backup vocals.", ) class SpeechGenerationConfig(GenerationConfig): """ Common component configuration settings for speech generation tabs. Attributes ---------- source_type : DropdownConfig Configuration settings for a source type dropdown component. source : TextboxConfig Configuration settings for an input source textbox component. edge_tts_voice : DropdownConfig Configuration settings for an Edge TTS voice dropdown component. n_octaves : SliderConfig Configuration settings for an octave pitch shift slider component. n_semitones : SliderConfig Configuration settings for a semitone pitch shift slider component. tts_pitch_shift : SliderConfig Configuration settings for a TTS pitch shift slider component. tts_speed_change : SliderConfig Configuration settings for a TTS speed change slider component. 
tts_volume_change : SliderConfig Configuration settings for a TTS volume change slider component. clean_voice : CheckboxConfig Configuration settings for a clean voice checkbox component. clean_strength : SliderConfig Configuration settings for a clean strength slider component. output_gain : GainSliderConfig Configuration settings for an output gain slider component. See Also -------- GenerationConfig Parent model defining common component configuration settings for generation tabs. """ source_type: DropdownConfig = DropdownConfig( label="Source type", info="The type of source to generate speech from.", value=SpeechSourceType.TEXT, choices=list(SpeechSourceType), type="index", exclude_value=True, ) source: TextboxConfig = TextboxConfig( label="Source", info="Text to generate speech from", value=None, exclude_value=True, ) edge_tts_voice: DropdownConfig = DropdownConfig( label="Edge TTS voice", info="Select a voice to use for text to speech conversion.", value=None, render=False, exclude_value=True, ) n_octaves: SliderConfig = SliderConfig.octave_shift( label="Octave shift", info=( "The number of octaves to pitch-shift the converted speech by. Use 1 for" " male-to-female and -1 for vice-versa." ), ) n_semitones: SliderConfig = SliderConfig.semitone_shift( label="Semitone shift", info="The number of semi-tones to pitch-shift the converted speech by.", ) tts_pitch_shift: SliderConfig = SliderConfig( label="Edge TTS pitch shift", info=( "The number of hertz to shift the pitch of the speech generated by Edge" " TTS." 
), value=0, minimum=-100, maximum=100, step=1, ) tts_speed_change: SliderConfig = SliderConfig( label="TTS speed change", info="The percentual change to the speed of the speech generated by Edge TTS.", value=0, minimum=-50, maximum=100, step=1, ) tts_volume_change: SliderConfig = SliderConfig( label="TTS volume change", info="The percentual change to the volume of the speech generated by Edge TTS.", value=0, minimum=-100, maximum=100, step=1, ) clean_voice: CheckboxConfig = CheckboxConfig( label="Clean converted voice", info=( "Whether to clean the converted voice using noise reduction" " algorithms.

" ), value=True, exclude_value=True, ) clean_strength: SliderConfig = SliderConfig.clean_strength(visible=True) output_gain: SliderConfig = SliderConfig.gain( label="Output gain", info="The gain to apply to the converted speech.

", ) total_config = load_config(config_name, TotalConfig) if config_name else TotalConfig() def render_song_cover_multi_step_tab( total_config: TotalConfig, cookiefile: str | None = None ) -> None: """ Render "Generate song cover - multi-step generation" tab. Parameters ---------- total_config : TotalConfig Model containing all component configuration settings for the Ultimate RVC web UI. cookiefile : str, optional The path to a file containing cookies to use when downloading audio from Youtube. """ tab_config = total_config.song.multi_step for input_track in tab_config.input_audio.all: input_track.instantiate() with gr.Tab("Multi-step"): _render_step_0(total_config, cookiefile=cookiefile) _render_step_1(tab_config) _render_step_2(tab_config) _render_step_3(tab_config) _render_step_4(tab_config) _render_step_5(total_config, tab_config) def _render_step_0(total_config: TotalConfig, cookiefile: str | None) -> None: tab_config = total_config.song.multi_step current_song_dir = gr.State(None) with gr.Accordion("Step 0: song retrieval", open=True): gr.Markdown("") with gr.Row(): with gr.Column(): tab_config.source_type.instantiate() with gr.Column(): tab_config.source.instantiate() local_file = gr.Audio( label="Source", type="filepath", visible=False, waveform_options=gr.WaveformOptions(show_recording_waveform=True), ) tab_config.cached_song.instance.render() tab_config.source_type.instance.input( partial(toggle_visible_component, 3), inputs=tab_config.source_type.instance, outputs=[ tab_config.source.instance, local_file, tab_config.cached_song.instance, ], show_progress="hidden", ) local_file.change( update_value, inputs=local_file, outputs=tab_config.source.instance, show_progress="hidden", ) tab_config.cached_song.instance.input( update_value, inputs=tab_config.cached_song.instance, outputs=tab_config.source.instance, show_progress="hidden", ) with gr.Accordion("Options", open=False): song_transfer = _render_song_transfer( [SongTransferOption.STEP_1_AUDIO], "Song", ) 
with gr.Row(): retrieve_song_reset_btn = gr.Button("Reset options") retrieve_song_btn = gr.Button("Retrieve song", variant="primary") song_transfer_btn = gr.Button("Transfer song") song_output = gr.Audio( label="Song", type="filepath", interactive=False, waveform_options=gr.WaveformOptions(show_recording_waveform=True), ) retrieve_song_reset_btn.click( lambda: gr.Dropdown(value=[SongTransferOption.STEP_1_AUDIO]), outputs=song_transfer, show_progress="hidden", ) retrieve_song_btn.click( partial( exception_harness( retrieve_song, info_msg="Song retrieved successfully!", ), cookiefile=cookiefile, ), inputs=tab_config.source.instance, outputs=[song_output, current_song_dir], ).then( partial( update_dropdowns, get_named_song_dirs, len(tab_config.song_dirs.all) + 2, value_indices=range(len(tab_config.song_dirs.all)), ), inputs=current_song_dir, outputs=[ *tab_config.song_dirs.all, tab_config.cached_song.instance, total_config.song.one_click.cached_song.instance, ], show_progress="hidden", ).then( partial(update_dropdowns, get_named_song_dirs, 1, [], [0]), outputs=total_config.management.audio.intermediate.instance, show_progress="hidden", ) setup_transfer_event( song_transfer_btn, song_transfer, song_output, tab_config.input_audio.all, ) def _render_step_1(tab_config: MultiStepSongGenerationConfig) -> None: with gr.Accordion("Step 1: vocal separation", open=False): tab_config.input_audio.audio.instance.render() tab_config.song_dirs.separate_audio.instance.render() with gr.Accordion("Options", open=False): with gr.Row(): tab_config.separation_model.instantiate() tab_config.segment_size.instantiate() with gr.Row(): primary_stem_transfer = _render_song_transfer( [SongTransferOption.STEP_2_VOCALS], "Primary stem", ) secondary_stem_transfer = _render_song_transfer( [SongTransferOption.STEP_4_INSTRUMENTALS], "Secondary stem", ) with gr.Row(): separate_audio_reset_btn = gr.Button("Reset options") separate_vocals_btn = gr.Button("Separate vocals", variant="primary") with 
gr.Row(): primary_stem_transfer_btn = gr.Button("Transfer primary stem") secondary_stem_transfer_btn = gr.Button("Transfer secondary stem") with gr.Row(): primary_stem_output = gr.Audio( label="Primary stem", type="filepath", interactive=False, waveform_options=gr.WaveformOptions(show_recording_waveform=True), ) secondary_stem_output = gr.Audio( label="Secondary stem", type="filepath", interactive=False, waveform_options=gr.WaveformOptions(show_recording_waveform=True), ) separate_audio_reset_btn.click( lambda: [ tab_config.separation_model.value, tab_config.segment_size.value, gr.Dropdown(value=[SongTransferOption.STEP_2_VOCALS]), gr.Dropdown(value=[SongTransferOption.STEP_4_INSTRUMENTALS]), ], outputs=[ tab_config.separation_model.instance, tab_config.segment_size.instance, primary_stem_transfer, secondary_stem_transfer, ], show_progress="hidden", ) separate_vocals_btn.click( exception_harness( separate_audio, info_msg="Vocals separated successfully!", ), inputs=[ tab_config.input_audio.audio.instance, tab_config.song_dirs.separate_audio.instance, tab_config.separation_model.instance, tab_config.segment_size.instance, ], outputs=[primary_stem_output, secondary_stem_output], concurrency_limit=1, concurrency_id=ConcurrencyId.GPU, ) for btn, transfer, output in [ (primary_stem_transfer_btn, primary_stem_transfer, primary_stem_output), ( secondary_stem_transfer_btn, secondary_stem_transfer, secondary_stem_output, ), ]: setup_transfer_event( btn, transfer, output, tab_config.input_audio.all, ) def _render_step_2(tab_config: MultiStepSongGenerationConfig) -> None: with gr.Accordion("Step 2: vocal conversion", open=False): tab_config.input_audio.vocals.instance.render() tab_config.voice_model.instance.render() tab_config.song_dirs.convert_vocals.instance.render() with gr.Accordion("Options", open=False): with gr.Row(): tab_config.n_octaves.instantiate() tab_config.n_semitones.instantiate() converted_vocals_transfer = _render_song_transfer( 
[SongTransferOption.STEP_3_VOCALS], "Converted vocals", ) with gr.Accordion("Advanced", open=False): with gr.Accordion("Voice synthesis", open=False): with gr.Row(): tab_config.f0_methods.instantiate() tab_config.index_rate.instantiate() with gr.Row(): tab_config.rms_mix_rate.instantiate() tab_config.protect_rate.instantiate() tab_config.hop_length.instantiate() with gr.Accordion("Vocal enrichment", open=False), gr.Row(): with gr.Column(): tab_config.split_voice.instantiate() with gr.Column(): tab_config.autotune_voice.instantiate() tab_config.autotune_strength.instantiate() with gr.Column(): tab_config.clean_voice.instantiate() tab_config.clean_strength.instantiate() tab_config.autotune_voice.instance.change( partial(toggle_visibility, targets={True}), inputs=tab_config.autotune_voice.instance, outputs=tab_config.autotune_strength.instance, show_progress="hidden", ) tab_config.clean_voice.instance.change( partial(toggle_visibility, targets={True}), inputs=tab_config.clean_voice.instance, outputs=tab_config.clean_strength.instance, show_progress="hidden", ) with gr.Accordion("Speaker embeddings", open=False), gr.Row(): with gr.Column(): tab_config.embedder_model.instantiate() tab_config.custom_embedder_model.instance.render() tab_config.sid.instantiate() tab_config.embedder_model.instance.change( partial(toggle_visibility, targets={EmbedderModel.CUSTOM}), inputs=tab_config.embedder_model.instance, outputs=tab_config.custom_embedder_model.instance, show_progress="hidden", ) with gr.Row(): convert_vocals_reset_btn = gr.Button("Reset options") convert_vocals_btn = gr.Button("Convert vocals", variant="primary") converted_vocals_transfer_btn = gr.Button("Transfer converted vocals") converted_vocals_track_output = gr.Audio( label="Converted vocals", type="filepath", interactive=False, waveform_options=gr.WaveformOptions(show_recording_waveform=True), ) convert_vocals_reset_btn.click( lambda: [ tab_config.n_octaves.value, tab_config.n_semitones.value, 
tab_config.f0_methods.value, tab_config.index_rate.value, tab_config.rms_mix_rate.value, tab_config.protect_rate.value, tab_config.hop_length.value, tab_config.split_voice.value, tab_config.autotune_voice.value, tab_config.autotune_strength.value, tab_config.clean_voice.value, tab_config.clean_strength.value, tab_config.embedder_model.value, tab_config.sid.value, gr.Dropdown(value=[SongTransferOption.STEP_3_VOCALS]), ], outputs=[ tab_config.n_octaves.instance, tab_config.n_semitones.instance, tab_config.f0_methods.instance, tab_config.index_rate.instance, tab_config.rms_mix_rate.instance, tab_config.protect_rate.instance, tab_config.hop_length.instance, tab_config.split_voice.instance, tab_config.autotune_voice.instance, tab_config.autotune_strength.instance, tab_config.clean_voice.instance, tab_config.clean_strength.instance, tab_config.embedder_model.instance, tab_config.sid.instance, converted_vocals_transfer, ], show_progress="hidden", ) convert_vocals_btn.click( partial( exception_harness(convert, info_msg="Vocals converted successfully!"), content_type=RVCContentType.VOCALS, ), inputs=[ tab_config.input_audio.vocals.instance, tab_config.song_dirs.convert_vocals.instance, tab_config.voice_model.instance, tab_config.n_octaves.instance, tab_config.n_semitones.instance, tab_config.f0_methods.instance, tab_config.index_rate.instance, tab_config.rms_mix_rate.instance, tab_config.protect_rate.instance, tab_config.hop_length.instance, tab_config.split_voice.instance, tab_config.autotune_voice.instance, tab_config.autotune_strength.instance, tab_config.clean_voice.instance, tab_config.clean_strength.instance, tab_config.embedder_model.instance, tab_config.custom_embedder_model.instance, tab_config.sid.instance, ], outputs=converted_vocals_track_output, concurrency_id=ConcurrencyId.GPU, concurrency_limit=1, ) setup_transfer_event( converted_vocals_transfer_btn, converted_vocals_transfer, converted_vocals_track_output, tab_config.input_audio.all, ) def 
_render_step_3(tab_config: MultiStepSongGenerationConfig) -> None:
    """Render "Step 3: vocal post-processing" of the multi-step song tab.

    Renders reverb option controls, a post-process action button and a
    transfer dropdown for routing the effected vocals to a later step.
    """
    with gr.Accordion("Step 3: vocal post-processing", open=False):
        tab_config.input_audio.converted_vocals.instance.render()
        tab_config.song_dirs.postprocess_vocals.instance.render()
        with gr.Accordion("Options", open=False):
            tab_config.room_size.instantiate()
            with gr.Row():
                tab_config.wet_level.instantiate()
                tab_config.dry_level.instantiate()
                tab_config.damping.instantiate()
            # Default transfer target: the main-vocals slot of step 5.
            effected_vocals_transfer = _render_song_transfer(
                [SongTransferOption.STEP_5_MAIN_VOCALS],
                "Effected vocals",
            )
        with gr.Row():
            postprocess_vocals_reset_btn = gr.Button("Reset options")
            postprocess_vocals_btn = gr.Button(
                "Post-process vocals",
                variant="primary",
            )
            effected_vocals_transfer_btn = gr.Button("Transfer effected vocals")
        effected_vocals_track_output = gr.Audio(
            label="Effected vocals",
            type="filepath",
            interactive=False,
            waveform_options=gr.WaveformOptions(show_recording_waveform=True),
        )
        # Reset restores each component's configured default value.
        postprocess_vocals_reset_btn.click(
            lambda: [
                tab_config.room_size.value,
                tab_config.wet_level.value,
                tab_config.dry_level.value,
                tab_config.damping.value,
                gr.Dropdown(value=[SongTransferOption.STEP_5_MAIN_VOCALS]),
            ],
            outputs=[
                tab_config.room_size.instance,
                tab_config.wet_level.instance,
                tab_config.dry_level.instance,
                tab_config.damping.instance,
                effected_vocals_transfer,
            ],
            show_progress="hidden",
        )
        postprocess_vocals_btn.click(
            exception_harness(
                postprocess,
                info_msg="Vocals post-processed successfully!",
            ),
            inputs=[
                tab_config.input_audio.converted_vocals.instance,
                tab_config.song_dirs.postprocess_vocals.instance,
                tab_config.room_size.instance,
                tab_config.wet_level.instance,
                tab_config.dry_level.instance,
                tab_config.damping.instance,
            ],
            outputs=effected_vocals_track_output,
        )
        setup_transfer_event(
            effected_vocals_transfer_btn,
            effected_vocals_transfer,
            effected_vocals_track_output,
            tab_config.input_audio.all,
        )


def _render_step_4(tab_config: MultiStepSongGenerationConfig) -> None:
    """Render "Step 4: pitch shift of background audio".

    Renders semitone controls and pitch-shift actions for the
    instrumentals and backup-vocals tracks, each with its own transfer
    dropdown targeting step 5.
    """
    with gr.Accordion("Step 4: pitch shift of background audio", open=False):
        with gr.Row():
            tab_config.input_audio.instrumentals.instance.render()
            tab_config.input_audio.backup_vocals.instance.render()
        with gr.Row():
            tab_config.n_semitones_instrumentals.instantiate()
            tab_config.n_semitones_backup_vocals.instantiate()
        tab_config.song_dirs.pitch_shift_background.instance.render()
        with gr.Accordion("Options", open=False), gr.Row():
            shifted_instrumentals_transfer = _render_song_transfer(
                [SongTransferOption.STEP_5_INSTRUMENTALS],
                "Pitch-shifted instrumentals",
            )
            shifted_backup_vocals_transfer = _render_song_transfer(
                [SongTransferOption.STEP_5_BACKUP_VOCALS],
                "Pitch-shifted backup vocals",
            )
        with gr.Row():
            pitch_shift_instrumentals_btn = gr.Button(
                "Pitch shift instrumentals",
                variant="primary",
            )
            pitch_shift_backup_vocals_btn = gr.Button(
                "Pitch shift backup vocals",
                variant="primary",
            )
        with gr.Row():
            shifted_instrumentals_transfer_btn = gr.Button(
                "Transfer shifted instrumentals",
            )
            shifted_backup_vocals_transfer_btn = gr.Button(
                "Transfer shifted backup vocals",
            )
        pitch_shift_background_reset_btn = gr.Button("Reset options")
        with gr.Row():
            shifted_instrumentals_track_output = gr.Audio(
                label="Pitch-shifted instrumentals",
                type="filepath",
                interactive=False,
                waveform_options=gr.WaveformOptions(show_recording_waveform=True),
            )
            shifted_backup_vocals_track_output = gr.Audio(
                label="Pitch-shifted backup vocals",
                type="filepath",
                interactive=False,
                waveform_options=gr.WaveformOptions(show_recording_waveform=True),
            )
        pitch_shift_background_reset_btn.click(
            lambda: [
                tab_config.n_semitones_instrumentals.value,
                tab_config.n_semitones_backup_vocals.value,
                gr.Dropdown(value=[SongTransferOption.STEP_5_INSTRUMENTALS]),
                gr.Dropdown(value=[SongTransferOption.STEP_5_BACKUP_VOCALS]),
            ],
            outputs=[
                tab_config.n_semitones_instrumentals.instance,
                tab_config.n_semitones_backup_vocals.instance,
                shifted_instrumentals_transfer,
                shifted_backup_vocals_transfer,
            ],
            show_progress="hidden",
        )
        pitch_shift_instrumentals_btn.click(
            exception_harness(
                pitch_shift,
                info_msg="Instrumentals pitch-shifted successfully!",
            ),
            inputs=[
                tab_config.input_audio.instrumentals.instance,
                tab_config.song_dirs.pitch_shift_background.instance,
                tab_config.n_semitones_instrumentals.instance,
            ],
            outputs=shifted_instrumentals_track_output,
        )
        pitch_shift_backup_vocals_btn.click(
            exception_harness(
                pitch_shift,
                info_msg="Backup vocals pitch-shifted successfully!",
            ),
            inputs=[
                tab_config.input_audio.backup_vocals.instance,
                tab_config.song_dirs.pitch_shift_background.instance,
                tab_config.n_semitones_backup_vocals.instance,
            ],
            outputs=shifted_backup_vocals_track_output,
        )
        # Both transfer buttons are wired identically.
        for btn, transfer, output in [
            (
                shifted_instrumentals_transfer_btn,
                shifted_instrumentals_transfer,
                shifted_instrumentals_track_output,
            ),
            (
                shifted_backup_vocals_transfer_btn,
                shifted_backup_vocals_transfer,
                shifted_backup_vocals_track_output,
            ),
        ]:
            setup_transfer_event(
                btn,
                transfer,
                output,
                tab_config.input_audio.all,
            )


def _render_step_5(
    total_config: TotalConfig,
    tab_config: MultiStepSongGenerationConfig,
) -> None:
    """Render "Step 5: song mixing".

    Renders gain/output controls and the final mix action, then
    refreshes the saved-output-audio dropdown after a successful mix.
    """
    with gr.Accordion("Step 5: song mixing", open=False):
        with gr.Row():
            tab_config.input_audio.main_vocals.instance.render()
            tab_config.input_audio.shifted_instrumentals.instance.render()
            tab_config.input_audio.shifted_backup_vocals.instance.render()
        tab_config.song_dirs.mix.instance.render()
        with gr.Accordion("Options", open=False):
            with gr.Row():
                tab_config.main_gain.instantiate()
                tab_config.inst_gain.instantiate()
                tab_config.backup_gain.instantiate()
            with gr.Row():
                # Output name auto-derives from the selected vocals
                # track and song directory.
                tab_config.output_name.instantiate(
                    value=partial(
                        update_output_name,
                        get_song_cover_name,
                        False,  # noqa: FBT003
                    ),
                    inputs=[
                        tab_config.input_audio.main_vocals.instance,
                        tab_config.song_dirs.mix.instance,
                    ],
                )
                tab_config.output_sr.instantiate()
                tab_config.output_format.instantiate()
            song_cover_transfer = _render_song_transfer([], "Song cover")
        with gr.Row():
            mix_reset_btn = gr.Button("Reset options")
            mix_btn = gr.Button("Mix song cover", variant="primary")
            song_cover_transfer_btn = gr.Button("Transfer song cover")
        song_cover_output = gr.Audio(
            label="Song cover",
            type="filepath",
            interactive=False,
            waveform_options=gr.WaveformOptions(show_recording_waveform=True),
        )
        mix_reset_btn.click(
            lambda: [
                tab_config.main_gain.value,
                tab_config.inst_gain.value,
                tab_config.backup_gain.value,
                tab_config.output_sr.value,
                tab_config.output_format.value,
                gr.Dropdown(value=[]),
            ],
            outputs=[
                tab_config.main_gain.instance,
                tab_config.inst_gain.instance,
                tab_config.backup_gain.instance,
                tab_config.output_sr.instance,
                tab_config.output_format.instance,
                song_cover_transfer,
            ],
            show_progress="hidden",
        )
        # Intermediate state holding the (track, gain) pairs produced
        # by _pair_audio_tracks_and_gain before mixing.
        temp_audio_gains = gr.State()
        mix_btn.click(
            partial(
                _pair_audio_tracks_and_gain,
                [
                    tab_config.input_audio.main_vocals.instance,
                    tab_config.input_audio.shifted_instrumentals.instance,
                    tab_config.input_audio.shifted_backup_vocals.instance,
                ],
                [
                    tab_config.main_gain.instance,
                    tab_config.inst_gain.instance,
                    tab_config.backup_gain.instance,
                ],
            ),
            # Set-valued inputs: the handler receives a component->value
            # mapping rather than positional arguments.
            inputs={
                tab_config.input_audio.main_vocals.instance,
                tab_config.input_audio.shifted_instrumentals.instance,
                tab_config.input_audio.shifted_backup_vocals.instance,
                tab_config.main_gain.instance,
                tab_config.inst_gain.instance,
                tab_config.backup_gain.instance,
            },
            outputs=temp_audio_gains,
        ).then(
            # NOTE(review): "succesfully" is a typo in this user-facing
            # message; fixing it changes runtime output, so it is only
            # flagged here.
            exception_harness(mix_song, info_msg="Song cover succesfully generated."),
            inputs=[
                temp_audio_gains,
                tab_config.song_dirs.mix.instance,
                tab_config.output_sr.instance,
                tab_config.output_format.instance,
                tab_config.output_name.instance,
            ],
            outputs=song_cover_output,
        ).then(
            partial(update_dropdowns, get_saved_output_audio, 1, [], [0]),
            outputs=total_config.management.audio.output.instance,
            show_progress="hidden",
        )
        setup_transfer_event(
            song_cover_transfer_btn,
            song_cover_transfer,
            song_cover_output,
            tab_config.input_audio.all,
        )


def _render_song_transfer(
    value: list[SongTransferOption],
    label_prefix: str,
) -> gr.Dropdown:
    """Render a transfer dropdown pre-selected with the given options."""
    return render_transfer_component(value, label_prefix, SongTransferOption)


def _pair_audio_tracks_and_gain(
    audio_components: Sequence[gr.Audio],
    gain_components: Sequence[gr.Slider],
    data: dict[gr.Audio | gr.Slider, Any],
) -> list[tuple[str, int]]:
    """
    Pair audio tracks and gain levels stored in separate gradio
    components.

    This function is meant to first be partially applied to the
    sequence of audio components and the sequence of slider components
    containing the values that should be combined. The resulting
    function can then be called by an event listener whose inputs is a
    set containing those audio and slider components. The `data`
    parameter in that case will contain a mapping from each of those
    components to the value that the component stores.

    Parameters
    ----------
    audio_components : Sequence[gr.Audio]
        Audio components to pair with gain levels.
    gain_components : Sequence[gr.Slider]
        Gain level components to pair with audio tracks.
    data : dict[gr.Audio | gr.Slider, Any]
        Data from the audio and gain components.

    Returns
    -------
    list[tuple[str, int]]
        Paired audio tracks and gain levels.

    Raises
    ------
    ValueError
        If the number of audio tracks and gain levels are not the same.

    """
    audio_tracks = [data[component] for component in audio_components]
    gain_levels = [data[component] for component in gain_components]
    if len(audio_tracks) != len(gain_levels):
        err_msg = "Number of audio tracks and gain levels must be the same."
raise ValueError(err_msg) return [ (audio_track, gain_level) for audio_track, gain_level in zip(audio_tracks, gain_levels, strict=True) if audio_track ] def run_newpipeline( source: str, model_name: str, n_octaves: int = 0, n_semitones: int = 0, f0_methods: Sequence[F0Method] | None = None, index_rate: float = 0.3, rms_mix_rate: float = 1.0, protect_rate: float = 0.33, hop_length: int = 128, split_vocals: bool = False, autotune_vocals: bool = False, autotune_strength: float = 1.0, clean_vocals: bool = False, clean_strength: float = 0.7, embedder_model: EmbedderModel = EmbedderModel.CONTENTVEC, custom_embedder_model: str | None = None, sid: int = 0, room_size: float = 0.15, wet_level: float = 0.2, dry_level: float = 0.8, damping: float = 0.7, main_gain: int = 0, inst_gain: int = 0, backup_gain: int = 0, output_sr: int = 44100, output_format: AudioExt = AudioExt.MP3, output_name: str | None = None, cookiefile: StrPath | None = None, progress_bar: gr.Progress | None = None, ) -> tuple[Path, ...]: """ Run the song cover generation pipeline. Parameters ---------- source : str A Youtube URL, the path to a local audio file or the path to a song directory. model_name : str The name of the voice model to use for vocal conversion. n_octaves : int, default=0 The number of octaves to pitch-shift the converted vocals by. n_semitones : int, default=0 The number of semi-tones to pitch-shift the converted vocals, instrumentals, and backup vocals by. f0_methods : Sequence[F0Method], optional The methods to use for pitch extraction during vocal conversion. If None, the method used is rmvpe. index_rate : float, default=0.3 The influence of the index file on the vocal conversion. rms_mix_rate : float, default=1.0 The blending rate of the volume envelope of the converted vocals. protect_rate : float, default=0.33 The protect rate for consonants and breathing sounds during vocal conversion. hop_length : int, default=128 The hop length to use for crepe-based pitch detection. 
split_vocals : bool, default=False Whether to perform audio splitting before converting the main vocals. autotune_vocals : bool, default=False Whether to apply autotune to the converted vocals. autotune_strength : float, default=1.0 The strength of the autotune to apply to the converted vocals. clean_vocals : bool, default=False Whether to clean the converted vocals. clean_strength : float, default=0.7 The intensity of the cleaning to apply to the converted vocals. embedder_model : EmbedderModel, default=EmbedderModel.CONTENTVEC The model to use for generating speaker embeddings during vocal conversion. custom_embedder_model : StrPath, optional The name of a custom embedder model to use for generating speaker embeddings during vocal conversion. sid : int, default=0 The speaker id to use for multi-speaker models during vocal conversion. room_size : float, default=0.15 The room size of the reverb effect to apply to the converted vocals. wet_level : float, default=0.2 The wetness level of the reverb effect to apply to the converted vocals. dry_level : float, default=0.8 The dryness level of the reverb effect to apply to the converted vocals. damping : float, default=0.7 The damping of the reverb effect to apply to the converted vocals. main_gain : int, default=0 The gain to apply to the post-processed vocals. inst_gain : int, default=0 The gain to apply to the pitch-shifted instrumentals. backup_gain : int, default=0 The gain to apply to the pitch-shifted backup vocals. output_sr : int, default=44100 The sample rate of the song cover. output_format : AudioExt, default=AudioExt.MP3 The audio format of the song cover. output_name : str, optional The name of the song cover. cookiefile : StrPath, optional The path to a file containing cookies to use when downloading audio from Youtube. progress_bar : gr.Progress, optional Gradio progress bar to update. Returns ------- tuple[Path,...] 
The path to the generated song cover and the paths to any intermediate audio files that were generated. """ validate_model(model_name, Entity.VOICE_MODEL) if embedder_model == EmbedderModel.CUSTOM: validate_model(custom_embedder_model, Entity.CUSTOM_EMBEDDER_MODEL) display_progress("[~] Retrieving song...", 0 / 9, progress_bar) song, song_dir = retrieve_song(source, cookiefile=cookiefile) display_progress( "[~] newpipeline: Separating vocals from instrumentals...", 1 / 9, progress_bar ) vocals_track, instrumentals_track = separate_audio( song, song_dir, SeparationModel.MDX23C_8KFFT_InstVoc_HQ_2, SegmentSize.SEG_2048, ) display_progress( "[~] newpipeline: Separating main vocals from backup vocals...", 2 / 9, progress_bar, ) backup_vocals_track, main_vocals_track = separate_audio( vocals_track, song_dir, SeparationModel.UVR_MDX_NET_KARA_2, SegmentSize.SEG_2048, ) display_progress("[~] newpipeline: De-noising vocals...", 3 / 9, progress_bar) noise_track, clean_track = separate_audio( clean_track, song_dir, SeparationModel.UVR_DeNoise, SegmentSize.SEG_2048, ) display_progress("[~] newpipeline: De-reverbing vocals...", 4 / 9, progress_bar) reverb_track, vocals_dereverb_track = separate_audio( main_vocals_track, song_dir, SeparationModel.UVR_DeEcho_DeReverb, SegmentSize.SEG_2048, ) display_progress("[~] newpipeline: Converting vocals...", 5 / 9, progress_bar) converted_vocals_track = convert( audio_track=vocals_dereverb_track, directory=song_dir, model_name=model_name, n_octaves=n_octaves, n_semitones=n_semitones, f0_methods=f0_methods, index_rate=index_rate, rms_mix_rate=rms_mix_rate, protect_rate=protect_rate, hop_length=hop_length, split_audio=split_vocals, autotune_audio=autotune_vocals, autotune_strength=autotune_strength, clean_audio=clean_vocals, clean_strength=clean_strength, embedder_model=embedder_model, custom_embedder_model=custom_embedder_model, sid=sid, content_type=RVCContentType.VOCALS, ) display_progress("[~] newpipeline: Post-processing vocals...", 6 / 
9, progress_bar) effected_vocals_track = postprocess( converted_vocals_track, song_dir, room_size, wet_level, dry_level, damping, ) display_progress( "[~] newpipeline: Pitch-shifting instrumentals...", 7 / 9, progress_bar ) shifted_instrumentals_track = pitch_shift( instrumentals_track, song_dir, n_semitones, ) display_progress( "[~] newpipeline: Pitch-shifting backup vocals...", 8 / 9, progress_bar ) shifted_backup_vocals_track = pitch_shift( backup_vocals_track, song_dir, n_semitones, ) song_cover = mix_song( [ (effected_vocals_track, main_gain), (shifted_instrumentals_track, inst_gain), (shifted_backup_vocals_track, backup_gain), ], song_dir, output_sr, output_format, output_name, ) return ( song_cover, song, vocals_track, instrumentals_track, main_vocals_track, backup_vocals_track, vocals_dereverb_track, reverb_track, converted_vocals_track, effected_vocals_track, shifted_instrumentals_track, shifted_backup_vocals_track, ) def render_app() -> gr.Blocks: """ Render the Ultimate RVC web application. Returns ------- gr.Blocks The rendered web application. 
""" css = """ h1 { text-align: center; margin-top: 20px; margin-bottom: 20px; } #generate-tab-button { font-weight: bold !important;} #manage-tab-button { font-weight: bold !important;} #audio-tab-button { font-weight: bold !important;} #settings-tab-button { font-weight: bold !important;} """ cache_delete_frequency = 86400 # every 24 hours check for files to delete cache_delete_cutoff = 86400 # and delete files older than 24 hours with gr.Blocks( title="Redzone-6 Audio Playground", theme=gr.Theme.load(str(Path(__file__).parent / "config/theme.json")), css=css, delete_cache=(cache_delete_frequency, cache_delete_cutoff), ) as app: for component_config in [ total_config.song.one_click.voice_model, total_config.song.one_click.cached_song, total_config.song.one_click.custom_embedder_model, total_config.song.multi_step.voice_model, total_config.song.multi_step.cached_song, total_config.song.multi_step.custom_embedder_model, total_config.song.multi_step.song_dirs.separate_audio, total_config.song.multi_step.song_dirs.convert_vocals, total_config.song.multi_step.song_dirs.postprocess_vocals, total_config.song.multi_step.song_dirs.pitch_shift_background, total_config.song.multi_step.song_dirs.mix, total_config.speech.one_click.edge_tts_voice, total_config.speech.one_click.voice_model, total_config.speech.one_click.custom_embedder_model, total_config.speech.multi_step.edge_tts_voice, total_config.speech.multi_step.voice_model, total_config.speech.multi_step.custom_embedder_model, total_config.training.multi_step.dataset, total_config.training.multi_step.preprocess_model, total_config.training.multi_step.extract_model, total_config.training.multi_step.train_model, total_config.training.multi_step.custom_embedder_model, total_config.training.multi_step.custom_pretrained_model, total_config.management.audio.intermediate, total_config.management.audio.speech, total_config.management.audio.output, total_config.management.audio.dataset, total_config.management.model.voices, 
total_config.management.model.embedders, total_config.management.model.pretraineds, total_config.management.model.traineds, total_config.management.settings.load_config_name, total_config.management.settings.delete_config_names, ]: component_config.instantiate() # main tab # with gr.Tab("Generate", elem_id="generate-tab"): with gr.Tab("Music", elem_id="generate-tab"): render_song_cover_one_click_tab(total_config, cookiefile) render_song_cover_multi_step_tab(total_config, cookiefile) with gr.Tab("Speech", elem_id="generate-tab"): render_speech_one_click_tab(total_config) render_speech_multi_step_tab(total_config) with gr.Tab("Configuration", elem_id="settings-tab"): with gr.Tab("Models"): render_models_tab(total_config) with gr.Tab("Settings"): render_settings_tab(total_config) render_audio_tab(total_config) app.load( _init_dropdowns, outputs=[ total_config.speech.one_click.edge_tts_voice.instance, total_config.speech.multi_step.edge_tts_voice.instance, total_config.song.one_click.voice_model.instance, total_config.song.multi_step.voice_model.instance, total_config.speech.one_click.voice_model.instance, total_config.speech.multi_step.voice_model.instance, total_config.management.model.voices.instance, total_config.song.one_click.custom_embedder_model.instance, total_config.song.multi_step.custom_embedder_model.instance, total_config.speech.one_click.custom_embedder_model.instance, total_config.speech.multi_step.custom_embedder_model.instance, total_config.training.multi_step.custom_embedder_model.instance, total_config.management.model.embedders.instance, total_config.training.multi_step.custom_pretrained_model.instance, total_config.management.model.pretraineds.instance, total_config.training.multi_step.extract_model.instance, total_config.training.multi_step.train_model.instance, total_config.training.multi_step.preprocess_model.instance, total_config.management.model.traineds.instance, total_config.song.one_click.cached_song.instance, 
total_config.song.multi_step.cached_song.instance, total_config.song.multi_step.song_dirs.separate_audio.instance, total_config.song.multi_step.song_dirs.convert_vocals.instance, total_config.song.multi_step.song_dirs.postprocess_vocals.instance, total_config.song.multi_step.song_dirs.pitch_shift_background.instance, total_config.song.multi_step.song_dirs.mix.instance, total_config.management.audio.intermediate.instance, total_config.training.multi_step.dataset.instance, total_config.management.audio.speech.instance, total_config.management.audio.output.instance, total_config.management.audio.dataset.instance, total_config.management.settings.load_config_name.instance, total_config.management.settings.delete_config_names.instance, ], show_progress="hidden", ) return app def _init_dropdowns() -> list[gr.Dropdown]: """ Initialize the Ultimate RVC web application by updating the choices and default values of non-static dropdown components. Returns ------- tuple[gr.Dropdown, ...] A tuple of gr.Dropdown components with updated choices and default values. 
""" # Initialize model dropdowns edge_tts_models = initialize_dropdowns( get_edge_tts_voice_names, 2, "en-US-ChristopherNeural", range(2), ) voice_models = initialize_dropdowns( get_voice_model_names, 5, value_indices=range(4), ) custom_embedder_models = initialize_dropdowns( get_custom_embedder_model_names, 6, value_indices=range(5), ) custom_pretrained_models = initialize_dropdowns( get_custom_pretrained_model_names, 2, value_indices=range(1), ) training_models = initialize_dropdowns( get_training_model_names, 4, value_indices=range(2), ) song_dirs = initialize_dropdowns( get_named_song_dirs, 8, value_indices=range(7), ) dataset = gr.Dropdown(get_audio_datasets()) speech_delete = gr.Dropdown(get_saved_speech_audio()) output_delete = gr.Dropdown(get_saved_output_audio()) dataset_delete = gr.Dropdown(get_named_audio_datasets()) configs = initialize_dropdowns(get_config_names, 2, value_indices=range(1)) return [ *edge_tts_models, *voice_models, *custom_embedder_models, *custom_pretrained_models, *training_models, *song_dirs, dataset, speech_delete, output_delete, dataset_delete, *configs, ] def render_song_cover_one_click_tab( total_config: TotalConfig, cookiefile: str | None = None ) -> None: """ Render "Generate song covers - One-click generation" tab. Parameters ---------- total_config : TotalConfig Model containing all component configuration settings for the Ultimate RVC web UI. cookiefile : str, optional The path to a file containing cookies to use when downloading audio from Youtube. 
""" with gr.Tab("One-click"): tab_config = total_config.song.one_click _render_input(tab_config) with gr.Accordion("Options", open=False): _render_main_options(tab_config) _render_conversion_options(tab_config) _render_mixing_options(tab_config) _render_output_options(tab_config) _render_intermediate_audio(tab_config) with gr.Row(equal_height=True): reset_btn = gr.Button(value="Reset options", scale=2) generate_btn = gr.Button("Generate", scale=2, variant="primary") song_cover = gr.Audio( label="Song cover", scale=3, waveform_options=gr.WaveformOptions(show_recording_waveform=False), ) song_dirs = total_config.song.multi_step.song_dirs.all generate_btn.click( partial( exception_harness( run_pipeline, info_msg="Song cover generated successfully!", ), cookiefile=cookiefile, progress_bar=PROGRESS_BAR, ), inputs=[ tab_config.source.instance, tab_config.voice_model.instance, tab_config.n_octaves.instance, tab_config.n_semitones.instance, tab_config.f0_methods.instance, tab_config.index_rate.instance, tab_config.rms_mix_rate.instance, tab_config.protect_rate.instance, tab_config.hop_length.instance, tab_config.split_voice.instance, tab_config.autotune_voice.instance, tab_config.autotune_strength.instance, tab_config.clean_voice.instance, tab_config.clean_strength.instance, tab_config.embedder_model.instance, tab_config.custom_embedder_model.instance, tab_config.sid.instance, tab_config.room_size.instance, tab_config.wet_level.instance, tab_config.dry_level.instance, tab_config.damping.instance, tab_config.main_gain.instance, tab_config.inst_gain.instance, tab_config.backup_gain.instance, tab_config.output_sr.instance, tab_config.output_format.instance, tab_config.output_name.instance, ], outputs=[song_cover, *tab_config.intermediate_audio.all], concurrency_limit=4, concurrency_id=ConcurrencyId.GPU, ).success( partial(update_dropdowns, get_named_song_dirs, 3 + len(song_dirs), [], [2]), outputs=[ total_config.song.one_click.cached_song.instance, 
total_config.song.multi_step.cached_song.instance, total_config.management.audio.intermediate.instance, *song_dirs, ], show_progress="hidden", ).then( partial(update_dropdowns, get_saved_output_audio, 1, [], [0]), outputs=total_config.management.audio.output.instance, show_progress="hidden", ) reset_btn.click( lambda: [ tab_config.n_octaves.value, tab_config.n_semitones.value, tab_config.f0_methods.value, tab_config.index_rate.value, tab_config.rms_mix_rate.value, tab_config.protect_rate.value, tab_config.hop_length.value, tab_config.split_voice.value, tab_config.autotune_voice.value, tab_config.autotune_strength.value, tab_config.clean_voice.value, tab_config.clean_strength.value, tab_config.embedder_model.value, tab_config.sid.value, tab_config.room_size.value, tab_config.wet_level.value, tab_config.dry_level.value, tab_config.damping.value, tab_config.main_gain.value, tab_config.inst_gain.value, tab_config.backup_gain.value, tab_config.output_sr.value, tab_config.output_format.value, tab_config.show_intermediate_audio.value, ], outputs=[ tab_config.n_octaves.instance, tab_config.n_semitones.instance, tab_config.f0_methods.instance, tab_config.index_rate.instance, tab_config.rms_mix_rate.instance, tab_config.protect_rate.instance, tab_config.hop_length.instance, tab_config.split_voice.instance, tab_config.autotune_voice.instance, tab_config.autotune_strength.instance, tab_config.clean_voice.instance, tab_config.clean_strength.instance, tab_config.embedder_model.instance, tab_config.sid.instance, tab_config.room_size.instance, tab_config.wet_level.instance, tab_config.dry_level.instance, tab_config.damping.instance, tab_config.main_gain.instance, tab_config.inst_gain.instance, tab_config.backup_gain.instance, tab_config.output_sr.instance, tab_config.output_format.instance, tab_config.show_intermediate_audio.instance, ], show_progress="hidden", ) def _render_input(tab_config: OneClickSongGenerationConfig) -> None: with gr.Row(): with gr.Column(): 
            tab_config.source_type.instantiate()
        with gr.Column():
            tab_config.source.instantiate()
            # Hidden file-upload alternative to the source textbox;
            # shown when the corresponding source type is selected.
            local_file = gr.Audio(
                label="Source",
                type="filepath",
                visible=False,
                waveform_options=gr.WaveformOptions(show_recording_waveform=False),
            )
            tab_config.cached_song.instance.render()
    # Show exactly one of the three source inputs, depending on the
    # selected source type.
    tab_config.source_type.instance.input(
        partial(toggle_visible_component, 3),
        inputs=tab_config.source_type.instance,
        outputs=[
            tab_config.source.instance,
            local_file,
            tab_config.cached_song.instance,
        ],
        show_progress="hidden",
    )
    # Keep the canonical `source` value in sync with whichever input
    # the user interacted with.
    local_file.change(
        update_value,
        inputs=local_file,
        outputs=tab_config.source.instance,
        show_progress="hidden",
    )
    tab_config.cached_song.instance.input(
        update_value,
        inputs=tab_config.cached_song.instance,
        outputs=tab_config.source.instance,
        show_progress="hidden",
    )
    with gr.Row():
        tab_config.voice_model.instance.render()


def _render_main_options(tab_config: OneClickSongGenerationConfig) -> None:
    """Render the pitch-shift (octave/semitone) controls."""
    with gr.Row():
        tab_config.n_octaves.instantiate()
        tab_config.n_semitones.instantiate()


def _render_conversion_options(tab_config: OneClickSongGenerationConfig) -> None:
    """Render the vocal-conversion option accordions."""
    with gr.Accordion("Vocal conversion", open=True):
        gr.Markdown("")
        with gr.Accordion("Voice synthesis", open=True):
            with gr.Row():
                tab_config.f0_methods.instantiate()
                tab_config.index_rate.instantiate()
            with gr.Row():
                tab_config.rms_mix_rate.instantiate()
                tab_config.protect_rate.instantiate()
                tab_config.hop_length.instantiate()
        with gr.Accordion("Vocal enrichment", open=True):
            with gr.Row():
                with gr.Column():
                    tab_config.split_voice.instantiate()
                with gr.Column():
                    tab_config.autotune_voice.instantiate()
                    tab_config.autotune_strength.instantiate()
                with gr.Column():
                    tab_config.clean_voice.instantiate()
                    tab_config.clean_strength.instantiate()
            # Strength sliders are only visible while the matching
            # checkbox is enabled.
            tab_config.autotune_voice.instance.change(
                partial(toggle_visibility, targets={True}),
                inputs=tab_config.autotune_voice.instance,
                outputs=tab_config.autotune_strength.instance,
                show_progress="hidden",
            )
            tab_config.clean_voice.instance.change(
                partial(toggle_visibility, targets={True}),
                inputs=tab_config.clean_voice.instance,
                outputs=tab_config.clean_strength.instance,
                show_progress="hidden",
            )
        with gr.Accordion("Speaker embedding", open=True):
            with gr.Row():
                with gr.Column():
                    tab_config.embedder_model.instantiate()
                    tab_config.custom_embedder_model.instance.render()
                    tab_config.sid.instantiate()
            # The custom embedder dropdown only appears when the
            # "custom" embedder model is selected.
            tab_config.embedder_model.instance.change(
                partial(toggle_visibility, targets={EmbedderModel.CUSTOM}),
                inputs=tab_config.embedder_model.instance,
                outputs=tab_config.custom_embedder_model.instance,
                show_progress="hidden",
            )


def _render_mixing_options(tab_config: OneClickSongGenerationConfig) -> None:
    """Render the reverb and per-track gain controls."""
    with gr.Accordion("Audio mixing", open=True):
        gr.Markdown("")
        with gr.Accordion("Reverb control on converted vocals", open=True):
            with gr.Row():
                tab_config.room_size.instantiate()
            with gr.Row():
                tab_config.wet_level.instantiate()
                tab_config.dry_level.instantiate()
                tab_config.damping.instantiate()
        with gr.Accordion("Volume controls (dB)", open=True), gr.Row():
            tab_config.main_gain.instantiate()
            tab_config.inst_gain.instantiate()
            tab_config.backup_gain.instantiate()


def _render_output_options(tab_config: OneClickSongGenerationConfig) -> None:
    """Render output name/sample-rate/format controls."""
    with gr.Accordion("Audio output", open=True):
        with gr.Row():
            # Output name auto-derives from the cached song and voice
            # model selections.
            tab_config.output_name.instantiate(
                value=partial(
                    update_output_name,
                    get_song_cover_name,
                    True,  # noqa: FBT003
                ),
                inputs=[
                    gr.State(None),
                    tab_config.cached_song.instance,
                    tab_config.voice_model.instance,
                ],
            )
            tab_config.output_sr.instantiate()
            tab_config.output_format.instantiate()
        with gr.Row():
            tab_config.show_intermediate_audio.instantiate()


def _render_intermediate_audio(tab_config: OneClickSongGenerationConfig) -> None:
    """Render the collapsible intermediate-audio track accordions."""
    with gr.Accordion(
        "Intermediate audio tracks",
        open=False,
        visible=False,
    ) as intermediate_audio_accordion:
        with gr.Accordion(
            "Step 0: song retrieval",
            open=False,
        ) as song_retrieval_accordion:
            tab_config.intermediate_audio.song.instantiate()
        with (
            gr.Accordion(
                "Step 1a: vocals/instrumentals separation",
                open=False,
            ) as
vocals_separation_accordion, gr.Row(), ): tab_config.intermediate_audio.vocals.instantiate() tab_config.intermediate_audio.instrumentals.instantiate() with ( gr.Accordion( "Step 1b: main vocals/ backup vocals separation", open=False, ) as main_vocals_separation_accordion, gr.Row(), ): tab_config.intermediate_audio.main_vocals.instantiate() tab_config.intermediate_audio.backup_vocals.instantiate() with ( gr.Accordion( "Step 1c: main vocals cleanup", open=False, ) as vocal_cleanup_accordion, gr.Row(), ): tab_config.intermediate_audio.main_vocals_dereverbed.instantiate() tab_config.intermediate_audio.main_vocals_reverb.instantiate() with gr.Accordion( "Step 2: conversion of main vocals", open=False, ) as vocal_conversion_accordion: tab_config.intermediate_audio.converted_vocals.instantiate() with gr.Accordion( "Step 3: post-processing of converted vocals", open=False, ) as vocals_postprocessing_accordion: tab_config.intermediate_audio.postprocessed_vocals.instantiate() with ( gr.Accordion( "Step 4: pitch shift of background tracks", open=False, ) as pitch_shift_accordion, gr.Row(), ): tab_config.intermediate_audio.instrumentals_shifted.instantiate() tab_config.intermediate_audio.backup_vocals_shifted.instantiate() tab_config.show_intermediate_audio.instance.change( partial(toggle_intermediate_audio, num_components=7), inputs=tab_config.show_intermediate_audio.instance, outputs=[ intermediate_audio_accordion, song_retrieval_accordion, vocals_separation_accordion, main_vocals_separation_accordion, vocal_cleanup_accordion, vocal_conversion_accordion, vocals_postprocessing_accordion, pitch_shift_accordion, ], show_progress="hidden", ) app = render_app() app_wrapper = typer.Typer() @app_wrapper.command() def start_app( share: Annotated[ bool, typer.Option("--share", "-s", help="Enable sharing"), ] = False, listen: Annotated[ bool, typer.Option( "--listen", "-l", help="Make the web application reachable from your local network.", ), ] = False, listen_host: Annotated[ str | 
None, typer.Option( "--listen-host", "-h", help="The hostname that the server will use.", ), ] = "0.0.0.0", listen_port: Annotated[ int | None, typer.Option( "--listen-port", "-p", help="The listening port that the server will use.", ), ] = None, ssr_mode: Annotated[ bool, typer.Option( "--ssr-mode", help="Enable server-side rendering mode.", ), ] = False, ) -> None: """Run the Ultimate RVC web application.""" os.environ["GRADIO_TEMP_DIR"] = str(TEMP_DIR) gr.set_static_paths([MODELS_DIR, AUDIO_DIR]) # app.queue() app.launch( server_name=listen_host, server_port=listen_port, ssr_mode=ssr_mode, ) load_config("default", TotalConfig) if __name__ == "__main__": app_wrapper()