# wubby / app.py
# Scraped from the Hugging Face Space file viewer ("lainlives",
# revision 8e1e184, ~115 kB raw). The viewer chrome lines have been
# converted to comments so the file parses as Python.
from __future__ import annotations
import os
import sys
from enum import IntEnum, StrEnum, auto
from functools import cached_property
from pathlib import Path
from typing import TYPE_CHECKING, Annotated, Any, TypedDict
import gradio as gr
import typer
from huggingface_hub import snapshot_download
from pydantic import BaseModel
from ultimate_rvc.common import AUDIO_DIR, MODELS_DIR, TEMP_DIR
from ultimate_rvc.core.generate.song_cover import get_named_song_dirs
from ultimate_rvc.core.generate.speech import get_edge_tts_voice_names
from ultimate_rvc.core.manage.audio import (
get_audio_datasets,
get_named_audio_datasets,
get_saved_output_audio,
get_saved_speech_audio,
)
from ultimate_rvc.core.manage.config import get_config_names, load_config
from ultimate_rvc.core.manage.models import (
get_custom_embedder_model_names,
get_custom_pretrained_model_names,
get_training_model_names,
get_voice_model_names,
)
from ultimate_rvc.web.common import (
initialize_dropdowns,
exception_harness,
render_transfer_component,
setup_transfer_event,
toggle_visibility,
toggle_visible_component,
update_dropdowns,
update_output_name,
update_value,
)
from ultimate_rvc.web.config.component import (
AnyComponentConfig,
AudioConfig,
CheckboxConfig,
ComponentConfig,
DropdownConfig,
NumberConfig,
RadioConfig,
SliderConfig,
TextboxConfig,
)
from ultimate_rvc.web.config.tab import (
SongGenerationConfig,
SpeechGenerationConfig,
TrainingConfig,
)
from ultimate_rvc.web.tabs.generate.speech.multi_step_generation import (
render as render_speech_multi_step_tab,
)
from ultimate_rvc.web.tabs.generate.speech.one_click_generation import (
render as render_speech_one_click_tab,
)
from ultimate_rvc.web.tabs.manage.audio import render as render_audio_tab
from ultimate_rvc.web.tabs.manage.models import render as render_models_tab
from ultimate_rvc.web.tabs.manage.settings import render as render_settings_tab
if TYPE_CHECKING:
import gradio as gr
from typing import TYPE_CHECKING
from functools import partial
import gradio as gr
from ultimate_rvc.core.common import (
INTERMEDIATE_AUDIO_BASE_DIR,
OUTPUT_AUDIO_DIR,
copy_file_safe,
display_progress,
get_file_hash,
json_dump,
json_load,
validate_model,
validate_url,
)
from ultimate_rvc.core.exceptions import (
Entity,
InvalidLocationError,
Location,
NotFoundError,
NotProvidedError,
UIMessage,
YoutubeUrlError,
)
from ultimate_rvc.core.generate.common import (
convert,
get_unique_base_path,
mix_audio,
validate_audio_dir_exists,
validate_audio_file_exists,
wavify,
)
from ultimate_rvc.core.generate.song_cover import (
get_named_song_dirs,
get_song_cover_name,
mix_song,
pitch_shift,
postprocess,
retrieve_song,
separate_audio,
get_named_song_dirs,
get_song_cover_name,
run_pipeline,
)
from ultimate_rvc.core.generate.typing_extra import (
EffectedVocalsMetaData,
FileMetaData,
MixedAudioType,
PitchShiftMetaData,
RVCAudioMetaData,
SeparatedAudioMetaData,
)
from ultimate_rvc.core.manage.audio import get_saved_output_audio
from ultimate_rvc.typing_extra import EmbedderModel
from ultimate_rvc.web.common import (
PROGRESS_BAR,
exception_harness,
toggle_intermediate_audio,
toggle_visibility,
toggle_visible_component,
update_dropdowns,
update_output_name,
update_value,
)
from ultimate_rvc.web.typing_extra import ConcurrencyId
type StrPath = str | PathLike[str]
type Json = Mapping[str, Json] | Sequence[Json] | str | int | float | bool | None
class SegmentSize(IntEnum):
    """Enumeration of segment sizes for audio separation.

    Members are consecutive powers of two, from 64 (2**6) up to
    4096 (2**12).
    """

    SEG_64 = 1 << 6
    SEG_128 = 1 << 7
    SEG_256 = 1 << 8
    SEG_512 = 1 << 9
    SEG_1024 = 1 << 10
    SEG_2048 = 1 << 11
    SEG_4096 = 1 << 12
class F0Method(StrEnum):
    """
    Enumeration of pitch extraction methods.

    Values are the lowercase identifiers presumably expected by the
    underlying pitch-extraction backends — confirm against the
    conversion code that consumes them.
    """

    RMVPE = "rmvpe"
    CREPE = "crepe"
    CREPE_TINY = "crepe-tiny"  # note: hyphenated value, unlike the member name
    FCPE = "fcpe"
class RVCContentType(StrEnum):
"""Enumeration of valid content to convert with RVC."""
VOCALS = "vocals"
VOICE = "voice"
SPEECH = "speech"
AUDIO = "audio"
class SampleRate(IntEnum):
    """Enumeration of supported audio sample rates, in Hz."""

    # Digit grouping added for readability; values are unchanged.
    HZ_16000 = 16_000
    HZ_44100 = 44_100
    HZ_48000 = 48_000
    HZ_96000 = 96_000
    HZ_192000 = 192_000
class AudioExt(StrEnum):
"""Enumeration of supported audio file formats."""
MP3 = "mp3"
WAV = "wav"
FLAC = "flac"
OGG = "ogg"
class DeviceType(StrEnum):
    """
    Enumeration of device types for training voice models.

    Values are capitalized, human-readable labels — presumably shown
    directly in a UI selector; confirm against the consuming component.
    """

    AUTOMATIC = "Automatic"
    CPU = "CPU"
    GPU = "GPU"
class TrainingSampleRate(StrEnum):
    """
    Enumeration of sample rates for training voice models.

    Values are numeric *strings* (not ints), unlike SampleRate above.
    """

    HZ_32K = "32000"
    HZ_40K = "40000"
    HZ_48K = "48000"
class PretrainedSampleRate(StrEnum):
    """
    Enumeration of valid sample rates for pretrained models.

    Values use the abbreviated "NNk" form, in contrast to
    TrainingSampleRate's full numeric strings.
    """

    HZ_32K = "32k"
    HZ_40K = "40k"
    HZ_44K = "44k"
    HZ_48K = "48k"
class TrainingF0Method(StrEnum):
    """
    Enumeration of pitch extraction methods for training.

    A subset of F0Method (no "fcpe" member).
    """

    RMVPE = "rmvpe"
    CREPE = "crepe"
    CREPE_TINY = "crepe-tiny"
class AudioSplitMethod(StrEnum):
    """
    Enumeration of methods to use for splitting audio files during
    dataset preprocessing.
    """

    SKIP = "Skip"  # do not split at all
    SIMPLE = "Simple"
    AUTOMATIC = "Automatic"
class Vocoder(StrEnum):
    """Enumeration of vocoders for training voice models."""

    HIFI_GAN = "HiFi-GAN"
    MRF_HIFI_GAN = "MRF HiFi-GAN"
    REFINE_GAN = "RefineGAN"
class IndexAlgorithm(StrEnum):
    """Enumeration of indexing algorithms for training voice models."""

    AUTO = "Auto"  # let the pipeline pick between the two below
    FAISS = "Faiss"
    KMEANS = "KMeans"
class PretrainedType(StrEnum):
    """
    Enumeration of the possible types of pretrained models to finetune
    voice models on.
    """

    NONE = "None"  # train from scratch, no pretrained weights
    DEFAULT = "Default"
    CUSTOM = "Custom"
class ConcurrencyId(StrEnum):
"""Enumeration of possible concurrency identifiers."""
GPU = auto()
class SongSourceType(StrEnum):
    """
    The type of source providing the song to generate a cover of.

    Values are the human-readable option labels.
    """

    PATH = "Local or HTTP filepath"
    LOCAL_FILE = "Local file"
    CACHED_SONG = "Cached song"
class SpeechSourceType(StrEnum):
    """
    The type of source providing the text to generate speech from.

    Values are the human-readable option labels.
    """

    TEXT = "Text"
    LOCAL_FILE = "Local file"
class SongTransferOption(StrEnum):
    """
    Enumeration of possible song transfer options.

    Each value names a destination step/slot in the multi-step song
    generation pipeline to which intermediate audio can be sent.
    """

    STEP_1_AUDIO = "Step 1: stem splitting"
    STEP_2_VOCALS = "Step 2: vocal conversion"
    STEP_3_VOCALS = "Step 3: vocal effect"
    STEP_4_INSTRUMENTALS = "Step 4: instrumentals"
    STEP_4_BACKUP_VOCALS = "Step 4: backup vocals"
    STEP_5_MAIN_VOCALS = "Step 5: main vocals"
    STEP_5_INSTRUMENTALS = "Step 5: instrumentals"
    STEP_5_BACKUP_VOCALS = "Step 5: backup vocals"
class SpeechTransferOption(StrEnum):
    """
    Enumeration of possible speech transfer options.

    Each value names a destination step in the multi-step speech
    generation pipeline.
    """

    STEP_2_SPEECH = "Step 2: speech conversion"
    STEP_3_SPEECH = "Step 3: speech effect"
class ComponentVisibilityKwArgs(TypedDict, total=False):
    """
    Keyword arguments for setting component visibility.

    ``total=False``: every key is optional, so callers may supply any
    subset when building ``gr.update``-style kwargs.

    Attributes
    ----------
    visible : bool
        Whether the component should be visible.
    value : Any
        The value of the component.
    """

    visible: bool
    value: Any
class UpdateDropdownKwArgs(TypedDict, total=False):
    """
    Keyword arguments for updating a dropdown component.

    ``total=False``: both keys are optional.

    Attributes
    ----------
    choices : DropdownChoices
        The updated choices for the dropdown component.
    value : DropdownValue
        The updated value for the dropdown component.
    """

    # NOTE(review): DropdownChoices and DropdownValue are not defined
    # or imported anywhere visible in this file; only the lazy
    # annotations from ``from __future__ import annotations`` keep
    # this from raising at class-creation time — confirm where these
    # aliases live.
    choices: DropdownChoices
    value: DropdownValue
class TextBoxKwArgs(TypedDict, total=False):
    """
    Keyword arguments for updating a textbox component.

    ``total=False``: both keys are optional.

    Attributes
    ----------
    value : str | None
        The updated value for the textbox component.
    placeholder : str | None
        The updated placeholder for the textbox component.
    """

    value: str | None
    placeholder: str | None
class UpdateAudioKwArgs(TypedDict, total=False):
    """
    Keyword arguments for updating an audio component.

    ``total=False``: the key is optional.

    Attributes
    ----------
    value : str | None
        The updated value for the audio component.
    """

    value: str | None
class DatasetType(StrEnum):
    """
    The type of dataset to train a voice model.

    Values are the human-readable option labels.
    """

    NEW_DATASET = "Create new dataset"
    EXISTING_DATASET = "Use existing dataset"
class EmbedderModel(StrEnum):
    """
    Enumeration of audio embedding models.

    NOTE(review): this re-definition shadows the EmbedderModel imported
    from ultimate_rvc.typing_extra earlier in this file; the "Crusty"
    member (capitalized, unlike its siblings) exists only here —
    confirm this divergence is intentional.
    """

    CONTENTVEC = "contentvec"
    CRUSTY = "Crusty"
    CUSTOM = "custom"
class SeparationModel(StrEnum):
    """
    Enumeration of audio separation models.

    Values are model artifact filenames (.onnx / .ckpt / .pth).
    Member naming is inconsistent (UPPER vs. mixed vs. lower case) but
    mirrors the artifact names; renaming would break callers.
    """

    UVR_MDX_NET_VOC_FT = "UVR-MDX-NET-Voc_FT.onnx"
    UVR_MDX_NET_KARA_2 = "UVR_MDXNET_KARA_2.onnx"
    REVERB_HQ_BY_FOXJOY = "Reverb_HQ_By_FoxJoy.onnx"
    UVR_MDX_NET_Inst_HQ_3 = "UVR-MDX-NET-Inst_HQ_3.onnx"
    Kim_Vocal_1 = "Kim_Vocal_1.onnx"
    Kim_Vocal_2 = "Kim_Vocal_2.onnx"
    Kim_Inst = "Kim_Inst.onnx"
    UVR_MDX_NET_Inst_HQ_4 = "UVR-MDX-NET-Inst_HQ_4.onnx"
    kuielab_a_vocals = "kuielab_a_vocals.onnx"
    kuielab_b_vocals = "kuielab_b_vocals.onnx"
    kuielab_a_drums = "kuielab_a_drums.onnx"
    kuielab_b_drums = "kuielab_b_drums.onnx"
    kuielab_a_bass = "kuielab_a_bass.onnx"
    kuielab_b_bass = "kuielab_b_bass.onnx"
    kuielab_a_other = "kuielab_a_other.onnx"
    kuielab_b_other = "kuielab_b_other.onnx"
    MDX23C_8KFFT_InstVoc_HQ_2 = "MDX23C-8KFFT-InstVoc_HQ_2.ckpt"
    UVR_DeNoise = "UVR-DeNoise.pth"
    UVR_DeEcho_DeReverb = "UVR-DeEcho-DeReverb.pth"
# --- Import-time startup: path setup and model download. -------------
# Make the current working directory importable (assumes the app is
# launched from its project root).
now_dir = os.getcwd()
sys.path.append(now_dir)
# Model snapshot is downloaded into <cwd>/models.
models_dir = "models"
dump_path = os.path.join(now_dir, models_dir)
repo_id = "lainlives/voice"
# HF_TOKEN is optional; None falls back to an anonymous download.
hf_token = os.environ.get("HF_TOKEN")
# NOTE(review): this network download runs at import time, on every
# process start — confirm that is intended (snapshot_download does
# cache, but still hits the Hub to resolve the revision).
snapshot_download(repo_id=repo_id, local_dir=dump_path, token=hf_token)
# if __name__ == "__main__":
# start_app(share=False, ssr_mode = True)
# The environment override for the config name is currently disabled;
# "default" is always used.
config_name = "default"  # os.environ.get("URVC_CONFIG")
# Optional cookies file for yt-dlp-style YouTube access.
cookiefile = os.environ.get("YT_COOKIEFILE")
"""
Module defining models for representing configuration settings for
UI tabs.
"""
class SongIntermediateAudioConfig(BaseModel):
    """
    Configuration settings for the intermediate audio components shown
    in the one-click song generation tab.

    Each field holds the :class:`AudioConfig` for one intermediate
    track produced while generating a song cover; ``all`` exposes the
    rendered component instances in declaration order.
    """

    song: AudioConfig = AudioConfig.intermediate(label="Song")
    vocals: AudioConfig = AudioConfig.intermediate(label="Vocals")
    instrumentals: AudioConfig = AudioConfig.intermediate(label="Instrumentals")
    main_vocals: AudioConfig = AudioConfig.intermediate(label="Main vocals")
    backup_vocals: AudioConfig = AudioConfig.intermediate(label="Backup vocals")
    main_vocals_dereverbed: AudioConfig = AudioConfig.intermediate(
        label="De-reverbed main vocals",
    )
    main_vocals_reverb: AudioConfig = AudioConfig.intermediate(
        label="Main vocals with reverb",
    )
    converted_vocals: AudioConfig = AudioConfig.intermediate(
        label="Converted vocals",
    )
    postprocessed_vocals: AudioConfig = AudioConfig.intermediate(
        label="Postprocessed vocals",
    )
    instrumentals_shifted: AudioConfig = AudioConfig.intermediate(
        label="Pitch-shifted instrumentals",
    )
    backup_vocals_shifted: AudioConfig = AudioConfig.intermediate(
        label="Pitch-shifted backup vocals",
    )

    @property
    def all(self) -> list[gr.Audio]:
        """
        Retrieve instances of all intermediate audio components in the
        one-click song generation tab.

        Returns
        -------
        list[gr.Audio]
            Component instances, in field-declaration order.
        """
        # __annotations__ preserves the order fields are declared in,
        # so the instances come back in that same order.
        instances: list[gr.Audio] = []
        for field_name in self.__annotations__:
            instances.append(getattr(self, field_name).instance)
        return instances
class OneClickSongGenerationConfig(SongGenerationConfig):
    """
    Configuration settings for the one-click song generation tab.

    Attributes
    ----------
    n_octaves : SliderConfig
        Configuration settings for an octave pitch shift slider
        component.
    n_semitones : SliderConfig
        Configuration settings for a semitone pitch shift slider
        component.
    show_intermediate_audio : CheckboxConfig
        Configuration settings for a show intermediate audio checkbox
        component.
    intermediate_audio : SongIntermediateAudioConfig
        Configuration settings for intermediate audio components.

    See Also
    --------
    SongGenerationConfig
        Parent model defining common component configuration settings
        for song generation tabs.
    """

    n_octaves: SliderConfig = SliderConfig.octave_shift(
        label="Vocal pitch shift",
        info=(
            "The number of octaves to shift the pitch of the converted vocals by. Use 1"
            " for male-to-female and -1 for vice-versa."
        ),
    )
    n_semitones: SliderConfig = SliderConfig.semitone_shift(
        label="Overall pitch shift",
        info=(
            "The number of semi-tones to shift the pitch of the converted vocals,"
            " instrumentals and backup vocals by."
        ),
    )
    # exclude_value=True: a pure UI toggle, not persisted in saved
    # configurations (see TotalConfig.all).
    show_intermediate_audio: CheckboxConfig = CheckboxConfig(
        label="Show intermediate audio",
        info="Show intermediate audio tracks produced during song cover generation.",
        value=False,
        exclude_value=True,
    )
    intermediate_audio: SongIntermediateAudioConfig = SongIntermediateAudioConfig()
class SongInputAudioConfig(BaseModel):
    """
    Configuration settings for the input audio components in the
    multi-step song generation tab.

    One :class:`AudioConfig` per pipeline input slot; ``all`` exposes
    the configs themselves (not component instances) in declaration
    order.
    """

    audio: AudioConfig = AudioConfig.input(label="Audio")
    vocals: AudioConfig = AudioConfig.input(label="Vocals")
    converted_vocals: AudioConfig = AudioConfig.input(label="Vocals")
    instrumentals: AudioConfig = AudioConfig.input(label="Instrumentals")
    backup_vocals: AudioConfig = AudioConfig.input(label="Backup vocals")
    main_vocals: AudioConfig = AudioConfig.input(label="Main vocals")
    shifted_instrumentals: AudioConfig = AudioConfig.input(label="Instrumentals")
    shifted_backup_vocals: AudioConfig = AudioConfig.input(label="Backup vocals")

    @property
    def all(self) -> list[AudioConfig]:
        """
        Retrieve configuration settings for all input audio components
        in the multi-step song generation tab.

        Returns
        -------
        list[AudioConfig]
            The configs, in field-declaration order.
        """
        # Iterate __annotations__ to keep declaration order.
        configs: list[AudioConfig] = []
        for field_name in self.__annotations__:
            configs.append(getattr(self, field_name))
        return configs
class SongDirsConfig(BaseModel):
    """
    Configuration settings for the song-directory dropdown components
    in the multi-step song generation tab.

    One dropdown per pipeline step that needs a song directory
    (separation, conversion, postprocessing, background pitch shift,
    mixing); ``all`` exposes the rendered dropdown instances in
    declaration order.
    """

    separate_audio: DropdownConfig = DropdownConfig.song_dir()
    convert_vocals: DropdownConfig = DropdownConfig.song_dir()
    postprocess_vocals: DropdownConfig = DropdownConfig.song_dir()
    pitch_shift_background: DropdownConfig = DropdownConfig.song_dir()
    mix: DropdownConfig = DropdownConfig.song_dir()

    @property
    def all(self) -> list[gr.Dropdown]:
        """
        Retrieve instances of all song directory components in the
        multi-step song generation tab.

        Returns
        -------
        list[gr.Dropdown]
            Component instances, in field-declaration order.
        """
        # __annotations__ keeps declaration order.
        instances: list[gr.Dropdown] = []
        for field_name in self.__annotations__:
            instances.append(getattr(self, field_name).instance)
        return instances
class MultiStepSongGenerationConfig(SongGenerationConfig):
    """
    Configuration settings for multi-step song generation tab.

    Attributes
    ----------
    separation_model : DropdownConfig
        Configuration settings for a separation model dropdown
        component.
    segment_size : RadioConfig
        Configuration settings for a segment size radio component.
    n_octaves : SliderConfig
        Configuration settings for an octave pitch shift slider
        component.
    n_semitones : SliderConfig
        Configuration settings for a semitone pitch shift slider
        component.
    n_semitones_instrumentals : SliderConfig
        Configuration settings for an instrumentals pitch shift slider
        component.
    n_semitones_backup_vocals : SliderConfig
        Configuration settings for a backup vocals pitch shift slider
        component.
    input_audio : SongInputAudioConfig
        Configuration settings for input audio components.
    song_dirs : SongDirsConfig
        Configuration settings for song directory components.

    See Also
    --------
    SongGenerationConfig
        Parent model defining common component configuration settings
        for song generation tabs.
    """

    separation_model: DropdownConfig = DropdownConfig(
        label="Separation model",
        info="The model to use for audio separation.",
        value=SeparationModel.MDX23C_8KFFT_InstVoc_HQ_2,
        choices=list(SeparationModel),
    )
    segment_size: RadioConfig = RadioConfig(
        label="Segment size",
        info=(
            "The size of the segments into which the audio is split. Using a larger"
            " size consumes more resources, but may give better results."
        ),
        value=SegmentSize.SEG_2048,
        choices=list(SegmentSize),
    )
    n_octaves: SliderConfig = SliderConfig.octave_shift(
        label="Pitch shift (octaves)",
        info=(
            "The number of octaves to pitch-shift the converted voice by. Use 1 for"
            " male-to-female and -1 for vice-versa."
        ),
    )
    n_semitones: SliderConfig = SliderConfig.semitone_shift(
        label="Pitch shift (semi-tones)",
        info=(
            "The number of semi-tones to pitch-shift the converted vocals by. Altering"
            " this slightly reduces sound quality."
        ),
    )
    n_semitones_instrumentals: SliderConfig = SliderConfig.semitone_shift(
        label="Instrumental pitch shift",
        info="The number of semi-tones to pitch-shift the instrumentals by.",
    )
    n_semitones_backup_vocals: SliderConfig = SliderConfig.semitone_shift(
        label="Backup vocal pitch shift",
        info="The number of semi-tones to pitch-shift the backup vocals by.",
    )
    input_audio: SongInputAudioConfig = SongInputAudioConfig()
    song_dirs: SongDirsConfig = SongDirsConfig()
class SpeechIntermediateAudioConfig(BaseModel):
    """
    Configuration settings for the intermediate audio components in the
    one-click speech generation tab.

    Two tracks are produced during speech generation (raw TTS speech
    and its RVC conversion); ``all`` exposes their component instances
    in declaration order.
    """

    speech: AudioConfig = AudioConfig.intermediate(label="Speech")
    converted_speech: AudioConfig = AudioConfig.intermediate(label="Converted speech")

    @property
    def all(self) -> list[gr.Audio]:
        """
        Retrieve instances of all intermediate audio components in the
        speech generation tab.

        Returns
        -------
        list[gr.Audio]
            Component instances, in field-declaration order.
        """
        # __annotations__ keeps declaration order.
        instances: list[gr.Audio] = []
        for field_name in self.__annotations__:
            instances.append(getattr(self, field_name).instance)
        return instances
class OneClickSpeechGenerationConfig(SpeechGenerationConfig):
    """
    Configuration settings for one-click speech generation tab.

    Attributes
    ----------
    intermediate_audio : SpeechIntermediateAudioConfig
        Configuration settings for intermediate audio components.
    show_intermediate_audio : CheckboxConfig
        Configuration settings for a show intermediate audio checkbox
        component.

    See Also
    --------
    SpeechGenerationConfig
        Parent model defining common component configuration settings
        for speech generation tabs.
    """

    intermediate_audio: SpeechIntermediateAudioConfig = SpeechIntermediateAudioConfig()
    # exclude_value=True: a pure UI toggle, not persisted in saved
    # configurations (see TotalConfig.all).
    show_intermediate_audio: CheckboxConfig = CheckboxConfig(
        label="Show intermediate audio",
        info="Show intermediate audio tracks produced during speech generation.",
        value=False,
        exclude_value=True,
    )
class SpeechInputAudioConfig(BaseModel):
    """
    Configuration settings for the input audio components in the
    multi-step speech generation tab.

    ``all`` exposes the configs themselves (not component instances)
    in declaration order.
    """

    # NOTE(review): these two calls pass the label positionally,
    # unlike every other AudioConfig.input(label=...) call in this
    # file — kept byte-identical to preserve behavior; confirm the
    # first parameter of AudioConfig.input is the label.
    speech: AudioConfig = AudioConfig.input("Speech")
    converted_speech: AudioConfig = AudioConfig.input("Converted speech")

    @property
    def all(self) -> list[AudioConfig]:
        """
        Retrieve configuration settings for all input audio components
        in the multi-step speech generation tab.

        Returns
        -------
        list[AudioConfig]
            The configs, in field-declaration order.
        """
        # Iterate __annotations__ to keep declaration order.
        configs: list[AudioConfig] = []
        for field_name in self.__annotations__:
            configs.append(getattr(self, field_name))
        return configs
class MultiStepSpeechGenerationConfig(SpeechGenerationConfig):
    """
    Configuration settings for the multi-step speech generation tab.

    Attributes
    ----------
    input_audio : SpeechInputAudioConfig
        Configuration settings for input audio components.

    See Also
    --------
    SpeechGenerationConfig
        Parent model defining common component configuration settings
        for speech generation tabs.
    """

    input_audio: SpeechInputAudioConfig = SpeechInputAudioConfig()
class MultiStepTrainingConfig(TrainingConfig):
    """
    Configuration settings for multi-step training tab.

    Adds nothing beyond TrainingConfig — presumably exists so each tab
    has a distinct configuration type; confirm before collapsing.
    """
class ModelManagementConfig(BaseModel):
    """
    Configuration settings for model management tab.

    Attributes
    ----------
    voices : DropdownConfig
        Configuration settings for delete voice models dropdown
        component.
    embedders : DropdownConfig
        Configuration settings for delete embedder models dropdown
        component.
    pretraineds : DropdownConfig
        Configuration settings for delete pretrained models dropdown
        component.
    traineds : DropdownConfig
        Configuration settings for delete training models dropdown
        component.
    dummy_checkbox : CheckboxConfig
        Configuration settings for a dummy checkbox component.
    """

    voices: DropdownConfig = DropdownConfig.multi_delete(
        label="Voice models",
        info="Select one or more voice models to delete.",
    )
    embedders: DropdownConfig = DropdownConfig.multi_delete(
        label="Custom embedder models",
        info="Select one or more embedder models to delete.",
    )
    pretraineds: DropdownConfig = DropdownConfig.multi_delete(
        label="Custom pretrained models",
        info="Select one or more pretrained models to delete.",
    )
    traineds: DropdownConfig = DropdownConfig.multi_delete(
        label="Training models",
        info="Select one or more training models to delete.",
    )
    # Hidden, value-excluded placeholder — presumably used as a dummy
    # event input/output somewhere in the tab wiring; confirm.
    dummy_checkbox: CheckboxConfig = CheckboxConfig(
        value=False,
        visible=False,
        exclude_value=True,
    )
class AudioManagementConfig(BaseModel):
    """
    Configuration settings for audio management tab.

    Attributes
    ----------
    intermediate : DropdownConfig
        Configuration settings for delete intermediate audio files
        dropdown component.
    speech : DropdownConfig
        Configuration settings for delete speech audio files dropdown
        component.
    output : DropdownConfig
        Configuration settings for delete output audio files dropdown
        component.
    dataset : DropdownConfig
        Configuration settings for delete dataset audio files dropdown
        component.
    dummy_checkbox : CheckboxConfig
        Configuration settings for a dummy checkbox component.
    """

    intermediate: DropdownConfig = DropdownConfig.multi_delete(
        label="Song directories",
        info=(
            "Select one or more song directories containing intermediate audio files to"
            " delete."
        ),
    )
    speech: DropdownConfig = DropdownConfig.multi_delete(
        label="Speech audio files",
        info="Select one or more speech audio files to delete.",
    )
    output: DropdownConfig = DropdownConfig.multi_delete(
        label="Output audio files",
        info="Select one or more output audio files to delete.",
    )
    dataset: DropdownConfig = DropdownConfig.multi_delete(
        label="Dataset audio files",
        info="Select one or more datasets containing audio files to delete.",
    )
    # Hidden, value-excluded placeholder — presumably used as a dummy
    # event input/output somewhere in the tab wiring; confirm.
    dummy_checkbox: CheckboxConfig = CheckboxConfig(
        value=False,
        visible=False,
        exclude_value=True,
    )
class SettingsManagementConfig(BaseModel):
    """
    Configuration settings for settings management tab.

    Attributes
    ----------
    load_config_name : DropdownConfig
        Configuration settings for the load-configuration dropdown
        component.
    delete_config_names : DropdownConfig
        Configuration settings for the delete-configurations dropdown
        component.
    dummy_checkbox : CheckboxConfig
        Configuration settings for a dummy checkbox component.
    """

    load_config_name: DropdownConfig = DropdownConfig(
        label="Configuration name",
        info="The name of a configuration to load UI settings from",
        value=None,
        render=False,
        exclude_value=True,
    )
    delete_config_names: DropdownConfig = DropdownConfig.multi_delete(
        label="Configuration names",
        info="Select the name of one or more configurations to delete",
    )
    # Hidden, value-excluded placeholder — presumably used as a dummy
    # event input/output somewhere in the tab wiring; confirm.
    dummy_checkbox: CheckboxConfig = CheckboxConfig(
        value=False,
        visible=False,
        exclude_value=True,
    )
class TotalSongGenerationConfig(BaseModel):
    """
    All configuration settings for song generation tabs.

    Attributes
    ----------
    one_click : OneClickSongGenerationConfig
        Configuration settings for the one-click song generation tab.
    multi_step : MultiStepSongGenerationConfig
        Configuration settings for the multi-step song generation tab.
    """

    one_click: OneClickSongGenerationConfig = OneClickSongGenerationConfig()
    multi_step: MultiStepSongGenerationConfig = MultiStepSongGenerationConfig()
class TotalSpeechGenerationConfig(BaseModel):
    """
    All configuration settings for speech generation tabs.

    Attributes
    ----------
    one_click : OneClickSpeechGenerationConfig
        Configuration settings for the one-click speech generation tab.
    multi_step : MultiStepSpeechGenerationConfig
        Configuration settings for the multi-step speech generation tab.
    """

    one_click: OneClickSpeechGenerationConfig = OneClickSpeechGenerationConfig()
    multi_step: MultiStepSpeechGenerationConfig = MultiStepSpeechGenerationConfig()
class TotalTrainingConfig(BaseModel):
    """
    All configuration settings for training tabs.

    Attributes
    ----------
    multi_step : MultiStepTrainingConfig
        Configuration settings for the multi-step training tab.
    """

    multi_step: MultiStepTrainingConfig = MultiStepTrainingConfig()
class TotalManagementConfig(BaseModel):
    """
    All configuration settings for management tabs.

    Attributes
    ----------
    model : ModelManagementConfig
        Configuration settings for the model management tab.
    audio : AudioManagementConfig
        Configuration settings for the audio management tab.
    settings : SettingsManagementConfig
        Configuration settings for the settings management tab.
    """

    model: ModelManagementConfig = ModelManagementConfig()
    audio: AudioManagementConfig = AudioManagementConfig()
    settings: SettingsManagementConfig = SettingsManagementConfig()
class TotalConfig(BaseModel):
    """
    Root model aggregating every configuration setting of the Ultimate
    RVC app.

    Attributes
    ----------
    song : TotalSongGenerationConfig
        Configuration settings for song generation tabs.
    speech : TotalSpeechGenerationConfig
        Configuration settings for speech generation tabs.
    training : TotalTrainingConfig
        Configuration settings for training tabs.
    management : TotalManagementConfig
        Configuration settings for management tabs.
    """

    song: TotalSongGenerationConfig = TotalSongGenerationConfig()
    speech: TotalSpeechGenerationConfig = TotalSpeechGenerationConfig()
    training: TotalTrainingConfig = TotalTrainingConfig()
    management: TotalManagementConfig = TotalManagementConfig()

    @cached_property
    def all(self) -> list[AnyComponentConfig]:
        """
        Recursively collect those component configuration models nested
        within the current model instance, which have values that are
        not excluded.

        Returns
        -------
        list[AnyComponentConfig]
            Component configuration models found within this model
            whose values are not excluded.
        """

        def _collect(model: BaseModel) -> list[AnyComponentConfig]:
            found: list[Any] = []
            # Pydantic models iterate as (field_name, value) pairs.
            for _field_name, child in model:
                if isinstance(child, ComponentConfig):
                    # Excluded component configs are dropped outright;
                    # deliberately no recursion into them.
                    if not child.exclude_value:
                        found.append(child)
                elif isinstance(child, BaseModel):
                    found += _collect(child)
            return found

        return _collect(self)
class BaseTabConfig(BaseModel):
    """
    Base model defining common component configuration settings for
    UI tabs.

    Attributes
    ----------
    embedder_model : DropdownConfig
        Configuration settings for an embedder model dropdown component.
    custom_embedder_model : DropdownConfig
        Configuration settings for a custom embedder model dropdown
        component.
    """

    # NOTE(review): EmbedderModel here resolves to the enum re-defined
    # above in this file (which shadows the one imported from
    # ultimate_rvc.typing_extra and adds a "Crusty" member) — confirm
    # which definition is intended.
    embedder_model: DropdownConfig = DropdownConfig(
        label="Embedder model",
        info="The model to use for generating speaker embeddings.",
        value=EmbedderModel.CONTENTVEC,
        choices=list(EmbedderModel),
        exclude_value=True,
    )
    # Hidden until the "custom" embedder type is selected; rendered
    # lazily (render=False) and excluded from persisted values.
    custom_embedder_model: DropdownConfig = DropdownConfig(
        label="Custom embedder model",
        info="Select a custom embedder model from the dropdown.",
        value=None,
        visible=False,
        render=False,
        exclude_value=True,
    )
class TrainingConfig(BaseTabConfig):
    """
    Common component configuration settings for training tabs.

    Attributes
    ----------
    dataset_type : DropdownConfig
        Configuration settings for a dataset type dropdown component.
    dataset : DropdownConfig
        Configuration settings for a dataset dropdown component.
    dataset_name : TextboxConfig
        Configuration settings for a dataset name textbox component.
    preprocess_model : DropdownConfig
        Configuration settings for a model name dropdown component
        for audio preprocessing.
    sample_rate : DropdownConfig
        Configuration settings for a sample rate dropdown component.
    filter_audio : CheckboxConfig
        Configuration settings for a filter audio checkbox component.
    clean_audio : CheckboxConfig
        Configuration settings for a clean audio checkbox component.
    clean_strength : SliderConfig
        Configuration settings for a clean strength slider component.
    split_method : DropdownConfig
        Configuration settings for an audio splitting method dropdown
        component.
    chunk_len : SliderConfig
        Configuration settings for a chunk length slider component.
    overlap_len : SliderConfig
        Configuration settings for an overlap length slider component.
    preprocess_cores : SliderConfig
        Configuration settings for a CPU cores slider component for
        preprocessing.
    extract_model : DropdownConfig
        Configuration settings for a model name dropdown component for
        feature extraction.
    f0_method : DropdownConfig
        Configuration settings for an F0 method dropdown component.
    hop_length : SliderConfig
        Configuration settings for a hop length slider component.
    include_mutes : SliderConfig
        Configuration settings for an include mutes slider component.
    extraction_cores : SliderConfig
        Configuration settings for a CPU cores slider component for
        feature extraction.
    extraction_acceleration : DropdownConfig
        Configuration settings for a hardware acceleration component for
        feature extraction.
    extraction_gpus : DropdownConfig
        Configuration settings for a GPU dropdown component for feature
        extraction.
    train_model : DropdownConfig
        Configuration settings for a model name dropdown component for
        training.
    num_epochs : SliderConfig
        Configuration settings for a number of epochs slider component.
    batch_size : SliderConfig
        Configuration settings for a batch size slider component.
    detect_overtraining : CheckboxConfig
        Configuration settings for a detect overtraining checkbox
        component.
    overtraining_threshold : SliderConfig
        Configuration settings for an overtraining threshold slider
        component.
    vocoder : DropdownConfig
        Configuration settings for a vocoder dropdown component.
    index_algorithm : DropdownConfig
        Configuration settings for an index algorithm dropdown
        component.
    pretrained_type : DropdownConfig
        Configuration settings for a pretrained model type dropdown
        component.
    custom_pretrained_model : DropdownConfig
        Configuration settings for a custom pretrained model dropdown
        component.
    save_interval : SliderConfig
        Configuration settings for a save-interval slider component.
    save_all_checkpoints : CheckboxConfig
        Configuration settings for a save-all-checkpoints checkbox
        component.
    save_all_weights : CheckboxConfig
        Configuration settings for a save-all-weights checkbox
        component.
    clear_saved_data : CheckboxConfig
        Configuration settings for a clear-saved-data checkbox
        component.
    upload_model : CheckboxConfig
        Configuration settings for an upload voice model checkbox
        component.
    upload_name : TextboxConfig
        Configuration settings for an upload name textbox component.
    training_acceleration : DropdownConfig
        Configuration settings for a hardware acceleration component for
        training.
    training_gpus : DropdownConfig
        Configuration settings for a GPU dropdown component for
        training.
    preload_dataset : CheckboxConfig
        Configuration settings for a preload dataset checkbox component.
    reduce_memory_usage : CheckboxConfig
        Configuration settings for a reduce-memory-usage checkbox
        component.

    See Also
    --------
    BaseTabConfig
        Parent model defining common component configuration settings
        for UI tabs.

    """

    dataset_type: DropdownConfig = DropdownConfig(
        label="Dataset type",
        info="Select the type of dataset to preprocess.",
        value=DatasetType.NEW_DATASET,
        choices=list(DatasetType),
        exclude_value=True,
    )
    dataset: DropdownConfig = DropdownConfig(
        label="Dataset path",
        info=(
            "The path to an existing dataset. Either select a path to a previously"
            " created dataset or provide a path to an external dataset."
        ),
        value=None,
        allow_custom_value=True,
        visible=False,
        render=False,
        exclude_value=True,
    )
    dataset_name: TextboxConfig = TextboxConfig(
        label="Dataset name",
        info=(
            "The name of the new dataset. If the dataset already exists, the provided"
            " audio files will be added to it."
        ),
        value="My dataset",
        exclude_value=True,
    )
    preprocess_model: DropdownConfig = DropdownConfig(
        label="Model name",
        info=(
            "Name of the model to preprocess the given dataset for. Either select an"
            " existing model from the dropdown or provide the name of a new model."
        ),
        value="My model",
        allow_custom_value=True,
        render=False,
        exclude_value=True,
    )
    sample_rate: DropdownConfig = DropdownConfig(
        label="Sample rate",
        info="Target sample rate for the audio files in the provided dataset.",
        value=TrainingSampleRate.HZ_40K,
        choices=list(TrainingSampleRate),
    )
    filter_audio: CheckboxConfig = CheckboxConfig(
        label="Filter audio",
        info=(
            "Whether to remove low-frequency sounds from the audio files in the"
            " provided dataset by applying a high-pass butterworth filter.<br><br>"
        ),
        value=True,
    )
    clean_audio: CheckboxConfig = CheckboxConfig(
        label="Clean audio",
        info=(
            "Whether to clean the audio files in the provided dataset using noise"
            " reduction algorithms.<br><br><br>"
        ),
        value=False,
        exclude_value=True,
    )
    clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False)
    split_method: DropdownConfig = DropdownConfig(
        label="Audio splitting method",
        info=(
            "The method to use for splitting the audio files in the provided dataset."
            " Use the `Skip` method to skip splitting if the audio files are already"
            " split. Use the `Simple` method if excessive silence has already been"
            " removed from the audio files. Use the `Automatic` method for automatic"
            " silence detection and splitting around it."
        ),
        value=AudioSplitMethod.AUTOMATIC,
        choices=list(AudioSplitMethod),
        exclude_value=True,
    )
    chunk_len: SliderConfig = SliderConfig(
        label="Chunk length",
        info="Length of split audio chunks.",
        value=3.0,
        minimum=0.5,
        maximum=5.0,
        step=0.1,
        visible=False,
    )
    overlap_len: SliderConfig = SliderConfig(
        label="Overlap length",
        info="Length of overlap between split audio chunks.",
        value=0.3,
        minimum=0.0,
        maximum=0.4,
        step=0.1,
        visible=False,
    )
    preprocess_cores: SliderConfig = SliderConfig.cpu_cores()
    extract_model: DropdownConfig = DropdownConfig(
        label="Model name",
        info=(
            "Name of the model with an associated preprocessed dataset to extract"
            " training features from. When a new dataset is preprocessed, its"
            " associated model is selected by default."
        ),
        value=None,
        render=False,
        exclude_value=True,
    )
    f0_method: DropdownConfig = DropdownConfig(
        label="F0 method",
        info="The method to use for extracting pitch features.",
        value=TrainingF0Method.RMVPE,
        choices=list(TrainingF0Method),
        exclude_value=True,
    )
    hop_length: SliderConfig = SliderConfig.hop_length(
        label="Hop length",
        info="The hop length to use for extracting pitch features.<br><br>",
        visible=False,
    )
    include_mutes: SliderConfig = SliderConfig(
        label="Include mutes",
        info=(
            "The number of mute audio files to include in the generated training file"
            " list. Adding silent files enables the training model to handle pure"
            " silence in inferred audio files. If the preprocessed audio dataset"
            " already contains segments of pure silence, set this to 0."
        ),
        value=0,
        minimum=0,
        maximum=10,
        step=1,
    )
    extraction_cores: SliderConfig = SliderConfig.cpu_cores()
    extraction_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration()
    extraction_gpus: DropdownConfig = DropdownConfig.gpu()
    train_model: DropdownConfig = DropdownConfig(
        label="Model name",
        info=(
            "Name of the model to train. When training features are extracted for a new"
            " model, its name is selected by default."
        ),
        value=None,
        render=False,
        exclude_value=True,
    )
    num_epochs: SliderConfig = SliderConfig(
        label="Number of epochs",
        info=(
            "The number of epochs to train the voice model. A higher number can improve"
            " voice model performance but may lead to overtraining."
        ),
        value=500,
        minimum=1,
        maximum=5000,
        step=1,
    )
    batch_size: SliderConfig = SliderConfig(
        label="Batch size",
        info=(
            "The number of samples in each training batch. It is advisable to align"
            " this value with the available VRAM of your GPU."
        ),
        value=16,
        minimum=1,
        maximum=128,
        step=1,
    )
    detect_overtraining: CheckboxConfig = CheckboxConfig(
        label="Detect overtraining",
        info=(
            "Whether to detect overtraining to prevent the voice model from learning"
            " the training data too well and losing the ability to generalize to new"
            " data."
        ),
        value=True,
        exclude_value=True,
    )
    overtraining_threshold: SliderConfig = SliderConfig(
        label="Overtraining threshold",
        info=(
            "The maximum number of epochs to continue training without any observed"
            " improvement in voice model performance."
        ),
        value=500,
        minimum=1,
        maximum=1000,
        visible=False,
    )
    vocoder: DropdownConfig = DropdownConfig(
        label="Vocoder",
        info=(
            "The vocoder to use for audio synthesis during training. HiFi-GAN provides"
            " basic audio fidelity, while RefineGAN provides the highest audio"
            " fidelity."
        ),
        value=Vocoder.HIFI_GAN,
        choices=list(Vocoder),
    )
    index_algorithm: DropdownConfig = DropdownConfig(
        label="Index algorithm",
        info=(
            "The method to use for generating an index file for the trained voice"
            " model. `KMeans` is particularly useful for large datasets."
        ),
        value=IndexAlgorithm.AUTO,
        choices=list(IndexAlgorithm),
    )
    pretrained_type: DropdownConfig = DropdownConfig(
        label="Pretrained model type",
        # Fixed wording: added the missing word "model" after "pretrained".
        info=(
            "The type of pretrained model to finetune the voice model on. `None` will"
            " train the voice model from scratch, while `Default` will use a pretrained"
            " model tailored to the specific voice model architecture. `Custom` will"
            " use a custom pretrained model that you provide."
        ),
        value=PretrainedType.DEFAULT,
        choices=list(PretrainedType),
        exclude_value=True,
    )
    custom_pretrained_model: DropdownConfig = DropdownConfig(
        label="Custom pretrained model",
        info="Select a custom pretrained model to finetune from the dropdown.",
        value=None,
        visible=False,
        render=False,
        exclude_value=True,
    )
    save_interval: SliderConfig = SliderConfig(
        label="Save interval",
        # Fixed wording: removed duplicated "to" ("at which to to save").
        info=(
            "The epoch interval at which to save voice model weights and"
            " checkpoints. The best model weights are always saved regardless of this"
            " setting."
        ),
        value=10,
        minimum=1,
        maximum=100,
        step=1,
    )
    save_all_checkpoints: CheckboxConfig = CheckboxConfig(
        label="Save all checkpoints",
        info=(
            "Whether to save a unique checkpoint at each save interval. If not enabled,"
            " only the latest checkpoint will be saved at each interval."
        ),
        value=True,
    )
    save_all_weights: CheckboxConfig = CheckboxConfig(
        label="Save all weights",
        info=(
            "Whether to save unique voice model weights at each save interval. If not"
            " enabled, only the best voice model weights will be saved."
        ),
        value=True,
    )
    clear_saved_data: CheckboxConfig = CheckboxConfig(
        label="Clear saved data",
        info=(
            "Whether to delete any existing training data associated with the voice"
            " model before training commences. Enable this setting only if you are"
            " training a new voice model from scratch or restarting training."
        ),
        value=False,
    )
    upload_model: CheckboxConfig = CheckboxConfig(
        label="Upload voice model",
        info=(
            "Whether to automatically upload the trained voice model so that it can be"
            " used for generation tasks within the Ultimate RVC app."
        ),
        value=False,
        exclude_value=True,
    )
    upload_name: TextboxConfig = TextboxConfig(
        label="Upload name",
        info="The name to give the uploaded voice model.",
        value=None,
        visible=False,
        exclude_value=True,
    )
    training_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration()
    training_gpus: DropdownConfig = DropdownConfig.gpu()
    preload_dataset: CheckboxConfig = CheckboxConfig(
        label="Preload dataset",
        info=(
            "Whether to preload all training data into GPU memory. This can improve"
            " training speed but requires a lot of VRAM.<br><br>"
        ),
        value=True,
    )
    reduce_memory_usage: CheckboxConfig = CheckboxConfig(
        label="Reduce memory usage",
        info=(
            "Whether to reduce VRAM usage at the cost of slower training speed by"
            " enabling activation checkpointing. This is useful for GPUs with limited"
            " memory (e.g., <6GB VRAM) or when training with a batch size larger than"
            " what your GPU can normally accommodate."
        ),
        value=False,
    )
class GenerationConfig(BaseTabConfig):
    """
    Common component configuration settings for generation tabs.

    Attributes
    ----------
    voice_model : DropdownConfig
        Configuration settings for a voice model dropdown component.
    f0_methods : DropdownConfig
        Configuration settings for a pitch extraction algorithms
        dropdown component.
    index_rate : SliderConfig
        Configuration settings for an index rate slider component.
    rms_mix_rate : SliderConfig
        Configuration settings for a RMS mix rate slider component.
    protect_rate : SliderConfig
        Configuration settings for a protect rate slider component.
    hop_length : SliderConfig
        Configuration settings for a hop length slider component.
    split_voice : CheckboxConfig
        Configuration settings for a split voice checkbox component.
    autotune_voice: CheckboxConfig
        Configuration settings for an autotune voice checkbox component.
    autotune_strength: SliderConfig
        Configuration settings for an autotune strength slider
        component.
    sid : NumberConfig
        Configuration settings for a speaker ID number component.
    output_sr : DropdownConfig
        Configuration settings for an output sample rate dropdown
        component.
    output_format : DropdownConfig
        Configuration settings for an output format dropdown
        component.
    output_name : TextboxConfig
        Configuration settings for an output name textbox component.

    See Also
    --------
    BaseTabConfig
        Parent model defining common component configuration settings
        for UI tabs.

    """

    voice_model: DropdownConfig = DropdownConfig(
        label="Voice model",
        info="Select a model to use for voice conversion.",
        value=None,
        render=False,
        exclude_value=True,
    )
    f0_methods: DropdownConfig = DropdownConfig(
        label="Pitch extraction algorithm(s)",
        info=(
            "If more than one method is selected, then the median of the pitch values"
            " extracted by each method is used. RMVPE is recommended for most cases and"
            " is the default when no method is selected."
        ),
        value=[F0Method.RMVPE],
        choices=list(F0Method),
        multiselect=True,
    )
    index_rate: SliderConfig = SliderConfig(
        label="Index rate",
        info=(
            "Increase to bias the conversion towards the accent of the voice model."
            " Decrease to potentially reduce artifacts coming from the voice"
            " model.<br><br><br>"
        ),
        value=0.3,
        minimum=0.0,
        maximum=1.0,
    )
    rms_mix_rate: SliderConfig = SliderConfig(
        label="RMS mix rate",
        info=(
            "How much to mimic the loudness (0) of the input voice or a fixed loudness"
            " (1). A value of 1 is recommended for most cases.<br><br>"
        ),
        value=1.0,
        minimum=0.0,
        maximum=1.0,
    )
    protect_rate: SliderConfig = SliderConfig(
        label="Protect rate",
        info=(
            "Controls the extent to which consonants and breathing sounds are protected"
            " from artifacts. A higher value offers more protection but may worsen the"
            " indexing effect.<br><br>"
        ),
        value=0.33,
        minimum=0.0,
        maximum=0.5,
    )
    hop_length: SliderConfig = SliderConfig.hop_length(
        label="Hop length",
        info=(
            "How often the CREPE-based pitch extraction method checks for pitch changes"
            " measured in milliseconds. Lower values lead to longer conversion times"
            " and a higher risk of voice cracks, but better pitch accuracy."
        ),
        visible=True,
    )
    split_voice: CheckboxConfig = CheckboxConfig(
        label="Split input voice",
        info=(
            "Whether to split the input voice track into smaller segments before"
            " converting it. This can improve output quality for longer voice tracks."
        ),
        value=False,
    )
    autotune_voice: CheckboxConfig = CheckboxConfig(
        label="Autotune converted voice",
        info="Whether to apply autotune to the converted voice.<br><br>",
        value=False,
        exclude_value=True,
    )
    autotune_strength: SliderConfig = SliderConfig(
        label="Autotune intensity",
        info=(
            "Higher values result in stronger snapping to the chromatic grid and"
            " artifacting."
        ),
        value=1.0,
        minimum=0.0,
        maximum=1.0,
        visible=False,
    )
    sid: NumberConfig = NumberConfig(
        label="Speaker ID",
        info="Speaker ID for multi-speaker-models.",
        value=0,
        precision=0,
    )
    output_sr: DropdownConfig = DropdownConfig(
        label="Output sample rate",
        info="The sample rate of the mixed output track.",
        value=SampleRate.HZ_44100,
        choices=list(SampleRate),
    )
    output_format: DropdownConfig = DropdownConfig(
        label="Output format",
        info="The audio format of the mixed output track.",
        value=AudioExt.MP3,
        choices=list(AudioExt),
    )
    output_name: TextboxConfig = TextboxConfig(
        label="Output name",
        info="If no name is provided, a suitable name will be generated automatically.",
        value=None,
        placeholder="Ultimate RVC output",
        exclude_value=True,
    )
class SongGenerationConfig(GenerationConfig):
    """
    Common component configuration settings for song generation tabs.

    Attributes
    ----------
    source_type : DropdownConfig
        Configuration settings for a source type dropdown component.
    source : TextboxConfig
        Configuration settings for an input source textbox component.
    cached_song : DropdownConfig
        Configuration settings for a cached song dropdown component.
    clean_voice : CheckboxConfig
        Configuration settings for a clean voice checkbox component.
    clean_strength : SliderConfig
        Configuration settings for a clean strength slider component.
    room_size : SliderConfig
        Configuration settings for a room size slider component.
    wet_level : SliderConfig
        Configuration settings for a wetness level slider component.
    dry_level : SliderConfig
        Configuration settings for a dryness level slider component.
    damping : SliderConfig
        Configuration settings for a damping level slider component.
    main_gain : SliderConfig
        Configuration settings for a main gain slider component.
    inst_gain : SliderConfig
        Configuration settings for an instrumentals gain slider
        component.
    backup_gain : SliderConfig
        Configuration settings for a backup vocals gain slider
        component.

    See Also
    --------
    GenerationConfig
        Parent model defining common component configuration settings
        for song generation tabs.

    """

    source_type: DropdownConfig = DropdownConfig(
        label="Source type",
        info="The type of source to retrieve a song from.",
        value=SongSourceType.LOCAL_FILE,
        choices=list(SongSourceType),
        type="index",
        exclude_value=True,
    )
    source: TextboxConfig = TextboxConfig(
        label="Source",
        # Fixed user-facing typo: "wont" -> "won't".
        info=(
            "Local (to the server) filepath or http link. Youtube probably won't work"
            " but most other sites still do."
        ),
        value=None,
        exclude_value=True,
    )
    cached_song: DropdownConfig = DropdownConfig(
        label="Source",
        info="Select a song from the list of cached songs.",
        value=None,
        visible=False,
        render=False,
        exclude_value=True,
    )
    clean_voice: CheckboxConfig = CheckboxConfig(
        label="Clean converted voice",
        info=(
            "Whether to clean the converted voice using noise reduction"
            " algorithms.<br><br>"
        ),
        value=False,
        exclude_value=True,
    )
    clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False)
    room_size: SliderConfig = SliderConfig(
        label="Room size",
        info=(
            "Size of the room which reverb effect simulates. Increase for longer reverb"
            " time."
        ),
        value=0.15,
        minimum=0.0,
        maximum=1.0,
    )
    wet_level: SliderConfig = SliderConfig(
        label="Wetness level",
        info="Loudness of converted vocals with reverb effect applied.",
        value=0.2,
        minimum=0.0,
        maximum=1.0,
    )
    dry_level: SliderConfig = SliderConfig(
        label="Dryness level",
        info="Loudness of converted vocals without reverb effect applied.",
        value=0.8,
        minimum=0.0,
        maximum=1.0,
    )
    damping: SliderConfig = SliderConfig(
        label="Damping level",
        info="Absorption of high frequencies in reverb effect.",
        value=0.7,
        minimum=0.0,
        maximum=1.0,
    )
    main_gain: SliderConfig = SliderConfig.gain(
        label="Main gain",
        info="The gain to apply to the main vocals.",
    )
    inst_gain: SliderConfig = SliderConfig.gain(
        label="Instrumentals gain",
        info="The gain to apply to the instrumentals.",
    )
    backup_gain: SliderConfig = SliderConfig.gain(
        label="Backup gain",
        info="The gain to apply to the backup vocals.",
    )
class SpeechGenerationConfig(GenerationConfig):
    """
    Common component configuration settings for speech generation tabs.

    Attributes
    ----------
    source_type : DropdownConfig
        Configuration settings for a source type dropdown component.
    source : TextboxConfig
        Configuration settings for an input source textbox component.
    edge_tts_voice : DropdownConfig
        Configuration settings for an Edge TTS voice dropdown
        component.
    n_octaves : SliderConfig
        Configuration settings for an octave pitch shift slider
        component.
    n_semitones : SliderConfig
        Configuration settings for a semitone pitch shift slider
        component.
    tts_pitch_shift : SliderConfig
        Configuration settings for a TTS pitch shift slider
        component.
    tts_speed_change : SliderConfig
        Configuration settings for a TTS speed change slider
        component.
    tts_volume_change : SliderConfig
        Configuration settings for a TTS volume change slider
        component.
    clean_voice : CheckboxConfig
        Configuration settings for a clean voice checkbox
        component.
    clean_strength : SliderConfig
        Configuration settings for a clean strength slider
        component.
    output_gain : SliderConfig
        Configuration settings for an output gain slider component.

    See Also
    --------
    GenerationConfig
        Parent model defining common component configuration settings
        for generation tabs.

    """

    source_type: DropdownConfig = DropdownConfig(
        label="Source type",
        info="The type of source to generate speech from.",
        value=SpeechSourceType.TEXT,
        choices=list(SpeechSourceType),
        type="index",
        exclude_value=True,
    )
    source: TextboxConfig = TextboxConfig(
        label="Source",
        info="Text to generate speech from",
        value=None,
        exclude_value=True,
    )
    edge_tts_voice: DropdownConfig = DropdownConfig(
        label="Edge TTS voice",
        info="Select a voice to use for text to speech conversion.",
        value=None,
        render=False,
        exclude_value=True,
    )
    n_octaves: SliderConfig = SliderConfig.octave_shift(
        label="Octave shift",
        info=(
            "The number of octaves to pitch-shift the converted speech by. Use 1 for"
            " male-to-female and -1 for vice-versa."
        ),
    )
    n_semitones: SliderConfig = SliderConfig.semitone_shift(
        label="Semitone shift",
        info="The number of semi-tones to pitch-shift the converted speech by.",
    )
    tts_pitch_shift: SliderConfig = SliderConfig(
        label="Edge TTS pitch shift",
        info=(
            "The number of hertz to shift the pitch of the speech generated by Edge"
            " TTS."
        ),
        value=0,
        minimum=-100,
        maximum=100,
        step=1,
    )
    tts_speed_change: SliderConfig = SliderConfig(
        label="TTS speed change",
        # Fixed user-facing wording: "percentual" -> "percentage".
        info="The percentage change to the speed of the speech generated by Edge TTS.",
        value=0,
        minimum=-50,
        maximum=100,
        step=1,
    )
    tts_volume_change: SliderConfig = SliderConfig(
        label="TTS volume change",
        # Fixed user-facing wording: "percentual" -> "percentage".
        info="The percentage change to the volume of the speech generated by Edge TTS.",
        value=0,
        minimum=-100,
        maximum=100,
        step=1,
    )
    clean_voice: CheckboxConfig = CheckboxConfig(
        label="Clean converted voice",
        info=(
            "Whether to clean the converted voice using noise reduction"
            " algorithms.<br><br>"
        ),
        value=True,
        exclude_value=True,
    )
    clean_strength: SliderConfig = SliderConfig.clean_strength(visible=True)
    output_gain: SliderConfig = SliderConfig.gain(
        label="Output gain",
        info="The gain to apply to the converted speech.<br><br>",
    )
total_config = load_config(config_name, TotalConfig) if config_name else TotalConfig()
def render_song_cover_multi_step_tab(
    total_config: TotalConfig, cookiefile: str | None = None
) -> None:
    """
    Render "Generate song cover - multi-step generation" tab.

    Parameters
    ----------
    total_config : TotalConfig
        Model containing all component configuration settings for the
        Ultimate RVC web UI.
    cookiefile : str, optional
        The path to a file containing cookies to use when downloading
        audio from Youtube.

    """
    multi_step_config = total_config.song.multi_step
    # Instantiate every input-audio component up front so the step
    # renderers below can reference their instances when wiring events.
    for track_config in multi_step_config.input_audio.all:
        track_config.instantiate()
    with gr.Tab("Multi-step"):
        # Step 0 needs the full config (it updates dropdowns in other
        # tabs) plus the cookie file; steps 1-4 only need this tab's
        # config, and step 5 needs both.
        _render_step_0(total_config, cookiefile=cookiefile)
        for step_renderer in (
            _render_step_1,
            _render_step_2,
            _render_step_3,
            _render_step_4,
        ):
            step_renderer(multi_step_config)
        _render_step_5(total_config, multi_step_config)
def _render_step_0(total_config: TotalConfig, cookiefile: str | None) -> None:
    # Render the "Step 0: song retrieval" accordion of the multi-step
    # song-cover tab and wire up its events.
    tab_config = total_config.song.multi_step
    # Holds the directory of the most recently retrieved song; fed into
    # the dropdown-update chain after retrieval completes.
    current_song_dir = gr.State(None)
    with gr.Accordion("Step 0: song retrieval", open=True):
        gr.Markdown("")
        with gr.Row():
            with gr.Column():
                tab_config.source_type.instantiate()
            with gr.Column():
                tab_config.source.instantiate()
                local_file = gr.Audio(
                    label="Source",
                    type="filepath",
                    visible=False,
                    waveform_options=gr.WaveformOptions(show_recording_waveform=True),
                )
                tab_config.cached_song.instance.render()
        # Show exactly one of the three source inputs depending on the
        # selected source type (the dropdown reports its value as an
        # index — see the `type="index"` setting on `source_type`).
        tab_config.source_type.instance.input(
            partial(toggle_visible_component, 3),
            inputs=tab_config.source_type.instance,
            outputs=[
                tab_config.source.instance,
                local_file,
                tab_config.cached_song.instance,
            ],
            show_progress="hidden",
        )
        # Mirror selections from the file upload / cached-song dropdown
        # into the shared source textbox, which the retrieval handler reads.
        local_file.change(
            update_value,
            inputs=local_file,
            outputs=tab_config.source.instance,
            show_progress="hidden",
        )
        tab_config.cached_song.instance.input(
            update_value,
            inputs=tab_config.cached_song.instance,
            outputs=tab_config.source.instance,
            show_progress="hidden",
        )
        with gr.Accordion("Options", open=False):
            song_transfer = _render_song_transfer(
                [SongTransferOption.STEP_1_AUDIO],
                "Song",
            )
        with gr.Row():
            retrieve_song_reset_btn = gr.Button("Reset options")
            retrieve_song_btn = gr.Button("Retrieve song", variant="primary")
            song_transfer_btn = gr.Button("Transfer song")
        song_output = gr.Audio(
            label="Song",
            type="filepath",
            interactive=False,
            waveform_options=gr.WaveformOptions(show_recording_waveform=True),
        )
        retrieve_song_reset_btn.click(
            lambda: gr.Dropdown(value=[SongTransferOption.STEP_1_AUDIO]),
            outputs=song_transfer,
            show_progress="hidden",
        )
        # Retrieve the song, then refresh every dropdown that lists cached
        # songs: this tab's song-dir dropdowns plus the cached-song
        # dropdowns of this tab and the one-click tab, and finally the
        # intermediate-audio dropdown on the audio management tab.
        retrieve_song_btn.click(
            partial(
                exception_harness(
                    retrieve_song,
                    info_msg="Song retrieved successfully!",
                ),
                cookiefile=cookiefile,
            ),
            inputs=tab_config.source.instance,
            outputs=[song_output, current_song_dir],
        ).then(
            partial(
                update_dropdowns,
                get_named_song_dirs,
                # +2 covers the two cached-song dropdowns appended below.
                len(tab_config.song_dirs.all) + 2,
                value_indices=range(len(tab_config.song_dirs.all)),
            ),
            inputs=current_song_dir,
            outputs=[
                *tab_config.song_dirs.all,
                tab_config.cached_song.instance,
                total_config.song.one_click.cached_song.instance,
            ],
            show_progress="hidden",
        ).then(
            partial(update_dropdowns, get_named_song_dirs, 1, [], [0]),
            outputs=total_config.management.audio.intermediate.instance,
            show_progress="hidden",
        )
        setup_transfer_event(
            song_transfer_btn,
            song_transfer,
            song_output,
            tab_config.input_audio.all,
        )
def _render_step_1(tab_config: MultiStepSongGenerationConfig) -> None:
    # Render the "Step 1: vocal separation" accordion and wire its events.
    with gr.Accordion("Step 1: vocal separation", open=False):
        tab_config.input_audio.audio.instance.render()
        tab_config.song_dirs.separate_audio.instance.render()
        with gr.Accordion("Options", open=False):
            with gr.Row():
                tab_config.separation_model.instantiate()
                tab_config.segment_size.instantiate()
            with gr.Row():
                # Default transfer destinations: vocals go to step 2,
                # the instrumental stem goes to step 4.
                primary_stem_transfer = _render_song_transfer(
                    [SongTransferOption.STEP_2_VOCALS],
                    "Primary stem",
                )
                secondary_stem_transfer = _render_song_transfer(
                    [SongTransferOption.STEP_4_INSTRUMENTALS],
                    "Secondary stem",
                )
        with gr.Row():
            separate_audio_reset_btn = gr.Button("Reset options")
            separate_vocals_btn = gr.Button("Separate vocals", variant="primary")
        with gr.Row():
            primary_stem_transfer_btn = gr.Button("Transfer primary stem")
            secondary_stem_transfer_btn = gr.Button("Transfer secondary stem")
        with gr.Row():
            primary_stem_output = gr.Audio(
                label="Primary stem",
                type="filepath",
                interactive=False,
                waveform_options=gr.WaveformOptions(show_recording_waveform=True),
            )
            secondary_stem_output = gr.Audio(
                label="Secondary stem",
                type="filepath",
                interactive=False,
                waveform_options=gr.WaveformOptions(show_recording_waveform=True),
            )
        # Restore option components to their configured defaults.
        separate_audio_reset_btn.click(
            lambda: [
                tab_config.separation_model.value,
                tab_config.segment_size.value,
                gr.Dropdown(value=[SongTransferOption.STEP_2_VOCALS]),
                gr.Dropdown(value=[SongTransferOption.STEP_4_INSTRUMENTALS]),
            ],
            outputs=[
                tab_config.separation_model.instance,
                tab_config.segment_size.instance,
                primary_stem_transfer,
                secondary_stem_transfer,
            ],
            show_progress="hidden",
        )
        # GPU-bound work: serialized via a shared concurrency id/limit.
        separate_vocals_btn.click(
            exception_harness(
                separate_audio,
                info_msg="Vocals separated successfully!",
            ),
            inputs=[
                tab_config.input_audio.audio.instance,
                tab_config.song_dirs.separate_audio.instance,
                tab_config.separation_model.instance,
                tab_config.segment_size.instance,
            ],
            outputs=[primary_stem_output, secondary_stem_output],
            concurrency_limit=1,
            concurrency_id=ConcurrencyId.GPU,
        )
        # Wire identical transfer behavior for both stems.
        for btn, transfer, output in [
            (primary_stem_transfer_btn, primary_stem_transfer, primary_stem_output),
            (
                secondary_stem_transfer_btn,
                secondary_stem_transfer,
                secondary_stem_output,
            ),
        ]:
            setup_transfer_event(
                btn,
                transfer,
                output,
                tab_config.input_audio.all,
            )
def _render_step_2(tab_config: MultiStepSongGenerationConfig) -> None:
    # Render the "Step 2: vocal conversion" accordion and wire its events.
    with gr.Accordion("Step 2: vocal conversion", open=False):
        tab_config.input_audio.vocals.instance.render()
        tab_config.voice_model.instance.render()
        tab_config.song_dirs.convert_vocals.instance.render()
        with gr.Accordion("Options", open=False):
            with gr.Row():
                tab_config.n_octaves.instantiate()
                tab_config.n_semitones.instantiate()
            converted_vocals_transfer = _render_song_transfer(
                [SongTransferOption.STEP_3_VOCALS],
                "Converted vocals",
            )
            with gr.Accordion("Advanced", open=False):
                with gr.Accordion("Voice synthesis", open=False):
                    with gr.Row():
                        tab_config.f0_methods.instantiate()
                        tab_config.index_rate.instantiate()
                    with gr.Row():
                        tab_config.rms_mix_rate.instantiate()
                        tab_config.protect_rate.instantiate()
                    tab_config.hop_length.instantiate()
                with gr.Accordion("Vocal enrichment", open=False), gr.Row():
                    with gr.Column():
                        tab_config.split_voice.instantiate()
                    with gr.Column():
                        tab_config.autotune_voice.instantiate()
                        tab_config.autotune_strength.instantiate()
                    with gr.Column():
                        tab_config.clean_voice.instantiate()
                        tab_config.clean_strength.instantiate()
                # Show the strength sliders only while their checkbox is on.
                tab_config.autotune_voice.instance.change(
                    partial(toggle_visibility, targets={True}),
                    inputs=tab_config.autotune_voice.instance,
                    outputs=tab_config.autotune_strength.instance,
                    show_progress="hidden",
                )
                tab_config.clean_voice.instance.change(
                    partial(toggle_visibility, targets={True}),
                    inputs=tab_config.clean_voice.instance,
                    outputs=tab_config.clean_strength.instance,
                    show_progress="hidden",
                )
                with gr.Accordion("Speaker embeddings", open=False), gr.Row():
                    with gr.Column():
                        tab_config.embedder_model.instantiate()
                        tab_config.custom_embedder_model.instance.render()
                    tab_config.sid.instantiate()
                # Reveal the custom-embedder dropdown only when the
                # "custom" embedder model is selected.
                tab_config.embedder_model.instance.change(
                    partial(toggle_visibility, targets={EmbedderModel.CUSTOM}),
                    inputs=tab_config.embedder_model.instance,
                    outputs=tab_config.custom_embedder_model.instance,
                    show_progress="hidden",
                )
        with gr.Row():
            convert_vocals_reset_btn = gr.Button("Reset options")
            convert_vocals_btn = gr.Button("Convert vocals", variant="primary")
            converted_vocals_transfer_btn = gr.Button("Transfer converted vocals")
        converted_vocals_track_output = gr.Audio(
            label="Converted vocals",
            type="filepath",
            interactive=False,
            waveform_options=gr.WaveformOptions(show_recording_waveform=True),
        )
        # Restore all conversion options to their configured defaults.
        # The lambda's value order must match `outputs` below exactly.
        convert_vocals_reset_btn.click(
            lambda: [
                tab_config.n_octaves.value,
                tab_config.n_semitones.value,
                tab_config.f0_methods.value,
                tab_config.index_rate.value,
                tab_config.rms_mix_rate.value,
                tab_config.protect_rate.value,
                tab_config.hop_length.value,
                tab_config.split_voice.value,
                tab_config.autotune_voice.value,
                tab_config.autotune_strength.value,
                tab_config.clean_voice.value,
                tab_config.clean_strength.value,
                tab_config.embedder_model.value,
                tab_config.sid.value,
                gr.Dropdown(value=[SongTransferOption.STEP_3_VOCALS]),
            ],
            outputs=[
                tab_config.n_octaves.instance,
                tab_config.n_semitones.instance,
                tab_config.f0_methods.instance,
                tab_config.index_rate.instance,
                tab_config.rms_mix_rate.instance,
                tab_config.protect_rate.instance,
                tab_config.hop_length.instance,
                tab_config.split_voice.instance,
                tab_config.autotune_voice.instance,
                tab_config.autotune_strength.instance,
                tab_config.clean_voice.instance,
                tab_config.clean_strength.instance,
                tab_config.embedder_model.instance,
                tab_config.sid.instance,
                converted_vocals_transfer,
            ],
            show_progress="hidden",
        )
        # GPU-bound conversion: serialized via the shared GPU concurrency id.
        convert_vocals_btn.click(
            partial(
                exception_harness(convert, info_msg="Vocals converted successfully!"),
                content_type=RVCContentType.VOCALS,
            ),
            inputs=[
                tab_config.input_audio.vocals.instance,
                tab_config.song_dirs.convert_vocals.instance,
                tab_config.voice_model.instance,
                tab_config.n_octaves.instance,
                tab_config.n_semitones.instance,
                tab_config.f0_methods.instance,
                tab_config.index_rate.instance,
                tab_config.rms_mix_rate.instance,
                tab_config.protect_rate.instance,
                tab_config.hop_length.instance,
                tab_config.split_voice.instance,
                tab_config.autotune_voice.instance,
                tab_config.autotune_strength.instance,
                tab_config.clean_voice.instance,
                tab_config.clean_strength.instance,
                tab_config.embedder_model.instance,
                tab_config.custom_embedder_model.instance,
                tab_config.sid.instance,
            ],
            outputs=converted_vocals_track_output,
            concurrency_id=ConcurrencyId.GPU,
            concurrency_limit=1,
        )
        setup_transfer_event(
            converted_vocals_transfer_btn,
            converted_vocals_transfer,
            converted_vocals_track_output,
            tab_config.input_audio.all,
        )
def _render_step_3(tab_config: MultiStepSongGenerationConfig) -> None:
    """
    Render the "Step 3: vocal post-processing" accordion of the
    multi-step song generation tab, including its option components,
    action buttons and event wiring.

    Parameters
    ----------
    tab_config : MultiStepSongGenerationConfig
        Component configuration settings for the multi-step song
        generation tab.
    """
    with gr.Accordion("Step 3: vocal post-processing", open=False):
        # Inputs: the converted vocals track and the song directory
        # where post-processing results are stored.
        tab_config.input_audio.converted_vocals.instance.render()
        tab_config.song_dirs.postprocess_vocals.instance.render()
        with gr.Accordion("Options", open=False):
            # Reverb controls applied to the converted vocals.
            tab_config.room_size.instantiate()
            with gr.Row():
                tab_config.wet_level.instantiate()
                tab_config.dry_level.instantiate()
                tab_config.damping.instantiate()
            effected_vocals_transfer = _render_song_transfer(
                [SongTransferOption.STEP_5_MAIN_VOCALS],
                "Effected vocals",
            )
        with gr.Row():
            postprocess_vocals_reset_btn = gr.Button("Reset options")
            postprocess_vocals_btn = gr.Button("Post-process vocals", variant="primary")
            effected_vocals_transfer_btn = gr.Button("Transfer effected vocals")
        effected_vocals_track_output = gr.Audio(
            label="Effected vocals",
            type="filepath",
            interactive=False,
            waveform_options=gr.WaveformOptions(show_recording_waveform=True),
        )
        # Restore the default value of each option component, including
        # the transfer dropdown's default selection.
        postprocess_vocals_reset_btn.click(
            lambda: [
                tab_config.room_size.value,
                tab_config.wet_level.value,
                tab_config.dry_level.value,
                tab_config.damping.value,
                gr.Dropdown(value=[SongTransferOption.STEP_5_MAIN_VOCALS]),
            ],
            outputs=[
                tab_config.room_size.instance,
                tab_config.wet_level.instance,
                tab_config.dry_level.instance,
                tab_config.damping.instance,
                effected_vocals_transfer,
            ],
            show_progress="hidden",
        )
        # Apply the configured post-processing to the converted vocals.
        postprocess_vocals_btn.click(
            exception_harness(
                postprocess,
                info_msg="Vocals post-processed successfully!",
            ),
            inputs=[
                tab_config.input_audio.converted_vocals.instance,
                tab_config.song_dirs.postprocess_vocals.instance,
                tab_config.room_size.instance,
                tab_config.wet_level.instance,
                tab_config.dry_level.instance,
                tab_config.damping.instance,
            ],
            outputs=effected_vocals_track_output,
        )
        # Copy the effected vocals into the input-audio components
        # selected in the transfer dropdown.
        setup_transfer_event(
            effected_vocals_transfer_btn,
            effected_vocals_transfer,
            effected_vocals_track_output,
            tab_config.input_audio.all,
        )
def _render_step_4(tab_config: MultiStepSongGenerationConfig) -> None:
    """
    Render the "Step 4: pitch shift of background audio" accordion of
    the multi-step song generation tab, including its option
    components, action buttons and event wiring.

    Parameters
    ----------
    tab_config : MultiStepSongGenerationConfig
        Component configuration settings for the multi-step song
        generation tab.
    """
    with gr.Accordion("Step 4: pitch shift of background audio", open=False):
        # Inputs: instrumentals and backup vocals tracks plus the song
        # directory where the pitch-shifted results are stored.
        with gr.Row():
            tab_config.input_audio.instrumentals.instance.render()
            tab_config.input_audio.backup_vocals.instance.render()
        with gr.Row():
            tab_config.n_semitones_instrumentals.instantiate()
            tab_config.n_semitones_backup_vocals.instantiate()
        tab_config.song_dirs.pitch_shift_background.instance.render()
        with gr.Accordion("Options", open=False), gr.Row():
            shifted_instrumentals_transfer = _render_song_transfer(
                [SongTransferOption.STEP_5_INSTRUMENTALS],
                "Pitch-shifted instrumentals",
            )
            shifted_backup_vocals_transfer = _render_song_transfer(
                [SongTransferOption.STEP_5_BACKUP_VOCALS],
                "Pitch-shifted backup vocals",
            )
        with gr.Row():
            pitch_shift_instrumentals_btn = gr.Button(
                "Pitch shift instrumentals",
                variant="primary",
            )
            pitch_shift_backup_vocals_btn = gr.Button(
                "Pitch shift backup vocals",
                variant="primary",
            )
        with gr.Row():
            shifted_instrumentals_transfer_btn = gr.Button(
                "Transfer shifted instrumentals",
            )
            shifted_backup_vocals_transfer_btn = gr.Button(
                "Transfer shifted backup vocals",
            )
            pitch_shift_background_reset_btn = gr.Button("Reset options")
        with gr.Row():
            shifted_instrumentals_track_output = gr.Audio(
                label="Pitch-shifted instrumentals",
                type="filepath",
                interactive=False,
                waveform_options=gr.WaveformOptions(show_recording_waveform=True),
            )
            shifted_backup_vocals_track_output = gr.Audio(
                label="Pitch-shifted backup vocals",
                type="filepath",
                interactive=False,
                waveform_options=gr.WaveformOptions(show_recording_waveform=True),
            )
        # Restore the default value of each option component, including
        # the transfer dropdowns' default selections.
        pitch_shift_background_reset_btn.click(
            lambda: [
                tab_config.n_semitones_instrumentals.value,
                tab_config.n_semitones_backup_vocals.value,
                gr.Dropdown(value=[SongTransferOption.STEP_5_INSTRUMENTALS]),
                gr.Dropdown(value=[SongTransferOption.STEP_5_BACKUP_VOCALS]),
            ],
            outputs=[
                tab_config.n_semitones_instrumentals.instance,
                tab_config.n_semitones_backup_vocals.instance,
                shifted_instrumentals_transfer,
                shifted_backup_vocals_transfer,
            ],
            show_progress="hidden",
        )
        # Pitch-shift the instrumentals and backup vocals tracks
        # independently of each other.
        pitch_shift_instrumentals_btn.click(
            exception_harness(
                pitch_shift,
                info_msg="Instrumentals pitch-shifted successfully!",
            ),
            inputs=[
                tab_config.input_audio.instrumentals.instance,
                tab_config.song_dirs.pitch_shift_background.instance,
                tab_config.n_semitones_instrumentals.instance,
            ],
            outputs=shifted_instrumentals_track_output,
        )
        pitch_shift_backup_vocals_btn.click(
            exception_harness(
                pitch_shift,
                info_msg="Backup vocals pitch-shifted successfully!",
            ),
            inputs=[
                tab_config.input_audio.backup_vocals.instance,
                tab_config.song_dirs.pitch_shift_background.instance,
                tab_config.n_semitones_backup_vocals.instance,
            ],
            outputs=shifted_backup_vocals_track_output,
        )
        # Wire one transfer event per (button, dropdown, output) triple.
        for btn, transfer, output in [
            (
                shifted_instrumentals_transfer_btn,
                shifted_instrumentals_transfer,
                shifted_instrumentals_track_output,
            ),
            (
                shifted_backup_vocals_transfer_btn,
                shifted_backup_vocals_transfer,
                shifted_backup_vocals_track_output,
            ),
        ]:
            setup_transfer_event(
                btn,
                transfer,
                output,
                tab_config.input_audio.all,
            )
def _render_step_5(
    total_config: TotalConfig,
    tab_config: MultiStepSongGenerationConfig,
) -> None:
    """
    Render the "Step 5: song mixing" accordion of the multi-step song
    generation tab, including its option components, action buttons
    and event wiring.

    Parameters
    ----------
    total_config : TotalConfig
        Model containing all component configuration settings for the
        Ultimate RVC web UI.
    tab_config : MultiStepSongGenerationConfig
        Component configuration settings for the multi-step song
        generation tab.
    """
    with gr.Accordion("Step 5: song mixing", open=False):
        # Inputs: the three tracks to mix and the song directory where
        # the mixed song cover is stored.
        with gr.Row():
            tab_config.input_audio.main_vocals.instance.render()
            tab_config.input_audio.shifted_instrumentals.instance.render()
            tab_config.input_audio.shifted_backup_vocals.instance.render()
        tab_config.song_dirs.mix.instance.render()
        with gr.Accordion("Options", open=False):
            with gr.Row():
                tab_config.main_gain.instantiate()
                tab_config.inst_gain.instantiate()
                tab_config.backup_gain.instantiate()
            with gr.Row():
                # Output name is derived from the selected main vocals
                # track and song directory.
                tab_config.output_name.instantiate(
                    value=partial(
                        update_output_name,
                        get_song_cover_name,
                        False,  # noqa: FBT003,
                    ),
                    inputs=[
                        tab_config.input_audio.main_vocals.instance,
                        tab_config.song_dirs.mix.instance,
                    ],
                )
                tab_config.output_sr.instantiate()
                tab_config.output_format.instantiate()
            song_cover_transfer = _render_song_transfer([], "Song cover")
        with gr.Row():
            mix_reset_btn = gr.Button("Reset options")
            mix_btn = gr.Button("Mix song cover", variant="primary")
            song_cover_transfer_btn = gr.Button("Transfer song cover")
        song_cover_output = gr.Audio(
            label="Song cover",
            type="filepath",
            interactive=False,
            waveform_options=gr.WaveformOptions(show_recording_waveform=True),
        )
        # Restore the default value of each option component.
        mix_reset_btn.click(
            lambda: [
                tab_config.main_gain.value,
                tab_config.inst_gain.value,
                tab_config.backup_gain.value,
                tab_config.output_sr.value,
                tab_config.output_format.value,
                gr.Dropdown(value=[]),
            ],
            outputs=[
                tab_config.main_gain.instance,
                tab_config.inst_gain.instance,
                tab_config.backup_gain.instance,
                tab_config.output_sr.instance,
                tab_config.output_format.instance,
                song_cover_transfer,
            ],
            show_progress="hidden",
        )
        # Holds the (audio track, gain level) pairs produced by
        # _pair_audio_tracks_and_gain before they are passed to the
        # mixing step.
        temp_audio_gains = gr.State()
        mix_btn.click(
            partial(
                _pair_audio_tracks_and_gain,
                [
                    tab_config.input_audio.main_vocals.instance,
                    tab_config.input_audio.shifted_instrumentals.instance,
                    tab_config.input_audio.shifted_backup_vocals.instance,
                ],
                [
                    tab_config.main_gain.instance,
                    tab_config.inst_gain.instance,
                    tab_config.backup_gain.instance,
                ],
            ),
            # Passing a *set* of inputs makes gradio hand the handler a
            # component -> value mapping, which is what
            # _pair_audio_tracks_and_gain expects as its `data` argument.
            inputs={
                tab_config.input_audio.main_vocals.instance,
                tab_config.input_audio.shifted_instrumentals.instance,
                tab_config.input_audio.shifted_backup_vocals.instance,
                tab_config.main_gain.instance,
                tab_config.inst_gain.instance,
                tab_config.backup_gain.instance,
            },
            outputs=temp_audio_gains,
        ).then(
            # NOTE: fixed a typo in the user-facing message
            # ("succesfully" -> "successfully").
            exception_harness(mix_song, info_msg="Song cover successfully generated."),
            inputs=[
                temp_audio_gains,
                tab_config.song_dirs.mix.instance,
                tab_config.output_sr.instance,
                tab_config.output_format.instance,
                tab_config.output_name.instance,
            ],
            outputs=song_cover_output,
        ).then(
            # Refresh the saved-output-audio dropdown after mixing.
            partial(update_dropdowns, get_saved_output_audio, 1, [], [0]),
            outputs=total_config.management.audio.output.instance,
            show_progress="hidden",
        )
        # Copy the finished song cover into the input-audio components
        # selected in the transfer dropdown.
        setup_transfer_event(
            song_cover_transfer_btn,
            song_cover_transfer,
            song_cover_output,
            tab_config.input_audio.all,
        )
def _render_song_transfer(
    value: list[SongTransferOption],
    label_prefix: str,
) -> gr.Dropdown:
    """
    Render a transfer dropdown for song-generation tracks.

    Parameters
    ----------
    value : list[SongTransferOption]
        The initially selected transfer options.
    label_prefix : str
        Prefix for the dropdown's label.

    Returns
    -------
    gr.Dropdown
        The rendered transfer dropdown.
    """
    transfer_dropdown = render_transfer_component(
        value,
        label_prefix,
        SongTransferOption,
    )
    return transfer_dropdown
def _pair_audio_tracks_and_gain(
audio_components: Sequence[gr.Audio],
gain_components: Sequence[gr.Slider],
data: dict[gr.Audio | gr.Slider, Any],
) -> list[tuple[str, int]]:
"""
Pair audio tracks and gain levels stored in separate gradio
components.
This function is meant to first be partially applied to the sequence
of audio components and the sequence of slider components containing
the values that should be combined. The resulting function can then
be called by an event listener whose inputs is a set containing
those audio and slider components. The `data` parameter in that case
will contain a mapping from each of those components to the value
that the component stores.
Parameters
----------
audio_components : Sequence[gr.Audio]
Audio components to pair with gain levels.
gain_components : Sequence[gr.Slider]
Gain level components to pair with audio tracks.
data : dict[gr.Audio | gr.Slider, Any]
Data from the audio and gain components.
Returns
-------
list[tuple[str, int]]
Paired audio tracks and gain levels.
Raises
------
ValueError
If the number of audio tracks and gain levels are not the same.
"""
audio_tracks = [data[component] for component in audio_components]
gain_levels = [data[component] for component in gain_components]
if len(audio_tracks) != len(gain_levels):
err_msg = "Number of audio tracks and gain levels must be the same."
raise ValueError(err_msg)
return [
(audio_track, gain_level)
for audio_track, gain_level in zip(audio_tracks, gain_levels, strict=True)
if audio_track
]
def render_app() -> gr.Blocks:
    """
    Render the Ultimate RVC web application.

    Returns
    -------
    gr.Blocks
        The rendered web application.
    """
    # Styling for the page header and the top-level tab buttons.
    css = """
    h1 { text-align: center; margin-top: 20px; margin-bottom: 20px; }
    #generate-tab-button { font-weight: bold !important;}
    #manage-tab-button { font-weight: bold !important;}
    #audio-tab-button { font-weight: bold !important;}
    #settings-tab-button { font-weight: bold !important;}
    """
    cache_delete_frequency = 86400  # every 24 hours check for files to delete
    cache_delete_cutoff = 86400  # and delete files older than 24 hours
    with gr.Blocks(
        title="Redzone-6 Audio Playground",
        theme=gr.Theme.load(str(Path(__file__).parent / "config/theme.json")),
        css=css,
        delete_cache=(cache_delete_frequency, cache_delete_cutoff),
    ) as app:
        # NOTE(review): `total_config` (and `cookiefile` below) are not
        # defined in this function; they are presumably module-level
        # globals defined earlier in the file — confirm.
        # Instantiate shared dropdown configs up front so the tab
        # render functions can later `.render()` and wire them in
        # place.
        for component_config in [
            total_config.song.one_click.voice_model,
            total_config.song.one_click.cached_song,
            total_config.song.one_click.custom_embedder_model,
            total_config.song.multi_step.voice_model,
            total_config.song.multi_step.cached_song,
            total_config.song.multi_step.custom_embedder_model,
            total_config.song.multi_step.song_dirs.separate_audio,
            total_config.song.multi_step.song_dirs.convert_vocals,
            total_config.song.multi_step.song_dirs.postprocess_vocals,
            total_config.song.multi_step.song_dirs.pitch_shift_background,
            total_config.song.multi_step.song_dirs.mix,
            total_config.speech.one_click.edge_tts_voice,
            total_config.speech.one_click.voice_model,
            total_config.speech.one_click.custom_embedder_model,
            total_config.speech.multi_step.edge_tts_voice,
            total_config.speech.multi_step.voice_model,
            total_config.speech.multi_step.custom_embedder_model,
            total_config.training.multi_step.dataset,
            total_config.training.multi_step.preprocess_model,
            total_config.training.multi_step.extract_model,
            total_config.training.multi_step.train_model,
            total_config.training.multi_step.custom_embedder_model,
            total_config.training.multi_step.custom_pretrained_model,
            total_config.management.audio.intermediate,
            total_config.management.audio.speech,
            total_config.management.audio.output,
            total_config.management.audio.dataset,
            total_config.management.model.voices,
            total_config.management.model.embedders,
            total_config.management.model.pretraineds,
            total_config.management.model.traineds,
            total_config.management.settings.load_config_name,
            total_config.management.settings.delete_config_names,
        ]:
            component_config.instantiate()
        # main tab
        # with gr.Tab("Generate", elem_id="generate-tab"):
        with gr.Tab("Music", elem_id="generate-tab"):
            render_song_cover_one_click_tab(total_config, cookiefile)
            render_song_cover_multi_step_tab(total_config, cookiefile)
        with gr.Tab("Speech", elem_id="generate-tab"):
            render_speech_one_click_tab(total_config)
            render_speech_multi_step_tab(total_config)
        with gr.Tab("Configuration", elem_id="settings-tab"):
            with gr.Tab("Models"):
                render_models_tab(total_config)
            with gr.Tab("Settings"):
                render_settings_tab(total_config)
            render_audio_tab(total_config)
        # Populate the dynamic dropdowns each time the UI is loaded in
        # a browser; the output order must match the list returned by
        # _init_dropdowns.
        app.load(
            _init_dropdowns,
            outputs=[
                total_config.speech.one_click.edge_tts_voice.instance,
                total_config.speech.multi_step.edge_tts_voice.instance,
                total_config.song.one_click.voice_model.instance,
                total_config.song.multi_step.voice_model.instance,
                total_config.speech.one_click.voice_model.instance,
                total_config.speech.multi_step.voice_model.instance,
                total_config.management.model.voices.instance,
                total_config.song.one_click.custom_embedder_model.instance,
                total_config.song.multi_step.custom_embedder_model.instance,
                total_config.speech.one_click.custom_embedder_model.instance,
                total_config.speech.multi_step.custom_embedder_model.instance,
                total_config.training.multi_step.custom_embedder_model.instance,
                total_config.management.model.embedders.instance,
                total_config.training.multi_step.custom_pretrained_model.instance,
                total_config.management.model.pretraineds.instance,
                total_config.training.multi_step.extract_model.instance,
                total_config.training.multi_step.train_model.instance,
                total_config.training.multi_step.preprocess_model.instance,
                total_config.management.model.traineds.instance,
                total_config.song.one_click.cached_song.instance,
                total_config.song.multi_step.cached_song.instance,
                total_config.song.multi_step.song_dirs.separate_audio.instance,
                total_config.song.multi_step.song_dirs.convert_vocals.instance,
                total_config.song.multi_step.song_dirs.postprocess_vocals.instance,
                total_config.song.multi_step.song_dirs.pitch_shift_background.instance,
                total_config.song.multi_step.song_dirs.mix.instance,
                total_config.management.audio.intermediate.instance,
                total_config.training.multi_step.dataset.instance,
                total_config.management.audio.speech.instance,
                total_config.management.audio.output.instance,
                total_config.management.audio.dataset.instance,
                total_config.management.settings.load_config_name.instance,
                total_config.management.settings.delete_config_names.instance,
            ],
            show_progress="hidden",
        )
    return app
def _init_dropdowns() -> list[gr.Dropdown]:
    """
    Initialize the Ultimate RVC web application by updating the choices
    and default values of non-static dropdown components.

    Returns
    -------
    list[gr.Dropdown]
        A list of gr.Dropdown components with updated choices and
        default values, ordered to match the ``outputs`` of the
        ``app.load`` event registered in ``render_app``.
    """
    # Initialize model dropdowns
    # Two edge-TTS voice dropdowns, both with a fixed default voice.
    edge_tts_models = initialize_dropdowns(
        get_edge_tts_voice_names,
        2,
        "en-US-ChristopherNeural",
        range(2),
    )
    # Five voice-model dropdowns; the first four also receive a value.
    voice_models = initialize_dropdowns(
        get_voice_model_names,
        5,
        value_indices=range(4),
    )
    custom_embedder_models = initialize_dropdowns(
        get_custom_embedder_model_names,
        6,
        value_indices=range(5),
    )
    custom_pretrained_models = initialize_dropdowns(
        get_custom_pretrained_model_names,
        2,
        value_indices=range(1),
    )
    training_models = initialize_dropdowns(
        get_training_model_names,
        4,
        value_indices=range(2),
    )
    # Eight song-directory dropdowns shared by the song generation and
    # audio management tabs.
    song_dirs = initialize_dropdowns(
        get_named_song_dirs,
        8,
        value_indices=range(7),
    )
    # Single dropdowns that only need fresh choices, no default value.
    dataset = gr.Dropdown(get_audio_datasets())
    speech_delete = gr.Dropdown(get_saved_speech_audio())
    output_delete = gr.Dropdown(get_saved_output_audio())
    dataset_delete = gr.Dropdown(get_named_audio_datasets())
    configs = initialize_dropdowns(get_config_names, 2, value_indices=range(1))
    # Flatten in the exact order expected by the app.load outputs.
    return [
        *edge_tts_models,
        *voice_models,
        *custom_embedder_models,
        *custom_pretrained_models,
        *training_models,
        *song_dirs,
        dataset,
        speech_delete,
        output_delete,
        dataset_delete,
        *configs,
    ]
def render_song_cover_one_click_tab(
    total_config: TotalConfig, cookiefile: str | None = None
) -> None:
    """
    Render "Generate song covers - One-click generation" tab.

    Parameters
    ----------
    total_config : TotalConfig
        Model containing all component configuration settings for the
        Ultimate RVC web UI.
    cookiefile : str, optional
        The path to a file containing cookies to use when downloading
        audio from Youtube.
    """
    with gr.Tab("One-click"):
        tab_config = total_config.song.one_click
        _render_input(tab_config)
        with gr.Accordion("Options", open=False):
            _render_main_options(tab_config)
            _render_conversion_options(tab_config)
            _render_mixing_options(tab_config)
            _render_output_options(tab_config)
        _render_intermediate_audio(tab_config)
        with gr.Row(equal_height=True):
            reset_btn = gr.Button(value="Reset options", scale=2)
            generate_btn = gr.Button("Generate", scale=2, variant="primary")
            song_cover = gr.Audio(
                label="Song cover",
                scale=3,
                waveform_options=gr.WaveformOptions(show_recording_waveform=False),
            )
        # Song-directory dropdowns on the multi-step tab that must be
        # refreshed once a new song cover has been generated.
        song_dirs = total_config.song.multi_step.song_dirs.all
        # Run the full one-click pipeline, then refresh the dropdowns
        # that depend on its outputs.
        generate_btn.click(
            partial(
                exception_harness(
                    run_pipeline,
                    info_msg="Song cover generated successfully!",
                ),
                cookiefile=cookiefile,
                progress_bar=PROGRESS_BAR,
            ),
            inputs=[
                tab_config.source.instance,
                tab_config.voice_model.instance,
                tab_config.n_octaves.instance,
                tab_config.n_semitones.instance,
                tab_config.f0_methods.instance,
                tab_config.index_rate.instance,
                tab_config.rms_mix_rate.instance,
                tab_config.protect_rate.instance,
                tab_config.hop_length.instance,
                tab_config.split_voice.instance,
                tab_config.autotune_voice.instance,
                tab_config.autotune_strength.instance,
                tab_config.clean_voice.instance,
                tab_config.clean_strength.instance,
                tab_config.embedder_model.instance,
                tab_config.custom_embedder_model.instance,
                tab_config.sid.instance,
                tab_config.room_size.instance,
                tab_config.wet_level.instance,
                tab_config.dry_level.instance,
                tab_config.damping.instance,
                tab_config.main_gain.instance,
                tab_config.inst_gain.instance,
                tab_config.backup_gain.instance,
                tab_config.output_sr.instance,
                tab_config.output_format.instance,
                tab_config.output_name.instance,
            ],
            outputs=[song_cover, *tab_config.intermediate_audio.all],
            concurrency_limit=4,
            concurrency_id=ConcurrencyId.GPU,
        ).success(
            # Refresh cached-song and song-directory dropdowns from the
            # saved song directories (see update_dropdowns for the
            # index semantics).
            partial(update_dropdowns, get_named_song_dirs, 3 + len(song_dirs), [], [2]),
            outputs=[
                total_config.song.one_click.cached_song.instance,
                total_config.song.multi_step.cached_song.instance,
                total_config.management.audio.intermediate.instance,
                *song_dirs,
            ],
            show_progress="hidden",
        ).then(
            # Refresh the saved-output-audio dropdown as well.
            partial(update_dropdowns, get_saved_output_audio, 1, [], [0]),
            outputs=total_config.management.audio.output.instance,
            show_progress="hidden",
        )
        # Restore the default value of each option component.
        reset_btn.click(
            lambda: [
                tab_config.n_octaves.value,
                tab_config.n_semitones.value,
                tab_config.f0_methods.value,
                tab_config.index_rate.value,
                tab_config.rms_mix_rate.value,
                tab_config.protect_rate.value,
                tab_config.hop_length.value,
                tab_config.split_voice.value,
                tab_config.autotune_voice.value,
                tab_config.autotune_strength.value,
                tab_config.clean_voice.value,
                tab_config.clean_strength.value,
                tab_config.embedder_model.value,
                tab_config.sid.value,
                tab_config.room_size.value,
                tab_config.wet_level.value,
                tab_config.dry_level.value,
                tab_config.damping.value,
                tab_config.main_gain.value,
                tab_config.inst_gain.value,
                tab_config.backup_gain.value,
                tab_config.output_sr.value,
                tab_config.output_format.value,
                tab_config.show_intermediate_audio.value,
            ],
            outputs=[
                tab_config.n_octaves.instance,
                tab_config.n_semitones.instance,
                tab_config.f0_methods.instance,
                tab_config.index_rate.instance,
                tab_config.rms_mix_rate.instance,
                tab_config.protect_rate.instance,
                tab_config.hop_length.instance,
                tab_config.split_voice.instance,
                tab_config.autotune_voice.instance,
                tab_config.autotune_strength.instance,
                tab_config.clean_voice.instance,
                tab_config.clean_strength.instance,
                tab_config.embedder_model.instance,
                tab_config.sid.instance,
                tab_config.room_size.instance,
                tab_config.wet_level.instance,
                tab_config.dry_level.instance,
                tab_config.damping.instance,
                tab_config.main_gain.instance,
                tab_config.inst_gain.instance,
                tab_config.backup_gain.instance,
                tab_config.output_sr.instance,
                tab_config.output_format.instance,
                tab_config.show_intermediate_audio.instance,
            ],
            show_progress="hidden",
        )
def _render_input(tab_config: OneClickSongGenerationConfig) -> None:
    """
    Render the source-input components of the one-click song
    generation tab and wire the events that keep the `source` value in
    sync with the chosen input method.

    Parameters
    ----------
    tab_config : OneClickSongGenerationConfig
        Component configuration settings for the one-click song
        generation tab.
    """
    with gr.Row():
        with gr.Column():
            tab_config.source_type.instantiate()
        with gr.Column():
            tab_config.source.instantiate()
            # Hidden until the "local file" source type is selected.
            local_file = gr.Audio(
                label="Source",
                type="filepath",
                visible=False,
                waveform_options=gr.WaveformOptions(show_recording_waveform=False),
            )
            tab_config.cached_song.instance.render()
    # Show exactly one of the three source components depending on the
    # selected source type.
    tab_config.source_type.instance.input(
        partial(toggle_visible_component, 3),
        inputs=tab_config.source_type.instance,
        outputs=[
            tab_config.source.instance,
            local_file,
            tab_config.cached_song.instance,
        ],
        show_progress="hidden",
    )
    # Mirror the selected local file / cached song into the source
    # component.
    local_file.change(
        update_value,
        inputs=local_file,
        outputs=tab_config.source.instance,
        show_progress="hidden",
    )
    tab_config.cached_song.instance.input(
        update_value,
        inputs=tab_config.cached_song.instance,
        outputs=tab_config.source.instance,
        show_progress="hidden",
    )
    with gr.Row():
        tab_config.voice_model.instance.render()
def _render_main_options(tab_config: OneClickSongGenerationConfig) -> None:
    """
    Render the pitch-shift option components of the one-click song
    generation tab in a single row.

    Parameters
    ----------
    tab_config : OneClickSongGenerationConfig
        Component configuration settings for the one-click song
        generation tab.
    """
    pitch_options = (tab_config.n_octaves, tab_config.n_semitones)
    with gr.Row():
        for option in pitch_options:
            option.instantiate()
def _render_conversion_options(tab_config: OneClickSongGenerationConfig) -> None:
    """
    Render the vocal-conversion option components of the one-click
    song generation tab and wire the visibility toggles for the
    dependent option components.

    Parameters
    ----------
    tab_config : OneClickSongGenerationConfig
        Component configuration settings for the one-click song
        generation tab.
    """
    with gr.Accordion("Vocal conversion", open=True):
        # Empty markdown acts as a spacer under the accordion title.
        gr.Markdown("")
        with gr.Accordion("Voice synthesis", open=True):
            with gr.Row():
                tab_config.f0_methods.instantiate()
                tab_config.index_rate.instantiate()
            with gr.Row():
                tab_config.rms_mix_rate.instantiate()
                tab_config.protect_rate.instantiate()
                tab_config.hop_length.instantiate()
        with gr.Accordion("Vocal enrichment", open=True):
            with gr.Row():
                with gr.Column():
                    tab_config.split_voice.instantiate()
                with gr.Column():
                    tab_config.autotune_voice.instantiate()
                    tab_config.autotune_strength.instantiate()
                with gr.Column():
                    tab_config.clean_voice.instantiate()
                    tab_config.clean_strength.instantiate()
            # Strength sliders are only visible while their feature
            # checkbox is ticked.
            tab_config.autotune_voice.instance.change(
                partial(toggle_visibility, targets={True}),
                inputs=tab_config.autotune_voice.instance,
                outputs=tab_config.autotune_strength.instance,
                show_progress="hidden",
            )
            tab_config.clean_voice.instance.change(
                partial(toggle_visibility, targets={True}),
                inputs=tab_config.clean_voice.instance,
                outputs=tab_config.clean_strength.instance,
                show_progress="hidden",
            )
        with gr.Accordion("Speaker embedding", open=True):
            with gr.Row():
                with gr.Column():
                    tab_config.embedder_model.instantiate()
                    tab_config.custom_embedder_model.instance.render()
                tab_config.sid.instantiate()
            # The custom embedder dropdown only applies when the
            # "custom" embedder model is selected.
            tab_config.embedder_model.instance.change(
                partial(toggle_visibility, targets={EmbedderModel.CUSTOM}),
                inputs=tab_config.embedder_model.instance,
                outputs=tab_config.custom_embedder_model.instance,
                show_progress="hidden",
            )
def _render_mixing_options(tab_config: OneClickSongGenerationConfig) -> None:
    """
    Render the audio-mixing option components of the one-click song
    generation tab: reverb controls for the converted vocals and gain
    controls for the individual tracks.

    Parameters
    ----------
    tab_config : OneClickSongGenerationConfig
        Component configuration settings for the one-click song
        generation tab.
    """
    with gr.Accordion("Audio mixing", open=True):
        # Empty markdown acts as a spacer under the accordion title.
        gr.Markdown("")
        with gr.Accordion("Reverb control on converted vocals", open=True):
            with gr.Row():
                tab_config.room_size.instantiate()
            with gr.Row():
                for reverb_option in (
                    tab_config.wet_level,
                    tab_config.dry_level,
                    tab_config.damping,
                ):
                    reverb_option.instantiate()
        with gr.Accordion("Volume controls (dB)", open=True), gr.Row():
            for gain_option in (
                tab_config.main_gain,
                tab_config.inst_gain,
                tab_config.backup_gain,
            ):
                gain_option.instantiate()
def _render_output_options(tab_config: OneClickSongGenerationConfig) -> None:
    """
    Render the audio-output option components of the one-click song
    generation tab.

    Parameters
    ----------
    tab_config : OneClickSongGenerationConfig
        Component configuration settings for the one-click song
        generation tab.
    """
    with gr.Accordion("Audio output", open=True):
        with gr.Row():
            # Output name is derived from the selected cached song and
            # voice model; the gr.State(None) fills the slot for a
            # (absent) local audio track input.
            tab_config.output_name.instantiate(
                value=partial(
                    update_output_name,
                    get_song_cover_name,
                    True,  # noqa: FBT003
                ),
                inputs=[
                    gr.State(None),
                    tab_config.cached_song.instance,
                    tab_config.voice_model.instance,
                ],
            )
            tab_config.output_sr.instantiate()
            tab_config.output_format.instantiate()
        with gr.Row():
            tab_config.show_intermediate_audio.instantiate()
def _render_intermediate_audio(tab_config: OneClickSongGenerationConfig) -> None:
    """
    Render the hidden accordion holding the intermediate audio tracks
    produced by each step of the one-click pipeline, and wire the
    checkbox that toggles its visibility.

    Parameters
    ----------
    tab_config : OneClickSongGenerationConfig
        Component configuration settings for the one-click song
        generation tab.
    """
    # Hidden by default; shown via the show_intermediate_audio toggle.
    with gr.Accordion(
        "Intermediate audio tracks",
        open=False,
        visible=False,
    ) as intermediate_audio_accordion:
        with gr.Accordion(
            "Step 0: song retrieval",
            open=False,
        ) as song_retrieval_accordion:
            tab_config.intermediate_audio.song.instantiate()
        with (
            gr.Accordion(
                "Step 1a: vocals/instrumentals separation",
                open=False,
            ) as vocals_separation_accordion,
            gr.Row(),
        ):
            tab_config.intermediate_audio.vocals.instantiate()
            tab_config.intermediate_audio.instrumentals.instantiate()
        with (
            gr.Accordion(
                "Step 1b: main vocals/ backup vocals separation",
                open=False,
            ) as main_vocals_separation_accordion,
            gr.Row(),
        ):
            tab_config.intermediate_audio.main_vocals.instantiate()
            tab_config.intermediate_audio.backup_vocals.instantiate()
        with (
            gr.Accordion(
                "Step 1c: main vocals cleanup",
                open=False,
            ) as vocal_cleanup_accordion,
            gr.Row(),
        ):
            tab_config.intermediate_audio.main_vocals_dereverbed.instantiate()
            tab_config.intermediate_audio.main_vocals_reverb.instantiate()
        with gr.Accordion(
            "Step 2: conversion of main vocals",
            open=False,
        ) as vocal_conversion_accordion:
            tab_config.intermediate_audio.converted_vocals.instantiate()
        with gr.Accordion(
            "Step 3: post-processing of converted vocals",
            open=False,
        ) as vocals_postprocessing_accordion:
            tab_config.intermediate_audio.postprocessed_vocals.instantiate()
        with (
            gr.Accordion(
                "Step 4: pitch shift of background tracks",
                open=False,
            ) as pitch_shift_accordion,
            gr.Row(),
        ):
            tab_config.intermediate_audio.instrumentals_shifted.instantiate()
            tab_config.intermediate_audio.backup_vocals_shifted.instantiate()
    # Toggle the outer accordion and its 7 step sub-accordions
    # together.
    tab_config.show_intermediate_audio.instance.change(
        partial(toggle_intermediate_audio, num_components=7),
        inputs=tab_config.show_intermediate_audio.instance,
        outputs=[
            intermediate_audio_accordion,
            song_retrieval_accordion,
            vocals_separation_accordion,
            main_vocals_separation_accordion,
            vocal_cleanup_accordion,
            vocal_conversion_accordion,
            vocals_postprocessing_accordion,
            pitch_shift_accordion,
        ],
        show_progress="hidden",
    )
# Build the gradio application at import time so the module-level `app`
# object is available to `start_app` below.
app = render_app()
# Typer CLI wrapper exposing the `start_app` command.
app_wrapper = typer.Typer()
@app_wrapper.command()
def start_app(
    share: Annotated[
        bool,
        typer.Option("--share", "-s", help="Enable sharing"),
    ] = False,
    listen: Annotated[
        bool,
        typer.Option(
            "--listen",
            "-l",
            help="Make the web application reachable from your local network.",
        ),
    ] = False,
    listen_host: Annotated[
        str | None,
        typer.Option(
            "--listen-host",
            "-h",
            help="The hostname that the server will use.",
        ),
    ] = "0.0.0.0",
    listen_port: Annotated[
        int | None,
        typer.Option(
            "--listen-port",
            "-p",
            help="The listening port that the server will use.",
        ),
    ] = None,
    ssr_mode: Annotated[
        bool,
        typer.Option(
            "--ssr-mode",
            help="Enable server-side rendering mode.",
        ),
    ] = False,
) -> None:
    """Run the Ultimate RVC web application."""
    # Keep temporary gradio files inside the app-managed temp directory
    # and allow the model/audio directories to be served statically.
    os.environ["GRADIO_TEMP_DIR"] = str(TEMP_DIR)
    gr.set_static_paths([MODELS_DIR, AUDIO_DIR])
    # NOTE(review): the --listen flag is currently not consulted;
    # listen_host already defaults to "0.0.0.0", so the server is
    # network-reachable regardless — confirm this is intended for this
    # deployment before wiring the flag up.
    app.launch(
        # Previously the --share option was accepted but silently
        # ignored; forward it so the flag actually takes effect
        # (default False preserves the old behavior).
        share=share,
        server_name=listen_host,
        server_port=listen_port,
        ssr_mode=ssr_mode,
    )
# Load the default component configuration at import time.
# NOTE(review): this runs after `render_app()` above has already
# instantiated the components — confirm the defaults are still applied
# as intended.
load_config("default", TotalConfig)
if __name__ == "__main__":
    # Run the typer CLI (exposes the `start_app` command).
    app_wrapper()