|
|
from __future__ import annotations |
|
|
|
|
|
import os |
|
|
import sys |
|
|
from enum import IntEnum, StrEnum, auto |
|
|
from functools import cached_property |
|
|
from pathlib import Path |
|
|
from typing import TYPE_CHECKING, Annotated, Any, TypedDict |
|
|
|
|
|
import gradio as gr |
|
|
import typer |
|
|
from huggingface_hub import snapshot_download |
|
|
from pydantic import BaseModel |
|
|
from ultimate_rvc.common import AUDIO_DIR, MODELS_DIR, TEMP_DIR |
|
|
from ultimate_rvc.core.generate.song_cover import get_named_song_dirs |
|
|
from ultimate_rvc.core.generate.speech import get_edge_tts_voice_names |
|
|
from ultimate_rvc.core.manage.audio import ( |
|
|
get_audio_datasets, |
|
|
get_named_audio_datasets, |
|
|
get_saved_output_audio, |
|
|
get_saved_speech_audio, |
|
|
) |
|
|
from ultimate_rvc.core.manage.config import get_config_names, load_config |
|
|
from ultimate_rvc.core.manage.models import ( |
|
|
get_custom_embedder_model_names, |
|
|
get_custom_pretrained_model_names, |
|
|
get_training_model_names, |
|
|
get_voice_model_names, |
|
|
) |
|
|
from ultimate_rvc.web.common import ( |
|
|
initialize_dropdowns, |
|
|
exception_harness, |
|
|
render_transfer_component, |
|
|
setup_transfer_event, |
|
|
toggle_visibility, |
|
|
toggle_visible_component, |
|
|
update_dropdowns, |
|
|
update_output_name, |
|
|
update_value, |
|
|
) |
|
|
|
|
|
from ultimate_rvc.web.config.component import ( |
|
|
AnyComponentConfig, |
|
|
AudioConfig, |
|
|
CheckboxConfig, |
|
|
ComponentConfig, |
|
|
DropdownConfig, |
|
|
NumberConfig, |
|
|
RadioConfig, |
|
|
SliderConfig, |
|
|
TextboxConfig, |
|
|
) |
|
|
from ultimate_rvc.web.config.tab import ( |
|
|
SongGenerationConfig, |
|
|
SpeechGenerationConfig, |
|
|
TrainingConfig, |
|
|
) |
|
|
from ultimate_rvc.web.tabs.generate.speech.multi_step_generation import ( |
|
|
render as render_speech_multi_step_tab, |
|
|
) |
|
|
from ultimate_rvc.web.tabs.generate.speech.one_click_generation import ( |
|
|
render as render_speech_one_click_tab, |
|
|
) |
|
|
from ultimate_rvc.web.tabs.manage.audio import render as render_audio_tab |
|
|
from ultimate_rvc.web.tabs.manage.models import render as render_models_tab |
|
|
from ultimate_rvc.web.tabs.manage.settings import render as render_settings_tab |
|
|
|
|
|
if TYPE_CHECKING: |
|
|
import gradio as gr |
|
|
from typing import TYPE_CHECKING |
|
|
|
|
|
from functools import partial |
|
|
|
|
|
import gradio as gr |
|
|
from ultimate_rvc.core.common import ( |
|
|
INTERMEDIATE_AUDIO_BASE_DIR, |
|
|
OUTPUT_AUDIO_DIR, |
|
|
copy_file_safe, |
|
|
display_progress, |
|
|
get_file_hash, |
|
|
json_dump, |
|
|
json_load, |
|
|
validate_model, |
|
|
validate_url, |
|
|
) |
|
|
from ultimate_rvc.core.exceptions import ( |
|
|
Entity, |
|
|
InvalidLocationError, |
|
|
Location, |
|
|
NotFoundError, |
|
|
NotProvidedError, |
|
|
UIMessage, |
|
|
YoutubeUrlError, |
|
|
) |
|
|
from ultimate_rvc.core.generate.common import ( |
|
|
convert, |
|
|
get_unique_base_path, |
|
|
mix_audio, |
|
|
validate_audio_dir_exists, |
|
|
validate_audio_file_exists, |
|
|
wavify, |
|
|
) |
|
|
from ultimate_rvc.core.generate.song_cover import ( |
|
|
get_named_song_dirs, |
|
|
get_song_cover_name, |
|
|
mix_song, |
|
|
pitch_shift, |
|
|
postprocess, |
|
|
retrieve_song, |
|
|
separate_audio, |
|
|
get_named_song_dirs, |
|
|
get_song_cover_name, |
|
|
run_pipeline, |
|
|
) |
|
|
from ultimate_rvc.core.generate.typing_extra import ( |
|
|
EffectedVocalsMetaData, |
|
|
FileMetaData, |
|
|
MixedAudioType, |
|
|
PitchShiftMetaData, |
|
|
RVCAudioMetaData, |
|
|
SeparatedAudioMetaData, |
|
|
) |
|
|
from ultimate_rvc.core.manage.audio import get_saved_output_audio |
|
|
from ultimate_rvc.typing_extra import EmbedderModel |
|
|
from ultimate_rvc.web.common import ( |
|
|
PROGRESS_BAR, |
|
|
exception_harness, |
|
|
toggle_intermediate_audio, |
|
|
toggle_visibility, |
|
|
toggle_visible_component, |
|
|
update_dropdowns, |
|
|
update_output_name, |
|
|
update_value, |
|
|
) |
|
|
from ultimate_rvc.web.typing_extra import ConcurrencyId |
|
|
|
|
|
type StrPath = str | PathLike[str] |
|
|
|
|
|
type Json = Mapping[str, Json] | Sequence[Json] | str | int | float | bool | None |
|
|
|
|
|
|
|
|
class SegmentSize(IntEnum):
    """Enumeration of segment sizes for audio separation."""

    # Powers of two; larger segments consume more resources but may
    # give better separation results (see the UI info text where this
    # enum is offered as radio choices).
    SEG_64 = 64
    SEG_128 = 128
    SEG_256 = 256
    SEG_512 = 512
    SEG_1024 = 1024
    SEG_2048 = 2048
    SEG_4096 = 4096
|
|
|
|
|
|
|
|
class F0Method(StrEnum):
    """Enumeration of pitch extraction methods."""

    RMVPE = "rmvpe"
    CREPE = "crepe"
    # Smaller/faster variant of the CREPE model.
    CREPE_TINY = "crepe-tiny"
    FCPE = "fcpe"
|
|
|
|
|
|
|
|
class RVCContentType(StrEnum):
    """Enumeration of valid content to convert with RVC."""

    VOCALS = "vocals"
    VOICE = "voice"
    SPEECH = "speech"
    AUDIO = "audio"
|
|
|
|
|
|
|
|
class SampleRate(IntEnum):
    """Enumeration of supported audio sample rates (in Hz)."""

    HZ_16000 = 16000
    HZ_44100 = 44100
    HZ_48000 = 48000
    HZ_96000 = 96000
    HZ_192000 = 192000
|
|
|
|
|
|
|
|
class AudioExt(StrEnum):
    """Enumeration of supported audio file formats (file extensions)."""

    MP3 = "mp3"
    WAV = "wav"
    FLAC = "flac"
    OGG = "ogg"
|
|
|
|
|
|
|
|
class DeviceType(StrEnum):
    """Enumeration of device types for training voice models."""

    # Let the application pick the device.
    AUTOMATIC = "Automatic"
    CPU = "CPU"
    GPU = "GPU"
|
|
|
|
|
|
|
|
class TrainingSampleRate(StrEnum):
    """Enumeration of sample rates for training voice models."""

    # Values are the sample rates in Hz, kept as strings.
    HZ_32K = "32000"
    HZ_40K = "40000"
    HZ_48K = "48000"
|
|
|
|
|
|
|
|
class PretrainedSampleRate(StrEnum):
    """Enumeration of valid sample rates for pretrained models."""

    # NOTE: uses short "32k"-style labels, unlike TrainingSampleRate
    # which uses full numeric strings ("32000").
    HZ_32K = "32k"
    HZ_40K = "40k"
    HZ_44K = "44k"
    HZ_48K = "48k"
|
|
|
|
|
|
|
|
class TrainingF0Method(StrEnum):
    """Enumeration of pitch extraction methods for training."""

    # Subset of F0Method (no FCPE).
    RMVPE = "rmvpe"
    CREPE = "crepe"
    CREPE_TINY = "crepe-tiny"
|
|
|
|
|
|
|
|
class AudioSplitMethod(StrEnum):
    """
    Enumeration of methods to use for splitting audio files during
    dataset preprocessing.
    """

    # Do not split the audio at all.
    SKIP = "Skip"
    SIMPLE = "Simple"
    AUTOMATIC = "Automatic"
|
|
|
|
|
|
|
|
class Vocoder(StrEnum):
    """Enumeration of vocoders for training voice models."""

    HIFI_GAN = "HiFi-GAN"
    MRF_HIFI_GAN = "MRF HiFi-GAN"
    REFINE_GAN = "RefineGAN"
|
|
|
|
|
|
|
|
class IndexAlgorithm(StrEnum):
    """Enumeration of indexing algorithms for training voice models."""

    # Let the application choose between Faiss and KMeans.
    AUTO = "Auto"
    FAISS = "Faiss"
    KMEANS = "KMeans"
|
|
|
|
|
|
|
|
class PretrainedType(StrEnum):
    """
    Enumeration of the possible types of pretrained models to finetune
    voice models on.
    """

    # Train from scratch (no pretrained model).
    NONE = "None"
    DEFAULT = "Default"
    CUSTOM = "Custom"
|
|
|
|
|
|
|
|
class ConcurrencyId(StrEnum):
    """Enumeration of possible concurrency identifiers."""

    # NOTE(review): this class shadows the `ConcurrencyId` imported
    # above from ultimate_rvc.web.typing_extra — confirm the two
    # definitions agree or remove one of them.
    # auto() on a StrEnum yields the lower-cased member name ("gpu").
    GPU = auto()
|
|
|
|
|
|
|
|
class SongSourceType(StrEnum):
    """The type of source providing the song to generate a cover of."""

    # Values are the human-readable labels shown in the UI.
    PATH = "Local or HTTP filepath"
    LOCAL_FILE = "Local file"
    CACHED_SONG = "Cached song"
|
|
|
|
|
|
|
|
class SpeechSourceType(StrEnum):
    """The type of source providing the text to generate speech from."""

    # Values are the human-readable labels shown in the UI.
    TEXT = "Text"
    LOCAL_FILE = "Local file"
|
|
|
|
|
|
|
|
class SongTransferOption(StrEnum):
    """
    Enumeration of possible song transfer options.

    Each value names the pipeline step (and input slot) that audio can
    be transferred to in the multi-step song generation tab.
    """

    STEP_1_AUDIO = "Step 1: stem splitting"
    STEP_2_VOCALS = "Step 2: vocal conversion"
    STEP_3_VOCALS = "Step 3: vocal effect"
    STEP_4_INSTRUMENTALS = "Step 4: instrumentals"
    STEP_4_BACKUP_VOCALS = "Step 4: backup vocals"
    STEP_5_MAIN_VOCALS = "Step 5: main vocals"
    STEP_5_INSTRUMENTALS = "Step 5: instrumentals"
    STEP_5_BACKUP_VOCALS = "Step 5: backup vocals"
|
|
|
|
|
|
|
|
class SpeechTransferOption(StrEnum):
    """
    Enumeration of possible speech transfer options.

    Each value names the pipeline step that audio can be transferred
    to in the multi-step speech generation tab.
    """

    STEP_2_SPEECH = "Step 2: speech conversion"
    STEP_3_SPEECH = "Step 3: speech effect"
|
|
|
|
|
|
|
|
class ComponentVisibilityKwArgs(TypedDict, total=False):
    """
    Keyword arguments for setting component visibility.

    Attributes
    ----------
    visible : bool
        Whether the component should be visible.
    value : Any
        The value of the component.

    """

    # total=False: every key is optional.
    visible: bool
    value: Any
|
|
|
|
|
|
|
|
class UpdateDropdownKwArgs(TypedDict, total=False):
    """
    Keyword arguments for updating a dropdown component.

    Attributes
    ----------
    choices : DropdownChoices
        The updated choices for the dropdown component.
    value : DropdownValue
        The updated value for the dropdown component.

    """

    # total=False: every key is optional.
    # NOTE(review): DropdownChoices/DropdownValue are not imported in
    # this file; annotations are lazy here (from __future__ import
    # annotations) so this only matters for runtime introspection —
    # confirm they should come from ultimate_rvc.web.typing_extra.
    choices: DropdownChoices
    value: DropdownValue
|
|
|
|
|
|
|
|
class TextBoxKwArgs(TypedDict, total=False):
    """
    Keyword arguments for updating a textbox component.

    Attributes
    ----------
    value : str | None
        The updated value for the textbox component.
    placeholder : str | None
        The updated placeholder for the textbox component.

    """

    # total=False: every key is optional.
    value: str | None
    placeholder: str | None
|
|
|
|
|
|
|
|
class UpdateAudioKwArgs(TypedDict, total=False):
    """
    Keyword arguments for updating an audio component.

    Attributes
    ----------
    value : str | None
        The updated value for the audio component.

    """

    # total=False: the key is optional.
    value: str | None
|
|
|
|
|
|
|
|
class DatasetType(StrEnum):
    """The type of dataset to train a voice model."""

    # Values are the human-readable labels shown in the UI.
    NEW_DATASET = "Create new dataset"
    EXISTING_DATASET = "Use existing dataset"
|
|
|
|
|
|
|
|
class EmbedderModel(StrEnum):
    """Enumeration of audio embedding models."""

    # NOTE(review): this class shadows the `EmbedderModel` imported
    # above from ultimate_rvc.typing_extra — confirm the two
    # definitions agree or remove one of them.
    CONTENTVEC = "contentvec"
    CRUSTY = "Crusty"
    CUSTOM = "custom"
|
|
|
|
|
|
|
|
class SeparationModel(StrEnum):
    """
    Enumeration of audio separation models.

    Values are the model file names (extension indicates the model
    format: .onnx, .ckpt or .pth). Member order determines the order
    of choices presented in the UI.
    """

    UVR_MDX_NET_VOC_FT = "UVR-MDX-NET-Voc_FT.onnx"
    UVR_MDX_NET_KARA_2 = "UVR_MDXNET_KARA_2.onnx"
    REVERB_HQ_BY_FOXJOY = "Reverb_HQ_By_FoxJoy.onnx"
    UVR_MDX_NET_Inst_HQ_3 = "UVR-MDX-NET-Inst_HQ_3.onnx"
    Kim_Vocal_1 = "Kim_Vocal_1.onnx"
    Kim_Vocal_2 = "Kim_Vocal_2.onnx"
    Kim_Inst = "Kim_Inst.onnx"
    UVR_MDX_NET_Inst_HQ_4 = "UVR-MDX-NET-Inst_HQ_4.onnx"
    kuielab_a_vocals = "kuielab_a_vocals.onnx"
    kuielab_b_vocals = "kuielab_b_vocals.onnx"
    kuielab_a_drums = "kuielab_a_drums.onnx"
    kuielab_b_drums = "kuielab_b_drums.onnx"
    kuielab_a_bass = "kuielab_a_bass.onnx"
    kuielab_b_bass = "kuielab_b_bass.onnx"
    kuielab_a_other = "kuielab_a_other.onnx"
    kuielab_b_other = "kuielab_b_other.onnx"
    MDX23C_8KFFT_InstVoc_HQ_2 = "MDX23C-8KFFT-InstVoc_HQ_2.ckpt"
    UVR_DeNoise = "UVR-DeNoise.pth"
    UVR_DeEcho_DeReverb = "UVR-DeEcho-DeReverb.pth"
|
|
|
|
|
|
|
|
# --- Module-level setup (runs at import time) ---

# Make the current working directory importable.
now_dir = os.getcwd()

sys.path.append(now_dir)

# Name of the directory (relative to the CWD) that model files are
# dumped into.
models_dir = "models"

dump_path = os.path.join(now_dir, models_dir)

# Hugging Face repository holding the voice model files.
repo_id = "lainlives/voice"

# Optional auth token; None is acceptable for public repositories.
hf_token = os.environ.get("HF_TOKEN")
# NOTE(review): this downloads a full repository snapshot at import
# time and requires network access — consider deferring it; confirm
# callers rely on the models being present immediately after import.
snapshot_download(repo_id=repo_id, local_dir=dump_path, token=hf_token)

# Name of the default UI configuration to load.
config_name = "default"
# Optional cookie file for YouTube retrieval — presumably forwarded to
# the downloader backend; verify against the song retrieval code.
cookiefile = os.environ.get("YT_COOKIEFILE")
|
|
|
|
|
|
|
|
""" |
|
|
Module defining models for representing configuration settings for |
|
|
UI tabs. |
|
|
""" |
|
|
|
|
|
|
|
|
class SongIntermediateAudioConfig(BaseModel):
    """
    Configuration settings for intermediate audio components in the
    one-click song generation tab.

    Attributes
    ----------
    song : AudioConfig
        Configuration settings for the input song audio component.
    vocals : AudioConfig
        Configuration settings for the vocals audio component.
    instrumentals : AudioConfig
        Configuration settings for the instrumentals audio component.
    main_vocals : AudioConfig
        Configuration settings for the main vocals audio component.
    backup_vocals : AudioConfig
        Configuration settings for the backup vocals audio component.
    main_vocals_dereverbed : AudioConfig
        Configuration settings for the main vocals de-reverbed audio
        component.
    main_vocals_reverb : AudioConfig
        Configuration settings for the main vocals reverb audio
        component.
    converted_vocals : AudioConfig
        Configuration settings for the converted vocals audio
        component.
    postprocessed_vocals : AudioConfig
        Configuration settings for the postprocessed vocals audio
        component.
    instrumentals_shifted : AudioConfig
        Configuration settings for the shifted instrumentals audio
        component.
    backup_vocals_shifted : AudioConfig
        Configuration settings for the shifted backup vocals audio
        component.
    all : list[gr.Audio]
        List of instances of all intermediate audio components.

    """

    song: AudioConfig = AudioConfig.intermediate(label="Song")
    vocals: AudioConfig = AudioConfig.intermediate(label="Vocals")
    instrumentals: AudioConfig = AudioConfig.intermediate(
        label="Instrumentals",
    )
    main_vocals: AudioConfig = AudioConfig.intermediate(
        label="Main vocals",
    )
    backup_vocals: AudioConfig = AudioConfig.intermediate(
        label="Backup vocals",
    )
    main_vocals_dereverbed: AudioConfig = AudioConfig.intermediate(
        label="De-reverbed main vocals",
    )
    main_vocals_reverb: AudioConfig = AudioConfig.intermediate(
        label="Main vocals with reverb",
    )
    converted_vocals: AudioConfig = AudioConfig.intermediate(
        label="Converted vocals",
    )
    postprocessed_vocals: AudioConfig = AudioConfig.intermediate(
        label="Postprocessed vocals",
    )
    instrumentals_shifted: AudioConfig = AudioConfig.intermediate(
        label="Pitch-shifted instrumentals",
    )
    backup_vocals_shifted: AudioConfig = AudioConfig.intermediate(
        label="Pitch-shifted backup vocals",
    )

    @property
    def all(self) -> list[gr.Audio]:
        """
        Retrieve instances of all intermediate audio components
        in the one-click song generation tab.

        Returns
        -------
        list[gr.Audio]
            List of instances of all intermediate audio components in
            the one-click song generation tab.

        """
        # __annotations__ lists only the fields declared on this class
        # (not inherited ones), in declaration order.
        return [getattr(self, field).instance for field in self.__annotations__]
|
|
|
|
|
|
|
|
class OneClickSongGenerationConfig(SongGenerationConfig):
    """
    Configuration settings for the one-click song generation tab.

    Attributes
    ----------
    n_octaves : SliderConfig
        Configuration settings for an octave pitch shift slider
        component.
    n_semitones : SliderConfig
        Configuration settings for a semitone pitch shift slider
        component.
    show_intermediate_audio : CheckboxConfig
        Configuration settings for a show intermediate audio checkbox
        component.
    intermediate_audio : SongIntermediateAudioConfig
        Configuration settings for intermediate audio components.

    See Also
    --------
    SongGenerationConfig
        Parent model defining common component configuration settings
        for song generation tabs.

    """

    n_octaves: SliderConfig = SliderConfig.octave_shift(
        label="Vocal pitch shift",
        info=(
            "The number of octaves to shift the pitch of the converted vocals by. Use 1"
            " for male-to-female and -1 for vice-versa."
        ),
    )

    n_semitones: SliderConfig = SliderConfig.semitone_shift(
        label="Overall pitch shift",
        info=(
            "The number of semi-tones to shift the pitch of the converted vocals,"
            " instrumentals and backup vocals by."
        ),
    )
    show_intermediate_audio: CheckboxConfig = CheckboxConfig(
        label="Show intermediate audio",
        info="Show intermediate audio tracks produced during song cover generation.",
        value=False,
        # exclude_value: do not persist this checkbox's value when
        # saving the configuration.
        exclude_value=True,
    )
    intermediate_audio: SongIntermediateAudioConfig = SongIntermediateAudioConfig()
|
|
|
|
|
|
|
|
class SongInputAudioConfig(BaseModel):
    """
    Configuration settings for input audio components in the multi-step
    song generation tab.

    Attributes
    ----------
    audio : AudioConfig
        Configuration settings for the input audio component.
    vocals : AudioConfig
        Configuration settings for the vocals audio component.
    converted_vocals : AudioConfig
        Configuration settings for the converted vocals audio
        component.
    instrumentals : AudioConfig
        Configuration settings for the instrumentals audio
        component.
    backup_vocals : AudioConfig
        Configuration settings for the backup vocals audio
        component.
    main_vocals : AudioConfig
        Configuration settings for the main vocals audio
        component.
    shifted_instrumentals : AudioConfig
        Configuration settings for the shifted instrumentals audio
        component.
    shifted_backup_vocals : AudioConfig
        Configuration settings for the shifted backup vocals audio
        component.
    all : list[AudioConfig]
        List of configuration settings for all input audio
        components in the multi-step song generation tab.

    """

    audio: AudioConfig = AudioConfig.input(label="Audio")
    vocals: AudioConfig = AudioConfig.input(label="Vocals")
    # NOTE(review): `converted_vocals`, `shifted_instrumentals` and
    # `shifted_backup_vocals` reuse the labels of their unshifted
    # counterparts ("Vocals", "Instrumentals", "Backup vocals") —
    # confirm this duplication is intentional.
    converted_vocals: AudioConfig = AudioConfig.input(label="Vocals")
    instrumentals: AudioConfig = AudioConfig.input(label="Instrumentals")
    backup_vocals: AudioConfig = AudioConfig.input(label="Backup vocals")
    main_vocals: AudioConfig = AudioConfig.input(label="Main vocals")
    shifted_instrumentals: AudioConfig = AudioConfig.input(label="Instrumentals")
    shifted_backup_vocals: AudioConfig = AudioConfig.input(label="Backup vocals")

    @property
    def all(self) -> list[AudioConfig]:
        """
        Retrieve configuration settings for all input audio components
        in the multi-step song generation tab.

        Returns
        -------
        list[AudioConfig]
            List of configuration settings for all input audio
            components in the multi-step song generation tab.

        """
        # __annotations__ lists only the fields declared on this class
        # (not inherited ones), in declaration order.
        return [getattr(self, field) for field in self.__annotations__]
|
|
|
|
|
|
|
|
class SongDirsConfig(BaseModel):
    """
    Configuration settings for song directory components in the
    multi-step song generation tab.

    Attributes
    ----------
    separate_audio : DropdownConfig
        Configuration settings for the song directory component
        for separating audio.
    convert_vocals : DropdownConfig
        Configuration settings for the song directory component
        for converting vocals.
    postprocess_vocals : DropdownConfig
        Configuration settings for the song directory component
        for postprocessing vocals.
    pitch_shift_background : DropdownConfig
        Configuration settings for the song directory component
        for pitch-shifting background audio.
    mix : DropdownConfig
        Configuration settings for the song directory component
        for mixing audio.
    all : list[gr.Dropdown]
        List of instances of all song directory components in the
        multi-step song generation tab.

    """

    # One song-directory dropdown per pipeline step.
    separate_audio: DropdownConfig = DropdownConfig.song_dir()
    convert_vocals: DropdownConfig = DropdownConfig.song_dir()
    postprocess_vocals: DropdownConfig = DropdownConfig.song_dir()
    pitch_shift_background: DropdownConfig = DropdownConfig.song_dir()
    mix: DropdownConfig = DropdownConfig.song_dir()

    @property
    def all(self) -> list[gr.Dropdown]:
        """
        Retrieve instances of all song directory components in the
        multi-step song generation tab.

        Returns
        -------
        list[gr.Dropdown]
            List of instances of all song directory components in
            the multi-step song generation tab.

        """
        # __annotations__ lists only the fields declared on this class
        # (not inherited ones), in declaration order.
        return [getattr(self, field).instance for field in self.__annotations__]
|
|
|
|
|
|
|
|
class MultiStepSongGenerationConfig(SongGenerationConfig):
    """
    Configuration settings for multi-step song generation tab.

    Attributes
    ----------
    separation_model : DropdownConfig
        Configuration settings for a separation model dropdown
        component.
    segment_size : RadioConfig
        Configuration settings for a segment size radio component.
    n_octaves : SliderConfig
        Configuration settings for an octave pitch shift slider
        component.
    n_semitones : SliderConfig
        Configuration settings for a semitone pitch shift slider
        component.
    n_semitones_instrumentals : SliderConfig
        Configuration settings for an instrumentals pitch shift slider
        component.
    n_semitones_backup_vocals : SliderConfig
        Configuration settings for a backup vocals pitch shift slider
        component.
    input_audio : SongInputAudioConfig
        Configuration settings for input audio components.
    song_dirs : SongDirsConfig
        Configuration settings for song directory components.

    See Also
    --------
    SongGenerationConfig
        Parent model defining common component configuration settings
        for song generation tabs.

    """

    separation_model: DropdownConfig = DropdownConfig(
        label="Separation model",
        info="The model to use for audio separation.",
        value=SeparationModel.MDX23C_8KFFT_InstVoc_HQ_2,
        choices=list(SeparationModel),
    )
    segment_size: RadioConfig = RadioConfig(
        label="Segment size",
        info=(
            "The size of the segments into which the audio is split. Using a larger"
            " size consumes more resources, but may give better results."
        ),
        value=SegmentSize.SEG_2048,
        choices=list(SegmentSize),
    )
    n_octaves: SliderConfig = SliderConfig.octave_shift(
        label="Pitch shift (octaves)",
        info=(
            "The number of octaves to pitch-shift the converted voice by. Use 1 for"
            " male-to-female and -1 for vice-versa."
        ),
    )
    n_semitones: SliderConfig = SliderConfig.semitone_shift(
        label="Pitch shift (semi-tones)",
        info=(
            "The number of semi-tones to pitch-shift the converted vocals by. Altering"
            " this slightly reduces sound quality."
        ),
    )
    n_semitones_instrumentals: SliderConfig = SliderConfig.semitone_shift(
        label="Instrumental pitch shift",
        info="The number of semi-tones to pitch-shift the instrumentals by.",
    )
    n_semitones_backup_vocals: SliderConfig = SliderConfig.semitone_shift(
        label="Backup vocal pitch shift",
        info="The number of semi-tones to pitch-shift the backup vocals by.",
    )
    input_audio: SongInputAudioConfig = SongInputAudioConfig()
    song_dirs: SongDirsConfig = SongDirsConfig()
|
|
|
|
|
|
|
|
class SpeechIntermediateAudioConfig(BaseModel):
    """
    Configuration settings for intermediate audio components in the
    one-click speech generation tab.

    Attributes
    ----------
    speech : AudioConfig
        Configuration settings for the input speech audio component.
    converted_speech : AudioConfig
        Configuration settings for the converted speech audio
        component.
    all : list[gr.Audio]
        List of instances of all intermediate audio components in the
        speech generation tab.

    """

    speech: AudioConfig = AudioConfig.intermediate(label="Speech")
    converted_speech: AudioConfig = AudioConfig.intermediate(label="Converted speech")

    @property
    def all(self) -> list[gr.Audio]:
        """
        Retrieve instances of all intermediate audio components in the
        speech generation tab.

        Returns
        -------
        list[gr.Audio]
            List of instances of all intermediate audio components in
            the speech generation tab.

        """
        # __annotations__ lists only the fields declared on this class
        # (not inherited ones), in declaration order.
        return [getattr(self, field).instance for field in self.__annotations__]
|
|
|
|
|
|
|
|
class OneClickSpeechGenerationConfig(SpeechGenerationConfig):
    """
    Configuration settings for one-click speech generation tab.

    Attributes
    ----------
    intermediate_audio : SpeechIntermediateAudioConfig
        Configuration settings for intermediate audio components.
    show_intermediate_audio : CheckboxConfig
        Configuration settings for a show intermediate audio checkbox
        component.

    See Also
    --------
    SpeechGenerationConfig
        Parent model defining common component configuration settings
        for speech generation tabs.

    """

    intermediate_audio: SpeechIntermediateAudioConfig = SpeechIntermediateAudioConfig()

    show_intermediate_audio: CheckboxConfig = CheckboxConfig(
        label="Show intermediate audio",
        info="Show intermediate audio tracks produced during speech generation.",
        value=False,
        # exclude_value: do not persist this checkbox's value when
        # saving the configuration.
        exclude_value=True,
    )
|
|
|
|
|
|
|
|
class SpeechInputAudioConfig(BaseModel):
    """
    Configuration settings for input audio components in the multi-step
    speech generation tab.

    Attributes
    ----------
    speech : AudioConfig
        Configuration settings for the input speech audio component.
    converted_speech : AudioConfig
        Configuration settings for the converted speech audio
        component.
    all : list[AudioConfig]
        List of configuration settings for all input audio components
        in the multi-step speech generation tab.

    """

    # Pass `label` as a keyword for consistency with every other
    # config model in this module (e.g. SongInputAudioConfig); the
    # original positional form was equivalent but inconsistent.
    speech: AudioConfig = AudioConfig.input(label="Speech")
    converted_speech: AudioConfig = AudioConfig.input(label="Converted speech")

    @property
    def all(self) -> list[AudioConfig]:
        """
        Retrieve configuration settings for all input audio components
        in the multi-step speech generation tab.

        Returns
        -------
        list[AudioConfig]
            List of configuration settings for all input audio
            components in the multi-step speech generation tab.

        """
        # __annotations__ lists only the fields declared on this class
        # (not inherited ones), in declaration order.
        return [getattr(self, field) for field in self.__annotations__]
|
|
|
|
|
|
|
|
class MultiStepSpeechGenerationConfig(SpeechGenerationConfig):
    """
    Configuration settings for the multi-step speech generation tab.

    Attributes
    ----------
    input_audio : SpeechInputAudioConfig
        Configuration settings for input audio components.

    See Also
    --------
    SpeechGenerationConfig
        Parent model defining common component configuration settings
        for speech generation tabs.

    """

    input_audio: SpeechInputAudioConfig = SpeechInputAudioConfig()
|
|
|
|
|
|
|
|
class MultiStepTrainingConfig(TrainingConfig):
    """
    Configuration settings for multi-step training tab.

    Currently adds nothing beyond the common settings defined by
    TrainingConfig.
    """
|
|
|
|
|
|
|
|
class ModelManagementConfig(BaseModel):
    """
    Configuration settings for model management tab.

    Attributes
    ----------
    voices : DropdownConfig
        Configuration settings for delete voice models dropdown
        component.
    embedders : DropdownConfig
        Configuration settings for delete embedder models dropdown
        component.
    pretraineds : DropdownConfig
        Configuration settings for delete pretrained models dropdown
        component.
    traineds : DropdownConfig
        Configuration settings for delete training models dropdown
        component.
    dummy_checkbox : CheckboxConfig
        Configuration settings for a dummy checkbox component.

    """

    voices: DropdownConfig = DropdownConfig.multi_delete(
        label="Voice models",
        info="Select one or more voice models to delete.",
    )
    embedders: DropdownConfig = DropdownConfig.multi_delete(
        label="Custom embedder models",
        info="Select one or more embedder models to delete.",
    )
    pretraineds: DropdownConfig = DropdownConfig.multi_delete(
        label="Custom pretrained models",
        info="Select one or more pretrained models to delete.",
    )
    traineds: DropdownConfig = DropdownConfig.multi_delete(
        label="Training models",
        info="Select one or more training models to delete.",
    )

    # Hidden placeholder component whose value is never persisted.
    dummy_checkbox: CheckboxConfig = CheckboxConfig(
        value=False,
        visible=False,
        exclude_value=True,
    )
|
|
|
|
|
|
|
|
class AudioManagementConfig(BaseModel):
    """
    Configuration settings for audio management tab.

    Attributes
    ----------
    intermediate : DropdownConfig
        Configuration settings for delete intermediate audio files
        dropdown component.
    speech : DropdownConfig
        Configuration settings for delete speech audio files dropdown
        component.
    output : DropdownConfig
        Configuration settings for delete output audio files dropdown
        component.
    dataset : DropdownConfig
        Configuration settings for delete dataset audio files dropdown
        component.
    dummy_checkbox : CheckboxConfig
        Configuration settings for a dummy checkbox component.

    """

    intermediate: DropdownConfig = DropdownConfig.multi_delete(
        label="Song directories",
        info=(
            "Select one or more song directories containing intermediate audio files to"
            " delete."
        ),
    )
    speech: DropdownConfig = DropdownConfig.multi_delete(
        label="Speech audio files",
        info="Select one or more speech audio files to delete.",
    )
    output: DropdownConfig = DropdownConfig.multi_delete(
        label="Output audio files",
        info="Select one or more output audio files to delete.",
    )
    dataset: DropdownConfig = DropdownConfig.multi_delete(
        label="Dataset audio files",
        info="Select one or more datasets containing audio files to delete.",
    )

    # Hidden placeholder component whose value is never persisted.
    dummy_checkbox: CheckboxConfig = CheckboxConfig(
        value=False,
        visible=False,
        exclude_value=True,
    )
|
|
|
|
|
|
|
|
class SettingsManagementConfig(BaseModel):
    """
    Configuration settings for settings management tab.

    Attributes
    ----------
    load_config_name : DropdownConfig
        Configuration settings for the load configuration dropdown
        component.
    delete_config_names : DropdownConfig
        Configuration settings for the delete configurations dropdown
        component.
    dummy_checkbox : CheckboxConfig
        Configuration settings for a dummy checkbox component.

    """

    load_config_name: DropdownConfig = DropdownConfig(
        label="Configuration name",
        info="The name of a configuration to load UI settings from",
        value=None,
        # render=False: instantiated but rendered later, elsewhere in
        # the layout.
        render=False,
        exclude_value=True,
    )
    delete_config_names: DropdownConfig = DropdownConfig.multi_delete(
        label="Configuration names",
        info="Select the name of one or more configurations to delete",
    )
    # Hidden placeholder component whose value is never persisted.
    dummy_checkbox: CheckboxConfig = CheckboxConfig(
        value=False,
        visible=False,
        exclude_value=True,
    )
|
|
|
|
|
|
|
|
class TotalSongGenerationConfig(BaseModel):
    """
    All configuration settings for song generation tabs.

    Attributes
    ----------
    one_click : OneClickSongGenerationConfig
        Configuration settings for the one-click song generation tab.
    multi_step : MultiStepSongGenerationConfig
        Configuration settings for the multi-step song generation tab.

    """

    one_click: OneClickSongGenerationConfig = OneClickSongGenerationConfig()
    multi_step: MultiStepSongGenerationConfig = MultiStepSongGenerationConfig()
|
|
|
|
|
|
|
|
class TotalSpeechGenerationConfig(BaseModel):
    """
    All configuration settings for speech generation tabs.

    Attributes
    ----------
    one_click : OneClickSpeechGenerationConfig
        Configuration settings for the one-click speech generation tab.
    multi_step : MultiStepSpeechGenerationConfig
        Configuration settings for the multi-step speech generation tab.

    """

    # One sub-model per speech-generation tab rendered in the web UI.
    one_click: OneClickSpeechGenerationConfig = OneClickSpeechGenerationConfig()
    multi_step: MultiStepSpeechGenerationConfig = MultiStepSpeechGenerationConfig()
|
|
|
|
|
|
|
|
class TotalTrainingConfig(BaseModel):
    """
    All configuration settings for training tabs.

    Attributes
    ----------
    multi_step : MultiStepTrainingConfig
        Configuration settings for the multi-step training tab.

    """

    multi_step: MultiStepTrainingConfig = MultiStepTrainingConfig()
|
|
|
|
|
|
|
|
class TotalManagementConfig(BaseModel):
    """
    All configuration settings for management tabs.

    Attributes
    ----------
    model : ModelManagementConfig
        Configuration settings for the model management tab.
    audio : AudioManagementConfig
        Configuration settings for the audio management tab.
    settings : SettingsManagementConfig
        Configuration settings for the settings management tab.

    """

    # One sub-model per management tab rendered in the web UI.
    model: ModelManagementConfig = ModelManagementConfig()
    audio: AudioManagementConfig = AudioManagementConfig()
    settings: SettingsManagementConfig = SettingsManagementConfig()
|
|
|
|
|
|
|
|
class TotalConfig(BaseModel):
    """
    All configuration settings for the Ultimate RVC app.

    Attributes
    ----------
    song : TotalSongGenerationConfig
        Configuration settings for song generation tabs.
    speech : TotalSpeechGenerationConfig
        Configuration settings for speech generation tabs.
    training : TotalTrainingConfig
        Configuration settings for training tabs.
    management : TotalManagementConfig
        Configuration settings for management tabs.

    """

    song: TotalSongGenerationConfig = TotalSongGenerationConfig()
    speech: TotalSpeechGenerationConfig = TotalSpeechGenerationConfig()
    training: TotalTrainingConfig = TotalTrainingConfig()
    management: TotalManagementConfig = TotalManagementConfig()

    @cached_property
    def all(self) -> list[AnyComponentConfig]:
        """
        Recursively collect those component configuration models nested
        within the current model instance, which have values that are
        not excluded.

        Returns
        -------
        list[AnyComponentConfig]
            A list of component configuration models found within the
            current model instance, which have values that are not
            excluded.

        """

        def _collect(model: BaseModel) -> list[AnyComponentConfig]:
            # Depth-first walk; iterating a pydantic model yields
            # (field_name, field_value) pairs.
            component_configs: list[AnyComponentConfig] = []
            for _, value in model:
                if isinstance(value, ComponentConfig):
                    # NOTE: a ComponentConfig with exclude_value=True is
                    # deliberately neither collected nor recursed into,
                    # even though it is itself a BaseModel.
                    if not value.exclude_value:
                        component_configs.append(value)
                elif isinstance(value, BaseModel):
                    component_configs.extend(_collect(value))
            return component_configs

        return _collect(self)
|
|
|
|
|
|
|
|
class BaseTabConfig(BaseModel):
    """
    Base model defining common component configuration settings for
    UI tabs.

    Attributes
    ----------
    embedder_model : DropdownConfig
        Configuration settings for an embedder model dropdown component.
    custom_embedder_model : DropdownConfig
        Configuration settings for a custom embedder model dropdown
        component.

    """

    embedder_model: DropdownConfig = DropdownConfig(
        label="Embedder model",
        info="The model to use for generating speaker embeddings.",
        value=EmbedderModel.CONTENTVEC,
        choices=list(EmbedderModel),
        exclude_value=True,
    )
    # Hidden and rendered manually elsewhere (visible=False,
    # render=False) — presumably revealed when a custom embedder model
    # type is selected above; confirm in the tab event wiring.
    custom_embedder_model: DropdownConfig = DropdownConfig(
        label="Custom embedder model",
        info="Select a custom embedder model from the dropdown.",
        value=None,
        visible=False,
        render=False,
        exclude_value=True,
    )
|
|
|
|
|
|
|
|
# NOTE(review): this class shadows the TrainingConfig imported from
# ultimate_rvc.web.config.tab at the top of the file — confirm intended.
class TrainingConfig(BaseTabConfig):
    """
    Common component configuration settings for training tabs.

    Attributes
    ----------
    dataset_type : DropdownConfig
        Configuration settings for a dataset type dropdown component.
    dataset : DropdownConfig
        Configuration settings for a dataset dropdown component.
    dataset_name : TextboxConfig
        Configuration settings for a dataset name textbox component.
    preprocess_model : DropdownConfig
        Configuration settings for a model name dropdown component
        for audio preprocessing.
    sample_rate : DropdownConfig
        Configuration settings for a sample rate dropdown component.
    filter_audio : CheckboxConfig
        Configuration settings for a filter audio checkbox component.
    clean_audio : CheckboxConfig
        Configuration settings for a clean audio checkbox component.
    clean_strength : SliderConfig
        Configuration settings for a clean strength slider component.
    split_method : DropdownConfig
        Configuration settings for an audio splitting method dropdown
        component.
    chunk_len : SliderConfig
        Configuration settings for a chunk length slider component.
    overlap_len : SliderConfig
        Configuration settings for an overlap length slider component.
    preprocess_cores : SliderConfig
        Configuration settings for a CPU cores slider component for
        preprocessing.
    extract_model : DropdownConfig
        Configuration settings for a model name dropdown component for
        feature extraction.
    f0_method : DropdownConfig
        Configuration settings for an F0 method dropdown component.
    hop_length : SliderConfig
        Configuration settings for a hop length slider component.
    include_mutes : SliderConfig
        Configuration settings for an include mutes slider component.
    extraction_cores : SliderConfig
        Configuration settings for a CPU cores slider component for
        feature extraction.
    extraction_acceleration : DropdownConfig
        Configuration settings for a hardware acceleration component for
        feature extraction.
    extraction_gpus : DropdownConfig
        Configuration settings for a GPU dropdown component for feature
        extraction.
    train_model : DropdownConfig
        Configuration settings for a model name dropdown component for
        training.
    num_epochs : SliderConfig
        Configuration settings for a number of epochs slider component.
    batch_size : SliderConfig
        Configuration settings for a batch size slider component.
    detect_overtraining : CheckboxConfig
        Configuration settings for a detect overtraining checkbox
        component.
    overtraining_threshold : SliderConfig
        Configuration settings for an overtraining threshold slider
        component.
    vocoder : DropdownConfig
        Configuration settings for a vocoder dropdown component.
    index_algorithm : DropdownConfig
        Configuration settings for an index algorithm dropdown
        component.
    pretrained_type : DropdownConfig
        Configuration settings for a pretrained model type dropdown
        component.
    custom_pretrained_model : DropdownConfig
        Configuration settings for a custom pretrained model dropdown
        component.
    save_interval : SliderConfig
        Configuration settings for a save-interval slider component.
    save_all_checkpoints : CheckboxConfig
        Configuration settings for a save-all-checkpoints checkbox
        component.
    save_all_weights : CheckboxConfig
        Configuration settings for a save-all-weights checkbox
        component.
    clear_saved_data : CheckboxConfig
        Configuration settings for a clear-saved-data checkbox
        component.
    upload_model : CheckboxConfig
        Configuration settings for an upload voice model checkbox
        component.
    upload_name : TextboxConfig
        Configuration settings for an upload name textbox component.
    training_acceleration : DropdownConfig
        Configuration settings for a hardware acceleration component for
        training.
    training_gpus : DropdownConfig
        Configuration settings for a GPU dropdown component for
        training.
    preload_dataset : CheckboxConfig
        Configuration settings for a preload dataset checkbox component.
    reduce_memory_usage : CheckboxConfig
        Configuration settings for a reduce-memory-usage checkbox
        component.

    See Also
    --------
    BaseTabConfig
        Parent model defining common component configuration settings
        for UI tabs.

    """

    dataset_type: DropdownConfig = DropdownConfig(
        label="Dataset type",
        info="Select the type of dataset to preprocess.",
        value=DatasetType.NEW_DATASET,
        choices=list(DatasetType),
        exclude_value=True,
    )
    dataset: DropdownConfig = DropdownConfig(
        label="Dataset path",
        info=(
            "The path to an existing dataset. Either select a path to a previously"
            " created dataset or provide a path to an external dataset."
        ),
        value=None,
        allow_custom_value=True,
        visible=False,
        render=False,
        exclude_value=True,
    )
    dataset_name: TextboxConfig = TextboxConfig(
        label="Dataset name",
        info=(
            "The name of the new dataset. If the dataset already exists, the provided"
            " audio files will be added to it."
        ),
        value="My dataset",
        exclude_value=True,
    )
    preprocess_model: DropdownConfig = DropdownConfig(
        label="Model name",
        info=(
            "Name of the model to preprocess the given dataset for. Either select an"
            " existing model from the dropdown or provide the name of a new model."
        ),
        value="My model",
        allow_custom_value=True,
        render=False,
        exclude_value=True,
    )
    sample_rate: DropdownConfig = DropdownConfig(
        label="Sample rate",
        info="Target sample rate for the audio files in the provided dataset.",
        value=TrainingSampleRate.HZ_40K,
        choices=list(TrainingSampleRate),
    )
    filter_audio: CheckboxConfig = CheckboxConfig(
        label="Filter audio",
        info=(
            "Whether to remove low-frequency sounds from the audio files in the"
            " provided dataset by applying a high-pass butterworth filter.<br><br>"
        ),
        value=True,
    )
    clean_audio: CheckboxConfig = CheckboxConfig(
        label="Clean audio",
        info=(
            "Whether to clean the audio files in the provided dataset using noise"
            " reduction algorithms.<br><br><br>"
        ),
        value=False,
        exclude_value=True,
    )
    clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False)
    split_method: DropdownConfig = DropdownConfig(
        label="Audio splitting method",
        info=(
            "The method to use for splitting the audio files in the provided dataset."
            " Use the `Skip` method to skip splitting if the audio files are already"
            " split. Use the `Simple` method if excessive silence has already been"
            " removed from the audio files. Use the `Automatic` method for automatic"
            " silence detection and splitting around it."
        ),
        value=AudioSplitMethod.AUTOMATIC,
        choices=list(AudioSplitMethod),
        exclude_value=True,
    )
    chunk_len: SliderConfig = SliderConfig(
        label="Chunk length",
        info="Length of split audio chunks.",
        value=3.0,
        minimum=0.5,
        maximum=5.0,
        step=0.1,
        visible=False,
    )
    overlap_len: SliderConfig = SliderConfig(
        label="Overlap length",
        info="Length of overlap between split audio chunks.",
        value=0.3,
        minimum=0.0,
        maximum=0.4,
        step=0.1,
        visible=False,
    )
    preprocess_cores: SliderConfig = SliderConfig.cpu_cores()

    extract_model: DropdownConfig = DropdownConfig(
        label="Model name",
        info=(
            "Name of the model with an associated preprocessed dataset to extract"
            " training features from. When a new dataset is preprocessed, its"
            " associated model is selected by default."
        ),
        value=None,
        render=False,
        exclude_value=True,
    )
    f0_method: DropdownConfig = DropdownConfig(
        label="F0 method",
        info="The method to use for extracting pitch features.",
        value=TrainingF0Method.RMVPE,
        choices=list(TrainingF0Method),
        exclude_value=True,
    )

    hop_length: SliderConfig = SliderConfig.hop_length(
        label="Hop length",
        info="The hop length to use for extracting pitch features.<br><br>",
        visible=False,
    )
    include_mutes: SliderConfig = SliderConfig(
        label="Include mutes",
        info=(
            "The number of mute audio files to include in the generated training file"
            " list. Adding silent files enables the training model to handle pure"
            " silence in inferred audio files. If the preprocessed audio dataset"
            " already contains segments of pure silence, set this to 0."
        ),
        value=0,
        minimum=0,
        maximum=10,
        step=1,
    )
    extraction_cores: SliderConfig = SliderConfig.cpu_cores()
    extraction_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration()
    extraction_gpus: DropdownConfig = DropdownConfig.gpu()

    train_model: DropdownConfig = DropdownConfig(
        label="Model name",
        info=(
            "Name of the model to train. When training features are extracted for a new"
            " model, its name is selected by default."
        ),
        value=None,
        render=False,
        exclude_value=True,
    )
    num_epochs: SliderConfig = SliderConfig(
        label="Number of epochs",
        info=(
            "The number of epochs to train the voice model. A higher number can improve"
            " voice model performance but may lead to overtraining."
        ),
        value=500,
        minimum=1,
        maximum=5000,
        step=1,
    )
    batch_size: SliderConfig = SliderConfig(
        label="Batch size",
        info=(
            "The number of samples in each training batch. It is advisable to align"
            " this value with the available VRAM of your GPU."
        ),
        value=16,
        minimum=1,
        maximum=128,
        step=1,
    )
    detect_overtraining: CheckboxConfig = CheckboxConfig(
        label="Detect overtraining",
        info=(
            "Whether to detect overtraining to prevent the voice model from learning"
            " the training data too well and losing the ability to generalize to new"
            " data."
        ),
        value=True,
        exclude_value=True,
    )
    overtraining_threshold: SliderConfig = SliderConfig(
        label="Overtraining threshold",
        info=(
            "The maximum number of epochs to continue training without any observed"
            " improvement in voice model performance."
        ),
        value=500,
        minimum=1,
        maximum=1000,
        visible=False,
    )
    vocoder: DropdownConfig = DropdownConfig(
        label="Vocoder",
        info=(
            "The vocoder to use for audio synthesis during training. HiFi-GAN provides"
            " basic audio fidelity, while RefineGAN provides the highest audio"
            " fidelity."
        ),
        value=Vocoder.HIFI_GAN,
        choices=list(Vocoder),
    )
    index_algorithm: DropdownConfig = DropdownConfig(
        label="Index algorithm",
        info=(
            "The method to use for generating an index file for the trained voice"
            " model. `KMeans` is particularly useful for large datasets."
        ),
        value=IndexAlgorithm.AUTO,
        choices=list(IndexAlgorithm),
    )
    pretrained_type: DropdownConfig = DropdownConfig(
        label="Pretrained model type",
        info=(
            "The type of pretrained model to finetune the voice model on. `None` will"
            " train the voice model from scratch, while `Default` will use a pretrained"
            " model tailored to the specific voice model architecture. `Custom` will"
            # fixed: "a custom pretrained that you provide" was missing "model"
            " use a custom pretrained model that you provide."
        ),
        value=PretrainedType.DEFAULT,
        choices=list(PretrainedType),
        exclude_value=True,
    )
    custom_pretrained_model: DropdownConfig = DropdownConfig(
        label="Custom pretrained model",
        info="Select a custom pretrained model to finetune from the dropdown.",
        value=None,
        visible=False,
        render=False,
        exclude_value=True,
    )
    save_interval: SliderConfig = SliderConfig(
        label="Save interval",
        info=(
            # fixed: doubled word "to to"
            "The epoch interval at which to save voice model weights and"
            " checkpoints. The best model weights are always saved regardless of this"
            " setting."
        ),
        value=10,
        minimum=1,
        maximum=100,
        step=1,
    )
    save_all_checkpoints: CheckboxConfig = CheckboxConfig(
        label="Save all checkpoints",
        info=(
            "Whether to save a unique checkpoint at each save interval. If not enabled,"
            " only the latest checkpoint will be saved at each interval."
        ),
        value=True,
    )
    save_all_weights: CheckboxConfig = CheckboxConfig(
        label="Save all weights",
        info=(
            "Whether to save unique voice model weights at each save interval. If not"
            " enabled, only the best voice model weights will be saved."
        ),
        value=True,
    )
    clear_saved_data: CheckboxConfig = CheckboxConfig(
        label="Clear saved data",
        info=(
            "Whether to delete any existing training data associated with the voice"
            " model before training commences. Enable this setting only if you are"
            " training a new voice model from scratch or restarting training."
        ),
        value=False,
    )
    upload_model: CheckboxConfig = CheckboxConfig(
        label="Upload voice model",
        info=(
            "Whether to automatically upload the trained voice model so that it can be"
            " used for generation tasks within the Ultimate RVC app."
        ),
        value=False,
        exclude_value=True,
    )
    upload_name: TextboxConfig = TextboxConfig(
        label="Upload name",
        info="The name to give the uploaded voice model.",
        value=None,
        visible=False,
        exclude_value=True,
    )
    training_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration()
    training_gpus: DropdownConfig = DropdownConfig.gpu()
    preload_dataset: CheckboxConfig = CheckboxConfig(
        label="Preload dataset",
        info=(
            "Whether to preload all training data into GPU memory. This can improve"
            " training speed but requires a lot of VRAM.<br><br>"
        ),
        value=True,
    )
    reduce_memory_usage: CheckboxConfig = CheckboxConfig(
        label="Reduce memory usage",
        info=(
            "Whether to reduce VRAM usage at the cost of slower training speed by"
            " enabling activation checkpointing. This is useful for GPUs with limited"
            " memory (e.g., <6GB VRAM) or when training with a batch size larger than"
            " what your GPU can normally accommodate."
        ),
        value=False,
    )
|
|
|
|
|
|
|
|
class GenerationConfig(BaseTabConfig):
    """
    Common component configuration settings for generation tabs.

    Attributes
    ----------
    voice_model : DropdownConfig
        Configuration settings for a voice model dropdown component.
    f0_methods : DropdownConfig
        Configuration settings for a pitch extraction algorithms
        dropdown component.
    index_rate : SliderConfig
        Configuration settings for an index rate slider component.
    rms_mix_rate : SliderConfig
        Configuration settings for a RMS mix rate slider component.
    protect_rate : SliderConfig
        Configuration settings for a protect rate slider component.
    hop_length : SliderConfig
        Configuration settings for a hop length slider component.
    split_voice : CheckboxConfig
        Configuration settings for a split voice checkbox component.
    autotune_voice: CheckboxConfig
        Configuration settings for an autotune voice checkbox component.
    autotune_strength: SliderConfig
        Configuration settings for an autotune strength slider
        component.
    sid : NumberConfig
        Configuration settings for a speaker ID number component.
    output_sr : DropdownConfig
        Configuration settings for an output sample rate dropdown
        component.
    output_format : DropdownConfig
        Configuration settings for an output format dropdown
        component.
    output_name : TextboxConfig
        Configuration settings for an output name textbox component.

    See Also
    --------
    BaseTabConfig
        Parent model defining common component configuration settings
        for UI tabs.

    """

    voice_model: DropdownConfig = DropdownConfig(
        label="Voice model",
        info="Select a model to use for voice conversion.",
        value=None,
        render=False,
        exclude_value=True,
    )
    f0_methods: DropdownConfig = DropdownConfig(
        label="Pitch extraction algorithm(s)",
        info=(
            "If more than one method is selected, then the median of the pitch values"
            " extracted by each method is used. RMVPE is recommended for most cases and"
            " is the default when no method is selected."
        ),
        value=[F0Method.RMVPE],
        choices=list(F0Method),
        multiselect=True,
    )
    index_rate: SliderConfig = SliderConfig(
        label="Index rate",
        info=(
            "Increase to bias the conversion towards the accent of the voice model."
            " Decrease to potentially reduce artifacts coming from the voice"
            " model.<br><br><br>"
        ),
        value=0.3,
        minimum=0.0,
        maximum=1.0,
    )
    rms_mix_rate: SliderConfig = SliderConfig(
        label="RMS mix rate",
        info=(
            "How much to mimic the loudness (0) of the input voice or a fixed loudness"
            " (1). A value of 1 is recommended for most cases.<br><br>"
        ),
        value=1.0,
        minimum=0.0,
        maximum=1.0,
    )
    protect_rate: SliderConfig = SliderConfig(
        label="Protect rate",
        info=(
            "Controls the extent to which consonants and breathing sounds are protected"
            " from artifacts. A higher value offers more protection but may worsen the"
            " indexing effect.<br><br>"
        ),
        value=0.33,
        minimum=0.0,
        maximum=0.5,
    )

    hop_length: SliderConfig = SliderConfig.hop_length(
        label="Hop length",
        info=(
            "How often the CREPE-based pitch extraction method checks for pitch changes"
            " measured in milliseconds. Lower values lead to longer conversion times"
            " and a higher risk of voice cracks, but better pitch accuracy."
        ),
        visible=True,
    )

    split_voice: CheckboxConfig = CheckboxConfig(
        label="Split input voice",
        info=(
            "Whether to split the input voice track into smaller segments before"
            " converting it. This can improve output quality for longer voice tracks."
        ),
        value=False,
    )
    autotune_voice: CheckboxConfig = CheckboxConfig(
        label="Autotune converted voice",
        info="Whether to apply autotune to the converted voice.<br><br>",
        value=False,
        exclude_value=True,
    )
    autotune_strength: SliderConfig = SliderConfig(
        label="Autotune intensity",
        info=(
            "Higher values result in stronger snapping to the chromatic grid and"
            " artifacting."
        ),
        value=1.0,
        minimum=0.0,
        maximum=1.0,
        visible=False,
    )
    sid: NumberConfig = NumberConfig(
        label="Speaker ID",
        info="Speaker ID for multi-speaker-models.",
        value=0,
        precision=0,
    )
    output_sr: DropdownConfig = DropdownConfig(
        label="Output sample rate",
        info="The sample rate of the mixed output track.",
        value=SampleRate.HZ_44100,
        choices=list(SampleRate),
    )
    output_format: DropdownConfig = DropdownConfig(
        label="Output format",
        info="The audio format of the mixed output track.",
        value=AudioExt.MP3,
        choices=list(AudioExt),
    )
    output_name: TextboxConfig = TextboxConfig(
        label="Output name",
        info="If no name is provided, a suitable name will be generated automatically.",
        value=None,
        placeholder="Ultimate RVC output",
        exclude_value=True,
    )
|
|
|
|
|
|
|
|
# NOTE(review): this class shadows the SongGenerationConfig imported from
# ultimate_rvc.web.config.tab at the top of the file — confirm intended.
class SongGenerationConfig(GenerationConfig):
    """
    Common component configuration settings for song generation tabs.

    Attributes
    ----------
    source_type : DropdownConfig
        Configuration settings for a source type dropdown component.
    source : TextboxConfig
        Configuration settings for an input source textbox component.
    cached_song : DropdownConfig
        Configuration settings for a cached song dropdown component.
    clean_voice : CheckboxConfig
        Configuration settings for a clean voice checkbox component.
    clean_strength : SliderConfig
        Configuration settings for a clean strength slider component.
    room_size : SliderConfig
        Configuration settings for a room size slider component.
    wet_level : SliderConfig
        Configuration settings for a wetness level slider component.
    dry_level : SliderConfig
        Configuration settings for a dryness level slider component.
    damping : SliderConfig
        Configuration settings for a damping level slider component.
    main_gain : SliderConfig
        Configuration settings for a main gain slider component.
    inst_gain : SliderConfig
        Configuration settings for an instrumentals gain slider
        component.
    backup_gain : SliderConfig
        Configuration settings for a backup vocals gain slider
        component.

    See Also
    --------
    GenerationConfig
        Parent model defining common component configuration settings
        for song generation tabs.

    """

    source_type: DropdownConfig = DropdownConfig(
        label="Source type",
        info="The type of source to retrieve a song from.",
        value=SongSourceType.LOCAL_FILE,
        choices=list(SongSourceType),
        type="index",
        exclude_value=True,
    )
    source: TextboxConfig = TextboxConfig(
        label="Source",
        # fixed typo: "wont" -> "won't"
        info="Local (to the server) filepath or http link. Youtube probably won't work but most other sites still do.",
        value=None,
        exclude_value=True,
    )
    cached_song: DropdownConfig = DropdownConfig(
        label="Source",
        info="Select a song from the list of cached songs.",
        value=None,
        visible=False,
        render=False,
        exclude_value=True,
    )
    clean_voice: CheckboxConfig = CheckboxConfig(
        label="Clean converted voice",
        info=(
            "Whether to clean the converted voice using noise reduction"
            " algorithms.<br><br>"
        ),
        value=False,
        exclude_value=True,
    )
    clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False)
    room_size: SliderConfig = SliderConfig(
        label="Room size",
        info=(
            "Size of the room which reverb effect simulates. Increase for longer reverb"
            " time."
        ),
        value=0.15,
        minimum=0.0,
        maximum=1.0,
    )
    wet_level: SliderConfig = SliderConfig(
        label="Wetness level",
        info="Loudness of converted vocals with reverb effect applied.",
        value=0.2,
        minimum=0.0,
        maximum=1.0,
    )
    dry_level: SliderConfig = SliderConfig(
        label="Dryness level",
        info="Loudness of converted vocals without reverb effect applied.",
        value=0.8,
        minimum=0.0,
        maximum=1.0,
    )
    damping: SliderConfig = SliderConfig(
        label="Damping level",
        info="Absorption of high frequencies in reverb effect.",
        value=0.7,
        minimum=0.0,
        maximum=1.0,
    )
    main_gain: SliderConfig = SliderConfig.gain(
        label="Main gain",
        info="The gain to apply to the main vocals.",
    )
    inst_gain: SliderConfig = SliderConfig.gain(
        label="Instrumentals gain",
        info="The gain to apply to the instrumentals.",
    )
    backup_gain: SliderConfig = SliderConfig.gain(
        label="Backup gain",
        info="The gain to apply to the backup vocals.",
    )
|
|
|
|
|
|
|
|
# NOTE(review): this class shadows the SpeechGenerationConfig imported
# from ultimate_rvc.web.config.tab at the top of the file — confirm
# intended.
class SpeechGenerationConfig(GenerationConfig):
    """
    Common component configuration settings for speech generation tabs.

    Attributes
    ----------
    source_type : DropdownConfig
        Configuration settings for a source type dropdown component.
    source : TextboxConfig
        Configuration settings for an input source textbox component.
    edge_tts_voice : DropdownConfig
        Configuration settings for an Edge TTS voice dropdown
        component.
    n_octaves : SliderConfig
        Configuration settings for an octave pitch shift slider
        component.
    n_semitones : SliderConfig
        Configuration settings for a semitone pitch shift slider
        component.
    tts_pitch_shift : SliderConfig
        Configuration settings for a TTS pitch shift slider
        component.
    tts_speed_change : SliderConfig
        Configuration settings for a TTS speed change slider
        component.
    tts_volume_change : SliderConfig
        Configuration settings for a TTS volume change slider
        component.
    clean_voice : CheckboxConfig
        Configuration settings for a clean voice checkbox
        component.
    clean_strength : SliderConfig
        Configuration settings for a clean strength slider
        component.
    output_gain : SliderConfig
        Configuration settings for an output gain slider component.

    See Also
    --------
    GenerationConfig
        Parent model defining common component configuration settings
        for generation tabs.

    """

    source_type: DropdownConfig = DropdownConfig(
        label="Source type",
        info="The type of source to generate speech from.",
        value=SpeechSourceType.TEXT,
        choices=list(SpeechSourceType),
        type="index",
        exclude_value=True,
    )
    source: TextboxConfig = TextboxConfig(
        label="Source",
        info="Text to generate speech from",
        value=None,
        exclude_value=True,
    )
    edge_tts_voice: DropdownConfig = DropdownConfig(
        label="Edge TTS voice",
        info="Select a voice to use for text to speech conversion.",
        value=None,
        render=False,
        exclude_value=True,
    )
    n_octaves: SliderConfig = SliderConfig.octave_shift(
        label="Octave shift",
        info=(
            "The number of octaves to pitch-shift the converted speech by. Use 1 for"
            " male-to-female and -1 for vice-versa."
        ),
    )
    n_semitones: SliderConfig = SliderConfig.semitone_shift(
        label="Semitone shift",
        info="The number of semi-tones to pitch-shift the converted speech by.",
    )
    tts_pitch_shift: SliderConfig = SliderConfig(
        label="Edge TTS pitch shift",
        info=(
            "The number of hertz to shift the pitch of the speech generated by Edge"
            " TTS."
        ),
        value=0,
        minimum=-100,
        maximum=100,
        step=1,
    )
    tts_speed_change: SliderConfig = SliderConfig(
        label="TTS speed change",
        info="The percentual change to the speed of the speech generated by Edge TTS.",
        value=0,
        minimum=-50,
        maximum=100,
        step=1,
    )
    tts_volume_change: SliderConfig = SliderConfig(
        label="TTS volume change",
        info="The percentual change to the volume of the speech generated by Edge TTS.",
        value=0,
        minimum=-100,
        maximum=100,
        step=1,
    )
    clean_voice: CheckboxConfig = CheckboxConfig(
        label="Clean converted voice",
        info=(
            "Whether to clean the converted voice using noise reduction"
            " algorithms.<br><br>"
        ),
        value=True,
        exclude_value=True,
    )
    clean_strength: SliderConfig = SliderConfig.clean_strength(visible=True)
    output_gain: SliderConfig = SliderConfig.gain(
        label="Output gain",
        info="The gain to apply to the converted speech.<br><br>",
    )
|
|
|
|
|
|
|
|
# Start from saved settings when a configuration name was supplied,
# otherwise fall back to the defaults.
total_config = TotalConfig() if not config_name else load_config(config_name, TotalConfig)
|
|
|
|
|
|
|
|
def render_song_cover_multi_step_tab(
    total_config: TotalConfig, cookiefile: str | None = None
) -> None:
    """
    Render "Generate song cover - multi-step generation" tab.

    Parameters
    ----------
    total_config : TotalConfig
        Model containing all component configuration settings for the
        Ultimate RVC web UI.
    cookiefile : str, optional
        The path to a file containing cookies to use when downloading
        audio from Youtube.

    """
    tab_config = total_config.song.multi_step
    # Instantiate all input-audio components up front so that each step can
    # render its own inputs inside its accordion later on.
    for audio_config in tab_config.input_audio.all:
        audio_config.instantiate()
    with gr.Tab("Multi-step"):
        # Step 0 (song retrieval) and step 5 (mixing) need the full config
        # because they update dropdowns belonging to other tabs.
        _render_step_0(total_config, cookiefile=cookiefile)
        for render_step in (
            _render_step_1,
            _render_step_2,
            _render_step_3,
            _render_step_4,
        ):
            render_step(tab_config)
        _render_step_5(total_config, tab_config)
|
|
|
|
|
|
|
|
def _render_step_0(total_config: TotalConfig, cookiefile: str | None) -> None:
    # Render "Step 0: song retrieval": choose a source (URL/path, local file
    # upload, or cached song), retrieve the song, and optionally transfer it
    # to the step-1 input.
    tab_config = total_config.song.multi_step

    # Holds the directory of the most recently retrieved song between events.
    current_song_dir = gr.State(None)
    with gr.Accordion("Step 0: song retrieval", open=True):
        gr.Markdown("")
        with gr.Row():
            with gr.Column():
                tab_config.source_type.instantiate()
            with gr.Column():
                tab_config.source.instantiate()
                local_file = gr.Audio(
                    label="Source",
                    type="filepath",
                    visible=False,
                    waveform_options=gr.WaveformOptions(show_recording_waveform=True),
                )
                tab_config.cached_song.instance.render()

        # Show exactly one of the three source widgets depending on the
        # selected source type.
        tab_config.source_type.instance.input(
            partial(toggle_visible_component, 3),
            inputs=tab_config.source_type.instance,
            outputs=[
                tab_config.source.instance,
                local_file,
                tab_config.cached_song.instance,
            ],
            show_progress="hidden",
        )

        # Mirror the alternate source widgets into the canonical `source`
        # component, which is the single input to song retrieval.
        local_file.change(
            update_value,
            inputs=local_file,
            outputs=tab_config.source.instance,
            show_progress="hidden",
        )
        tab_config.cached_song.instance.input(
            update_value,
            inputs=tab_config.cached_song.instance,
            outputs=tab_config.source.instance,
            show_progress="hidden",
        )
        with gr.Accordion("Options", open=False):
            # By default the retrieved song is transferred to the step-1 input.
            song_transfer = _render_song_transfer(
                [SongTransferOption.STEP_1_AUDIO],
                "Song",
            )
        with gr.Row():
            retrieve_song_reset_btn = gr.Button("Reset options")
            retrieve_song_btn = gr.Button("Retrieve song", variant="primary")
            song_transfer_btn = gr.Button("Transfer song")
        song_output = gr.Audio(
            label="Song",
            type="filepath",
            interactive=False,
            waveform_options=gr.WaveformOptions(show_recording_waveform=True),
        )

        # Reset restores the default transfer destination.
        retrieve_song_reset_btn.click(
            lambda: gr.Dropdown(value=[SongTransferOption.STEP_1_AUDIO]),
            outputs=song_transfer,
            show_progress="hidden",
        )

        # Retrieve the song, then refresh every dropdown that lists song
        # directories (this tab and the one-click tab), then refresh the
        # intermediate-audio management dropdown.
        retrieve_song_btn.click(
            partial(
                exception_harness(
                    retrieve_song,
                    info_msg="Song retrieved successfully!",
                ),
                cookiefile=cookiefile,
            ),
            inputs=tab_config.source.instance,
            outputs=[song_output, current_song_dir],
        ).then(
            partial(
                update_dropdowns,
                get_named_song_dirs,
                # +2 covers the two cached-song dropdowns updated alongside
                # the per-step song-dir dropdowns.
                len(tab_config.song_dirs.all) + 2,
                value_indices=range(len(tab_config.song_dirs.all)),
            ),
            inputs=current_song_dir,
            outputs=[
                *tab_config.song_dirs.all,
                tab_config.cached_song.instance,
                total_config.song.one_click.cached_song.instance,
            ],
            show_progress="hidden",
        ).then(
            partial(update_dropdowns, get_named_song_dirs, 1, [], [0]),
            outputs=total_config.management.audio.intermediate.instance,
            show_progress="hidden",
        )
        setup_transfer_event(
            song_transfer_btn,
            song_transfer,
            song_output,
            tab_config.input_audio.all,
        )
|
|
|
|
|
|
|
|
def _render_step_1(tab_config: MultiStepSongGenerationConfig) -> None:
    # Render "Step 1: vocal separation": split an input track into a primary
    # stem (vocals) and a secondary stem (instrumentals) with the selected
    # separation model.
    with gr.Accordion("Step 1: vocal separation", open=False):
        tab_config.input_audio.audio.instance.render()
        tab_config.song_dirs.separate_audio.instance.render()
        with gr.Accordion("Options", open=False):
            with gr.Row():
                tab_config.separation_model.instantiate()
                tab_config.segment_size.instantiate()
            with gr.Row():
                # Default destinations: vocals feed step 2, instrumentals
                # feed step 4.
                primary_stem_transfer = _render_song_transfer(
                    [SongTransferOption.STEP_2_VOCALS],
                    "Primary stem",
                )
                secondary_stem_transfer = _render_song_transfer(
                    [SongTransferOption.STEP_4_INSTRUMENTALS],
                    "Secondary stem",
                )
        with gr.Row():
            separate_audio_reset_btn = gr.Button("Reset options")
            separate_vocals_btn = gr.Button("Separate vocals", variant="primary")
        with gr.Row():
            primary_stem_transfer_btn = gr.Button("Transfer primary stem")
            secondary_stem_transfer_btn = gr.Button("Transfer secondary stem")

        with gr.Row():
            primary_stem_output = gr.Audio(
                label="Primary stem",
                type="filepath",
                interactive=False,
                waveform_options=gr.WaveformOptions(show_recording_waveform=True),
            )
            secondary_stem_output = gr.Audio(
                label="Secondary stem",
                type="filepath",
                interactive=False,
                waveform_options=gr.WaveformOptions(show_recording_waveform=True),
            )

        # Reset restores default model, segment size and transfer
        # destinations.
        separate_audio_reset_btn.click(
            lambda: [
                tab_config.separation_model.value,
                tab_config.segment_size.value,
                gr.Dropdown(value=[SongTransferOption.STEP_2_VOCALS]),
                gr.Dropdown(value=[SongTransferOption.STEP_4_INSTRUMENTALS]),
            ],
            outputs=[
                tab_config.separation_model.instance,
                tab_config.segment_size.instance,
                primary_stem_transfer,
                secondary_stem_transfer,
            ],
            show_progress="hidden",
        )
        # Separation is GPU-bound, so it is serialized on the shared GPU
        # concurrency queue.
        separate_vocals_btn.click(
            exception_harness(
                separate_audio,
                info_msg="Vocals separated successfully!",
            ),
            inputs=[
                tab_config.input_audio.audio.instance,
                tab_config.song_dirs.separate_audio.instance,
                tab_config.separation_model.instance,
                tab_config.segment_size.instance,
            ],
            outputs=[primary_stem_output, secondary_stem_output],
            concurrency_limit=1,
            concurrency_id=ConcurrencyId.GPU,
        )
        for btn, transfer, output in [
            (primary_stem_transfer_btn, primary_stem_transfer, primary_stem_output),
            (
                secondary_stem_transfer_btn,
                secondary_stem_transfer,
                secondary_stem_output,
            ),
        ]:
            setup_transfer_event(
                btn,
                transfer,
                output,
                tab_config.input_audio.all,
            )
|
|
|
|
|
|
|
|
def _render_step_2(tab_config: MultiStepSongGenerationConfig) -> None:
    # Render "Step 2: vocal conversion": convert the separated vocals with a
    # selected RVC voice model, exposing pitch-shift, synthesis, enrichment
    # and speaker-embedding options.
    with gr.Accordion("Step 2: vocal conversion", open=False):
        tab_config.input_audio.vocals.instance.render()
        tab_config.voice_model.instance.render()
        tab_config.song_dirs.convert_vocals.instance.render()
        with gr.Accordion("Options", open=False):
            with gr.Row():
                tab_config.n_octaves.instantiate()
                tab_config.n_semitones.instantiate()

            # Converted vocals feed the step-3 input by default.
            converted_vocals_transfer = _render_song_transfer(
                [SongTransferOption.STEP_3_VOCALS],
                "Converted vocals",
            )
            with gr.Accordion("Advanced", open=False):
                with gr.Accordion("Voice synthesis", open=False):
                    with gr.Row():
                        tab_config.f0_methods.instantiate()
                        tab_config.index_rate.instantiate()
                    with gr.Row():
                        tab_config.rms_mix_rate.instantiate()
                        tab_config.protect_rate.instantiate()
                        tab_config.hop_length.instantiate()
                with gr.Accordion("Vocal enrichment", open=False), gr.Row():
                    with gr.Column():
                        tab_config.split_voice.instantiate()
                    with gr.Column():
                        tab_config.autotune_voice.instantiate()
                        tab_config.autotune_strength.instantiate()
                    with gr.Column():
                        tab_config.clean_voice.instantiate()
                        tab_config.clean_strength.instantiate()
                # Strength sliders are visible only while their corresponding
                # toggle is enabled.
                tab_config.autotune_voice.instance.change(
                    partial(toggle_visibility, targets={True}),
                    inputs=tab_config.autotune_voice.instance,
                    outputs=tab_config.autotune_strength.instance,
                    show_progress="hidden",
                )
                tab_config.clean_voice.instance.change(
                    partial(toggle_visibility, targets={True}),
                    inputs=tab_config.clean_voice.instance,
                    outputs=tab_config.clean_strength.instance,
                    show_progress="hidden",
                )
                with gr.Accordion("Speaker embeddings", open=False), gr.Row():
                    with gr.Column():
                        tab_config.embedder_model.instantiate()
                        tab_config.custom_embedder_model.instance.render()
                    tab_config.sid.instantiate()
                # The custom-embedder dropdown is only relevant when the
                # custom embedder model is selected.
                tab_config.embedder_model.instance.change(
                    partial(toggle_visibility, targets={EmbedderModel.CUSTOM}),
                    inputs=tab_config.embedder_model.instance,
                    outputs=tab_config.custom_embedder_model.instance,
                    show_progress="hidden",
                )
        with gr.Row():
            convert_vocals_reset_btn = gr.Button("Reset options")
            convert_vocals_btn = gr.Button("Convert vocals", variant="primary")
            converted_vocals_transfer_btn = gr.Button("Transfer converted vocals")
        converted_vocals_track_output = gr.Audio(
            label="Converted vocals",
            type="filepath",
            interactive=False,
            waveform_options=gr.WaveformOptions(show_recording_waveform=True),
        )

        # Reset restores every conversion option plus the default transfer
        # destination; the order here must match `outputs` below.
        convert_vocals_reset_btn.click(
            lambda: [
                tab_config.n_octaves.value,
                tab_config.n_semitones.value,
                tab_config.f0_methods.value,
                tab_config.index_rate.value,
                tab_config.rms_mix_rate.value,
                tab_config.protect_rate.value,
                tab_config.hop_length.value,
                tab_config.split_voice.value,
                tab_config.autotune_voice.value,
                tab_config.autotune_strength.value,
                tab_config.clean_voice.value,
                tab_config.clean_strength.value,
                tab_config.embedder_model.value,
                tab_config.sid.value,
                gr.Dropdown(value=[SongTransferOption.STEP_3_VOCALS]),
            ],
            outputs=[
                tab_config.n_octaves.instance,
                tab_config.n_semitones.instance,
                tab_config.f0_methods.instance,
                tab_config.index_rate.instance,
                tab_config.rms_mix_rate.instance,
                tab_config.protect_rate.instance,
                tab_config.hop_length.instance,
                tab_config.split_voice.instance,
                tab_config.autotune_voice.instance,
                tab_config.autotune_strength.instance,
                tab_config.clean_voice.instance,
                tab_config.clean_strength.instance,
                tab_config.embedder_model.instance,
                tab_config.sid.instance,
                converted_vocals_transfer,
            ],
            show_progress="hidden",
        )
        # Conversion is GPU-bound, so it is serialized on the shared GPU
        # concurrency queue. Input order must match the `convert` signature.
        convert_vocals_btn.click(
            partial(
                exception_harness(convert, info_msg="Vocals converted successfully!"),
                content_type=RVCContentType.VOCALS,
            ),
            inputs=[
                tab_config.input_audio.vocals.instance,
                tab_config.song_dirs.convert_vocals.instance,
                tab_config.voice_model.instance,
                tab_config.n_octaves.instance,
                tab_config.n_semitones.instance,
                tab_config.f0_methods.instance,
                tab_config.index_rate.instance,
                tab_config.rms_mix_rate.instance,
                tab_config.protect_rate.instance,
                tab_config.hop_length.instance,
                tab_config.split_voice.instance,
                tab_config.autotune_voice.instance,
                tab_config.autotune_strength.instance,
                tab_config.clean_voice.instance,
                tab_config.clean_strength.instance,
                tab_config.embedder_model.instance,
                tab_config.custom_embedder_model.instance,
                tab_config.sid.instance,
            ],
            outputs=converted_vocals_track_output,
            concurrency_id=ConcurrencyId.GPU,
            concurrency_limit=1,
        )
        setup_transfer_event(
            converted_vocals_transfer_btn,
            converted_vocals_transfer,
            converted_vocals_track_output,
            tab_config.input_audio.all,
        )
|
|
|
|
|
|
|
|
def _render_step_3(tab_config: MultiStepSongGenerationConfig) -> None:
    # Render "Step 3: vocal post-processing": apply a reverb effect to the
    # converted vocals.
    with gr.Accordion("Step 3: vocal post-processing", open=False):
        tab_config.input_audio.converted_vocals.instance.render()
        tab_config.song_dirs.postprocess_vocals.instance.render()
        with gr.Accordion("Options", open=False):
            tab_config.room_size.instantiate()
            with gr.Row():
                tab_config.wet_level.instantiate()
                tab_config.dry_level.instantiate()
                tab_config.damping.instantiate()
            # Effected vocals feed the main-vocals input of step 5 by default.
            effected_vocals_transfer = _render_song_transfer(
                [SongTransferOption.STEP_5_MAIN_VOCALS],
                "Effected vocals",
            )
        with gr.Row():
            postprocess_vocals_reset_btn = gr.Button("Reset options")
            postprocess_vocals_btn = gr.Button("Post-process vocals", variant="primary")
            effected_vocals_transfer_btn = gr.Button("Transfer effected vocals")

        effected_vocals_track_output = gr.Audio(
            label="Effected vocals",
            type="filepath",
            interactive=False,
            waveform_options=gr.WaveformOptions(show_recording_waveform=True),
        )

        # Reset restores the default reverb settings and transfer destination.
        postprocess_vocals_reset_btn.click(
            lambda: [
                tab_config.room_size.value,
                tab_config.wet_level.value,
                tab_config.dry_level.value,
                tab_config.damping.value,
                gr.Dropdown(value=[SongTransferOption.STEP_5_MAIN_VOCALS]),
            ],
            outputs=[
                tab_config.room_size.instance,
                tab_config.wet_level.instance,
                tab_config.dry_level.instance,
                tab_config.damping.instance,
                effected_vocals_transfer,
            ],
            show_progress="hidden",
        )
        # Post-processing is CPU-light, so no GPU concurrency settings here.
        postprocess_vocals_btn.click(
            exception_harness(
                postprocess,
                info_msg="Vocals post-processed successfully!",
            ),
            inputs=[
                tab_config.input_audio.converted_vocals.instance,
                tab_config.song_dirs.postprocess_vocals.instance,
                tab_config.room_size.instance,
                tab_config.wet_level.instance,
                tab_config.dry_level.instance,
                tab_config.damping.instance,
            ],
            outputs=effected_vocals_track_output,
        )
        setup_transfer_event(
            effected_vocals_transfer_btn,
            effected_vocals_transfer,
            effected_vocals_track_output,
            tab_config.input_audio.all,
        )
|
|
|
|
|
|
|
|
def _render_step_4(tab_config: MultiStepSongGenerationConfig) -> None:
    # Render "Step 4: pitch shift of background audio": independently
    # pitch-shift the instrumentals and the backup vocals so they match the
    # converted main vocals.
    with gr.Accordion("Step 4: pitch shift of background audio", open=False):
        with gr.Row():
            tab_config.input_audio.instrumentals.instance.render()
            tab_config.input_audio.backup_vocals.instance.render()
        with gr.Row():
            tab_config.n_semitones_instrumentals.instantiate()
            tab_config.n_semitones_backup_vocals.instantiate()
        tab_config.song_dirs.pitch_shift_background.instance.render()
        with gr.Accordion("Options", open=False), gr.Row():
            # Default destinations: both shifted tracks feed step 5.
            shifted_instrumentals_transfer = _render_song_transfer(
                [SongTransferOption.STEP_5_INSTRUMENTALS],
                "Pitch-shifted instrumentals",
            )
            shifted_backup_vocals_transfer = _render_song_transfer(
                [SongTransferOption.STEP_5_BACKUP_VOCALS],
                "Pitch-shifted backup vocals",
            )
        with gr.Row():
            pitch_shift_instrumentals_btn = gr.Button(
                "Pitch shift instrumentals",
                variant="primary",
            )
            pitch_shift_backup_vocals_btn = gr.Button(
                "Pitch shift backup vocals",
                variant="primary",
            )
        with gr.Row():
            shifted_instrumentals_transfer_btn = gr.Button(
                "Transfer shifted instrumentals",
            )
            shifted_backup_vocals_transfer_btn = gr.Button(
                "Transfer shifted backup vocals",
            )
            pitch_shift_background_reset_btn = gr.Button("Reset options")
        with gr.Row():
            shifted_instrumentals_track_output = gr.Audio(
                label="Pitch-shifted instrumentals",
                type="filepath",
                interactive=False,
                waveform_options=gr.WaveformOptions(show_recording_waveform=True),
            )
            shifted_backup_vocals_track_output = gr.Audio(
                label="Pitch-shifted backup vocals",
                type="filepath",
                interactive=False,
                waveform_options=gr.WaveformOptions(show_recording_waveform=True),
            )

        # Reset restores the default shift amounts and transfer destinations.
        pitch_shift_background_reset_btn.click(
            lambda: [
                tab_config.n_semitones_instrumentals.value,
                tab_config.n_semitones_backup_vocals.value,
                gr.Dropdown(value=[SongTransferOption.STEP_5_INSTRUMENTALS]),
                gr.Dropdown(value=[SongTransferOption.STEP_5_BACKUP_VOCALS]),
            ],
            outputs=[
                tab_config.n_semitones_instrumentals.instance,
                tab_config.n_semitones_backup_vocals.instance,
                shifted_instrumentals_transfer,
                shifted_backup_vocals_transfer,
            ],
            show_progress="hidden",
        )
        # The two pitch-shift buttons reuse the same backend function with
        # different inputs and success messages.
        pitch_shift_instrumentals_btn.click(
            exception_harness(
                pitch_shift,
                info_msg="Instrumentals pitch-shifted successfully!",
            ),
            inputs=[
                tab_config.input_audio.instrumentals.instance,
                tab_config.song_dirs.pitch_shift_background.instance,
                tab_config.n_semitones_instrumentals.instance,
            ],
            outputs=shifted_instrumentals_track_output,
        )
        pitch_shift_backup_vocals_btn.click(
            exception_harness(
                pitch_shift,
                info_msg="Backup vocals pitch-shifted successfully!",
            ),
            inputs=[
                tab_config.input_audio.backup_vocals.instance,
                tab_config.song_dirs.pitch_shift_background.instance,
                tab_config.n_semitones_backup_vocals.instance,
            ],
            outputs=shifted_backup_vocals_track_output,
        )
        for btn, transfer, output in [
            (
                shifted_instrumentals_transfer_btn,
                shifted_instrumentals_transfer,
                shifted_instrumentals_track_output,
            ),
            (
                shifted_backup_vocals_transfer_btn,
                shifted_backup_vocals_transfer,
                shifted_backup_vocals_track_output,
            ),
        ]:
            setup_transfer_event(
                btn,
                transfer,
                output,
                tab_config.input_audio.all,
            )
|
|
|
|
|
|
|
|
def _render_step_5(
    total_config: TotalConfig,
    tab_config: MultiStepSongGenerationConfig,
) -> None:
    # Render "Step 5: song mixing": combine the effected main vocals with the
    # pitch-shifted instrumentals and backup vocals into the final song cover.
    with gr.Accordion("Step 5: song mixing", open=False):
        with gr.Row():
            tab_config.input_audio.main_vocals.instance.render()
            tab_config.input_audio.shifted_instrumentals.instance.render()
            tab_config.input_audio.shifted_backup_vocals.instance.render()
        tab_config.song_dirs.mix.instance.render()
        with gr.Accordion("Options", open=False):
            with gr.Row():
                tab_config.main_gain.instantiate()
                tab_config.inst_gain.instantiate()
                tab_config.backup_gain.instantiate()
            with gr.Row():
                # Auto-suggest an output name from the main vocals track and
                # the selected song directory.
                tab_config.output_name.instantiate(
                    value=partial(
                        update_output_name,
                        get_song_cover_name,
                        False,
                    ),
                    inputs=[
                        tab_config.input_audio.main_vocals.instance,
                        tab_config.song_dirs.mix.instance,
                    ],
                )
                tab_config.output_sr.instantiate()
                tab_config.output_format.instantiate()
            # No default transfer destination: step 5 is the final step.
            song_cover_transfer = _render_song_transfer([], "Song cover")
        with gr.Row():
            mix_reset_btn = gr.Button("Reset options")
            mix_btn = gr.Button("Mix song cover", variant="primary")
            song_cover_transfer_btn = gr.Button("Transfer song cover")
        song_cover_output = gr.Audio(
            label="Song cover",
            type="filepath",
            interactive=False,
            waveform_options=gr.WaveformOptions(show_recording_waveform=True),
        )
        # Reset restores default gains, output settings and transfer choices.
        mix_reset_btn.click(
            lambda: [
                tab_config.main_gain.value,
                tab_config.inst_gain.value,
                tab_config.backup_gain.value,
                tab_config.output_sr.value,
                tab_config.output_format.value,
                gr.Dropdown(value=[]),
            ],
            outputs=[
                tab_config.main_gain.instance,
                tab_config.inst_gain.instance,
                tab_config.backup_gain.instance,
                tab_config.output_sr.instance,
                tab_config.output_format.instance,
                song_cover_transfer,
            ],
            show_progress="hidden",
        )
        # Intermediate state holding (track, gain) pairs assembled from the
        # individual audio and slider components.
        temp_audio_gains = gr.State()
        mix_btn.click(
            partial(
                _pair_audio_tracks_and_gain,
                [
                    tab_config.input_audio.main_vocals.instance,
                    tab_config.input_audio.shifted_instrumentals.instance,
                    tab_config.input_audio.shifted_backup_vocals.instance,
                ],
                [
                    tab_config.main_gain.instance,
                    tab_config.inst_gain.instance,
                    tab_config.backup_gain.instance,
                ],
            ),
            # A set of inputs makes Gradio pass a component -> value mapping,
            # which _pair_audio_tracks_and_gain expects.
            inputs={
                tab_config.input_audio.main_vocals.instance,
                tab_config.input_audio.shifted_instrumentals.instance,
                tab_config.input_audio.shifted_backup_vocals.instance,
                tab_config.main_gain.instance,
                tab_config.inst_gain.instance,
                tab_config.backup_gain.instance,
            },
            outputs=temp_audio_gains,
        ).then(
            # FIX: corrected typo in the user-facing message
            # ("succesfully" -> "successfully").
            exception_harness(mix_song, info_msg="Song cover successfully generated."),
            inputs=[
                temp_audio_gains,
                tab_config.song_dirs.mix.instance,
                tab_config.output_sr.instance,
                tab_config.output_format.instance,
                tab_config.output_name.instance,
            ],
            outputs=song_cover_output,
        ).then(
            # Refresh the output-audio management dropdown with the new file.
            partial(update_dropdowns, get_saved_output_audio, 1, [], [0]),
            outputs=total_config.management.audio.output.instance,
            show_progress="hidden",
        )
        setup_transfer_event(
            song_cover_transfer_btn,
            song_cover_transfer,
            song_cover_output,
            tab_config.input_audio.all,
        )
|
|
|
|
|
|
|
|
def _render_song_transfer(
    value: list[SongTransferOption],
    label_prefix: str,
) -> gr.Dropdown:
    """
    Render a dropdown for choosing which song-cover inputs a generated
    track should be transferred to.

    Parameters
    ----------
    value : list[SongTransferOption]
        The transfer destinations selected by default.
    label_prefix : str
        Prefix for the dropdown's label.

    Returns
    -------
    gr.Dropdown
        The rendered transfer dropdown.

    """
    transfer_dropdown = render_transfer_component(
        value,
        label_prefix,
        SongTransferOption,
    )
    return transfer_dropdown
|
|
|
|
|
|
|
|
def _pair_audio_tracks_and_gain(
    audio_components: Sequence[gr.Audio],
    gain_components: Sequence[gr.Slider],
    data: dict[gr.Audio | gr.Slider, Any],
) -> list[tuple[str, int]]:
    """
    Pair audio tracks and gain levels stored in separate gradio
    components.

    This function is meant to first be partially applied to the sequence
    of audio components and the sequence of slider components containing
    the values that should be combined. The resulting function can then
    be called by an event listener whose inputs is a set containing
    those audio and slider components. The `data` parameter in that case
    will contain a mapping from each of those components to the value
    that the component stores.

    Parameters
    ----------
    audio_components : Sequence[gr.Audio]
        Audio components to pair with gain levels.
    gain_components : Sequence[gr.Slider]
        Gain level components to pair with audio tracks.
    data : dict[gr.Audio | gr.Slider, Any]
        Data from the audio and gain components.

    Returns
    -------
    list[tuple[str, int]]
        Paired audio tracks and gain levels.

    Raises
    ------
    ValueError
        If the number of audio tracks and gain levels are not the same.

    """
    tracks = [data[audio] for audio in audio_components]
    gains = [data[slider] for slider in gain_components]
    if len(tracks) != len(gains):
        err_msg = "Number of audio tracks and gain levels must be the same."
        raise ValueError(err_msg)
    # Keep only the pairs whose audio track is actually set; empty tracks
    # (None or "") carry no audio to mix.
    pairs: list[tuple[str, int]] = []
    for track, gain in zip(tracks, gains, strict=True):
        if track:
            pairs.append((track, gain))
    return pairs
|
|
|
|
|
|
|
|
def run_newpipeline(
    source: str,
    model_name: str,
    n_octaves: int = 0,
    n_semitones: int = 0,
    f0_methods: Sequence[F0Method] | None = None,
    index_rate: float = 0.3,
    rms_mix_rate: float = 1.0,
    protect_rate: float = 0.33,
    hop_length: int = 128,
    split_vocals: bool = False,
    autotune_vocals: bool = False,
    autotune_strength: float = 1.0,
    clean_vocals: bool = False,
    clean_strength: float = 0.7,
    embedder_model: EmbedderModel = EmbedderModel.CONTENTVEC,
    custom_embedder_model: str | None = None,
    sid: int = 0,
    room_size: float = 0.15,
    wet_level: float = 0.2,
    dry_level: float = 0.8,
    damping: float = 0.7,
    main_gain: int = 0,
    inst_gain: int = 0,
    backup_gain: int = 0,
    output_sr: int = 44100,
    output_format: AudioExt = AudioExt.MP3,
    output_name: str | None = None,
    cookiefile: StrPath | None = None,
    progress_bar: gr.Progress | None = None,
) -> tuple[Path, ...]:
    """
    Run the song cover generation pipeline.

    Parameters
    ----------
    source : str
        A Youtube URL, the path to a local audio file or the path to a
        song directory.
    model_name : str
        The name of the voice model to use for vocal conversion.
    n_octaves : int, default=0
        The number of octaves to pitch-shift the converted vocals by.
    n_semitones : int, default=0
        The number of semi-tones to pitch-shift the converted vocals,
        instrumentals, and backup vocals by.
    f0_methods : Sequence[F0Method], optional
        The methods to use for pitch extraction during vocal
        conversion. If None, the method used is rmvpe.
    index_rate : float, default=0.3
        The influence of the index file on the vocal conversion.
    rms_mix_rate : float, default=1.0
        The blending rate of the volume envelope of the converted
        vocals.
    protect_rate : float, default=0.33
        The protect rate for consonants and breathing sounds during
        vocal conversion.
    hop_length : int, default=128
        The hop length to use for crepe-based pitch detection.
    split_vocals : bool, default=False
        Whether to perform audio splitting before converting the main
        vocals.
    autotune_vocals : bool, default=False
        Whether to apply autotune to the converted vocals.
    autotune_strength : float, default=1.0
        The strength of the autotune to apply to the converted vocals.
    clean_vocals : bool, default=False
        Whether to clean the converted vocals.
    clean_strength : float, default=0.7
        The intensity of the cleaning to apply to the converted vocals.
    embedder_model : EmbedderModel, default=EmbedderModel.CONTENTVEC
        The model to use for generating speaker embeddings during vocal
        conversion.
    custom_embedder_model : StrPath, optional
        The name of a custom embedder model to use for generating
        speaker embeddings during vocal conversion.
    sid : int, default=0
        The speaker id to use for multi-speaker models during vocal
        conversion.
    room_size : float, default=0.15
        The room size of the reverb effect to apply to the converted
        vocals.
    wet_level : float, default=0.2
        The wetness level of the reverb effect to apply to the converted
        vocals.
    dry_level : float, default=0.8
        The dryness level of the reverb effect to apply to the converted
        vocals.
    damping : float, default=0.7
        The damping of the reverb effect to apply to the converted
        vocals.
    main_gain : int, default=0
        The gain to apply to the post-processed vocals.
    inst_gain : int, default=0
        The gain to apply to the pitch-shifted instrumentals.
    backup_gain : int, default=0
        The gain to apply to the pitch-shifted backup vocals.
    output_sr : int, default=44100
        The sample rate of the song cover.
    output_format : AudioExt, default=AudioExt.MP3
        The audio format of the song cover.
    output_name : str, optional
        The name of the song cover.
    cookiefile : StrPath, optional
        The path to a file containing cookies to use when downloading
        audio from Youtube.
    progress_bar : gr.Progress, optional
        Gradio progress bar to update.

    Returns
    -------
    tuple[Path,...]
        The path to the generated song cover and the paths to any
        intermediate audio files that were generated.

    """
    validate_model(model_name, Entity.VOICE_MODEL)
    if embedder_model == EmbedderModel.CUSTOM:
        validate_model(custom_embedder_model, Entity.CUSTOM_EMBEDDER_MODEL)
    display_progress("[~] Retrieving song...", 0 / 9, progress_bar)
    song, song_dir = retrieve_song(source, cookiefile=cookiefile)
    display_progress(
        "[~] newpipeline: Separating vocals from instrumentals...", 1 / 9, progress_bar
    )
    vocals_track, instrumentals_track = separate_audio(
        song,
        song_dir,
        SeparationModel.MDX23C_8KFFT_InstVoc_HQ_2,
        SegmentSize.SEG_2048,
    )
    display_progress(
        "[~] newpipeline: Separating main vocals from backup vocals...",
        2 / 9,
        progress_bar,
    )
    backup_vocals_track, main_vocals_track = separate_audio(
        vocals_track,
        song_dir,
        SeparationModel.UVR_MDX_NET_KARA_2,
        SegmentSize.SEG_2048,
    )
    display_progress("[~] newpipeline: De-noising vocals...", 3 / 9, progress_bar)
    # FIX: this step previously read `clean_track` before it was assigned,
    # which raised NameError at runtime. De-noise the separated main vocals
    # and feed the cleaned result into the de-reverb step below, so the
    # denoise -> dereverb chain operates on successive outputs.
    noise_track, clean_track = separate_audio(
        main_vocals_track,
        song_dir,
        SeparationModel.UVR_DeNoise,
        SegmentSize.SEG_2048,
    )
    display_progress("[~] newpipeline: De-reverbing vocals...", 4 / 9, progress_bar)
    reverb_track, vocals_dereverb_track = separate_audio(
        clean_track,
        song_dir,
        SeparationModel.UVR_DeEcho_DeReverb,
        SegmentSize.SEG_2048,
    )

    display_progress("[~] newpipeline: Converting vocals...", 5 / 9, progress_bar)
    converted_vocals_track = convert(
        audio_track=vocals_dereverb_track,
        directory=song_dir,
        model_name=model_name,
        n_octaves=n_octaves,
        n_semitones=n_semitones,
        f0_methods=f0_methods,
        index_rate=index_rate,
        rms_mix_rate=rms_mix_rate,
        protect_rate=protect_rate,
        hop_length=hop_length,
        split_audio=split_vocals,
        autotune_audio=autotune_vocals,
        autotune_strength=autotune_strength,
        clean_audio=clean_vocals,
        clean_strength=clean_strength,
        embedder_model=embedder_model,
        custom_embedder_model=custom_embedder_model,
        sid=sid,
        content_type=RVCContentType.VOCALS,
    )
    display_progress("[~] newpipeline: Post-processing vocals...", 6 / 9, progress_bar)
    effected_vocals_track = postprocess(
        converted_vocals_track,
        song_dir,
        room_size,
        wet_level,
        dry_level,
        damping,
    )
    display_progress(
        "[~] newpipeline: Pitch-shifting instrumentals...", 7 / 9, progress_bar
    )
    shifted_instrumentals_track = pitch_shift(
        instrumentals_track,
        song_dir,
        n_semitones,
    )
    display_progress(
        "[~] newpipeline: Pitch-shifting backup vocals...", 8 / 9, progress_bar
    )
    shifted_backup_vocals_track = pitch_shift(
        backup_vocals_track,
        song_dir,
        n_semitones,
    )

    song_cover = mix_song(
        [
            (effected_vocals_track, main_gain),
            (shifted_instrumentals_track, inst_gain),
            (shifted_backup_vocals_track, backup_gain),
        ],
        song_dir,
        output_sr,
        output_format,
        output_name,
    )
    # Return the final cover first, followed by every intermediate track, so
    # callers can surface or cache the intermediates.
    return (
        song_cover,
        song,
        vocals_track,
        instrumentals_track,
        main_vocals_track,
        backup_vocals_track,
        vocals_dereverb_track,
        reverb_track,
        converted_vocals_track,
        effected_vocals_track,
        shifted_instrumentals_track,
        shifted_backup_vocals_track,
    )
|
|
|
|
|
|
|
|
def render_app() -> gr.Blocks:
    """
    Render the Ultimate RVC web application.

    Builds the full Gradio Blocks app: pre-instantiates all dropdown
    components that are shared across tabs, renders the Music, Speech
    and Configuration tabs, and wires a page-load event that refreshes
    every dynamic dropdown.

    Returns
    -------
    gr.Blocks
        The rendered web application.

    """
    # Center the main heading and embolden the top-level tab buttons.
    css = """
    h1 { text-align: center; margin-top: 20px; margin-bottom: 20px; }

    #generate-tab-button { font-weight: bold !important;}
    #manage-tab-button { font-weight: bold !important;}
    #audio-tab-button { font-weight: bold !important;}
    #settings-tab-button { font-weight: bold !important;}
    """
    # Gradio cache cleanup: scan every 86400 s (one day) and delete
    # cached files older than 86400 s.
    cache_delete_frequency = 86400
    cache_delete_cutoff = 86400

    with gr.Blocks(
        title="Redzone-6 Audio Playground",
        theme=gr.Theme.load(str(Path(__file__).parent / "config/theme.json")),
        css=css,
        delete_cache=(cache_delete_frequency, cache_delete_cutoff),
    ) as app:
        # Instantiate up-front every component that is shared between
        # tabs, so the tab-rendering functions below can attach events
        # to `.instance` before rendering the component in place.
        # NOTE(review): `total_config` and `cookiefile` appear to be
        # module-level globals defined earlier in this file — confirm.
        for component_config in [
            total_config.song.one_click.voice_model,
            total_config.song.one_click.cached_song,
            total_config.song.one_click.custom_embedder_model,
            total_config.song.multi_step.voice_model,
            total_config.song.multi_step.cached_song,
            total_config.song.multi_step.custom_embedder_model,
            total_config.song.multi_step.song_dirs.separate_audio,
            total_config.song.multi_step.song_dirs.convert_vocals,
            total_config.song.multi_step.song_dirs.postprocess_vocals,
            total_config.song.multi_step.song_dirs.pitch_shift_background,
            total_config.song.multi_step.song_dirs.mix,
            total_config.speech.one_click.edge_tts_voice,
            total_config.speech.one_click.voice_model,
            total_config.speech.one_click.custom_embedder_model,
            total_config.speech.multi_step.edge_tts_voice,
            total_config.speech.multi_step.voice_model,
            total_config.speech.multi_step.custom_embedder_model,
            total_config.training.multi_step.dataset,
            total_config.training.multi_step.preprocess_model,
            total_config.training.multi_step.extract_model,
            total_config.training.multi_step.train_model,
            total_config.training.multi_step.custom_embedder_model,
            total_config.training.multi_step.custom_pretrained_model,
            total_config.management.audio.intermediate,
            total_config.management.audio.speech,
            total_config.management.audio.output,
            total_config.management.audio.dataset,
            total_config.management.model.voices,
            total_config.management.model.embedders,
            total_config.management.model.pretraineds,
            total_config.management.model.traineds,
            total_config.management.settings.load_config_name,
            total_config.management.settings.delete_config_names,
        ]:
            component_config.instantiate()

        with gr.Tab("Music", elem_id="generate-tab"):
            render_song_cover_one_click_tab(total_config, cookiefile)
            render_song_cover_multi_step_tab(total_config, cookiefile)
        # NOTE(review): this tab reuses elem_id="generate-tab" from the
        # Music tab above; element ids should normally be unique —
        # confirm whether this is intentional (both share the bold CSS).
        with gr.Tab("Speech", elem_id="generate-tab"):
            render_speech_one_click_tab(total_config)
            render_speech_multi_step_tab(total_config)
        with gr.Tab("Configuration", elem_id="settings-tab"):
            with gr.Tab("Models"):
                render_models_tab(total_config)
            with gr.Tab("Settings"):
                render_settings_tab(total_config)
        render_audio_tab(total_config)

        # Refresh all non-static dropdowns when the page loads.  The
        # order of `outputs` must match, one-for-one, the order of the
        # list returned by `_init_dropdowns` below.
        app.load(
            _init_dropdowns,
            outputs=[
                total_config.speech.one_click.edge_tts_voice.instance,
                total_config.speech.multi_step.edge_tts_voice.instance,
                total_config.song.one_click.voice_model.instance,
                total_config.song.multi_step.voice_model.instance,
                total_config.speech.one_click.voice_model.instance,
                total_config.speech.multi_step.voice_model.instance,
                total_config.management.model.voices.instance,
                total_config.song.one_click.custom_embedder_model.instance,
                total_config.song.multi_step.custom_embedder_model.instance,
                total_config.speech.one_click.custom_embedder_model.instance,
                total_config.speech.multi_step.custom_embedder_model.instance,
                total_config.training.multi_step.custom_embedder_model.instance,
                total_config.management.model.embedders.instance,
                total_config.training.multi_step.custom_pretrained_model.instance,
                total_config.management.model.pretraineds.instance,
                total_config.training.multi_step.extract_model.instance,
                total_config.training.multi_step.train_model.instance,
                total_config.training.multi_step.preprocess_model.instance,
                total_config.management.model.traineds.instance,
                total_config.song.one_click.cached_song.instance,
                total_config.song.multi_step.cached_song.instance,
                total_config.song.multi_step.song_dirs.separate_audio.instance,
                total_config.song.multi_step.song_dirs.convert_vocals.instance,
                total_config.song.multi_step.song_dirs.postprocess_vocals.instance,
                total_config.song.multi_step.song_dirs.pitch_shift_background.instance,
                total_config.song.multi_step.song_dirs.mix.instance,
                total_config.management.audio.intermediate.instance,
                total_config.training.multi_step.dataset.instance,
                total_config.management.audio.speech.instance,
                total_config.management.audio.output.instance,
                total_config.management.audio.dataset.instance,
                total_config.management.settings.load_config_name.instance,
                total_config.management.settings.delete_config_names.instance,
            ],
            show_progress="hidden",
        )
    return app
|
|
|
|
|
|
|
|
def _init_dropdowns() -> list[gr.Dropdown]:
    """
    Initialize the Ultimate RVC web application by updating the choices
    and default values of non-static dropdown components.

    Returns
    -------
    list[gr.Dropdown]
        The dropdown components with refreshed choices and default
        values, in the exact order expected by the ``app.load``
        outputs in ``render_app``.

    """
    updated: list[gr.Dropdown] = []
    # Two Edge TTS voice dropdowns, both defaulting to the same voice.
    updated.extend(
        initialize_dropdowns(
            get_edge_tts_voice_names,
            2,
            "en-US-ChristopherNeural",
            range(2),
        ),
    )
    # Five voice-model dropdowns; the first four get a default value.
    updated.extend(
        initialize_dropdowns(get_voice_model_names, 5, value_indices=range(4)),
    )
    # Six custom embedder dropdowns; the first five get a default value.
    updated.extend(
        initialize_dropdowns(
            get_custom_embedder_model_names,
            6,
            value_indices=range(5),
        ),
    )
    # Two custom pretrained-model dropdowns; only the first gets a default.
    updated.extend(
        initialize_dropdowns(
            get_custom_pretrained_model_names,
            2,
            value_indices=range(1),
        ),
    )
    # Four training-model dropdowns; the first two get a default value.
    updated.extend(
        initialize_dropdowns(get_training_model_names, 4, value_indices=range(2)),
    )
    # Eight song-directory dropdowns; the first seven get a default value.
    updated.extend(
        initialize_dropdowns(get_named_song_dirs, 8, value_indices=range(7)),
    )
    # Single dropdowns for dataset selection and audio deletion.
    updated.append(gr.Dropdown(get_audio_datasets()))
    updated.append(gr.Dropdown(get_saved_speech_audio()))
    updated.append(gr.Dropdown(get_saved_output_audio()))
    updated.append(gr.Dropdown(get_named_audio_datasets()))
    # Two config-name dropdowns; only the first gets a default value.
    updated.extend(initialize_dropdowns(get_config_names, 2, value_indices=range(1)))
    return updated
|
|
|
|
|
|
|
|
def render_song_cover_one_click_tab(
    total_config: TotalConfig, cookiefile: str | None = None
) -> None:
    """
    Render "Generate song covers - One-click generation" tab.

    Parameters
    ----------
    total_config : TotalConfig
        Model containing all component configuration settings for the
        Ultimate RVC web UI.
    cookiefile : str, optional
        The path to a file containing cookies to use when downloading
        audio from Youtube.

    """
    with gr.Tab("One-click"):
        tab_config = total_config.song.one_click
        _render_input(tab_config)
        with gr.Accordion("Options", open=False):
            _render_main_options(tab_config)
            _render_conversion_options(tab_config)
            _render_mixing_options(tab_config)
            _render_output_options(tab_config)
        _render_intermediate_audio(tab_config)

        with gr.Row(equal_height=True):
            reset_btn = gr.Button(value="Reset options", scale=2)
            generate_btn = gr.Button("Generate", scale=2, variant="primary")
            song_cover = gr.Audio(
                label="Song cover",
                scale=3,
                waveform_options=gr.WaveformOptions(show_recording_waveform=False),
            )
        song_dirs = total_config.song.multi_step.song_dirs.all
        # Run the full one-click pipeline on click.  The positional
        # order of `inputs` must match the parameter order of
        # `run_pipeline` (defined elsewhere in this file); `outputs`
        # receive the final cover plus every intermediate track.
        generate_btn.click(
            partial(
                exception_harness(
                    run_pipeline,
                    info_msg="Song cover generated successfully!",
                ),
                cookiefile=cookiefile,
                progress_bar=PROGRESS_BAR,
            ),
            inputs=[
                tab_config.source.instance,
                tab_config.voice_model.instance,
                tab_config.n_octaves.instance,
                tab_config.n_semitones.instance,
                tab_config.f0_methods.instance,
                tab_config.index_rate.instance,
                tab_config.rms_mix_rate.instance,
                tab_config.protect_rate.instance,
                tab_config.hop_length.instance,
                tab_config.split_voice.instance,
                tab_config.autotune_voice.instance,
                tab_config.autotune_strength.instance,
                tab_config.clean_voice.instance,
                tab_config.clean_strength.instance,
                tab_config.embedder_model.instance,
                tab_config.custom_embedder_model.instance,
                tab_config.sid.instance,
                tab_config.room_size.instance,
                tab_config.wet_level.instance,
                tab_config.dry_level.instance,
                tab_config.damping.instance,
                tab_config.main_gain.instance,
                tab_config.inst_gain.instance,
                tab_config.backup_gain.instance,
                tab_config.output_sr.instance,
                tab_config.output_format.instance,
                tab_config.output_name.instance,
            ],
            outputs=[song_cover, *tab_config.intermediate_audio.all],
            concurrency_limit=4,
            concurrency_id=ConcurrencyId.GPU,
        ).success(
            # A new song directory may have been created: refresh the
            # two cached-song dropdowns, the intermediate-audio
            # management dropdown and every multi-step song-dir
            # dropdown (hence 3 + len(song_dirs) components).
            partial(update_dropdowns, get_named_song_dirs, 3 + len(song_dirs), [], [2]),
            outputs=[
                total_config.song.one_click.cached_song.instance,
                total_config.song.multi_step.cached_song.instance,
                total_config.management.audio.intermediate.instance,
                *song_dirs,
            ],
            show_progress="hidden",
        ).then(
            # A new output file was written: refresh the saved-output
            # management dropdown.
            partial(update_dropdowns, get_saved_output_audio, 1, [], [0]),
            outputs=total_config.management.audio.output.instance,
            show_progress="hidden",
        )
        # Restore every option to its configured default.  The order of
        # values returned by the lambda must match the order of
        # `outputs` below, one-for-one.
        reset_btn.click(
            lambda: [
                tab_config.n_octaves.value,
                tab_config.n_semitones.value,
                tab_config.f0_methods.value,
                tab_config.index_rate.value,
                tab_config.rms_mix_rate.value,
                tab_config.protect_rate.value,
                tab_config.hop_length.value,
                tab_config.split_voice.value,
                tab_config.autotune_voice.value,
                tab_config.autotune_strength.value,
                tab_config.clean_voice.value,
                tab_config.clean_strength.value,
                tab_config.embedder_model.value,
                tab_config.sid.value,
                tab_config.room_size.value,
                tab_config.wet_level.value,
                tab_config.dry_level.value,
                tab_config.damping.value,
                tab_config.main_gain.value,
                tab_config.inst_gain.value,
                tab_config.backup_gain.value,
                tab_config.output_sr.value,
                tab_config.output_format.value,
                tab_config.show_intermediate_audio.value,
            ],
            outputs=[
                tab_config.n_octaves.instance,
                tab_config.n_semitones.instance,
                tab_config.f0_methods.instance,
                tab_config.index_rate.instance,
                tab_config.rms_mix_rate.instance,
                tab_config.protect_rate.instance,
                tab_config.hop_length.instance,
                tab_config.split_voice.instance,
                tab_config.autotune_voice.instance,
                tab_config.autotune_strength.instance,
                tab_config.clean_voice.instance,
                tab_config.clean_strength.instance,
                tab_config.embedder_model.instance,
                tab_config.sid.instance,
                tab_config.room_size.instance,
                tab_config.wet_level.instance,
                tab_config.dry_level.instance,
                tab_config.damping.instance,
                tab_config.main_gain.instance,
                tab_config.inst_gain.instance,
                tab_config.backup_gain.instance,
                tab_config.output_sr.instance,
                tab_config.output_format.instance,
                tab_config.show_intermediate_audio.instance,
            ],
            show_progress="hidden",
        )
|
|
|
|
|
|
|
|
def _render_input(tab_config: OneClickSongGenerationConfig) -> None:
    """
    Render the song-source input components for the one-click tab.

    Renders a source-type selector next to three interchangeable source
    inputs (text/URL, local file upload, cached song) of which only one
    is visible at a time, followed by the voice-model dropdown.
    """
    with gr.Row():
        with gr.Column():
            tab_config.source_type.instantiate()
        with gr.Column():
            tab_config.source.instantiate()
            local_file = gr.Audio(
                label="Source",
                type="filepath",
                visible=False,
                waveform_options=gr.WaveformOptions(show_recording_waveform=False),
            )
            # The cached-song dropdown was instantiated up-front in
            # `render_app`; render its instance in place here.
            tab_config.cached_song.instance.render()
        # Show exactly one of the three source inputs depending on the
        # selected source type.
        tab_config.source_type.instance.input(
            partial(toggle_visible_component, 3),
            inputs=tab_config.source_type.instance,
            outputs=[
                tab_config.source.instance,
                local_file,
                tab_config.cached_song.instance,
            ],
            show_progress="hidden",
        )

    # Mirror the alternative inputs into the canonical `source`
    # component so the generate handler only ever reads one component.
    local_file.change(
        update_value,
        inputs=local_file,
        outputs=tab_config.source.instance,
        show_progress="hidden",
    )
    tab_config.cached_song.instance.input(
        update_value,
        inputs=tab_config.cached_song.instance,
        outputs=tab_config.source.instance,
        show_progress="hidden",
    )

    with gr.Row():
        tab_config.voice_model.instance.render()
|
|
|
|
|
|
|
|
def _render_main_options(tab_config: OneClickSongGenerationConfig) -> None:
    """Render the main pitch-shift options for the one-click song tab."""
    with gr.Row():
        for pitch_option in (tab_config.n_octaves, tab_config.n_semitones):
            pitch_option.instantiate()
|
|
|
|
|
|
|
|
def _render_conversion_options(tab_config: OneClickSongGenerationConfig) -> None:
    """
    Render the vocal conversion options for the one-click song tab.

    Groups the options into three sub-accordions (voice synthesis,
    vocal enrichment, speaker embedding) and wires visibility toggles
    for the conditional strength sliders and the custom-embedder
    dropdown.
    """
    with gr.Accordion("Vocal conversion", open=True):
        # Empty markdown acts as a small spacer under the accordion title.
        gr.Markdown("")
        with gr.Accordion("Voice synthesis", open=True):
            with gr.Row():
                tab_config.f0_methods.instantiate()
                tab_config.index_rate.instantiate()
            with gr.Row():
                tab_config.rms_mix_rate.instantiate()
                tab_config.protect_rate.instantiate()
                tab_config.hop_length.instantiate()
        with gr.Accordion("Vocal enrichment", open=True):
            with gr.Row():
                with gr.Column():
                    tab_config.split_voice.instantiate()
                with gr.Column():
                    tab_config.autotune_voice.instantiate()
                    tab_config.autotune_strength.instantiate()
                with gr.Column():
                    tab_config.clean_voice.instantiate()
                    tab_config.clean_strength.instantiate()
            # Strength sliders are only shown while their checkbox is on.
            tab_config.autotune_voice.instance.change(
                partial(toggle_visibility, targets={True}),
                inputs=tab_config.autotune_voice.instance,
                outputs=tab_config.autotune_strength.instance,
                show_progress="hidden",
            )
            tab_config.clean_voice.instance.change(
                partial(toggle_visibility, targets={True}),
                inputs=tab_config.clean_voice.instance,
                outputs=tab_config.clean_strength.instance,
                show_progress="hidden",
            )
        with gr.Accordion("Speaker embedding", open=True):
            with gr.Row():
                with gr.Column():
                    tab_config.embedder_model.instantiate()
                    # Instantiated up-front in `render_app`; rendered here.
                    tab_config.custom_embedder_model.instance.render()
                tab_config.sid.instantiate()
            # The custom-embedder dropdown is only shown when the
            # "custom" embedder model is selected.
            tab_config.embedder_model.instance.change(
                partial(toggle_visibility, targets={EmbedderModel.CUSTOM}),
                inputs=tab_config.embedder_model.instance,
                outputs=tab_config.custom_embedder_model.instance,
                show_progress="hidden",
            )
|
|
|
|
|
|
|
|
def _render_mixing_options(tab_config: OneClickSongGenerationConfig) -> None:
    """Render the audio mixing options for the one-click song tab."""
    with gr.Accordion("Audio mixing", open=True):
        # Empty markdown acts as a small spacer under the accordion title.
        gr.Markdown("")
        with gr.Accordion("Reverb control on converted vocals", open=True):
            with gr.Row():
                tab_config.room_size.instantiate()
            with gr.Row():
                for reverb_option in (
                    tab_config.wet_level,
                    tab_config.dry_level,
                    tab_config.damping,
                ):
                    reverb_option.instantiate()

        with gr.Accordion("Volume controls (dB)", open=True), gr.Row():
            for gain_option in (
                tab_config.main_gain,
                tab_config.inst_gain,
                tab_config.backup_gain,
            ):
                gain_option.instantiate()
|
|
|
|
|
|
|
|
def _render_output_options(tab_config: OneClickSongGenerationConfig) -> None:
    """Render the audio output options for the one-click song tab."""
    with gr.Accordion("Audio output", open=True):
        with gr.Row():
            # Derive a default output name from the cached song and the
            # selected voice model whenever either of them changes.
            name_updater = partial(update_output_name, get_song_cover_name, True)
            name_inputs = [
                gr.State(None),
                tab_config.cached_song.instance,
                tab_config.voice_model.instance,
            ]
            tab_config.output_name.instantiate(value=name_updater, inputs=name_inputs)
            tab_config.output_sr.instantiate()
            tab_config.output_format.instantiate()
        with gr.Row():
            tab_config.show_intermediate_audio.instantiate()
|
|
|
|
|
|
|
|
def _render_intermediate_audio(tab_config: OneClickSongGenerationConfig) -> None:
    """
    Render the intermediate audio accordions for the one-click tab.

    Renders one accordion per pipeline step, all initially hidden, and
    wires the "show intermediate audio" checkbox to toggle the
    visibility of the outer accordion and its seven step accordions.
    """
    with gr.Accordion(
        "Intermediate audio tracks",
        open=False,
        visible=False,
    ) as intermediate_audio_accordion:
        with gr.Accordion(
            "Step 0: song retrieval",
            open=False,
        ) as song_retrieval_accordion:
            tab_config.intermediate_audio.song.instantiate()
        with (
            gr.Accordion(
                "Step 1a: vocals/instrumentals separation",
                open=False,
            ) as vocals_separation_accordion,
            gr.Row(),
        ):
            tab_config.intermediate_audio.vocals.instantiate()
            tab_config.intermediate_audio.instrumentals.instantiate()
        with (
            gr.Accordion(
                "Step 1b: main vocals/ backup vocals separation",
                open=False,
            ) as main_vocals_separation_accordion,
            gr.Row(),
        ):
            tab_config.intermediate_audio.main_vocals.instantiate()
            tab_config.intermediate_audio.backup_vocals.instantiate()
        with (
            gr.Accordion(
                "Step 1c: main vocals cleanup",
                open=False,
            ) as vocal_cleanup_accordion,
            gr.Row(),
        ):
            tab_config.intermediate_audio.main_vocals_dereverbed.instantiate()
            tab_config.intermediate_audio.main_vocals_reverb.instantiate()
        with gr.Accordion(
            "Step 2: conversion of main vocals",
            open=False,
        ) as vocal_conversion_accordion:
            tab_config.intermediate_audio.converted_vocals.instantiate()
        with gr.Accordion(
            "Step 3: post-processing of converted vocals",
            open=False,
        ) as vocals_postprocessing_accordion:
            tab_config.intermediate_audio.postprocessed_vocals.instantiate()
        with (
            gr.Accordion(
                "Step 4: pitch shift of background tracks",
                open=False,
            ) as pitch_shift_accordion,
            gr.Row(),
        ):
            tab_config.intermediate_audio.instrumentals_shifted.instantiate()
            tab_config.intermediate_audio.backup_vocals_shifted.instantiate()

    # Toggle the outer accordion plus the seven step accordions
    # (num_components=7) when the checkbox changes.
    tab_config.show_intermediate_audio.instance.change(
        partial(toggle_intermediate_audio, num_components=7),
        inputs=tab_config.show_intermediate_audio.instance,
        outputs=[
            intermediate_audio_accordion,
            song_retrieval_accordion,
            vocals_separation_accordion,
            main_vocals_separation_accordion,
            vocal_cleanup_accordion,
            vocal_conversion_accordion,
            vocals_postprocessing_accordion,
            pitch_shift_accordion,
        ],
        show_progress="hidden",
    )
|
|
|
|
|
|
|
|
# The Gradio application is built once at import time; the `start_app`
# CLI command below only launches it.
app = render_app()

# Typer CLI wrapper exposing the `start_app` command.
app_wrapper = typer.Typer()
|
|
|
|
|
|
|
|
@app_wrapper.command()
def start_app(
    share: Annotated[
        bool,
        typer.Option("--share", "-s", help="Enable sharing"),
    ] = False,
    listen: Annotated[
        bool,
        typer.Option(
            "--listen",
            "-l",
            help="Make the web application reachable from your local network.",
        ),
    ] = False,
    listen_host: Annotated[
        str | None,
        typer.Option(
            "--listen-host",
            "-h",
            help="The hostname that the server will use.",
        ),
    ] = "0.0.0.0",
    listen_port: Annotated[
        int | None,
        typer.Option(
            "--listen-port",
            "-p",
            help="The listening port that the server will use.",
        ),
    ] = None,
    ssr_mode: Annotated[
        bool,
        typer.Option(
            "--ssr-mode",
            help="Enable server-side rendering mode.",
        ),
    ] = False,
) -> None:
    """
    Run the Ultimate RVC web application.

    Parameters
    ----------
    share : bool, default=False
        Whether to create a publicly shareable Gradio link.
    listen : bool, default=False
        Whether to bind the server to ``listen_host`` so the app is
        reachable from the local network; otherwise Gradio's default
        (localhost-only) binding is used.
    listen_host : str, optional
        The hostname to bind when ``listen`` is enabled.
    listen_port : int, optional
        The port that the server will listen on.
    ssr_mode : bool, default=False
        Whether to enable server-side rendering mode.

    """
    # Keep Gradio's temporary files inside the application's own
    # temp directory and allow serving model/audio files statically.
    os.environ["GRADIO_TEMP_DIR"] = str(TEMP_DIR)
    gr.set_static_paths([MODELS_DIR, AUDIO_DIR])

    # Fix: the `share` and `listen` options were previously accepted
    # but never forwarded to `launch`, so `--share` did nothing and the
    # server always bound `listen_host` regardless of `--listen`.
    app.launch(
        share=share,
        server_name=listen_host if listen else None,
        server_port=listen_port,
        ssr_mode=ssr_mode,
    )
|
|
|
|
|
|
|
|
# Apply the persisted default configuration to the component configs at
# import time, before the CLI launches the pre-built app.
# NOTE(review): `TotalConfig` is presumably defined earlier in this
# file (it is not among the visible imports) — confirm.
load_config("default", TotalConfig)
if __name__ == "__main__":
    app_wrapper()
|
|
|