diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -90,1880 +90,1882 @@ type StrPath = str | PathLike[str]
type Json = Mapping[str, Json] | Sequence[Json] | str | int | float | bool | None
class SegmentSize(IntEnum):
    """Enumeration of segment sizes for audio separation."""

    # Power-of-two segment lengths forwarded to the separation backend.
    # Exact units (frames vs. samples) are backend-defined — confirm
    # against the separator that consumes these values.
    SEG_64 = 64
    SEG_128 = 128
    SEG_256 = 256
    SEG_512 = 512
    SEG_1024 = 1024
    SEG_2048 = 2048
    SEG_4096 = 4096
-class BaseTabConfig(BaseModel):
- """
- Base model defining common component configuration settings for
- UI tabs.
- Attributes
- ----------
- embedder_model : DropdownConfig
- Configuration settings for an embedder model dropdown component.
- custom_embedder_model : DropdownConfig
- Configuration settings for a custom embedder model dropdown
- component.
class F0Method(StrEnum):
    """Enumeration of pitch extraction methods."""

    RMVPE = "rmvpe"  # recommended default for most cases (per UI help text)
    CREPE = "crepe"
    CREPE_TINY = "crepe-tiny"  # lighter-weight CREPE variant
    FCPE = "fcpe"
- embedder_model: DropdownConfig = DropdownConfig(
- label="Embedder model",
- info="The model to use for generating speaker embeddings.",
- value=EmbedderModel.CONTENTVEC,
- choices=list(EmbedderModel),
- exclude_value=True,
- )
- custom_embedder_model: DropdownConfig = DropdownConfig(
- label="Custom embedder model",
- info="Select a custom embedder model from the dropdown.",
- value=None,
- visible=False,
- render=False,
- exclude_value=True,
- )
-class GenerationConfig(BaseTabConfig):
- """
- Common component configuration settings for generation tabs.
class RVCContentType(StrEnum):
    """Enumeration of valid content to convert with RVC."""

    VOCALS = "vocals"  # separated vocal stems
    VOICE = "voice"    # standalone voice tracks
    SPEECH = "speech"  # generated/TTS speech
    AUDIO = "audio"    # arbitrary audio input
- See Also
- --------
- BaseTabConfig
- Parent model defining common component configuration settings
- for UI tabs.
- """
class SampleRate(IntEnum):
    """Enumeration of supported audio sample rates."""

    # Output sample rates in hertz (integer values, unlike the
    # string-valued TrainingSampleRate enum).
    HZ_16000 = 16000
    HZ_44100 = 44100
    HZ_48000 = 48000
    HZ_96000 = 96000
    HZ_192000 = 192000
- hop_length: SliderConfig = SliderConfig.hop_length(
- label="Hop length",
- info=(
- "How often the CREPE-based pitch extraction method checks for pitch changes"
- " measured in milliseconds. Lower values lead to longer conversion times"
- " and a higher risk of voice cracks, but better pitch accuracy."
- ),
- visible=True,
- )
- split_voice: CheckboxConfig = CheckboxConfig(
- label="Split input voice",
- info=(
- "Whether to split the input voice track into smaller segments before"
- " converting it. This can improve output quality for longer voice tracks."
- ),
- value=False,
- )
- autotune_voice: CheckboxConfig = CheckboxConfig(
- label="Autotune converted voice",
- info="Whether to apply autotune to the converted voice.
",
- value=False,
- exclude_value=True,
- )
- autotune_strength: SliderConfig = SliderConfig(
- label="Autotune intensity",
- info=(
- "Higher values result in stronger snapping to the chromatic grid and"
- " artifacting."
- ),
- value=1.0,
- minimum=0.0,
- maximum=1.0,
- visible=False,
- )
- sid: NumberConfig = NumberConfig(
- label="Speaker ID",
- info="Speaker ID for multi-speaker-models.",
- value=0,
- precision=0,
- )
- output_sr: DropdownConfig = DropdownConfig(
- label="Output sample rate",
- info="The sample rate of the mixed output track.",
- value=SampleRate.HZ_44100,
- choices=list(SampleRate),
- )
- output_format: DropdownConfig = DropdownConfig(
- label="Output format",
- info="The audio format of the mixed output track.",
- value=AudioExt.MP3,
- choices=list(AudioExt),
- )
- output_name: TextboxConfig = TextboxConfig(
- label="Output name",
- info="If no name is provided, a suitable name will be generated automatically.",
- value=None,
- placeholder="Ultimate RVC output",
- exclude_value=True,
- )
class AudioExt(StrEnum):
    """Enumeration of supported audio file formats."""

    # Values are bare file extensions (no leading dot).
    MP3 = "mp3"
    WAV = "wav"
    FLAC = "flac"
    OGG = "ogg"
-class SongGenerationConfig(GenerationConfig):
- """
- Common component configuration settings for song generation tabs.
- Attributes
- ----------
- source_type : DropdownConfig
- Configuration settings for a source type dropdown component.
- source : TextboxConfig
- Configuration settings for an input source textbox component.
- cached_song : DropdownConfig
- Configuration settings for a cached song dropdown component.
- clean_strength : SliderConfig
- Configuration settings for a clean strength slider component.
- clean_voice : CheckboxConfig
- Configuration settings for a clean voice checkbox component.
- room_size : SliderConfig
- Configuration settings for a room size slider component.
- wet_level : SliderConfig
- Configuration settings for a wetness level slider component.
- dry_level : SliderConfig
- Configuration settings for a dryness level slider component.
- damping : SliderConfig
- Configuration settings for a damping level slider component.
- main_gain : SliderConfig
- Configuration settings for a main gain slider component.
- inst_gain : SliderConfig
- Configuration settings for an instrumentals gain slider
- component.
- backup_gain : SliderConfig
- Configuration settings for a backup vocals gain slider
- component.
class DeviceType(StrEnum):
    """Enumeration of device types for training voice models."""

    # Title-cased values; presumably shown verbatim in the UI — confirm
    # against the dropdown components that consume this enum.
    AUTOMATIC = "Automatic"
    CPU = "CPU"
    GPU = "GPU"
- """
- source_type: DropdownConfig = DropdownConfig(
- label="Source type",
- info="The type of source to retrieve a song from.",
- value=SongSourceType.LOCAL_FILE,
- choices=list(SongSourceType),
- type="index",
- exclude_value=True,
- )
- source: TextboxConfig = TextboxConfig(
- label="Source",
- info="Link to a song on YouTube or the full path of a local audio file.",
- value=None,
- exclude_value=True,
- )
- cached_song: DropdownConfig = DropdownConfig(
- label="Source",
- info="Select a song from the list of cached songs.",
- value=None,
- visible=False,
- render=False,
- exclude_value=True,
- )
- clean_voice: CheckboxConfig = CheckboxConfig(
- label="Clean converted voice",
- info=(
- "Whether to clean the converted voice using noise reduction"
- " algorithms.
"
- ),
- value=False,
- exclude_value=True,
- )
- clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False)
- room_size: SliderConfig = SliderConfig(
- label="Room size",
- info=(
- "Size of the room which reverb effect simulates. Increase for longer reverb"
- " time."
- ),
- value=0.15,
- minimum=0.0,
- maximum=1.0,
- )
- wet_level: SliderConfig = SliderConfig(
- label="Wetness level",
- info="Loudness of converted vocals with reverb effect applied.",
- value=0.2,
- minimum=0.0,
- maximum=1.0,
- )
- dry_level: SliderConfig = SliderConfig(
- label="Dryness level",
- info="Loudness of converted vocals without reverb effect applied.",
- value=0.8,
- minimum=0.0,
- maximum=1.0,
- )
- damping: SliderConfig = SliderConfig(
- label="Damping level",
- info="Absorption of high frequencies in reverb effect.",
- value=0.7,
- minimum=0.0,
- maximum=1.0,
- )
- main_gain: SliderConfig = SliderConfig.gain(
- label="Main gain",
- info="The gain to apply to the main vocals.",
- )
- inst_gain: SliderConfig = SliderConfig.gain(
- label="Instrumentals gain",
- info="The gain to apply to the instrumentals.",
- )
- backup_gain: SliderConfig = SliderConfig.gain(
- label="Backup gain",
- info="The gain to apply to the backup vocals.",
- )
class TrainingSampleRate(StrEnum):
    """Enumeration of sample rates for training voice models."""

    # NOTE: values are *strings* of the rate in Hz (unlike the integer
    # SampleRate enum); callers needing a number must convert.
    HZ_32K = "32000"
    HZ_40K = "40000"
    HZ_48K = "48000"
-class SpeechGenerationConfig(GenerationConfig):
- """
- Common component configuration settings for speech generation tabs.
- Attributes
- ----------
- source_type : DropdownConfig
- Configuration settings for a source type dropdown component.
- source : TextboxConfig
- Configuration settings for an input source textbox component.
- edge_tts_voice : DropdownConfig
- Configuration settings for an Edge TTS voice dropdown
- component.
- n_octaves : SliderConfig
- Configuration settings for an octave pitch shift slider
- component.
- n_semitones : SliderConfig
- Configuration settings for a semitone pitch shift slider
- component.
- tts_pitch_shift : SliderConfig
- Configuration settings for a TTS pitch shift slider
- component.
- tts_speed_change : SliderConfig
- Configuration settings for a TTS speed change slider
- component.
- tts_volume_change : SliderConfig
- Configuration settings for a TTS volume change slider
- component.
- clean_voice : CheckboxConfig
- Configuration settings for a clean voice checkbox
- component.
- clean_strength : SliderConfig
- Configuration settings for a clean strength slider
- component.
- output_gain : GainSliderConfig
- Configuration settings for an output gain slider component.
class PretrainedSampleRate(StrEnum):
    """Enumeration of valid sample rates for pretrained models."""

    # Shorthand "Nk" labels; note this set includes 44k, which the
    # TrainingSampleRate enum does not offer.
    HZ_32K = "32k"
    HZ_40K = "40k"
    HZ_44K = "44k"
    HZ_48K = "48k"
- """
- source_type: DropdownConfig = DropdownConfig(
- label="Source type",
- info="The type of source to generate speech from.",
- value=SpeechSourceType.TEXT,
- choices=list(SpeechSourceType),
- type="index",
- exclude_value=True,
- )
- source: TextboxConfig = TextboxConfig(
- label="Source",
- info="Text to generate speech from",
- value=None,
- exclude_value=True,
- )
- edge_tts_voice: DropdownConfig = DropdownConfig(
- label="Edge TTS voice",
- info="Select a voice to use for text to speech conversion.",
- value=None,
- render=False,
- exclude_value=True,
- )
- n_octaves: SliderConfig = SliderConfig.octave_shift(
- label="Octave shift",
- info=(
- "The number of octaves to pitch-shift the converted speech by. Use 1 for"
- " male-to-female and -1 for vice-versa."
- ),
- )
- n_semitones: SliderConfig = SliderConfig.semitone_shift(
- label="Semitone shift",
- info="The number of semi-tones to pitch-shift the converted speech by.",
- )
- tts_pitch_shift: SliderConfig = SliderConfig(
- label="Edge TTS pitch shift",
- info=(
- "The number of hertz to shift the pitch of the speech generated by Edge"
- " TTS."
- ),
- value=0,
- minimum=-100,
- maximum=100,
- step=1,
- )
- tts_speed_change: SliderConfig = SliderConfig(
- label="TTS speed change",
- info="The percentual change to the speed of the speech generated by Edge TTS.",
- value=0,
- minimum=-50,
- maximum=100,
- step=1,
- )
- tts_volume_change: SliderConfig = SliderConfig(
- label="TTS volume change",
- info="The percentual change to the volume of the speech generated by Edge TTS.",
- value=0,
- minimum=-100,
- maximum=100,
- step=1,
- )
- clean_voice: CheckboxConfig = CheckboxConfig(
- label="Clean converted voice",
- info=(
- "Whether to clean the converted voice using noise reduction"
- " algorithms.
"
- ),
- value=True,
- exclude_value=True,
- )
- clean_strength: SliderConfig = SliderConfig.clean_strength(visible=True)
- output_gain: SliderConfig = SliderConfig.gain(
- label="Output gain",
- info="The gain to apply to the converted speech.
",
- )
class TrainingF0Method(StrEnum):
    """Enumeration of pitch extraction methods for training."""

    # Subset of F0Method: FCPE is not offered for training.
    RMVPE = "rmvpe"
    CREPE = "crepe"
    CREPE_TINY = "crepe-tiny"
-class TrainingConfig(BaseTabConfig):
- """
- Common component configuration settings for training tabs.
- Attributes
- ----------
- dataset_type : DropdownConfig
- Configuration settings for a dataset type dropdown component.
- dataset : DropdownConfig
- Configuration settings for a dataset dropdown component.
- dataset_name : TextboxConfig
- Configuration settings for a dataset name textbox component.
- preprocess_model : DropdownConfig
- Configuration settings for a model name dropdown component
- for audio preprocessing.
- sample_rate : DropdownConfig
- Configuration settings for a sample rate dropdown component.
- filter_audio : CheckboxConfig
- Configuration settings for a filter audio checkbox component.
- clean_audio : CheckboxConfig
- Configuration settings for a clean audio checkbox component.
- clean_strength : SliderConfig
- Configuration settings for a clean strength slider component.
- split_method : DropdownConfig
- Configuration settings for an audio splitting method dropdown
- component.
- chunk_len : SliderConfig
- Configuration settings for a chunk length slider component.
- overlap_len : SliderConfig
- Configuration settings for an overlap length slider component.
- preprocess_cores : SliderConfig
- Configuration settings for a CPU cores slider component for
- preprocessing.
- extract_model : DropdownConfig
- Configuration settings for a model name dropdown component for
- feature extraction.
- f0_method : DropdownConfig
- Configuration settings for an F0 method dropdown component.
- hop_length : SliderConfig
- Configuration settings for a hop length slider component.
- include_mutes : SliderConfig
- Configuration settings for an include mutes slider component.
- extract_cores : SliderConfig
- Configuration settings for a CPU cores slider component for
- feature extraction.
- extraction_acceleration : HardwareAccelerationConfig
- Configuration settings for a hardware acceleration component for
- feature extraction.
- extraction_gpus : DropdownConfig
- Configuration settings for a GPU dropdown compoennt for feature
- extraction.
- train_model : DropdownConfig
- Configuration settings for a model name dropdown component for
- training.
- num_epochs : SliderConfig
- Configuration settings for a number of epochs slider component.
- batch_size : SliderConfig
- Configuration settings for a batch size slider component.
- detect_overtraining : CheckboxConfig
- Configuration settings for a detect overtraining checkbox
- component.
- overtraining_threshold : SliderConfig
- Configuration settings for an overtraining threshold slider
- component.
- vocoder : DropdownConfig
- Configuration settings for a vocoder dropdown component.
- index_algorithm : DropdownConfig
- Configuration settings for an index algorithm dropdown
- component.
- pretrained_type : DropdownConfig
- Configuration settings for a pretrained model type dropdown
- component.
- custom_pretrained_model : DropdownConfig
- Configuration settings for a custom pretrained model dropdown
- component.
- save_interval : SliderConfig
- Configuration settings for a save-interval slider component.
- save_all_checkpoints : CheckboxConfig
- Configuration settings for a save-all-checkpoints checkbox
- component.
- save_all_weights : CheckboxConfig
- Configuration settings for a save-all-weights checkbox
- component.
- clear_saved_data : CheckboxConfig
- Configuration settings for a clear-saved-data checkbox
- component.
- upload_model : CheckboxConfig
- Configuration settings for an upload voice model checkbox
- component.
- upload_name : TextboxConfig
- Configuration settings for an upload name textbox component.
- training_acceleration : HardwareAccelerationConfig
- Configuration settings for a hardware acceleration component for
- training.
- training_gpus : DropdownConfig
- Configuration settings for a GPU dropdown component for
- training.
- preload_dataset : CheckboxConfig
- Configuration settings for a preload dataset checkbox component.
- reduce_memory_usage : CheckboxConfig
- Configuration settings for a reduce-memory-usage checkbox
- component.
-
- See Also
- --------
- BaseTabConfig
- Parent model defining common component configuration settings
- for UI tabs.
-
- """
-
- dataset_type: DropdownConfig = DropdownConfig(
- label="Dataset type",
- info="Select the type of dataset to preprocess.",
- value=DatasetType.NEW_DATASET,
- choices=list(DatasetType),
- exclude_value=True,
- )
- dataset: DropdownConfig = DropdownConfig(
- label="Dataset path",
- info=(
- "The path to an existing dataset. Either select a path to a previously"
- " created dataset or provide a path to an external dataset."
- ),
- value=None,
- allow_custom_value=True,
- visible=False,
- render=False,
- exclude_value=True,
- )
- dataset_name: TextboxConfig = TextboxConfig(
- label="Dataset name",
- info=(
- "The name of the new dataset. If the dataset already exists, the provided"
- " audio files will be added to it."
- ),
- value="My dataset",
- exclude_value=True,
- )
- preprocess_model: DropdownConfig = DropdownConfig(
- label="Model name",
- info=(
- "Name of the model to preprocess the given dataset for. Either select an"
- " existing model from the dropdown or provide the name of a new model."
- ),
- value="My model",
- allow_custom_value=True,
- render=False,
- exclude_value=True,
- )
- sample_rate: DropdownConfig = DropdownConfig(
- label="Sample rate",
- info="Target sample rate for the audio files in the provided dataset.",
- value=TrainingSampleRate.HZ_40K,
- choices=list(TrainingSampleRate),
- )
- filter_audio: CheckboxConfig = CheckboxConfig(
- label="Filter audio",
- info=(
- "Whether to remove low-frequency sounds from the audio files in the"
- " provided dataset by applying a high-pass butterworth filter.
"
- ),
- value=True,
- )
- clean_audio: CheckboxConfig = CheckboxConfig(
- label="Clean audio",
- info=(
- "Whether to clean the audio files in the provided dataset using noise"
- " reduction algorithms.
"
- ),
- value=False,
- exclude_value=True,
- )
- clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False)
- split_method: DropdownConfig = DropdownConfig(
- label="Audio splitting method",
- info=(
- "The method to use for splitting the audio files in the provided dataset."
- " Use the `Skip` method to skip splitting if the audio files are already"
- " split. Use the `Simple` method if excessive silence has already been"
- " removed from the audio files. Use the `Automatic` method for automatic"
- " silence detection and splitting around it."
- ),
- value=AudioSplitMethod.AUTOMATIC,
- choices=list(AudioSplitMethod),
- exclude_value=True,
- )
- chunk_len: SliderConfig = SliderConfig(
- label="Chunk length",
- info="Length of split audio chunks.",
- value=3.0,
- minimum=0.5,
- maximum=5.0,
- step=0.1,
- visible=False,
- )
- overlap_len: SliderConfig = SliderConfig(
- label="Overlap length",
- info="Length of overlap between split audio chunks.",
- value=0.3,
- minimum=0.0,
- maximum=0.4,
- step=0.1,
- visible=False,
- )
- preprocess_cores: SliderConfig = SliderConfig.cpu_cores()
-
- extract_model: DropdownConfig = DropdownConfig(
- label="Model name",
- info=(
- "Name of the model with an associated preprocessed dataset to extract"
- " training features from. When a new dataset is preprocessed, its"
- " associated model is selected by default."
- ),
- value=None,
- render=False,
- exclude_value=True,
- )
- f0_method: DropdownConfig = DropdownConfig(
- label="F0 method",
- info="The method to use for extracting pitch features.",
- value=TrainingF0Method.RMVPE,
- choices=list(TrainingF0Method),
- exclude_value=True,
- )
-
- hop_length: SliderConfig = SliderConfig.hop_length(
- label="Hop length",
- info="The hop length to use for extracting pitch features.
",
- visible=False,
- )
- include_mutes: SliderConfig = SliderConfig(
- label="Include mutes",
- info=(
- "The number of mute audio files to include in the generated training file"
- " list. Adding silent files enables the training model to handle pure"
- " silence in inferred audio files. If the preprocessed audio dataset"
- " already contains segments of pure silence, set this to 0."
- ),
- value=0,
- minimum=0,
- maximum=10,
- step=1,
- )
- extraction_cores: SliderConfig = SliderConfig.cpu_cores()
- extraction_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration()
- extraction_gpus: DropdownConfig = DropdownConfig.gpu()
-
- train_model: DropdownConfig = DropdownConfig(
- label="Model name",
- info=(
- "Name of the model to train. When training features are extracted for a new"
- " model, its name is selected by default."
- ),
- value=None,
- render=False,
- exclude_value=True,
- )
- num_epochs: SliderConfig = SliderConfig(
- label="Number of epochs",
- info=(
- "The number of epochs to train the voice model. A higher number can improve"
- " voice model performance but may lead to overtraining."
- ),
- value=500,
- minimum=1,
- maximum=5000,
- step=1,
- )
- batch_size: SliderConfig = SliderConfig(
- label="Batch size",
- info=(
- "The number of samples in each training batch. It is advisable to align"
- " this value with the available VRAM of your GPU."
- ),
- value=16,
- minimum=1,
- maximum=128,
- step=1,
- )
- detect_overtraining: CheckboxConfig = CheckboxConfig(
- label="Detect overtraining",
- info=(
- "Whether to detect overtraining to prevent the voice model from learning"
- " the training data too well and losing the ability to generalize to new"
- " data."
- ),
- value=True,
- exclude_value=True,
- )
- overtraining_threshold: SliderConfig = SliderConfig(
- label="Overtraining threshold",
- info=(
- "The maximum number of epochs to continue training without any observed"
- " improvement in voice model performance."
- ),
- value=500,
- minimum=1,
- maximum=1000,
- visible=False,
- )
- vocoder: DropdownConfig = DropdownConfig(
- label="Vocoder",
- info=(
- "The vocoder to use for audio synthesis during training. HiFi-GAN provides"
- " basic audio fidelity, while RefineGAN provides the highest audio"
- " fidelity."
- ),
- value=Vocoder.HIFI_GAN,
- choices=list(Vocoder),
- )
- index_algorithm: DropdownConfig = DropdownConfig(
- label="Index algorithm",
- info=(
- "The method to use for generating an index file for the trained voice"
- " model. `KMeans` is particularly useful for large datasets."
- ),
- value=IndexAlgorithm.AUTO,
- choices=list(IndexAlgorithm),
- )
- pretrained_type: DropdownConfig = DropdownConfig(
- label="Pretrained model type",
- info=(
- "The type of pretrained model to finetune the voice model on. `None` will"
- " train the voice model from scratch, while `Default` will use a pretrained"
- " model tailored to the specific voice model architecture. `Custom` will"
- " use a custom pretrained that you provide."
- ),
- value=PretrainedType.DEFAULT,
- choices=list(PretrainedType),
- exclude_value=True,
- )
- custom_pretrained_model: DropdownConfig = DropdownConfig(
- label="Custom pretrained model",
- info="Select a custom pretrained model to finetune from the dropdown.",
- value=None,
- visible=False,
- render=False,
- exclude_value=True,
- )
- save_interval: SliderConfig = SliderConfig(
- label="Save interval",
- info=(
- "The epoch interval at which to to save voice model weights and"
- " checkpoints. The best model weights are always saved regardless of this"
- " setting."
- ),
- value=10,
- minimum=1,
- maximum=100,
- step=1,
- )
- save_all_checkpoints: CheckboxConfig = CheckboxConfig(
- label="Save all checkpoints",
- info=(
- "Whether to save a unique checkpoint at each save interval. If not enabled,"
- " only the latest checkpoint will be saved at each interval."
- ),
- value=True,
- )
- save_all_weights: CheckboxConfig = CheckboxConfig(
- label="Save all weights",
- info=(
- "Whether to save unique voice model weights at each save interval. If not"
- " enabled, only the best voice model weights will be saved."
- ),
- value=True,
- )
- clear_saved_data: CheckboxConfig = CheckboxConfig(
- label="Clear saved data",
- info=(
- "Whether to delete any existing training data associated with the voice"
- " model before training commences. Enable this setting only if you are"
- " training a new voice model from scratch or restarting training."
- ),
- value=False,
- )
- upload_model: CheckboxConfig = CheckboxConfig(
- label="Upload voice model",
- info=(
- "Whether to automatically upload the trained voice model so that it can be"
- " used for generation tasks within the Ultimate RVC app."
- ),
- value=False,
- exclude_value=True,
- )
- upload_name: TextboxConfig = TextboxConfig(
- label="Upload name",
- info="The name to give the uploaded voice model.",
- value=None,
- visible=False,
- exclude_value=True,
- )
- training_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration()
- training_gpus: DropdownConfig = DropdownConfig.gpu()
- preload_dataset: CheckboxConfig = CheckboxConfig(
- label="Preload dataset",
- info=(
- "Whether to preload all training data into GPU memory. This can improve"
- " training speed but requires a lot of VRAM.
"
- ),
- value=True,
- )
- reduce_memory_usage: CheckboxConfig = CheckboxConfig(
- label="Reduce memory usage",
- info=(
- "Whether to reduce VRAM usage at the cost of slower training speed by"
- " enabling activation checkpointing. This is useful for GPUs with limited"
- " memory (e.g., <6GB VRAM) or when training with a batch size larger than"
- " what your GPU can normally accommodate."
- ),
- value=False,
- )
class AudioSplitMethod(StrEnum):
    """
    Enumeration of methods to use for splitting audio files during
    dataset preprocessing.
    """

    SKIP = "Skip"            # no splitting (files are already split)
    SIMPLE = "Simple"        # plain splitting; assumes excess silence removed
    AUTOMATIC = "Automatic"  # silence detection and splitting around it
-class SegmentSize(IntEnum):
- """Enumeration of segment sizes for audio separation."""
class Vocoder(StrEnum):
    """Enumeration of vocoders for training voice models."""

    HIFI_GAN = "HiFi-GAN"          # basic audio fidelity
    MRF_HIFI_GAN = "MRF HiFi-GAN"
    REFINE_GAN = "RefineGAN"       # highest audio fidelity
-class F0Method(StrEnum):
- """Enumeration of pitch extraction methods."""
class IndexAlgorithm(StrEnum):
    """Enumeration of indexing algorithms for training voice models."""

    AUTO = "Auto"
    FAISS = "Faiss"
    KMEANS = "KMeans"  # particularly useful for large datasets
class PretrainedType(StrEnum):
    """
    Enumeration of the possible types of pretrained models to finetune
    voice models on.
    """

    NONE = "None"        # train from scratch
    DEFAULT = "Default"  # pretrained model matching the architecture
    CUSTOM = "Custom"    # user-provided pretrained model
- VOCALS = "vocals"
- VOICE = "voice"
- SPEECH = "speech"
- AUDIO = "audio"
-class SampleRate(IntEnum):
- """Enumeration of supported audio sample rates."""
- HZ_16000 = 16000
- HZ_44100 = 44100
- HZ_48000 = 48000
- HZ_96000 = 96000
- HZ_192000 = 192000
+class ConcurrencyId(StrEnum):
+ """Enumeration of possible concurrency identifiers."""
+ GPU = auto()
-class AudioExt(StrEnum):
- """Enumeration of supported audio file formats."""
- MP3 = "mp3"
- WAV = "wav"
- FLAC = "flac"
- OGG = "ogg"
class SongSourceType(StrEnum):
    """The type of source providing the song to generate a cover of."""

    LOCAL_FILE = "Local file"
    CACHED_SONG = "Cached song"  # previously processed song in the cache
+
+
class SpeechSourceType(StrEnum):
    """The type of source providing the text to generate speech from."""

    TEXT = "Text"              # raw text input
    LOCAL_FILE = "Local file"  # text read from a local file
+
+
class SongTransferOption(StrEnum):
    """Enumeration of possible song transfer options."""

    # Values name the pipeline step (and track) a result is sent to;
    # presumably rendered verbatim in the UI — confirm with callers.
    STEP_1_AUDIO = "Step 1: stem splitting"
    STEP_2_VOCALS = "Step 2: vocal conversion"
    STEP_3_VOCALS = "Step 3: vocal effect"
    STEP_4_INSTRUMENTALS = "Step 4: instrumentals"
    STEP_4_BACKUP_VOCALS = "Step 4: backup vocals"
    STEP_5_MAIN_VOCALS = "Step 5: main vocals"
    STEP_5_INSTRUMENTALS = "Step 5: instrumentals"
    STEP_5_BACKUP_VOCALS = "Step 5: backup vocals"
+
+
class SpeechTransferOption(StrEnum):
    """Enumeration of possible speech transfer options."""

    # NOTE(review): member names say SPEECH but values say "vocal ..." —
    # they mirror SongTransferOption's step labels; confirm intended.
    STEP_2_SPEECH = "Step 2: vocal conversion"
    STEP_3_SPEECH = "Step 3: vocal effect"
+
+
class ComponentVisibilityKwArgs(TypedDict, total=False):
    """
    Keyword arguments for setting component visibility.

    ``total=False`` makes every key optional, so callers may supply any
    subset of these keyword arguments.

    Attributes
    ----------
    visible : bool
        Whether the component should be visible.
    value : Any
        The value of the component.

    """

    visible: bool
    value: Any
+
+
class UpdateDropdownKwArgs(TypedDict, total=False):
    """
    Keyword arguments for updating a dropdown component.

    ``total=False`` makes every key optional, so callers may supply any
    subset of these keyword arguments.

    Attributes
    ----------
    choices : DropdownChoices
        The updated choices for the dropdown component.
    value : DropdownValue
        The updated value for the dropdown component.

    """

    choices: DropdownChoices
    value: DropdownValue
+
+
class TextBoxKwArgs(TypedDict, total=False):
    """
    Keyword arguments for updating a textbox component.

    ``total=False`` makes every key optional, so callers may supply any
    subset of these keyword arguments.

    Attributes
    ----------
    value : str | None
        The updated value for the textbox component.
    placeholder : str | None
        The updated placeholder for the textbox component.

    """

    value: str | None
    placeholder: str | None
+
+
class UpdateAudioKwArgs(TypedDict, total=False):
    """
    Keyword arguments for updating an audio component.

    ``total=False`` makes the key optional.

    Attributes
    ----------
    value : str | None
        The updated value for the audio component.

    """

    value: str | None
+
+
class DatasetType(StrEnum):
    """The type of dataset to train a voice model."""

    NEW_DATASET = "New dataset"            # create a dataset from scratch
    EXISTING_DATASET = "Existing dataset"  # reuse a previously created dataset
+
+
# (relative directory, required file names) pairs for bundled embedder
# models. NOTE(review): presumably describes the expected on-disk layout
# under the models directory; the consumer is not visible in this chunk —
# confirm before relying on the exact structure.
embedders_list = [
    ("embedders/contentvec/", ["pytorch_model.bin", "config.json"]),
    ("embedders/custom/Crusty/", ["model.safetensors", "config.json"]),
]
+
+
class EmbedderModel(StrEnum):
    """Enumeration of audio embedding models."""

    CONTENTVEC = "contentvec"
    # Capitalized value matches the "embedders/custom/Crusty/" path in
    # embedders_list above; the casing appears deliberate.
    CRUSTY = "Crusty"
    CUSTOM = "custom"
+
+
+
+
+
class SeparationModel(StrEnum):
    """Enumeration of audio separation models."""

    # Member values are the on-disk file names of the separation models
    # (.onnx / .ckpt / .pth files).
    UVR_MDX_NET_VOC_FT = "UVR-MDX-NET-Voc_FT.onnx"
    UVR_MDX_NET_KARA_2 = "UVR_MDXNET_KARA_2.onnx"
    REVERB_HQ_BY_FOXJOY = "Reverb_HQ_By_FoxJoy.onnx"
    UVR_MDX_NET_Inst_HQ_3 = "UVR-MDX-NET-Inst_HQ_3.onnx"
    # NOTE(review): same value as UVR_MDX_NET_VOC_FT above, so this member
    # is an enum alias of it and is skipped by iteration / list(...).
    # Consider removing one of the two names.
    UVR_MDX_NET_Voc_FT = "UVR-MDX-NET-Voc_FT.onnx"
    Kim_Vocal_1 = "Kim_Vocal_1.onnx"
    Kim_Vocal_2 = "Kim_Vocal_2.onnx"
    Kim_Inst = "Kim_Inst.onnx"
    UVR_MDX_NET_Inst_HQ_4 = "UVR-MDX-NET-Inst_HQ_4.onnx"
    # NOTE(review): member naming mixes UPPER_SNAKE_CASE and lower-case
    # styles; PEP 8 prefers UPPER_SNAKE_CASE for enum constants.
    kuielab_a_vocals = "kuielab_a_vocals.onnx"
    kuielab_b_vocals = "kuielab_b_vocals.onnx"
    kuielab_a_drums = "kuielab_a_drums.onnx"
    kuielab_b_drums = "kuielab_b_drums.onnx"
    kuielab_a_bass = "kuielab_a_bass.onnx"
    kuielab_b_bass = "kuielab_b_bass.onnx"
    kuielab_a_other = "kuielab_a_other.onnx"
    kuielab_b_other = "kuielab_b_other.onnx"
    MDX23C_8KFFT_InstVoc_HQ_2 = "MDX23C-8KFFT-InstVoc_HQ_2.ckpt"
    UVR_DeNoise = "UVR-DeNoise.pth"
    UVR_DeEcho_DeReverb = "UVR-DeEcho-DeReverb.pth"
class SeparationModel2(StrEnum):
    """Enumeration of audio separation models."""

    # NOTE(review): this enum is a verbatim duplicate of SeparationModel
    # (identical members and values); consider consolidating the two —
    # MultiStepSongGenerationConfig mixes both for value vs. choices.
    UVR_MDX_NET_VOC_FT = "UVR-MDX-NET-Voc_FT.onnx"
    UVR_MDX_NET_KARA_2 = "UVR_MDXNET_KARA_2.onnx"
    REVERB_HQ_BY_FOXJOY = "Reverb_HQ_By_FoxJoy.onnx"
    UVR_MDX_NET_Inst_HQ_3 = "UVR-MDX-NET-Inst_HQ_3.onnx"
    # NOTE(review): same value as UVR_MDX_NET_VOC_FT above, so this member
    # is an enum alias of it and is skipped by iteration / list(...).
    UVR_MDX_NET_Voc_FT = "UVR-MDX-NET-Voc_FT.onnx"
    Kim_Vocal_1 = "Kim_Vocal_1.onnx"
    Kim_Vocal_2 = "Kim_Vocal_2.onnx"
    Kim_Inst = "Kim_Inst.onnx"
    UVR_MDX_NET_Inst_HQ_4 = "UVR-MDX-NET-Inst_HQ_4.onnx"
    kuielab_a_vocals = "kuielab_a_vocals.onnx"
    kuielab_b_vocals = "kuielab_b_vocals.onnx"
    kuielab_a_drums = "kuielab_a_drums.onnx"
    kuielab_b_drums = "kuielab_b_drums.onnx"
    kuielab_a_bass = "kuielab_a_bass.onnx"
    kuielab_b_bass = "kuielab_b_bass.onnx"
    kuielab_a_other = "kuielab_a_other.onnx"
    kuielab_b_other = "kuielab_b_other.onnx"
    MDX23C_8KFFT_InstVoc_HQ_2 = "MDX23C-8KFFT-InstVoc_HQ_2.ckpt"
    UVR_DeNoise = "UVR-DeNoise.pth"
    UVR_DeEcho_DeReverb = "UVR-DeEcho-DeReverb.pth"
+
+
+
+
+
# Working directory of the process; appended to sys.path so sibling
# modules resolve when the app is launched from this directory.
now_dir = os.getcwd()

sys.path.append(now_dir)
models_dir = "models"

# Local directory the model snapshot is downloaded into.
dump_path = os.path.join(now_dir, models_dir)

# Hugging Face repository holding the model files.
repo_id = "lainlives/voice"

# NOTE(review): snapshot_download runs unconditionally at import time and
# performs network I/O with no error handling; HF_TOKEN may be None, in
# which case the download is presumably anonymous — confirm the repo is
# public or the token is always set in deployment.
hf_token = os.environ.get("HF_TOKEN")
snapshot_download(repo_id=repo_id, local_dir=dump_path, token=hf_token)

# NOTE(review): dead commented-out entry point; delete or restore.
#if __name__ == "__main__":
# start_app(share=False, ssr_mode = True)
-class DeviceType(StrEnum):
- """Enumeration of device types for training voice models."""
- AUTOMATIC = "Automatic"
- CPU = "CPU"
- GPU = "GPU"
-class TrainingSampleRate(StrEnum):
- """Enumeration of sample rates for training voice models."""
- HZ_32K = "32000"
- HZ_40K = "40000"
- HZ_48K = "48000"
# Name of the UI configuration to load; hard-coded to "default" for now
# (the commented-out expression shows the intended URVC_CONFIG env-var
# override).
config_name = "default" #os.environ.get("URVC_CONFIG")
# Optional cookie file path — presumably for YouTube downloads given the
# YT_ prefix; None when the env var is unset. TODO confirm consumer.
cookiefile = os.environ.get("YT_COOKIEFILE")
-class PretrainedSampleRate(StrEnum):
- """Enumeration of valid sample rates for pretrained models."""
- HZ_32K = "32k"
- HZ_40K = "40k"
- HZ_44K = "44k"
- HZ_48K = "48k"
+"""
+Module defining models for representing configuration settings for
+UI tabs.
+"""
-class TrainingF0Method(StrEnum):
- """Enumeration of pitch extraction methods for training."""
- RMVPE = "rmvpe"
- CREPE = "crepe"
- CREPE_TINY = "crepe-tiny"
-class AudioSplitMethod(StrEnum):
+class SongIntermediateAudioConfig(BaseModel):
"""
- Enumeration of methods to use for splitting audio files during
- dataset preprocessing.
+ Configuration settings for intermediate audio components in the
+ one-click song generation tab.
+
+ Attributes
+ ----------
+ song : AudioConfig
+ Configuration settings for the input song audio component.
+ vocals : AudioConfig
+ Configuration settings for the vocals audio component.
+ instrumentals : AudioConfig
+ Configuration settings for the instrumentals audio component.
+ main_vocals : AudioConfig
+ Configuration settings for the main vocals audio component.
+ backup_vocals : AudioConfig
+ Configuration settings for the backup vocals audio component.
+ main_vocals_dereverbed : AudioConfig
+ Configuration settings for the main vocals de-reverbed audio
+ component.
+ main_vocals_reverb : AudioConfig
+ Configuration settings for the main vocals reverb audio
+ component.
+ converted_vocals : AudioConfig
+ Configuration settings for the converted vocals audio
+ component.
+ postprocessed_vocals : AudioConfig
+ Configuration settings for the postprocessed vocals audio
+ component.
+ instrumentals_shifted : AudioConfig
+ Configuration settings for the shifted instrumentals audio
+ component.
+ backup_vocals_shifted : AudioConfig
+ Configuration settings for the shifted backup vocals audio
+ component.
+ all : list[gr.Audio]
+ List of instances of all intermediate audio components.
+
"""
- SKIP = "Skip"
- SIMPLE = "Simple"
- AUTOMATIC = "Automatic"
+ song: AudioConfig = AudioConfig.intermediate(label="Song")
+ vocals: AudioConfig = AudioConfig.intermediate(label="Vocals")
+ instrumentals: AudioConfig = AudioConfig.intermediate(
+ label="Instrumentals",
+ )
+ main_vocals: AudioConfig = AudioConfig.intermediate(
+ label="Main vocals",
+ )
+ backup_vocals: AudioConfig = AudioConfig.intermediate(
+ label="Backup vocals",
+ )
+ main_vocals_dereverbed: AudioConfig = AudioConfig.intermediate(
+ label="De-reverbed main vocals",
+ )
+ main_vocals_reverb: AudioConfig = AudioConfig.intermediate(
+ label="Main vocals with reverb",
+ )
+ converted_vocals: AudioConfig = AudioConfig.intermediate(
+ label="Converted vocals",
+ )
+ postprocessed_vocals: AudioConfig = AudioConfig.intermediate(
+ label="Postprocessed vocals",
+ )
+ instrumentals_shifted: AudioConfig = AudioConfig.intermediate(
+ label="Pitch-shifted instrumentals",
+ )
+ backup_vocals_shifted: AudioConfig = AudioConfig.intermediate(
+ label="Pitch-shifted backup vocals",
+ )
+ @property
+ def all(self) -> list[gr.Audio]:
+ """
+ Retrieve instances of all intermediate audio components
+ in the one-click song generation tab.
-class Vocoder(StrEnum):
- """Enumeration of vocoders for training voice models."""
+ Returns
+ -------
+ list[gr.Audio]
+ List of instances of all intermediate audio components in
+ the one-click song generation tab.
- HIFI_GAN = "HiFi-GAN"
- MRF_HIFI_GAN = "MRF HiFi-GAN"
- REFINE_GAN = "RefineGAN"
+ """
+ # NOTE we are using self.__annotations__ to get the fields in
+ # the order they are defined in the class
+ return [getattr(self, field).instance for field in self.__annotations__]
-class IndexAlgorithm(StrEnum):
- """Enumeration of indexing algorithms for training voice models."""
+class OneClickSongGenerationConfig(SongGenerationConfig):
+ """
+ Configuration settings for the one-click song generation tab.
- AUTO = "Auto"
- FAISS = "Faiss"
- KMEANS = "KMeans"
+ Attributes
+ ----------
+ n_octaves : SliderConfig
+ Configuration settings for an octave pitch shift slider
+ component.
+ n_semitones : SliderConfig
+ Configuration settings for a semitone pitch shift slider
+ component.
+ show_intermediate_audio : CheckboxConfig
+ Configuration settings for a show intermediate audio checkbox
+ component.
+ intermediate_audio : SongIntermediateAudioConfig
+ Configuration settings for intermediate audio components.
+ See Also
+ --------
+ SongGenerationConfig
+ Parent model defining common component configuration settings
+ for song generation tabs.
-class PretrainedType(StrEnum):
- """
- Enumeration of the possible types of pretrained models to finetune
- voice models on.
"""
- NONE = "None"
- DEFAULT = "Default"
- CUSTOM = "Custom"
+ n_octaves: SliderConfig = SliderConfig.octave_shift(
+ label="Vocal pitch shift",
+ info=(
+ "The number of octaves to shift the pitch of the converted vocals by. Use 1"
+ " for male-to-female and -1 for vice-versa."
+ ),
+ )
+ n_semitones: SliderConfig = SliderConfig.semitone_shift(
+ label="Overall pitch shift",
+ info=(
+ "The number of semi-tones to shift the pitch of the converted vocals,"
+ " instrumentals and backup vocals by."
+ ),
+ )
+ show_intermediate_audio: CheckboxConfig = CheckboxConfig(
+ label="Show intermediate audio",
+ info="Show intermediate audio tracks produced during song cover generation.",
+ value=False,
+ exclude_value=True,
+ )
+ intermediate_audio: SongIntermediateAudioConfig = SongIntermediateAudioConfig()
+class SongInputAudioConfig(BaseModel):
+ """
+ Configuration settings for input audio components in the multi-step
+ song generation tab.
-class ConcurrencyId(StrEnum):
- """Enumeration of possible concurrency identifiers."""
+ Attributes
+ ----------
+ audio : AudioConfig
+ Configuration settings for the input audio component.
+ vocals : AudioConfig
+ Configuration settings for the vocals audio component.
+ converted_vocals : AudioConfig
+ Configuration settings for the converted vocals audio
+ component.
+ instrumentals : AudioConfig
+ Configuration settings for the instrumentals audio
+ component.
+ backup_vocals : AudioConfig
+ Configuration settings for the backup vocals audio
+ component.
+ main_vocals : AudioConfig
+ Configuration settings for the main vocals audio
+ component.
+ shifted_instrumentals : AudioConfig
+ Configuration settings for the shifted instrumentals audio
+ component.
+ shifted_backup_vocals : AudioConfig
+ Configuration settings for the shifted backup vocals audio
+ component.
+ all : list[AudioConfig]
+ List of configuration settings for all input audio
+ components in the multi-step song generation tab.
- GPU = auto()
+ """
+ audio: AudioConfig = AudioConfig.input(label="Audio")
+ vocals: AudioConfig = AudioConfig.input(label="Vocals")
+ converted_vocals: AudioConfig = AudioConfig.input(label="Vocals")
+ instrumentals: AudioConfig = AudioConfig.input(label="Instrumentals")
+ backup_vocals: AudioConfig = AudioConfig.input(label="Backup vocals")
+ main_vocals: AudioConfig = AudioConfig.input(label="Main vocals")
+ shifted_instrumentals: AudioConfig = AudioConfig.input(label="Instrumentals")
+ shifted_backup_vocals: AudioConfig = AudioConfig.input(label="Backup vocals")
-class SongSourceType(StrEnum):
- """The type of source providing the song to generate a cover of."""
+ @property
+ def all(self) -> list[AudioConfig]:
+ """
+ Retrieve configuration settings for all input audio components
+ in the multi-step song generation tab.
- LOCAL_FILE = "Local file"
- CACHED_SONG = "Cached song"
+ Returns
+ -------
+ list[AudioConfig]
+ List of configuration settings for all input audio
+ components in the multi-step song generation tab.
+ """
+ return [getattr(self, field) for field in self.__annotations__]
-class SpeechSourceType(StrEnum):
- """The type of source providing the text to generate speech from."""
- TEXT = "Text"
- LOCAL_FILE = "Local file"
+class SongDirsConfig(BaseModel):
+ """
+ Configuration settings for song directory components in the
+ multi-step song generation tab.
+ Attributes
+ ----------
+ separate_audio : DropdownConfig
+ Configuration settings for the song directory component
+ for separating audio.
+ convert_vocals : DropdownConfig
+ Configuration settings for the song directory component
+ for converting vocals.
+ postprocess_vocals : DropdownConfig
+ Configuration settings for the song directory component
+ for postprocessing vocals.
+ pitch_shift_background : DropdownConfig
+ Configuration settings for the song directory component
+ for pitch-shifting background audio.
+ mix : DropdownConfig
+ Configuration settings for the song directory component
+ for mixing audio.
+ all : list[gr.Dropdown]
+ List of instances of all song directory components in the
+ multi-step song generation tab.
-class SongTransferOption(StrEnum):
- """Enumeration of possible song transfer options."""
+ """
- STEP_1_AUDIO = "Step 1: stem splitting"
- STEP_2_VOCALS = "Step 2: vocal conversion"
- STEP_3_VOCALS = "Step 3: vocal effect"
- STEP_4_INSTRUMENTALS = "Step 4: instrumentals"
- STEP_4_BACKUP_VOCALS = "Step 4: backup vocals"
- STEP_5_MAIN_VOCALS = "Step 5: main vocals"
- STEP_5_INSTRUMENTALS = "Step 5: instrumentals"
- STEP_5_BACKUP_VOCALS = "Step 5: backup vocals"
+ separate_audio: DropdownConfig = DropdownConfig.song_dir()
+ convert_vocals: DropdownConfig = DropdownConfig.song_dir()
+ postprocess_vocals: DropdownConfig = DropdownConfig.song_dir()
+ pitch_shift_background: DropdownConfig = DropdownConfig.song_dir()
+ mix: DropdownConfig = DropdownConfig.song_dir()
+ @property
+ def all(self) -> list[gr.Dropdown]:
+ """
+ Retrieve instances of all song directory components in the
+ multi-step song generation tab.
-class SpeechTransferOption(StrEnum):
- """Enumeration of possible speech transfer options."""
+ Returns
+ -------
+ list[gr.Dropdown]
+ List of instances of all song directory components in
+ the multi-step song generation tab.
- STEP_2_SPEECH = "Step 2: vocal conversion"
- STEP_3_SPEECH = "Step 3: vocal effect"
+ """
+ return [getattr(self, field).instance for field in self.__annotations__]
-class ComponentVisibilityKwArgs(TypedDict, total=False):
+class MultiStepSongGenerationConfig(SongGenerationConfig):
"""
- Keyword arguments for setting component visibility.
+ Configuration settings for multi-step song generation tab.
Attributes
----------
- visible : bool
- Whether the component should be visible.
- value : Any
- The value of the component.
+ separation_model : DropdownConfig
+ Configuration settings for a separation model dropdown
+ component.
+ segment_size : RadioConfig
+ Configuration settings for a segment size radio component.
+ n_octaves : SliderConfig
+ Configuration settings for an octave pitch shift slider
+ component.
+ n_semitones : SliderConfig
+ Configuration settings for a semitone pitch shift slider
+ component.
+ n_semitones_instrumentals : SliderConfig
+ Configuration settings for an instrumentals pitch shift slider
+ component.
+ n_semitones_backup_vocals : SliderConfig
+ Configuration settings for a backup vocals pitch shift slider
+ component.
+ input_audio : SongInputAudioConfig
+ Configuration settings for input audio components.
+ song_dirs : SongDirsConfig
+ Configuration settings for song directory components.
+
+ See Also
+ --------
+ SongGenerationConfig
+ Parent model defining common component configuration settings
+ for song generation tabs.
"""
- visible: bool
- value: Any
+ separation_model: DropdownConfig = DropdownConfig(
+ label="Separation model",
+ info="The model to use for audio separation.",
+ value=SeparationModel.UVR_MDX_NET_VOC_FT,
+ choices=list(SeparationModel2),
+ )
+ segment_size: RadioConfig = RadioConfig(
+ label="Segment size",
+ info=(
+ "The size of the segments into which the audio is split. Using a larger"
+ " size consumes more resources, but may give better results."
+ ),
+ value=SegmentSize.SEG_2048,
+ choices=list(SegmentSize),
+ )
+ n_octaves: SliderConfig = SliderConfig.octave_shift(
+ label="Pitch shift (octaves)",
+ info=(
+ "The number of octaves to pitch-shift the converted voice by. Use 1 for"
+ " male-to-female and -1 for vice-versa."
+ ),
+ )
+ n_semitones: SliderConfig = SliderConfig.semitone_shift(
+ label="Pitch shift (semi-tones)",
+ info=(
+ "The number of semi-tones to pitch-shift the converted vocals by. Altering"
+ " this slightly reduces sound quality."
+ ),
+ )
+ n_semitones_instrumentals: SliderConfig = SliderConfig.semitone_shift(
+ label="Instrumental pitch shift",
+ info="The number of semi-tones to pitch-shift the instrumentals by.",
+ )
+ n_semitones_backup_vocals: SliderConfig = SliderConfig.semitone_shift(
+ label="Backup vocal pitch shift",
+ info="The number of semi-tones to pitch-shift the backup vocals by.",
+ )
+ input_audio: SongInputAudioConfig = SongInputAudioConfig()
+ song_dirs: SongDirsConfig = SongDirsConfig()
-class UpdateDropdownKwArgs(TypedDict, total=False):
+class SpeechIntermediateAudioConfig(BaseModel):
"""
- Keyword arguments for updating a dropdown component.
+ Configuration settings for intermediate audio components in the
+ one-click speech generation tab.
Attributes
----------
- choices : DropdownChoices
- The updated choices for the dropdown component.
- value : DropdownValue
- The updated value for the dropdown component.
+ speech : AudioConfig
+ Configuration settings for the input speech audio component.
+ converted_speech : AudioConfig
+ Configuration settings for the converted speech audio component.
+ all : list[gr.Audio]
+ List of instances of all intermediate audio components in the
+ speech generation tab.
"""
- choices: DropdownChoices
- value: DropdownValue
-
-
-class TextBoxKwArgs(TypedDict, total=False):
- """
- Keyword arguments for updating a textbox component.
+ speech: AudioConfig = AudioConfig.intermediate(label="Speech")
+ converted_speech: AudioConfig = AudioConfig.intermediate(label="Converted speech")
- Attributes
- ----------
- value : str | None
- The updated value for the textbox component.
- placeholder : str | None
- The updated placeholder for the textbox component.
+ @property
+ def all(self) -> list[gr.Audio]:
+ """
+ Retrieve instances of all intermediate audio components in the
+ speech generation tab.
- """
+ Returns
+ -------
+ list[gr.Audio]
+ List of instances of all intermediate audio components in
+ the speech generation tab.
- value: str | None
- placeholder: str | None
+ """
+ return [getattr(self, field).instance for field in self.__annotations__]
-class UpdateAudioKwArgs(TypedDict, total=False):
+class OneClickSpeechGenerationConfig(SpeechGenerationConfig):
"""
- Keyword arguments for updating an audio component.
+ Configuration settings for one-click speech generation tab.
Attributes
----------
- value : str | None
- The updated value for the audio component.
-
- """
-
- value: str | None
-
-
-class DatasetType(StrEnum):
- """The type of dataset to train a voice model."""
-
- NEW_DATASET = "New dataset"
- EXISTING_DATASET = "Existing dataset"
+ intermediate_audio : SpeechIntermediateAudioConfig
+ Configuration settings for intermediate audio components.
+ show_intermediate_audio : CheckboxConfig
+ Configuration settings for a show intermediate audio checkbox
+ component.
+ See Also
+ --------
+ SpeechGenerationConfig
+ Parent model defining common component configuration settings
+ for speech generation tabs.
-embedders_list = [
- ("embedders/contentvec/", ["pytorch_model.bin", "config.json"]),
- ("embedders/custom/Crusty/", ["model.safetensors", "config.json"]),
-]
+ """
+ intermediate_audio: SpeechIntermediateAudioConfig = SpeechIntermediateAudioConfig()
-class EmbedderModel(StrEnum):
- """Enumeration of audio embedding models."""
+ show_intermediate_audio: CheckboxConfig = CheckboxConfig(
+ label="Show intermediate audio",
+ info="Show intermediate audio tracks produced during speech generation.",
+ value=False,
+ exclude_value=True,
+ )
- CONTENTVEC = "contentvec"
- CRUSTY = "Crusty"
- CUSTOM = "custom"
-
-class SeparationModel(StrEnum):
- """Enumeration of audio separation models."""
- UVR_MDX_NET_VOC_FT = "UVR-MDX-NET-Voc_FT.onnx"
- UVR_MDX_NET_KARA_2 = "UVR_MDXNET_KARA_2.onnx"
- REVERB_HQ_BY_FOXJOY = "Reverb_HQ_By_FoxJoy.onnx"
- UVR_MDX_NET_Inst_HQ_3 = "UVR-MDX-NET-Inst_HQ_3.onnx"
- UVR_MDX_NET_Voc_FT = "UVR-MDX-NET-Voc_FT.onnx"
- Kim_Vocal_1 = "Kim_Vocal_1.onnx"
- Kim_Vocal_2 = "Kim_Vocal_2.onnx"
- Kim_Inst = "Kim_Inst.onnx"
- UVR_MDX_NET_Inst_HQ_4 = "UVR-MDX-NET-Inst_HQ_4.onnx"
- kuielab_a_vocals = "kuielab_a_vocals.onnx"
- kuielab_b_vocals = "kuielab_b_vocals.onnx"
- kuielab_a_drums = "kuielab_a_drums.onnx"
- kuielab_b_drums = "kuielab_b_drums.onnx"
- kuielab_a_bass = "kuielab_a_bass.onnx"
- kuielab_b_bass = "kuielab_b_bass.onnx"
- kuielab_a_other = "kuielab_a_other.onnx"
- kuielab_b_other = "kuielab_b_other.onnx"
- MDX23C_8KFFT_InstVoc_HQ_2 = "MDX23C-8KFFT-InstVoc_HQ_2.ckpt"
- UVR_DeNoise = "UVR-DeNoise.pth"
- UVR_DeEcho_DeReverb = "UVR-DeEcho-DeReverb.pth"
-
-class SeparationModel2(StrEnum):
- """Enumeration of audio separation models."""
+class SpeechInputAudioConfig(BaseModel):
+ """
+ Configuration settings for input audio components in the multi-step
+ speech generation tab.
- UVR_MDX_NET_VOC_FT = "UVR-MDX-NET-Voc_FT.onnx"
- UVR_MDX_NET_KARA_2 = "UVR_MDXNET_KARA_2.onnx"
- REVERB_HQ_BY_FOXJOY = "Reverb_HQ_By_FoxJoy.onnx"
- UVR_MDX_NET_Inst_HQ_3 = "UVR-MDX-NET-Inst_HQ_3.onnx"
- UVR_MDX_NET_Voc_FT = "UVR-MDX-NET-Voc_FT.onnx"
- Kim_Vocal_1 = "Kim_Vocal_1.onnx"
- Kim_Vocal_2 = "Kim_Vocal_2.onnx"
- Kim_Inst = "Kim_Inst.onnx"
- UVR_MDX_NET_Inst_HQ_4 = "UVR-MDX-NET-Inst_HQ_4.onnx"
- kuielab_a_vocals = "kuielab_a_vocals.onnx"
- kuielab_b_vocals = "kuielab_b_vocals.onnx"
- kuielab_a_drums = "kuielab_a_drums.onnx"
- kuielab_b_drums = "kuielab_b_drums.onnx"
- kuielab_a_bass = "kuielab_a_bass.onnx"
- kuielab_b_bass = "kuielab_b_bass.onnx"
- kuielab_a_other = "kuielab_a_other.onnx"
- kuielab_b_other = "kuielab_b_other.onnx"
- MDX23C_8KFFT_InstVoc_HQ_2 = "MDX23C-8KFFT-InstVoc_HQ_2.ckpt"
- UVR_DeNoise = "UVR-DeNoise.pth"
- UVR_DeEcho_DeReverb = "UVR-DeEcho-DeReverb.pth"
+ Attributes
+ ----------
+ speech : AudioConfig
+ Configuration settings for the input speech audio component.
+ converted_speech : AudioConfig
+ Configuration settings for the converted speech audio component.
+ all : list[AudioConfig]
+ List of configuration settings for all input audio components in
+ the multi-step speech generation tab.
+ """
+ speech: AudioConfig = AudioConfig.input("Speech")
+ converted_speech: AudioConfig = AudioConfig.input("Converted speech")
+ @property
+ def all(self) -> list[AudioConfig]:
+ """
+ Retrieve configuration settings for all input audio components
+ in the multi-step speech generation tab.
-now_dir = os.getcwd()
+ Returns
+ -------
+ list[AudioConfig]
+ List of configuration settings for all input audio
+ components in the multi-step speech generation tab.
-sys.path.append(now_dir)
-models_dir = "models"
+ """
+ return [getattr(self, field) for field in self.__annotations__]
-dump_path = os.path.join(now_dir, models_dir)
-repo_id = "lainlives/voice"
+class MultiStepSpeechGenerationConfig(SpeechGenerationConfig):
+ """
+ Configuration settings for the multi-step speech generation tab.
-hf_token = os.environ.get("HF_TOKEN")
-snapshot_download(repo_id=repo_id, local_dir=dump_path, token=hf_token)
+ Attributes
+ ----------
+ input_audio : SpeechInputAudioConfig
+ Configuration settings for input audio components.
-#if __name__ == "__main__":
-# start_app(share=False, ssr_mode = True)
+ See Also
+ --------
+ SpeechGenerationConfig
+ Parent model defining common component configuration settings
+ for speech generation tabs.
+ """
+ input_audio: SpeechInputAudioConfig = SpeechInputAudioConfig()
+class MultiStepTrainingConfig(TrainingConfig):
+ """Configuration settings for multi-step training tab."""
-config_name = "default" #os.environ.get("URVC_CONFIG")
-cookiefile = os.environ.get("YT_COOKIEFILE")
+class ModelManagementConfig(BaseModel):
+ """
+ Configuration settings for model management tab.
+ Attributes
+ ----------
+ voices : DropdownConfig
+ Configuration settings for delete voice models dropdown
+ component.
+ embedders : DropdownConfig
+ Configuration settings for delete embedder models dropdown
+ component.
+ pretraineds : DropdownConfig
+ Configuration settings for delete pretrained models dropdown
+ component.
+ traineds : DropdownConfig
+ Configuration settings for delete training models dropdown
+ component.
+ dummy_checkbox : CheckboxConfig
+ Configuration settings for a dummy checkbox component.
-"""
-Module defining models for representing configuration settings for
-UI tabs.
-"""
+ """
+ voices: DropdownConfig = DropdownConfig.multi_delete(
+ label="Voice models",
+ info="Select one or more voice models to delete.",
+ )
+ embedders: DropdownConfig = DropdownConfig.multi_delete(
+ label="Custom embedder models",
+ info="Select one or more embedder models to delete.",
+ )
+ pretraineds: DropdownConfig = DropdownConfig.multi_delete(
+ label="Custom pretrained models",
+ info="Select one or more pretrained models to delete.",
+ )
+ traineds: DropdownConfig = DropdownConfig.multi_delete(
+ label="Training models",
+ info="Select one or more training models to delete.",
+ )
+ dummy_checkbox: CheckboxConfig = CheckboxConfig(
+ value=False,
+ visible=False,
+ exclude_value=True,
+ )
-class SongIntermediateAudioConfig(BaseModel):
+class AudioManagementConfig(BaseModel):
"""
- Configuration settings for intermediate audio components in the
- one-click song generation tab.
+ Configuration settings for audio management tab.
Attributes
----------
- song : AudioConfig
- Configuration settings for the input song audio component.
- vocals : AudioConfig
- Configuration settings for the vocals audio component.
- instrumentals : AudioConfig
- Configuration settings for the instrumentals audio component.
- main_vocals : AudioConfig
- Configuration settings for the main vocals audio component.
- backup_vocals : AudioConfig
- Configuration settings for the backup vocals audio component.
- main_vocals_dereverbed : AudioConfig
- Configuration settings for the main vocals de-reverbed audio
- component.
- main_vocals_reverb : AudioConfig
- Configuration settings for the main vocals reverb audio
- component.
- converted_vocals : AudioConfig
- Configuration settings for the converted vocals audio
- component.
- postprocessed_vocals : AudioConfig
- Configuration settings for the postprocessed vocals audio
+ intermediate : DropdownConfig
+ Configuration settings for delete intermediate audio files
+ dropdown component
+ speech : DropdownConfig
+ Configuration settings for delete speech audio files dropdown
component.
- instrumentals_shifted : AudioConfig
- Configuration settings for the shifted instrumentals audio
+ output : DropdownConfig
+ Configuration settings for delete output audio files dropdown
component.
- backup_vocals_shifted : AudioConfig
- Configuration settings for the shifted backup vocals audio
+ dataset : DropdownConfig
+ Configuration settings for delete dataset audio files dropdown
component.
- all : list[gr.Audio]
- List of instances of all intermediate audio components.
+ dummy_checkbox : CheckboxConfig
+ Configuration settings for a dummy checkbox component.
"""
- song: AudioConfig = AudioConfig.intermediate(label="Song")
- vocals: AudioConfig = AudioConfig.intermediate(label="Vocals")
- instrumentals: AudioConfig = AudioConfig.intermediate(
- label="Instrumentals",
- )
- main_vocals: AudioConfig = AudioConfig.intermediate(
- label="Main vocals",
+ intermediate: DropdownConfig = DropdownConfig.multi_delete(
+ label="Song directories",
+ info=(
+ "Select one or more song directories containing intermediate audio files to"
+ " delete."
+ ),
)
- backup_vocals: AudioConfig = AudioConfig.intermediate(
- label="Backup vocals",
+ speech: DropdownConfig = DropdownConfig.multi_delete(
+ label="Speech audio files",
+ info="Select one or more speech audio files to delete.",
)
- main_vocals_dereverbed: AudioConfig = AudioConfig.intermediate(
- label="De-reverbed main vocals",
+ output: DropdownConfig = DropdownConfig.multi_delete(
+ label="Output audio files",
+ info="Select one or more output audio files to delete.",
)
- main_vocals_reverb: AudioConfig = AudioConfig.intermediate(
- label="Main vocals with reverb",
+ dataset: DropdownConfig = DropdownConfig.multi_delete(
+ label="Dataset audio files",
+ info="Select one or more datasets containing audio files to delete.",
)
- converted_vocals: AudioConfig = AudioConfig.intermediate(
- label="Converted vocals",
+
+ dummy_checkbox: CheckboxConfig = CheckboxConfig(
+ value=False,
+ visible=False,
+ exclude_value=True,
)
- postprocessed_vocals: AudioConfig = AudioConfig.intermediate(
- label="Postprocessed vocals",
+
+
+class SettingsManagementConfig(BaseModel):
+ """
+ Configuration settings for settings management tab.
+
+ Attributes
+ ----------
+ dummy_checkbox : CheckboxConfig
+ Configuration settings for a dummy checkbox component.
+
+ """
+
+ load_config_name: DropdownConfig = DropdownConfig(
+ label="Configuration name",
+ info="The name of a configuration to load UI settings from",
+ value=None,
+ render=False,
+ exclude_value=True,
)
- instrumentals_shifted: AudioConfig = AudioConfig.intermediate(
- label="Pitch-shifted instrumentals",
+ delete_config_names: DropdownConfig = DropdownConfig.multi_delete(
+ label="Configuration names",
+ info="Select the name of one or more configurations to delete",
)
- backup_vocals_shifted: AudioConfig = AudioConfig.intermediate(
- label="Pitch-shifted backup vocals",
+ dummy_checkbox: CheckboxConfig = CheckboxConfig(
+ value=False,
+ visible=False,
+ exclude_value=True,
)
- @property
- def all(self) -> list[gr.Audio]:
- """
- Retrieve instances of all intermediate audio components
- in the one-click song generation tab.
- Returns
- -------
- list[gr.Audio]
- List of instances of all intermediate audio components in
- the one-click song generation tab.
+class TotalSongGenerationConfig(BaseModel):
+ """
+ All configuration settings for song generation tabs.
- """
- # NOTE we are using self.__annotations__ to get the fields in
- # the order they are defined in the class
- return [getattr(self, field).instance for field in self.__annotations__]
+ Attributes
+ ----------
+ one_click : OneClickSongGenerationConfig
+ Configuration settings for the one-click song generation tab.
+ multi_step : MultiStepSongGenerationConfig
+ Configuration settings for the multi-step song generation tab.
+
+ """
+ one_click: OneClickSongGenerationConfig = OneClickSongGenerationConfig()
+ multi_step: MultiStepSongGenerationConfig = MultiStepSongGenerationConfig()
-class OneClickSongGenerationConfig(SongGenerationConfig):
+
+class TotalSpeechGenerationConfig(BaseModel):
"""
- Configuration settings for the one-click song generation tab.
+ All configuration settings for speech generation tabs.
Attributes
----------
- n_octaves : SliderConfig
- Configuration settings for an octave pitch shift slider
- component.
- n_semitones : SliderConfig
- Configuration settings for a semitone pitch shift slider
- component.
- show_intermediate_audio : CheckboxConfig
- Configuration settings for a show intermediate audio checkbox
- component.
- intermediate_audio : SongIntermediateAudioConfig
- Configuration settings for intermediate audio components.
+ one_click : OneClickSpeechGenerationConfig
+ Configuration settings for the one-click speech generation tab.
+ multi_step : MultiStepSpeechGenerationConfig
+ Configuration settings for the multi-step speech generation tab.
+
+ """
+
+ one_click: OneClickSpeechGenerationConfig = OneClickSpeechGenerationConfig()
+ multi_step: MultiStepSpeechGenerationConfig = MultiStepSpeechGenerationConfig()
+
+
+class TotalTrainingConfig(BaseModel):
+ """
+ All configuration settings for training tabs.
+
+ Attributes
+ ----------
+ training : TrainingConfig
+ Configuration settings for the multi-step training tab.
- See Also
- --------
- SongGenerationConfig
- Parent model defining common component configuration settings
- for song generation tabs.
+ """
+ multi_step: MultiStepTrainingConfig = MultiStepTrainingConfig()
+
+
+class TotalManagementConfig(BaseModel):
"""
+ All configuration settings for management tabs.
- n_octaves: SliderConfig = SliderConfig.octave_shift(
- label="Vocal pitch shift",
- info=(
- "The number of octaves to shift the pitch of the converted vocals by. Use 1"
- " for male-to-female and -1 for vice-versa."
- ),
- )
+ Attributes
+ ----------
+ model : ModelManagementConfig
+ Configuration settings for the model management tab.
+ audio : AudioManagementConfig
+ Configuration settings for the audio management tab.
+ settings : SettingsManagementConfig
+ Configuration settings for the settings management tab.
- n_semitones: SliderConfig = SliderConfig.semitone_shift(
- label="Overall pitch shift",
- info=(
- "The number of semi-tones to shift the pitch of the converted vocals,"
- " instrumentals and backup vocals by."
- ),
- )
- show_intermediate_audio: CheckboxConfig = CheckboxConfig(
- label="Show intermediate audio",
- info="Show intermediate audio tracks produced during song cover generation.",
- value=False,
- exclude_value=True,
- )
- intermediate_audio: SongIntermediateAudioConfig = SongIntermediateAudioConfig()
+ """
+
+ model: ModelManagementConfig = ModelManagementConfig()
+ audio: AudioManagementConfig = AudioManagementConfig()
+ settings: SettingsManagementConfig = SettingsManagementConfig()
-class SongInputAudioConfig(BaseModel):
+class TotalConfig(BaseModel):
"""
- Configuration settings for input audio components in the multi-step
- song generation tab.
+ All configuration settings for the Ultimate RVC app.
Attributes
----------
- audio : AudioConfig
- Configuration settings for the input audio component.
- vocals : AudioConfig
- Configuration settings for the vocals audio component.
- converted_vocals : AudioConfig
- Configuration settings for the converted vocals audio
- component.
- instrumentals : AudioConfig
- Configuration settings for the instrumentals audio
- component.
- backup_vocals : AudioConfig
- Configuration settings for the backup vocals audio
- component.
- main_vocals : AudioConfig
- Configuration settings for the main vocals audio
- component.
- shifted_instrumentals : AudioConfig
- Configuration settings for the shifted instrumentals audio
- component.
- shifted_backup_vocals : AudioConfig
- Configuration settings for the shifted backup vocals audio
- component.
- all : list[AudioConfig]
- List of configuration settings for all input audio
- components in the multi-step song generation tab.
+ song : TotalSongGenerationConfig
+ Configuration settings for song generation tabs.
+ speech : TotalSpeechGenerationConfig
+ Configuration settings for speech generation tabs.
+ training : TotalTrainingConfig
+ Configuration settings for training tabs.
+ management : TotalManagementConfig
+ Configuration settings for management tabs.
"""
- audio: AudioConfig = AudioConfig.input(label="Audio")
- vocals: AudioConfig = AudioConfig.input(label="Vocals")
- converted_vocals: AudioConfig = AudioConfig.input(label="Vocals")
- instrumentals: AudioConfig = AudioConfig.input(label="Instrumentals")
- backup_vocals: AudioConfig = AudioConfig.input(label="Backup vocals")
- main_vocals: AudioConfig = AudioConfig.input(label="Main vocals")
- shifted_instrumentals: AudioConfig = AudioConfig.input(label="Instrumentals")
- shifted_backup_vocals: AudioConfig = AudioConfig.input(label="Backup vocals")
+ song: TotalSongGenerationConfig = TotalSongGenerationConfig()
+ speech: TotalSpeechGenerationConfig = TotalSpeechGenerationConfig()
+ training: TotalTrainingConfig = TotalTrainingConfig()
+ management: TotalManagementConfig = TotalManagementConfig()
- @property
- def all(self) -> list[AudioConfig]:
+ @cached_property
+ def all(self) -> list[AnyComponentConfig]:
"""
- Retrieve configuration settings for all input audio components
- in the multi-step song generation tab.
+ Recursively collect those component configuration models nested
+ within the current model instance, which have values that are
+ not excluded.
Returns
-------
- list[AudioConfig]
- List of configuration settings for all input audio
- components in the multi-step song generation tab.
+ list[AnyComponentConfig]
+ A list of component configuration models found within the
+ current model instance, which have values that are not
+ excluded.
"""
- return [getattr(self, field) for field in self.__annotations__]
+ def _collect(model: BaseModel) -> list[AnyComponentConfig]:
+ component_configs: list[Any] = []
+ for _, value in model:
+ if isinstance(value, ComponentConfig):
+ if not value.exclude_value:
+ component_configs.append(value)
+ elif isinstance(value, BaseModel):
+ component_configs.extend(_collect(value))
+ return component_configs
+
+ return _collect(self)
-class SongDirsConfig(BaseModel):
+
+
+
+
+class BaseTabConfig(BaseModel):
"""
- Configuration settings for song directory components in the
- multi-step song generation tab.
+ Base model defining common component configuration settings for
+ UI tabs.
Attributes
----------
- separate_audio : DropdownConfig
- Configuration settings for the song directory component
- for separating audio.
- convert_vocals : DropdownConfig
- Configuration settings for the song directory component
- for converting vocals.
- postprocess_vocals : DropdownConfig
- Configuration settings for the song directory component
- for postprocessing vocals.
- pitch_shift_background : DropdownConfig
- Configuration settings for the song directory component
- for pitch-shifting background audio.
- mix : DropdownConfig
- Configuration settings for the song directory component
- for mixing audio.
- all : list[gr.Dropdown]
- List of instances of all song directory components in the
- multi-step song generation tab.
+ embedder_model : DropdownConfig
+ Configuration settings for an embedder model dropdown component.
+ custom_embedder_model : DropdownConfig
+ Configuration settings for a custom embedder model dropdown
+ component.
"""
- separate_audio: DropdownConfig = DropdownConfig.song_dir()
- convert_vocals: DropdownConfig = DropdownConfig.song_dir()
- postprocess_vocals: DropdownConfig = DropdownConfig.song_dir()
- pitch_shift_background: DropdownConfig = DropdownConfig.song_dir()
- mix: DropdownConfig = DropdownConfig.song_dir()
-
- @property
- def all(self) -> list[gr.Dropdown]:
- """
- Retrieve instances of all song directory components in the
- multi-step song generation tab.
-
- Returns
- -------
- list[gr.Dropdown]
- List of instances of all song directory components in
- the multi-step song generation tab.
+ embedder_model: DropdownConfig = DropdownConfig(
+ label="Embedder model",
+ info="The model to use for generating speaker embeddings.",
+ value=EmbedderModel.CONTENTVEC,
+ choices=list(EmbedderModel),
+ exclude_value=True,
+ )
+ custom_embedder_model: DropdownConfig = DropdownConfig(
+ label="Custom embedder model",
+ info="Select a custom embedder model from the dropdown.",
+ value=None,
+ visible=False,
+ render=False,
+ exclude_value=True,
+ )
- """
- return [getattr(self, field).instance for field in self.__annotations__]
-class MultiStepSongGenerationConfig(SongGenerationConfig):
+class TrainingConfig(BaseTabConfig):
"""
- Configuration settings for multi-step song generation tab.
+ Common component configuration settings for training tabs.
Attributes
----------
- separation_model : DropdownConfig
- Configuration settings for a separation model dropdown
+ dataset_type : DropdownConfig
+ Configuration settings for a dataset type dropdown component.
+ dataset : DropdownConfig
+ Configuration settings for a dataset dropdown component.
+ dataset_name : TextboxConfig
+ Configuration settings for a dataset name textbox component.
+ preprocess_model : DropdownConfig
+ Configuration settings for a model name dropdown component
+ for audio preprocessing.
+ sample_rate : DropdownConfig
+ Configuration settings for a sample rate dropdown component.
+ filter_audio : CheckboxConfig
+ Configuration settings for a filter audio checkbox component.
+ clean_audio : CheckboxConfig
+ Configuration settings for a clean audio checkbox component.
+ clean_strength : SliderConfig
+ Configuration settings for a clean strength slider component.
+ split_method : DropdownConfig
+ Configuration settings for an audio splitting method dropdown
+ component.
+ chunk_len : SliderConfig
+ Configuration settings for a chunk length slider component.
+ overlap_len : SliderConfig
+ Configuration settings for an overlap length slider component.
+ preprocess_cores : SliderConfig
+ Configuration settings for a CPU cores slider component for
+ preprocessing.
+ extract_model : DropdownConfig
+ Configuration settings for a model name dropdown component for
+ feature extraction.
+ f0_method : DropdownConfig
+ Configuration settings for an F0 method dropdown component.
+ hop_length : SliderConfig
+ Configuration settings for a hop length slider component.
+ include_mutes : SliderConfig
+ Configuration settings for an include mutes slider component.
+    extraction_cores : SliderConfig
+ Configuration settings for a CPU cores slider component for
+ feature extraction.
+    extraction_acceleration : DropdownConfig
+ Configuration settings for a hardware acceleration component for
+ feature extraction.
+ extraction_gpus : DropdownConfig
+        Configuration settings for a GPU dropdown component for feature
+ extraction.
+ train_model : DropdownConfig
+ Configuration settings for a model name dropdown component for
+ training.
+ num_epochs : SliderConfig
+ Configuration settings for a number of epochs slider component.
+ batch_size : SliderConfig
+ Configuration settings for a batch size slider component.
+ detect_overtraining : CheckboxConfig
+ Configuration settings for a detect overtraining checkbox
+ component.
+ overtraining_threshold : SliderConfig
+ Configuration settings for an overtraining threshold slider
+ component.
+ vocoder : DropdownConfig
+ Configuration settings for a vocoder dropdown component.
+ index_algorithm : DropdownConfig
+ Configuration settings for an index algorithm dropdown
component.
- segment_size : RadioConfig
- Configuration settings for a segment size radio component.
- n_octaves : SliderConfig
- Configuration settings for an octave pitch shift slider
+ pretrained_type : DropdownConfig
+ Configuration settings for a pretrained model type dropdown
component.
- n_semitones : SliderConfig
- Configuration settings for a semitone pitch shift slider
+ custom_pretrained_model : DropdownConfig
+ Configuration settings for a custom pretrained model dropdown
component.
- n_semitones_instrumentals : SliderConfig
- Configuration settings for an instrumentals pitch shift slider
+ save_interval : SliderConfig
+ Configuration settings for a save-interval slider component.
+ save_all_checkpoints : CheckboxConfig
+ Configuration settings for a save-all-checkpoints checkbox
component.
- n_semitones_backup_vocals : SliderConfig
- Configuration settings for a backup vocals pitch shift slider
+ save_all_weights : CheckboxConfig
+ Configuration settings for a save-all-weights checkbox
+ component.
+ clear_saved_data : CheckboxConfig
+ Configuration settings for a clear-saved-data checkbox
+ component.
+ upload_model : CheckboxConfig
+ Configuration settings for an upload voice model checkbox
+ component.
+ upload_name : TextboxConfig
+ Configuration settings for an upload name textbox component.
+    training_acceleration : DropdownConfig
+ Configuration settings for a hardware acceleration component for
+ training.
+ training_gpus : DropdownConfig
+ Configuration settings for a GPU dropdown component for
+ training.
+ preload_dataset : CheckboxConfig
+ Configuration settings for a preload dataset checkbox component.
+ reduce_memory_usage : CheckboxConfig
+ Configuration settings for a reduce-memory-usage checkbox
component.
- input_audio : SongInputAudioConfig
- Configuration settings for input audio components.
- song_dirs : SongDirsConfig
- Configuration settings for song directory components.
See Also
--------
- SongGenerationConfig
+ BaseTabConfig
Parent model defining common component configuration settings
- for song generation tabs.
+ for UI tabs.
"""
- separation_model: DropdownConfig = DropdownConfig(
- label="Separation model",
- info="The model to use for audio separation.",
- value=SeparationModel.UVR_MDX_NET_VOC_FT,
- choices=list(SeparationModel2),
+ dataset_type: DropdownConfig = DropdownConfig(
+ label="Dataset type",
+ info="Select the type of dataset to preprocess.",
+ value=DatasetType.NEW_DATASET,
+ choices=list(DatasetType),
+ exclude_value=True,
+ )
+ dataset: DropdownConfig = DropdownConfig(
+ label="Dataset path",
+ info=(
+ "The path to an existing dataset. Either select a path to a previously"
+ " created dataset or provide a path to an external dataset."
+ ),
+ value=None,
+ allow_custom_value=True,
+ visible=False,
+ render=False,
+ exclude_value=True,
+ )
+ dataset_name: TextboxConfig = TextboxConfig(
+ label="Dataset name",
+ info=(
+ "The name of the new dataset. If the dataset already exists, the provided"
+ " audio files will be added to it."
+ ),
+ value="My dataset",
+ exclude_value=True,
+ )
+ preprocess_model: DropdownConfig = DropdownConfig(
+ label="Model name",
+ info=(
+ "Name of the model to preprocess the given dataset for. Either select an"
+ " existing model from the dropdown or provide the name of a new model."
+ ),
+ value="My model",
+ allow_custom_value=True,
+ render=False,
+ exclude_value=True,
+ )
+ sample_rate: DropdownConfig = DropdownConfig(
+ label="Sample rate",
+ info="Target sample rate for the audio files in the provided dataset.",
+ value=TrainingSampleRate.HZ_40K,
+ choices=list(TrainingSampleRate),
+ )
+ filter_audio: CheckboxConfig = CheckboxConfig(
+ label="Filter audio",
+ info=(
+ "Whether to remove low-frequency sounds from the audio files in the"
+            " provided dataset by applying a high-pass butterworth filter."
+ ),
+ value=True,
+ )
+ clean_audio: CheckboxConfig = CheckboxConfig(
+ label="Clean audio",
+ info=(
+ "Whether to clean the audio files in the provided dataset using noise"
+            " reduction algorithms."
+ ),
+ value=False,
+ exclude_value=True,
+ )
+ clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False)
+ split_method: DropdownConfig = DropdownConfig(
+ label="Audio splitting method",
+ info=(
+ "The method to use for splitting the audio files in the provided dataset."
+ " Use the `Skip` method to skip splitting if the audio files are already"
+ " split. Use the `Simple` method if excessive silence has already been"
+ " removed from the audio files. Use the `Automatic` method for automatic"
+ " silence detection and splitting around it."
+ ),
+ value=AudioSplitMethod.AUTOMATIC,
+ choices=list(AudioSplitMethod),
+ exclude_value=True,
+ )
+ chunk_len: SliderConfig = SliderConfig(
+ label="Chunk length",
+ info="Length of split audio chunks.",
+ value=3.0,
+ minimum=0.5,
+ maximum=5.0,
+ step=0.1,
+ visible=False,
+ )
+ overlap_len: SliderConfig = SliderConfig(
+ label="Overlap length",
+ info="Length of overlap between split audio chunks.",
+ value=0.3,
+ minimum=0.0,
+ maximum=0.4,
+ step=0.1,
+ visible=False,
+ )
+ preprocess_cores: SliderConfig = SliderConfig.cpu_cores()
+
+ extract_model: DropdownConfig = DropdownConfig(
+ label="Model name",
+ info=(
+ "Name of the model with an associated preprocessed dataset to extract"
+ " training features from. When a new dataset is preprocessed, its"
+ " associated model is selected by default."
+ ),
+ value=None,
+ render=False,
+ exclude_value=True,
+ )
+ f0_method: DropdownConfig = DropdownConfig(
+ label="F0 method",
+ info="The method to use for extracting pitch features.",
+ value=TrainingF0Method.RMVPE,
+ choices=list(TrainingF0Method),
+ exclude_value=True,
+ )
+
+ hop_length: SliderConfig = SliderConfig.hop_length(
+ label="Hop length",
+        info="The hop length to use for extracting pitch features.",
+ visible=False,
+ )
+ include_mutes: SliderConfig = SliderConfig(
+ label="Include mutes",
+ info=(
+ "The number of mute audio files to include in the generated training file"
+ " list. Adding silent files enables the training model to handle pure"
+ " silence in inferred audio files. If the preprocessed audio dataset"
+ " already contains segments of pure silence, set this to 0."
+ ),
+ value=0,
+ minimum=0,
+ maximum=10,
+ step=1,
+ )
+ extraction_cores: SliderConfig = SliderConfig.cpu_cores()
+ extraction_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration()
+ extraction_gpus: DropdownConfig = DropdownConfig.gpu()
+
+ train_model: DropdownConfig = DropdownConfig(
+ label="Model name",
+ info=(
+ "Name of the model to train. When training features are extracted for a new"
+ " model, its name is selected by default."
+ ),
+ value=None,
+ render=False,
+ exclude_value=True,
+ )
+ num_epochs: SliderConfig = SliderConfig(
+ label="Number of epochs",
+ info=(
+ "The number of epochs to train the voice model. A higher number can improve"
+ " voice model performance but may lead to overtraining."
+ ),
+ value=500,
+ minimum=1,
+ maximum=5000,
+ step=1,
+ )
+ batch_size: SliderConfig = SliderConfig(
+ label="Batch size",
+ info=(
+ "The number of samples in each training batch. It is advisable to align"
+ " this value with the available VRAM of your GPU."
+ ),
+ value=16,
+ minimum=1,
+ maximum=128,
+ step=1,
+ )
+ detect_overtraining: CheckboxConfig = CheckboxConfig(
+ label="Detect overtraining",
+ info=(
+ "Whether to detect overtraining to prevent the voice model from learning"
+ " the training data too well and losing the ability to generalize to new"
+ " data."
+ ),
+ value=True,
+ exclude_value=True,
+ )
+ overtraining_threshold: SliderConfig = SliderConfig(
+ label="Overtraining threshold",
+ info=(
+ "The maximum number of epochs to continue training without any observed"
+ " improvement in voice model performance."
+ ),
+ value=500,
+ minimum=1,
+ maximum=1000,
+ visible=False,
+ )
+ vocoder: DropdownConfig = DropdownConfig(
+ label="Vocoder",
+ info=(
+ "The vocoder to use for audio synthesis during training. HiFi-GAN provides"
+ " basic audio fidelity, while RefineGAN provides the highest audio"
+ " fidelity."
+ ),
+ value=Vocoder.HIFI_GAN,
+ choices=list(Vocoder),
+ )
+ index_algorithm: DropdownConfig = DropdownConfig(
+ label="Index algorithm",
+ info=(
+ "The method to use for generating an index file for the trained voice"
+ " model. `KMeans` is particularly useful for large datasets."
+ ),
+ value=IndexAlgorithm.AUTO,
+ choices=list(IndexAlgorithm),
)
- segment_size: RadioConfig = RadioConfig(
- label="Segment size",
+ pretrained_type: DropdownConfig = DropdownConfig(
+ label="Pretrained model type",
info=(
- "The size of the segments into which the audio is split. Using a larger"
- " size consumes more resources, but may give better results."
+ "The type of pretrained model to finetune the voice model on. `None` will"
+ " train the voice model from scratch, while `Default` will use a pretrained"
+ " model tailored to the specific voice model architecture. `Custom` will"
+            " use a custom pretrained model that you provide."
),
- value=SegmentSize.SEG_2048,
- choices=list(SegmentSize),
+ value=PretrainedType.DEFAULT,
+ choices=list(PretrainedType),
+ exclude_value=True,
)
- n_octaves: SliderConfig = SliderConfig.octave_shift(
- label="Pitch shift (octaves)",
+ custom_pretrained_model: DropdownConfig = DropdownConfig(
+ label="Custom pretrained model",
+ info="Select a custom pretrained model to finetune from the dropdown.",
+ value=None,
+ visible=False,
+ render=False,
+ exclude_value=True,
+ )
+ save_interval: SliderConfig = SliderConfig(
+ label="Save interval",
info=(
- "The number of octaves to pitch-shift the converted voice by. Use 1 for"
- " male-to-female and -1 for vice-versa."
+            "The epoch interval at which to save voice model weights and"
+ " checkpoints. The best model weights are always saved regardless of this"
+ " setting."
),
+ value=10,
+ minimum=1,
+ maximum=100,
+ step=1,
)
- n_semitones: SliderConfig = SliderConfig.semitone_shift(
- label="Pitch shift (semi-tones)",
+ save_all_checkpoints: CheckboxConfig = CheckboxConfig(
+ label="Save all checkpoints",
info=(
- "The number of semi-tones to pitch-shift the converted vocals by. Altering"
- " this slightly reduces sound quality."
+ "Whether to save a unique checkpoint at each save interval. If not enabled,"
+ " only the latest checkpoint will be saved at each interval."
),
+ value=True,
)
- n_semitones_instrumentals: SliderConfig = SliderConfig.semitone_shift(
- label="Instrumental pitch shift",
- info="The number of semi-tones to pitch-shift the instrumentals by.",
+ save_all_weights: CheckboxConfig = CheckboxConfig(
+ label="Save all weights",
+ info=(
+ "Whether to save unique voice model weights at each save interval. If not"
+ " enabled, only the best voice model weights will be saved."
+ ),
+ value=True,
)
- n_semitones_backup_vocals: SliderConfig = SliderConfig.semitone_shift(
- label="Backup vocal pitch shift",
- info="The number of semi-tones to pitch-shift the backup vocals by.",
+ clear_saved_data: CheckboxConfig = CheckboxConfig(
+ label="Clear saved data",
+ info=(
+ "Whether to delete any existing training data associated with the voice"
+ " model before training commences. Enable this setting only if you are"
+ " training a new voice model from scratch or restarting training."
+ ),
+ value=False,
)
- input_audio: SongInputAudioConfig = SongInputAudioConfig()
- song_dirs: SongDirsConfig = SongDirsConfig()
-
-
-class SpeechIntermediateAudioConfig(BaseModel):
- """
- Configuration settings for intermediate audio components in the
- one-click speech generation tab.
-
- Attributes
- ----------
- speech : AudioConfig
- Configuration settings for the input speech audio component.
- converted_speech : AudioConfig
- Configuration settings for the converted speech audio component.
- all : list[gr.Audio]
- List of instances of all intermediate audio components in the
- speech generation tab.
-
- """
-
- speech: AudioConfig = AudioConfig.intermediate(label="Speech")
- converted_speech: AudioConfig = AudioConfig.intermediate(label="Converted speech")
-
- @property
- def all(self) -> list[gr.Audio]:
- """
- Retrieve instances of all intermediate audio components in the
- speech generation tab.
-
- Returns
- -------
- list[gr.Audio]
- List of instances of all intermediate audio components in
- the speech generation tab.
-
- """
- return [getattr(self, field).instance for field in self.__annotations__]
-
-
-class OneClickSpeechGenerationConfig(SpeechGenerationConfig):
- """
- Configuration settings for one-click speech generation tab.
-
- Attributes
- ----------
- intermediate_audio : SpeechIntermediateAudioConfig
- Configuration settings for intermediate audio components.
- show_intermediate_audio : CheckboxConfig
- Configuration settings for a show intermediate audio checkbox
- component.
-
- See Also
- --------
- SpeechGenerationConfig
- Parent model defining common component configuration settings
- for speech generation tabs.
-
- """
-
- intermediate_audio: SpeechIntermediateAudioConfig = SpeechIntermediateAudioConfig()
-
- show_intermediate_audio: CheckboxConfig = CheckboxConfig(
- label="Show intermediate audio",
- info="Show intermediate audio tracks produced during speech generation.",
+ upload_model: CheckboxConfig = CheckboxConfig(
+ label="Upload voice model",
+ info=(
+ "Whether to automatically upload the trained voice model so that it can be"
+ " used for generation tasks within the Ultimate RVC app."
+ ),
value=False,
exclude_value=True,
)
+ upload_name: TextboxConfig = TextboxConfig(
+ label="Upload name",
+ info="The name to give the uploaded voice model.",
+ value=None,
+ visible=False,
+ exclude_value=True,
+ )
+ training_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration()
+ training_gpus: DropdownConfig = DropdownConfig.gpu()
+ preload_dataset: CheckboxConfig = CheckboxConfig(
+ label="Preload dataset",
+ info=(
+ "Whether to preload all training data into GPU memory. This can improve"
+            " training speed but requires a lot of VRAM."
+ ),
+ value=True,
+ )
+ reduce_memory_usage: CheckboxConfig = CheckboxConfig(
+ label="Reduce memory usage",
+ info=(
+ "Whether to reduce VRAM usage at the cost of slower training speed by"
+ " enabling activation checkpointing. This is useful for GPUs with limited"
+ " memory (e.g., <6GB VRAM) or when training with a batch size larger than"
+ " what your GPU can normally accommodate."
+ ),
+ value=False,
+ )
-class SpeechInputAudioConfig(BaseModel):
- """
- Configuration settings for input audio components in the multi-step
- speech generation tab.
-
- Attributes
- ----------
- speech : AudioConfig
- Configuration settings for the input speech audio component.
- converted_speech : AudioConfig
- Configuration settings for the converted speech audio component.
-
- all : list[AudioConfig]
- List of configuration settings for all input audio components in
- the multi-step speech generation tab.
-
- """
-
- speech: AudioConfig = AudioConfig.input("Speech")
- converted_speech: AudioConfig = AudioConfig.input("Converted speech")
-
- @property
- def all(self) -> list[AudioConfig]:
- """
- Retrieve configuration settings for all input audio components
- in the multi-step speech generation tab.
-
- Returns
- -------
- list[AudioConfig]
- List of configuration settings for all input audio
- components in the multi-step speech generation tab.
-
- """
- return [getattr(self, field) for field in self.__annotations__]
-
-
-class MultiStepSpeechGenerationConfig(SpeechGenerationConfig):
- """
- Configuration settings for the multi-step speech generation tab.
-
- Attributes
- ----------
- input_audio : SpeechInputAudioConfig
- Configuration settings for input audio components.
-
- See Also
- --------
- SpeechGenerationConfig
- Parent model defining common component configuration settings
- for speech generation tabs.
-
- """
-
- input_audio: SpeechInputAudioConfig = SpeechInputAudioConfig()
-
-
-class MultiStepTrainingConfig(TrainingConfig):
- """Configuration settings for multi-step training tab."""
-
-
-class ModelManagementConfig(BaseModel):
+class GenerationConfig(BaseTabConfig):
"""
+ Common component configuration settings for generation tabs.
- Configuration settings for model management tab.
-
- Attributes
- ----------
- voices : DropdownConfig
- Configuration settings for delete voice models dropdown
- component.
- embedders : DropdownConfig
- Configuration settings for delete embedder models dropdown
+ voice_model : DropdownConfig
+ Configuration settings for a voice model dropdown component.
+ f0_methods : DropdownConfig
+ Configuration settings for a pitch extraction algorithms
+ dropdown component.
+ index_rate : SliderConfig
+ Configuration settings for an index rate slider component.
+ rms_mix_rate : SliderConfig
+ Configuration settings for a RMS mix rate slider component.
+ protect_rate : SliderConfig
+ Configuration settings for a protect rate slider component.
+ split_voice : CheckboxConfig
+ Configuration settings for a split voice checkbox component.
+ autotune_voice: CheckboxConfig
+ Configuration settings for an autotune voice checkbox component.
+ autotune_strength: SliderConfig
+ Configuration settings for an autotune strength slider
component.
- pretraineds : DropdownConfig
- Configuration settings for delete pretrained models dropdown
+ sid : NumberConfig
+ Configuration settings for a speaker ID number component.
+ output_sr : DropdownConfig
+ Configuration settings for an output sample rate dropdown
component.
- traineds : DropdownConfig
- Configuration settings for delete training models dropdown
+ output_format : DropdownConfig
+ Configuration settings for an output format dropdown
component.
- dummy_checkbox : CheckboxConfig
- Configuration settings for a dummy checkbox component.
+ output_name : TextboxConfig
+ Configuration settings for an output name textbox component.
+
+ See Also
+ --------
+ BaseTabConfig
+ Parent model defining common component configuration settings
+ for UI tabs.
"""
- voices: DropdownConfig = DropdownConfig.multi_delete(
- label="Voice models",
- info="Select one or more voice models to delete.",
+ voice_model: DropdownConfig = DropdownConfig(
+ label="Voice model",
+ info="Select a model to use for voice conversion.",
+ value=None,
+ render=False,
+ exclude_value=True,
)
- embedders: DropdownConfig = DropdownConfig.multi_delete(
- label="Custom embedder models",
- info="Select one or more embedder models to delete.",
+ f0_methods: DropdownConfig = DropdownConfig(
+ label="Pitch extraction algorithm(s)",
+ info=(
+ "If more than one method is selected, then the median of the pitch values"
+ " extracted by each method is used. RMVPE is recommended for most cases and"
+ " is the default when no method is selected."
+ ),
+ value=[F0Method.RMVPE],
+ choices=list(F0Method),
+ multiselect=True,
)
- pretraineds: DropdownConfig = DropdownConfig.multi_delete(
- label="Custom pretrained models",
- info="Select one or more pretrained models to delete.",
+ index_rate: SliderConfig = SliderConfig(
+ label="Index rate",
+ info=(
+ "Increase to bias the conversion towards the accent of the voice model."
+ " Decrease to potentially reduce artifacts coming from the voice"
+            " model."
+ ),
+ value=0.3,
+ minimum=0.0,
+ maximum=1.0,
)
- traineds: DropdownConfig = DropdownConfig.multi_delete(
- label="Training models",
- info="Select one or more training models to delete.",
+ rms_mix_rate: SliderConfig = SliderConfig(
+ label="RMS mix rate",
+ info=(
+ "How much to mimic the loudness (0) of the input voice or a fixed loudness"
+            " (1). A value of 1 is recommended for most cases."
+ ),
+ value=1.0,
+ minimum=0.0,
+ maximum=1.0,
+ )
+ protect_rate: SliderConfig = SliderConfig(
+ label="Protect rate",
+ info=(
+ "Controls the extent to which consonants and breathing sounds are protected"
+ " from artifacts. A higher value offers more protection but may worsen the"
+            " indexing effect."
+ ),
+ value=0.33,
+ minimum=0.0,
+ maximum=0.5,
)
- dummy_checkbox: CheckboxConfig = CheckboxConfig(
+ hop_length: SliderConfig = SliderConfig.hop_length(
+ label="Hop length",
+ info=(
+ "How often the CREPE-based pitch extraction method checks for pitch changes"
+ " measured in milliseconds. Lower values lead to longer conversion times"
+ " and a higher risk of voice cracks, but better pitch accuracy."
+ ),
+ visible=True,
+ )
+
+ split_voice: CheckboxConfig = CheckboxConfig(
+ label="Split input voice",
+ info=(
+ "Whether to split the input voice track into smaller segments before"
+ " converting it. This can improve output quality for longer voice tracks."
+ ),
+ value=False,
+ )
+ autotune_voice: CheckboxConfig = CheckboxConfig(
+ label="Autotune converted voice",
+        info="Whether to apply autotune to the converted voice.",
value=False,
+ exclude_value=True,
+ )
+ autotune_strength: SliderConfig = SliderConfig(
+ label="Autotune intensity",
+ info=(
+ "Higher values result in stronger snapping to the chromatic grid and"
+ " artifacting."
+ ),
+ value=1.0,
+ minimum=0.0,
+ maximum=1.0,
visible=False,
+ )
+ sid: NumberConfig = NumberConfig(
+ label="Speaker ID",
+ info="Speaker ID for multi-speaker-models.",
+ value=0,
+ precision=0,
+ )
+ output_sr: DropdownConfig = DropdownConfig(
+ label="Output sample rate",
+ info="The sample rate of the mixed output track.",
+ value=SampleRate.HZ_44100,
+ choices=list(SampleRate),
+ )
+ output_format: DropdownConfig = DropdownConfig(
+ label="Output format",
+ info="The audio format of the mixed output track.",
+ value=AudioExt.MP3,
+ choices=list(AudioExt),
+ )
+ output_name: TextboxConfig = TextboxConfig(
+ label="Output name",
+ info="If no name is provided, a suitable name will be generated automatically.",
+ value=None,
+ placeholder="Ultimate RVC output",
exclude_value=True,
)
-class AudioManagementConfig(BaseModel):
+class SongGenerationConfig(GenerationConfig):
"""
- Configuration settings for audio management tab.
+ Common component configuration settings for song generation tabs.
Attributes
----------
- intermediate : DropdownConfig
- Configuration settings for delete intermediate audio files
- dropdown component
- speech : DropdownConfig
- Configuration settings for delete speech audio files dropdown
- component.
- output : DropdownConfig
- Configuration settings for delete output audio files dropdown
+ source_type : DropdownConfig
+ Configuration settings for a source type dropdown component.
+ source : TextboxConfig
+ Configuration settings for an input source textbox component.
+ cached_song : DropdownConfig
+ Configuration settings for a cached song dropdown component.
+ clean_strength : SliderConfig
+ Configuration settings for a clean strength slider component.
+ clean_voice : CheckboxConfig
+ Configuration settings for a clean voice checkbox component.
+ room_size : SliderConfig
+ Configuration settings for a room size slider component.
+ wet_level : SliderConfig
+ Configuration settings for a wetness level slider component.
+ dry_level : SliderConfig
+ Configuration settings for a dryness level slider component.
+ damping : SliderConfig
+ Configuration settings for a damping level slider component.
+ main_gain : SliderConfig
+ Configuration settings for a main gain slider component.
+ inst_gain : SliderConfig
+ Configuration settings for an instrumentals gain slider
component.
- dataset : DropdownConfig
- Configuration settings for delete dataset audio files dropdown
+ backup_gain : SliderConfig
+ Configuration settings for a backup vocals gain slider
component.
- dummy_checkbox : CheckboxConfig
- Configuration settings for a dummy checkbox component.
+
+ See Also
+ --------
+ GenerationConfig
+ Parent model defining common component configuration settings
+ for generation tabs.
"""
- intermediate: DropdownConfig = DropdownConfig.multi_delete(
- label="Song directories",
+ source_type: DropdownConfig = DropdownConfig(
+ label="Source type",
+ info="The type of source to retrieve a song from.",
+ value=SongSourceType.LOCAL_FILE,
+ choices=list(SongSourceType),
+ type="index",
+ exclude_value=True,
+ )
+ source: TextboxConfig = TextboxConfig(
+ label="Source",
+ info="Link to a song on YouTube or the full path of a local audio file.",
+ value=None,
+ exclude_value=True,
+ )
+ cached_song: DropdownConfig = DropdownConfig(
+ label="Source",
+ info="Select a song from the list of cached songs.",
+ value=None,
+ visible=False,
+ render=False,
+ exclude_value=True,
+ )
+ clean_voice: CheckboxConfig = CheckboxConfig(
+ label="Clean converted voice",
info=(
- "Select one or more song directories containing intermediate audio files to"
- " delete."
+ "Whether to clean the converted voice using noise reduction"
+ " algorithms.
"
),
+ value=False,
+ exclude_value=True,
)
- speech: DropdownConfig = DropdownConfig.multi_delete(
- label="Speech audio files",
- info="Select one or more speech audio files to delete.",
+ clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False)
+ room_size: SliderConfig = SliderConfig(
+ label="Room size",
+ info=(
+ "Size of the room which reverb effect simulates. Increase for longer reverb"
+ " time."
+ ),
+ value=0.15,
+ minimum=0.0,
+ maximum=1.0,
)
- output: DropdownConfig = DropdownConfig.multi_delete(
- label="Output audio files",
- info="Select one or more output audio files to delete.",
+ wet_level: SliderConfig = SliderConfig(
+ label="Wetness level",
+ info="Loudness of converted vocals with reverb effect applied.",
+ value=0.2,
+ minimum=0.0,
+ maximum=1.0,
+ )
+ dry_level: SliderConfig = SliderConfig(
+ label="Dryness level",
+ info="Loudness of converted vocals without reverb effect applied.",
+ value=0.8,
+ minimum=0.0,
+ maximum=1.0,
+ )
+ damping: SliderConfig = SliderConfig(
+ label="Damping level",
+ info="Absorption of high frequencies in reverb effect.",
+ value=0.7,
+ minimum=0.0,
+ maximum=1.0,
+ )
+ main_gain: SliderConfig = SliderConfig.gain(
+ label="Main gain",
+ info="The gain to apply to the main vocals.",
)
- dataset: DropdownConfig = DropdownConfig.multi_delete(
- label="Dataset audio files",
- info="Select one or more datasets containing audio files to delete.",
+ inst_gain: SliderConfig = SliderConfig.gain(
+ label="Instrumentals gain",
+ info="The gain to apply to the instrumentals.",
)
-
- dummy_checkbox: CheckboxConfig = CheckboxConfig(
- value=False,
- visible=False,
- exclude_value=True,
+ backup_gain: SliderConfig = SliderConfig.gain(
+ label="Backup gain",
+ info="The gain to apply to the backup vocals.",
)
-class SettingsManagementConfig(BaseModel):
+class SpeechGenerationConfig(GenerationConfig):
"""
- Configuration settings for settings management tab.
+ Common component configuration settings for speech generation tabs.
Attributes
----------
- dummy_checkbox : CheckboxConfig
- Configuration settings for a dummy checkbox component.
+ source_type : DropdownConfig
+ Configuration settings for a source type dropdown component.
+ source : TextboxConfig
+ Configuration settings for an input source textbox component.
+ edge_tts_voice : DropdownConfig
+ Configuration settings for an Edge TTS voice dropdown
+ component.
+ n_octaves : SliderConfig
+ Configuration settings for an octave pitch shift slider
+ component.
+ n_semitones : SliderConfig
+ Configuration settings for a semitone pitch shift slider
+ component.
+ tts_pitch_shift : SliderConfig
+ Configuration settings for a TTS pitch shift slider
+ component.
+ tts_speed_change : SliderConfig
+ Configuration settings for a TTS speed change slider
+ component.
+ tts_volume_change : SliderConfig
+ Configuration settings for a TTS volume change slider
+ component.
+ clean_voice : CheckboxConfig
+ Configuration settings for a clean voice checkbox
+ component.
+ clean_strength : SliderConfig
+ Configuration settings for a clean strength slider
+ component.
+ output_gain : SliderConfig
+ Configuration settings for an output gain slider component.
+
+ See Also
+ --------
+ GenerationConfig
+ Parent model defining common component configuration settings
+ for generation tabs.
"""
- load_config_name: DropdownConfig = DropdownConfig(
- label="Configuration name",
- info="The name of a configuration to load UI settings from",
+ source_type: DropdownConfig = DropdownConfig(
+ label="Source type",
+ info="The type of source to generate speech from.",
+ value=SpeechSourceType.TEXT,
+ choices=list(SpeechSourceType),
+ type="index",
+ exclude_value=True,
+ )
+ source: TextboxConfig = TextboxConfig(
+ label="Source",
+ info="Text to generate speech from",
+ value=None,
+ exclude_value=True,
+ )
+ edge_tts_voice: DropdownConfig = DropdownConfig(
+ label="Edge TTS voice",
+ info="Select a voice to use for text to speech conversion.",
value=None,
render=False,
exclude_value=True,
)
- delete_config_names: DropdownConfig = DropdownConfig.multi_delete(
- label="Configuration names",
- info="Select the name of one or more configurations to delete",
+ n_octaves: SliderConfig = SliderConfig.octave_shift(
+ label="Octave shift",
+ info=(
+ "The number of octaves to pitch-shift the converted speech by. Use 1 for"
+ " male-to-female and -1 for vice-versa."
+ ),
)
- dummy_checkbox: CheckboxConfig = CheckboxConfig(
- value=False,
- visible=False,
+ n_semitones: SliderConfig = SliderConfig.semitone_shift(
+ label="Semitone shift",
+ info="The number of semi-tones to pitch-shift the converted speech by.",
+ )
+ tts_pitch_shift: SliderConfig = SliderConfig(
+ label="Edge TTS pitch shift",
+ info=(
+ "The number of hertz to shift the pitch of the speech generated by Edge"
+ " TTS."
+ ),
+ value=0,
+ minimum=-100,
+ maximum=100,
+ step=1,
+ )
+ tts_speed_change: SliderConfig = SliderConfig(
+ label="TTS speed change",
+ info="The percentual change to the speed of the speech generated by Edge TTS.",
+ value=0,
+ minimum=-50,
+ maximum=100,
+ step=1,
+ )
+ tts_volume_change: SliderConfig = SliderConfig(
+ label="TTS volume change",
+ info="The percentual change to the volume of the speech generated by Edge TTS.",
+ value=0,
+ minimum=-100,
+ maximum=100,
+ step=1,
+ )
+ clean_voice: CheckboxConfig = CheckboxConfig(
+ label="Clean converted voice",
+ info=(
+ "Whether to clean the converted voice using noise reduction"
+ " algorithms.
"
+ ),
+ value=True,
exclude_value=True,
)
-
-
-class TotalSongGenerationConfig(BaseModel):
- """
- All configuration settings for song generation tabs.
-
- Attributes
- ----------
- one_click : OneClickSongGenerationConfig
- Configuration settings for the one-click song generation tab.
- multi_step : MultiStepSongGenerationConfig
- Configuration settings for the multi-step song generation tab.
-
- """
-
- one_click: OneClickSongGenerationConfig = OneClickSongGenerationConfig()
- multi_step: MultiStepSongGenerationConfig = MultiStepSongGenerationConfig()
-
-
-class TotalSpeechGenerationConfig(BaseModel):
- """
- All configuration settings for speech generation tabs.
-
- Attributes
- ----------
- one_click : OneClickSpeechGenerationConfig
- Configuration settings for the one-click speech generation tab.
- multi_step : MultiStepSpeechGenerationConfig
- Configuration settings for the multi-step speech generation tab.
-
- """
-
- one_click: OneClickSpeechGenerationConfig = OneClickSpeechGenerationConfig()
- multi_step: MultiStepSpeechGenerationConfig = MultiStepSpeechGenerationConfig()
-
-
-class TotalTrainingConfig(BaseModel):
- """
- All configuration settings for training tabs.
-
- Attributes
- ----------
- training : TrainingConfig
- Configuration settings for the multi-step training tab.
-
- """
-
- multi_step: MultiStepTrainingConfig = MultiStepTrainingConfig()
-
-
-class TotalManagementConfig(BaseModel):
- """
- All configuration settings for management tabs.
-
- Attributes
- ----------
- model : ModelManagementConfig
- Configuration settings for the model management tab.
- audio : AudioManagementConfig
- Configuration settings for the audio management tab.
- settings : SettingsManagementConfig
- Configuration settings for the settings management tab.
-
- """
-
- model: ModelManagementConfig = ModelManagementConfig()
- audio: AudioManagementConfig = AudioManagementConfig()
- settings: SettingsManagementConfig = SettingsManagementConfig()
-
-
-class TotalConfig(BaseModel):
- """
- All configuration settings for the Ultimate RVC app.
-
- Attributes
- ----------
- song : TotalSongGenerationConfig
- Configuration settings for song generation tabs.
- speech : TotalSpeechGenerationConfig
- Configuration settings for speech generation tabs.
- training : TotalTrainingConfig
- Configuration settings for training tabs.
- management : TotalManagementConfig
- Configuration settings for management tabs.
-
- """
-
- song: TotalSongGenerationConfig = TotalSongGenerationConfig()
- speech: TotalSpeechGenerationConfig = TotalSpeechGenerationConfig()
- training: TotalTrainingConfig = TotalTrainingConfig()
- management: TotalManagementConfig = TotalManagementConfig()
-
- @cached_property
- def all(self) -> list[AnyComponentConfig]:
- """
- Recursively collect those component configuration models nested
- within the current model instance, which have values that are
- not excluded.
-
- Returns
- -------
- list[AnyComponentConfig]
- A list of component configuration models found within the
- current model instance, which have values that are not
- excluded.
-
- """
-
- def _collect(model: BaseModel) -> list[AnyComponentConfig]:
- component_configs: list[Any] = []
- for _, value in model:
- if isinstance(value, ComponentConfig):
- if not value.exclude_value:
- component_configs.append(value)
- elif isinstance(value, BaseModel):
- component_configs.extend(_collect(value))
- return component_configs
-
- return _collect(self)
-
-
-
+ clean_strength: SliderConfig = SliderConfig.clean_strength(visible=True)
+ output_gain: SliderConfig = SliderConfig.gain(
+ label="Output gain",
+ info="The gain to apply to the converted speech.
",
+ )