diff --git "a/app.py" "b/app.py" --- "a/app.py" +++ "b/app.py" @@ -90,1880 +90,1882 @@ type StrPath = str | PathLike[str] type Json = Mapping[str, Json] | Sequence[Json] | str | int | float | bool | None +class SegmentSize(IntEnum): + """Enumeration of segment sizes for audio separation.""" + SEG_64 = 64 + SEG_128 = 128 + SEG_256 = 256 + SEG_512 = 512 + SEG_1024 = 1024 + SEG_2048 = 2048 + SEG_4096 = 4096 -class BaseTabConfig(BaseModel): - """ - Base model defining common component configuration settings for - UI tabs. - Attributes - ---------- - embedder_model : DropdownConfig - Configuration settings for an embedder model dropdown component. - custom_embedder_model : DropdownConfig - Configuration settings for a custom embedder model dropdown - component. +class F0Method(StrEnum): + """Enumeration of pitch extraction methods.""" - """ + RMVPE = "rmvpe" + CREPE = "crepe" + CREPE_TINY = "crepe-tiny" + FCPE = "fcpe" - embedder_model: DropdownConfig = DropdownConfig( - label="Embedder model", - info="The model to use for generating speaker embeddings.", - value=EmbedderModel.CONTENTVEC, - choices=list(EmbedderModel), - exclude_value=True, - ) - custom_embedder_model: DropdownConfig = DropdownConfig( - label="Custom embedder model", - info="Select a custom embedder model from the dropdown.", - value=None, - visible=False, - render=False, - exclude_value=True, - ) -class GenerationConfig(BaseTabConfig): - """ - Common component configuration settings for generation tabs. +class RVCContentType(StrEnum): + """Enumeration of valid content to convert with RVC.""" - voice_model : DropdownConfig - Configuration settings for a voice model dropdown component. - f0_methods : DropdownConfig - Configuration settings for a pitch extraction algorithms - dropdown component. - index_rate : SliderConfig - Configuration settings for an index rate slider component. - rms_mix_rate : SliderConfig - Configuration settings for a RMS mix rate slider component. 
- protect_rate : SliderConfig - Configuration settings for a protect rate slider component. - split_voice : CheckboxConfig - Configuration settings for a split voice checkbox component. - autotune_voice: CheckboxConfig - Configuration settings for an autotune voice checkbox component. - autotune_strength: SliderConfig - Configuration settings for an autotune strength slider - component. - sid : NumberConfig - Configuration settings for a speaker ID number component. - output_sr : DropdownConfig - Configuration settings for an output sample rate dropdown - component. - output_format : DropdownConfig - Configuration settings for an output format dropdown - component. - output_name : TextboxConfig - Configuration settings for an output name textbox component. + VOCALS = "vocals" + VOICE = "voice" + SPEECH = "speech" + AUDIO = "audio" - See Also - -------- - BaseTabConfig - Parent model defining common component configuration settings - for UI tabs. - """ +class SampleRate(IntEnum): + """Enumeration of supported audio sample rates.""" - voice_model: DropdownConfig = DropdownConfig( - label="Voice model", - info="Select a model to use for voice conversion.", - value=None, - render=False, - exclude_value=True, - ) - f0_methods: DropdownConfig = DropdownConfig( - label="Pitch extraction algorithm(s)", - info=( - "If more than one method is selected, then the median of the pitch values" - " extracted by each method is used. RMVPE is recommended for most cases and" - " is the default when no method is selected." - ), - value=[F0Method.RMVPE], - choices=list(F0Method), - multiselect=True, - ) - index_rate: SliderConfig = SliderConfig( - label="Index rate", - info=( - "Increase to bias the conversion towards the accent of the voice model." - " Decrease to potentially reduce artifacts coming from the voice" - " model.


" - ), - value=0.3, - minimum=0.0, - maximum=1.0, - ) - rms_mix_rate: SliderConfig = SliderConfig( - label="RMS mix rate", - info=( - "How much to mimic the loudness (0) of the input voice or a fixed loudness" - " (1). A value of 1 is recommended for most cases.

" - ), - value=1.0, - minimum=0.0, - maximum=1.0, - ) - protect_rate: SliderConfig = SliderConfig( - label="Protect rate", - info=( - "Controls the extent to which consonants and breathing sounds are protected" - " from artifacts. A higher value offers more protection but may worsen the" - " indexing effect.

" - ), - value=0.33, - minimum=0.0, - maximum=0.5, - ) + HZ_16000 = 16000 + HZ_44100 = 44100 + HZ_48000 = 48000 + HZ_96000 = 96000 + HZ_192000 = 192000 - hop_length: SliderConfig = SliderConfig.hop_length( - label="Hop length", - info=( - "How often the CREPE-based pitch extraction method checks for pitch changes" - " measured in milliseconds. Lower values lead to longer conversion times" - " and a higher risk of voice cracks, but better pitch accuracy." - ), - visible=True, - ) - split_voice: CheckboxConfig = CheckboxConfig( - label="Split input voice", - info=( - "Whether to split the input voice track into smaller segments before" - " converting it. This can improve output quality for longer voice tracks." - ), - value=False, - ) - autotune_voice: CheckboxConfig = CheckboxConfig( - label="Autotune converted voice", - info="Whether to apply autotune to the converted voice.

", - value=False, - exclude_value=True, - ) - autotune_strength: SliderConfig = SliderConfig( - label="Autotune intensity", - info=( - "Higher values result in stronger snapping to the chromatic grid and" - " artifacting." - ), - value=1.0, - minimum=0.0, - maximum=1.0, - visible=False, - ) - sid: NumberConfig = NumberConfig( - label="Speaker ID", - info="Speaker ID for multi-speaker-models.", - value=0, - precision=0, - ) - output_sr: DropdownConfig = DropdownConfig( - label="Output sample rate", - info="The sample rate of the mixed output track.", - value=SampleRate.HZ_44100, - choices=list(SampleRate), - ) - output_format: DropdownConfig = DropdownConfig( - label="Output format", - info="The audio format of the mixed output track.", - value=AudioExt.MP3, - choices=list(AudioExt), - ) - output_name: TextboxConfig = TextboxConfig( - label="Output name", - info="If no name is provided, a suitable name will be generated automatically.", - value=None, - placeholder="Ultimate RVC output", - exclude_value=True, - ) +class AudioExt(StrEnum): + """Enumeration of supported audio file formats.""" + MP3 = "mp3" + WAV = "wav" + FLAC = "flac" + OGG = "ogg" -class SongGenerationConfig(GenerationConfig): - """ - Common component configuration settings for song generation tabs. - Attributes - ---------- - source_type : DropdownConfig - Configuration settings for a source type dropdown component. - source : TextboxConfig - Configuration settings for an input source textbox component. - cached_song : DropdownConfig - Configuration settings for a cached song dropdown component. - clean_strength : SliderConfig - Configuration settings for a clean strength slider component. - clean_voice : CheckboxConfig - Configuration settings for a clean voice checkbox component. - room_size : SliderConfig - Configuration settings for a room size slider component. - wet_level : SliderConfig - Configuration settings for a wetness level slider component. 
- dry_level : SliderConfig - Configuration settings for a dryness level slider component. - damping : SliderConfig - Configuration settings for a damping level slider component. - main_gain : SliderConfig - Configuration settings for a main gain slider component. - inst_gain : SliderConfig - Configuration settings for an instrumentals gain slider - component. - backup_gain : SliderConfig - Configuration settings for a backup vocals gain slider - component. +class DeviceType(StrEnum): + """Enumeration of device types for training voice models.""" - See Also - -------- - GenerationConfig - Parent model defining common component configuration settings - for song generation tabs. + AUTOMATIC = "Automatic" + CPU = "CPU" + GPU = "GPU" - """ - source_type: DropdownConfig = DropdownConfig( - label="Source type", - info="The type of source to retrieve a song from.", - value=SongSourceType.LOCAL_FILE, - choices=list(SongSourceType), - type="index", - exclude_value=True, - ) - source: TextboxConfig = TextboxConfig( - label="Source", - info="Link to a song on YouTube or the full path of a local audio file.", - value=None, - exclude_value=True, - ) - cached_song: DropdownConfig = DropdownConfig( - label="Source", - info="Select a song from the list of cached songs.", - value=None, - visible=False, - render=False, - exclude_value=True, - ) - clean_voice: CheckboxConfig = CheckboxConfig( - label="Clean converted voice", - info=( - "Whether to clean the converted voice using noise reduction" - " algorithms.

" - ), - value=False, - exclude_value=True, - ) - clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False) - room_size: SliderConfig = SliderConfig( - label="Room size", - info=( - "Size of the room which reverb effect simulates. Increase for longer reverb" - " time." - ), - value=0.15, - minimum=0.0, - maximum=1.0, - ) - wet_level: SliderConfig = SliderConfig( - label="Wetness level", - info="Loudness of converted vocals with reverb effect applied.", - value=0.2, - minimum=0.0, - maximum=1.0, - ) - dry_level: SliderConfig = SliderConfig( - label="Dryness level", - info="Loudness of converted vocals without reverb effect applied.", - value=0.8, - minimum=0.0, - maximum=1.0, - ) - damping: SliderConfig = SliderConfig( - label="Damping level", - info="Absorption of high frequencies in reverb effect.", - value=0.7, - minimum=0.0, - maximum=1.0, - ) - main_gain: SliderConfig = SliderConfig.gain( - label="Main gain", - info="The gain to apply to the main vocals.", - ) - inst_gain: SliderConfig = SliderConfig.gain( - label="Instrumentals gain", - info="The gain to apply to the instrumentals.", - ) - backup_gain: SliderConfig = SliderConfig.gain( - label="Backup gain", - info="The gain to apply to the backup vocals.", - ) +class TrainingSampleRate(StrEnum): + """Enumeration of sample rates for training voice models.""" + HZ_32K = "32000" + HZ_40K = "40000" + HZ_48K = "48000" -class SpeechGenerationConfig(GenerationConfig): - """ - Common component configuration settings for speech generation tabs. - Attributes - ---------- - source_type : DropdownConfig - Configuration settings for a source type dropdown component. - source : TextboxConfig - Configuration settings for an input source textbox component. - edge_tts_voice : DropdownConfig - Configuration settings for an Edge TTS voice dropdown - component. - n_octaves : SliderConfig - Configuration settings for an octave pitch shift slider - component. 
- n_semitones : SliderConfig - Configuration settings for a semitone pitch shift slider - component. - tts_pitch_shift : SliderConfig - Configuration settings for a TTS pitch shift slider - component. - tts_speed_change : SliderConfig - Configuration settings for a TTS speed change slider - component. - tts_volume_change : SliderConfig - Configuration settings for a TTS volume change slider - component. - clean_voice : CheckboxConfig - Configuration settings for a clean voice checkbox - component. - clean_strength : SliderConfig - Configuration settings for a clean strength slider - component. - output_gain : GainSliderConfig - Configuration settings for an output gain slider component. +class PretrainedSampleRate(StrEnum): + """Enumeration of valid sample rates for pretrained models.""" - See Also - -------- - GenerationConfig - Parent model defining common component configuration settings - for generation tabs. + HZ_32K = "32k" + HZ_40K = "40k" + HZ_44K = "44k" + HZ_48K = "48k" - """ - source_type: DropdownConfig = DropdownConfig( - label="Source type", - info="The type of source to generate speech from.", - value=SpeechSourceType.TEXT, - choices=list(SpeechSourceType), - type="index", - exclude_value=True, - ) - source: TextboxConfig = TextboxConfig( - label="Source", - info="Text to generate speech from", - value=None, - exclude_value=True, - ) - edge_tts_voice: DropdownConfig = DropdownConfig( - label="Edge TTS voice", - info="Select a voice to use for text to speech conversion.", - value=None, - render=False, - exclude_value=True, - ) - n_octaves: SliderConfig = SliderConfig.octave_shift( - label="Octave shift", - info=( - "The number of octaves to pitch-shift the converted speech by. Use 1 for" - " male-to-female and -1 for vice-versa." 
- ), - ) - n_semitones: SliderConfig = SliderConfig.semitone_shift( - label="Semitone shift", - info="The number of semi-tones to pitch-shift the converted speech by.", - ) - tts_pitch_shift: SliderConfig = SliderConfig( - label="Edge TTS pitch shift", - info=( - "The number of hertz to shift the pitch of the speech generated by Edge" - " TTS." - ), - value=0, - minimum=-100, - maximum=100, - step=1, - ) - tts_speed_change: SliderConfig = SliderConfig( - label="TTS speed change", - info="The percentual change to the speed of the speech generated by Edge TTS.", - value=0, - minimum=-50, - maximum=100, - step=1, - ) - tts_volume_change: SliderConfig = SliderConfig( - label="TTS volume change", - info="The percentual change to the volume of the speech generated by Edge TTS.", - value=0, - minimum=-100, - maximum=100, - step=1, - ) - clean_voice: CheckboxConfig = CheckboxConfig( - label="Clean converted voice", - info=( - "Whether to clean the converted voice using noise reduction" - " algorithms.

" - ), - value=True, - exclude_value=True, - ) - clean_strength: SliderConfig = SliderConfig.clean_strength(visible=True) - output_gain: SliderConfig = SliderConfig.gain( - label="Output gain", - info="The gain to apply to the converted speech.

", - ) +class TrainingF0Method(StrEnum): + """Enumeration of pitch extraction methods for training.""" + RMVPE = "rmvpe" + CREPE = "crepe" + CREPE_TINY = "crepe-tiny" -class TrainingConfig(BaseTabConfig): - """ - Common component configuration settings for training tabs. - Attributes - ---------- - dataset_type : DropdownConfig - Configuration settings for a dataset type dropdown component. - dataset : DropdownConfig - Configuration settings for a dataset dropdown component. - dataset_name : TextboxConfig - Configuration settings for a dataset name textbox component. - preprocess_model : DropdownConfig - Configuration settings for a model name dropdown component - for audio preprocessing. - sample_rate : DropdownConfig - Configuration settings for a sample rate dropdown component. - filter_audio : CheckboxConfig - Configuration settings for a filter audio checkbox component. - clean_audio : CheckboxConfig - Configuration settings for a clean audio checkbox component. - clean_strength : SliderConfig - Configuration settings for a clean strength slider component. - split_method : DropdownConfig - Configuration settings for an audio splitting method dropdown - component. - chunk_len : SliderConfig - Configuration settings for a chunk length slider component. - overlap_len : SliderConfig - Configuration settings for an overlap length slider component. - preprocess_cores : SliderConfig - Configuration settings for a CPU cores slider component for - preprocessing. - extract_model : DropdownConfig - Configuration settings for a model name dropdown component for - feature extraction. - f0_method : DropdownConfig - Configuration settings for an F0 method dropdown component. - hop_length : SliderConfig - Configuration settings for a hop length slider component. - include_mutes : SliderConfig - Configuration settings for an include mutes slider component. - extract_cores : SliderConfig - Configuration settings for a CPU cores slider component for - feature extraction. 
- extraction_acceleration : HardwareAccelerationConfig - Configuration settings for a hardware acceleration component for - feature extraction. - extraction_gpus : DropdownConfig - Configuration settings for a GPU dropdown compoennt for feature - extraction. - train_model : DropdownConfig - Configuration settings for a model name dropdown component for - training. - num_epochs : SliderConfig - Configuration settings for a number of epochs slider component. - batch_size : SliderConfig - Configuration settings for a batch size slider component. - detect_overtraining : CheckboxConfig - Configuration settings for a detect overtraining checkbox - component. - overtraining_threshold : SliderConfig - Configuration settings for an overtraining threshold slider - component. - vocoder : DropdownConfig - Configuration settings for a vocoder dropdown component. - index_algorithm : DropdownConfig - Configuration settings for an index algorithm dropdown - component. - pretrained_type : DropdownConfig - Configuration settings for a pretrained model type dropdown - component. - custom_pretrained_model : DropdownConfig - Configuration settings for a custom pretrained model dropdown - component. - save_interval : SliderConfig - Configuration settings for a save-interval slider component. - save_all_checkpoints : CheckboxConfig - Configuration settings for a save-all-checkpoints checkbox - component. - save_all_weights : CheckboxConfig - Configuration settings for a save-all-weights checkbox - component. - clear_saved_data : CheckboxConfig - Configuration settings for a clear-saved-data checkbox - component. - upload_model : CheckboxConfig - Configuration settings for an upload voice model checkbox - component. - upload_name : TextboxConfig - Configuration settings for an upload name textbox component. - training_acceleration : HardwareAccelerationConfig - Configuration settings for a hardware acceleration component for - training. 
- training_gpus : DropdownConfig - Configuration settings for a GPU dropdown component for - training. - preload_dataset : CheckboxConfig - Configuration settings for a preload dataset checkbox component. - reduce_memory_usage : CheckboxConfig - Configuration settings for a reduce-memory-usage checkbox - component. - - See Also - -------- - BaseTabConfig - Parent model defining common component configuration settings - for UI tabs. - - """ - - dataset_type: DropdownConfig = DropdownConfig( - label="Dataset type", - info="Select the type of dataset to preprocess.", - value=DatasetType.NEW_DATASET, - choices=list(DatasetType), - exclude_value=True, - ) - dataset: DropdownConfig = DropdownConfig( - label="Dataset path", - info=( - "The path to an existing dataset. Either select a path to a previously" - " created dataset or provide a path to an external dataset." - ), - value=None, - allow_custom_value=True, - visible=False, - render=False, - exclude_value=True, - ) - dataset_name: TextboxConfig = TextboxConfig( - label="Dataset name", - info=( - "The name of the new dataset. If the dataset already exists, the provided" - " audio files will be added to it." - ), - value="My dataset", - exclude_value=True, - ) - preprocess_model: DropdownConfig = DropdownConfig( - label="Model name", - info=( - "Name of the model to preprocess the given dataset for. Either select an" - " existing model from the dropdown or provide the name of a new model." - ), - value="My model", - allow_custom_value=True, - render=False, - exclude_value=True, - ) - sample_rate: DropdownConfig = DropdownConfig( - label="Sample rate", - info="Target sample rate for the audio files in the provided dataset.", - value=TrainingSampleRate.HZ_40K, - choices=list(TrainingSampleRate), - ) - filter_audio: CheckboxConfig = CheckboxConfig( - label="Filter audio", - info=( - "Whether to remove low-frequency sounds from the audio files in the" - " provided dataset by applying a high-pass butterworth filter.

" - ), - value=True, - ) - clean_audio: CheckboxConfig = CheckboxConfig( - label="Clean audio", - info=( - "Whether to clean the audio files in the provided dataset using noise" - " reduction algorithms.


" - ), - value=False, - exclude_value=True, - ) - clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False) - split_method: DropdownConfig = DropdownConfig( - label="Audio splitting method", - info=( - "The method to use for splitting the audio files in the provided dataset." - " Use the `Skip` method to skip splitting if the audio files are already" - " split. Use the `Simple` method if excessive silence has already been" - " removed from the audio files. Use the `Automatic` method for automatic" - " silence detection and splitting around it." - ), - value=AudioSplitMethod.AUTOMATIC, - choices=list(AudioSplitMethod), - exclude_value=True, - ) - chunk_len: SliderConfig = SliderConfig( - label="Chunk length", - info="Length of split audio chunks.", - value=3.0, - minimum=0.5, - maximum=5.0, - step=0.1, - visible=False, - ) - overlap_len: SliderConfig = SliderConfig( - label="Overlap length", - info="Length of overlap between split audio chunks.", - value=0.3, - minimum=0.0, - maximum=0.4, - step=0.1, - visible=False, - ) - preprocess_cores: SliderConfig = SliderConfig.cpu_cores() - - extract_model: DropdownConfig = DropdownConfig( - label="Model name", - info=( - "Name of the model with an associated preprocessed dataset to extract" - " training features from. When a new dataset is preprocessed, its" - " associated model is selected by default." - ), - value=None, - render=False, - exclude_value=True, - ) - f0_method: DropdownConfig = DropdownConfig( - label="F0 method", - info="The method to use for extracting pitch features.", - value=TrainingF0Method.RMVPE, - choices=list(TrainingF0Method), - exclude_value=True, - ) - - hop_length: SliderConfig = SliderConfig.hop_length( - label="Hop length", - info="The hop length to use for extracting pitch features.

", - visible=False, - ) - include_mutes: SliderConfig = SliderConfig( - label="Include mutes", - info=( - "The number of mute audio files to include in the generated training file" - " list. Adding silent files enables the training model to handle pure" - " silence in inferred audio files. If the preprocessed audio dataset" - " already contains segments of pure silence, set this to 0." - ), - value=0, - minimum=0, - maximum=10, - step=1, - ) - extraction_cores: SliderConfig = SliderConfig.cpu_cores() - extraction_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration() - extraction_gpus: DropdownConfig = DropdownConfig.gpu() - - train_model: DropdownConfig = DropdownConfig( - label="Model name", - info=( - "Name of the model to train. When training features are extracted for a new" - " model, its name is selected by default." - ), - value=None, - render=False, - exclude_value=True, - ) - num_epochs: SliderConfig = SliderConfig( - label="Number of epochs", - info=( - "The number of epochs to train the voice model. A higher number can improve" - " voice model performance but may lead to overtraining." - ), - value=500, - minimum=1, - maximum=5000, - step=1, - ) - batch_size: SliderConfig = SliderConfig( - label="Batch size", - info=( - "The number of samples in each training batch. It is advisable to align" - " this value with the available VRAM of your GPU." - ), - value=16, - minimum=1, - maximum=128, - step=1, - ) - detect_overtraining: CheckboxConfig = CheckboxConfig( - label="Detect overtraining", - info=( - "Whether to detect overtraining to prevent the voice model from learning" - " the training data too well and losing the ability to generalize to new" - " data." - ), - value=True, - exclude_value=True, - ) - overtraining_threshold: SliderConfig = SliderConfig( - label="Overtraining threshold", - info=( - "The maximum number of epochs to continue training without any observed" - " improvement in voice model performance." 
- ), - value=500, - minimum=1, - maximum=1000, - visible=False, - ) - vocoder: DropdownConfig = DropdownConfig( - label="Vocoder", - info=( - "The vocoder to use for audio synthesis during training. HiFi-GAN provides" - " basic audio fidelity, while RefineGAN provides the highest audio" - " fidelity." - ), - value=Vocoder.HIFI_GAN, - choices=list(Vocoder), - ) - index_algorithm: DropdownConfig = DropdownConfig( - label="Index algorithm", - info=( - "The method to use for generating an index file for the trained voice" - " model. `KMeans` is particularly useful for large datasets." - ), - value=IndexAlgorithm.AUTO, - choices=list(IndexAlgorithm), - ) - pretrained_type: DropdownConfig = DropdownConfig( - label="Pretrained model type", - info=( - "The type of pretrained model to finetune the voice model on. `None` will" - " train the voice model from scratch, while `Default` will use a pretrained" - " model tailored to the specific voice model architecture. `Custom` will" - " use a custom pretrained that you provide." - ), - value=PretrainedType.DEFAULT, - choices=list(PretrainedType), - exclude_value=True, - ) - custom_pretrained_model: DropdownConfig = DropdownConfig( - label="Custom pretrained model", - info="Select a custom pretrained model to finetune from the dropdown.", - value=None, - visible=False, - render=False, - exclude_value=True, - ) - save_interval: SliderConfig = SliderConfig( - label="Save interval", - info=( - "The epoch interval at which to to save voice model weights and" - " checkpoints. The best model weights are always saved regardless of this" - " setting." - ), - value=10, - minimum=1, - maximum=100, - step=1, - ) - save_all_checkpoints: CheckboxConfig = CheckboxConfig( - label="Save all checkpoints", - info=( - "Whether to save a unique checkpoint at each save interval. If not enabled," - " only the latest checkpoint will be saved at each interval." 
- ), - value=True, - ) - save_all_weights: CheckboxConfig = CheckboxConfig( - label="Save all weights", - info=( - "Whether to save unique voice model weights at each save interval. If not" - " enabled, only the best voice model weights will be saved." - ), - value=True, - ) - clear_saved_data: CheckboxConfig = CheckboxConfig( - label="Clear saved data", - info=( - "Whether to delete any existing training data associated with the voice" - " model before training commences. Enable this setting only if you are" - " training a new voice model from scratch or restarting training." - ), - value=False, - ) - upload_model: CheckboxConfig = CheckboxConfig( - label="Upload voice model", - info=( - "Whether to automatically upload the trained voice model so that it can be" - " used for generation tasks within the Ultimate RVC app." - ), - value=False, - exclude_value=True, - ) - upload_name: TextboxConfig = TextboxConfig( - label="Upload name", - info="The name to give the uploaded voice model.", - value=None, - visible=False, - exclude_value=True, - ) - training_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration() - training_gpus: DropdownConfig = DropdownConfig.gpu() - preload_dataset: CheckboxConfig = CheckboxConfig( - label="Preload dataset", - info=( - "Whether to preload all training data into GPU memory. This can improve" - " training speed but requires a lot of VRAM.

" - ), - value=True, - ) - reduce_memory_usage: CheckboxConfig = CheckboxConfig( - label="Reduce memory usage", - info=( - "Whether to reduce VRAM usage at the cost of slower training speed by" - " enabling activation checkpointing. This is useful for GPUs with limited" - " memory (e.g., <6GB VRAM) or when training with a batch size larger than" - " what your GPU can normally accommodate." - ), - value=False, - ) +class AudioSplitMethod(StrEnum): + """ + Enumeration of methods to use for splitting audio files during + dataset preprocessing. + """ + SKIP = "Skip" + SIMPLE = "Simple" + AUTOMATIC = "Automatic" -class SegmentSize(IntEnum): - """Enumeration of segment sizes for audio separation.""" +class Vocoder(StrEnum): + """Enumeration of vocoders for training voice models.""" - SEG_64 = 64 - SEG_128 = 128 - SEG_256 = 256 - SEG_512 = 512 - SEG_1024 = 1024 - SEG_2048 = 2048 - SEG_4096 = 4096 + HIFI_GAN = "HiFi-GAN" + MRF_HIFI_GAN = "MRF HiFi-GAN" + REFINE_GAN = "RefineGAN" -class F0Method(StrEnum): - """Enumeration of pitch extraction methods.""" +class IndexAlgorithm(StrEnum): + """Enumeration of indexing algorithms for training voice models.""" - RMVPE = "rmvpe" - CREPE = "crepe" - CREPE_TINY = "crepe-tiny" - FCPE = "fcpe" + AUTO = "Auto" + FAISS = "Faiss" + KMEANS = "KMeans" +class PretrainedType(StrEnum): + """ + Enumeration of the possible types of pretrained models to finetune + voice models on. 
+ """ -class RVCContentType(StrEnum): - """Enumeration of valid content to convert with RVC.""" + NONE = "None" + DEFAULT = "Default" + CUSTOM = "Custom" - VOCALS = "vocals" - VOICE = "voice" - SPEECH = "speech" - AUDIO = "audio" -class SampleRate(IntEnum): - """Enumeration of supported audio sample rates.""" - HZ_16000 = 16000 - HZ_44100 = 44100 - HZ_48000 = 48000 - HZ_96000 = 96000 - HZ_192000 = 192000 +class ConcurrencyId(StrEnum): + """Enumeration of possible concurrency identifiers.""" + GPU = auto() -class AudioExt(StrEnum): - """Enumeration of supported audio file formats.""" - MP3 = "mp3" - WAV = "wav" - FLAC = "flac" - OGG = "ogg" +class SongSourceType(StrEnum): + """The type of source providing the song to generate a cover of.""" + LOCAL_FILE = "Local file" + CACHED_SONG = "Cached song" + + +class SpeechSourceType(StrEnum): + """The type of source providing the text to generate speech from.""" + + TEXT = "Text" + LOCAL_FILE = "Local file" + + +class SongTransferOption(StrEnum): + """Enumeration of possible song transfer options.""" + + STEP_1_AUDIO = "Step 1: stem splitting" + STEP_2_VOCALS = "Step 2: vocal conversion" + STEP_3_VOCALS = "Step 3: vocal effect" + STEP_4_INSTRUMENTALS = "Step 4: instrumentals" + STEP_4_BACKUP_VOCALS = "Step 4: backup vocals" + STEP_5_MAIN_VOCALS = "Step 5: main vocals" + STEP_5_INSTRUMENTALS = "Step 5: instrumentals" + STEP_5_BACKUP_VOCALS = "Step 5: backup vocals" + + +class SpeechTransferOption(StrEnum): + """Enumeration of possible speech transfer options.""" + + STEP_2_SPEECH = "Step 2: vocal conversion" + STEP_3_SPEECH = "Step 3: vocal effect" + + +class ComponentVisibilityKwArgs(TypedDict, total=False): + """ + Keyword arguments for setting component visibility. + + Attributes + ---------- + visible : bool + Whether the component should be visible. + value : Any + The value of the component. 
+ + """ + + visible: bool + value: Any + + +class UpdateDropdownKwArgs(TypedDict, total=False): + """ + Keyword arguments for updating a dropdown component. + + Attributes + ---------- + choices : DropdownChoices + The updated choices for the dropdown component. + value : DropdownValue + The updated value for the dropdown component. + + """ + + choices: DropdownChoices + value: DropdownValue + + +class TextBoxKwArgs(TypedDict, total=False): + """ + Keyword arguments for updating a textbox component. + + Attributes + ---------- + value : str | None + The updated value for the textbox component. + placeholder : str | None + The updated placeholder for the textbox component. + + """ + + value: str | None + placeholder: str | None + + +class UpdateAudioKwArgs(TypedDict, total=False): + """ + Keyword arguments for updating an audio component. + + Attributes + ---------- + value : str | None + The updated value for the audio component. + + """ + + value: str | None + + +class DatasetType(StrEnum): + """The type of dataset to train a voice model.""" + + NEW_DATASET = "New dataset" + EXISTING_DATASET = "Existing dataset" + + +embedders_list = [ + ("embedders/contentvec/", ["pytorch_model.bin", "config.json"]), + ("embedders/custom/Crusty/", ["model.safetensors", "config.json"]), +] + + +class EmbedderModel(StrEnum): + """Enumeration of audio embedding models.""" + + CONTENTVEC = "contentvec" + CRUSTY = "Crusty" + CUSTOM = "custom" + + + + + +class SeparationModel(StrEnum): + """Enumeration of audio separation models.""" + + UVR_MDX_NET_VOC_FT = "UVR-MDX-NET-Voc_FT.onnx" + UVR_MDX_NET_KARA_2 = "UVR_MDXNET_KARA_2.onnx" + REVERB_HQ_BY_FOXJOY = "Reverb_HQ_By_FoxJoy.onnx" + UVR_MDX_NET_Inst_HQ_3 = "UVR-MDX-NET-Inst_HQ_3.onnx" + UVR_MDX_NET_Voc_FT = "UVR-MDX-NET-Voc_FT.onnx" + Kim_Vocal_1 = "Kim_Vocal_1.onnx" + Kim_Vocal_2 = "Kim_Vocal_2.onnx" + Kim_Inst = "Kim_Inst.onnx" + UVR_MDX_NET_Inst_HQ_4 = "UVR-MDX-NET-Inst_HQ_4.onnx" + kuielab_a_vocals = "kuielab_a_vocals.onnx" + 
kuielab_b_vocals = "kuielab_b_vocals.onnx" + kuielab_a_drums = "kuielab_a_drums.onnx" + kuielab_b_drums = "kuielab_b_drums.onnx" + kuielab_a_bass = "kuielab_a_bass.onnx" + kuielab_b_bass = "kuielab_b_bass.onnx" + kuielab_a_other = "kuielab_a_other.onnx" + kuielab_b_other = "kuielab_b_other.onnx" + MDX23C_8KFFT_InstVoc_HQ_2 = "MDX23C-8KFFT-InstVoc_HQ_2.ckpt" + UVR_DeNoise = "UVR-DeNoise.pth" + UVR_DeEcho_DeReverb = "UVR-DeEcho-DeReverb.pth" + +class SeparationModel2(StrEnum): + """Enumeration of audio separation models.""" + + UVR_MDX_NET_VOC_FT = "UVR-MDX-NET-Voc_FT.onnx" + UVR_MDX_NET_KARA_2 = "UVR_MDXNET_KARA_2.onnx" + REVERB_HQ_BY_FOXJOY = "Reverb_HQ_By_FoxJoy.onnx" + UVR_MDX_NET_Inst_HQ_3 = "UVR-MDX-NET-Inst_HQ_3.onnx" + UVR_MDX_NET_Voc_FT = "UVR-MDX-NET-Voc_FT.onnx" + Kim_Vocal_1 = "Kim_Vocal_1.onnx" + Kim_Vocal_2 = "Kim_Vocal_2.onnx" + Kim_Inst = "Kim_Inst.onnx" + UVR_MDX_NET_Inst_HQ_4 = "UVR-MDX-NET-Inst_HQ_4.onnx" + kuielab_a_vocals = "kuielab_a_vocals.onnx" + kuielab_b_vocals = "kuielab_b_vocals.onnx" + kuielab_a_drums = "kuielab_a_drums.onnx" + kuielab_b_drums = "kuielab_b_drums.onnx" + kuielab_a_bass = "kuielab_a_bass.onnx" + kuielab_b_bass = "kuielab_b_bass.onnx" + kuielab_a_other = "kuielab_a_other.onnx" + kuielab_b_other = "kuielab_b_other.onnx" + MDX23C_8KFFT_InstVoc_HQ_2 = "MDX23C-8KFFT-InstVoc_HQ_2.ckpt" + UVR_DeNoise = "UVR-DeNoise.pth" + UVR_DeEcho_DeReverb = "UVR-DeEcho-DeReverb.pth" + + + + + +now_dir = os.getcwd() + +sys.path.append(now_dir) +models_dir = "models" + +dump_path = os.path.join(now_dir, models_dir) + +repo_id = "lainlives/voice" + +hf_token = os.environ.get("HF_TOKEN") +snapshot_download(repo_id=repo_id, local_dir=dump_path, token=hf_token) + +#if __name__ == "__main__": +# start_app(share=False, ssr_mode = True) -class DeviceType(StrEnum): - """Enumeration of device types for training voice models.""" - AUTOMATIC = "Automatic" - CPU = "CPU" - GPU = "GPU" -class TrainingSampleRate(StrEnum): - """Enumeration of sample rates for 
training voice models.""" - HZ_32K = "32000" - HZ_40K = "40000" - HZ_48K = "48000" +config_name = "default" #os.environ.get("URVC_CONFIG") +cookiefile = os.environ.get("YT_COOKIEFILE") -class PretrainedSampleRate(StrEnum): - """Enumeration of valid sample rates for pretrained models.""" - HZ_32K = "32k" - HZ_40K = "40k" - HZ_44K = "44k" - HZ_48K = "48k" +""" +Module defining models for representing configuration settings for +UI tabs. +""" -class TrainingF0Method(StrEnum): - """Enumeration of pitch extraction methods for training.""" - RMVPE = "rmvpe" - CREPE = "crepe" - CREPE_TINY = "crepe-tiny" -class AudioSplitMethod(StrEnum): +class SongIntermediateAudioConfig(BaseModel): """ - Enumeration of methods to use for splitting audio files during - dataset preprocessing. + Configuration settings for intermediate audio components in the + one-click song generation tab. + + Attributes + ---------- + song : AudioConfig + Configuration settings for the input song audio component. + vocals : AudioConfig + Configuration settings for the vocals audio component. + instrumentals : AudioConfig + Configuration settings for the instrumentals audio component. + main_vocals : AudioConfig + Configuration settings for the main vocals audio component. + backup_vocals : AudioConfig + Configuration settings for the backup vocals audio component. + main_vocals_dereverbed : AudioConfig + Configuration settings for the main vocals de-reverbed audio + component. + main_vocals_reverb : AudioConfig + Configuration settings for the main vocals reverb audio + component. + converted_vocals : AudioConfig + Configuration settings for the converted vocals audio + component. + postprocessed_vocals : AudioConfig + Configuration settings for the postprocessed vocals audio + component. + instrumentals_shifted : AudioConfig + Configuration settings for the shifted instrumentals audio + component. 
+ backup_vocals_shifted : AudioConfig + Configuration settings for the shifted backup vocals audio + component. + all : list[gr.Audio] + List of instances of all intermediate audio components. + """ - SKIP = "Skip" - SIMPLE = "Simple" - AUTOMATIC = "Automatic" + song: AudioConfig = AudioConfig.intermediate(label="Song") + vocals: AudioConfig = AudioConfig.intermediate(label="Vocals") + instrumentals: AudioConfig = AudioConfig.intermediate( + label="Instrumentals", + ) + main_vocals: AudioConfig = AudioConfig.intermediate( + label="Main vocals", + ) + backup_vocals: AudioConfig = AudioConfig.intermediate( + label="Backup vocals", + ) + main_vocals_dereverbed: AudioConfig = AudioConfig.intermediate( + label="De-reverbed main vocals", + ) + main_vocals_reverb: AudioConfig = AudioConfig.intermediate( + label="Main vocals with reverb", + ) + converted_vocals: AudioConfig = AudioConfig.intermediate( + label="Converted vocals", + ) + postprocessed_vocals: AudioConfig = AudioConfig.intermediate( + label="Postprocessed vocals", + ) + instrumentals_shifted: AudioConfig = AudioConfig.intermediate( + label="Pitch-shifted instrumentals", + ) + backup_vocals_shifted: AudioConfig = AudioConfig.intermediate( + label="Pitch-shifted backup vocals", + ) + @property + def all(self) -> list[gr.Audio]: + """ + Retrieve instances of all intermediate audio components + in the one-click song generation tab. -class Vocoder(StrEnum): - """Enumeration of vocoders for training voice models.""" + Returns + ------- + list[gr.Audio] + List of instances of all intermediate audio components in + the one-click song generation tab. 
- HIFI_GAN = "HiFi-GAN" - MRF_HIFI_GAN = "MRF HiFi-GAN" - REFINE_GAN = "RefineGAN" + """ + # NOTE we are using self.__annotations__ to get the fields in + # the order they are defined in the class + return [getattr(self, field).instance for field in self.__annotations__] -class IndexAlgorithm(StrEnum): - """Enumeration of indexing algorithms for training voice models.""" +class OneClickSongGenerationConfig(SongGenerationConfig): + """ + Configuration settings for the one-click song generation tab. - AUTO = "Auto" - FAISS = "Faiss" - KMEANS = "KMeans" + Attributes + ---------- + n_octaves : SliderConfig + Configuration settings for an octave pitch shift slider + component. + n_semitones : SliderConfig + Configuration settings for a semitone pitch shift slider + component. + show_intermediate_audio : CheckboxConfig + Configuration settings for a show intermediate audio checkbox + component. + intermediate_audio : SongIntermediateAudioConfig + Configuration settings for intermediate audio components. + See Also + -------- + SongGenerationConfig + Parent model defining common component configuration settings + for song generation tabs. -class PretrainedType(StrEnum): - """ - Enumeration of the possible types of pretrained models to finetune - voice models on. """ - NONE = "None" - DEFAULT = "Default" - CUSTOM = "Custom" + n_octaves: SliderConfig = SliderConfig.octave_shift( + label="Vocal pitch shift", + info=( + "The number of octaves to shift the pitch of the converted vocals by. Use 1" + " for male-to-female and -1 for vice-versa." + ), + ) + n_semitones: SliderConfig = SliderConfig.semitone_shift( + label="Overall pitch shift", + info=( + "The number of semi-tones to shift the pitch of the converted vocals," + " instrumentals and backup vocals by." 
+ ), + ) + show_intermediate_audio: CheckboxConfig = CheckboxConfig( + label="Show intermediate audio", + info="Show intermediate audio tracks produced during song cover generation.", + value=False, + exclude_value=True, + ) + intermediate_audio: SongIntermediateAudioConfig = SongIntermediateAudioConfig() +class SongInputAudioConfig(BaseModel): + """ + Configuration settings for input audio components in the multi-step + song generation tab. -class ConcurrencyId(StrEnum): - """Enumeration of possible concurrency identifiers.""" + Attributes + ---------- + audio : AudioConfig + Configuration settings for the input audio component. + vocals : AudioConfig + Configuration settings for the vocals audio component. + converted_vocals : AudioConfig + Configuration settings for the converted vocals audio + component. + instrumentals : AudioConfig + Configuration settings for the instrumentals audio + component. + backup_vocals : AudioConfig + Configuration settings for the backup vocals audio + component. + main_vocals : AudioConfig + Configuration settings for the main vocals audio + component. + shifted_instrumentals : AudioConfig + Configuration settings for the shifted instrumentals audio + component. + shifted_backup_vocals : AudioConfig + Configuration settings for the shifted backup vocals audio + component. + all : list[AudioConfig] + List of configuration settings for all input audio + components in the multi-step song generation tab. 
- GPU = auto() + """ + audio: AudioConfig = AudioConfig.input(label="Audio") + vocals: AudioConfig = AudioConfig.input(label="Vocals") + converted_vocals: AudioConfig = AudioConfig.input(label="Vocals") + instrumentals: AudioConfig = AudioConfig.input(label="Instrumentals") + backup_vocals: AudioConfig = AudioConfig.input(label="Backup vocals") + main_vocals: AudioConfig = AudioConfig.input(label="Main vocals") + shifted_instrumentals: AudioConfig = AudioConfig.input(label="Instrumentals") + shifted_backup_vocals: AudioConfig = AudioConfig.input(label="Backup vocals") -class SongSourceType(StrEnum): - """The type of source providing the song to generate a cover of.""" + @property + def all(self) -> list[AudioConfig]: + """ + Retrieve configuration settings for all input audio components + in the multi-step song generation tab. - LOCAL_FILE = "Local file" - CACHED_SONG = "Cached song" + Returns + ------- + list[AudioConfig] + List of configuration settings for all input audio + components in the multi-step song generation tab. + """ + return [getattr(self, field) for field in self.__annotations__] -class SpeechSourceType(StrEnum): - """The type of source providing the text to generate speech from.""" - TEXT = "Text" - LOCAL_FILE = "Local file" +class SongDirsConfig(BaseModel): + """ + Configuration settings for song directory components in the + multi-step song generation tab. + Attributes + ---------- + separate_audio : DropdownConfig + Configuration settings for the song directory component + for separating audio. + convert_vocals : DropdownConfig + Configuration settings for the song directory component + for converting vocals. + postprocess_vocals : DropdownConfig + Configuration settings for the song directory component + for postprocessing vocals. + pitch_shift_background : DropdownConfig + Configuration settings for the song directory component + for pitch-shifting background audio. 
+ mix : DropdownConfig + Configuration settings for the song directory component + for mixing audio. + all : list[gr.Dropdown] + List of instances of all song directory components in the + multi-step song generation tab. -class SongTransferOption(StrEnum): - """Enumeration of possible song transfer options.""" + """ - STEP_1_AUDIO = "Step 1: stem splitting" - STEP_2_VOCALS = "Step 2: vocal conversion" - STEP_3_VOCALS = "Step 3: vocal effect" - STEP_4_INSTRUMENTALS = "Step 4: instrumentals" - STEP_4_BACKUP_VOCALS = "Step 4: backup vocals" - STEP_5_MAIN_VOCALS = "Step 5: main vocals" - STEP_5_INSTRUMENTALS = "Step 5: instrumentals" - STEP_5_BACKUP_VOCALS = "Step 5: backup vocals" + separate_audio: DropdownConfig = DropdownConfig.song_dir() + convert_vocals: DropdownConfig = DropdownConfig.song_dir() + postprocess_vocals: DropdownConfig = DropdownConfig.song_dir() + pitch_shift_background: DropdownConfig = DropdownConfig.song_dir() + mix: DropdownConfig = DropdownConfig.song_dir() + @property + def all(self) -> list[gr.Dropdown]: + """ + Retrieve instances of all song directory components in the + multi-step song generation tab. -class SpeechTransferOption(StrEnum): - """Enumeration of possible speech transfer options.""" + Returns + ------- + list[gr.Dropdown] + List of instances of all song directory components in + the multi-step song generation tab. - STEP_2_SPEECH = "Step 2: vocal conversion" - STEP_3_SPEECH = "Step 3: vocal effect" + """ + return [getattr(self, field).instance for field in self.__annotations__] -class ComponentVisibilityKwArgs(TypedDict, total=False): +class MultiStepSongGenerationConfig(SongGenerationConfig): """ - Keyword arguments for setting component visibility. + Configuration settings for multi-step song generation tab. Attributes ---------- - visible : bool - Whether the component should be visible. - value : Any - The value of the component. 
+ separation_model : DropdownConfig + Configuration settings for a separation model dropdown + component. + segment_size : RadioConfig + Configuration settings for a segment size radio component. + n_octaves : SliderConfig + Configuration settings for an octave pitch shift slider + component. + n_semitones : SliderConfig + Configuration settings for a semitone pitch shift slider + component. + n_semitones_instrumentals : SliderConfig + Configuration settings for an instrumentals pitch shift slider + component. + n_semitones_backup_vocals : SliderConfig + Configuration settings for a backup vocals pitch shift slider + component. + input_audio : SongInputAudioConfig + Configuration settings for input audio components. + song_dirs : SongDirsConfig + Configuration settings for song directory components. + + See Also + -------- + SongGenerationConfig + Parent model defining common component configuration settings + for song generation tabs. """ - visible: bool - value: Any + separation_model: DropdownConfig = DropdownConfig( + label="Separation model", + info="The model to use for audio separation.", + value=SeparationModel.UVR_MDX_NET_VOC_FT, + choices=list(SeparationModel2), + ) + segment_size: RadioConfig = RadioConfig( + label="Segment size", + info=( + "The size of the segments into which the audio is split. Using a larger" + " size consumes more resources, but may give better results." + ), + value=SegmentSize.SEG_2048, + choices=list(SegmentSize), + ) + n_octaves: SliderConfig = SliderConfig.octave_shift( + label="Pitch shift (octaves)", + info=( + "The number of octaves to pitch-shift the converted voice by. Use 1 for" + " male-to-female and -1 for vice-versa." + ), + ) + n_semitones: SliderConfig = SliderConfig.semitone_shift( + label="Pitch shift (semi-tones)", + info=( + "The number of semi-tones to pitch-shift the converted vocals by. Altering" + " this slightly reduces sound quality." 
+ ), + ) + n_semitones_instrumentals: SliderConfig = SliderConfig.semitone_shift( + label="Instrumental pitch shift", + info="The number of semi-tones to pitch-shift the instrumentals by.", + ) + n_semitones_backup_vocals: SliderConfig = SliderConfig.semitone_shift( + label="Backup vocal pitch shift", + info="The number of semi-tones to pitch-shift the backup vocals by.", + ) + input_audio: SongInputAudioConfig = SongInputAudioConfig() + song_dirs: SongDirsConfig = SongDirsConfig() -class UpdateDropdownKwArgs(TypedDict, total=False): +class SpeechIntermediateAudioConfig(BaseModel): """ - Keyword arguments for updating a dropdown component. + Configuration settings for intermediate audio components in the + one-click speech generation tab. Attributes ---------- - choices : DropdownChoices - The updated choices for the dropdown component. - value : DropdownValue - The updated value for the dropdown component. + speech : AudioConfig + Configuration settings for the input speech audio component. + converted_speech : AudioConfig + Configuration settings for the converted speech audio component. + all : list[gr.Audio] + List of instances of all intermediate audio components in the + speech generation tab. """ - choices: DropdownChoices - value: DropdownValue - - -class TextBoxKwArgs(TypedDict, total=False): - """ - Keyword arguments for updating a textbox component. + speech: AudioConfig = AudioConfig.intermediate(label="Speech") + converted_speech: AudioConfig = AudioConfig.intermediate(label="Converted speech") - Attributes - ---------- - value : str | None - The updated value for the textbox component. - placeholder : str | None - The updated placeholder for the textbox component. + @property + def all(self) -> list[gr.Audio]: + """ + Retrieve instances of all intermediate audio components in the + speech generation tab. - """ + Returns + ------- + list[gr.Audio] + List of instances of all intermediate audio components in + the speech generation tab. 
- value: str | None - placeholder: str | None + """ + return [getattr(self, field).instance for field in self.__annotations__] -class UpdateAudioKwArgs(TypedDict, total=False): +class OneClickSpeechGenerationConfig(SpeechGenerationConfig): """ - Keyword arguments for updating an audio component. + Configuration settings for one-click speech generation tab. Attributes ---------- - value : str | None - The updated value for the audio component. - - """ - - value: str | None - - -class DatasetType(StrEnum): - """The type of dataset to train a voice model.""" - - NEW_DATASET = "New dataset" - EXISTING_DATASET = "Existing dataset" + intermediate_audio : SpeechIntermediateAudioConfig + Configuration settings for intermediate audio components. + show_intermediate_audio : CheckboxConfig + Configuration settings for a show intermediate audio checkbox + component. + See Also + -------- + SpeechGenerationConfig + Parent model defining common component configuration settings + for speech generation tabs. 
-embedders_list = [ - ("embedders/contentvec/", ["pytorch_model.bin", "config.json"]), - ("embedders/custom/Crusty/", ["model.safetensors", "config.json"]), -] + """ + intermediate_audio: SpeechIntermediateAudioConfig = SpeechIntermediateAudioConfig() -class EmbedderModel(StrEnum): - """Enumeration of audio embedding models.""" + show_intermediate_audio: CheckboxConfig = CheckboxConfig( + label="Show intermediate audio", + info="Show intermediate audio tracks produced during speech generation.", + value=False, + exclude_value=True, + ) - CONTENTVEC = "contentvec" - CRUSTY = "Crusty" - CUSTOM = "custom" - -class SeparationModel(StrEnum): - """Enumeration of audio separation models.""" - UVR_MDX_NET_VOC_FT = "UVR-MDX-NET-Voc_FT.onnx" - UVR_MDX_NET_KARA_2 = "UVR_MDXNET_KARA_2.onnx" - REVERB_HQ_BY_FOXJOY = "Reverb_HQ_By_FoxJoy.onnx" - UVR_MDX_NET_Inst_HQ_3 = "UVR-MDX-NET-Inst_HQ_3.onnx" - UVR_MDX_NET_Voc_FT = "UVR-MDX-NET-Voc_FT.onnx" - Kim_Vocal_1 = "Kim_Vocal_1.onnx" - Kim_Vocal_2 = "Kim_Vocal_2.onnx" - Kim_Inst = "Kim_Inst.onnx" - UVR_MDX_NET_Inst_HQ_4 = "UVR-MDX-NET-Inst_HQ_4.onnx" - kuielab_a_vocals = "kuielab_a_vocals.onnx" - kuielab_b_vocals = "kuielab_b_vocals.onnx" - kuielab_a_drums = "kuielab_a_drums.onnx" - kuielab_b_drums = "kuielab_b_drums.onnx" - kuielab_a_bass = "kuielab_a_bass.onnx" - kuielab_b_bass = "kuielab_b_bass.onnx" - kuielab_a_other = "kuielab_a_other.onnx" - kuielab_b_other = "kuielab_b_other.onnx" - MDX23C_8KFFT_InstVoc_HQ_2 = "MDX23C-8KFFT-InstVoc_HQ_2.ckpt" - UVR_DeNoise = "UVR-DeNoise.pth" - UVR_DeEcho_DeReverb = "UVR-DeEcho-DeReverb.pth" - -class SeparationModel2(StrEnum): - """Enumeration of audio separation models.""" +class SpeechInputAudioConfig(BaseModel): + """ + Configuration settings for input audio components in the multi-step + speech generation tab. 
- UVR_MDX_NET_VOC_FT = "UVR-MDX-NET-Voc_FT.onnx" - UVR_MDX_NET_KARA_2 = "UVR_MDXNET_KARA_2.onnx" - REVERB_HQ_BY_FOXJOY = "Reverb_HQ_By_FoxJoy.onnx" - UVR_MDX_NET_Inst_HQ_3 = "UVR-MDX-NET-Inst_HQ_3.onnx" - UVR_MDX_NET_Voc_FT = "UVR-MDX-NET-Voc_FT.onnx" - Kim_Vocal_1 = "Kim_Vocal_1.onnx" - Kim_Vocal_2 = "Kim_Vocal_2.onnx" - Kim_Inst = "Kim_Inst.onnx" - UVR_MDX_NET_Inst_HQ_4 = "UVR-MDX-NET-Inst_HQ_4.onnx" - kuielab_a_vocals = "kuielab_a_vocals.onnx" - kuielab_b_vocals = "kuielab_b_vocals.onnx" - kuielab_a_drums = "kuielab_a_drums.onnx" - kuielab_b_drums = "kuielab_b_drums.onnx" - kuielab_a_bass = "kuielab_a_bass.onnx" - kuielab_b_bass = "kuielab_b_bass.onnx" - kuielab_a_other = "kuielab_a_other.onnx" - kuielab_b_other = "kuielab_b_other.onnx" - MDX23C_8KFFT_InstVoc_HQ_2 = "MDX23C-8KFFT-InstVoc_HQ_2.ckpt" - UVR_DeNoise = "UVR-DeNoise.pth" - UVR_DeEcho_DeReverb = "UVR-DeEcho-DeReverb.pth" + Attributes + ---------- + speech : AudioConfig + Configuration settings for the input speech audio component. + converted_speech : AudioConfig + Configuration settings for the converted speech audio component. + all : list[AudioConfig] + List of configuration settings for all input audio components in + the multi-step speech generation tab. + """ + speech: AudioConfig = AudioConfig.input("Speech") + converted_speech: AudioConfig = AudioConfig.input("Converted speech") + @property + def all(self) -> list[AudioConfig]: + """ + Retrieve configuration settings for all input audio components + in the multi-step speech generation tab. -now_dir = os.getcwd() + Returns + ------- + list[AudioConfig] + List of configuration settings for all input audio + components in the multi-step speech generation tab. 
-sys.path.append(now_dir) -models_dir = "models" + """ + return [getattr(self, field) for field in self.__annotations__] -dump_path = os.path.join(now_dir, models_dir) -repo_id = "lainlives/voice" +class MultiStepSpeechGenerationConfig(SpeechGenerationConfig): + """ + Configuration settings for the multi-step speech generation tab. -hf_token = os.environ.get("HF_TOKEN") -snapshot_download(repo_id=repo_id, local_dir=dump_path, token=hf_token) + Attributes + ---------- + input_audio : SpeechInputAudioConfig + Configuration settings for input audio components. -#if __name__ == "__main__": -# start_app(share=False, ssr_mode = True) + See Also + -------- + SpeechGenerationConfig + Parent model defining common component configuration settings + for speech generation tabs. + """ + input_audio: SpeechInputAudioConfig = SpeechInputAudioConfig() +class MultiStepTrainingConfig(TrainingConfig): + """Configuration settings for multi-step training tab.""" -config_name = "default" #os.environ.get("URVC_CONFIG") -cookiefile = os.environ.get("YT_COOKIEFILE") +class ModelManagementConfig(BaseModel): + """ + Configuration settings for model management tab. + Attributes + ---------- + voices : DropdownConfig + Configuration settings for delete voice models dropdown + component. + embedders : DropdownConfig + Configuration settings for delete embedder models dropdown + component. + pretraineds : DropdownConfig + Configuration settings for delete pretrained models dropdown + component. + traineds : DropdownConfig + Configuration settings for delete training models dropdown + component. + dummy_checkbox : CheckboxConfig + Configuration settings for a dummy checkbox component. -""" -Module defining models for representing configuration settings for -UI tabs. 
-""" + """ + voices: DropdownConfig = DropdownConfig.multi_delete( + label="Voice models", + info="Select one or more voice models to delete.", + ) + embedders: DropdownConfig = DropdownConfig.multi_delete( + label="Custom embedder models", + info="Select one or more embedder models to delete.", + ) + pretraineds: DropdownConfig = DropdownConfig.multi_delete( + label="Custom pretrained models", + info="Select one or more pretrained models to delete.", + ) + traineds: DropdownConfig = DropdownConfig.multi_delete( + label="Training models", + info="Select one or more training models to delete.", + ) + dummy_checkbox: CheckboxConfig = CheckboxConfig( + value=False, + visible=False, + exclude_value=True, + ) -class SongIntermediateAudioConfig(BaseModel): +class AudioManagementConfig(BaseModel): """ - Configuration settings for intermediate audio components in the - one-click song generation tab. + Configuration settings for audio management tab. Attributes ---------- - song : AudioConfig - Configuration settings for the input song audio component. - vocals : AudioConfig - Configuration settings for the vocals audio component. - instrumentals : AudioConfig - Configuration settings for the instrumentals audio component. - main_vocals : AudioConfig - Configuration settings for the main vocals audio component. - backup_vocals : AudioConfig - Configuration settings for the backup vocals audio component. - main_vocals_dereverbed : AudioConfig - Configuration settings for the main vocals de-reverbed audio - component. - main_vocals_reverb : AudioConfig - Configuration settings for the main vocals reverb audio - component. - converted_vocals : AudioConfig - Configuration settings for the converted vocals audio - component. 
- postprocessed_vocals : AudioConfig - Configuration settings for the postprocessed vocals audio + intermediate : DropdownConfig + Configuration settings for delete intermediate audio files + dropdown component + speech : DropdownConfig + Configuration settings for delete speech audio files dropdown component. - instrumentals_shifted : AudioConfig - Configuration settings for the shifted instrumentals audio + output : DropdownConfig + Configuration settings for delete output audio files dropdown component. - backup_vocals_shifted : AudioConfig - Configuration settings for the shifted backup vocals audio + dataset : DropdownConfig + Configuration settings for delete dataset audio files dropdown component. - all : list[gr.Audio] - List of instances of all intermediate audio components. + dummy_checkbox : CheckboxConfig + Configuration settings for a dummy checkbox component. """ - song: AudioConfig = AudioConfig.intermediate(label="Song") - vocals: AudioConfig = AudioConfig.intermediate(label="Vocals") - instrumentals: AudioConfig = AudioConfig.intermediate( - label="Instrumentals", - ) - main_vocals: AudioConfig = AudioConfig.intermediate( - label="Main vocals", + intermediate: DropdownConfig = DropdownConfig.multi_delete( + label="Song directories", + info=( + "Select one or more song directories containing intermediate audio files to" + " delete." 
+ ), ) - backup_vocals: AudioConfig = AudioConfig.intermediate( - label="Backup vocals", + speech: DropdownConfig = DropdownConfig.multi_delete( + label="Speech audio files", + info="Select one or more speech audio files to delete.", ) - main_vocals_dereverbed: AudioConfig = AudioConfig.intermediate( - label="De-reverbed main vocals", + output: DropdownConfig = DropdownConfig.multi_delete( + label="Output audio files", + info="Select one or more output audio files to delete.", ) - main_vocals_reverb: AudioConfig = AudioConfig.intermediate( - label="Main vocals with reverb", + dataset: DropdownConfig = DropdownConfig.multi_delete( + label="Dataset audio files", + info="Select one or more datasets containing audio files to delete.", ) - converted_vocals: AudioConfig = AudioConfig.intermediate( - label="Converted vocals", + + dummy_checkbox: CheckboxConfig = CheckboxConfig( + value=False, + visible=False, + exclude_value=True, ) - postprocessed_vocals: AudioConfig = AudioConfig.intermediate( - label="Postprocessed vocals", + + +class SettingsManagementConfig(BaseModel): + """ + Configuration settings for settings management tab. + + Attributes + ---------- + dummy_checkbox : CheckboxConfig + Configuration settings for a dummy checkbox component. 
+ + """ + + load_config_name: DropdownConfig = DropdownConfig( + label="Configuration name", + info="The name of a configuration to load UI settings from", + value=None, + render=False, + exclude_value=True, ) - instrumentals_shifted: AudioConfig = AudioConfig.intermediate( - label="Pitch-shifted instrumentals", + delete_config_names: DropdownConfig = DropdownConfig.multi_delete( + label="Configuration names", + info="Select the name of one or more configurations to delete", ) - backup_vocals_shifted: AudioConfig = AudioConfig.intermediate( - label="Pitch-shifted backup vocals", + dummy_checkbox: CheckboxConfig = CheckboxConfig( + value=False, + visible=False, + exclude_value=True, ) - @property - def all(self) -> list[gr.Audio]: - """ - Retrieve instances of all intermediate audio components - in the one-click song generation tab. - Returns - ------- - list[gr.Audio] - List of instances of all intermediate audio components in - the one-click song generation tab. +class TotalSongGenerationConfig(BaseModel): + """ + All configuration settings for song generation tabs. - """ - # NOTE we are using self.__annotations__ to get the fields in - # the order they are defined in the class - return [getattr(self, field).instance for field in self.__annotations__] + Attributes + ---------- + one_click : OneClickSongGenerationConfig + Configuration settings for the one-click song generation tab. + multi_step : MultiStepSongGenerationConfig + Configuration settings for the multi-step song generation tab. + + """ + one_click: OneClickSongGenerationConfig = OneClickSongGenerationConfig() + multi_step: MultiStepSongGenerationConfig = MultiStepSongGenerationConfig() -class OneClickSongGenerationConfig(SongGenerationConfig): + +class TotalSpeechGenerationConfig(BaseModel): """ - Configuration settings for the one-click song generation tab. + All configuration settings for speech generation tabs. 
Attributes ---------- - n_octaves : SliderConfig - Configuration settings for an octave pitch shift slider - component. - n_semitones : SliderConfig - Configuration settings for a semitone pitch shift slider - component. - show_intermediate_audio : CheckboxConfig - Configuration settings for a show intermediate audio checkbox - component. - intermediate_audio : SongIntermediateAudioConfig - Configuration settings for intermediate audio components. + one_click : OneClickSpeechGenerationConfig + Configuration settings for the one-click speech generation tab. + multi_step : MultiStepSpeechGenerationConfig + Configuration settings for the multi-step speech generation tab. + + """ + + one_click: OneClickSpeechGenerationConfig = OneClickSpeechGenerationConfig() + multi_step: MultiStepSpeechGenerationConfig = MultiStepSpeechGenerationConfig() + + +class TotalTrainingConfig(BaseModel): + """ + All configuration settings for training tabs. + + Attributes + ---------- + training : TrainingConfig + Configuration settings for the multi-step training tab. - See Also - -------- - SongGenerationConfig - Parent model defining common component configuration settings - for song generation tabs. + """ + multi_step: MultiStepTrainingConfig = MultiStepTrainingConfig() + + +class TotalManagementConfig(BaseModel): """ + All configuration settings for management tabs. - n_octaves: SliderConfig = SliderConfig.octave_shift( - label="Vocal pitch shift", - info=( - "The number of octaves to shift the pitch of the converted vocals by. Use 1" - " for male-to-female and -1 for vice-versa." - ), - ) + Attributes + ---------- + model : ModelManagementConfig + Configuration settings for the model management tab. + audio : AudioManagementConfig + Configuration settings for the audio management tab. + settings : SettingsManagementConfig + Configuration settings for the settings management tab. 
- n_semitones: SliderConfig = SliderConfig.semitone_shift( - label="Overall pitch shift", - info=( - "The number of semi-tones to shift the pitch of the converted vocals," - " instrumentals and backup vocals by." - ), - ) - show_intermediate_audio: CheckboxConfig = CheckboxConfig( - label="Show intermediate audio", - info="Show intermediate audio tracks produced during song cover generation.", - value=False, - exclude_value=True, - ) - intermediate_audio: SongIntermediateAudioConfig = SongIntermediateAudioConfig() + """ + + model: ModelManagementConfig = ModelManagementConfig() + audio: AudioManagementConfig = AudioManagementConfig() + settings: SettingsManagementConfig = SettingsManagementConfig() -class SongInputAudioConfig(BaseModel): +class TotalConfig(BaseModel): """ - Configuration settings for input audio components in the multi-step - song generation tab. + All configuration settings for the Ultimate RVC app. Attributes ---------- - audio : AudioConfig - Configuration settings for the input audio component. - vocals : AudioConfig - Configuration settings for the vocals audio component. - converted_vocals : AudioConfig - Configuration settings for the converted vocals audio - component. - instrumentals : AudioConfig - Configuration settings for the instrumentals audio - component. - backup_vocals : AudioConfig - Configuration settings for the backup vocals audio - component. - main_vocals : AudioConfig - Configuration settings for the main vocals audio - component. - shifted_instrumentals : AudioConfig - Configuration settings for the shifted instrumentals audio - component. - shifted_backup_vocals : AudioConfig - Configuration settings for the shifted backup vocals audio - component. - all : list[AudioConfig] - List of configuration settings for all input audio - components in the multi-step song generation tab. + song : TotalSongGenerationConfig + Configuration settings for song generation tabs. 
+ speech : TotalSpeechGenerationConfig + Configuration settings for speech generation tabs. + training : TotalTrainingConfig + Configuration settings for training tabs. + management : TotalManagementConfig + Configuration settings for management tabs. """ - audio: AudioConfig = AudioConfig.input(label="Audio") - vocals: AudioConfig = AudioConfig.input(label="Vocals") - converted_vocals: AudioConfig = AudioConfig.input(label="Vocals") - instrumentals: AudioConfig = AudioConfig.input(label="Instrumentals") - backup_vocals: AudioConfig = AudioConfig.input(label="Backup vocals") - main_vocals: AudioConfig = AudioConfig.input(label="Main vocals") - shifted_instrumentals: AudioConfig = AudioConfig.input(label="Instrumentals") - shifted_backup_vocals: AudioConfig = AudioConfig.input(label="Backup vocals") + song: TotalSongGenerationConfig = TotalSongGenerationConfig() + speech: TotalSpeechGenerationConfig = TotalSpeechGenerationConfig() + training: TotalTrainingConfig = TotalTrainingConfig() + management: TotalManagementConfig = TotalManagementConfig() - @property - def all(self) -> list[AudioConfig]: + @cached_property + def all(self) -> list[AnyComponentConfig]: """ - Retrieve configuration settings for all input audio components - in the multi-step song generation tab. + Recursively collect those component configuration models nested + within the current model instance, which have values that are + not excluded. Returns ------- - list[AudioConfig] - List of configuration settings for all input audio - components in the multi-step song generation tab. + list[AnyComponentConfig] + A list of component configuration models found within the + current model instance, which have values that are not + excluded. 
""" - return [getattr(self, field) for field in self.__annotations__] + def _collect(model: BaseModel) -> list[AnyComponentConfig]: + component_configs: list[Any] = [] + for _, value in model: + if isinstance(value, ComponentConfig): + if not value.exclude_value: + component_configs.append(value) + elif isinstance(value, BaseModel): + component_configs.extend(_collect(value)) + return component_configs + + return _collect(self) -class SongDirsConfig(BaseModel): + + + + +class BaseTabConfig(BaseModel): """ - Configuration settings for song directory components in the - multi-step song generation tab. + Base model defining common component configuration settings for + UI tabs. Attributes ---------- - separate_audio : DropdownConfig - Configuration settings for the song directory component - for separating audio. - convert_vocals : DropdownConfig - Configuration settings for the song directory component - for converting vocals. - postprocess_vocals : DropdownConfig - Configuration settings for the song directory component - for postprocessing vocals. - pitch_shift_background : DropdownConfig - Configuration settings for the song directory component - for pitch-shifting background audio. - mix : DropdownConfig - Configuration settings for the song directory component - for mixing audio. - all : list[gr.Dropdown] - List of instances of all song directory components in the - multi-step song generation tab. + embedder_model : DropdownConfig + Configuration settings for an embedder model dropdown component. + custom_embedder_model : DropdownConfig + Configuration settings for a custom embedder model dropdown + component. 
""" - separate_audio: DropdownConfig = DropdownConfig.song_dir() - convert_vocals: DropdownConfig = DropdownConfig.song_dir() - postprocess_vocals: DropdownConfig = DropdownConfig.song_dir() - pitch_shift_background: DropdownConfig = DropdownConfig.song_dir() - mix: DropdownConfig = DropdownConfig.song_dir() - - @property - def all(self) -> list[gr.Dropdown]: - """ - Retrieve instances of all song directory components in the - multi-step song generation tab. - - Returns - ------- - list[gr.Dropdown] - List of instances of all song directory components in - the multi-step song generation tab. + embedder_model: DropdownConfig = DropdownConfig( + label="Embedder model", + info="The model to use for generating speaker embeddings.", + value=EmbedderModel.CONTENTVEC, + choices=list(EmbedderModel), + exclude_value=True, + ) + custom_embedder_model: DropdownConfig = DropdownConfig( + label="Custom embedder model", + info="Select a custom embedder model from the dropdown.", + value=None, + visible=False, + render=False, + exclude_value=True, + ) - """ - return [getattr(self, field).instance for field in self.__annotations__] -class MultiStepSongGenerationConfig(SongGenerationConfig): +class TrainingConfig(BaseTabConfig): """ - Configuration settings for multi-step song generation tab. + Common component configuration settings for training tabs. Attributes ---------- - separation_model : DropdownConfig - Configuration settings for a separation model dropdown + dataset_type : DropdownConfig + Configuration settings for a dataset type dropdown component. + dataset : DropdownConfig + Configuration settings for a dataset dropdown component. + dataset_name : TextboxConfig + Configuration settings for a dataset name textbox component. + preprocess_model : DropdownConfig + Configuration settings for a model name dropdown component + for audio preprocessing. + sample_rate : DropdownConfig + Configuration settings for a sample rate dropdown component. 
+ filter_audio : CheckboxConfig + Configuration settings for a filter audio checkbox component. + clean_audio : CheckboxConfig + Configuration settings for a clean audio checkbox component. + clean_strength : SliderConfig + Configuration settings for a clean strength slider component. + split_method : DropdownConfig + Configuration settings for an audio splitting method dropdown + component. + chunk_len : SliderConfig + Configuration settings for a chunk length slider component. + overlap_len : SliderConfig + Configuration settings for an overlap length slider component. + preprocess_cores : SliderConfig + Configuration settings for a CPU cores slider component for + preprocessing. + extract_model : DropdownConfig + Configuration settings for a model name dropdown component for + feature extraction. + f0_method : DropdownConfig + Configuration settings for an F0 method dropdown component. + hop_length : SliderConfig + Configuration settings for a hop length slider component. + include_mutes : SliderConfig + Configuration settings for an include mutes slider component. + extraction_cores : SliderConfig + Configuration settings for a CPU cores slider component for + feature extraction. + extraction_acceleration : DropdownConfig + Configuration settings for a hardware acceleration component for + feature extraction. + extraction_gpus : DropdownConfig + Configuration settings for a GPU dropdown component for feature + extraction. + train_model : DropdownConfig + Configuration settings for a model name dropdown component for + training. + num_epochs : SliderConfig + Configuration settings for a number of epochs slider component. + batch_size : SliderConfig + Configuration settings for a batch size slider component. + detect_overtraining : CheckboxConfig + Configuration settings for a detect overtraining checkbox + component. + overtraining_threshold : SliderConfig + Configuration settings for an overtraining threshold slider + component.
+ vocoder : DropdownConfig + Configuration settings for a vocoder dropdown component. + index_algorithm : DropdownConfig + Configuration settings for an index algorithm dropdown component. - segment_size : RadioConfig - Configuration settings for a segment size radio component. - n_octaves : SliderConfig - Configuration settings for an octave pitch shift slider + pretrained_type : DropdownConfig + Configuration settings for a pretrained model type dropdown component. - n_semitones : SliderConfig - Configuration settings for a semitone pitch shift slider + custom_pretrained_model : DropdownConfig + Configuration settings for a custom pretrained model dropdown component. - n_semitones_instrumentals : SliderConfig - Configuration settings for an instrumentals pitch shift slider + save_interval : SliderConfig + Configuration settings for a save-interval slider component. + save_all_checkpoints : CheckboxConfig + Configuration settings for a save-all-checkpoints checkbox component. - n_semitones_backup_vocals : SliderConfig - Configuration settings for a backup vocals pitch shift slider + save_all_weights : CheckboxConfig + Configuration settings for a save-all-weights checkbox + component. + clear_saved_data : CheckboxConfig + Configuration settings for a clear-saved-data checkbox + component. + upload_model : CheckboxConfig + Configuration settings for an upload voice model checkbox + component. + upload_name : TextboxConfig + Configuration settings for an upload name textbox component. + training_acceleration : DropdownConfig + Configuration settings for a hardware acceleration component for + training. + training_gpus : DropdownConfig + Configuration settings for a GPU dropdown component for + training. + preload_dataset : CheckboxConfig + Configuration settings for a preload dataset checkbox component. + reduce_memory_usage : CheckboxConfig + Configuration settings for a reduce-memory-usage checkbox component.
- input_audio : SongInputAudioConfig - Configuration settings for input audio components. - song_dirs : SongDirsConfig - Configuration settings for song directory components. See Also -------- - SongGenerationConfig + BaseTabConfig Parent model defining common component configuration settings - for song generation tabs. + for UI tabs. """ - separation_model: DropdownConfig = DropdownConfig( - label="Separation model", - info="The model to use for audio separation.", - value=SeparationModel.UVR_MDX_NET_VOC_FT, - choices=list(SeparationModel2), + dataset_type: DropdownConfig = DropdownConfig( + label="Dataset type", + info="Select the type of dataset to preprocess.", + value=DatasetType.NEW_DATASET, + choices=list(DatasetType), + exclude_value=True, + ) + dataset: DropdownConfig = DropdownConfig( + label="Dataset path", + info=( + "The path to an existing dataset. Either select a path to a previously" + " created dataset or provide a path to an external dataset." + ), + value=None, + allow_custom_value=True, + visible=False, + render=False, + exclude_value=True, + ) + dataset_name: TextboxConfig = TextboxConfig( + label="Dataset name", + info=( + "The name of the new dataset. If the dataset already exists, the provided" + " audio files will be added to it." + ), + value="My dataset", + exclude_value=True, + ) + preprocess_model: DropdownConfig = DropdownConfig( + label="Model name", + info=( + "Name of the model to preprocess the given dataset for. Either select an" + " existing model from the dropdown or provide the name of a new model." 
+ ), + value="My model", + allow_custom_value=True, + render=False, + exclude_value=True, + ) + sample_rate: DropdownConfig = DropdownConfig( + label="Sample rate", + info="Target sample rate for the audio files in the provided dataset.", + value=TrainingSampleRate.HZ_40K, + choices=list(TrainingSampleRate), + ) + filter_audio: CheckboxConfig = CheckboxConfig( + label="Filter audio", + info=( + "Whether to remove low-frequency sounds from the audio files in the" + " provided dataset by applying a high-pass butterworth filter.

" + ), + value=True, + ) + clean_audio: CheckboxConfig = CheckboxConfig( + label="Clean audio", + info=( + "Whether to clean the audio files in the provided dataset using noise" + " reduction algorithms.


" + ), + value=False, + exclude_value=True, + ) + clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False) + split_method: DropdownConfig = DropdownConfig( + label="Audio splitting method", + info=( + "The method to use for splitting the audio files in the provided dataset." + " Use the `Skip` method to skip splitting if the audio files are already" + " split. Use the `Simple` method if excessive silence has already been" + " removed from the audio files. Use the `Automatic` method for automatic" + " silence detection and splitting around it." + ), + value=AudioSplitMethod.AUTOMATIC, + choices=list(AudioSplitMethod), + exclude_value=True, + ) + chunk_len: SliderConfig = SliderConfig( + label="Chunk length", + info="Length of split audio chunks.", + value=3.0, + minimum=0.5, + maximum=5.0, + step=0.1, + visible=False, + ) + overlap_len: SliderConfig = SliderConfig( + label="Overlap length", + info="Length of overlap between split audio chunks.", + value=0.3, + minimum=0.0, + maximum=0.4, + step=0.1, + visible=False, + ) + preprocess_cores: SliderConfig = SliderConfig.cpu_cores() + + extract_model: DropdownConfig = DropdownConfig( + label="Model name", + info=( + "Name of the model with an associated preprocessed dataset to extract" + " training features from. When a new dataset is preprocessed, its" + " associated model is selected by default." + ), + value=None, + render=False, + exclude_value=True, + ) + f0_method: DropdownConfig = DropdownConfig( + label="F0 method", + info="The method to use for extracting pitch features.", + value=TrainingF0Method.RMVPE, + choices=list(TrainingF0Method), + exclude_value=True, + ) + + hop_length: SliderConfig = SliderConfig.hop_length( + label="Hop length", + info="The hop length to use for extracting pitch features.

", + visible=False, + ) + include_mutes: SliderConfig = SliderConfig( + label="Include mutes", + info=( + "The number of mute audio files to include in the generated training file" + " list. Adding silent files enables the training model to handle pure" + " silence in inferred audio files. If the preprocessed audio dataset" + " already contains segments of pure silence, set this to 0." + ), + value=0, + minimum=0, + maximum=10, + step=1, + ) + extraction_cores: SliderConfig = SliderConfig.cpu_cores() + extraction_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration() + extraction_gpus: DropdownConfig = DropdownConfig.gpu() + + train_model: DropdownConfig = DropdownConfig( + label="Model name", + info=( + "Name of the model to train. When training features are extracted for a new" + " model, its name is selected by default." + ), + value=None, + render=False, + exclude_value=True, + ) + num_epochs: SliderConfig = SliderConfig( + label="Number of epochs", + info=( + "The number of epochs to train the voice model. A higher number can improve" + " voice model performance but may lead to overtraining." + ), + value=500, + minimum=1, + maximum=5000, + step=1, + ) + batch_size: SliderConfig = SliderConfig( + label="Batch size", + info=( + "The number of samples in each training batch. It is advisable to align" + " this value with the available VRAM of your GPU." + ), + value=16, + minimum=1, + maximum=128, + step=1, + ) + detect_overtraining: CheckboxConfig = CheckboxConfig( + label="Detect overtraining", + info=( + "Whether to detect overtraining to prevent the voice model from learning" + " the training data too well and losing the ability to generalize to new" + " data." + ), + value=True, + exclude_value=True, + ) + overtraining_threshold: SliderConfig = SliderConfig( + label="Overtraining threshold", + info=( + "The maximum number of epochs to continue training without any observed" + " improvement in voice model performance." 
+ ), + value=500, + minimum=1, + maximum=1000, + visible=False, + ) + vocoder: DropdownConfig = DropdownConfig( + label="Vocoder", + info=( + "The vocoder to use for audio synthesis during training. HiFi-GAN provides" + " basic audio fidelity, while RefineGAN provides the highest audio" + " fidelity." + ), + value=Vocoder.HIFI_GAN, + choices=list(Vocoder), + ) + index_algorithm: DropdownConfig = DropdownConfig( + label="Index algorithm", + info=( + "The method to use for generating an index file for the trained voice" + " model. `KMeans` is particularly useful for large datasets." + ), + value=IndexAlgorithm.AUTO, + choices=list(IndexAlgorithm), ) - segment_size: RadioConfig = RadioConfig( - label="Segment size", + pretrained_type: DropdownConfig = DropdownConfig( + label="Pretrained model type", info=( - "The size of the segments into which the audio is split. Using a larger" - " size consumes more resources, but may give better results." + "The type of pretrained model to finetune the voice model on. `None` will" + " train the voice model from scratch, while `Default` will use a pretrained" + " model tailored to the specific voice model architecture. `Custom` will" + " use a custom pretrained that you provide." ), - value=SegmentSize.SEG_2048, - choices=list(SegmentSize), + value=PretrainedType.DEFAULT, + choices=list(PretrainedType), + exclude_value=True, ) - n_octaves: SliderConfig = SliderConfig.octave_shift( - label="Pitch shift (octaves)", + custom_pretrained_model: DropdownConfig = DropdownConfig( + label="Custom pretrained model", + info="Select a custom pretrained model to finetune from the dropdown.", + value=None, + visible=False, + render=False, + exclude_value=True, + ) + save_interval: SliderConfig = SliderConfig( + label="Save interval", info=( - "The number of octaves to pitch-shift the converted voice by. Use 1 for" - " male-to-female and -1 for vice-versa." + "The epoch interval at which to to save voice model weights and" + " checkpoints. 
The best model weights are always saved regardless of this" + " setting." ), + value=10, + minimum=1, + maximum=100, + step=1, ) - n_semitones: SliderConfig = SliderConfig.semitone_shift( - label="Pitch shift (semi-tones)", + save_all_checkpoints: CheckboxConfig = CheckboxConfig( + label="Save all checkpoints", info=( - "The number of semi-tones to pitch-shift the converted vocals by. Altering" - " this slightly reduces sound quality." + "Whether to save a unique checkpoint at each save interval. If not enabled," + " only the latest checkpoint will be saved at each interval." ), + value=True, ) - n_semitones_instrumentals: SliderConfig = SliderConfig.semitone_shift( - label="Instrumental pitch shift", - info="The number of semi-tones to pitch-shift the instrumentals by.", + save_all_weights: CheckboxConfig = CheckboxConfig( + label="Save all weights", + info=( + "Whether to save unique voice model weights at each save interval. If not" + " enabled, only the best voice model weights will be saved." + ), + value=True, ) - n_semitones_backup_vocals: SliderConfig = SliderConfig.semitone_shift( - label="Backup vocal pitch shift", - info="The number of semi-tones to pitch-shift the backup vocals by.", + clear_saved_data: CheckboxConfig = CheckboxConfig( + label="Clear saved data", + info=( + "Whether to delete any existing training data associated with the voice" + " model before training commences. Enable this setting only if you are" + " training a new voice model from scratch or restarting training." + ), + value=False, ) - input_audio: SongInputAudioConfig = SongInputAudioConfig() - song_dirs: SongDirsConfig = SongDirsConfig() - - -class SpeechIntermediateAudioConfig(BaseModel): - """ - Configuration settings for intermediate audio components in the - one-click speech generation tab. - - Attributes - ---------- - speech : AudioConfig - Configuration settings for the input speech audio component. 
- converted_speech : AudioConfig - Configuration settings for the converted speech audio component. - all : list[gr.Audio] - List of instances of all intermediate audio components in the - speech generation tab. - - """ - - speech: AudioConfig = AudioConfig.intermediate(label="Speech") - converted_speech: AudioConfig = AudioConfig.intermediate(label="Converted speech") - - @property - def all(self) -> list[gr.Audio]: - """ - Retrieve instances of all intermediate audio components in the - speech generation tab. - - Returns - ------- - list[gr.Audio] - List of instances of all intermediate audio components in - the speech generation tab. - - """ - return [getattr(self, field).instance for field in self.__annotations__] - - -class OneClickSpeechGenerationConfig(SpeechGenerationConfig): - """ - Configuration settings for one-click speech generation tab. - - Attributes - ---------- - intermediate_audio : SpeechIntermediateAudioConfig - Configuration settings for intermediate audio components. - show_intermediate_audio : CheckboxConfig - Configuration settings for a show intermediate audio checkbox - component. - - See Also - -------- - SpeechGenerationConfig - Parent model defining common component configuration settings - for speech generation tabs. - - """ - - intermediate_audio: SpeechIntermediateAudioConfig = SpeechIntermediateAudioConfig() - - show_intermediate_audio: CheckboxConfig = CheckboxConfig( - label="Show intermediate audio", - info="Show intermediate audio tracks produced during speech generation.", + upload_model: CheckboxConfig = CheckboxConfig( + label="Upload voice model", + info=( + "Whether to automatically upload the trained voice model so that it can be" + " used for generation tasks within the Ultimate RVC app." 
+ ), value=False, exclude_value=True, ) + upload_name: TextboxConfig = TextboxConfig( + label="Upload name", + info="The name to give the uploaded voice model.", + value=None, + visible=False, + exclude_value=True, + ) + training_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration() + training_gpus: DropdownConfig = DropdownConfig.gpu() + preload_dataset: CheckboxConfig = CheckboxConfig( + label="Preload dataset", + info=( + "Whether to preload all training data into GPU memory. This can improve" + " training speed but requires a lot of VRAM.

" + ), + value=True, + ) + reduce_memory_usage: CheckboxConfig = CheckboxConfig( + label="Reduce memory usage", + info=( + "Whether to reduce VRAM usage at the cost of slower training speed by" + " enabling activation checkpointing. This is useful for GPUs with limited" + " memory (e.g., <6GB VRAM) or when training with a batch size larger than" + " what your GPU can normally accommodate." + ), + value=False, + ) -class SpeechInputAudioConfig(BaseModel): - """ - Configuration settings for input audio components in the multi-step - speech generation tab. - - Attributes - ---------- - speech : AudioConfig - Configuration settings for the input speech audio component. - converted_speech : AudioConfig - Configuration settings for the converted speech audio component. - - all : list[AudioConfig] - List of configuration settings for all input audio components in - the multi-step speech generation tab. - - """ - - speech: AudioConfig = AudioConfig.input("Speech") - converted_speech: AudioConfig = AudioConfig.input("Converted speech") - - @property - def all(self) -> list[AudioConfig]: - """ - Retrieve configuration settings for all input audio components - in the multi-step speech generation tab. - - Returns - ------- - list[AudioConfig] - List of configuration settings for all input audio - components in the multi-step speech generation tab. - - """ - return [getattr(self, field) for field in self.__annotations__] - - -class MultiStepSpeechGenerationConfig(SpeechGenerationConfig): - """ - Configuration settings for the multi-step speech generation tab. - - Attributes - ---------- - input_audio : SpeechInputAudioConfig - Configuration settings for input audio components. - - See Also - -------- - SpeechGenerationConfig - Parent model defining common component configuration settings - for speech generation tabs. 
- - """ - - input_audio: SpeechInputAudioConfig = SpeechInputAudioConfig() - - -class MultiStepTrainingConfig(TrainingConfig): - """Configuration settings for multi-step training tab.""" - - -class ModelManagementConfig(BaseModel): +class GenerationConfig(BaseTabConfig): """ + Common component configuration settings for generation tabs. - Configuration settings for model management tab. - - Attributes - ---------- - voices : DropdownConfig - Configuration settings for delete voice models dropdown - component. - embedders : DropdownConfig - Configuration settings for delete embedder models dropdown + voice_model : DropdownConfig + Configuration settings for a voice model dropdown component. + f0_methods : DropdownConfig + Configuration settings for a pitch extraction algorithms + dropdown component. + index_rate : SliderConfig + Configuration settings for an index rate slider component. + rms_mix_rate : SliderConfig + Configuration settings for a RMS mix rate slider component. + protect_rate : SliderConfig + Configuration settings for a protect rate slider component. + split_voice : CheckboxConfig + Configuration settings for a split voice checkbox component. + autotune_voice: CheckboxConfig + Configuration settings for an autotune voice checkbox component. + autotune_strength: SliderConfig + Configuration settings for an autotune strength slider component. - pretraineds : DropdownConfig - Configuration settings for delete pretrained models dropdown + sid : NumberConfig + Configuration settings for a speaker ID number component. + output_sr : DropdownConfig + Configuration settings for an output sample rate dropdown component. - traineds : DropdownConfig - Configuration settings for delete training models dropdown + output_format : DropdownConfig + Configuration settings for an output format dropdown component. - dummy_checkbox : CheckboxConfig - Configuration settings for a dummy checkbox component. 
+ output_name : TextboxConfig + Configuration settings for an output name textbox component. + + See Also + -------- + BaseTabConfig + Parent model defining common component configuration settings + for UI tabs. """ - voices: DropdownConfig = DropdownConfig.multi_delete( - label="Voice models", - info="Select one or more voice models to delete.", + voice_model: DropdownConfig = DropdownConfig( + label="Voice model", + info="Select a model to use for voice conversion.", + value=None, + render=False, + exclude_value=True, ) - embedders: DropdownConfig = DropdownConfig.multi_delete( - label="Custom embedder models", - info="Select one or more embedder models to delete.", + f0_methods: DropdownConfig = DropdownConfig( + label="Pitch extraction algorithm(s)", + info=( + "If more than one method is selected, then the median of the pitch values" + " extracted by each method is used. RMVPE is recommended for most cases and" + " is the default when no method is selected." + ), + value=[F0Method.RMVPE], + choices=list(F0Method), + multiselect=True, ) - pretraineds: DropdownConfig = DropdownConfig.multi_delete( - label="Custom pretrained models", - info="Select one or more pretrained models to delete.", + index_rate: SliderConfig = SliderConfig( + label="Index rate", + info=( + "Increase to bias the conversion towards the accent of the voice model." + " Decrease to potentially reduce artifacts coming from the voice" + " model.


" + ), + value=0.3, + minimum=0.0, + maximum=1.0, ) - traineds: DropdownConfig = DropdownConfig.multi_delete( - label="Training models", - info="Select one or more training models to delete.", + rms_mix_rate: SliderConfig = SliderConfig( + label="RMS mix rate", + info=( + "How much to mimic the loudness (0) of the input voice or a fixed loudness" + " (1). A value of 1 is recommended for most cases.

" + ), + value=1.0, + minimum=0.0, + maximum=1.0, + ) + protect_rate: SliderConfig = SliderConfig( + label="Protect rate", + info=( + "Controls the extent to which consonants and breathing sounds are protected" + " from artifacts. A higher value offers more protection but may worsen the" + " indexing effect.

" + ), + value=0.33, + minimum=0.0, + maximum=0.5, ) - dummy_checkbox: CheckboxConfig = CheckboxConfig( + hop_length: SliderConfig = SliderConfig.hop_length( + label="Hop length", + info=( + "How often the CREPE-based pitch extraction method checks for pitch changes" + " measured in milliseconds. Lower values lead to longer conversion times" + " and a higher risk of voice cracks, but better pitch accuracy." + ), + visible=True, + ) + + split_voice: CheckboxConfig = CheckboxConfig( + label="Split input voice", + info=( + "Whether to split the input voice track into smaller segments before" + " converting it. This can improve output quality for longer voice tracks." + ), + value=False, + ) + autotune_voice: CheckboxConfig = CheckboxConfig( + label="Autotune converted voice", + info="Whether to apply autotune to the converted voice.

", value=False, + exclude_value=True, + ) + autotune_strength: SliderConfig = SliderConfig( + label="Autotune intensity", + info=( + "Higher values result in stronger snapping to the chromatic grid and" + " artifacting." + ), + value=1.0, + minimum=0.0, + maximum=1.0, visible=False, + ) + sid: NumberConfig = NumberConfig( + label="Speaker ID", + info="Speaker ID for multi-speaker-models.", + value=0, + precision=0, + ) + output_sr: DropdownConfig = DropdownConfig( + label="Output sample rate", + info="The sample rate of the mixed output track.", + value=SampleRate.HZ_44100, + choices=list(SampleRate), + ) + output_format: DropdownConfig = DropdownConfig( + label="Output format", + info="The audio format of the mixed output track.", + value=AudioExt.MP3, + choices=list(AudioExt), + ) + output_name: TextboxConfig = TextboxConfig( + label="Output name", + info="If no name is provided, a suitable name will be generated automatically.", + value=None, + placeholder="Ultimate RVC output", exclude_value=True, ) -class AudioManagementConfig(BaseModel): +class SongGenerationConfig(GenerationConfig): """ - Configuration settings for audio management tab. + Common component configuration settings for song generation tabs. Attributes ---------- - intermediate : DropdownConfig - Configuration settings for delete intermediate audio files - dropdown component - speech : DropdownConfig - Configuration settings for delete speech audio files dropdown - component. - output : DropdownConfig - Configuration settings for delete output audio files dropdown + source_type : DropdownConfig + Configuration settings for a source type dropdown component. + source : TextboxConfig + Configuration settings for an input source textbox component. + cached_song : DropdownConfig + Configuration settings for a cached song dropdown component. + clean_strength : SliderConfig + Configuration settings for a clean strength slider component. 
+ clean_voice : CheckboxConfig + Configuration settings for a clean voice checkbox component. + room_size : SliderConfig + Configuration settings for a room size slider component. + wet_level : SliderConfig + Configuration settings for a wetness level slider component. + dry_level : SliderConfig + Configuration settings for a dryness level slider component. + damping : SliderConfig + Configuration settings for a damping level slider component. + main_gain : SliderConfig + Configuration settings for a main gain slider component. + inst_gain : SliderConfig + Configuration settings for an instrumentals gain slider component. - dataset : DropdownConfig - Configuration settings for delete dataset audio files dropdown + backup_gain : SliderConfig + Configuration settings for a backup vocals gain slider component. - dummy_checkbox : CheckboxConfig - Configuration settings for a dummy checkbox component. + + See Also + -------- + GenerationConfig + Parent model defining common component configuration settings + for generation tabs. """ - intermediate: DropdownConfig = DropdownConfig.multi_delete( - label="Song directories", + source_type: DropdownConfig = DropdownConfig( + label="Source type", + info="The type of source to retrieve a song from.", + value=SongSourceType.LOCAL_FILE, + choices=list(SongSourceType), + type="index", + exclude_value=True, + ) + source: TextboxConfig = TextboxConfig( + label="Source", + info="Link to a song on YouTube or the full path of a local audio file.", + value=None, + exclude_value=True, + ) + cached_song: DropdownConfig = DropdownConfig( + label="Source", + info="Select a song from the list of cached songs.", + value=None, + visible=False, + render=False, + exclude_value=True, + ) + clean_voice: CheckboxConfig = CheckboxConfig( + label="Clean converted voice", info=( - "Select one or more song directories containing intermediate audio files to" - " delete." + "Whether to clean the converted voice using noise reduction" + " algorithms.

" ), + value=False, + exclude_value=True, ) - speech: DropdownConfig = DropdownConfig.multi_delete( - label="Speech audio files", - info="Select one or more speech audio files to delete.", + clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False) + room_size: SliderConfig = SliderConfig( + label="Room size", + info=( + "Size of the room which reverb effect simulates. Increase for longer reverb" + " time." + ), + value=0.15, + minimum=0.0, + maximum=1.0, ) - output: DropdownConfig = DropdownConfig.multi_delete( - label="Output audio files", - info="Select one or more output audio files to delete.", + wet_level: SliderConfig = SliderConfig( + label="Wetness level", + info="Loudness of converted vocals with reverb effect applied.", + value=0.2, + minimum=0.0, + maximum=1.0, + ) + dry_level: SliderConfig = SliderConfig( + label="Dryness level", + info="Loudness of converted vocals without reverb effect applied.", + value=0.8, + minimum=0.0, + maximum=1.0, + ) + damping: SliderConfig = SliderConfig( + label="Damping level", + info="Absorption of high frequencies in reverb effect.", + value=0.7, + minimum=0.0, + maximum=1.0, + ) + main_gain: SliderConfig = SliderConfig.gain( + label="Main gain", + info="The gain to apply to the main vocals.", ) - dataset: DropdownConfig = DropdownConfig.multi_delete( - label="Dataset audio files", - info="Select one or more datasets containing audio files to delete.", + inst_gain: SliderConfig = SliderConfig.gain( + label="Instrumentals gain", + info="The gain to apply to the instrumentals.", ) - - dummy_checkbox: CheckboxConfig = CheckboxConfig( - value=False, - visible=False, - exclude_value=True, + backup_gain: SliderConfig = SliderConfig.gain( + label="Backup gain", + info="The gain to apply to the backup vocals.", ) -class SettingsManagementConfig(BaseModel): +class SpeechGenerationConfig(GenerationConfig): """ - Configuration settings for settings management tab. 
+ Common component configuration settings for speech generation tabs. Attributes ---------- - dummy_checkbox : CheckboxConfig - Configuration settings for a dummy checkbox component. + source_type : DropdownConfig + Configuration settings for a source type dropdown component. + source : TextboxConfig + Configuration settings for an input source textbox component. + edge_tts_voice : DropdownConfig + Configuration settings for an Edge TTS voice dropdown + component. + n_octaves : SliderConfig + Configuration settings for an octave pitch shift slider + component. + n_semitones : SliderConfig + Configuration settings for a semitone pitch shift slider + component. + tts_pitch_shift : SliderConfig + Configuration settings for a TTS pitch shift slider + component. + tts_speed_change : SliderConfig + Configuration settings for a TTS speed change slider + component. + tts_volume_change : SliderConfig + Configuration settings for a TTS volume change slider + component. + clean_voice : CheckboxConfig + Configuration settings for a clean voice checkbox + component. + clean_strength : SliderConfig + Configuration settings for a clean strength slider + component. + output_gain : SliderConfig + Configuration settings for an output gain slider component. + + See Also + -------- + GenerationConfig + Parent model defining common component configuration settings + for generation tabs. 
""" - load_config_name: DropdownConfig = DropdownConfig( - label="Configuration name", - info="The name of a configuration to load UI settings from", + source_type: DropdownConfig = DropdownConfig( + label="Source type", + info="The type of source to generate speech from.", + value=SpeechSourceType.TEXT, + choices=list(SpeechSourceType), + type="index", + exclude_value=True, + ) + source: TextboxConfig = TextboxConfig( + label="Source", + info="Text to generate speech from", + value=None, + exclude_value=True, + ) + edge_tts_voice: DropdownConfig = DropdownConfig( + label="Edge TTS voice", + info="Select a voice to use for text to speech conversion.", value=None, render=False, exclude_value=True, ) - delete_config_names: DropdownConfig = DropdownConfig.multi_delete( - label="Configuration names", - info="Select the name of one or more configurations to delete", + n_octaves: SliderConfig = SliderConfig.octave_shift( + label="Octave shift", + info=( + "The number of octaves to pitch-shift the converted speech by. Use 1 for" + " male-to-female and -1 for vice-versa." + ), ) - dummy_checkbox: CheckboxConfig = CheckboxConfig( - value=False, - visible=False, + n_semitones: SliderConfig = SliderConfig.semitone_shift( + label="Semitone shift", + info="The number of semi-tones to pitch-shift the converted speech by.", + ) + tts_pitch_shift: SliderConfig = SliderConfig( + label="Edge TTS pitch shift", + info=( + "The number of hertz to shift the pitch of the speech generated by Edge" + " TTS." 
+ ), + value=0, + minimum=-100, + maximum=100, + step=1, + ) + tts_speed_change: SliderConfig = SliderConfig( + label="TTS speed change", + info="The percentual change to the speed of the speech generated by Edge TTS.", + value=0, + minimum=-50, + maximum=100, + step=1, + ) + tts_volume_change: SliderConfig = SliderConfig( + label="TTS volume change", + info="The percentual change to the volume of the speech generated by Edge TTS.", + value=0, + minimum=-100, + maximum=100, + step=1, + ) + clean_voice: CheckboxConfig = CheckboxConfig( + label="Clean converted voice", + info=( + "Whether to clean the converted voice using noise reduction" + " algorithms.

" + ), + value=True, exclude_value=True, ) - - -class TotalSongGenerationConfig(BaseModel): - """ - All configuration settings for song generation tabs. - - Attributes - ---------- - one_click : OneClickSongGenerationConfig - Configuration settings for the one-click song generation tab. - multi_step : MultiStepSongGenerationConfig - Configuration settings for the multi-step song generation tab. - - """ - - one_click: OneClickSongGenerationConfig = OneClickSongGenerationConfig() - multi_step: MultiStepSongGenerationConfig = MultiStepSongGenerationConfig() - - -class TotalSpeechGenerationConfig(BaseModel): - """ - All configuration settings for speech generation tabs. - - Attributes - ---------- - one_click : OneClickSpeechGenerationConfig - Configuration settings for the one-click speech generation tab. - multi_step : MultiStepSpeechGenerationConfig - Configuration settings for the multi-step speech generation tab. - - """ - - one_click: OneClickSpeechGenerationConfig = OneClickSpeechGenerationConfig() - multi_step: MultiStepSpeechGenerationConfig = MultiStepSpeechGenerationConfig() - - -class TotalTrainingConfig(BaseModel): - """ - All configuration settings for training tabs. - - Attributes - ---------- - training : TrainingConfig - Configuration settings for the multi-step training tab. - - """ - - multi_step: MultiStepTrainingConfig = MultiStepTrainingConfig() - - -class TotalManagementConfig(BaseModel): - """ - All configuration settings for management tabs. - - Attributes - ---------- - model : ModelManagementConfig - Configuration settings for the model management tab. - audio : AudioManagementConfig - Configuration settings for the audio management tab. - settings : SettingsManagementConfig - Configuration settings for the settings management tab. 
- - """ - - model: ModelManagementConfig = ModelManagementConfig() - audio: AudioManagementConfig = AudioManagementConfig() - settings: SettingsManagementConfig = SettingsManagementConfig() - - -class TotalConfig(BaseModel): - """ - All configuration settings for the Ultimate RVC app. - - Attributes - ---------- - song : TotalSongGenerationConfig - Configuration settings for song generation tabs. - speech : TotalSpeechGenerationConfig - Configuration settings for speech generation tabs. - training : TotalTrainingConfig - Configuration settings for training tabs. - management : TotalManagementConfig - Configuration settings for management tabs. - - """ - - song: TotalSongGenerationConfig = TotalSongGenerationConfig() - speech: TotalSpeechGenerationConfig = TotalSpeechGenerationConfig() - training: TotalTrainingConfig = TotalTrainingConfig() - management: TotalManagementConfig = TotalManagementConfig() - - @cached_property - def all(self) -> list[AnyComponentConfig]: - """ - Recursively collect those component configuration models nested - within the current model instance, which have values that are - not excluded. - - Returns - ------- - list[AnyComponentConfig] - A list of component configuration models found within the - current model instance, which have values that are not - excluded. - - """ - - def _collect(model: BaseModel) -> list[AnyComponentConfig]: - component_configs: list[Any] = [] - for _, value in model: - if isinstance(value, ComponentConfig): - if not value.exclude_value: - component_configs.append(value) - elif isinstance(value, BaseModel): - component_configs.extend(_collect(value)) - return component_configs - - return _collect(self) - - - + clean_strength: SliderConfig = SliderConfig.clean_strength(visible=True) + output_gain: SliderConfig = SliderConfig.gain( + label="Output gain", + info="The gain to apply to the converted speech.

", + )