lainlives committed on
Commit
c2861b3
·
verified ·
1 Parent(s): a187464

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +843 -0
app.py CHANGED
@@ -65,6 +65,8 @@ from ultimate_rvc.web.config.component import (
65
  DropdownConfig,
66
  RadioConfig,
67
  SliderConfig,
 
 
68
  )
69
  from ultimate_rvc.web.config.tab import (
70
  SongGenerationConfig,
@@ -89,6 +91,847 @@ type StrPath = str | PathLike[str]
89
  type Json = Mapping[str, Json] | Sequence[Json] | str | int | float | bool | None
90
 
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  class SegmentSize(IntEnum):
93
  """Enumeration of segment sizes for audio separation."""
94
 
 
65
  DropdownConfig,
66
  RadioConfig,
67
  SliderConfig,
68
+ NumberConfig,
69
+ TextboxConfig,
70
  )
71
  from ultimate_rvc.web.config.tab import (
72
  SongGenerationConfig,
 
91
  type Json = Mapping[str, Json] | Sequence[Json] | str | int | float | bool | None
92
 
93
 
94
+
95
class BaseTabConfig(BaseModel):
    """
    Base model with component configuration settings shared by all
    UI tabs.

    Attributes
    ----------
    embedder_model : DropdownConfig
        Configuration settings for an embedder model dropdown component.
    custom_embedder_model : DropdownConfig
        Configuration settings for a custom embedder model dropdown
        component.

    """

    # Standard embedder selection, pre-populated with all known models.
    embedder_model: DropdownConfig = DropdownConfig(
        label="Embedder model",
        info="The model to use for generating speaker embeddings.",
        choices=list(EmbedderModel),
        value=EmbedderModel.CONTENTVEC,
        exclude_value=True,
    )
    # Hidden until the user opts into a custom embedder; rendered lazily.
    custom_embedder_model: DropdownConfig = DropdownConfig(
        label="Custom embedder model",
        info="Select a custom embedder model from the dropdown.",
        value=None,
        exclude_value=True,
        visible=False,
        render=False,
    )
125
+
126
+
127
class GenerationConfig(BaseTabConfig):
    """
    Common component configuration settings for generation tabs.

    Attributes
    ----------
    voice_model : DropdownConfig
        Configuration settings for a voice model dropdown component.
    f0_methods : DropdownConfig
        Configuration settings for a pitch extraction algorithms
        dropdown component.
    index_rate : SliderConfig
        Configuration settings for an index rate slider component.
    rms_mix_rate : SliderConfig
        Configuration settings for a RMS mix rate slider component.
    protect_rate : SliderConfig
        Configuration settings for a protect rate slider component.
    hop_length : SliderConfig
        Configuration settings for a hop length slider component.
    split_voice : CheckboxConfig
        Configuration settings for a split voice checkbox component.
    autotune_voice : CheckboxConfig
        Configuration settings for an autotune voice checkbox component.
    autotune_strength : SliderConfig
        Configuration settings for an autotune strength slider
        component.
    sid : NumberConfig
        Configuration settings for a speaker ID number component.
    output_sr : DropdownConfig
        Configuration settings for an output sample rate dropdown
        component.
    output_format : DropdownConfig
        Configuration settings for an output format dropdown
        component.
    output_name : TextboxConfig
        Configuration settings for an output name textbox component.

    See Also
    --------
    BaseTabConfig
        Parent model defining common component configuration settings
        for UI tabs.

    """

    voice_model: DropdownConfig = DropdownConfig(
        label="Voice model",
        info="Select a model to use for voice conversion.",
        value=None,
        render=False,
        exclude_value=True,
    )
    f0_methods: DropdownConfig = DropdownConfig(
        label="Pitch extraction algorithm(s)",
        info=(
            "If more than one method is selected, then the median of the pitch values"
            " extracted by each method is used. RMVPE is recommended for most cases and"
            " is the default when no method is selected."
        ),
        value=[F0Method.RMVPE],
        choices=list(F0Method),
        multiselect=True,
    )
    index_rate: SliderConfig = SliderConfig(
        label="Index rate",
        info=(
            "Increase to bias the conversion towards the accent of the voice model."
            " Decrease to potentially reduce artifacts coming from the voice"
            " model.<br><br><br>"
        ),
        value=0.3,
        minimum=0.0,
        maximum=1.0,
    )
    rms_mix_rate: SliderConfig = SliderConfig(
        label="RMS mix rate",
        info=(
            "How much to mimic the loudness (0) of the input voice or a fixed loudness"
            " (1). A value of 1 is recommended for most cases.<br><br>"
        ),
        value=1.0,
        minimum=0.0,
        maximum=1.0,
    )
    protect_rate: SliderConfig = SliderConfig(
        label="Protect rate",
        info=(
            "Controls the extent to which consonants and breathing sounds are protected"
            " from artifacts. A higher value offers more protection but may worsen the"
            " indexing effect.<br><br>"
        ),
        value=0.33,
        minimum=0.0,
        maximum=0.5,
    )

    hop_length: SliderConfig = SliderConfig.hop_length(
        label="Hop length",
        info=(
            "How often the CREPE-based pitch extraction method checks for pitch changes"
            " measured in milliseconds. Lower values lead to longer conversion times"
            " and a higher risk of voice cracks, but better pitch accuracy."
        ),
        visible=True,
    )

    split_voice: CheckboxConfig = CheckboxConfig(
        label="Split input voice",
        info=(
            "Whether to split the input voice track into smaller segments before"
            " converting it. This can improve output quality for longer voice tracks."
        ),
        value=False,
    )
    autotune_voice: CheckboxConfig = CheckboxConfig(
        label="Autotune converted voice",
        info="Whether to apply autotune to the converted voice.<br><br>",
        value=False,
        exclude_value=True,
    )
    autotune_strength: SliderConfig = SliderConfig(
        label="Autotune intensity",
        info=(
            "Higher values result in stronger snapping to the chromatic grid and"
            " artifacting."
        ),
        value=1.0,
        minimum=0.0,
        maximum=1.0,
        visible=False,
    )
    sid: NumberConfig = NumberConfig(
        label="Speaker ID",
        info="Speaker ID for multi-speaker-models.",
        value=0,
        precision=0,
    )
    output_sr: DropdownConfig = DropdownConfig(
        label="Output sample rate",
        info="The sample rate of the mixed output track.",
        value=SampleRate.HZ_44100,
        choices=list(SampleRate),
    )
    output_format: DropdownConfig = DropdownConfig(
        label="Output format",
        info="The audio format of the mixed output track.",
        value=AudioExt.MP3,
        choices=list(AudioExt),
    )
    output_name: TextboxConfig = TextboxConfig(
        label="Output name",
        info="If no name is provided, a suitable name will be generated automatically.",
        value=None,
        placeholder="Ultimate RVC output",
        exclude_value=True,
    )
279
+
280
+
281
class SongGenerationConfig(GenerationConfig):
    """
    Common component configuration settings for song generation tabs.

    Attributes
    ----------
    source_type : DropdownConfig
        Configuration settings for a source type dropdown component.
    source : TextboxConfig
        Configuration settings for an input source textbox component.
    cached_song : DropdownConfig
        Configuration settings for a cached song dropdown component.
    clean_voice : CheckboxConfig
        Configuration settings for a clean voice checkbox component.
    clean_strength : SliderConfig
        Configuration settings for a clean strength slider component.
    room_size : SliderConfig
        Configuration settings for a room size slider component.
    wet_level : SliderConfig
        Configuration settings for a wetness level slider component.
    dry_level : SliderConfig
        Configuration settings for a dryness level slider component.
    damping : SliderConfig
        Configuration settings for a damping level slider component.
    main_gain : SliderConfig
        Configuration settings for a main gain slider component.
    inst_gain : SliderConfig
        Configuration settings for an instrumentals gain slider
        component.
    backup_gain : SliderConfig
        Configuration settings for a backup vocals gain slider
        component.

    See Also
    --------
    GenerationConfig
        Parent model defining common component configuration settings
        for generation tabs.

    """

    source_type: DropdownConfig = DropdownConfig(
        label="Source type",
        info="The type of source to retrieve a song from.",
        value=SongSourceType.LOCAL_FILE,
        choices=list(SongSourceType),
        type="index",
        exclude_value=True,
    )
    source: TextboxConfig = TextboxConfig(
        label="Source",
        info="Link to a song on YouTube or the full path of a local audio file.",
        value=None,
        exclude_value=True,
    )
    cached_song: DropdownConfig = DropdownConfig(
        label="Source",
        info="Select a song from the list of cached songs.",
        value=None,
        visible=False,
        render=False,
        exclude_value=True,
    )
    clean_voice: CheckboxConfig = CheckboxConfig(
        label="Clean converted voice",
        info=(
            "Whether to clean the converted voice using noise reduction"
            " algorithms.<br><br>"
        ),
        value=False,
        exclude_value=True,
    )
    clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False)
    room_size: SliderConfig = SliderConfig(
        label="Room size",
        info=(
            "Size of the room which reverb effect simulates. Increase for longer reverb"
            " time."
        ),
        value=0.15,
        minimum=0.0,
        maximum=1.0,
    )
    wet_level: SliderConfig = SliderConfig(
        label="Wetness level",
        info="Loudness of converted vocals with reverb effect applied.",
        value=0.2,
        minimum=0.0,
        maximum=1.0,
    )
    dry_level: SliderConfig = SliderConfig(
        label="Dryness level",
        info="Loudness of converted vocals without reverb effect applied.",
        value=0.8,
        minimum=0.0,
        maximum=1.0,
    )
    damping: SliderConfig = SliderConfig(
        label="Damping level",
        info="Absorption of high frequencies in reverb effect.",
        value=0.7,
        minimum=0.0,
        maximum=1.0,
    )
    main_gain: SliderConfig = SliderConfig.gain(
        label="Main gain",
        info="The gain to apply to the main vocals.",
    )
    inst_gain: SliderConfig = SliderConfig.gain(
        label="Instrumentals gain",
        info="The gain to apply to the instrumentals.",
    )
    backup_gain: SliderConfig = SliderConfig.gain(
        label="Backup gain",
        info="The gain to apply to the backup vocals.",
    )
397
+
398
+
399
class SpeechGenerationConfig(GenerationConfig):
    """
    Common component configuration settings for speech generation tabs.

    Attributes
    ----------
    source_type : DropdownConfig
        Configuration settings for a source type dropdown component.
    source : TextboxConfig
        Configuration settings for an input source textbox component.
    edge_tts_voice : DropdownConfig
        Configuration settings for an Edge TTS voice dropdown
        component.
    n_octaves : SliderConfig
        Configuration settings for an octave pitch shift slider
        component.
    n_semitones : SliderConfig
        Configuration settings for a semitone pitch shift slider
        component.
    tts_pitch_shift : SliderConfig
        Configuration settings for a TTS pitch shift slider
        component.
    tts_speed_change : SliderConfig
        Configuration settings for a TTS speed change slider
        component.
    tts_volume_change : SliderConfig
        Configuration settings for a TTS volume change slider
        component.
    clean_voice : CheckboxConfig
        Configuration settings for a clean voice checkbox
        component.
    clean_strength : SliderConfig
        Configuration settings for a clean strength slider
        component.
    output_gain : SliderConfig
        Configuration settings for an output gain slider component.

    See Also
    --------
    GenerationConfig
        Parent model defining common component configuration settings
        for generation tabs.

    """

    source_type: DropdownConfig = DropdownConfig(
        label="Source type",
        info="The type of source to generate speech from.",
        value=SpeechSourceType.TEXT,
        choices=list(SpeechSourceType),
        type="index",
        exclude_value=True,
    )
    source: TextboxConfig = TextboxConfig(
        label="Source",
        info="Text to generate speech from",
        value=None,
        exclude_value=True,
    )
    edge_tts_voice: DropdownConfig = DropdownConfig(
        label="Edge TTS voice",
        info="Select a voice to use for text to speech conversion.",
        value=None,
        render=False,
        exclude_value=True,
    )
    n_octaves: SliderConfig = SliderConfig.octave_shift(
        label="Octave shift",
        info=(
            "The number of octaves to pitch-shift the converted speech by. Use 1 for"
            " male-to-female and -1 for vice-versa."
        ),
    )
    n_semitones: SliderConfig = SliderConfig.semitone_shift(
        label="Semitone shift",
        info="The number of semi-tones to pitch-shift the converted speech by.",
    )
    tts_pitch_shift: SliderConfig = SliderConfig(
        label="Edge TTS pitch shift",
        info=(
            "The number of hertz to shift the pitch of the speech generated by Edge"
            " TTS."
        ),
        value=0,
        minimum=-100,
        maximum=100,
        step=1,
    )
    tts_speed_change: SliderConfig = SliderConfig(
        label="TTS speed change",
        info="The percentual change to the speed of the speech generated by Edge TTS.",
        value=0,
        minimum=-50,
        maximum=100,
        step=1,
    )
    tts_volume_change: SliderConfig = SliderConfig(
        label="TTS volume change",
        info="The percentual change to the volume of the speech generated by Edge TTS.",
        value=0,
        minimum=-100,
        maximum=100,
        step=1,
    )
    clean_voice: CheckboxConfig = CheckboxConfig(
        label="Clean converted voice",
        info=(
            "Whether to clean the converted voice using noise reduction"
            " algorithms.<br><br>"
        ),
        value=True,
        exclude_value=True,
    )
    clean_strength: SliderConfig = SliderConfig.clean_strength(visible=True)
    output_gain: SliderConfig = SliderConfig.gain(
        label="Output gain",
        info="The gain to apply to the converted speech.<br><br>",
    )
517
+
518
+
519
class TrainingConfig(BaseTabConfig):
    """
    Common component configuration settings for training tabs.

    Attributes
    ----------
    dataset_type : DropdownConfig
        Configuration settings for a dataset type dropdown component.
    dataset : DropdownConfig
        Configuration settings for a dataset dropdown component.
    dataset_name : TextboxConfig
        Configuration settings for a dataset name textbox component.
    preprocess_model : DropdownConfig
        Configuration settings for a model name dropdown component
        for audio preprocessing.
    sample_rate : DropdownConfig
        Configuration settings for a sample rate dropdown component.
    filter_audio : CheckboxConfig
        Configuration settings for a filter audio checkbox component.
    clean_audio : CheckboxConfig
        Configuration settings for a clean audio checkbox component.
    clean_strength : SliderConfig
        Configuration settings for a clean strength slider component.
    split_method : DropdownConfig
        Configuration settings for an audio splitting method dropdown
        component.
    chunk_len : SliderConfig
        Configuration settings for a chunk length slider component.
    overlap_len : SliderConfig
        Configuration settings for an overlap length slider component.
    preprocess_cores : SliderConfig
        Configuration settings for a CPU cores slider component for
        preprocessing.
    extract_model : DropdownConfig
        Configuration settings for a model name dropdown component for
        feature extraction.
    f0_method : DropdownConfig
        Configuration settings for an F0 method dropdown component.
    hop_length : SliderConfig
        Configuration settings for a hop length slider component.
    include_mutes : SliderConfig
        Configuration settings for an include mutes slider component.
    extraction_cores : SliderConfig
        Configuration settings for a CPU cores slider component for
        feature extraction.
    extraction_acceleration : DropdownConfig
        Configuration settings for a hardware acceleration component for
        feature extraction.
    extraction_gpus : DropdownConfig
        Configuration settings for a GPU dropdown component for feature
        extraction.
    train_model : DropdownConfig
        Configuration settings for a model name dropdown component for
        training.
    num_epochs : SliderConfig
        Configuration settings for a number of epochs slider component.
    batch_size : SliderConfig
        Configuration settings for a batch size slider component.
    detect_overtraining : CheckboxConfig
        Configuration settings for a detect overtraining checkbox
        component.
    overtraining_threshold : SliderConfig
        Configuration settings for an overtraining threshold slider
        component.
    vocoder : DropdownConfig
        Configuration settings for a vocoder dropdown component.
    index_algorithm : DropdownConfig
        Configuration settings for an index algorithm dropdown
        component.
    pretrained_type : DropdownConfig
        Configuration settings for a pretrained model type dropdown
        component.
    custom_pretrained_model : DropdownConfig
        Configuration settings for a custom pretrained model dropdown
        component.
    save_interval : SliderConfig
        Configuration settings for a save-interval slider component.
    save_all_checkpoints : CheckboxConfig
        Configuration settings for a save-all-checkpoints checkbox
        component.
    save_all_weights : CheckboxConfig
        Configuration settings for a save-all-weights checkbox
        component.
    clear_saved_data : CheckboxConfig
        Configuration settings for a clear-saved-data checkbox
        component.
    upload_model : CheckboxConfig
        Configuration settings for an upload voice model checkbox
        component.
    upload_name : TextboxConfig
        Configuration settings for an upload name textbox component.
    training_acceleration : DropdownConfig
        Configuration settings for a hardware acceleration component for
        training.
    training_gpus : DropdownConfig
        Configuration settings for a GPU dropdown component for
        training.
    preload_dataset : CheckboxConfig
        Configuration settings for a preload dataset checkbox component.
    reduce_memory_usage : CheckboxConfig
        Configuration settings for a reduce-memory-usage checkbox
        component.

    See Also
    --------
    BaseTabConfig
        Parent model defining common component configuration settings
        for UI tabs.

    """

    dataset_type: DropdownConfig = DropdownConfig(
        label="Dataset type",
        info="Select the type of dataset to preprocess.",
        value=DatasetType.NEW_DATASET,
        choices=list(DatasetType),
        exclude_value=True,
    )
    dataset: DropdownConfig = DropdownConfig(
        label="Dataset path",
        info=(
            "The path to an existing dataset. Either select a path to a previously"
            " created dataset or provide a path to an external dataset."
        ),
        value=None,
        allow_custom_value=True,
        visible=False,
        render=False,
        exclude_value=True,
    )
    dataset_name: TextboxConfig = TextboxConfig(
        label="Dataset name",
        info=(
            "The name of the new dataset. If the dataset already exists, the provided"
            " audio files will be added to it."
        ),
        value="My dataset",
        exclude_value=True,
    )
    preprocess_model: DropdownConfig = DropdownConfig(
        label="Model name",
        info=(
            "Name of the model to preprocess the given dataset for. Either select an"
            " existing model from the dropdown or provide the name of a new model."
        ),
        value="My model",
        allow_custom_value=True,
        render=False,
        exclude_value=True,
    )
    sample_rate: DropdownConfig = DropdownConfig(
        label="Sample rate",
        info="Target sample rate for the audio files in the provided dataset.",
        value=TrainingSampleRate.HZ_40K,
        choices=list(TrainingSampleRate),
    )
    filter_audio: CheckboxConfig = CheckboxConfig(
        label="Filter audio",
        info=(
            "Whether to remove low-frequency sounds from the audio files in the"
            " provided dataset by applying a high-pass butterworth filter.<br><br>"
        ),
        value=True,
    )
    clean_audio: CheckboxConfig = CheckboxConfig(
        label="Clean audio",
        info=(
            "Whether to clean the audio files in the provided dataset using noise"
            " reduction algorithms.<br><br><br>"
        ),
        value=False,
        exclude_value=True,
    )
    clean_strength: SliderConfig = SliderConfig.clean_strength(visible=False)
    split_method: DropdownConfig = DropdownConfig(
        label="Audio splitting method",
        info=(
            "The method to use for splitting the audio files in the provided dataset."
            " Use the `Skip` method to skip splitting if the audio files are already"
            " split. Use the `Simple` method if excessive silence has already been"
            " removed from the audio files. Use the `Automatic` method for automatic"
            " silence detection and splitting around it."
        ),
        value=AudioSplitMethod.AUTOMATIC,
        choices=list(AudioSplitMethod),
        exclude_value=True,
    )
    chunk_len: SliderConfig = SliderConfig(
        label="Chunk length",
        info="Length of split audio chunks.",
        value=3.0,
        minimum=0.5,
        maximum=5.0,
        step=0.1,
        visible=False,
    )
    overlap_len: SliderConfig = SliderConfig(
        label="Overlap length",
        info="Length of overlap between split audio chunks.",
        value=0.3,
        minimum=0.0,
        maximum=0.4,
        step=0.1,
        visible=False,
    )
    preprocess_cores: SliderConfig = SliderConfig.cpu_cores()

    extract_model: DropdownConfig = DropdownConfig(
        label="Model name",
        info=(
            "Name of the model with an associated preprocessed dataset to extract"
            " training features from. When a new dataset is preprocessed, its"
            " associated model is selected by default."
        ),
        value=None,
        render=False,
        exclude_value=True,
    )
    f0_method: DropdownConfig = DropdownConfig(
        label="F0 method",
        info="The method to use for extracting pitch features.",
        value=TrainingF0Method.RMVPE,
        choices=list(TrainingF0Method),
        exclude_value=True,
    )

    hop_length: SliderConfig = SliderConfig.hop_length(
        label="Hop length",
        info="The hop length to use for extracting pitch features.<br><br>",
        visible=False,
    )
    include_mutes: SliderConfig = SliderConfig(
        label="Include mutes",
        info=(
            "The number of mute audio files to include in the generated training file"
            " list. Adding silent files enables the training model to handle pure"
            " silence in inferred audio files. If the preprocessed audio dataset"
            " already contains segments of pure silence, set this to 0."
        ),
        value=0,
        minimum=0,
        maximum=10,
        step=1,
    )
    extraction_cores: SliderConfig = SliderConfig.cpu_cores()
    extraction_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration()
    extraction_gpus: DropdownConfig = DropdownConfig.gpu()

    train_model: DropdownConfig = DropdownConfig(
        label="Model name",
        info=(
            "Name of the model to train. When training features are extracted for a new"
            " model, its name is selected by default."
        ),
        value=None,
        render=False,
        exclude_value=True,
    )
    num_epochs: SliderConfig = SliderConfig(
        label="Number of epochs",
        info=(
            "The number of epochs to train the voice model. A higher number can improve"
            " voice model performance but may lead to overtraining."
        ),
        value=500,
        minimum=1,
        maximum=5000,
        step=1,
    )
    batch_size: SliderConfig = SliderConfig(
        label="Batch size",
        info=(
            "The number of samples in each training batch. It is advisable to align"
            " this value with the available VRAM of your GPU."
        ),
        value=16,
        minimum=1,
        maximum=128,
        step=1,
    )
    detect_overtraining: CheckboxConfig = CheckboxConfig(
        label="Detect overtraining",
        info=(
            "Whether to detect overtraining to prevent the voice model from learning"
            " the training data too well and losing the ability to generalize to new"
            " data."
        ),
        value=True,
        exclude_value=True,
    )
    overtraining_threshold: SliderConfig = SliderConfig(
        label="Overtraining threshold",
        info=(
            "The maximum number of epochs to continue training without any observed"
            " improvement in voice model performance."
        ),
        value=500,
        minimum=1,
        maximum=1000,
        visible=False,
    )
    vocoder: DropdownConfig = DropdownConfig(
        label="Vocoder",
        info=(
            "The vocoder to use for audio synthesis during training. HiFi-GAN provides"
            " basic audio fidelity, while RefineGAN provides the highest audio"
            " fidelity."
        ),
        value=Vocoder.HIFI_GAN,
        choices=list(Vocoder),
    )
    index_algorithm: DropdownConfig = DropdownConfig(
        label="Index algorithm",
        info=(
            "The method to use for generating an index file for the trained voice"
            " model. `KMeans` is particularly useful for large datasets."
        ),
        value=IndexAlgorithm.AUTO,
        choices=list(IndexAlgorithm),
    )
    pretrained_type: DropdownConfig = DropdownConfig(
        label="Pretrained model type",
        info=(
            "The type of pretrained model to finetune the voice model on. `None` will"
            " train the voice model from scratch, while `Default` will use a pretrained"
            " model tailored to the specific voice model architecture. `Custom` will"
            " use a custom pretrained that you provide."
        ),
        value=PretrainedType.DEFAULT,
        choices=list(PretrainedType),
        exclude_value=True,
    )
    custom_pretrained_model: DropdownConfig = DropdownConfig(
        label="Custom pretrained model",
        info="Select a custom pretrained model to finetune from the dropdown.",
        value=None,
        visible=False,
        render=False,
        exclude_value=True,
    )
    save_interval: SliderConfig = SliderConfig(
        label="Save interval",
        info=(
            # NOTE(review): original text duplicated "to to"; fixed here.
            "The epoch interval at which to save voice model weights and"
            " checkpoints. The best model weights are always saved regardless of this"
            " setting."
        ),
        value=10,
        minimum=1,
        maximum=100,
        step=1,
    )
    save_all_checkpoints: CheckboxConfig = CheckboxConfig(
        label="Save all checkpoints",
        info=(
            "Whether to save a unique checkpoint at each save interval. If not enabled,"
            " only the latest checkpoint will be saved at each interval."
        ),
        value=True,
    )
    save_all_weights: CheckboxConfig = CheckboxConfig(
        label="Save all weights",
        info=(
            "Whether to save unique voice model weights at each save interval. If not"
            " enabled, only the best voice model weights will be saved."
        ),
        value=True,
    )
    clear_saved_data: CheckboxConfig = CheckboxConfig(
        label="Clear saved data",
        info=(
            "Whether to delete any existing training data associated with the voice"
            " model before training commences. Enable this setting only if you are"
            " training a new voice model from scratch or restarting training."
        ),
        value=False,
    )
    upload_model: CheckboxConfig = CheckboxConfig(
        label="Upload voice model",
        info=(
            "Whether to automatically upload the trained voice model so that it can be"
            " used for generation tasks within the Ultimate RVC app."
        ),
        value=False,
        exclude_value=True,
    )
    upload_name: TextboxConfig = TextboxConfig(
        label="Upload name",
        info="The name to give the uploaded voice model.",
        value=None,
        visible=False,
        exclude_value=True,
    )
    training_acceleration: DropdownConfig = DropdownConfig.hardware_acceleration()
    training_gpus: DropdownConfig = DropdownConfig.gpu()
    preload_dataset: CheckboxConfig = CheckboxConfig(
        label="Preload dataset",
        info=(
            "Whether to preload all training data into GPU memory. This can improve"
            " training speed but requires a lot of VRAM.<br><br>"
        ),
        value=True,
    )
    reduce_memory_usage: CheckboxConfig = CheckboxConfig(
        label="Reduce memory usage",
        info=(
            "Whether to reduce VRAM usage at the cost of slower training speed by"
            " enabling activation checkpointing. This is useful for GPUs with limited"
            " memory (e.g., <6GB VRAM) or when training with a batch size larger than"
            " what your GPU can normally accommodate."
        ),
        value=False,
    )
932
+
933
+
934
+
935
  class SegmentSize(IntEnum):
936
  """Enumeration of segment sizes for audio separation."""
937