ozipoetra commited on
Commit
754f043
Β·
1 Parent(s): aac14c4

refactor: create own RVC library from ultimate_rvc

Browse files

- Create lib/rvc/ with essential voice conversion components
- lib/rvc/algorithm/: encoders, generators, synthesizers
- lib/rvc/predictors/: F0 extraction (RMVPE, FCPE, CREPE)
- lib/rvc/tools/: audio splitting utilities
- lib/rvc/converter.py: VoiceConverter class
- lib/rvc/pipeline.py: voice conversion pipeline
- lib/rvc/config.py: device and model configuration
- Move configs (48000.json, 40000.json, 32000.json) to configs/
- Remove ultimate_rvc/ directory (no longer needed)
- Update lib/jobs.py to use new lib.rvc.converter

This creates a self-contained RVC library based on ultimate-rvc,
removing the external dependency and allowing for easier maintenance.

Files changed (42) hide show
  1. {ultimate_rvc/rvc/configs β†’ configs}/32000.json +0 -0
  2. {ultimate_rvc/rvc/configs β†’ configs}/40000.json +0 -0
  3. {ultimate_rvc/rvc/configs β†’ configs}/48000.json +0 -0
  4. lib/jobs.py +1 -1
  5. lib/rvc/__init__.py +16 -0
  6. lib/rvc/algorithm/__init__.py +2 -0
  7. {ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/attentions.py +1 -1
  8. {ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/commons.py +0 -0
  9. {ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/encoders.py +4 -4
  10. lib/rvc/algorithm/generators/__init__.py +2 -0
  11. {ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/generators/hifigan.py +2 -2
  12. {ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/generators/hifigan_mrf.py +0 -0
  13. {ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/generators/hifigan_nsf.py +3 -3
  14. {ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/generators/refinegan.py +1 -1
  15. {ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/modules.py +1 -1
  16. {ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/normalization.py +0 -0
  17. {ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/residuals.py +2 -2
  18. {ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/synthesizers.py +7 -7
  19. {ultimate_rvc β†’ lib}/rvc/common.py +2 -2
  20. {ultimate_rvc/rvc/configs β†’ lib/rvc}/config.py +1 -1
  21. ultimate_rvc/rvc/infer/infer.py β†’ lib/rvc/converter.py +35 -122
  22. {ultimate_rvc/rvc/infer β†’ lib/rvc}/pipeline.py +5 -9
  23. {ultimate_rvc/rvc/lib β†’ lib/rvc}/predictors/F0Extractor.py +3 -3
  24. {ultimate_rvc/rvc/lib β†’ lib/rvc}/predictors/FCPE.py +0 -0
  25. {ultimate_rvc/rvc/lib β†’ lib/rvc}/predictors/RMVPE.py +0 -0
  26. lib/rvc/predictors/__init__.py +2 -0
  27. {ultimate_rvc/rvc/lib β†’ lib/rvc}/predictors/f0.py +2 -2
  28. lib/rvc/tools/__init__.py +2 -0
  29. {ultimate_rvc/rvc/lib β†’ lib/rvc}/tools/split_audio.py +0 -0
  30. {ultimate_rvc/rvc/lib β†’ lib/rvc}/utils.py +1 -1
  31. ultimate_rvc/__init__.py +0 -0
  32. ultimate_rvc/common.py +0 -37
  33. ultimate_rvc/rvc/__init__.py +0 -4
  34. ultimate_rvc/rvc/configs/__init__.py +0 -0
  35. ultimate_rvc/rvc/infer/__init__.py +0 -0
  36. ultimate_rvc/rvc/infer/typing_extra.py +0 -57
  37. ultimate_rvc/rvc/lib/__init__.py +0 -0
  38. ultimate_rvc/rvc/lib/algorithm/__init__.py +0 -0
  39. ultimate_rvc/rvc/lib/algorithm/generators/__init__.py +0 -0
  40. ultimate_rvc/rvc/lib/predictors/__init__.py +0 -0
  41. ultimate_rvc/rvc/lib/tools/__init__.py +0 -0
  42. ultimate_rvc/typing_extra.py +0 -154
{ultimate_rvc/rvc/configs β†’ configs}/32000.json RENAMED
File without changes
{ultimate_rvc/rvc/configs β†’ configs}/40000.json RENAMED
File without changes
{ultimate_rvc/rvc/configs β†’ configs}/48000.json RENAMED
File without changes
lib/jobs.py CHANGED
@@ -34,7 +34,7 @@ def get_vc():
34
  global _vc_instance
35
  if _vc_instance is None:
36
  logger.info("Loading VoiceConverter…")
37
- from ultimate_rvc.rvc.infer.infer import VoiceConverter
38
  _vc_instance = VoiceConverter()
39
  logger.info("VoiceConverter ready.")
40
  return _vc_instance
 
34
  global _vc_instance
35
  if _vc_instance is None:
36
  logger.info("Loading VoiceConverter…")
37
+ from lib.rvc.converter import VoiceConverter
38
  _vc_instance = VoiceConverter()
39
  logger.info("VoiceConverter ready.")
40
  return _vc_instance
lib/rvc/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """RVC Voice Conversion Library.
2
+
3
+ This is a minimal rewrite of the ultimate-rvc library for voice conversion.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ from pathlib import Path
8
+
9
+ # Base directory for RVC resources
10
+ BASE_DIR = Path(__file__).parent.parent.parent
11
+
12
+ # Models directory
13
+ MODELS_DIR = BASE_DIR / "rvc_models"
14
+
15
+ # Configs directory (for model configs)
16
+ CONFIGS_DIR = BASE_DIR / "configs"
lib/rvc/algorithm/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ """Algorithm modules for RVC."""
2
+ from __future__ import annotations
{ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/attentions.py RENAMED
@@ -2,7 +2,7 @@ import math
2
 
3
  import torch
4
 
5
- from ultimate_rvc.rvc.lib.algorithm.commons import convert_pad_shape
6
 
7
 
8
  class MultiHeadAttention(torch.nn.Module):
 
2
 
3
  import torch
4
 
5
+ from lib.rvc.algorithm.commons import convert_pad_shape
6
 
7
 
8
  class MultiHeadAttention(torch.nn.Module):
{ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/commons.py RENAMED
File without changes
{ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/encoders.py RENAMED
@@ -3,10 +3,10 @@ import math
3
 
4
  import torch
5
 
6
- from ultimate_rvc.rvc.lib.algorithm.attentions import FFN, MultiHeadAttention
7
- from ultimate_rvc.rvc.lib.algorithm.commons import sequence_mask
8
- from ultimate_rvc.rvc.lib.algorithm.modules import WaveNet
9
- from ultimate_rvc.rvc.lib.algorithm.normalization import LayerNorm
10
 
11
  logger = logging.getLogger(__name__)
12
 
 
3
 
4
  import torch
5
 
6
+ from lib.rvc.algorithm.attentions import FFN, MultiHeadAttention
7
+ from lib.rvc.algorithm.commons import sequence_mask
8
+ from lib.rvc.algorithm.modules import WaveNet
9
+ from lib.rvc.algorithm.normalization import LayerNorm
10
 
11
  logger = logging.getLogger(__name__)
12
 
lib/rvc/algorithm/generators/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ """Generator modules for RVC vocoders."""
2
+ from __future__ import annotations
{ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/generators/hifigan.py RENAMED
@@ -6,8 +6,8 @@ import torch
6
  from torch.nn.utils import remove_weight_norm
7
  from torch.nn.utils.parametrizations import weight_norm
8
 
9
- from ultimate_rvc.rvc.lib.algorithm.commons import init_weights
10
- from ultimate_rvc.rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock
11
 
12
 
13
  class HiFiGANGenerator(torch.nn.Module):
 
6
  from torch.nn.utils import remove_weight_norm
7
  from torch.nn.utils.parametrizations import weight_norm
8
 
9
+ from lib.rvc.algorithm.commons import init_weights
10
+ from lib.rvc.algorithm.residuals import LRELU_SLOPE, ResBlock
11
 
12
 
13
  class HiFiGANGenerator(torch.nn.Module):
{ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/generators/hifigan_mrf.py RENAMED
File without changes
{ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/generators/hifigan_nsf.py RENAMED
@@ -7,9 +7,9 @@ from torch.nn.utils import remove_weight_norm
7
  from torch.nn.utils.parametrizations import weight_norm
8
  from torch.utils.checkpoint import checkpoint
9
 
10
- from ultimate_rvc.rvc.lib.algorithm.commons import init_weights
11
- from ultimate_rvc.rvc.lib.algorithm.generators.hifigan import SineGenerator
12
- from ultimate_rvc.rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock
13
 
14
 
15
  class SourceModuleHnNSF(torch.nn.Module):
 
7
  from torch.nn.utils.parametrizations import weight_norm
8
  from torch.utils.checkpoint import checkpoint
9
 
10
+ from lib.rvc.algorithm.commons import init_weights
11
+ from lib.rvc.algorithm.generators.hifigan import SineGenerator
12
+ from lib.rvc.algorithm.residuals import LRELU_SLOPE, ResBlock
13
 
14
 
15
  class SourceModuleHnNSF(torch.nn.Module):
{ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/generators/refinegan.py RENAMED
@@ -8,7 +8,7 @@ from torch.nn.utils import remove_weight_norm
8
  from torch.nn.utils.parametrizations import weight_norm
9
  from torch.utils.checkpoint import checkpoint
10
 
11
- from ultimate_rvc.rvc.lib.algorithm.commons import get_padding, init_weights
12
 
13
 
14
  class ResBlock(nn.Module):
 
8
  from torch.nn.utils.parametrizations import weight_norm
9
  from torch.utils.checkpoint import checkpoint
10
 
11
+ from lib.rvc.algorithm.commons import get_padding, init_weights
12
 
13
 
14
  class ResBlock(nn.Module):
{ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/modules.py RENAMED
@@ -1,6 +1,6 @@
1
  import torch
2
 
3
- from ultimate_rvc.rvc.lib.algorithm.commons import fused_add_tanh_sigmoid_multiply
4
 
5
 
6
  class WaveNet(torch.nn.Module):
 
1
  import torch
2
 
3
+ from lib.rvc.algorithm.commons import fused_add_tanh_sigmoid_multiply
4
 
5
 
6
  class WaveNet(torch.nn.Module):
{ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/normalization.py RENAMED
File without changes
{ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/residuals.py RENAMED
@@ -6,8 +6,8 @@ import torch
6
  from torch.nn.utils import remove_weight_norm
7
  from torch.nn.utils.parametrizations import weight_norm
8
 
9
- from ultimate_rvc.rvc.lib.algorithm.commons import get_padding, init_weights
10
- from ultimate_rvc.rvc.lib.algorithm.modules import WaveNet
11
 
12
  LRELU_SLOPE = 0.1
13
 
 
6
  from torch.nn.utils import remove_weight_norm
7
  from torch.nn.utils.parametrizations import weight_norm
8
 
9
+ from lib.rvc.algorithm.commons import get_padding, init_weights
10
+ from lib.rvc.algorithm.modules import WaveNet
11
 
12
  LRELU_SLOPE = 0.1
13
 
{ultimate_rvc/rvc/lib β†’ lib/rvc}/algorithm/synthesizers.py RENAMED
@@ -4,13 +4,13 @@ import logging
4
 
5
  import torch
6
 
7
- from ultimate_rvc.rvc.lib.algorithm.commons import rand_slice_segments, slice_segments
8
- from ultimate_rvc.rvc.lib.algorithm.encoders import PosteriorEncoder, TextEncoder
9
- from ultimate_rvc.rvc.lib.algorithm.generators.hifigan import HiFiGANGenerator
10
- from ultimate_rvc.rvc.lib.algorithm.generators.hifigan_mrf import HiFiGANMRFGenerator
11
- from ultimate_rvc.rvc.lib.algorithm.generators.hifigan_nsf import HiFiGANNSFGenerator
12
- from ultimate_rvc.rvc.lib.algorithm.generators.refinegan import RefineGANGenerator
13
- from ultimate_rvc.rvc.lib.algorithm.residuals import ResidualCouplingBlock
14
 
15
  logger = logging.getLogger(__name__)
16
 
 
4
 
5
  import torch
6
 
7
+ from lib.rvc.algorithm.commons import rand_slice_segments, slice_segments
8
+ from lib.rvc.algorithm.encoders import PosteriorEncoder, TextEncoder
9
+ from lib.rvc.algorithm.generators.hifigan import HiFiGANGenerator
10
+ from lib.rvc.algorithm.generators.hifigan_mrf import HiFiGANMRFGenerator
11
+ from lib.rvc.algorithm.generators.hifigan_nsf import HiFiGANNSFGenerator
12
+ from lib.rvc.algorithm.generators.refinegan import RefineGANGenerator
13
+ from lib.rvc.algorithm.residuals import ResidualCouplingBlock
14
 
15
  logger = logging.getLogger(__name__)
16
 
{ultimate_rvc β†’ lib}/rvc/common.py RENAMED
@@ -1,9 +1,9 @@
1
  """Common constants and functions for the RVC package."""
2
-
3
  from __future__ import annotations
4
 
5
  from pathlib import Path
6
 
7
  RVC_DIR = Path(__file__).resolve().parent
8
- RVC_CONFIGS_DIR = RVC_DIR / "configs"
 
9
  RVC_TRAINING_MODELS_DIR = RVC_DIR / "train" / "models"
 
1
  """Common constants and functions for the RVC package."""
 
2
  from __future__ import annotations
3
 
4
  from pathlib import Path
5
 
6
  RVC_DIR = Path(__file__).resolve().parent
7
+ RVC_CONFIGS_DIR = Path(__file__).resolve().parent.parent.parent / "configs"
8
+ RVC_MODELS_DIR = Path(__file__).resolve().parent.parent.parent / "rvc_models"
9
  RVC_TRAINING_MODELS_DIR = RVC_DIR / "train" / "models"
{ultimate_rvc/rvc/configs β†’ lib/rvc}/config.py RENAMED
@@ -4,7 +4,7 @@ import pathlib
4
 
5
  import torch
6
 
7
- from ultimate_rvc.rvc.common import RVC_CONFIGS_DIR
8
 
9
  version_config_paths = [
10
  os.path.join("48000.json"),
 
4
 
5
  import torch
6
 
7
+ from lib.rvc.common import RVC_CONFIGS_DIR
8
 
9
  version_config_paths = [
10
  os.path.join("48000.json"),
ultimate_rvc/rvc/infer/infer.py β†’ lib/rvc/converter.py RENAMED
@@ -1,4 +1,5 @@
1
- from typing import TYPE_CHECKING, Unpack
 
2
 
3
  import logging
4
  import os
@@ -7,14 +8,11 @@ import sys
7
  import time
8
  import traceback
9
 
10
- import soxr
11
-
12
- import numpy as np
13
-
14
- import torch
15
-
16
  import librosa
 
 
17
  import soundfile as sf
 
18
  from pedalboard import (
19
  Bitcrush,
20
  Chorus,
@@ -28,30 +26,29 @@ from pedalboard import (
28
  PitchShift,
29
  Reverb,
30
  )
 
31
 
32
- now_dir = pathlib.Path.cwd()
33
- sys.path.append(str(now_dir))
34
  import lazy_loader as lazy
35
 
36
- from ultimate_rvc.rvc.configs.config import Config
37
- from ultimate_rvc.rvc.infer.pipeline import Pipeline as VC
38
- from ultimate_rvc.rvc.infer.typing_extra import ConvertAudioKwArgs
39
- from ultimate_rvc.rvc.lib.algorithm.synthesizers import Synthesizer
40
- from ultimate_rvc.rvc.lib.tools.split_audio import merge_audio, process_audio
41
- from ultimate_rvc.rvc.lib.utils import load_audio_infer, load_embedding
42
- from ultimate_rvc.typing_extra import F0Method
43
 
44
  if TYPE_CHECKING:
45
  import noisereduce as nr
46
  else:
47
  nr = lazy.load("noisereduce")
48
 
49
- # logging.getLogger("httpx").setLevel(logging.WARNING)
50
- # logging.getLogger("httpcore").setLevel(logging.WARNING)
51
- # logging.getLogger("faiss").setLevel(logging.WARNING)
52
- # logging.getLogger("faiss.loader").setLevel(logging.WARNING)
53
  logger = logging.getLogger(__name__)
54
 
 
 
 
55
 
56
  class VoiceConverter:
57
  """
@@ -62,18 +59,16 @@ class VoiceConverter:
62
  """
63
  Initializes the VoiceConverter with default configuration, and sets up models and parameters.
64
  """
65
- self.config = Config() # Load configuration
66
- self.hubert_model = (
67
- None # Initialize the Hubert model (for embedding extraction)
68
- )
69
- self.last_embedder_model = None # Last used embedder model
70
- self.tgt_sr = None # Target sampling rate for the output audio
71
- self.net_g = None # Generator network for voice conversion
72
- self.vc = None # Voice conversion pipeline instance
73
- self.cpt = None # Checkpoint for loading model weights
74
- self.version = None # Model version
75
- self.n_spk = None # Number of speakers in the model
76
- self.use_f0 = None # Whether the model uses F0
77
  self.loaded_model = None
78
 
79
  def load_hubert(self, embedder_model: str, embedder_model_custom: str = None):
@@ -88,18 +83,16 @@ class VoiceConverter:
88
  self.hubert_model = load_embedding(embedder_model, embedder_model_custom)
89
  self.hubert_model = self.hubert_model.to(self.config.device).float()
90
  self.hubert_model.eval()
91
- # Disable gradient tracking permanently for inference-only model
92
  for param in self.hubert_model.parameters():
93
  param.requires_grad_(False)
94
- # Compile with torch.compile if available (torch 2.0+) for CPU kernel fusion
95
  if hasattr(torch, "compile"):
96
  try:
97
  self.hubert_model = torch.compile(
98
  self.hubert_model, mode="reduce-overhead", fullgraph=False
99
  )
100
  logger.info("HuBERT compiled with torch.compile (reduce-overhead)")
101
- except Exception as _ce:
102
- logger.info("torch.compile skipped: %s", _ce)
103
 
104
  @staticmethod
105
  def remove_audio_noise(data, sr, reduction_strength=0.7):
@@ -113,7 +106,6 @@ class VoiceConverter:
113
 
114
  """
115
  try:
116
-
117
  reduced_noise = nr.reduce_noise(
118
  y=data,
119
  sr=sr,
@@ -140,15 +132,7 @@ class VoiceConverter:
140
  print(f"Saving audio as {output_format}...")
141
  audio, sample_rate = librosa.load(input_path, sr=None)
142
  common_sample_rates = [
143
- 8000,
144
- 11025,
145
- 12000,
146
- 16000,
147
- 22050,
148
- 24000,
149
- 32000,
150
- 44100,
151
- 48000,
152
  ]
153
  target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
154
  audio = librosa.resample(
@@ -158,7 +142,7 @@ class VoiceConverter:
158
  res_type="soxr_vhq",
159
  )
160
  sf.write(output_path, audio, target_sr, format=output_format.lower())
161
- return output_path
162
  except Exception as error:
163
  print(f"An error occurred converting the audio format: {error}")
164
 
@@ -251,7 +235,7 @@ class VoiceConverter:
251
  sid: int = 0,
252
  proposed_pitch: bool = False,
253
  proposed_pitch_threshold: float = 155.0,
254
- **kwargs: Unpack[ConvertAudioKwArgs],
255
  ):
256
  """
257
  Performs voice conversion on the input audio.
@@ -327,7 +311,7 @@ class VoiceConverter:
327
  sid=sid,
328
  audio=c,
329
  pitch=pitch,
330
- f0_method=f0_method or F0Method.RMVPE,
331
  file_index=file_index,
332
  index_rate=index_rate,
333
  pitch_guidance=self.use_f0,
@@ -389,73 +373,6 @@ class VoiceConverter:
389
  elapsed_time,
390
  )
391
 
392
- def convert_audio_batch(
393
- self,
394
- audio_input_paths: str,
395
- audio_output_path: str,
396
- **kwargs,
397
- ):
398
- """
399
- Performs voice conversion on a batch of input audio files.
400
-
401
- Args:
402
- audio_input_paths (str): List of paths to the input audio files.
403
- audio_output_path (str): Path to the output audio file.
404
- resample_sr (int, optional): Resample sampling rate. Default is 0.
405
- sid (int, optional): Speaker ID. Default is 0.
406
- **kwargs: Additional keyword arguments.
407
-
408
- """
409
- pid = os.getpid()
410
- try:
411
- with pathlib.Path(os.path.join(now_dir, "assets", "infer_pid.txt")).open(
412
- "w",
413
- ) as pid_file:
414
- pid_file.write(str(pid))
415
- start_time = time.time()
416
- print(f"Converting audio batch '{audio_input_paths}'...")
417
- audio_files = [
418
- f
419
- for f in os.listdir(audio_input_paths)
420
- if f.lower().endswith(
421
- (
422
- "wav",
423
- "mp3",
424
- "flac",
425
- "ogg",
426
- "opus",
427
- "m4a",
428
- "mp4",
429
- "aac",
430
- "alac",
431
- "wma",
432
- "aiff",
433
- "webm",
434
- "ac3",
435
- ),
436
- )
437
- ]
438
- print(f"Detected {len(audio_files)} audio files for inference.")
439
- for a in audio_files:
440
- new_input = os.path.join(audio_input_paths, a)
441
- new_output = os.path.splitext(a)[0] + "_output.wav"
442
- new_output = os.path.join(audio_output_path, new_output)
443
- if pathlib.Path(new_output).exists():
444
- continue
445
- self.convert_audio(
446
- audio_input_path=new_input,
447
- audio_output_path=new_output,
448
- **kwargs,
449
- )
450
- print(f"Conversion completed at '{audio_input_paths}'.")
451
- elapsed_time = time.time() - start_time
452
- print(f"Batch conversion completed in {elapsed_time:.2f} seconds.")
453
- except Exception as error:
454
- print(f"An error occurred during audio batch conversion: {error}")
455
- print(traceback.format_exc())
456
- finally:
457
- pathlib.Path(os.path.join(now_dir, "assets", "infer_pid.txt")).unlink()
458
-
459
  def get_vc(self, weight_root, sid):
460
  """
461
  Loads the voice conversion model and sets up the pipeline.
@@ -509,9 +426,7 @@ class VoiceConverter:
509
  try:
510
  self.cpt = torch.load(weight_root, map_location="cpu", weights_only=False)
511
  except Exception:
512
- # Fallback for models saved with newer pickle protocols (e.g. protocol 83)
513
  import pickle
514
- import io
515
  try:
516
  with open(weight_root, "rb") as f:
517
  self.cpt = pickle.load(f)
@@ -541,18 +456,16 @@ class VoiceConverter:
541
  self.net_g.load_state_dict(self.cpt["weight"], strict=False)
542
  self.net_g = self.net_g.to(self.config.device).float()
543
  self.net_g.eval()
544
- # Disable gradient tracking for all synthesizer params
545
  for param in self.net_g.parameters():
546
  param.requires_grad_(False)
547
- # Compile synthesizer for faster CPU inference
548
  if hasattr(torch, "compile"):
549
  try:
550
  self.net_g = torch.compile(
551
  self.net_g, mode="reduce-overhead", fullgraph=False
552
  )
553
  logger.info("Synthesizer compiled with torch.compile")
554
- except Exception as _ce:
555
- logger.info("torch.compile skipped for net_g: %s", _ce)
556
 
557
  def setup_vc_instance(self):
558
  """
 
1
+ """Voice Converter for RVC."""
2
+ from __future__ import annotations
3
 
4
  import logging
5
  import os
 
8
  import time
9
  import traceback
10
 
 
 
 
 
 
 
11
  import librosa
12
+ import numpy as np
13
+ import soxr
14
  import soundfile as sf
15
+ import torch
16
  from pedalboard import (
17
  Bitcrush,
18
  Chorus,
 
26
  PitchShift,
27
  Reverb,
28
  )
29
+ from typing import TYPE_CHECKING, Unpack
30
 
 
 
31
  import lazy_loader as lazy
32
 
33
+ from lib.rvc.config import Config
34
+ from lib.rvc.pipeline import Pipeline as VC
35
+ from lib.rvc.algorithm.synthesizers import Synthesizer
36
+ from lib.rvc.tools.split_audio import merge_audio, process_audio
37
+ from lib.rvc.utils import load_audio_infer, load_embedding
 
 
38
 
39
  if TYPE_CHECKING:
40
  import noisereduce as nr
41
  else:
42
  nr = lazy.load("noisereduce")
43
 
44
+ now_dir = pathlib.Path.cwd()
45
+ sys.path.append(str(now_dir))
46
+
 
47
  logger = logging.getLogger(__name__)
48
 
49
+ # Type alias for F0 method
50
+ F0Method = str
51
+
52
 
53
  class VoiceConverter:
54
  """
 
59
  """
60
  Initializes the VoiceConverter with default configuration, and sets up models and parameters.
61
  """
62
+ self.config = Config()
63
+ self.hubert_model = None
64
+ self.last_embedder_model = None
65
+ self.tgt_sr = None
66
+ self.net_g = None
67
+ self.vc = None
68
+ self.cpt = None
69
+ self.version = None
70
+ self.n_spk = None
71
+ self.use_f0 = None
 
 
72
  self.loaded_model = None
73
 
74
  def load_hubert(self, embedder_model: str, embedder_model_custom: str = None):
 
83
  self.hubert_model = load_embedding(embedder_model, embedder_model_custom)
84
  self.hubert_model = self.hubert_model.to(self.config.device).float()
85
  self.hubert_model.eval()
 
86
  for param in self.hubert_model.parameters():
87
  param.requires_grad_(False)
 
88
  if hasattr(torch, "compile"):
89
  try:
90
  self.hubert_model = torch.compile(
91
  self.hubert_model, mode="reduce-overhead", fullgraph=False
92
  )
93
  logger.info("HuBERT compiled with torch.compile (reduce-overhead)")
94
+ except Exception as e:
95
+ logger.info("torch.compile skipped: %s", e)
96
 
97
  @staticmethod
98
  def remove_audio_noise(data, sr, reduction_strength=0.7):
 
106
 
107
  """
108
  try:
 
109
  reduced_noise = nr.reduce_noise(
110
  y=data,
111
  sr=sr,
 
132
  print(f"Saving audio as {output_format}...")
133
  audio, sample_rate = librosa.load(input_path, sr=None)
134
  common_sample_rates = [
135
+ 8000, 11025, 12000, 16000, 22050, 24000, 32000, 44100, 48000,
 
 
 
 
 
 
 
 
136
  ]
137
  target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
138
  audio = librosa.resample(
 
142
  res_type="soxr_vhq",
143
  )
144
  sf.write(output_path, audio, target_sr, format=output_format.lower())
145
+ return output_path
146
  except Exception as error:
147
  print(f"An error occurred converting the audio format: {error}")
148
 
 
235
  sid: int = 0,
236
  proposed_pitch: bool = False,
237
  proposed_pitch_threshold: float = 155.0,
238
+ **kwargs,
239
  ):
240
  """
241
  Performs voice conversion on the input audio.
 
311
  sid=sid,
312
  audio=c,
313
  pitch=pitch,
314
+ f0_method=f0_method or "rmvpe",
315
  file_index=file_index,
316
  index_rate=index_rate,
317
  pitch_guidance=self.use_f0,
 
373
  elapsed_time,
374
  )
375
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  def get_vc(self, weight_root, sid):
377
  """
378
  Loads the voice conversion model and sets up the pipeline.
 
426
  try:
427
  self.cpt = torch.load(weight_root, map_location="cpu", weights_only=False)
428
  except Exception:
 
429
  import pickle
 
430
  try:
431
  with open(weight_root, "rb") as f:
432
  self.cpt = pickle.load(f)
 
456
  self.net_g.load_state_dict(self.cpt["weight"], strict=False)
457
  self.net_g = self.net_g.to(self.config.device).float()
458
  self.net_g.eval()
 
459
  for param in self.net_g.parameters():
460
  param.requires_grad_(False)
 
461
  if hasattr(torch, "compile"):
462
  try:
463
  self.net_g = torch.compile(
464
  self.net_g, mode="reduce-overhead", fullgraph=False
465
  )
466
  logger.info("Synthesizer compiled with torch.compile")
467
+ except Exception as e:
468
+ logger.info("torch.compile skipped for net_g: %s", e)
469
 
470
  def setup_vc_instance(self):
471
  """
{ultimate_rvc/rvc/infer β†’ lib/rvc}/pipeline.py RENAMED
@@ -1,23 +1,19 @@
 
1
  import pathlib
2
  import sys
3
 
4
- import numpy as np
5
- from scipy import signal
6
-
7
  import faiss
 
 
8
  import torch
9
  import torch.nn.functional as F
10
-
11
- import librosa
12
 
13
  now_dir = pathlib.Path.cwd()
14
  sys.path.append(str(now_dir))
15
 
16
- import logging
17
-
18
- from ultimate_rvc.rvc.lib.predictors.f0 import CREPE, FCPE, RMVPE
19
 
20
- # logging.getLogger("faiss").setLevel(logging.WARNING)
21
  logger = logging.getLogger(__name__)
22
 
23
  # Constants for high-pass filter
 
1
+ import logging
2
  import pathlib
3
  import sys
4
 
 
 
 
5
  import faiss
6
+ import librosa
7
+ import numpy as np
8
  import torch
9
  import torch.nn.functional as F
10
+ from scipy import signal
 
11
 
12
  now_dir = pathlib.Path.cwd()
13
  sys.path.append(str(now_dir))
14
 
15
+ from lib.rvc.predictors.f0 import CREPE, FCPE, RMVPE
 
 
16
 
 
17
  logger = logging.getLogger(__name__)
18
 
19
  # Constants for high-pass filter
{ultimate_rvc/rvc/lib β†’ lib/rvc}/predictors/F0Extractor.py RENAMED
@@ -12,11 +12,11 @@ import torchcrepe
12
 
13
  import librosa
14
 
15
- from ultimate_rvc.common import RVC_MODELS_DIR
16
- from ultimate_rvc.rvc.configs.config import Config
17
 
18
  # from tools.anyf0.rmvpe import RMVPE
19
- from ultimate_rvc.rvc.lib.predictors.RMVPE import RMVPE0Predictor
20
 
21
  config = Config()
22
 
 
12
 
13
  import librosa
14
 
15
+ from lib.rvc.common import RVC_MODELS_DIR
16
+ from lib.rvc.config import Config
17
 
18
  # from tools.anyf0.rmvpe import RMVPE
19
+ from lib.rvc.predictors.RMVPE import RMVPE0Predictor
20
 
21
  config = Config()
22
 
{ultimate_rvc/rvc/lib β†’ lib/rvc}/predictors/FCPE.py RENAMED
File without changes
{ultimate_rvc/rvc/lib β†’ lib/rvc}/predictors/RMVPE.py RENAMED
File without changes
lib/rvc/predictors/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ """F0 pitch prediction modules."""
2
+ from __future__ import annotations
{ultimate_rvc/rvc/lib β†’ lib/rvc}/predictors/f0.py RENAMED
@@ -5,8 +5,8 @@ from torchfcpe import spawn_infer_model_from_pt
5
  import torch
6
  import torchcrepe
7
 
8
- from ultimate_rvc.common import RVC_MODELS_DIR
9
- from ultimate_rvc.rvc.lib.predictors.RMVPE import RMVPE0Predictor
10
 
11
 
12
  class RMVPE:
 
5
  import torch
6
  import torchcrepe
7
 
8
+ from lib.rvc.common import RVC_MODELS_DIR
9
+ from lib.rvc.predictors.RMVPE import RMVPE0Predictor
10
 
11
 
12
  class RMVPE:
lib/rvc/tools/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ """Audio processing tools."""
2
+ from __future__ import annotations
{ultimate_rvc/rvc/lib β†’ lib/rvc}/tools/split_audio.py RENAMED
File without changes
{ultimate_rvc/rvc/lib β†’ lib/rvc}/utils.py RENAMED
@@ -18,7 +18,7 @@ from transformers import HubertModel
18
  import librosa
19
  import soundfile as sf
20
 
21
- from ultimate_rvc.common import RVC_MODELS_DIR
22
 
23
  # Remove this to see warnings about transformers models
24
  warnings.filterwarnings("ignore")
 
18
  import librosa
19
  import soundfile as sf
20
 
21
+ from lib.rvc.common import RVC_MODELS_DIR
22
 
23
  # Remove this to see warnings about transformers models
24
  warnings.filterwarnings("ignore")
ultimate_rvc/__init__.py DELETED
File without changes
ultimate_rvc/common.py DELETED
@@ -1,37 +0,0 @@
1
- """Common variables used in the Ultimate RVC project."""
2
-
3
- from __future__ import annotations
4
-
5
- import os
6
- import sys
7
- from pathlib import Path
8
-
9
- BASE_DIR = Path.cwd()
10
- VENV_DIR = Path(sys.prefix)
11
- MODELS_DIR = Path(os.getenv("URVC_MODELS_DIR") or BASE_DIR / "models")
12
- RVC_MODELS_DIR = MODELS_DIR / "rvc"
13
- VOICE_MODELS_DIR = Path(
14
- os.getenv("URVC_VOICE_MODELS_DIR") or RVC_MODELS_DIR / "voice_models",
15
- )
16
- EMBEDDER_MODELS_DIR = RVC_MODELS_DIR / "embedders"
17
- CUSTOM_EMBEDDER_MODELS_DIR = EMBEDDER_MODELS_DIR / "custom"
18
- PRETRAINED_MODELS_DIR = RVC_MODELS_DIR / "pretraineds"
19
- CUSTOM_PRETRAINED_MODELS_DIR = PRETRAINED_MODELS_DIR / "custom"
20
-
21
- SEPARATOR_MODELS_DIR = MODELS_DIR / "audio_separator"
22
- TRAINING_MODELS_DIR = RVC_MODELS_DIR / "training"
23
- AUDIO_DIR = Path(os.getenv("URVC_AUDIO_DIR") or BASE_DIR / "audio")
24
- TEMP_DIR = Path(os.getenv("URVC_TEMP_DIR") or BASE_DIR / "temp")
25
- CONFIG_DIR = Path(os.getenv("URVC_CONFIG_DIR") or BASE_DIR / "config")
26
- NODE_PATH = Path(
27
- (
28
- os.getenv("GRADIO_NODE_PATH")
29
- or (
30
- VENV_DIR
31
- / f"lib/python{sys.version_info.major}.{sys.version_info.minor}"
32
- / "site-packages/nodejs_wheel/bin/node"
33
- )
34
- if sys.platform == "linux"
35
- else VENV_DIR / "Lib/site-packages/nodejs_wheel/node.exe"
36
- ),
37
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ultimate_rvc/rvc/__init__.py DELETED
@@ -1,4 +0,0 @@
1
- """
2
- The rvc package is a collection of tools for voice cloning using the RVC
3
- method.
4
- """
 
 
 
 
 
ultimate_rvc/rvc/configs/__init__.py DELETED
File without changes
ultimate_rvc/rvc/infer/__init__.py DELETED
File without changes
ultimate_rvc/rvc/infer/typing_extra.py DELETED
@@ -1,57 +0,0 @@
1
- """Extra type definitions for the `ultimate_rvc.rvc.infer` package."""
2
-
3
- from typing import TypedDict
4
-
5
-
6
- class ConvertAudioKwArgs(TypedDict, total=False):
7
- """Keyword arguments for the `convert_audio` function."""
8
-
9
- # pre-processing arguments
10
- formant_shifting: bool
11
- formant_qfrency: float
12
- formant_timbre: float
13
- # reverb post-processing arguments
14
- reverb: bool
15
- reverb_room_size: float
16
- reverb_damping: float
17
- reverb_wet_level: float
18
- reverb_dry_level: float
19
- reverb_width: float
20
- reverb_freeze_mode: int
21
- # pitch shift post-processing arguments
22
- pitch_shift: bool
23
- pitch_shift_semitones: int
24
- # limiter post-processing arguments
25
- limiter: bool
26
- limiter_threshold: float
27
- limiter_release: float
28
- # gain post-processing arguments
29
- gain: bool
30
- gain_db: int
31
- # distortion post-processing arguments
32
- distortion: bool
33
- distortion_gain: int
34
- # chorus post-processing arguments
35
- chorus: bool
36
- chorus_rate: float
37
- chorus_depth: float
38
- chorus_delay: int
39
- chorus_feedback: float
40
- chorus_mix: float
41
- # bitcrush post-processing arguments
42
- bitcrush: bool
43
- bitcrush_bit_depth: int
44
- # clipping post-processing arguments
45
- clipping: bool
46
- clipping_threshold: int
47
- # compressor post-processing arguments
48
- compressor: bool
49
- compressor_threshold: int
50
- compressor_ratio: int
51
- compressor_attack: float
52
- compressor_release: int
53
- # delay post-processing arguments
54
- delay: bool
55
- delay_seconds: float
56
- delay_feedback: float
57
- delay_mix: float
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ultimate_rvc/rvc/lib/__init__.py DELETED
File without changes
ultimate_rvc/rvc/lib/algorithm/__init__.py DELETED
File without changes
ultimate_rvc/rvc/lib/algorithm/generators/__init__.py DELETED
File without changes
ultimate_rvc/rvc/lib/predictors/__init__.py DELETED
File without changes
ultimate_rvc/rvc/lib/tools/__init__.py DELETED
File without changes
ultimate_rvc/typing_extra.py DELETED
@@ -1,154 +0,0 @@
1
- """Extra typing for the Ultimate RVC project."""
2
-
3
- from __future__ import annotations
4
-
5
- from collections.abc import Mapping, Sequence
6
- from enum import IntEnum, StrEnum
7
- from os import PathLike
8
-
9
- type StrPath = str | PathLike[str]
10
-
11
- type Json = Mapping[str, Json] | Sequence[Json] | str | int | float | bool | None
12
-
13
-
14
- class SeparationModel(StrEnum):
15
- """Enumeration of audio separation models."""
16
-
17
- UVR_MDX_NET_VOC_FT = "UVR-MDX-NET-Voc_FT.onnx"
18
- UVR_MDX_NET_KARA_2 = "UVR_MDXNET_KARA_2.onnx"
19
- REVERB_HQ_BY_FOXJOY = "Reverb_HQ_By_FoxJoy.onnx"
20
-
21
-
22
- class SegmentSize(IntEnum):
23
- """Enumeration of segment sizes for audio separation."""
24
-
25
- SEG_64 = 64
26
- SEG_128 = 128
27
- SEG_256 = 256
28
- SEG_512 = 512
29
- SEG_1024 = 1024
30
- SEG_2048 = 2048
31
-
32
-
33
- class F0Method(StrEnum):
34
- """Enumeration of pitch extraction methods."""
35
-
36
- RMVPE = "rmvpe"
37
- CREPE = "crepe"
38
- CREPE_TINY = "crepe-tiny"
39
- FCPE = "fcpe"
40
-
41
-
42
- class EmbedderModel(StrEnum):
43
- """Enumeration of audio embedding models."""
44
-
45
- CONTENTVEC = "contentvec"
46
- SPIN = "spin"
47
- SPIN_V2 = "spin-v2"
48
- CHINESE_HUBERT_BASE = "chinese-hubert-base"
49
- JAPANESE_HUBERT_BASE = "japanese-hubert-base"
50
- KOREAN_HUBERT_BASE = "korean-hubert-base"
51
- CUSTOM = "custom"
52
-
53
-
54
- class RVCContentType(StrEnum):
55
- """Enumeration of valid content to convert with RVC."""
56
-
57
- VOCALS = "vocals"
58
- VOICE = "voice"
59
- SPEECH = "speech"
60
- AUDIO = "audio"
61
-
62
-
63
- class SampleRate(IntEnum):
64
- """Enumeration of supported audio sample rates."""
65
-
66
- HZ_16K = 16000
67
- HZ_44K = 44100
68
- HZ_48K = 48000
69
- HZ_96K = 96000
70
- HZ_192K = 192000
71
-
72
-
73
- class AudioExt(StrEnum):
74
- """Enumeration of supported audio file formats."""
75
-
76
- MP3 = "mp3"
77
- WAV = "wav"
78
- FLAC = "flac"
79
- OGG = "ogg"
80
- M4A = "m4a"
81
- AAC = "aac"
82
-
83
-
84
- class DeviceType(StrEnum):
85
- """Enumeration of device types for training voice models."""
86
-
87
- AUTOMATIC = "Automatic"
88
- CPU = "CPU"
89
- GPU = "GPU"
90
-
91
-
92
- class PrecisionType(StrEnum):
93
- """Enumeration of precision types for training voice models."""
94
-
95
- FP32 = "fp32"
96
- FP16 = "fp16"
97
- BF16 = "bf16"
98
-
99
-
100
- class TrainingSampleRate(IntEnum):
101
- """Enumeration of sample rates for training voice models."""
102
-
103
- HZ_32K = 32000
104
- HZ_40K = 40000
105
- HZ_48K = 48000
106
-
107
-
108
- class AudioSplitMethod(StrEnum):
109
- """
110
- Enumeration of methods to use for splitting audio files during
111
- dataset preprocessing.
112
- """
113
-
114
- SKIP = "Skip"
115
- SIMPLE = "Simple"
116
- AUTOMATIC = "Automatic"
117
-
118
-
119
- class AudioNormalizationMode(StrEnum):
120
- """
121
- Enumeration of audio normalization methods during
122
- dataset preprocessing.
123
- """
124
-
125
- NONE = "none"
126
- PRE = "pre"
127
- POST = "post"
128
-
129
-
130
- class Vocoder(StrEnum):
131
- """Enumeration of vocoders for training voice models."""
132
-
133
- HIFI_GAN = "HiFi-GAN"
134
- MRF_HIFI_GAN = "MRF HiFi-GAN"
135
- REFINE_GAN = "RefineGAN"
136
-
137
-
138
- class IndexAlgorithm(StrEnum):
139
- """Enumeration of indexing algorithms for training voice models."""
140
-
141
- AUTO = "Auto"
142
- FAISS = "Faiss"
143
- KMEANS = "KMeans"
144
-
145
-
146
- class PretrainedType(StrEnum):
147
- """
148
- Enumeration of the possible types of pretrained models to finetune
149
- voice models on.
150
- """
151
-
152
- NONE = "None"
153
- DEFAULT = "Default"
154
- CUSTOM = "Custom"