Spaces:
Running
Running
ozipoetra committed on
Commit ·
754f043
1
Parent(s): aac14c4
refactor: create own RVC library from ultimate_rvc
Browse files
- Create lib/rvc/ with essential voice conversion components
- lib/rvc/algorithm/: encoders, generators, synthesizers
- lib/rvc/predictors/: F0 extraction (RMVPE, FCPE, CREPE)
- lib/rvc/tools/: audio splitting utilities
- lib/rvc/converter.py: VoiceConverter class
- lib/rvc/pipeline.py: voice conversion pipeline
- lib/rvc/config.py: device and model configuration
- Move configs (48000.json, 40000.json, 32000.json) to configs/
- Remove ultimate_rvc/ directory (no longer needed)
- Update lib/jobs.py to use new lib.rvc.converter
This creates a self-contained RVC library based on ultimate-rvc,
removing the external dependency and allowing for easier maintenance.
- {ultimate_rvc/rvc/configs → configs}/32000.json +0 -0
- {ultimate_rvc/rvc/configs → configs}/40000.json +0 -0
- {ultimate_rvc/rvc/configs → configs}/48000.json +0 -0
- lib/jobs.py +1 -1
- lib/rvc/__init__.py +16 -0
- lib/rvc/algorithm/__init__.py +2 -0
- {ultimate_rvc/rvc/lib → lib/rvc}/algorithm/attentions.py +1 -1
- {ultimate_rvc/rvc/lib → lib/rvc}/algorithm/commons.py +0 -0
- {ultimate_rvc/rvc/lib → lib/rvc}/algorithm/encoders.py +4 -4
- lib/rvc/algorithm/generators/__init__.py +2 -0
- {ultimate_rvc/rvc/lib → lib/rvc}/algorithm/generators/hifigan.py +2 -2
- {ultimate_rvc/rvc/lib → lib/rvc}/algorithm/generators/hifigan_mrf.py +0 -0
- {ultimate_rvc/rvc/lib → lib/rvc}/algorithm/generators/hifigan_nsf.py +3 -3
- {ultimate_rvc/rvc/lib → lib/rvc}/algorithm/generators/refinegan.py +1 -1
- {ultimate_rvc/rvc/lib → lib/rvc}/algorithm/modules.py +1 -1
- {ultimate_rvc/rvc/lib → lib/rvc}/algorithm/normalization.py +0 -0
- {ultimate_rvc/rvc/lib → lib/rvc}/algorithm/residuals.py +2 -2
- {ultimate_rvc/rvc/lib → lib/rvc}/algorithm/synthesizers.py +7 -7
- {ultimate_rvc → lib}/rvc/common.py +2 -2
- {ultimate_rvc/rvc/configs → lib/rvc}/config.py +1 -1
- ultimate_rvc/rvc/infer/infer.py → lib/rvc/converter.py +35 -122
- {ultimate_rvc/rvc/infer → lib/rvc}/pipeline.py +5 -9
- {ultimate_rvc/rvc/lib → lib/rvc}/predictors/F0Extractor.py +3 -3
- {ultimate_rvc/rvc/lib → lib/rvc}/predictors/FCPE.py +0 -0
- {ultimate_rvc/rvc/lib → lib/rvc}/predictors/RMVPE.py +0 -0
- lib/rvc/predictors/__init__.py +2 -0
- {ultimate_rvc/rvc/lib → lib/rvc}/predictors/f0.py +2 -2
- lib/rvc/tools/__init__.py +2 -0
- {ultimate_rvc/rvc/lib → lib/rvc}/tools/split_audio.py +0 -0
- {ultimate_rvc/rvc/lib → lib/rvc}/utils.py +1 -1
- ultimate_rvc/__init__.py +0 -0
- ultimate_rvc/common.py +0 -37
- ultimate_rvc/rvc/__init__.py +0 -4
- ultimate_rvc/rvc/configs/__init__.py +0 -0
- ultimate_rvc/rvc/infer/__init__.py +0 -0
- ultimate_rvc/rvc/infer/typing_extra.py +0 -57
- ultimate_rvc/rvc/lib/__init__.py +0 -0
- ultimate_rvc/rvc/lib/algorithm/__init__.py +0 -0
- ultimate_rvc/rvc/lib/algorithm/generators/__init__.py +0 -0
- ultimate_rvc/rvc/lib/predictors/__init__.py +0 -0
- ultimate_rvc/rvc/lib/tools/__init__.py +0 -0
- ultimate_rvc/typing_extra.py +0 -154
{ultimate_rvc/rvc/configs → configs}/32000.json
RENAMED
|
File without changes
|
{ultimate_rvc/rvc/configs → configs}/40000.json
RENAMED
|
File without changes
|
{ultimate_rvc/rvc/configs → configs}/48000.json
RENAMED
|
File without changes
|
lib/jobs.py
CHANGED
|
@@ -34,7 +34,7 @@ def get_vc():
|
|
| 34 |
global _vc_instance
|
| 35 |
if _vc_instance is None:
|
| 36 |
logger.info("Loading VoiceConverterβ¦")
|
| 37 |
-
from
|
| 38 |
_vc_instance = VoiceConverter()
|
| 39 |
logger.info("VoiceConverter ready.")
|
| 40 |
return _vc_instance
|
|
|
|
| 34 |
global _vc_instance
|
| 35 |
if _vc_instance is None:
|
| 36 |
logger.info("Loading VoiceConverterβ¦")
|
| 37 |
+
from lib.rvc.converter import VoiceConverter
|
| 38 |
_vc_instance = VoiceConverter()
|
| 39 |
logger.info("VoiceConverter ready.")
|
| 40 |
return _vc_instance
|
lib/rvc/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""RVC Voice Conversion Library.
|
| 2 |
+
|
| 3 |
+
This is a minimal rewrite of the ultimate-rvc library for voice conversion.
|
| 4 |
+
"""
|
| 5 |
+
from __future__ import annotations
|
| 6 |
+
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
# Base directory for RVC resources
|
| 10 |
+
BASE_DIR = Path(__file__).parent.parent.parent
|
| 11 |
+
|
| 12 |
+
# Models directory
|
| 13 |
+
MODELS_DIR = BASE_DIR / "rvc_models"
|
| 14 |
+
|
| 15 |
+
# Configs directory (for model configs)
|
| 16 |
+
CONFIGS_DIR = BASE_DIR / "configs"
|
lib/rvc/algorithm/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Algorithm modules for RVC."""
|
| 2 |
+
from __future__ import annotations
|
{ultimate_rvc/rvc/lib → lib/rvc}/algorithm/attentions.py
RENAMED
|
@@ -2,7 +2,7 @@ import math
|
|
| 2 |
|
| 3 |
import torch
|
| 4 |
|
| 5 |
-
from
|
| 6 |
|
| 7 |
|
| 8 |
class MultiHeadAttention(torch.nn.Module):
|
|
|
|
| 2 |
|
| 3 |
import torch
|
| 4 |
|
| 5 |
+
from lib.rvc.lib.algorithm.commons import convert_pad_shape
|
| 6 |
|
| 7 |
|
| 8 |
class MultiHeadAttention(torch.nn.Module):
|
{ultimate_rvc/rvc/lib → lib/rvc}/algorithm/commons.py
RENAMED
|
File without changes
|
{ultimate_rvc/rvc/lib → lib/rvc}/algorithm/encoders.py
RENAMED
|
@@ -3,10 +3,10 @@ import math
|
|
| 3 |
|
| 4 |
import torch
|
| 5 |
|
| 6 |
-
from
|
| 7 |
-
from
|
| 8 |
-
from
|
| 9 |
-
from
|
| 10 |
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
|
|
|
|
| 3 |
|
| 4 |
import torch
|
| 5 |
|
| 6 |
+
from lib.rvc.lib.algorithm.attentions import FFN, MultiHeadAttention
|
| 7 |
+
from lib.rvc.lib.algorithm.commons import sequence_mask
|
| 8 |
+
from lib.rvc.lib.algorithm.modules import WaveNet
|
| 9 |
+
from lib.rvc.lib.algorithm.normalization import LayerNorm
|
| 10 |
|
| 11 |
logger = logging.getLogger(__name__)
|
| 12 |
|
lib/rvc/algorithm/generators/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Generator modules for RVC vocoders."""
|
| 2 |
+
from __future__ import annotations
|
{ultimate_rvc/rvc/lib → lib/rvc}/algorithm/generators/hifigan.py
RENAMED
|
@@ -6,8 +6,8 @@ import torch
|
|
| 6 |
from torch.nn.utils import remove_weight_norm
|
| 7 |
from torch.nn.utils.parametrizations import weight_norm
|
| 8 |
|
| 9 |
-
from
|
| 10 |
-
from
|
| 11 |
|
| 12 |
|
| 13 |
class HiFiGANGenerator(torch.nn.Module):
|
|
|
|
| 6 |
from torch.nn.utils import remove_weight_norm
|
| 7 |
from torch.nn.utils.parametrizations import weight_norm
|
| 8 |
|
| 9 |
+
from lib.rvc.lib.algorithm.commons import init_weights
|
| 10 |
+
from lib.rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock
|
| 11 |
|
| 12 |
|
| 13 |
class HiFiGANGenerator(torch.nn.Module):
|
{ultimate_rvc/rvc/lib → lib/rvc}/algorithm/generators/hifigan_mrf.py
RENAMED
|
File without changes
|
{ultimate_rvc/rvc/lib → lib/rvc}/algorithm/generators/hifigan_nsf.py
RENAMED
|
@@ -7,9 +7,9 @@ from torch.nn.utils import remove_weight_norm
|
|
| 7 |
from torch.nn.utils.parametrizations import weight_norm
|
| 8 |
from torch.utils.checkpoint import checkpoint
|
| 9 |
|
| 10 |
-
from
|
| 11 |
-
from
|
| 12 |
-
from
|
| 13 |
|
| 14 |
|
| 15 |
class SourceModuleHnNSF(torch.nn.Module):
|
|
|
|
| 7 |
from torch.nn.utils.parametrizations import weight_norm
|
| 8 |
from torch.utils.checkpoint import checkpoint
|
| 9 |
|
| 10 |
+
from lib.rvc.lib.algorithm.commons import init_weights
|
| 11 |
+
from lib.rvc.lib.algorithm.generators.hifigan import SineGenerator
|
| 12 |
+
from lib.rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock
|
| 13 |
|
| 14 |
|
| 15 |
class SourceModuleHnNSF(torch.nn.Module):
|
{ultimate_rvc/rvc/lib → lib/rvc}/algorithm/generators/refinegan.py
RENAMED
|
@@ -8,7 +8,7 @@ from torch.nn.utils import remove_weight_norm
|
|
| 8 |
from torch.nn.utils.parametrizations import weight_norm
|
| 9 |
from torch.utils.checkpoint import checkpoint
|
| 10 |
|
| 11 |
-
from
|
| 12 |
|
| 13 |
|
| 14 |
class ResBlock(nn.Module):
|
|
|
|
| 8 |
from torch.nn.utils.parametrizations import weight_norm
|
| 9 |
from torch.utils.checkpoint import checkpoint
|
| 10 |
|
| 11 |
+
from lib.rvc.lib.algorithm.commons import get_padding, init_weights
|
| 12 |
|
| 13 |
|
| 14 |
class ResBlock(nn.Module):
|
{ultimate_rvc/rvc/lib → lib/rvc}/algorithm/modules.py
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import torch
|
| 2 |
|
| 3 |
-
from
|
| 4 |
|
| 5 |
|
| 6 |
class WaveNet(torch.nn.Module):
|
|
|
|
| 1 |
import torch
|
| 2 |
|
| 3 |
+
from lib.rvc.lib.algorithm.commons import fused_add_tanh_sigmoid_multiply
|
| 4 |
|
| 5 |
|
| 6 |
class WaveNet(torch.nn.Module):
|
{ultimate_rvc/rvc/lib → lib/rvc}/algorithm/normalization.py
RENAMED
|
File without changes
|
{ultimate_rvc/rvc/lib → lib/rvc}/algorithm/residuals.py
RENAMED
|
@@ -6,8 +6,8 @@ import torch
|
|
| 6 |
from torch.nn.utils import remove_weight_norm
|
| 7 |
from torch.nn.utils.parametrizations import weight_norm
|
| 8 |
|
| 9 |
-
from
|
| 10 |
-
from
|
| 11 |
|
| 12 |
LRELU_SLOPE = 0.1
|
| 13 |
|
|
|
|
| 6 |
from torch.nn.utils import remove_weight_norm
|
| 7 |
from torch.nn.utils.parametrizations import weight_norm
|
| 8 |
|
| 9 |
+
from lib.rvc.lib.algorithm.commons import get_padding, init_weights
|
| 10 |
+
from lib.rvc.lib.algorithm.modules import WaveNet
|
| 11 |
|
| 12 |
LRELU_SLOPE = 0.1
|
| 13 |
|
{ultimate_rvc/rvc/lib → lib/rvc}/algorithm/synthesizers.py
RENAMED
|
@@ -4,13 +4,13 @@ import logging
|
|
| 4 |
|
| 5 |
import torch
|
| 6 |
|
| 7 |
-
from
|
| 8 |
-
from
|
| 9 |
-
from
|
| 10 |
-
from
|
| 11 |
-
from
|
| 12 |
-
from
|
| 13 |
-
from
|
| 14 |
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
|
|
|
|
| 4 |
|
| 5 |
import torch
|
| 6 |
|
| 7 |
+
from lib.rvc.lib.algorithm.commons import rand_slice_segments, slice_segments
|
| 8 |
+
from lib.rvc.lib.algorithm.encoders import PosteriorEncoder, TextEncoder
|
| 9 |
+
from lib.rvc.lib.algorithm.generators.hifigan import HiFiGANGenerator
|
| 10 |
+
from lib.rvc.lib.algorithm.generators.hifigan_mrf import HiFiGANMRFGenerator
|
| 11 |
+
from lib.rvc.lib.algorithm.generators.hifigan_nsf import HiFiGANNSFGenerator
|
| 12 |
+
from lib.rvc.lib.algorithm.generators.refinegan import RefineGANGenerator
|
| 13 |
+
from lib.rvc.lib.algorithm.residuals import ResidualCouplingBlock
|
| 14 |
|
| 15 |
logger = logging.getLogger(__name__)
|
| 16 |
|
{ultimate_rvc → lib}/rvc/common.py
RENAMED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
"""Common constants and functions for the RVC package."""
|
| 2 |
-
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
RVC_DIR = Path(__file__).resolve().parent
|
| 8 |
-
RVC_CONFIGS_DIR =
|
|
|
|
| 9 |
RVC_TRAINING_MODELS_DIR = RVC_DIR / "train" / "models"
|
|
|
|
| 1 |
"""Common constants and functions for the RVC package."""
|
|
|
|
| 2 |
from __future__ import annotations
|
| 3 |
|
| 4 |
from pathlib import Path
|
| 5 |
|
| 6 |
RVC_DIR = Path(__file__).resolve().parent
|
| 7 |
+
RVC_CONFIGS_DIR = Path(__file__).resolve().parent.parent.parent / "configs"
|
| 8 |
+
RVC_MODELS_DIR = Path(__file__).resolve().parent.parent.parent / "rvc_models"
|
| 9 |
RVC_TRAINING_MODELS_DIR = RVC_DIR / "train" / "models"
|
{ultimate_rvc/rvc/configs → lib/rvc}/config.py
RENAMED
|
@@ -4,7 +4,7 @@ import pathlib
|
|
| 4 |
|
| 5 |
import torch
|
| 6 |
|
| 7 |
-
from
|
| 8 |
|
| 9 |
version_config_paths = [
|
| 10 |
os.path.join("48000.json"),
|
|
|
|
| 4 |
|
| 5 |
import torch
|
| 6 |
|
| 7 |
+
from lib.rvc.common import RVC_CONFIGS_DIR
|
| 8 |
|
| 9 |
version_config_paths = [
|
| 10 |
os.path.join("48000.json"),
|
ultimate_rvc/rvc/infer/infer.py → lib/rvc/converter.py
RENAMED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
-
|
|
|
|
| 2 |
|
| 3 |
import logging
|
| 4 |
import os
|
|
@@ -7,14 +8,11 @@ import sys
|
|
| 7 |
import time
|
| 8 |
import traceback
|
| 9 |
|
| 10 |
-
import soxr
|
| 11 |
-
|
| 12 |
-
import numpy as np
|
| 13 |
-
|
| 14 |
-
import torch
|
| 15 |
-
|
| 16 |
import librosa
|
|
|
|
|
|
|
| 17 |
import soundfile as sf
|
|
|
|
| 18 |
from pedalboard import (
|
| 19 |
Bitcrush,
|
| 20 |
Chorus,
|
|
@@ -28,30 +26,29 @@ from pedalboard import (
|
|
| 28 |
PitchShift,
|
| 29 |
Reverb,
|
| 30 |
)
|
|
|
|
| 31 |
|
| 32 |
-
now_dir = pathlib.Path.cwd()
|
| 33 |
-
sys.path.append(str(now_dir))
|
| 34 |
import lazy_loader as lazy
|
| 35 |
|
| 36 |
-
from
|
| 37 |
-
from
|
| 38 |
-
from
|
| 39 |
-
from
|
| 40 |
-
from
|
| 41 |
-
from ultimate_rvc.rvc.lib.utils import load_audio_infer, load_embedding
|
| 42 |
-
from ultimate_rvc.typing_extra import F0Method
|
| 43 |
|
| 44 |
if TYPE_CHECKING:
|
| 45 |
import noisereduce as nr
|
| 46 |
else:
|
| 47 |
nr = lazy.load("noisereduce")
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
# logging.getLogger("faiss.loader").setLevel(logging.WARNING)
|
| 53 |
logger = logging.getLogger(__name__)
|
| 54 |
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
class VoiceConverter:
|
| 57 |
"""
|
|
@@ -62,18 +59,16 @@ class VoiceConverter:
|
|
| 62 |
"""
|
| 63 |
Initializes the VoiceConverter with default configuration, and sets up models and parameters.
|
| 64 |
"""
|
| 65 |
-
self.config = Config()
|
| 66 |
-
self.hubert_model =
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
self.
|
| 70 |
-
self.
|
| 71 |
-
self.
|
| 72 |
-
self.
|
| 73 |
-
self.
|
| 74 |
-
self.
|
| 75 |
-
self.n_spk = None # Number of speakers in the model
|
| 76 |
-
self.use_f0 = None # Whether the model uses F0
|
| 77 |
self.loaded_model = None
|
| 78 |
|
| 79 |
def load_hubert(self, embedder_model: str, embedder_model_custom: str = None):
|
|
@@ -88,18 +83,16 @@ class VoiceConverter:
|
|
| 88 |
self.hubert_model = load_embedding(embedder_model, embedder_model_custom)
|
| 89 |
self.hubert_model = self.hubert_model.to(self.config.device).float()
|
| 90 |
self.hubert_model.eval()
|
| 91 |
-
# Disable gradient tracking permanently for inference-only model
|
| 92 |
for param in self.hubert_model.parameters():
|
| 93 |
param.requires_grad_(False)
|
| 94 |
-
# Compile with torch.compile if available (torch 2.0+) for CPU kernel fusion
|
| 95 |
if hasattr(torch, "compile"):
|
| 96 |
try:
|
| 97 |
self.hubert_model = torch.compile(
|
| 98 |
self.hubert_model, mode="reduce-overhead", fullgraph=False
|
| 99 |
)
|
| 100 |
logger.info("HuBERT compiled with torch.compile (reduce-overhead)")
|
| 101 |
-
except Exception as
|
| 102 |
-
logger.info("torch.compile skipped: %s",
|
| 103 |
|
| 104 |
@staticmethod
|
| 105 |
def remove_audio_noise(data, sr, reduction_strength=0.7):
|
|
@@ -113,7 +106,6 @@ class VoiceConverter:
|
|
| 113 |
|
| 114 |
"""
|
| 115 |
try:
|
| 116 |
-
|
| 117 |
reduced_noise = nr.reduce_noise(
|
| 118 |
y=data,
|
| 119 |
sr=sr,
|
|
@@ -140,15 +132,7 @@ class VoiceConverter:
|
|
| 140 |
print(f"Saving audio as {output_format}...")
|
| 141 |
audio, sample_rate = librosa.load(input_path, sr=None)
|
| 142 |
common_sample_rates = [
|
| 143 |
-
8000,
|
| 144 |
-
11025,
|
| 145 |
-
12000,
|
| 146 |
-
16000,
|
| 147 |
-
22050,
|
| 148 |
-
24000,
|
| 149 |
-
32000,
|
| 150 |
-
44100,
|
| 151 |
-
48000,
|
| 152 |
]
|
| 153 |
target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
|
| 154 |
audio = librosa.resample(
|
|
@@ -158,7 +142,7 @@ class VoiceConverter:
|
|
| 158 |
res_type="soxr_vhq",
|
| 159 |
)
|
| 160 |
sf.write(output_path, audio, target_sr, format=output_format.lower())
|
| 161 |
-
|
| 162 |
except Exception as error:
|
| 163 |
print(f"An error occurred converting the audio format: {error}")
|
| 164 |
|
|
@@ -251,7 +235,7 @@ class VoiceConverter:
|
|
| 251 |
sid: int = 0,
|
| 252 |
proposed_pitch: bool = False,
|
| 253 |
proposed_pitch_threshold: float = 155.0,
|
| 254 |
-
**kwargs
|
| 255 |
):
|
| 256 |
"""
|
| 257 |
Performs voice conversion on the input audio.
|
|
@@ -327,7 +311,7 @@ class VoiceConverter:
|
|
| 327 |
sid=sid,
|
| 328 |
audio=c,
|
| 329 |
pitch=pitch,
|
| 330 |
-
f0_method=f0_method or
|
| 331 |
file_index=file_index,
|
| 332 |
index_rate=index_rate,
|
| 333 |
pitch_guidance=self.use_f0,
|
|
@@ -389,73 +373,6 @@ class VoiceConverter:
|
|
| 389 |
elapsed_time,
|
| 390 |
)
|
| 391 |
|
| 392 |
-
def convert_audio_batch(
|
| 393 |
-
self,
|
| 394 |
-
audio_input_paths: str,
|
| 395 |
-
audio_output_path: str,
|
| 396 |
-
**kwargs,
|
| 397 |
-
):
|
| 398 |
-
"""
|
| 399 |
-
Performs voice conversion on a batch of input audio files.
|
| 400 |
-
|
| 401 |
-
Args:
|
| 402 |
-
audio_input_paths (str): List of paths to the input audio files.
|
| 403 |
-
audio_output_path (str): Path to the output audio file.
|
| 404 |
-
resample_sr (int, optional): Resample sampling rate. Default is 0.
|
| 405 |
-
sid (int, optional): Speaker ID. Default is 0.
|
| 406 |
-
**kwargs: Additional keyword arguments.
|
| 407 |
-
|
| 408 |
-
"""
|
| 409 |
-
pid = os.getpid()
|
| 410 |
-
try:
|
| 411 |
-
with pathlib.Path(os.path.join(now_dir, "assets", "infer_pid.txt")).open(
|
| 412 |
-
"w",
|
| 413 |
-
) as pid_file:
|
| 414 |
-
pid_file.write(str(pid))
|
| 415 |
-
start_time = time.time()
|
| 416 |
-
print(f"Converting audio batch '{audio_input_paths}'...")
|
| 417 |
-
audio_files = [
|
| 418 |
-
f
|
| 419 |
-
for f in os.listdir(audio_input_paths)
|
| 420 |
-
if f.lower().endswith(
|
| 421 |
-
(
|
| 422 |
-
"wav",
|
| 423 |
-
"mp3",
|
| 424 |
-
"flac",
|
| 425 |
-
"ogg",
|
| 426 |
-
"opus",
|
| 427 |
-
"m4a",
|
| 428 |
-
"mp4",
|
| 429 |
-
"aac",
|
| 430 |
-
"alac",
|
| 431 |
-
"wma",
|
| 432 |
-
"aiff",
|
| 433 |
-
"webm",
|
| 434 |
-
"ac3",
|
| 435 |
-
),
|
| 436 |
-
)
|
| 437 |
-
]
|
| 438 |
-
print(f"Detected {len(audio_files)} audio files for inference.")
|
| 439 |
-
for a in audio_files:
|
| 440 |
-
new_input = os.path.join(audio_input_paths, a)
|
| 441 |
-
new_output = os.path.splitext(a)[0] + "_output.wav"
|
| 442 |
-
new_output = os.path.join(audio_output_path, new_output)
|
| 443 |
-
if pathlib.Path(new_output).exists():
|
| 444 |
-
continue
|
| 445 |
-
self.convert_audio(
|
| 446 |
-
audio_input_path=new_input,
|
| 447 |
-
audio_output_path=new_output,
|
| 448 |
-
**kwargs,
|
| 449 |
-
)
|
| 450 |
-
print(f"Conversion completed at '{audio_input_paths}'.")
|
| 451 |
-
elapsed_time = time.time() - start_time
|
| 452 |
-
print(f"Batch conversion completed in {elapsed_time:.2f} seconds.")
|
| 453 |
-
except Exception as error:
|
| 454 |
-
print(f"An error occurred during audio batch conversion: {error}")
|
| 455 |
-
print(traceback.format_exc())
|
| 456 |
-
finally:
|
| 457 |
-
pathlib.Path(os.path.join(now_dir, "assets", "infer_pid.txt")).unlink()
|
| 458 |
-
|
| 459 |
def get_vc(self, weight_root, sid):
|
| 460 |
"""
|
| 461 |
Loads the voice conversion model and sets up the pipeline.
|
|
@@ -509,9 +426,7 @@ class VoiceConverter:
|
|
| 509 |
try:
|
| 510 |
self.cpt = torch.load(weight_root, map_location="cpu", weights_only=False)
|
| 511 |
except Exception:
|
| 512 |
-
# Fallback for models saved with newer pickle protocols (e.g. protocol 83)
|
| 513 |
import pickle
|
| 514 |
-
import io
|
| 515 |
try:
|
| 516 |
with open(weight_root, "rb") as f:
|
| 517 |
self.cpt = pickle.load(f)
|
|
@@ -541,18 +456,16 @@ class VoiceConverter:
|
|
| 541 |
self.net_g.load_state_dict(self.cpt["weight"], strict=False)
|
| 542 |
self.net_g = self.net_g.to(self.config.device).float()
|
| 543 |
self.net_g.eval()
|
| 544 |
-
# Disable gradient tracking for all synthesizer params
|
| 545 |
for param in self.net_g.parameters():
|
| 546 |
param.requires_grad_(False)
|
| 547 |
-
# Compile synthesizer for faster CPU inference
|
| 548 |
if hasattr(torch, "compile"):
|
| 549 |
try:
|
| 550 |
self.net_g = torch.compile(
|
| 551 |
self.net_g, mode="reduce-overhead", fullgraph=False
|
| 552 |
)
|
| 553 |
logger.info("Synthesizer compiled with torch.compile")
|
| 554 |
-
except Exception as
|
| 555 |
-
logger.info("torch.compile skipped for net_g: %s",
|
| 556 |
|
| 557 |
def setup_vc_instance(self):
|
| 558 |
"""
|
|
|
|
| 1 |
+
"""Voice Converter for RVC."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
|
| 4 |
import logging
|
| 5 |
import os
|
|
|
|
| 8 |
import time
|
| 9 |
import traceback
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
import librosa
|
| 12 |
+
import numpy as np
|
| 13 |
+
import soxr
|
| 14 |
import soundfile as sf
|
| 15 |
+
import torch
|
| 16 |
from pedalboard import (
|
| 17 |
Bitcrush,
|
| 18 |
Chorus,
|
|
|
|
| 26 |
PitchShift,
|
| 27 |
Reverb,
|
| 28 |
)
|
| 29 |
+
from typing import TYPE_CHECKING, Unpack
|
| 30 |
|
|
|
|
|
|
|
| 31 |
import lazy_loader as lazy
|
| 32 |
|
| 33 |
+
from lib.rvc.config import Config
|
| 34 |
+
from lib.rvc.pipeline import Pipeline as VC
|
| 35 |
+
from lib.rvc.lib.algorithm.synthesizers import Synthesizer
|
| 36 |
+
from lib.rvc.lib.tools.split_audio import merge_audio, process_audio
|
| 37 |
+
from lib.rvc.lib.utils import load_audio_infer, load_embedding
|
|
|
|
|
|
|
| 38 |
|
| 39 |
if TYPE_CHECKING:
|
| 40 |
import noisereduce as nr
|
| 41 |
else:
|
| 42 |
nr = lazy.load("noisereduce")
|
| 43 |
|
| 44 |
+
now_dir = pathlib.Path.cwd()
|
| 45 |
+
sys.path.append(str(now_dir))
|
| 46 |
+
|
|
|
|
| 47 |
logger = logging.getLogger(__name__)
|
| 48 |
|
| 49 |
+
# Type alias for F0 method
|
| 50 |
+
F0Method = str
|
| 51 |
+
|
| 52 |
|
| 53 |
class VoiceConverter:
|
| 54 |
"""
|
|
|
|
| 59 |
"""
|
| 60 |
Initializes the VoiceConverter with default configuration, and sets up models and parameters.
|
| 61 |
"""
|
| 62 |
+
self.config = Config()
|
| 63 |
+
self.hubert_model = None
|
| 64 |
+
self.last_embedder_model = None
|
| 65 |
+
self.tgt_sr = None
|
| 66 |
+
self.net_g = None
|
| 67 |
+
self.vc = None
|
| 68 |
+
self.cpt = None
|
| 69 |
+
self.version = None
|
| 70 |
+
self.n_spk = None
|
| 71 |
+
self.use_f0 = None
|
|
|
|
|
|
|
| 72 |
self.loaded_model = None
|
| 73 |
|
| 74 |
def load_hubert(self, embedder_model: str, embedder_model_custom: str = None):
|
|
|
|
| 83 |
self.hubert_model = load_embedding(embedder_model, embedder_model_custom)
|
| 84 |
self.hubert_model = self.hubert_model.to(self.config.device).float()
|
| 85 |
self.hubert_model.eval()
|
|
|
|
| 86 |
for param in self.hubert_model.parameters():
|
| 87 |
param.requires_grad_(False)
|
|
|
|
| 88 |
if hasattr(torch, "compile"):
|
| 89 |
try:
|
| 90 |
self.hubert_model = torch.compile(
|
| 91 |
self.hubert_model, mode="reduce-overhead", fullgraph=False
|
| 92 |
)
|
| 93 |
logger.info("HuBERT compiled with torch.compile (reduce-overhead)")
|
| 94 |
+
except Exception as e:
|
| 95 |
+
logger.info("torch.compile skipped: %s", e)
|
| 96 |
|
| 97 |
@staticmethod
|
| 98 |
def remove_audio_noise(data, sr, reduction_strength=0.7):
|
|
|
|
| 106 |
|
| 107 |
"""
|
| 108 |
try:
|
|
|
|
| 109 |
reduced_noise = nr.reduce_noise(
|
| 110 |
y=data,
|
| 111 |
sr=sr,
|
|
|
|
| 132 |
print(f"Saving audio as {output_format}...")
|
| 133 |
audio, sample_rate = librosa.load(input_path, sr=None)
|
| 134 |
common_sample_rates = [
|
| 135 |
+
8000, 11025, 12000, 16000, 22050, 24000, 32000, 44100, 48000,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
]
|
| 137 |
target_sr = min(common_sample_rates, key=lambda x: abs(x - sample_rate))
|
| 138 |
audio = librosa.resample(
|
|
|
|
| 142 |
res_type="soxr_vhq",
|
| 143 |
)
|
| 144 |
sf.write(output_path, audio, target_sr, format=output_format.lower())
|
| 145 |
+
return output_path
|
| 146 |
except Exception as error:
|
| 147 |
print(f"An error occurred converting the audio format: {error}")
|
| 148 |
|
|
|
|
| 235 |
sid: int = 0,
|
| 236 |
proposed_pitch: bool = False,
|
| 237 |
proposed_pitch_threshold: float = 155.0,
|
| 238 |
+
**kwargs,
|
| 239 |
):
|
| 240 |
"""
|
| 241 |
Performs voice conversion on the input audio.
|
|
|
|
| 311 |
sid=sid,
|
| 312 |
audio=c,
|
| 313 |
pitch=pitch,
|
| 314 |
+
f0_method=f0_method or "rmvpe",
|
| 315 |
file_index=file_index,
|
| 316 |
index_rate=index_rate,
|
| 317 |
pitch_guidance=self.use_f0,
|
|
|
|
| 373 |
elapsed_time,
|
| 374 |
)
|
| 375 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
def get_vc(self, weight_root, sid):
|
| 377 |
"""
|
| 378 |
Loads the voice conversion model and sets up the pipeline.
|
|
|
|
| 426 |
try:
|
| 427 |
self.cpt = torch.load(weight_root, map_location="cpu", weights_only=False)
|
| 428 |
except Exception:
|
|
|
|
| 429 |
import pickle
|
|
|
|
| 430 |
try:
|
| 431 |
with open(weight_root, "rb") as f:
|
| 432 |
self.cpt = pickle.load(f)
|
|
|
|
| 456 |
self.net_g.load_state_dict(self.cpt["weight"], strict=False)
|
| 457 |
self.net_g = self.net_g.to(self.config.device).float()
|
| 458 |
self.net_g.eval()
|
|
|
|
| 459 |
for param in self.net_g.parameters():
|
| 460 |
param.requires_grad_(False)
|
|
|
|
| 461 |
if hasattr(torch, "compile"):
|
| 462 |
try:
|
| 463 |
self.net_g = torch.compile(
|
| 464 |
self.net_g, mode="reduce-overhead", fullgraph=False
|
| 465 |
)
|
| 466 |
logger.info("Synthesizer compiled with torch.compile")
|
| 467 |
+
except Exception as e:
|
| 468 |
+
logger.info("torch.compile skipped for net_g: %s", e)
|
| 469 |
|
| 470 |
def setup_vc_instance(self):
|
| 471 |
"""
|
{ultimate_rvc/rvc/infer → lib/rvc}/pipeline.py
RENAMED
|
@@ -1,23 +1,19 @@
|
|
|
|
|
| 1 |
import pathlib
|
| 2 |
import sys
|
| 3 |
|
| 4 |
-
import numpy as np
|
| 5 |
-
from scipy import signal
|
| 6 |
-
|
| 7 |
import faiss
|
|
|
|
|
|
|
| 8 |
import torch
|
| 9 |
import torch.nn.functional as F
|
| 10 |
-
|
| 11 |
-
import librosa
|
| 12 |
|
| 13 |
now_dir = pathlib.Path.cwd()
|
| 14 |
sys.path.append(str(now_dir))
|
| 15 |
|
| 16 |
-
import
|
| 17 |
-
|
| 18 |
-
from ultimate_rvc.rvc.lib.predictors.f0 import CREPE, FCPE, RMVPE
|
| 19 |
|
| 20 |
-
# logging.getLogger("faiss").setLevel(logging.WARNING)
|
| 21 |
logger = logging.getLogger(__name__)
|
| 22 |
|
| 23 |
# Constants for high-pass filter
|
|
|
|
| 1 |
+
import logging
|
| 2 |
import pathlib
|
| 3 |
import sys
|
| 4 |
|
|
|
|
|
|
|
|
|
|
| 5 |
import faiss
|
| 6 |
+
import librosa
|
| 7 |
+
import numpy as np
|
| 8 |
import torch
|
| 9 |
import torch.nn.functional as F
|
| 10 |
+
from scipy import signal
|
|
|
|
| 11 |
|
| 12 |
now_dir = pathlib.Path.cwd()
|
| 13 |
sys.path.append(str(now_dir))
|
| 14 |
|
| 15 |
+
from lib.rvc.predictors.f0 import CREPE, FCPE, RMVPE
|
|
|
|
|
|
|
| 16 |
|
|
|
|
| 17 |
logger = logging.getLogger(__name__)
|
| 18 |
|
| 19 |
# Constants for high-pass filter
|
{ultimate_rvc/rvc/lib → lib/rvc}/predictors/F0Extractor.py
RENAMED
|
@@ -12,11 +12,11 @@ import torchcrepe
|
|
| 12 |
|
| 13 |
import librosa
|
| 14 |
|
| 15 |
-
from
|
| 16 |
-
from
|
| 17 |
|
| 18 |
# from tools.anyf0.rmvpe import RMVPE
|
| 19 |
-
from
|
| 20 |
|
| 21 |
config = Config()
|
| 22 |
|
|
|
|
| 12 |
|
| 13 |
import librosa
|
| 14 |
|
| 15 |
+
from lib.rvc.common import RVC_MODELS_DIR
|
| 16 |
+
from lib.rvc.configs.config import Config
|
| 17 |
|
| 18 |
# from tools.anyf0.rmvpe import RMVPE
|
| 19 |
+
from lib.rvc.lib.predictors.RMVPE import RMVPE0Predictor
|
| 20 |
|
| 21 |
config = Config()
|
| 22 |
|
{ultimate_rvc/rvc/lib → lib/rvc}/predictors/FCPE.py
RENAMED
|
File without changes
|
{ultimate_rvc/rvc/lib → lib/rvc}/predictors/RMVPE.py
RENAMED
|
File without changes
|
lib/rvc/predictors/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""F0 pitch prediction modules."""
|
| 2 |
+
from __future__ import annotations
|
{ultimate_rvc/rvc/lib → lib/rvc}/predictors/f0.py
RENAMED
|
@@ -5,8 +5,8 @@ from torchfcpe import spawn_infer_model_from_pt
|
|
| 5 |
import torch
|
| 6 |
import torchcrepe
|
| 7 |
|
| 8 |
-
from
|
| 9 |
-
from
|
| 10 |
|
| 11 |
|
| 12 |
class RMVPE:
|
|
|
|
| 5 |
import torch
|
| 6 |
import torchcrepe
|
| 7 |
|
| 8 |
+
from lib.rvc.common import RVC_MODELS_DIR
|
| 9 |
+
from lib.rvc.lib.predictors.RMVPE import RMVPE0Predictor
|
| 10 |
|
| 11 |
|
| 12 |
class RMVPE:
|
lib/rvc/tools/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Audio processing tools."""
|
| 2 |
+
from __future__ import annotations
|
{ultimate_rvc/rvc/lib → lib/rvc}/tools/split_audio.py
RENAMED
|
File without changes
|
{ultimate_rvc/rvc/lib → lib/rvc}/utils.py
RENAMED
|
@@ -18,7 +18,7 @@ from transformers import HubertModel
|
|
| 18 |
import librosa
|
| 19 |
import soundfile as sf
|
| 20 |
|
| 21 |
-
from
|
| 22 |
|
| 23 |
# Remove this to see warnings about transformers models
|
| 24 |
warnings.filterwarnings("ignore")
|
|
|
|
| 18 |
import librosa
|
| 19 |
import soundfile as sf
|
| 20 |
|
| 21 |
+
from lib.rvc.common import RVC_MODELS_DIR
|
| 22 |
|
| 23 |
# Remove this to see warnings about transformers models
|
| 24 |
warnings.filterwarnings("ignore")
|
ultimate_rvc/__init__.py
DELETED
|
File without changes
|
ultimate_rvc/common.py
DELETED
|
@@ -1,37 +0,0 @@
|
|
| 1 |
-
"""Common variables used in the Ultimate RVC project."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import os
|
| 6 |
-
import sys
|
| 7 |
-
from pathlib import Path
|
| 8 |
-
|
| 9 |
-
BASE_DIR = Path.cwd()
|
| 10 |
-
VENV_DIR = Path(sys.prefix)
|
| 11 |
-
MODELS_DIR = Path(os.getenv("URVC_MODELS_DIR") or BASE_DIR / "models")
|
| 12 |
-
RVC_MODELS_DIR = MODELS_DIR / "rvc"
|
| 13 |
-
VOICE_MODELS_DIR = Path(
|
| 14 |
-
os.getenv("URVC_VOICE_MODELS_DIR") or RVC_MODELS_DIR / "voice_models",
|
| 15 |
-
)
|
| 16 |
-
EMBEDDER_MODELS_DIR = RVC_MODELS_DIR / "embedders"
|
| 17 |
-
CUSTOM_EMBEDDER_MODELS_DIR = EMBEDDER_MODELS_DIR / "custom"
|
| 18 |
-
PRETRAINED_MODELS_DIR = RVC_MODELS_DIR / "pretraineds"
|
| 19 |
-
CUSTOM_PRETRAINED_MODELS_DIR = PRETRAINED_MODELS_DIR / "custom"
|
| 20 |
-
|
| 21 |
-
SEPARATOR_MODELS_DIR = MODELS_DIR / "audio_separator"
|
| 22 |
-
TRAINING_MODELS_DIR = RVC_MODELS_DIR / "training"
|
| 23 |
-
AUDIO_DIR = Path(os.getenv("URVC_AUDIO_DIR") or BASE_DIR / "audio")
|
| 24 |
-
TEMP_DIR = Path(os.getenv("URVC_TEMP_DIR") or BASE_DIR / "temp")
|
| 25 |
-
CONFIG_DIR = Path(os.getenv("URVC_CONFIG_DIR") or BASE_DIR / "config")
|
| 26 |
-
NODE_PATH = Path(
|
| 27 |
-
(
|
| 28 |
-
os.getenv("GRADIO_NODE_PATH")
|
| 29 |
-
or (
|
| 30 |
-
VENV_DIR
|
| 31 |
-
/ f"lib/python{sys.version_info.major}.{sys.version_info.minor}"
|
| 32 |
-
/ "site-packages/nodejs_wheel/bin/node"
|
| 33 |
-
)
|
| 34 |
-
if sys.platform == "linux"
|
| 35 |
-
else VENV_DIR / "Lib/site-packages/nodejs_wheel/node.exe"
|
| 36 |
-
),
|
| 37 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ultimate_rvc/rvc/__init__.py
DELETED
|
@@ -1,4 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
The rvc package is a collection of tools for voice cloning using the RVC
|
| 3 |
-
method.
|
| 4 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ultimate_rvc/rvc/configs/__init__.py
DELETED
|
File without changes
|
ultimate_rvc/rvc/infer/__init__.py
DELETED
|
File without changes
|
ultimate_rvc/rvc/infer/typing_extra.py
DELETED
|
@@ -1,57 +0,0 @@
|
|
| 1 |
-
"""Extra type definitions for the `ultimate_rvc.rvc.infer` package."""
|
| 2 |
-
|
| 3 |
-
from typing import TypedDict
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
class ConvertAudioKwArgs(TypedDict, total=False):
|
| 7 |
-
"""Keyword arguments for the `convert_audio` function."""
|
| 8 |
-
|
| 9 |
-
# pre-processing arguments
|
| 10 |
-
formant_shifting: bool
|
| 11 |
-
formant_qfrency: float
|
| 12 |
-
formant_timbre: float
|
| 13 |
-
# reverb post-processing arguments
|
| 14 |
-
reverb: bool
|
| 15 |
-
reverb_room_size: float
|
| 16 |
-
reverb_damping: float
|
| 17 |
-
reverb_wet_level: float
|
| 18 |
-
reverb_dry_level: float
|
| 19 |
-
reverb_width: float
|
| 20 |
-
reverb_freeze_mode: int
|
| 21 |
-
# pitch shift post-processing arguments
|
| 22 |
-
pitch_shift: bool
|
| 23 |
-
pitch_shift_semitones: int
|
| 24 |
-
# limiter post-processing arguments
|
| 25 |
-
limiter: bool
|
| 26 |
-
limiter_threshold: float
|
| 27 |
-
limiter_release: float
|
| 28 |
-
# gain post-processing arguments
|
| 29 |
-
gain: bool
|
| 30 |
-
gain_db: int
|
| 31 |
-
# distortion post-processing arguments
|
| 32 |
-
distortion: bool
|
| 33 |
-
distortion_gain: int
|
| 34 |
-
# chorus post-processing arguments
|
| 35 |
-
chorus: bool
|
| 36 |
-
chorus_rate: float
|
| 37 |
-
chorus_depth: float
|
| 38 |
-
chorus_delay: int
|
| 39 |
-
chorus_feedback: float
|
| 40 |
-
chorus_mix: float
|
| 41 |
-
# bitcrush post-processing arguments
|
| 42 |
-
bitcrush: bool
|
| 43 |
-
bitcrush_bit_depth: int
|
| 44 |
-
# clipping post-processing arguments
|
| 45 |
-
clipping: bool
|
| 46 |
-
clipping_threshold: int
|
| 47 |
-
# compressor post-processing arguments
|
| 48 |
-
compressor: bool
|
| 49 |
-
compressor_threshold: int
|
| 50 |
-
compressor_ratio: int
|
| 51 |
-
compressor_attack: float
|
| 52 |
-
compressor_release: int
|
| 53 |
-
# delay post-processing arguments
|
| 54 |
-
delay: bool
|
| 55 |
-
delay_seconds: float
|
| 56 |
-
delay_feedback: float
|
| 57 |
-
delay_mix: float
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ultimate_rvc/rvc/lib/__init__.py
DELETED
|
File without changes
|
ultimate_rvc/rvc/lib/algorithm/__init__.py
DELETED
|
File without changes
|
ultimate_rvc/rvc/lib/algorithm/generators/__init__.py
DELETED
|
File without changes
|
ultimate_rvc/rvc/lib/predictors/__init__.py
DELETED
|
File without changes
|
ultimate_rvc/rvc/lib/tools/__init__.py
DELETED
|
File without changes
|
ultimate_rvc/typing_extra.py
DELETED
|
@@ -1,154 +0,0 @@
|
|
| 1 |
-
"""Extra typing for the Ultimate RVC project."""
|
| 2 |
-
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
from collections.abc import Mapping, Sequence
|
| 6 |
-
from enum import IntEnum, StrEnum
|
| 7 |
-
from os import PathLike
|
| 8 |
-
|
| 9 |
-
type StrPath = str | PathLike[str]
|
| 10 |
-
|
| 11 |
-
type Json = Mapping[str, Json] | Sequence[Json] | str | int | float | bool | None
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
class SeparationModel(StrEnum):
|
| 15 |
-
"""Enumeration of audio separation models."""
|
| 16 |
-
|
| 17 |
-
UVR_MDX_NET_VOC_FT = "UVR-MDX-NET-Voc_FT.onnx"
|
| 18 |
-
UVR_MDX_NET_KARA_2 = "UVR_MDXNET_KARA_2.onnx"
|
| 19 |
-
REVERB_HQ_BY_FOXJOY = "Reverb_HQ_By_FoxJoy.onnx"
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
class SegmentSize(IntEnum):
|
| 23 |
-
"""Enumeration of segment sizes for audio separation."""
|
| 24 |
-
|
| 25 |
-
SEG_64 = 64
|
| 26 |
-
SEG_128 = 128
|
| 27 |
-
SEG_256 = 256
|
| 28 |
-
SEG_512 = 512
|
| 29 |
-
SEG_1024 = 1024
|
| 30 |
-
SEG_2048 = 2048
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
class F0Method(StrEnum):
|
| 34 |
-
"""Enumeration of pitch extraction methods."""
|
| 35 |
-
|
| 36 |
-
RMVPE = "rmvpe"
|
| 37 |
-
CREPE = "crepe"
|
| 38 |
-
CREPE_TINY = "crepe-tiny"
|
| 39 |
-
FCPE = "fcpe"
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
class EmbedderModel(StrEnum):
|
| 43 |
-
"""Enumeration of audio embedding models."""
|
| 44 |
-
|
| 45 |
-
CONTENTVEC = "contentvec"
|
| 46 |
-
SPIN = "spin"
|
| 47 |
-
SPIN_V2 = "spin-v2"
|
| 48 |
-
CHINESE_HUBERT_BASE = "chinese-hubert-base"
|
| 49 |
-
JAPANESE_HUBERT_BASE = "japanese-hubert-base"
|
| 50 |
-
KOREAN_HUBERT_BASE = "korean-hubert-base"
|
| 51 |
-
CUSTOM = "custom"
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
class RVCContentType(StrEnum):
|
| 55 |
-
"""Enumeration of valid content to convert with RVC."""
|
| 56 |
-
|
| 57 |
-
VOCALS = "vocals"
|
| 58 |
-
VOICE = "voice"
|
| 59 |
-
SPEECH = "speech"
|
| 60 |
-
AUDIO = "audio"
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
class SampleRate(IntEnum):
|
| 64 |
-
"""Enumeration of supported audio sample rates."""
|
| 65 |
-
|
| 66 |
-
HZ_16K = 16000
|
| 67 |
-
HZ_44K = 44100
|
| 68 |
-
HZ_48K = 48000
|
| 69 |
-
HZ_96K = 96000
|
| 70 |
-
HZ_192K = 192000
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
class AudioExt(StrEnum):
|
| 74 |
-
"""Enumeration of supported audio file formats."""
|
| 75 |
-
|
| 76 |
-
MP3 = "mp3"
|
| 77 |
-
WAV = "wav"
|
| 78 |
-
FLAC = "flac"
|
| 79 |
-
OGG = "ogg"
|
| 80 |
-
M4A = "m4a"
|
| 81 |
-
AAC = "aac"
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
class DeviceType(StrEnum):
|
| 85 |
-
"""Enumeration of device types for training voice models."""
|
| 86 |
-
|
| 87 |
-
AUTOMATIC = "Automatic"
|
| 88 |
-
CPU = "CPU"
|
| 89 |
-
GPU = "GPU"
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
class PrecisionType(StrEnum):
|
| 93 |
-
"""Enumeration of precision types for training voice models."""
|
| 94 |
-
|
| 95 |
-
FP32 = "fp32"
|
| 96 |
-
FP16 = "fp16"
|
| 97 |
-
BF16 = "bf16"
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
class TrainingSampleRate(IntEnum):
|
| 101 |
-
"""Enumeration of sample rates for training voice models."""
|
| 102 |
-
|
| 103 |
-
HZ_32K = 32000
|
| 104 |
-
HZ_40K = 40000
|
| 105 |
-
HZ_48K = 48000
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
class AudioSplitMethod(StrEnum):
|
| 109 |
-
"""
|
| 110 |
-
Enumeration of methods to use for splitting audio files during
|
| 111 |
-
dataset preprocessing.
|
| 112 |
-
"""
|
| 113 |
-
|
| 114 |
-
SKIP = "Skip"
|
| 115 |
-
SIMPLE = "Simple"
|
| 116 |
-
AUTOMATIC = "Automatic"
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
class AudioNormalizationMode(StrEnum):
|
| 120 |
-
"""
|
| 121 |
-
Enumeration of audio normalization methods during
|
| 122 |
-
dataset preprocessing.
|
| 123 |
-
"""
|
| 124 |
-
|
| 125 |
-
NONE = "none"
|
| 126 |
-
PRE = "pre"
|
| 127 |
-
POST = "post"
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
class Vocoder(StrEnum):
|
| 131 |
-
"""Enumeration of vocoders for training voice models."""
|
| 132 |
-
|
| 133 |
-
HIFI_GAN = "HiFi-GAN"
|
| 134 |
-
MRF_HIFI_GAN = "MRF HiFi-GAN"
|
| 135 |
-
REFINE_GAN = "RefineGAN"
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
class IndexAlgorithm(StrEnum):
|
| 139 |
-
"""Enumeration of indexing algorithms for training voice models."""
|
| 140 |
-
|
| 141 |
-
AUTO = "Auto"
|
| 142 |
-
FAISS = "Faiss"
|
| 143 |
-
KMEANS = "KMeans"
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
class PretrainedType(StrEnum):
|
| 147 |
-
"""
|
| 148 |
-
Enumeration of the possible types of pretrained models to finetune
|
| 149 |
-
voice models on.
|
| 150 |
-
"""
|
| 151 |
-
|
| 152 |
-
NONE = "None"
|
| 153 |
-
DEFAULT = "Default"
|
| 154 |
-
CUSTOM = "Custom"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|