RVC-CH

Running

File size: 7,171 Bytes

import logging
import os
import pathlib
import re
import sys
import unicodedata
import warnings

import soxr

import wget

import numpy as np

from torch import nn
from transformers import HubertModel

import librosa
import soundfile as sf

from lib.rvc.common import RVC_MODELS_DIR

# Remove this to see warnings about transformers models
warnings.filterwarnings("ignore")

logging.getLogger("fairseq").setLevel(logging.ERROR)
logging.getLogger("faiss.loader").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("torch").setLevel(logging.ERROR)

now_dir = pathlib.Path.cwd()
sys.path.append(str(now_dir))

base_path = os.path.join(str(RVC_MODELS_DIR), "formant", "stftpitchshift")
stft = base_path + ".exe" if sys.platform == "win32" else base_path


class HubertModelWithFinalProj(HubertModel):
    def __init__(self, config):
        super().__init__(config)
        self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)


def load_audio_16k(file):
    # this is used by f0 and feature extractions that load preprocessed 16k files, so there's no need to resample
    try:
        audio, sr = librosa.load(file, sr=16000)
    except Exception as error:
        raise RuntimeError(f"An error occurred loading the audio: {error}")

    return audio.flatten()


def load_audio(file, sample_rate):
    try:
        file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        audio, sr = sf.read(file)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.T)
        if sr != sample_rate:
            audio = librosa.resample(
                audio,
                orig_sr=sr,
                target_sr=sample_rate,
                res_type="soxr_vhq",
            )
    except Exception as error:
        raise RuntimeError(f"An error occurred loading the audio: {error}")

    return audio.flatten()


def load_audio_infer(
    file,
    sample_rate,
    **kwargs,
):
    formant_shifting = kwargs.get("formant_shifting", False)
    try:
        file = file.strip(" ").strip('"').strip("\n").strip('"').strip(" ")
        if not pathlib.Path(file).is_file():
            raise FileNotFoundError(f"File not found: {file}")
        audio, sr = sf.read(file)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.T)
        if sr != sample_rate:
            audio = librosa.resample(
                audio,
                orig_sr=sr,
                target_sr=sample_rate,
                res_type="soxr_vhq",
            )
        if formant_shifting:
            formant_qfrency = kwargs.get("formant_qfrency", 0.8)
            formant_timbre = kwargs.get("formant_timbre", 0.8)

            from stftpitchshift import StftPitchShift

            pitchshifter = StftPitchShift(1024, 32, sample_rate)
            audio = pitchshifter.shiftpitch(
                audio,
                factors=1,
                quefrency=formant_qfrency * 1e-3,
                distortion=formant_timbre,
            )
    except Exception as error:
        raise RuntimeError(f"An error occurred loading the audio: {error}")
    return np.array(audio).flatten()


def format_title(title):
    formatted_title = unicodedata.normalize("NFC", title)
    formatted_title = re.sub(r"[\u2500-\u257F]+", "", formatted_title)
    formatted_title = re.sub(r"[^\w\s.-]", "", formatted_title, flags=re.UNICODE)
    formatted_title = re.sub(r"\s+", "_", formatted_title)
    return formatted_title


def load_embedding(embedder_model, custom_embedder=None):
    embedder_root = os.path.join(str(RVC_MODELS_DIR), "embedders")
    embedding_list = {
        "contentvec": os.path.join(embedder_root, "contentvec"),
        "spin": os.path.join(embedder_root, "spin"),
        "spin-v2": os.path.join(embedder_root, "spin-v2"),
        "chinese-hubert-base": os.path.join(embedder_root, "chinese_hubert_base"),
        "japanese-hubert-base": os.path.join(embedder_root, "japanese_hubert_base"),
        "korean-hubert-base": os.path.join(embedder_root, "korean_hubert_base"),
    }

    online_embedders = {
        "contentvec": (
            "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/contentvec/pytorch_model.bin"
        ),
        "spin": (
            "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/spin/pytorch_model.bin"
        ),
        "spin-v2": (
            "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/spin-v2/pytorch_model.bin"
        ),
        "chinese-hubert-base": (
            "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/chinese_hubert_base/pytorch_model.bin"
        ),
        "japanese-hubert-base": (
            "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/japanese_hubert_base/pytorch_model.bin"
        ),
        "korean-hubert-base": (
            "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/korean_hubert_base/pytorch_model.bin"
        ),
    }

    config_files = {
        "contentvec": (
            "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/contentvec/config.json"
        ),
        "spin": (
            "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/spin/config.json"
        ),
        "spin-v2": (
            "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/spin-v2/config.json"
        ),
        "chinese-hubert-base": (
            "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/chinese_hubert_base/config.json"
        ),
        "japanese-hubert-base": (
            "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/japanese_hubert_base/config.json"
        ),
        "korean-hubert-base": (
            "https://huggingface.co/JackismyShephard/ultimate-rvc/resolve/main/Resources/embedders/korean_hubert_base/config.json"
        ),
    }

    if embedder_model == "custom":
        if pathlib.Path(custom_embedder).exists():
            model_path = custom_embedder
        else:
            print(f"Custom embedder not found: {custom_embedder}, using contentvec")
            model_path = embedding_list["contentvec"]
    else:
        model_path = embedding_list[embedder_model]
        bin_file = os.path.join(model_path, "pytorch_model.bin")
        json_file = os.path.join(model_path, "config.json")
        pathlib.Path(model_path).mkdir(exist_ok=True, parents=True)
        if not pathlib.Path(bin_file).exists():
            url = online_embedders[embedder_model]
            print(f"Downloading {url} to {model_path}...")
            wget.download(url, out=bin_file)
        if not pathlib.Path(json_file).exists():
            url = config_files[embedder_model]
            print(f"Downloading {url} to {model_path}...")
            wget.download(url, out=json_file)

    models = HubertModelWithFinalProj.from_pretrained(model_path)
    return models