diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..302cd10c84613a89a4bcb542b0c6f77e09a7ad8f --- /dev/null +++ b/.gitignore @@ -0,0 +1,186 @@ +.DS_Store + +tmp/ + + +### Generated by gibo (https://github.com/simonwhitaker/gibo) +### https://raw.github.com/github/gitignore/4488915eec0b3a45b5c63ead28f286819c0917de/Global/VisualStudioCode.gitignore + +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + + +### https://raw.github.com/github/gitignore/4488915eec0b3a45b5c63ead28f286819c0917de/Python.gitignore + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +# lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + + diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000000000000000000000000000000000000..5ff56ff9d379acd52176ef9d118209e7a12a566e --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,7 @@ +{ + "python.formatting.provider": "black", + "editor.codeActionsOnSave": { + "source.organizeImports": "explicit" + }, + "editor.formatOnSave": true, +} \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..9ba3f8d92380507312f92d1fd93a5f3e0782b45c --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 ddPn08 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README-ja.md b/README-ja.md new file mode 100644 index 0000000000000000000000000000000000000000..36e2af6e07ae43ed4a15660cc9d10dd072ecabfb --- /dev/null +++ b/README-ja.md @@ -0,0 +1,54 @@ +

+# RVC-WebUI

+
+

+A project that rebuilds [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI)

+
+ +--- + +
+

+ +[日本語](README-ja.md) | [English](README.md) + +

+
+ +
+
+# Launch
+
+## Windows
+Double-click `webui-user.bat` to start the webui.
+
+## Linux or Mac
+Run `webui.sh` to start the webui.
+
+
+```
+Tested environment: Windows 10, Python 3.10.9, torch 2.0.0+cu118
+```
+
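+Extra flags can also be passed to the launcher. `launch.py` (added in this diff) appends whatever is in the `COMMANDLINE_ARGS` environment variable to `sys.argv`, consumes `--skip-install` and `--reinstall-torch` itself, and forwards the remaining arguments to `webui.py`. A minimal sketch, assuming `launch.py` is invoked directly with the project's Python environment (flags accepted by `webui.py` itself are not listed here):
+
+```
+# skip dependency installation on subsequent runs (flag handled by launch.py)
+python launch.py --skip-install
+
+# or supply the same flag through the environment variable read by launch.py
+export COMMANDLINE_ARGS=--skip-install
+python launch.py
+```
+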
+
+# Troubleshooting
+
+## `error: Microsoft Visual C++ 14.0 or greater is required.`
+
+Microsoft C++ Build Tools must be installed.
+
+### Step 1: Download the installer
+[Download](https://visualstudio.microsoft.com/ja/thank-you-downloading-visual-studio/?sku=BuildTools&rel=16)
+
+### Step 2: Install `C++ Build Tools`
+Run the installer and select `C++ Build Tools` in the `Workloads` tab.
+
+ +# クレジット +- [`liujing04/Retrieval-based-Voice-Conversion-WebUI`](https://github.com/liujing04/Retrieval-based-Voice-Conversion-WebUI) +- [`teftef6220/Voice_Separation_and_Selection`](https://github.com/teftef6220/Voice_Separation_and_Selection) diff --git a/bin/.gitignore b/bin/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d6b7ef32c8478a48c3994dcadc86837f4371184d --- /dev/null +++ b/bin/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/configs/32k-768.json b/configs/32k-768.json new file mode 100644 index 0000000000000000000000000000000000000000..8f73dfbeba045f0b4d0036f3ae7bff83809b8795 --- /dev/null +++ b/configs/32k-768.json @@ -0,0 +1,47 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 12800, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sampling_rate": 32000, + "filter_length": 1024, + "hop_length": 320, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,4,2,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "emb_channels": 768, + "spk_embed_dim": 109 + } +} diff --git a/configs/32k.json b/configs/32k.json new file mode 100644 index 0000000000000000000000000000000000000000..14d0765d40e5fe2bdd605044bc998c38e992c1c1 --- /dev/null +++ b/configs/32k.json @@ -0,0 +1,47 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 12800, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sampling_rate": 32000, + "filter_length": 1024, + "hop_length": 320, + "win_length": 1024, + "n_mel_channels": 80, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,4,2,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "emb_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/configs/40k-768.json b/configs/40k-768.json new file mode 100644 index 0000000000000000000000000000000000000000..9bb0684bfcb70992fe79445ee54cf6308b1a17dc --- /dev/null +++ b/configs/40k-768.json @@ -0,0 +1,47 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 12800, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sampling_rate": 40000, + "filter_length": 2048, + "hop_length": 400, + 
"win_length": 2048, + "n_mel_channels": 125, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,10,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "emb_channels": 768, + "spk_embed_dim": 109 + } +} diff --git a/configs/40k.json b/configs/40k.json new file mode 100644 index 0000000000000000000000000000000000000000..4d4f61477d091f8faf1124d531ae8923326eeae9 --- /dev/null +++ b/configs/40k.json @@ -0,0 +1,47 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 12800, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sampling_rate": 40000, + "filter_length": 2048, + "hop_length": 400, + "win_length": 2048, + "n_mel_channels": 125, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,10,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "emb_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/configs/48k-768.json b/configs/48k-768.json new file mode 100644 index 0000000000000000000000000000000000000000..74f069fbe63f3b6271560b00eeb0a1d039c8843e --- /dev/null +++ b/configs/48k-768.json @@ -0,0 +1,47 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 11520, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + "max_wav_value": 32768.0, + "sampling_rate": 48000, + "filter_length": 2048, + "hop_length": 480, + "win_length": 2048, + "n_mel_channels": 128, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,6,2,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "emb_channels": 768, + "spk_embed_dim": 109 + } +} diff --git a/configs/48k.json b/configs/48k.json new file mode 100644 index 0000000000000000000000000000000000000000..59e909f29db658bd50b13f4082bab1a1d27b0390 --- /dev/null +++ b/configs/48k.json @@ -0,0 +1,47 @@ +{ + "train": { + "log_interval": 200, + "seed": 1234, + "epochs": 20000, + "learning_rate": 1e-4, + "betas": [0.8, 0.99], + "eps": 1e-9, + "batch_size": 4, + "fp16_run": true, + "lr_decay": 0.999875, + "segment_size": 11520, + "init_lr_ratio": 1, + "warmup_epochs": 0, + "c_mel": 45, + "c_kl": 1.0 + }, + "data": { + 
"max_wav_value": 32768.0, + "sampling_rate": 48000, + "filter_length": 2048, + "hop_length": 480, + "win_length": 2048, + "n_mel_channels": 128, + "mel_fmin": 0.0, + "mel_fmax": null + }, + "model": { + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3,7,11], + "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]], + "upsample_rates": [10,6,2,2,2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16,16,4,4,4], + "use_spectral_norm": false, + "gin_channels": 256, + "emb_channels": 256, + "spk_embed_dim": 109 + } +} diff --git a/dev.py b/dev.py new file mode 100644 index 0000000000000000000000000000000000000000..0cca7b39ab42195e852178064d2bab4ba014fbd9 --- /dev/null +++ b/dev.py @@ -0,0 +1,3 @@ +import modules.ui as ui + +demo = ui.create_ui() diff --git a/launch.py b/launch.py new file mode 100644 index 0000000000000000000000000000000000000000..f627c6b4f6616bbd2366f0151ffec966f0f90abb --- /dev/null +++ b/launch.py @@ -0,0 +1,139 @@ +import importlib.util +import os +import shlex +import subprocess +import sys + +commandline_args = os.environ.get("COMMANDLINE_ARGS", "") +sys.argv += shlex.split(commandline_args) + +python = sys.executable +git = os.environ.get("GIT", "git") +index_url = os.environ.get("INDEX_URL", "") +stored_commit_hash = None +skip_install = False + + +def run(command, desc=None, errdesc=None, custom_env=None): + if desc is not None: + print(desc) + + result = subprocess.run( + command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + env=os.environ if custom_env is None else custom_env, + ) + + if result.returncode != 0: + message = f"""{errdesc or 'Error running command'}. 
+Command: {command} +Error code: {result.returncode} +stdout: {result.stdout.decode(encoding="utf8", errors="ignore") if len(result.stdout)>0 else ''} +stderr: {result.stderr.decode(encoding="utf8", errors="ignore") if len(result.stderr)>0 else ''} +""" + raise RuntimeError(message) + + return result.stdout.decode(encoding="utf8", errors="ignore") + + +def check_run(command): + result = subprocess.run( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True + ) + return result.returncode == 0 + + +def is_installed(package): + try: + spec = importlib.util.find_spec(package) + except ModuleNotFoundError: + return False + + return spec is not None + + +def commit_hash(): + global stored_commit_hash + + if stored_commit_hash is not None: + return stored_commit_hash + + try: + stored_commit_hash = run(f"{git} rev-parse HEAD").strip() + except Exception: + stored_commit_hash = "" + + return stored_commit_hash + + +def run_pip(args, desc=None): + if skip_install: + return + + index_url_line = f" --index-url {index_url}" if index_url != "" else "" + return run( + f'"{python}" -m pip {args} --prefer-binary{index_url_line}', + desc=f"Installing {desc}", + errdesc=f"Couldn't install {desc}", + ) + + +def run_python(code, desc=None, errdesc=None): + return run(f'"{python}" -c "{code}"', desc, errdesc) + + +def extract_arg(args, name): + return [x for x in args if x != name], name in args + + +def prepare_environment(): + commit = commit_hash() + + print(f"Python {sys.version}") + print(f"Commit hash: {commit}") + + torch_command = os.environ.get( + "TORCH_COMMAND", + "pip install torch torchaudio --extra-index-url https://download.pytorch.org/whl/cu118", + ) + + sys.argv, skip_install = extract_arg(sys.argv, "--skip-install") + if skip_install: + return + + sys.argv, reinstall_torch = extract_arg(sys.argv, "--reinstall-torch") + ngrok = "--ngrok" in sys.argv + + if reinstall_torch or not is_installed("torch") or not is_installed("torchaudio"): + run( + f'"{python}" -m {torch_command}', + "Installing torch and torchaudio", + "Couldn't install torch", + ) + + if not is_installed("pyngrok") and ngrok: + run_pip("install pyngrok", "ngrok") + + run( + f'"{python}" -m pip install -r requirements.txt', + desc=f"Installing requirements", + errdesc=f"Couldn't install requirements", + ) + + +def start(): + os.environ["PATH"] = ( + os.path.join(os.path.dirname(__file__), "bin") + + os.pathsep + + os.environ.get("PATH", "") + ) + subprocess.run( + [python, "webui.py", *sys.argv[1:]], + ) + + +if __name__ == "__main__": + prepare_environment() + start() diff --git a/lib/rvc/attentions.py b/lib/rvc/attentions.py new file mode 100644 index 0000000000000000000000000000000000000000..f2c1f69c261b3eb758f0b39490a1186153011abb --- /dev/null +++ b/lib/rvc/attentions.py @@ -0,0 +1,415 @@ +import math + +import torch +from torch import nn +from torch.nn import functional as F + +from . 
import commons +from .modules import LayerNorm + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=10, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class Decoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + proximal_bias=False, + proximal_init=True, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.encdec_attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + proximal_bias=proximal_bias, + proximal_init=proximal_init, + ) + ) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.encdec_attn_layers.append( + MultiHeadAttention( + hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + causal=True, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, h, h_mask): + """ + x: decoder input + h: encoder output + """ + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( + device=x.device, dtype=x.dtype + ) + encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * 
x_mask + return x + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), key_relative_embeddings + ) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert ( + t_s == t_t + ), "Local attention is only available for self-attention." 
+ block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). + x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) + + # Reshape and slice out the padded elements. + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. 
+ Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation=None, + causal=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + + if causal: + self.padding = self._causal_padding + else: + self.padding = self._same_padding + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(self.padding(x * x_mask)) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(self.padding(x * x_mask)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x diff --git a/lib/rvc/checkpoints.py b/lib/rvc/checkpoints.py new file mode 100644 index 0000000000000000000000000000000000000000..205ef98ba54fa821ee0c65d4bec0ee60fd2a4441 --- /dev/null +++ b/lib/rvc/checkpoints.py @@ -0,0 +1,149 @@ +import os +from collections import OrderedDict +from typing import * + +import torch + + +def write_config(state_dict: Dict[str, Any], cfg: Dict[str, Any]): + state_dict["config"] = [] + for key, x in cfg.items(): + state_dict["config"].append(x) + state_dict["params"] = cfg + + +def create_trained_model( + weights: Dict[str, Any], + version: Literal["v1", "v2"], + sr: str, + f0: bool, + emb_name: str, + emb_ch: int, + emb_output_layer: int, + epoch: int, + speaker_info: Optional[dict[str, int]] +): + state_dict = OrderedDict() + state_dict["weight"] = {} + for key in weights.keys(): + if "enc_q" in key: + continue + state_dict["weight"][key] = weights[key].half() + if sr == "40k": + write_config( + state_dict, + { + "spec_channels": 1025, + "segment_size": 32, + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3, 7, 11], + "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + "upsample_rates": [10, 10, 2, 2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16, 16, 4, 4], + "spk_embed_dim": 109 if speaker_info is None else len(speaker_info), + "gin_channels": 256, + "emb_channels": emb_ch, + "sr": 40000, + }, + ) + elif sr == "48k": + write_config( + state_dict, + { + "spec_channels": 1025, + "segment_size": 32, + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3, 7, 11], + "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 
5], [1, 3, 5]], + "upsample_rates": [10, 6, 2, 2, 2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16, 16, 4, 4, 4], + "spk_embed_dim": 109 if speaker_info is None else len(speaker_info), + "gin_channels": 256, + "emb_channels": emb_ch, + "sr": 48000, + }, + ) + elif sr == "32k": + write_config( + state_dict, + { + "spec_channels": 513, + "segment_size": 32, + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [3, 7, 11], + "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + "upsample_rates": [10, 4, 2, 2, 2], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [16, 16, 4, 4, 4], + "spk_embed_dim": 109 if speaker_info is None else len(speaker_info), + "gin_channels": 256, + "emb_channels": emb_ch, + "sr": 32000, + }, + ) + state_dict["version"] = version + state_dict["info"] = f"{epoch}epoch" + state_dict["sr"] = sr + state_dict["f0"] = 1 if f0 else 0 + state_dict["embedder_name"] = emb_name + state_dict["embedder_output_layer"] = emb_output_layer + if not speaker_info is None: + state_dict["speaker_info"] = {str(v): str(k) for k, v in speaker_info.items()} + return state_dict + + +def save( + model, + version: Literal["v1", "v2"], + sr: str, + f0: bool, + emb_name: str, + emb_ch: int, + emb_output_layer: int, + filepath: str, + epoch: int, + speaker_info: Optional[dict[str, int]] +): + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + + print(f"save: emb_name: {emb_name} {emb_ch}") + + state_dict = create_trained_model( + state_dict, + version, + sr, + f0, + emb_name, + emb_ch, + emb_output_layer, + epoch, + speaker_info + ) + os.makedirs(os.path.dirname(filepath), exist_ok=True) + torch.save(state_dict, filepath) diff --git a/lib/rvc/commons.py b/lib/rvc/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..99b91812646cd934ce69a5d829f99fd7ca23a919 --- /dev/null +++ b/lib/rvc/commons.py @@ -0,0 +1,163 @@ +import math + +import torch +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl += ( + 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) + ) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def slice_segments2(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, idx_str:idx_end] + return ret + + 
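+# Both helpers above cut a fixed-length window out of each batch item: slice_segments
+# works on [b, d, t] tensors and slice_segments2 on [b, t] tensors, starting at the
+# per-item offsets in ids_str. rand_slice_segments below draws those offsets at random.
+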
+def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( + num_timescales - 1 + ) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment + ) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) + + +def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2, 3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1.0 / norm_type) + return total_norm diff --git a/lib/rvc/config.py b/lib/rvc/config.py new file mode 100644 index 
0000000000000000000000000000000000000000..6a0dc817a7f194f38663f2c957dd8f488829903f --- /dev/null +++ b/lib/rvc/config.py @@ -0,0 +1,71 @@ +from typing import * + +from pydantic import BaseModel + + +class TrainConfigTrain(BaseModel): + log_interval: int + seed: int + epochs: int + learning_rate: float + betas: List[float] + eps: float + batch_size: int + fp16_run: bool + lr_decay: float + segment_size: int + init_lr_ratio: int + warmup_epochs: int + c_mel: int + c_kl: float + + +class TrainConfigData(BaseModel): + max_wav_value: float + sampling_rate: int + filter_length: int + hop_length: int + win_length: int + n_mel_channels: int + mel_fmin: float + mel_fmax: Any + + +class TrainConfigModel(BaseModel): + inter_channels: int + hidden_channels: int + filter_channels: int + n_heads: int + n_layers: int + kernel_size: int + p_dropout: int + resblock: str + resblock_kernel_sizes: List[int] + resblock_dilation_sizes: List[List[int]] + upsample_rates: List[int] + upsample_initial_channel: int + upsample_kernel_sizes: List[int] + use_spectral_norm: bool + gin_channels: int + emb_channels: int + spk_embed_dim: int + + +class TrainConfig(BaseModel): + version: Literal["v1", "v2"] = "v2" + train: TrainConfigTrain + data: TrainConfigData + model: TrainConfigModel + + +class DatasetMetaItem(BaseModel): + gt_wav: str + co256: str + f0: Optional[str] + f0nsf: Optional[str] + speaker_id: int + + +class DatasetMetadata(BaseModel): + files: Dict[str, DatasetMetaItem] + # mute: DatasetMetaItem diff --git a/lib/rvc/data_utils.py b/lib/rvc/data_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bebd7915f0f10e4c8a7c0c905d61e4e0a7df49f4 --- /dev/null +++ b/lib/rvc/data_utils.py @@ -0,0 +1,515 @@ +import os +import traceback + +import numpy as np +import torch +import torch.utils.data + +from .config import DatasetMetadata, DatasetMetaItem, TrainConfigData +from .mel_processing import spectrogram_torch +from .utils import load_wav_to_torch + + +class TextAudioLoader(torch.utils.data.Dataset): + """ + 1) loads audio, text pairs + 2) normalizes text and converts them to sequences of integers + 3) computes spectrograms from audio files. 
+ """ + + def __init__(self, dataset_meta: DatasetMetadata, data: TrainConfigData): + self.dataset_meta = dataset_meta + self.max_wav_value = data.max_wav_value + self.sampling_rate = data.sampling_rate + self.filter_length = data.filter_length + self.hop_length = data.hop_length + self.win_length = data.win_length + self.sampling_rate = data.sampling_rate + self.min_text_len = getattr(data, "min_text_len", 1) + self.max_text_len = getattr(data, "max_text_len", 5000) + self._filter() + + def _filter(self): + """ + Filter text & store spec lengths + """ + # Store spectrogram lengths for Bucketing + # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) + # spec_length = wav_length // hop_length + lengths = [] + for key, data in self.dataset_meta.files.items(): + if ( + self.min_text_len <= len(data.co256) + and len(data.co256) <= self.max_text_len + ): + lengths.append(os.path.getsize(data.gt_wav) // (2 * self.hop_length)) + else: + del self.dataset_meta.files[key] + self.lengths = lengths + + def get_sid(self, sid): + sid = torch.LongTensor([int(sid)]) + return sid + + def get_audio_text_pair(self, data: DatasetMetaItem): + # separate filename and text + file = data.gt_wav + phone = data.co256 + dv = data.speaker_id + + phone = self.get_labels(phone) + spec, wav = self.get_audio(file) + dv = self.get_sid(dv) + + len_phone = phone.size()[0] + len_spec = spec.size()[-1] + if len_phone != len_spec: + len_min = min(len_phone, len_spec) + len_wav = len_min * self.hop_length + spec = spec[:, :len_min] + wav = wav[:, :len_wav] + phone = phone[:len_min, :] + return (spec, wav, phone, dv) + + def get_labels(self, phone): + phone = np.load(phone) + phone = np.repeat(phone, 2, axis=0) + n_num = min(phone.shape[0], 900) # DistributedBucketSampler + phone = phone[:n_num, :] + phone = torch.FloatTensor(phone) + return phone + + def get_audio(self, filename): + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.sampling_rate: + raise ValueError( + "{} SR doesn't match target {} SR".format( + sampling_rate, self.sampling_rate + ) + ) + # audio_norm = audio / self.max_wav_value + audio_norm = audio.unsqueeze(0) + spec_filename = filename.replace(".wav", ".spec.pt") + if os.path.exists(spec_filename): + try: + spec = torch.load(spec_filename) + except: + print(spec_filename, traceback.format_exc()) + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + else: + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + return spec, audio_norm + + def __getitem__(self, index): + _, data = list(self.dataset_meta.files.items())[index] + return self.get_audio_text_pair(data) + + def __len__(self): + return len(self.dataset_meta.files) + + +class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): + """ + 1) loads audio, text pairs + 2) normalizes text and converts them to sequences of integers + 3) computes spectrograms from audio files. 
+ """ + + def __init__(self, dataset_meta: DatasetMetadata, data: TrainConfigData): + self.dataset_meta = dataset_meta + self.max_wav_value = data.max_wav_value + self.sampling_rate = data.sampling_rate + self.filter_length = data.filter_length + self.hop_length = data.hop_length + self.win_length = data.win_length + self.sampling_rate = data.sampling_rate + self.min_text_len = getattr(data, "min_text_len", 1) + self.max_text_len = getattr(data, "max_text_len", 5000) + self._filter() + + def _filter(self): + """ + Filter text & store spec lengths + """ + # Store spectrogram lengths for Bucketing + # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) + # spec_length = wav_length // hop_length + lengths = [] + for key, data in self.dataset_meta.files.items(): + if ( + self.min_text_len <= len(data.co256) + and len(data.co256) <= self.max_text_len + ): + lengths.append(os.path.getsize(data.gt_wav) // (2 * self.hop_length)) + else: + del self.dataset_meta.files[key] + self.lengths = lengths + + def get_sid(self, sid): + sid = torch.LongTensor([int(sid)]) + return sid + + def get_audio_text_pair(self, data: DatasetMetaItem): + # separate filename and text + file = data.gt_wav + phone = data.co256 + pitch = data.f0 + pitchf = data.f0nsf + dv = data.speaker_id + + phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf) + spec, wav = self.get_audio(file) + dv = self.get_sid(dv) + + len_phone = phone.size()[0] + len_spec = spec.size()[-1] + # print(123,phone.shape,pitch.shape,spec.shape) + if len_phone != len_spec: + len_min = min(len_phone, len_spec) + # amor + len_wav = len_min * self.hop_length + + spec = spec[:, :len_min] + wav = wav[:, :len_wav] + + phone = phone[:len_min, :] + pitch = pitch[:len_min] + pitchf = pitchf[:len_min] + + return (spec, wav, phone, pitch, pitchf, dv) + + def get_labels(self, phone, pitch, pitchf): + phone = np.load(phone) + phone = np.repeat(phone, 2, axis=0) + pitch = np.load(pitch) + pitchf = np.load(pitchf) + n_num = min(phone.shape[0], 900) # DistributedBucketSampler + # print(234,phone.shape,pitch.shape) + phone = phone[:n_num, :] + pitch = pitch[:n_num] + pitchf = pitchf[:n_num] + phone = torch.FloatTensor(phone) + pitch = torch.LongTensor(pitch) + pitchf = torch.FloatTensor(pitchf) + return phone, pitch, pitchf + + def get_audio(self, filename): + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.sampling_rate: + raise ValueError( + "{} SR doesn't match target {} SR".format( + sampling_rate, self.sampling_rate + ) + ) + # audio_norm = audio / self.max_wav_value + audio_norm = audio.unsqueeze(0) + spec_filename = filename.replace(".wav", ".spec.pt") + if os.path.exists(spec_filename): + try: + spec = torch.load(spec_filename) + except: + print(spec_filename, traceback.format_exc()) + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + else: + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + return spec, audio_norm + + def __getitem__(self, index): + _, data = list(self.dataset_meta.files.items())[index] + return self.get_audio_text_pair(data) + + def __len__(self): + return len(self.dataset_meta.files) + + +class 
TextAudioCollateMultiNSFsid: + """Zero-pads model inputs and targets""" + + def __init__(self, return_ids=False): + self.return_ids = return_ids + + def __call__(self, batch): + """Collate's training batch from normalized text and aduio + PARAMS + ------ + batch: [text_normalized, spec_normalized, wav_normalized] + """ + # Right zero-pad all one-hot text sequences to max input length + _, ids_sorted_decreasing = torch.sort( + torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True + ) + + max_spec_len = max([x[0].size(1) for x in batch]) + max_wave_len = max([x[1].size(1) for x in batch]) + spec_lengths = torch.LongTensor(len(batch)) + wave_lengths = torch.LongTensor(len(batch)) + spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) + wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) + spec_padded.zero_() + wave_padded.zero_() + + max_phone_len = max([x[2].size(0) for x in batch]) + phone_lengths = torch.LongTensor(len(batch)) + phone_padded = torch.FloatTensor( + len(batch), max_phone_len, batch[0][2].shape[1] + ) # (spec, wav, phone, pitch) + pitch_padded = torch.LongTensor(len(batch), max_phone_len) + pitchf_padded = torch.FloatTensor(len(batch), max_phone_len) + phone_padded.zero_() + pitch_padded.zero_() + pitchf_padded.zero_() + # dv = torch.FloatTensor(len(batch), 256)#gin=256 + sid = torch.LongTensor(len(batch)) + + for i in range(len(ids_sorted_decreasing)): + row = batch[ids_sorted_decreasing[i]] + + spec = row[0] + spec_padded[i, :, : spec.size(1)] = spec + spec_lengths[i] = spec.size(1) + + wave = row[1] + wave_padded[i, :, : wave.size(1)] = wave + wave_lengths[i] = wave.size(1) + + phone = row[2] + phone_padded[i, : phone.size(0), :] = phone + phone_lengths[i] = phone.size(0) + + pitch = row[3] + pitch_padded[i, : pitch.size(0)] = pitch + pitchf = row[4] + pitchf_padded[i, : pitchf.size(0)] = pitchf + + # dv[i] = row[5] + sid[i] = row[5] + + return ( + phone_padded, + phone_lengths, + pitch_padded, + pitchf_padded, + spec_padded, + spec_lengths, + wave_padded, + wave_lengths, + # dv + sid, + ) + + +class TextAudioCollate: + """Zero-pads model inputs and targets""" + + def __init__(self, return_ids=False): + self.return_ids = return_ids + + def __call__(self, batch): + """Collate's training batch from normalized text and aduio + PARAMS + ------ + batch: [text_normalized, spec_normalized, wav_normalized] + """ + # Right zero-pad all one-hot text sequences to max input length + _, ids_sorted_decreasing = torch.sort( + torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True + ) + + max_spec_len = max([x[0].size(1) for x in batch]) + max_wave_len = max([x[1].size(1) for x in batch]) + spec_lengths = torch.LongTensor(len(batch)) + wave_lengths = torch.LongTensor(len(batch)) + spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) + wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) + spec_padded.zero_() + wave_padded.zero_() + + max_phone_len = max([x[2].size(0) for x in batch]) + phone_lengths = torch.LongTensor(len(batch)) + phone_padded = torch.FloatTensor( + len(batch), max_phone_len, batch[0][2].shape[1] + ) + phone_padded.zero_() + sid = torch.LongTensor(len(batch)) + + for i in range(len(ids_sorted_decreasing)): + row = batch[ids_sorted_decreasing[i]] + + spec = row[0] + spec_padded[i, :, : spec.size(1)] = spec + spec_lengths[i] = spec.size(1) + + wave = row[1] + wave_padded[i, :, : wave.size(1)] = wave + wave_lengths[i] = wave.size(1) + + phone = row[2] + phone_padded[i, : 
phone.size(0), :] = phone + phone_lengths[i] = phone.size(0) + + sid[i] = row[3] + + return ( + phone_padded, + phone_lengths, + spec_padded, + spec_lengths, + wave_padded, + wave_lengths, + sid, + ) + + +class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): + """ + Maintain similar input lengths in a batch. + Length groups are specified by boundaries. + Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}. + + It removes samples which are not included in the boundaries. + Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. + """ + + def __init__( + self, + dataset, + batch_size, + boundaries, + num_replicas=None, + rank=None, + shuffle=True, + ): + super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) + self.lengths = dataset.lengths + self.batch_size = batch_size + self.boundaries = boundaries + + self.buckets, self.num_samples_per_bucket = self._create_buckets() + self.total_size = sum(self.num_samples_per_bucket) + self.num_samples = self.total_size // self.num_replicas + + def _create_buckets(self): + buckets = [[] for _ in range(len(self.boundaries) - 1)] + for i in range(len(self.lengths)): + length = self.lengths[i] + idx_bucket = self._bisect(length) + if idx_bucket != -1: + buckets[idx_bucket].append(i) + + for i in range(len(buckets) - 1, -1, -1): # + if len(buckets[i]) == 0: + buckets.pop(i) + self.boundaries.pop(i + 1) + + num_samples_per_bucket = [] + for i in range(len(buckets)): + len_bucket = len(buckets[i]) + total_batch_size = self.num_replicas * self.batch_size + rem = ( + total_batch_size - (len_bucket % total_batch_size) + ) % total_batch_size + num_samples_per_bucket.append(len_bucket + rem) + return buckets, num_samples_per_bucket + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + + indices = [] + if self.shuffle: + for bucket in self.buckets: + indices.append(torch.randperm(len(bucket), generator=g).tolist()) + else: + for bucket in self.buckets: + indices.append(list(range(len(bucket)))) + + batches = [] + for i in range(len(self.buckets)): + bucket = self.buckets[i] + len_bucket = len(bucket) + ids_bucket = indices[i] + num_samples_bucket = self.num_samples_per_bucket[i] + + # add extra samples to make it evenly divisible + rem = num_samples_bucket - len_bucket + ids_bucket = ( + ids_bucket + + ids_bucket * (rem // len_bucket) + + ids_bucket[: (rem % len_bucket)] + ) + + # subsample + ids_bucket = ids_bucket[self.rank :: self.num_replicas] + + # batching + for j in range(len(ids_bucket) // self.batch_size): + batch = [ + bucket[idx] + for idx in ids_bucket[ + j * self.batch_size : (j + 1) * self.batch_size + ] + ] + batches.append(batch) + + if self.shuffle: + batch_ids = torch.randperm(len(batches), generator=g).tolist() + batches = [batches[i] for i in batch_ids] + self.batches = batches + + assert len(self.batches) * self.batch_size == self.num_samples + return iter(self.batches) + + def _bisect(self, x, lo=0, hi=None): + if hi is None: + hi = len(self.boundaries) - 1 + + if hi > lo: + mid = (hi + lo) // 2 + if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]: + return mid + elif x <= self.boundaries[mid]: + return self._bisect(x, lo, mid) + else: + return self._bisect(x, mid + 1, hi) + else: + return -1 + + def __len__(self): + return self.num_samples // self.batch_size diff --git a/lib/rvc/losses.py b/lib/rvc/losses.py 
new file mode 100644 index 0000000000000000000000000000000000000000..b1b263e4c205e78ffe970f622ab6ff68f36d3b17 --- /dev/null +++ b/lib/rvc/losses.py @@ -0,0 +1,58 @@ +import torch + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + rl = rl.float().detach() + gl = gl.float() + loss += torch.mean(torch.abs(rl - gl)) + + return loss * 2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + dr = dr.float() + dg = dg.float() + r_loss = torch.mean((1 - dr) ** 2) + g_loss = torch.mean(dg**2) + loss += r_loss + g_loss + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + dg = dg.float() + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + + +def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): + """ + z_p, logs_q: [b, h, t_t] + m_p, logs_p: [b, h, t_t] + """ + z_p = z_p.float() + logs_q = logs_q.float() + m_p = m_p.float() + logs_p = logs_p.float() + z_mask = z_mask.float() + + kl = logs_p - logs_q - 0.5 + kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) + kl = torch.sum(kl * z_mask) + l = kl / torch.sum(z_mask) + return l diff --git a/lib/rvc/mel_processing.py b/lib/rvc/mel_processing.py new file mode 100644 index 0000000000000000000000000000000000000000..e41ea4900aab73e67bc893fa82493fca38b18d23 --- /dev/null +++ b/lib/rvc/mel_processing.py @@ -0,0 +1,113 @@ +import torch +import torch.utils.data +from librosa.filters import mel as librosa_mel_fn + +MAX_WAV_VALUE = 32768.0 + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + return dynamic_range_compression_torch(magnitudes) + + +def spectral_de_normalize_torch(magnitudes): + return dynamic_range_decompression_torch(magnitudes) + + +mel_basis = {} +hann_window = {} + + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + if torch.min(y) < -1.07: + print("min value is ", torch.min(y)) + if torch.max(y) > 1.07: + print("max value is ", torch.max(y)) + + global hann_window + dtype_device = str(y.dtype) + "_" + str(y.device) + wnsize_dtype_device = str(win_size) + "_" + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( + dtype=y.dtype, device=y.device + ) + + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + + # mps does not support torch.stft. 
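+    # (workaround: when running on Apple's MPS backend, the padded signal and the Hann
+    #  window are moved to the CPU, torch.stft runs there, and the resulting spectrogram
+    #  is moved back to the original device right after the call)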
+ if y.device.type == "mps": + i = y.cpu() + win = hann_window[wnsize_dtype_device].cpu() + else: + i = y + win = hann_window[wnsize_dtype_device] + spec = torch.stft( + i, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=win, + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ).to(device=y.device) + + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): + global mel_basis + dtype_device = str(spec.dtype) + "_" + str(spec.device) + fmax_dtype_device = str(fmax) + "_" + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( + dtype=spec.dtype, device=spec.device + ) + melspec = torch.matmul(mel_basis[fmax_dtype_device], spec) + melspec = spectral_normalize_torch(melspec) + return melspec + + +def mel_spectrogram_torch( + y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False +): + """Convert waveform into Mel-frequency Log-amplitude spectrogram. + + Args: + y :: (B, T) - Waveforms + Returns: + melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram + """ + # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame) + spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center) + + # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame) + melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax) + + return melspec diff --git a/lib/rvc/models.py b/lib/rvc/models.py new file mode 100644 index 0000000000000000000000000000000000000000..eb6647f1a293eae7e552bd044c43148d5fdbf71a --- /dev/null +++ b/lib/rvc/models.py @@ -0,0 +1,853 @@ +import math + +import numpy as np +import torch +from torch import nn +from torch.nn import Conv1d, Conv2d, ConvTranspose1d +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from . 
import attentions, commons, modules +from .commons import get_padding, init_weights + + +class TextEncoder(nn.Module): + def __init__( + self, + out_channels: int, + hidden_channels: int, + filter_channels: int, + emb_channels: int, + n_heads: int, + n_layers: int, + kernel_size: int, + p_dropout: int, + f0: bool = True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.emb_channels = emb_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(emb_channels, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + 
torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def forward(self, f0, upp): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0[:, None].transpose(1, 2) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, 
device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( + idx + 2 + ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + rad_values = (f0_buf / self.sampling_rate) % 1 ###%1意味着n_har的乘积无法后处理优化 + rand_ini = torch.rand( + f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1意味着后面的cumsum无法再优化 + tmp_over_one *= upp + tmp_over_one = F.interpolate( + tmp_over_one.transpose(2, 1), + scale_factor=upp, + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = F.interpolate( + rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose( + 2, 1 + ) ####### + tmp_over_one %= 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + sine_waves = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + ) + sine_waves = sine_waves * self.sine_amp + uv = self._f02uv(f0) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + if self.is_half == True: + sine_wavs = sine_wavs.half() + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, harmonic_num=0, is_half=is_half + ) + 
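+ # noise_convs (filled in the upsampling loop below) downsample the sample-rate harmonic
+ # source so that each upsampling stage can add an excitation whose time resolution matches
+ # that stage's feature map; the last entry is a 1x1 conv because no further downsampling is needed.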
self.noise_convs = nn.ModuleList() + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + if i + 1 < len(upsample_rates): + stride_f0 = np.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + ) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = np.prod(upsample_rates) + + def forward(self, x, f0, g=None): + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + x_source = self.noise_convs[i](har_source) + x = x + x_source + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class SynthesizerTrnMs256NSFSid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + emb_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.emb_channels = emb_channels + self.sr = sr + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder( + inter_channels, + hidden_channels, + filter_channels, + emb_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + 
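+ # Component roles: enc_p builds the prior from the content (phone) features and coarse pitch,
+ # enc_q builds the posterior from the linear spectrogram, flow maps between the two latent
+ # spaces, and dec (the NSF decoder constructed next) renders audio from a latent slice plus
+ # the continuous f0 contour, all conditioned on the speaker embedding emb_g.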
self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print( + "gin_channels:", + gin_channels, + "self.spk_embed_dim:", + self.spk_embed_dim, + "emb_channels:", + emb_channels, + ) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward( + self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds + ): # 这里ds是id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs256NSFSidNono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + emb_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.emb_channels = emb_channels + self.sr = sr + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder( + inter_channels, + hidden_channels, + filter_channels, + emb_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + 
gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print( + "gin_channels:", + gin_channels, + "self.spk_embed_dim:", + self.spk_embed_dim, + "emb_channels:", + emb_channels, + ) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward(self, phone, phone_lengths, y, y_lengths, ds): # 这里ds是id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1是t,广播的 + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, sid, max_len=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec((z * x_mask)[:, :, :max_len], g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = 
x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, use_spectral_norm=False, periods=[2, 3, 5, 7, 11, 17]): + super(MultiPeriodDiscriminator, self).__init__() + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs diff --git a/lib/rvc/modules.py b/lib/rvc/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..f37c6c15f80b76e28a93e20a0c3f219b4b92cd4a --- /dev/null +++ b/lib/rvc/modules.py @@ -0,0 +1,518 @@ +import math + +import torch +from torch import nn +from torch.nn import Conv1d +from torch.nn import functional as F +from torch.nn.utils import remove_weight_norm, weight_norm + +from . import commons +from .commons import get_padding, init_weights +from .transforms import piecewise_rational_quadratic_transform + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class ConvReluNorm(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + out_channels, + kernel_size, + n_layers, + p_dropout, + ): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." 
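+ # A stack of Conv1d -> LayerNorm -> ReLU -> Dropout blocks; the final 1x1 projection is
+ # zero-initialized and added back onto the input as a residual in forward().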
+ + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append( + nn.Conv1d( + in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) + for _ in range(n_layers - 1): + self.conv_layers.append( + nn.Conv1d( + hidden_channels, + hidden_channels, + kernel_size, + padding=kernel_size // 2, + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size**i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append( + nn.Conv1d( + channels, + channels, + kernel_size, + groups=channels, + dilation=dilation, + padding=padding, + ) + ) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__( + self, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d( + gin_channels, 2 * hidden_channels * n_layers, 1 + ) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + + for i in range(n_layers): + dilation = dilation_rate**i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, g=None, **kwargs): + output = torch.zeros_like(x) + 
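+ # Non-causal WaveNet core: each layer applies a dilated conv, adds its slice of the global
+ # conditioning (if any), gates with fused tanh/sigmoid, and splits a 1x1 projection into a
+ # residual path (fed forward to the next layer) and a skip path accumulated into `output`.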
n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + else: + g_l = torch.zeros_like(x_in) + + acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + self.convs2.apply(init_weights) + + def forward(self, x, x_mask=None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c2(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.convs = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + ] + ) + self.convs.apply(init_weights) + + def forward(self, x, x_mask=None): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Log(nn.Module): + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = 
torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class Flip(nn.Module): + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels, 1)) + self.logs = nn.Parameter(torch.zeros(channels, 1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1, 2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingLayer(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False, + ): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=p_dropout, + gin_channels=gin_channels, + ) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class ConvFlow(nn.Module): + def __init__( + self, + in_channels, + filter_channels, + kernel_size, + n_layers, + num_bins=10, + tail_bound=5.0, + ): + super().__init__() + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.num_bins = num_bins + self.tail_bound = tail_bound + self.half_channels = in_channels // 2 + + self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) + self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) + self.proj = nn.Conv1d( + filter_channels, self.half_channels * (num_bins * 3 - 1), 1 + ) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) + h = self.convs(h, x_mask, g=g) + h = self.proj(h) * x_mask + + b, c, t = x0.shape + h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 
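+ # The projection produced num_bins * 3 - 1 values per half-channel: num_bins spline widths,
+ # num_bins heights, and num_bins - 1 interior-knot derivatives, consumed by the
+ # rational-quadratic spline transform below.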
+ + unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) + unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( + self.filter_channels + ) + unnormalized_derivatives = h[..., 2 * self.num_bins :] + + x1, logabsdet = piecewise_rational_quadratic_transform( + x1, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=reverse, + tails="linear", + tail_bound=self.tail_bound, + ) + + x = torch.cat([x0, x1], 1) * x_mask + logdet = torch.sum(logabsdet * x_mask, [1, 2]) + if not reverse: + return x, logdet + else: + return x diff --git a/lib/rvc/pipeline.py b/lib/rvc/pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..14986f9f2144a91ec4b5b95de0919d773cc4d456 --- /dev/null +++ b/lib/rvc/pipeline.py @@ -0,0 +1,453 @@ +import os +import traceback +from typing import * + +import faiss +import numpy as np +import pyworld +import scipy.signal as signal +import torch +import torch.nn.functional as F +import torchcrepe +from torch import Tensor +# from faiss.swigfaiss_avx2 import IndexIVFFlat # cause crash on windows' faiss-cpu installed from pip +from fairseq.models.hubert import HubertModel + +from .models import SynthesizerTrnMs256NSFSid + + +class VocalConvertPipeline(object): + def __init__(self, tgt_sr: int, device: Union[str, torch.device], is_half: bool): + if isinstance(device, str): + device = torch.device(device) + if device.type == "cuda": + vram = torch.cuda.get_device_properties(device).total_memory / 1024**3 + else: + vram = None + + if vram is not None and vram <= 4: + self.x_pad = 1 + self.x_query = 5 + self.x_center = 30 + self.x_max = 32 + elif vram is not None and vram <= 5: + self.x_pad = 1 + self.x_query = 6 + self.x_center = 38 + self.x_max = 41 + else: + self.x_pad = 3 + self.x_query = 10 + self.x_center = 60 + self.x_max = 65 + + self.sr = 16000 # hubert input sample rate + self.window = 160 # hubert input window + self.t_pad = self.sr * self.x_pad # padding time for each utterance + self.t_pad_tgt = tgt_sr * self.x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sr * self.x_query # query time before and after query point + self.t_center = self.sr * self.x_center # query cut point position + self.t_max = self.sr * self.x_max # max time for no query + self.device = device + self.is_half = is_half + + def get_optimal_torch_device(self, index: int = 0) -> torch.device: + # Get cuda device + if torch.cuda.is_available(): + return torch.device(f"cuda:{index % torch.cuda.device_count()}") # Very fast + elif torch.backends.mps.is_available(): + return torch.device("mps") + # Insert an else here to grab "xla" devices if available. TO DO later. Requires the torch_xla.core.xla_model library + # Else wise return the "cpu" as a torch device, + return torch.device("cpu") + + def get_f0_crepe_computation( + self, + x, + f0_min, + f0_max, + p_len, + hop_length=64, # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time. + model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full + ): + x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float. 
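+ # Normalize by the 99.9th-percentile absolute amplitude so torchcrepe sees roughly
+ # unit-range audio regardless of the recording level.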
+ x /= np.quantile(np.abs(x), 0.999) + torch_device = self.get_optimal_torch_device() + audio = torch.from_numpy(x).to(torch_device, copy=True) + audio = torch.unsqueeze(audio, dim=0) + if audio.ndim == 2 and audio.shape[0] > 1: + audio = torch.mean(audio, dim=0, keepdim=True).detach() + audio = audio.detach() + print("Initiating prediction with a crepe_hop_length of: " + str(hop_length)) + pitch: Tensor = torchcrepe.predict( + audio, + self.sr, + hop_length, + f0_min, + f0_max, + model, + batch_size=hop_length * 2, + device=torch_device, + pad=True + ) + p_len = p_len or x.shape[0] // hop_length + # Resize the pitch for final f0 + source = np.array(pitch.squeeze(0).cpu().float().numpy()) + source[source < 0.001] = np.nan + target = np.interp( + np.arange(0, len(source) * p_len, len(source)) / p_len, + np.arange(0, len(source)), + source + ) + f0 = np.nan_to_num(target) + return f0 # Resized f0 + + def get_f0_official_crepe_computation( + self, + x, + f0_min, + f0_max, + model="full", + ): + # Pick a batch size that doesn't cause memory errors on your gpu + batch_size = 512 + # Compute pitch using first gpu + audio = torch.tensor(np.copy(x))[None].float() + f0, pd = torchcrepe.predict( + audio, + self.sr, + self.window, + f0_min, + f0_max, + model, + batch_size=batch_size, + device=self.device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 3) + f0 = torchcrepe.filter.mean(f0, 3) + f0[pd < 0.1] = 0 + f0 = f0[0].cpu().numpy() + return f0 + + def get_f0( + self, + x: np.ndarray, + p_len: int, + f0_up_key: int, + f0_method: str, + inp_f0: np.ndarray = None, + ): + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + + if f0_method == "harvest": + f0, t = pyworld.harvest( + x.astype(np.double), + fs=self.sr, + f0_ceil=f0_max, + f0_floor=f0_min, + frame_period=10, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr) + f0 = signal.medfilt(f0, 3) + elif f0_method == "dio": + f0, t = pyworld.dio( + x.astype(np.double), + fs=self.sr, + f0_ceil=f0_max, + f0_floor=f0_min, + frame_period=10, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.sr) + f0 = signal.medfilt(f0, 3) + elif f0_method == "mangio-crepe": + f0 = self.get_f0_crepe_computation(x, f0_min, f0_max, p_len, 160, "full") + elif f0_method == "crepe": + f0 = self.get_f0_official_crepe_computation(x, f0_min, f0_max, "full") + + f0 *= pow(2, f0_up_key / 12) + tf0 = self.sr // self.window # f0 points per second + if inp_f0 is not None: + delta_t = np.round( + (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 + ).astype("int16") + replace_f0 = np.interp( + list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] + ) + shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] + f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ + :shape + ] + + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int) + return f0_coarse, f0bak # 1-0 + + def _convert( + self, + model: HubertModel, + embedding_output_layer: int, + net_g: SynthesizerTrnMs256NSFSid, + sid: int, + audio: np.ndarray, + pitch: np.ndarray, + pitchf: np.ndarray, + index: faiss.IndexIVFFlat, + big_npy: np.ndarray, + index_rate: float, + ): + feats = torch.from_numpy(audio) + if self.is_half: + feats = feats.half() + else: + feats = 
feats.float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + feats = feats.view(1, -1) + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + + half_support = ( + self.device.type == "cuda" + and torch.cuda.get_device_capability(self.device)[0] >= 5.3 + ) + is_feats_dim_768 = net_g.emb_channels == 768 + + if isinstance(model, tuple): + feats = model[0]( + feats.squeeze(0).squeeze(0).to(self.device), + return_tensors="pt", + sampling_rate=16000, + ) + if self.is_half: + feats = feats.input_values.to(self.device).half() + else: + feats = feats.input_values.to(self.device) + with torch.no_grad(): + if is_feats_dim_768: + feats = model[1](feats).last_hidden_state + else: + feats = model[1](feats).extract_features + else: + inputs = { + "source": feats.half().to(self.device) + if half_support + else feats.to(self.device), + "padding_mask": padding_mask.to(self.device), + "output_layer": embedding_output_layer, + } + + if not half_support: + model = model.float() + inputs["source"] = inputs["source"].float() + + with torch.no_grad(): + logits = model.extract_features(**inputs) + if is_feats_dim_768: + feats = logits[0] + else: + feats = model.final_proj(logits[0]) + + if ( + isinstance(index, type(None)) == False + and isinstance(big_npy, type(None)) == False + and index_rate != 0 + ): + npy = feats[0].cpu().numpy() + if self.is_half: + npy = npy.astype("float32") + + score, ix = index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + + if self.is_half: + npy = npy.astype("float16") + feats = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + + (1 - index_rate) * feats + ) + + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + + p_len = audio.shape[0] // self.window + if feats.shape[1] < p_len: + p_len = feats.shape[1] + if pitch != None and pitchf != None: + pitch = pitch[:, :p_len] + pitchf = pitchf[:, :p_len] + p_len = torch.tensor([p_len], device=self.device).long() + with torch.no_grad(): + if pitch != None and pitchf != None: + audio1 = ( + (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768) + .data.cpu() + .float() + .numpy() + .astype(np.int16) + ) + else: + audio1 = ( + (net_g.infer(feats, p_len, sid)[0][0, 0] * 32768) + .data.cpu() + .float() + .numpy() + .astype(np.int16) + ) + del feats, p_len, padding_mask + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio1 + + def __call__( + self, + model: HubertModel, + embedding_output_layer: int, + net_g: SynthesizerTrnMs256NSFSid, + sid: int, + audio: np.ndarray, + transpose: int, + f0_method: str, + file_index: str, + index_rate: float, + if_f0: bool, + f0_file: str = None, + ): + if file_index != "" and os.path.exists(file_index) and index_rate != 0: + try: + index = faiss.read_index(file_index) + # big_npy = np.load(file_big_npy) + big_npy = index.reconstruct_n(0, index.ntotal) + except: + traceback.print_exc() + index = big_npy = None + else: + index = big_npy = None + + bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) + audio = signal.filtfilt(bh, ah, audio) + + audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") + opt_ts = [] + if audio_pad.shape[0] > self.t_max: + audio_sum = np.zeros_like(audio) + for i in range(self.window): + audio_sum += audio_pad[i : i - self.window] + for t in range(self.t_center, audio.shape[0], 
self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + np.abs(audio_sum[t - self.t_query : t + self.t_query]) + == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() + )[0][0] + ) + + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") + p_len = audio_pad.shape[0] // self.window + inp_f0 = None + if hasattr(f0_file, "name"): + try: + with open(f0_file.name, "r") as f: + lines = f.read().strip("\n").split("\n") + inp_f0 = [] + for line in lines: + inp_f0.append([float(i) for i in line.split(",")]) + inp_f0 = np.array(inp_f0, dtype="float32") + except: + traceback.print_exc() + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + pitch, pitchf = None, None + if if_f0 == 1: + pitch, pitchf = self.get_f0(audio_pad, p_len, transpose, f0_method, inp_f0) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if self.device.type == "mps": + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + + audio_opt = [] + + s = 0 + t = None + + for t in opt_ts: + t = t // self.window * self.window + if if_f0 == 1: + audio_opt.append( + self._convert( + model, + embedding_output_layer, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], + index, + big_npy, + index_rate, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self._convert( + model, + embedding_output_layer, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + None, + None, + index, + big_npy, + index_rate, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + s = t + if if_f0 == 1: + audio_opt.append( + self._convert( + model, + embedding_output_layer, + net_g, + sid, + audio_pad[t:], + pitch[:, t // self.window :] if t is not None else pitch, + pitchf[:, t // self.window :] if t is not None else pitchf, + index, + big_npy, + index_rate, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self._convert( + model, + embedding_output_layer, + net_g, + sid, + audio_pad[t:], + None, + None, + index, + big_npy, + index_rate, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + audio_opt = np.concatenate(audio_opt) + del pitch, pitchf, sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio_opt diff --git a/lib/rvc/preprocessing/extract_f0.py b/lib/rvc/preprocessing/extract_f0.py new file mode 100644 index 0000000000000000000000000000000000000000..617a438ffdf0310206c519c150c984a5b2151c42 --- /dev/null +++ b/lib/rvc/preprocessing/extract_f0.py @@ -0,0 +1,221 @@ +import os +import traceback +from concurrent.futures import ProcessPoolExecutor +from typing import * +import multiprocessing as mp + +import numpy as np +import pyworld +import torch +import torchcrepe +from torch import Tensor +from tqdm import tqdm + +from lib.rvc.utils import load_audio + +def get_optimal_torch_device(index: int = 0) -> torch.device: + # Get cuda device + if torch.cuda.is_available(): + return torch.device(f"cuda:{index % torch.cuda.device_count()}") # Very fast + elif torch.backends.mps.is_available(): + return torch.device("mps") + # Insert an else here to grab "xla" devices if available. TO DO later. 
Requires the torch_xla.core.xla_model library + # Else wise return the "cpu" as a torch device, + return torch.device("cpu") + +def get_f0_official_crepe_computation( + x, + sr, + f0_min, + f0_max, + model="full", +): + batch_size = 512 + torch_device = get_optimal_torch_device() + audio = torch.tensor(np.copy(x))[None].float() + f0, pd = torchcrepe.predict( + audio, + sr, + 160, + f0_min, + f0_max, + model, + batch_size=batch_size, + device=torch_device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 3) + f0 = torchcrepe.filter.mean(f0, 3) + f0[pd < 0.1] = 0 + f0 = f0[0].cpu().numpy() + f0 = f0[1:] # Get rid of extra first frame + return f0 + +def get_f0_crepe_computation( + x, + sr, + f0_min, + f0_max, + hop_length=160, # 512 before. Hop length changes the speed that the voice jumps to a different dramatic pitch. Lower hop lengths means more pitch accuracy but longer inference time. + model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full +): + x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float. + x /= np.quantile(np.abs(x), 0.999) + torch_device = get_optimal_torch_device() + audio = torch.from_numpy(x).to(torch_device, copy=True) + audio = torch.unsqueeze(audio, dim=0) + if audio.ndim == 2 and audio.shape[0] > 1: + audio = torch.mean(audio, dim=0, keepdim=True).detach() + audio = audio.detach() + print("Initiating prediction with a crepe_hop_length of: " + str(hop_length)) + pitch: Tensor = torchcrepe.predict( + audio, + sr, + hop_length, + f0_min, + f0_max, + model, + batch_size=hop_length * 2, + device=torch_device, + pad=True + ) + p_len = x.shape[0] // hop_length + # Resize the pitch for final f0 + source = np.array(pitch.squeeze(0).cpu().float().numpy()) + source[source < 0.001] = np.nan + target = np.interp( + np.arange(0, len(source) * p_len, len(source)) / p_len, + np.arange(0, len(source)), + source + ) + f0 = np.nan_to_num(target) + f0 = f0[1:] # Get rid of extra first frame + return f0 # Resized f0 + + +def compute_f0( + path: str, + f0_method: str, + fs: int, + hop: int, + f0_max: float, + f0_min: float, +): + x = load_audio(path, fs) + if f0_method == "harvest": + f0, t = pyworld.harvest( + x.astype(np.double), + fs=fs, + f0_ceil=f0_max, + f0_floor=f0_min, + frame_period=1000 * hop / fs, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, fs) + elif f0_method == "dio": + f0, t = pyworld.dio( + x.astype(np.double), + fs=fs, + f0_ceil=f0_max, + f0_floor=f0_min, + frame_period=1000 * hop / fs, + ) + f0 = pyworld.stonemask(x.astype(np.double), f0, t, fs) + elif f0_method == "mangio-crepe": + f0 = get_f0_crepe_computation(x, fs, f0_min, f0_max, 160, "full") + elif f0_method == "crepe": + f0 = get_f0_official_crepe_computation(x.astype(np.double), fs, f0_min, f0_max, "full") + return f0 + + +def coarse_f0(f0, f0_bin, f0_mel_min, f0_mel_max): + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * (f0_bin - 2) / ( + f0_mel_max - f0_mel_min + ) + 1 + + # use 0 or 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > f0_bin - 1] = f0_bin - 1 + f0_coarse = np.rint(f0_mel).astype(np.int) + assert f0_coarse.max() <= 255 and f0_coarse.min() >= 1, ( + f0_coarse.max(), + f0_coarse.min(), + ) + return f0_coarse + + +def processor(paths, f0_method, samplerate=16000, hop_size=160, process_id=0): + fs = samplerate + hop = hop_size + + f0_bin = 256 + f0_max = 1100.0 + f0_min = 50.0 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 
700) + if len(paths) != 0: + for idx, (inp_path, opt_path1, opt_path2) in enumerate( + tqdm(paths, position=1 + process_id) + ): + try: + if ( + os.path.exists(opt_path1 + ".npy") == True + and os.path.exists(opt_path2 + ".npy") == True + ): + continue + featur_pit = compute_f0(inp_path, f0_method, fs, hop, f0_max, f0_min) + np.save( + opt_path2, + featur_pit, + allow_pickle=False, + ) # nsf + coarse_pit = coarse_f0(featur_pit, f0_bin, f0_mel_min, f0_mel_max) + np.save( + opt_path1, + coarse_pit, + allow_pickle=False, + ) # ori + except: + print(f"f0 failed {idx}: {inp_path} {traceback.format_exc()}") + + +def run(training_dir: str, num_processes: int, f0_method: str): + paths = [] + dataset_dir = os.path.join(training_dir, "1_16k_wavs") + opt_dir_f0 = os.path.join(training_dir, "2a_f0") + opt_dir_f0_nsf = os.path.join(training_dir, "2b_f0nsf") + + if os.path.exists(opt_dir_f0) and os.path.exists(opt_dir_f0_nsf): + return + + os.makedirs(opt_dir_f0, exist_ok=True) + os.makedirs(opt_dir_f0_nsf, exist_ok=True) + + names = [] + + for pathname in sorted(list(os.listdir(dataset_dir))): + if os.path.isdir(os.path.join(dataset_dir, pathname)): + for f in sorted(list(os.listdir(os.path.join(dataset_dir, pathname)))): + if "spec" in f: + continue + names.append(os.path.join(pathname, f)) + else: + names.append(pathname) + + for name in names: # dataset_dir/{05d}/file.ext + filepath = os.path.join(dataset_dir, name) + if "spec" in filepath: + continue + opt_filepath_f0 = os.path.join(opt_dir_f0, name) + opt_filepath_f0_nsf = os.path.join(opt_dir_f0_nsf, name) + paths.append([filepath, opt_filepath_f0, opt_filepath_f0_nsf]) + + for dir in set([(os.path.dirname(p[1]), os.path.dirname(p[2])) for p in paths]): + os.makedirs(dir[0], exist_ok=True) + os.makedirs(dir[1], exist_ok=True) + + with ProcessPoolExecutor(mp_context=mp.get_context("spawn")) as executer: + for i in range(num_processes): + executer.submit(processor, paths[i::num_processes], f0_method, process_id=i) + + processor(paths, f0_method) diff --git a/lib/rvc/preprocessing/extract_feature.py b/lib/rvc/preprocessing/extract_feature.py new file mode 100644 index 0000000000000000000000000000000000000000..328c083832a2a46c677ec26b41095ac4c0c7b68e --- /dev/null +++ b/lib/rvc/preprocessing/extract_feature.py @@ -0,0 +1,217 @@ +import multiprocessing as mp +import os +import traceback +from concurrent.futures import ProcessPoolExecutor +from typing import * + +import numpy as np +import soundfile as sf +import torch +import torch.nn.functional as F +from fairseq import checkpoint_utils +from tqdm import tqdm + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) +MODELS_DIR = os.path.join(ROOT_DIR, "models") +EMBEDDINGS_LIST = { + "hubert-base-japanese": ( + "rinna_hubert_base_jp.pt", + "hubert-base-japanese", + "local", + ), + "contentvec": ("checkpoint_best_legacy_500.pt", "contentvec", "local"), +} + +def get_embedder(embedder_name): + if embedder_name in EMBEDDINGS_LIST: + return EMBEDDINGS_LIST[embedder_name] + return None + + +def load_embedder(embedder_path: str, device): + try: + models, cfg, _ = checkpoint_utils.load_model_ensemble_and_task( + [embedder_path], + suffix="", + ) + embedder_model = models[0] + embedder_model = embedder_model.to(device) + if device != "cpu": + embedder_model = embedder_model.half() + else: + embedder_model = embedder_model.float() + embedder_model.eval() + except Exception as e: + print(f"Error: {e} {embedder_path}") + traceback.print_exc() + + return 
embedder_model, cfg + + +# wave must be 16k, hop_size=320 +def readwave(wav_path, normalize=False): + wav, sr = sf.read(wav_path) + assert sr == 16000 + feats = torch.from_numpy(wav).float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + if normalize: + with torch.no_grad(): + feats = F.layer_norm(feats, feats.shape) + feats = feats.view(1, -1) + return feats + + +def processor( + todo: List[str], + device: torch.device, + embedder_path: str, + embedder_load_from: str, + embedding_channel: bool, + embedding_output_layer: int, + wav_dir: str, + out_dir: str, + process_id: int, +): + half_support = ( + device.type == "cuda" and torch.cuda.get_device_capability(device)[0] >= 5.3 + ) + is_feats_dim_768 = embedding_channel == 768 + + if embedder_load_from == "local" and not os.path.exists(embedder_path): + return f"Embedder not found: {embedder_path}" + + model, cfg = load_embedder(embedder_path, device) + + for file in tqdm(todo, position=1 + process_id): + try: + if file.endswith(".wav"): + wav_filepath = os.path.join(wav_dir, file) + out_filepath = os.path.join(out_dir, file.replace("wav", "npy")) + + if os.path.exists(out_filepath): + continue + + os.makedirs(os.path.dirname(out_filepath), exist_ok=True) + + is_normalize = False if cfg is None else cfg.task.normalize + feats = readwave(wav_filepath, normalize=is_normalize) + padding_mask = torch.BoolTensor(feats.shape).fill_(False) + if isinstance(model, tuple): + feats = model[0]( + feats.squeeze(0).squeeze(0).to(device), + return_tensors="pt", + sampling_rate=16000, + ) + if half_support: + feats = feats.input_values.to(device).half() + else: + feats = feats.input_values.to(device).float() + + with torch.no_grad(): + if half_support: + if is_feats_dim_768: + feats = model[1](feats).last_hidden_state + else: + feats = model[1](feats).extract_features + else: + if is_feats_dim_768: + feats = model[1].float()(feats).last_hidden_state + else: + feats = model[1].float()(feats).extract_features + else: + inputs = { + "source": feats.half().to(device) + if half_support + else feats.to(device), + "padding_mask": padding_mask.to(device), + "output_layer": embedding_output_layer, + } + + # なんかまだこの時点でfloat16なので改めて変換 + if not half_support: + model = model.float() + inputs["source"] = inputs["source"].float() + + with torch.no_grad(): + logits = model.extract_features(**inputs) + if is_feats_dim_768: + feats = logits[0] + else: + feats = model.final_proj(logits[0]) + + feats = feats.squeeze(0).float().cpu().numpy() + if np.isnan(feats).sum() == 0: + np.save(out_filepath, feats, allow_pickle=False) + else: + print(f"{file} contains nan") + except Exception as e: + print(f"Error: {e} {file}") + traceback.print_exc() + + +def run( + training_dir: str, + embedder_path: str, + embedder_load_from: str, + embedding_channel: int, + embedding_output_layer: int, + gpu_ids: List[int], + device: Optional[Union[torch.device, str]] = None, +): + wav_dir = os.path.join(training_dir, "1_16k_wavs") + out_dir = os.path.join(training_dir, "3_feature256") + + num_gpus = len(gpu_ids) + + for gpu_id in gpu_ids: + if num_gpus < gpu_id + 1: + print(f"GPU {gpu_id} is not available") + return + + if os.path.exists(out_dir): + return + + os.makedirs(out_dir, exist_ok=True) + + todo = [ + os.path.join(dir, f) + for dir in sorted(list(os.listdir(wav_dir))) + if os.path.isdir(os.path.join(wav_dir, dir)) + for f in sorted(list(os.listdir(os.path.join(wav_dir, dir)))) + ] + + if device is not None: + if type(device) == str: 
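+            # Accept either a torch.device or a plain string such as "cuda:0"; it is
+            # normalized to a torch.device so the MPS check below can inspect device.type.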
+ device = torch.device(device) + if device.type == "mps": + device = torch.device( + "cpu" + ) # Mac(MPS) crashes when multiprocess, so change to CPU. + processor( + todo, + device, + embedder_path, + embedder_load_from, + embedding_channel, + embedding_output_layer, + wav_dir, + out_dir, + process_id=0, + ) + else: + with ProcessPoolExecutor(mp_context=mp.get_context("spawn")) as executor: + for i, id in enumerate(gpu_ids): + executor.submit( + processor, + todo[i::num_gpus], + torch.device(f"cuda:{id}"), + embedder_path, + embedder_load_from, + embedding_channel, + embedding_output_layer, + wav_dir, + out_dir, + process_id=i, + ) diff --git a/lib/rvc/preprocessing/slicer.py b/lib/rvc/preprocessing/slicer.py new file mode 100644 index 0000000000000000000000000000000000000000..3160332ecef412786ef7a134f123b9b2780989bd --- /dev/null +++ b/lib/rvc/preprocessing/slicer.py @@ -0,0 +1,179 @@ +import numpy as np + + +# This function is obtained from librosa. +def get_rms( + y, + frame_length=2048, + hop_length=512, + pad_mode="constant", +): + padding = (int(frame_length // 2), int(frame_length // 2)) + y = np.pad(y, padding, mode=pad_mode) + + axis = -1 + # put our new within-frame axis at the end for now + out_strides = y.strides + tuple([y.strides[axis]]) + # Reduce the shape on the framing axis + x_shape_trimmed = list(y.shape) + x_shape_trimmed[axis] -= frame_length - 1 + out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) + xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides) + if axis < 0: + target_axis = axis - 1 + else: + target_axis = axis + 1 + xw = np.moveaxis(xw, -1, target_axis) + # Downsample along the target axis + slices = [slice(None)] * xw.ndim + slices[axis] = slice(0, None, hop_length) + x = xw[tuple(slices)] + + # Calculate power + power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True) + + return np.sqrt(power) + + +class Slicer: + def __init__( + self, + sr: int, + threshold: float = -40.0, + min_length: int = 5000, + min_interval: int = 300, + hop_size: int = 20, + max_sil_kept: int = 5000, + ): + if not min_length >= min_interval >= hop_size: + raise ValueError( + "The following condition must be satisfied: min_length >= min_interval >= hop_size" + ) + if not max_sil_kept >= hop_size: + raise ValueError( + "The following condition must be satisfied: max_sil_kept >= hop_size" + ) + min_interval = sr * min_interval / 1000 + self.threshold = 10 ** (threshold / 20.0) + self.hop_size = round(sr * hop_size / 1000) + self.win_size = min(round(min_interval), 4 * self.hop_size) + self.min_length = round(sr * min_length / 1000 / self.hop_size) + self.min_interval = round(min_interval / self.hop_size) + self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size) + + def _apply_slice(self, waveform, begin, end): + if len(waveform.shape) > 1: + return waveform[ + :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size) + ] + else: + return waveform[ + begin * self.hop_size : min(waveform.shape[0], end * self.hop_size) + ] + + # @timeit + def slice(self, waveform): + if len(waveform.shape) > 1: + samples = waveform.mean(axis=0) + else: + samples = waveform + if samples.shape[0] <= self.min_length: + return [waveform] + rms_list = get_rms( + y=samples, frame_length=self.win_size, hop_length=self.hop_size + ).squeeze(0) + sil_tags = [] + silence_start = None + clip_start = 0 + for i, rms in enumerate(rms_list): + # Keep looping while frame is silent. + if rms < self.threshold: + # Record start of silent frames. 
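+                # Only the first sub-threshold frame of a run is remembered; the branches
+                # further down decide whether this silent stretch actually becomes a cut point.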
+ if silence_start is None: + silence_start = i + continue + # Keep looping while frame is not silent and silence start has not been recorded. + if silence_start is None: + continue + # Clear recorded silence start if interval is not enough or clip is too short + is_leading_silence = silence_start == 0 and i > self.max_sil_kept + need_slice_middle = ( + i - silence_start >= self.min_interval + and i - clip_start >= self.min_length + ) + if not is_leading_silence and not need_slice_middle: + silence_start = None + continue + # Need slicing. Record the range of silent frames to be removed. + if i - silence_start <= self.max_sil_kept: + pos = rms_list[silence_start : i + 1].argmin() + silence_start + if silence_start == 0: + sil_tags.append((0, pos)) + else: + sil_tags.append((pos, pos)) + clip_start = pos + elif i - silence_start <= self.max_sil_kept * 2: + pos = rms_list[ + i - self.max_sil_kept : silence_start + self.max_sil_kept + 1 + ].argmin() + pos += i - self.max_sil_kept + pos_l = ( + rms_list[ + silence_start : silence_start + self.max_sil_kept + 1 + ].argmin() + + silence_start + ) + pos_r = ( + rms_list[i - self.max_sil_kept : i + 1].argmin() + + i + - self.max_sil_kept + ) + if silence_start == 0: + sil_tags.append((0, pos_r)) + clip_start = pos_r + else: + sil_tags.append((min(pos_l, pos), max(pos_r, pos))) + clip_start = max(pos_r, pos) + else: + pos_l = ( + rms_list[ + silence_start : silence_start + self.max_sil_kept + 1 + ].argmin() + + silence_start + ) + pos_r = ( + rms_list[i - self.max_sil_kept : i + 1].argmin() + + i + - self.max_sil_kept + ) + if silence_start == 0: + sil_tags.append((0, pos_r)) + else: + sil_tags.append((pos_l, pos_r)) + clip_start = pos_r + silence_start = None + # Deal with trailing silence. + total_frames = rms_list.shape[0] + if ( + silence_start is not None + and total_frames - silence_start >= self.min_interval + ): + silence_end = min(total_frames, silence_start + self.max_sil_kept) + pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start + sil_tags.append((pos, total_frames + 1)) + # Apply and return slices. 
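+        # sil_tags holds (start, end) frame indices of silence to drop; the kept chunks are the
+        # spans between consecutive tags, plus any audio before the first and after the last tag.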
+ if len(sil_tags) == 0: + return [waveform] + else: + chunks = [] + if sil_tags[0][0] > 0: + chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0])) + for i in range(len(sil_tags) - 1): + chunks.append( + self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]) + ) + if sil_tags[-1][1] < total_frames: + chunks.append( + self._apply_slice(waveform, sil_tags[-1][1], total_frames) + ) + return chunks diff --git a/lib/rvc/preprocessing/split.py b/lib/rvc/preprocessing/split.py new file mode 100644 index 0000000000000000000000000000000000000000..50b09795fbb3adef161015526d0bc356332cf688 --- /dev/null +++ b/lib/rvc/preprocessing/split.py @@ -0,0 +1,195 @@ +import operator +import os +from concurrent.futures import ProcessPoolExecutor +from typing import * + +import librosa +import numpy as np +import scipy.signal as signal +from scipy.io import wavfile +from tqdm import tqdm + +from lib.rvc.utils import load_audio + +from .slicer import Slicer + + +def norm_write( + tmp_audio: np.ndarray, + idx0: int, + idx1: int, + speaker_id: int, + outdir: str, + outdir_16k: str, + sampling_rate: int, + max: float, + alpha: float, + is_normalize: bool, +): + if is_normalize: + tmp_audio = (tmp_audio / np.abs(tmp_audio).max() * (max * alpha)) + ( + 1 - alpha + ) * tmp_audio + else: + # clip level to max (cause sometimes when floating point decoding) + audio_min = np.min(tmp_audio) + if audio_min < -max: + tmp_audio = tmp_audio / -audio_min * max + audio_max = np.max(tmp_audio) + if audio_max > max: + tmp_audio = tmp_audio / audio_max * max + + wavfile.write( + os.path.join(outdir, f"{speaker_id:05}", f"{idx0}_{idx1}.wav"), + sampling_rate, + tmp_audio.astype(np.float32), + ) + + tmp_audio = librosa.resample( + tmp_audio, orig_sr=sampling_rate, target_sr=16000, res_type="soxr_vhq" + ) + wavfile.write( + os.path.join(outdir_16k, f"{speaker_id:05}", f"{idx0}_{idx1}.wav"), + 16000, + tmp_audio.astype(np.float32), + ) + + +def write_mute( + mute_wave_filename: str, + speaker_id: int, + outdir: str, + outdir_16k: str, + sampling_rate: int, +): + tmp_audio = load_audio(mute_wave_filename, sampling_rate) + wavfile.write( + os.path.join(outdir, f"{speaker_id:05}", "mute.wav"), + sampling_rate, + tmp_audio.astype(np.float32), + ) + tmp_audio = librosa.resample( + tmp_audio, orig_sr=sampling_rate, target_sr=16000, res_type="soxr_vhq" + ) + wavfile.write( + os.path.join(outdir_16k, f"{speaker_id:05}", "mute.wav"), + 16000, + tmp_audio.astype(np.float32), + ) + + +def pipeline( + slicer: Slicer, + datasets: List[Tuple[str, int]], # List[(path, speaker_id)] + outdir: str, + outdir_16k: str, + sampling_rate: int, + is_normalize: bool, + process_id: int = 0, +): + per = 3.7 + overlap = 0.3 + tail = per + overlap + max = 0.95 + alpha = 0.8 + + bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=sampling_rate) + + for index, (wave_filename, speaker_id) in tqdm(datasets, position=1 + process_id): + audio = load_audio(wave_filename, sampling_rate) + audio = signal.lfilter(bh, ah, audio) + + idx1 = 0 + for audio in slicer.slice(audio): + i = 0 + while 1: + start = int(sampling_rate * (per - overlap) * i) + i += 1 + if len(audio[start:]) > tail * sampling_rate: + tmp_audio = audio[start : start + int(per * sampling_rate)] + norm_write( + tmp_audio, + index, + idx1, + speaker_id, + outdir, + outdir_16k, + sampling_rate, + max, + alpha, + is_normalize, + ) + idx1 += 1 + else: + tmp_audio = audio[start:] + break + norm_write( + tmp_audio, + index, + idx1, + speaker_id, + outdir, + outdir_16k, + sampling_rate, + max, + 
alpha, + is_normalize, + ) + idx1 += 1 + + +def preprocess_audio( + datasets: List[Tuple[str, int]], # List[(path, speaker_id)] + sampling_rate: int, + num_processes: int, + training_dir: str, + is_normalize: bool, + mute_wav_path: str, +): + waves_dir = os.path.join(training_dir, "0_gt_wavs") + waves16k_dir = os.path.join(training_dir, "1_16k_wavs") + if os.path.exists(waves_dir) and os.path.exists(waves16k_dir): + return + + for speaker_id in set([spk for _, spk in datasets]): + os.makedirs(os.path.join(waves_dir, f"{speaker_id:05}"), exist_ok=True) + os.makedirs(os.path.join(waves16k_dir, f"{speaker_id:05}"), exist_ok=True) + + all = [(i, x) for i, x in enumerate(sorted(datasets, key=operator.itemgetter(0)))] + + # n of datasets per process + process_all_nums = [len(all) // num_processes] * num_processes + # add residual datasets + for i in range(len(all) % num_processes): + process_all_nums[i] += 1 + + assert len(all) == sum(process_all_nums), print( + f"len(all): {len(all)}, sum(process_all_nums): {sum(process_all_nums)}" + ) + + with ProcessPoolExecutor(max_workers=num_processes) as executor: + all_index = 0 + for i in range(num_processes): + data = all[all_index : all_index + process_all_nums[i]] + slicer = Slicer( + sr=sampling_rate, + threshold=-42, + min_length=1500, + min_interval=400, + hop_size=15, + max_sil_kept=500, + ) + executor.submit( + pipeline, + slicer, + data, + waves_dir, + waves16k_dir, + sampling_rate, + is_normalize, + process_id=i, + ) + all_index += process_all_nums[i] + + for speaker_id in set([spk for _, spk in datasets]): + write_mute(mute_wav_path, speaker_id, waves_dir, waves16k_dir, sampling_rate) diff --git a/lib/rvc/train.py b/lib/rvc/train.py new file mode 100644 index 0000000000000000000000000000000000000000..fc2eaad791de481c5c8c7ad555a7888390b84747 --- /dev/null +++ b/lib/rvc/train.py @@ -0,0 +1,998 @@ +import glob +import json +import operator +import os +import shutil +import time +from random import shuffle +from typing import * + +import faiss +import numpy as np +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +import torchaudio +import tqdm +from sklearn.cluster import MiniBatchKMeans +from torch.cuda.amp import GradScaler, autocast +from torch.nn import functional as F +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +from . import commons, utils +from .checkpoints import save +from .config import DatasetMetadata, TrainConfig +from .data_utils import (DistributedBucketSampler, TextAudioCollate, + TextAudioCollateMultiNSFsid, TextAudioLoader, + TextAudioLoaderMultiNSFsid) +from .losses import discriminator_loss, feature_loss, generator_loss, kl_loss +from .mel_processing import mel_spectrogram_torch, spec_to_mel_torch +from .models import (MultiPeriodDiscriminator, SynthesizerTrnMs256NSFSid, + SynthesizerTrnMs256NSFSidNono) +from .preprocessing.extract_feature import (MODELS_DIR, get_embedder, + load_embedder) + + +def is_audio_file(file: str): + if "." 
not in file: + return False + ext = os.path.splitext(file)[1] + return ext.lower() in [ + ".wav", + ".flac", + ".ogg", + ".mp3", + ".m4a", + ".wma", + ".aiff", + ] + + +def glob_dataset( + glob_str: str, + speaker_id: int, + multiple_speakers: bool = False, + recursive: bool = True, + training_dir: str = ".", +): + globs = glob_str.split(",") + speaker_count = 0 + datasets_speakers = [] + speaker_to_id_mapping = {} + for glob_str in globs: + if os.path.isdir(glob_str): + if multiple_speakers: + # Multispeaker format: + # dataset_path/ + # - speakername/ + # - {wav name here}.wav + # - ... + # - next_speakername/ + # - {wav name here}.wav + # - ... + # - ... + print("Multispeaker dataset enabled; Processing speakers.") + for dir in tqdm.tqdm(os.listdir(glob_str)): + print("Speaker ID " + str(speaker_count) + ": " + dir) + speaker_to_id_mapping[dir] = speaker_count + speaker_path = glob_str + "/" + dir + for audio in tqdm.tqdm(os.listdir(speaker_path)): + if is_audio_file(glob_str + "/" + dir + "/" + audio): + datasets_speakers.append((glob_str + "/" + dir + "/" + audio, speaker_count)) + speaker_count += 1 + with open(os.path.join(training_dir, "speaker_info.json"), "w") as outfile: + print("Dumped speaker info to {}".format(os.path.join(training_dir, "speaker_info.json"))) + json.dump(speaker_to_id_mapping, outfile) + continue # Skip the normal speaker extend + + glob_str = os.path.join(glob_str, "**", "*") + print("Single speaker dataset enabled; Processing speaker as ID " + str(speaker_id) + ".") + datasets_speakers.extend( + [ + (file, speaker_id) + for file in glob.iglob(glob_str, recursive=recursive) + if is_audio_file(file) + ] + ) + + return sorted(datasets_speakers) + + +def create_dataset_meta(training_dir: str, f0: bool): + gt_wavs_dir = os.path.join(training_dir, "0_gt_wavs") + co256_dir = os.path.join(training_dir, "3_feature256") + + def list_data(dir: str): + files = [] + for subdir in os.listdir(dir): + speaker_dir = os.path.join(dir, subdir) + for name in os.listdir(speaker_dir): + files.append(os.path.join(subdir, name.split(".")[0])) + return files + + names = set(list_data(gt_wavs_dir)) & set(list_data(co256_dir)) + + if f0: + f0_dir = os.path.join(training_dir, "2a_f0") + f0nsf_dir = os.path.join(training_dir, "2b_f0nsf") + names = names & set(list_data(f0_dir)) & set(list_data(f0nsf_dir)) + + meta = { + "files": {}, + } + + for name in names: + speaker_id = os.path.dirname(name).split("_")[0] + speaker_id = int(speaker_id) if speaker_id.isdecimal() else 0 + if f0: + gt_wav_path = os.path.join(gt_wavs_dir, f"{name}.wav") + co256_path = os.path.join(co256_dir, f"{name}.npy") + f0_path = os.path.join(f0_dir, f"{name}.wav.npy") + f0nsf_path = os.path.join(f0nsf_dir, f"{name}.wav.npy") + meta["files"][name] = { + "gt_wav": gt_wav_path, + "co256": co256_path, + "f0": f0_path, + "f0nsf": f0nsf_path, + "speaker_id": speaker_id, + } + else: + gt_wav_path = os.path.join(gt_wavs_dir, f"{name}.wav") + co256_path = os.path.join(co256_dir, f"{name}.npy") + meta["files"][name] = { + "gt_wav": gt_wav_path, + "co256": co256_path, + "speaker_id": speaker_id, + } + + with open(os.path.join(training_dir, "meta.json"), "w") as f: + json.dump(meta, f, indent=2) + + +def change_speaker(net_g, speaker_info, embedder, embedding_output_layer, phone, phone_lengths, pitch, pitchf, spec_lengths): + """ + random change formant + inspired by https://github.com/auspicious3000/contentvec/blob/d746688a32940f4bee410ed7c87ec9cf8ff04f74/contentvec/data/audio/audio_utils_1.py#L179 + """ + N = 
phone.shape[0] + device = phone.device + dtype = phone.dtype + + f0_bin = 256 + f0_max = 1100.0 + f0_min = 50.0 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + + pitch_median = torch.median(pitchf, 1).values + lo = 75. + 25. * (pitch_median >= 200).to(dtype=dtype) + hi = 250. + 150. * (pitch_median >= 200).to(dtype=dtype) + pitch_median = torch.clip(pitch_median, lo, hi).unsqueeze(1) + + shift_pitch = torch.exp2((1. - 2. * torch.rand(N)) / 4).unsqueeze(1).to(device, dtype) # ピッチを半オクターブの範囲でずらす + + new_sid = np.random.choice(np.arange(len(speaker_info))[speaker_info > 0], size=N) + rel_pitch = pitchf / pitch_median + new_pitch_median = torch.from_numpy(speaker_info[new_sid]).to(device, dtype).unsqueeze(1) * shift_pitch + new_pitchf = new_pitch_median * rel_pitch + new_sid = torch.from_numpy(new_sid).to(device) + + new_pitch = 1127. * torch.log(1. + new_pitchf / 700.) + new_pitch = (pitch - f0_mel_min) * (f0_bin - 2.) / (f0_mel_max - f0_mel_min) + 1. + new_pitch = torch.clip(new_pitch, 1, f0_bin - 1).to(dtype=torch.int) + + aug_wave = net_g.infer(phone, phone_lengths, new_pitch, new_pitchf, new_sid)[0] + aug_wave_16k = torchaudio.functional.resample(aug_wave, net_g.sr, 16000, rolloff=0.99).squeeze(1) + padding_mask = torch.arange(aug_wave_16k.shape[1]).unsqueeze(0).to(device) > (spec_lengths.unsqueeze(1) * 160).to(device) + + inputs = { + "source": aug_wave_16k.to(device, dtype), + "padding_mask": padding_mask.to(device), + "output_layer": embedding_output_layer + } + logits = embedder.extract_features(**inputs) + if phone.shape[-1] == 768: + feats = logits[0] + else: + feats = embedder.final_proj(logits[0]) + feats = torch.repeat_interleave(feats, 2, 1) + new_phone = torch.zeros(phone.shape).to(device, dtype) + new_phone[:, :feats.shape[1]] = feats[:, :phone.shape[1]] + return new_phone.to(device), aug_wave + + +def change_speaker_nono(net_g, embedder, embedding_output_layer, phone, phone_lengths, spec_lengths): + """ + random change formant + inspired by https://github.com/auspicious3000/contentvec/blob/d746688a32940f4bee410ed7c87ec9cf8ff04f74/contentvec/data/audio/audio_utils_1.py#L179 + """ + N = phone.shape[0] + device = phone.device + dtype = phone.dtype + + new_sid = np.random.randint(net_g.spk_embed_dim, size=N) + new_sid = torch.from_numpy(new_sid).to(device) + + aug_wave = net_g.infer(phone, phone_lengths, new_sid)[0] + aug_wave_16k = torchaudio.functional.resample(aug_wave, net_g.sr, 16000, rolloff=0.99).squeeze(1) + padding_mask = torch.arange(aug_wave_16k.shape[1]).unsqueeze(0).to(device) > (spec_lengths.unsqueeze(1) * 160).to(device) + + inputs = { + "source": aug_wave_16k.to(device, dtype), + "padding_mask": padding_mask.to(device), + "output_layer": embedding_output_layer + } + + logits = embedder.extract_features(**inputs) + if phone.shape[-1] == 768: + feats = logits[0] + else: + feats = embedder.final_proj(logits[0]) + feats = torch.repeat_interleave(feats, 2, 1) + new_phone = torch.zeros(phone.shape).to(device, dtype) + new_phone[:, :feats.shape[1]] = feats[:, :phone.shape[1]] + return new_phone.to(device), aug_wave + + +def train_index( + training_dir: str, + model_name: str, + out_dir: str, + emb_ch: int, + num_cpu_process: int, + maximum_index_size: Optional[int], +): + checkpoint_path = os.path.join(out_dir, model_name) + feature_256_dir = os.path.join(training_dir, "3_feature256") + index_dir = os.path.join(os.path.dirname(checkpoint_path), f"{model_name}_index") + os.makedirs(index_dir, exist_ok=True) + for speaker_id 
in tqdm.tqdm( + sorted([dir for dir in os.listdir(feature_256_dir) if dir.isdecimal()]) + ): + feature_256_spk_dir = os.path.join(feature_256_dir, speaker_id) + speaker_id = int(speaker_id) + npys = [] + for name in [ + os.path.join(feature_256_spk_dir, file) + for file in os.listdir(feature_256_spk_dir) + if file.endswith(".npy") + ]: + phone = np.load(os.path.join(feature_256_spk_dir, name)) + npys.append(phone) + + # shuffle big_npy to prevent reproducing the sound source + big_npy = np.concatenate(npys, 0) + big_npy_idx = np.arange(big_npy.shape[0]) + np.random.shuffle(big_npy_idx) + big_npy = big_npy[big_npy_idx] + + if not maximum_index_size is None and big_npy.shape[0] > maximum_index_size: + kmeans = MiniBatchKMeans( + n_clusters=maximum_index_size, + batch_size=256 * num_cpu_process, + init="random", + compute_labels=False, + ) + kmeans.fit(big_npy) + big_npy = kmeans.cluster_centers_ + + # recommend parameter in https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index + emb_ch = big_npy.shape[1] + emb_ch_half = emb_ch // 2 + n_ivf = int(8 * np.sqrt(big_npy.shape[0])) + if big_npy.shape[0] >= 1_000_000: + index = faiss.index_factory( + emb_ch, f"IVF{n_ivf},PQ{emb_ch_half}x4fsr,RFlat" + ) + else: + index = faiss.index_factory(emb_ch, f"IVF{n_ivf},Flat") + + index.train(big_npy) + batch_size_add = 8192 + for i in range(0, big_npy.shape[0], batch_size_add): + index.add(big_npy[i : i + batch_size_add]) + np.save( + os.path.join(index_dir, f"{model_name}.{speaker_id}.big.npy"), + big_npy, + ) + faiss.write_index( + index, + os.path.join(index_dir, f"{model_name}.{speaker_id}.index"), + ) + + +def train_model( + gpus: List[int], + config: TrainConfig, + training_dir: str, + model_name: str, + out_dir: str, + sample_rate: int, + f0: bool, + batch_size: int, + augment: bool, + augment_path: Optional[str], + speaker_info_path: Optional[str], + cache_batch: bool, + total_epoch: int, + save_every_epoch: int, + save_wav_with_checkpoint: bool, + pretrain_g: str, + pretrain_d: str, + embedder_name: str, + embedding_output_layer: int, + save_only_last: bool = False, + device: Optional[Union[str, torch.device]] = None, +): + os.environ["MASTER_ADDR"] = "localhost" + os.environ["MASTER_PORT"] = str(utils.find_empty_port()) + + deterministic = torch.backends.cudnn.deterministic + benchmark = torch.backends.cudnn.benchmark + PREV_CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None) + + torch.backends.cudnn.deterministic = False + torch.backends.cudnn.benchmark = False + + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(gpu) for gpu in gpus]) + + start = time.perf_counter() + + # Mac(MPS)でやると、mp.spawnでなんかトラブルが出るので普通にtraining_runnerを呼び出す。 + if device is not None: + training_runner( + 0, # rank + 1, # world size + config, + training_dir, + model_name, + out_dir, + sample_rate, + f0, + batch_size, + augment, + augment_path, + speaker_info_path, + cache_batch, + total_epoch, + save_every_epoch, + save_wav_with_checkpoint, + pretrain_g, + pretrain_d, + embedder_name, + embedding_output_layer, + save_only_last, + device, + ) + else: + mp.spawn( + training_runner, + nprocs=len(gpus), + args=( + len(gpus), + config, + training_dir, + model_name, + out_dir, + sample_rate, + f0, + batch_size, + augment, + augment_path, + speaker_info_path, + cache_batch, + total_epoch, + save_every_epoch, + save_wav_with_checkpoint, + pretrain_g, + pretrain_d, + embedder_name, + embedding_output_layer, + save_only_last, + device, + ), + ) + + end = time.perf_counter() + + print(f"Time: 
{end - start}") + + if PREV_CUDA_VISIBLE_DEVICES is None: + del os.environ["CUDA_VISIBLE_DEVICES"] + else: + os.environ["CUDA_VISIBLE_DEVICES"] = PREV_CUDA_VISIBLE_DEVICES + + torch.backends.cudnn.deterministic = deterministic + torch.backends.cudnn.benchmark = benchmark + + +def training_runner( + rank: int, + world_size: List[int], + config: TrainConfig, + training_dir: str, + model_name: str, + out_dir: str, + sample_rate: int, + f0: bool, + batch_size: int, + augment: bool, + augment_path: Optional[str], + speaker_info_path: Optional[str], + cache_in_gpu: bool, + total_epoch: int, + save_every_epoch: int, + save_wav_with_checkpoint: bool, + pretrain_g: str, + pretrain_d: str, + embedder_name: str, + embedding_output_layer: int, + save_only_last: bool = False, + device: Optional[Union[str, torch.device]] = None, +): + config.train.batch_size = batch_size + log_dir = os.path.join(training_dir, "logs") + state_dir = os.path.join(training_dir, "state") + training_files_path = os.path.join(training_dir, "meta.json") + training_meta = DatasetMetadata.parse_file(training_files_path) + embedder_out_channels = config.model.emb_channels + + is_multi_process = world_size > 1 + + if device is not None: + if type(device) == str: + device = torch.device(device) + + global_step = 0 + is_main_process = rank == 0 + + if is_main_process: + os.makedirs(log_dir, exist_ok=True) + os.makedirs(state_dir, exist_ok=True) + writer = SummaryWriter(log_dir=log_dir) + + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + if not dist.is_initialized(): + dist.init_process_group( + backend="gloo", init_method="env://", rank=rank, world_size=world_size + ) + + if is_multi_process: + torch.cuda.set_device(rank) + + torch.manual_seed(config.train.seed) + + if f0: + train_dataset = TextAudioLoaderMultiNSFsid(training_meta, config.data) + else: + train_dataset = TextAudioLoader(training_meta, config.data) + + train_sampler = DistributedBucketSampler( + train_dataset, + config.train.batch_size * world_size, + [100, 200, 300, 400, 500, 600, 700, 800, 900], + num_replicas=world_size, + rank=rank, + shuffle=True, + ) + + if f0: + collate_fn = TextAudioCollateMultiNSFsid() + else: + collate_fn = TextAudioCollate() + + train_loader = DataLoader( + train_dataset, + num_workers=4, + shuffle=False, + pin_memory=True, + collate_fn=collate_fn, + batch_sampler=train_sampler, + persistent_workers=True, + prefetch_factor=8, + ) + speaker_info = None + if os.path.exists(os.path.join(training_dir, "speaker_info.json")): + with open(os.path.join(training_dir, "speaker_info.json"), "r") as f: + speaker_info = json.load(f) + config.model.spk_embed_dim = len(speaker_info) + if f0: + net_g = SynthesizerTrnMs256NSFSid( + config.data.filter_length // 2 + 1, + config.train.segment_size // config.data.hop_length, + **config.model.dict(), + is_half=False, # config.train.fp16_run, + sr=int(sample_rate[:-1] + "000"), + ) + else: + net_g = SynthesizerTrnMs256NSFSidNono( + config.data.filter_length // 2 + 1, + config.train.segment_size // config.data.hop_length, + **config.model.dict(), + is_half=False, # config.train.fp16_run, + sr=int(sample_rate[:-1] + "000"), + ) + + if is_multi_process: + net_g = net_g.cuda(rank) + else: + net_g = net_g.to(device=device) + + if config.version == "v1": + periods = [2, 3, 5, 7, 11, 17] + elif config.version == "v2": + periods = [2, 3, 5, 7, 11, 17, 23, 37] + net_d = MultiPeriodDiscriminator(config.model.use_spectral_norm, periods=periods) + if is_multi_process: + net_d = net_d.cuda(rank) + else: + net_d = 
net_d.to(device=device) + + optim_g = torch.optim.AdamW( + net_g.parameters(), + config.train.learning_rate, + betas=config.train.betas, + eps=config.train.eps, + ) + optim_d = torch.optim.AdamW( + net_d.parameters(), + config.train.learning_rate, + betas=config.train.betas, + eps=config.train.eps, + ) + + last_d_state = utils.latest_checkpoint_path(state_dir, "D_*.pth") + last_g_state = utils.latest_checkpoint_path(state_dir, "G_*.pth") + + if last_d_state is None or last_g_state is None: + epoch = 1 + global_step = 0 + if os.path.exists(pretrain_g) and os.path.exists(pretrain_d): + net_g_state = torch.load(pretrain_g, map_location="cpu")["model"] + emb_spk_size = (config.model.spk_embed_dim, config.model.gin_channels) + emb_phone_size = (config.model.hidden_channels, config.model.emb_channels) + if emb_spk_size != net_g_state["emb_g.weight"].size(): + original_weight = net_g_state["emb_g.weight"] + net_g_state["emb_g.weight"] = original_weight.mean(dim=0, keepdims=True) * torch.ones(emb_spk_size, device=original_weight.device, dtype=original_weight.dtype) + if emb_phone_size != net_g_state["enc_p.emb_phone.weight"].size(): + # interpolate + orig_shape = net_g_state["enc_p.emb_phone.weight"].size() + if net_g_state["enc_p.emb_phone.weight"].dtype == torch.half: + net_g_state["enc_p.emb_phone.weight"] = ( + F.interpolate( + net_g_state["enc_p.emb_phone.weight"] + .float() + .unsqueeze(0) + .unsqueeze(0), + size=emb_phone_size, + mode="bilinear", + ) + .half() + .squeeze(0) + .squeeze(0) + ) + else: + net_g_state["enc_p.emb_phone.weight"] = ( + F.interpolate( + net_g_state["enc_p.emb_phone.weight"] + .unsqueeze(0) + .unsqueeze(0), + size=emb_phone_size, + mode="bilinear", + ) + .squeeze(0) + .squeeze(0) + ) + print( + "interpolated pretrained state enc_p.emb_phone from", + orig_shape, + "to", + emb_phone_size, + ) + if is_multi_process: + net_g.module.load_state_dict(net_g_state) + else: + net_g.load_state_dict(net_g_state) + + del net_g_state + + if is_multi_process: + net_d.module.load_state_dict( + torch.load(pretrain_d, map_location="cpu")["model"] + ) + else: + net_d.load_state_dict( + torch.load(pretrain_d, map_location="cpu")["model"] + ) + if is_main_process: + print(f"loaded pretrained {pretrain_g} {pretrain_d}") + + else: + _, _, _, epoch = utils.load_checkpoint(last_d_state, net_d, optim_d) + _, _, _, epoch = utils.load_checkpoint(last_g_state, net_g, optim_g) + if is_main_process: + print(f"loaded last state {last_d_state} {last_g_state}") + + epoch += 1 + global_step = (epoch - 1) * len(train_loader) + + if augment: + # load embedder + embedder_filepath, _, embedder_load_from = get_embedder(embedder_name) + + if embedder_load_from == "local": + embedder_filepath = os.path.join( + MODELS_DIR, "embeddings", embedder_filepath + ) + embedder, _ = load_embedder(embedder_filepath, device) + if not config.train.fp16_run: + embedder = embedder.float() + + if (augment_path is not None): + state_dict = torch.load(augment_path, map_location="cpu") + if state_dict["f0"] == 1: + augment_net_g = SynthesizerTrnMs256NSFSid( + **state_dict["params"], is_half=config.train.fp16_run + ) + augment_speaker_info = np.load(speaker_info_path) + else: + augment_net_g = SynthesizerTrnMs256NSFSidNono( + **state_dict["params"], is_half=config.train.fp16_run + ) + + augment_net_g.load_state_dict(state_dict["weight"], strict=False) + augment_net_g.eval().to(device) + + else: + augment_net_g = net_g + if f0: + medians = [[] for _ in range(augment_net_g.spk_embed_dim)] + for file in 
training_meta.files.values(): + f0f = np.load(file.f0nsf) + if np.any(f0f > 0): + medians[file.speaker_id].append(np.median(f0f[f0f > 0])) + augment_speaker_info = np.array([np.median(x) if len(x) else 0. for x in medians]) + np.save(os.path.join(training_dir, "speaker_info.npy"), augment_speaker_info) + + if is_multi_process: + net_g = DDP(net_g, device_ids=[rank]) + net_d = DDP(net_d, device_ids=[rank]) + + scheduler_g = torch.optim.lr_scheduler.ExponentialLR( + optim_g, gamma=config.train.lr_decay, last_epoch=epoch - 2 + ) + scheduler_d = torch.optim.lr_scheduler.ExponentialLR( + optim_d, gamma=config.train.lr_decay, last_epoch=epoch - 2 + ) + + scaler = GradScaler(enabled=config.train.fp16_run) + + cache = [] + progress_bar = tqdm.tqdm(range((total_epoch - epoch + 1) * len(train_loader))) + progress_bar.set_postfix(epoch=epoch) + step = -1 + len(train_loader) * (epoch - 1) + for epoch in range(epoch, total_epoch + 1): + train_loader.batch_sampler.set_epoch(epoch) + + net_g.train() + net_d.train() + + use_cache = len(cache) == len(train_loader) + data = cache if use_cache else enumerate(train_loader) + + if is_main_process: + lr = optim_g.param_groups[0]["lr"] + + if use_cache: + shuffle(cache) + + for batch_idx, batch in data: + step += 1 + progress_bar.update(1) + if f0: + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = batch + else: + ( + phone, + phone_lengths, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ) = batch + + if not use_cache: + phone, phone_lengths = ( + phone.to(device=device, non_blocking=True), + phone_lengths.to(device=device, non_blocking=True), + ) + if f0: + pitch, pitchf = ( + pitch.to(device=device, non_blocking=True), + pitchf.to(device=device, non_blocking=True), + ) + sid = sid.to(device=device, non_blocking=True) + spec, spec_lengths = ( + spec.to(device=device, non_blocking=True), + spec_lengths.to(device=device, non_blocking=True), + ) + wave, wave_lengths = ( + wave.to(device=device, non_blocking=True), + wave_lengths.to(device=device, non_blocking=True), + ) + if cache_in_gpu: + if f0: + cache.append( + ( + batch_idx, + ( + phone, + phone_lengths, + pitch, + pitchf, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ), + ) + ) + else: + cache.append( + ( + batch_idx, + ( + phone, + phone_lengths, + spec, + spec_lengths, + wave, + wave_lengths, + sid, + ), + ) + ) + + with autocast(enabled=config.train.fp16_run): + if augment: + with torch.no_grad(): + if type(augment_net_g) == SynthesizerTrnMs256NSFSid: + new_phone, aug_wave = change_speaker(augment_net_g, augment_speaker_info, embedder, embedding_output_layer, phone, phone_lengths, pitch, pitchf, spec_lengths) + else: + new_phone, aug_wave = change_speaker_nono(augment_net_g, embedder, embedding_output_layer, phone, phone_lengths, spec_lengths) + weight = np.power(.5, step / len(train_loader)) # 学習の初期はそのままのphone embeddingを使う + phone = phone * weight + new_phone * (1. 
- weight) + + if f0: + ( + y_hat, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + ) = net_g( + phone, phone_lengths, pitch, pitchf, spec, spec_lengths, sid + ) + else: + ( + y_hat, + ids_slice, + x_mask, + z_mask, + (z, z_p, m_p, logs_p, m_q, logs_q), + ) = net_g(phone, phone_lengths, spec, spec_lengths, sid) + mel = spec_to_mel_torch( + spec, + config.data.filter_length, + config.data.n_mel_channels, + config.data.sampling_rate, + config.data.mel_fmin, + config.data.mel_fmax, + ) + y_mel = commons.slice_segments( + mel, ids_slice, config.train.segment_size // config.data.hop_length + ) + with autocast(enabled=False): + y_hat_mel = mel_spectrogram_torch( + y_hat.float().squeeze(1), + config.data.filter_length, + config.data.n_mel_channels, + config.data.sampling_rate, + config.data.hop_length, + config.data.win_length, + config.data.mel_fmin, + config.data.mel_fmax, + ) + if config.train.fp16_run == True and device != torch.device("mps"): + y_hat_mel = y_hat_mel.half() + wave_slice = commons.slice_segments( + wave, ids_slice * config.data.hop_length, config.train.segment_size + ) # slice + + # Discriminator + y_d_hat_r, y_d_hat_g, _, _ = net_d(wave_slice, y_hat.detach()) + with autocast(enabled=False): + loss_disc, losses_disc_r, losses_disc_g = discriminator_loss( + y_d_hat_r, y_d_hat_g + ) + optim_d.zero_grad() + scaler.scale(loss_disc).backward() + scaler.unscale_(optim_d) + grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None) + scaler.step(optim_d) + + with autocast(enabled=config.train.fp16_run): + # Generator + y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(wave_slice, y_hat) + with autocast(enabled=False): + loss_mel = F.l1_loss(y_mel, y_hat_mel) * config.train.c_mel + loss_kl = ( + kl_loss(z_p, logs_q, m_p, logs_p, z_mask) * config.train.c_kl + ) + loss_fm = feature_loss(fmap_r, fmap_g) + loss_gen, losses_gen = generator_loss(y_d_hat_g) + loss_gen_all = loss_gen + loss_fm + loss_mel + loss_kl + optim_g.zero_grad() + scaler.scale(loss_gen_all).backward() + scaler.unscale_(optim_g) + grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None) + scaler.step(optim_g) + scaler.update() + + if is_main_process: + progress_bar.set_postfix( + epoch=epoch, + loss_g=float(loss_gen_all) if loss_gen_all is not None else 0.0, + loss_d=float(loss_disc) if loss_disc is not None else 0.0, + lr=float(lr) if lr is not None else 0.0, + use_cache=use_cache, + ) + if global_step % config.train.log_interval == 0: + lr = optim_g.param_groups[0]["lr"] + # Amor For Tensorboard display + if loss_mel > 50: + loss_mel = 50 + if loss_kl > 5: + loss_kl = 5 + + scalar_dict = { + "loss/g/total": loss_gen_all, + "loss/d/total": loss_disc, + "learning_rate": lr, + "grad_norm_d": grad_norm_d, + "grad_norm_g": grad_norm_g, + } + scalar_dict.update( + { + "loss/g/fm": loss_fm, + "loss/g/mel": loss_mel, + "loss/g/kl": loss_kl, + } + ) + + scalar_dict.update( + {"loss/g/{}".format(i): v for i, v in enumerate(losses_gen)} + ) + scalar_dict.update( + { + "loss/d_r/{}".format(i): v + for i, v in enumerate(losses_disc_r) + } + ) + scalar_dict.update( + { + "loss/d_g/{}".format(i): v + for i, v in enumerate(losses_disc_g) + } + ) + image_dict = { + "slice/mel_org": utils.plot_spectrogram_to_numpy( + y_mel[0].data.cpu().numpy() + ), + "slice/mel_gen": utils.plot_spectrogram_to_numpy( + y_hat_mel[0].data.cpu().numpy() + ), + "all/mel": utils.plot_spectrogram_to_numpy( + mel[0].data.cpu().numpy() + ), + } + utils.summarize( + writer=writer, + global_step=global_step, + images=image_dict, + 
scalars=scalar_dict, + ) + global_step += 1 + if is_main_process and save_every_epoch != 0 and epoch % save_every_epoch == 0: + if save_only_last: + old_g_path = os.path.join( + state_dir, f"G_{epoch - save_every_epoch}.pth" + ) + old_d_path = os.path.join( + state_dir, f"D_{epoch - save_every_epoch}.pth" + ) + old_wav_path = os.path.join( + state_dir, f"wav_sample_{epoch - save_every_epoch}" + ) + if os.path.exists(old_g_path): + os.remove(old_g_path) + if os.path.exists(old_d_path): + os.remove(old_d_path) + if os.path.exists(old_wav_path): + shutil.rmtree(old_wav_path) + + if save_wav_with_checkpoint: + with autocast(enabled=config.train.fp16_run): + with torch.no_grad(): + if f0: + pred_wave = net_g.infer(phone, phone_lengths, pitch, pitchf, sid)[0] + else: + pred_wave = net_g.infer(phone, phone_lengths, sid)[0] + os.makedirs(os.path.join(state_dir, f"wav_sample_{epoch}"), exist_ok=True) + for i in range(pred_wave.shape[0]): + torchaudio.save(filepath=os.path.join(state_dir, f"wav_sample_{epoch}", f"{i:02}_y_true.wav"), src=wave[i].detach().cpu().float(), sample_rate=int(sample_rate[:-1] + "000")) + torchaudio.save(filepath=os.path.join(state_dir, f"wav_sample_{epoch}", f"{i:02}_y_pred.wav"), src=pred_wave[i].detach().cpu().float(), sample_rate=int(sample_rate[:-1] + "000")) + if augment: + torchaudio.save(filepath=os.path.join(state_dir, f"wav_sample_{epoch}", f"{i:02}_y_aug.wav"), src=aug_wave[i].detach().cpu().float(), sample_rate=int(sample_rate[:-1] + "000")) + + utils.save_state( + net_g, + optim_g, + config.train.learning_rate, + epoch, + os.path.join(state_dir, f"G_{epoch}.pth"), + ) + utils.save_state( + net_d, + optim_d, + config.train.learning_rate, + epoch, + os.path.join(state_dir, f"D_{epoch}.pth"), + ) + + save( + net_g, + config.version, + sample_rate, + f0, + embedder_name, + embedder_out_channels, + embedding_output_layer, + os.path.join(training_dir, "checkpoints", f"{model_name}-{epoch}.pth"), + epoch, + speaker_info + ) + + scheduler_g.step() + scheduler_d.step() + + if is_main_process: + print("Training is done. 
The program is closed.") + save( + net_g, + config.version, + sample_rate, + f0, + embedder_name, + embedder_out_channels, + embedding_output_layer, + os.path.join(out_dir, f"{model_name}.pth"), + epoch, + speaker_info + ) diff --git a/lib/rvc/transforms.py b/lib/rvc/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..6f30b7177d17fc61a4173c21b4233172a890be58 --- /dev/null +++ b/lib/rvc/transforms.py @@ -0,0 +1,207 @@ +import numpy as np +import torch +from torch.nn import functional as F + +DEFAULT_MIN_BIN_WIDTH = 1e-3 +DEFAULT_MIN_BIN_HEIGHT = 1e-3 +DEFAULT_MIN_DERIVATIVE = 1e-3 + + +def piecewise_rational_quadratic_transform( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails=None, + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if tails is None: + spline_fn = rational_quadratic_spline + spline_kwargs = {} + else: + spline_fn = unconstrained_rational_quadratic_spline + spline_kwargs = {"tails": tails, "tail_bound": tail_bound} + + outputs, logabsdet = spline_fn( + inputs=inputs, + unnormalized_widths=unnormalized_widths, + unnormalized_heights=unnormalized_heights, + unnormalized_derivatives=unnormalized_derivatives, + inverse=inverse, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + **spline_kwargs + ) + return outputs, logabsdet + + +def searchsorted(bin_locations, inputs, eps=1e-6): + bin_locations[..., -1] += eps + return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 + + +def unconstrained_rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails="linear", + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) + outside_interval_mask = ~inside_interval_mask + + outputs = torch.zeros_like(inputs) + logabsdet = torch.zeros_like(inputs) + + if tails == "linear": + unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) + constant = np.log(np.exp(1 - min_derivative) - 1) + unnormalized_derivatives[..., 0] = constant + unnormalized_derivatives[..., -1] = constant + + outputs[outside_interval_mask] = inputs[outside_interval_mask] + logabsdet[outside_interval_mask] = 0 + else: + raise RuntimeError("{} tails are not implemented.".format(tails)) + + ( + outputs[inside_interval_mask], + logabsdet[inside_interval_mask], + ) = rational_quadratic_spline( + inputs=inputs[inside_interval_mask], + unnormalized_widths=unnormalized_widths[inside_interval_mask, :], + unnormalized_heights=unnormalized_heights[inside_interval_mask, :], + unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], + inverse=inverse, + left=-tail_bound, + right=tail_bound, + bottom=-tail_bound, + top=tail_bound, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + ) + + return outputs, logabsdet + + +def rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + left=0.0, + right=1.0, + bottom=0.0, + top=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if torch.min(inputs) < left or torch.max(inputs) > right: + 
raise ValueError("Input to a transform is not within its domain") + + num_bins = unnormalized_widths.shape[-1] + + if min_bin_width * num_bins > 1.0: + raise ValueError("Minimal bin width too large for the number of bins") + if min_bin_height * num_bins > 1.0: + raise ValueError("Minimal bin height too large for the number of bins") + + widths = F.softmax(unnormalized_widths, dim=-1) + widths = min_bin_width + (1 - min_bin_width * num_bins) * widths + cumwidths = torch.cumsum(widths, dim=-1) + cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) + cumwidths = (right - left) * cumwidths + left + cumwidths[..., 0] = left + cumwidths[..., -1] = right + widths = cumwidths[..., 1:] - cumwidths[..., :-1] + + derivatives = min_derivative + F.softplus(unnormalized_derivatives) + + heights = F.softmax(unnormalized_heights, dim=-1) + heights = min_bin_height + (1 - min_bin_height * num_bins) * heights + cumheights = torch.cumsum(heights, dim=-1) + cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) + cumheights = (top - bottom) * cumheights + bottom + cumheights[..., 0] = bottom + cumheights[..., -1] = top + heights = cumheights[..., 1:] - cumheights[..., :-1] + + if inverse: + bin_idx = searchsorted(cumheights, inputs)[..., None] + else: + bin_idx = searchsorted(cumwidths, inputs)[..., None] + + input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] + input_bin_widths = widths.gather(-1, bin_idx)[..., 0] + + input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] + delta = heights / widths + input_delta = delta.gather(-1, bin_idx)[..., 0] + + input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] + input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] + + input_heights = heights.gather(-1, bin_idx)[..., 0] + + if inverse: + a = (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + input_heights * (input_delta - input_derivatives) + b = input_heights * input_derivatives - (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + c = -input_delta * (inputs - input_cumheights) + + discriminant = b.pow(2) - 4 * a * c + assert (discriminant >= 0).all() + + root = (2 * c) / (-b - torch.sqrt(discriminant)) + outputs = root * input_bin_widths + input_cumwidths + + theta_one_minus_theta = root * (1 - root) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * root.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - root).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, -logabsdet + else: + theta = (inputs - input_cumwidths) / input_bin_widths + theta_one_minus_theta = theta * (1 - theta) + + numerator = input_heights * ( + input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta + ) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + outputs = input_cumheights + numerator / denominator + + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * theta.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - theta).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, logabsdet diff --git a/lib/rvc/utils.py 
b/lib/rvc/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..68d7d83465e79787dcd4635d8f71345cf3784a04 --- /dev/null +++ b/lib/rvc/utils.py @@ -0,0 +1,225 @@ +import glob +import logging +import os +import shutil +import socket +import sys + +import ffmpeg +import matplotlib +import matplotlib.pylab as plt +import numpy as np +import torch +from scipy.io.wavfile import read +from torch.nn import functional as F + +from modules.shared import ROOT_DIR + +from .config import TrainConfig + +matplotlib.use("Agg") +logging.getLogger("matplotlib").setLevel(logging.WARNING) + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logger = logging + + +def load_audio(file: str, sr): + try: + # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 + # This launches a subprocess to decode audio while down-mixing and resampling as necessary. + # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. + file = ( + file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # Prevent small white copy path head and tail with spaces and " and return + out, _ = ( + ffmpeg.input(file, threads=0) + .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) + .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) + ) + except Exception as e: + raise RuntimeError(f"Failed to load audio: {e}") + + return np.frombuffer(out, np.float32).flatten() + + +def find_empty_port(): + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.bind(("", 0)) + s.listen(1) + port = s.getsockname()[1] + s.close() + return port + + +def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + + saved_state_dict = checkpoint_dict["model"] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): # 模型需要的shape + try: + new_state_dict[k] = saved_state_dict[k] + if saved_state_dict[k].shape != state_dict[k].shape: + print( + f"shape-{k}-mismatch|need-{state_dict[k].shape}|get-{saved_state_dict[k].shape}" + ) + if saved_state_dict[k].dim() == 2: # NOTE: check is this ok? + # for embedded input 256 <==> 768 + # this achieves we can continue training from original's pretrained checkpoints when using embedder that 768-th dim output etc. 
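+                    # The mismatched 2-D weight is viewed as a (1, 1, H, W) tensor and bilinearly
+                    # resized with F.interpolate to the shape this model expects; fp16 weights are
+                    # upcast for the interpolation and cast back to half afterwards.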
+ if saved_state_dict[k].dtype == torch.half: + new_state_dict[k] = ( + F.interpolate( + saved_state_dict[k].float().unsqueeze(0).unsqueeze(0), + size=state_dict[k].shape, + mode="bilinear", + ) + .half() + .squeeze(0) + .squeeze(0) + ) + else: + new_state_dict[k] = ( + F.interpolate( + saved_state_dict[k].unsqueeze(0).unsqueeze(0), + size=state_dict[k].shape, + mode="bilinear", + ) + .squeeze(0) + .squeeze(0) + ) + print( + "interpolated new_state_dict", + k, + "from", + saved_state_dict[k].shape, + "to", + new_state_dict[k].shape, + ) + else: + raise KeyError + except Exception as e: + # print(traceback.format_exc()) + print(f"{k} is not in the checkpoint") + print("error: %s" % e) + new_state_dict[k] = v # 模型自带的随机值 + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict, strict=False) + else: + model.load_state_dict(new_state_dict, strict=False) + print("Loaded model weights") + + epoch = checkpoint_dict["epoch"] + learning_rate = checkpoint_dict["learning_rate"] + if optimizer is not None and load_opt == 1: + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + print("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, epoch)) + return model, optimizer, learning_rate, epoch + + +def save_state(model, optimizer, learning_rate, epoch, checkpoint_path): + print( + "Saving model and optimizer state at epoch {} to {}".format( + epoch, checkpoint_path + ) + ) + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save( + { + "model": state_dict, + "epoch": epoch, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) + + +def summarize( + writer, + global_step, + scalars={}, + histograms={}, + images={}, + audios={}, + audio_sampling_rate=22050, +): + for k, v in scalars.items(): + writer.add_scalar(k, v, global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats="HWC") + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + filelist = glob.glob(os.path.join(dir_path, regex)) + if len(filelist) == 0: + return None + filelist.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + filepath = filelist[-1] + return filepath + + +def plot_spectrogram_to_numpy(spectrogram): + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow( + alignment.transpose(), aspect="auto", origin="lower", interpolation="none" + ) + fig.colorbar(im, ax=ax) + xlabel = "Decoder timestep" + if info is not None: + xlabel += "\n\n" + info + plt.xlabel(xlabel) + plt.ylabel("Encoder timestep") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return 
torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_config(training_dir: str, sample_rate: int, emb_channels: int): + if emb_channels == 256: + config_path = os.path.join(ROOT_DIR, "configs", f"{sample_rate}.json") + else: + config_path = os.path.join( + ROOT_DIR, "configs", f"{sample_rate}-{emb_channels}.json" + ) + config_save_path = os.path.join(training_dir, "config.json") + + shutil.copyfile(config_path, config_save_path) + + return TrainConfig.parse_file(config_save_path) diff --git a/models/checkpoints/.gitignore b/models/checkpoints/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d6b7ef32c8478a48c3994dcadc86837f4371184d --- /dev/null +++ b/models/checkpoints/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/models/embeddings/.gitignore b/models/embeddings/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d6b7ef32c8478a48c3994dcadc86837f4371184d --- /dev/null +++ b/models/embeddings/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/models/pretrained/.gitignore b/models/pretrained/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d6b7ef32c8478a48c3994dcadc86837f4371184d --- /dev/null +++ b/models/pretrained/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/models/training/.gitignore b/models/training/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6b2164379c72ce3d3755816cfcea62354fb181ad --- /dev/null +++ b/models/training/.gitignore @@ -0,0 +1,6 @@ +*/** + +!mute/**/* +!.gitignore + +mute/**/*.pt diff --git a/models/training/models/.gitignore b/models/training/models/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d6b7ef32c8478a48c3994dcadc86837f4371184d --- /dev/null +++ b/models/training/models/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/models/training/mute/0_gt_wavs/mute32k.wav b/models/training/mute/0_gt_wavs/mute32k.wav new file mode 100644 index 0000000000000000000000000000000000000000..a83c72a4079056aa7b9994c05082d5018dc1b60a --- /dev/null +++ b/models/training/mute/0_gt_wavs/mute32k.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9edcf85ec77e88bd01edf3d887bdc418d3596d573f7ad2694da546f41dae6baf +size 192078 diff --git a/models/training/mute/0_gt_wavs/mute40k.wav b/models/training/mute/0_gt_wavs/mute40k.wav new file mode 100644 index 0000000000000000000000000000000000000000..60e81785a92525bc7a39d98fa16d8209279da9cd --- /dev/null +++ b/models/training/mute/0_gt_wavs/mute40k.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67a816e77b50cb9f016e49e5c01f07e080c4e3b82b7a8ac3e64bcb143f90f31b +size 240078 diff --git a/models/training/mute/0_gt_wavs/mute48k.wav b/models/training/mute/0_gt_wavs/mute48k.wav new file mode 100644 index 0000000000000000000000000000000000000000..57e2db6dec3b3546fadbc4094e75d42bc465a1cf --- /dev/null +++ b/models/training/mute/0_gt_wavs/mute48k.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f2bb4daaa106e351aebb001e5a25de985c0b472f22e8d60676bc924a79056ee +size 288078 diff --git a/models/training/mute/1_16k_wavs/mute.wav b/models/training/mute/1_16k_wavs/mute.wav new file mode 100644 index 0000000000000000000000000000000000000000..e40db260891baa6c988dc73c41ec8a14ae23e9ac --- /dev/null +++ b/models/training/mute/1_16k_wavs/mute.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e233e86ba1be365e1133f157d56b61110086b89650ecfbdfc013c759e466250 +size 
96078 diff --git a/models/training/mute/2a_f0/mute.wav.npy b/models/training/mute/2a_f0/mute.wav.npy new file mode 100644 index 0000000000000000000000000000000000000000..dd7e9afd2e7f2aefaa30bcd4541a23ce96a9e150 --- /dev/null +++ b/models/training/mute/2a_f0/mute.wav.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b9acf9ab7facdb032e1d687fe35182670b0b94566c4b209ae48c239d19956a6 +size 1332 diff --git a/models/training/mute/2b_f0nsf/mute.wav.npy b/models/training/mute/2b_f0nsf/mute.wav.npy new file mode 100644 index 0000000000000000000000000000000000000000..7644e325ddd34bd186153ecf7461aa1593a054f3 --- /dev/null +++ b/models/training/mute/2b_f0nsf/mute.wav.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30792849c8e72d67e6691754077f2888b101cb741e9c7f193c91dd9692870c87 +size 2536 diff --git a/models/training/mute/3_feature256/mute.npy b/models/training/mute/3_feature256/mute.npy new file mode 100644 index 0000000000000000000000000000000000000000..c57ae95d19d969788ef186a81cdc2f4b462ed6df --- /dev/null +++ b/models/training/mute/3_feature256/mute.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64d5abbac078e19a3f649c0d78a02cb33a71407ded3ddf2db78e6b803d0c0126 +size 152704 diff --git a/modules/cmd_opts.py b/modules/cmd_opts.py new file mode 100644 index 0000000000000000000000000000000000000000..f3ac05d5663273292e91380956b9b545ffc9b584 --- /dev/null +++ b/modules/cmd_opts.py @@ -0,0 +1,22 @@ +import argparse + +parser = argparse.ArgumentParser() + +parser.add_argument("--host", help="Host to connect to", type=str, default="127.0.0.1") +parser.add_argument("--port", help="Port to connect to", type=int) +parser.add_argument("--share", help="Enable gradio share", action="store_true") +parser.add_argument( + "--models-dir", help="Path to models directory", type=str, default=None +) +parser.add_argument( + "--output-dir", help="Path to output directory", type=str, default=None +) +parser.add_argument( + "--precision", + help="Precision to use", + type=str, + default="fp16", + choices=["fp32", "fp16"], +) + +opts, _ = parser.parse_known_args() diff --git a/modules/core.py b/modules/core.py new file mode 100644 index 0000000000000000000000000000000000000000..8778e5ecaf12996cf815132578add319f03448cb --- /dev/null +++ b/modules/core.py @@ -0,0 +1,156 @@ +import hashlib +import os +import shutil +import sys +from concurrent.futures import ThreadPoolExecutor + +import requests + +from modules.models import MODELS_DIR +from modules.shared import ROOT_DIR +from modules.utils import download_file + + +def get_hf_etag(url: str): + r = requests.head(url) + + etag = r.headers["X-Linked-ETag"] if "X-Linked-ETag" in r.headers else "" + + if etag.startswith('"') and etag.endswith('"'): + etag = etag[1:-1] + + return etag + + +def calc_sha256(filepath: str): + sha256 = hashlib.sha256() + with open(filepath, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + sha256.update(chunk) + return sha256.hexdigest() + + +def download_models(): + def hash_check(url: str, out: str): + if not os.path.exists(out): + return False + etag = get_hf_etag(url) + hash = calc_sha256(out) + return etag == hash + + os.makedirs(os.path.join(MODELS_DIR, "pretrained", "v2"), exist_ok=True) + + tasks = [] + for template in [ + "D{}k", + "G{}k", + "f0D{}k", + "f0G{}k", + ]: + basename = template.format("40") + url = f"https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/pretrained/v2/{basename}.pth" + out = os.path.join(MODELS_DIR, "pretrained", "v2", 
f"{basename}.pth") + + if hash_check(url, out): + continue + + tasks.append((url, out)) + + for filename in [ + "checkpoint_best_legacy_500.pt", + ]: + out = os.path.join(MODELS_DIR, "embeddings", filename) + url = f"https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/embeddings/{filename}" + + if hash_check(url, out): + continue + + tasks.append( + ( + f"https://huggingface.co/ddPn08/rvc-webui-models/resolve/main/embeddings/{filename}", + out, + ) + ) + + # japanese-hubert-base (Fairseq) + # from official repo + # NOTE: change filename? + hubert_jp_url = f"https://huggingface.co/rinna/japanese-hubert-base/resolve/main/fairseq/model.pt" + out = os.path.join(MODELS_DIR, "embeddings", "rinna_hubert_base_jp.pt") + if not hash_check(hubert_jp_url, out): + tasks.append( + ( + hubert_jp_url, + out, + ) + ) + + if len(tasks) < 1: + return + + with ThreadPoolExecutor() as pool: + pool.map( + download_file, + *zip( + *[(filename, out, i, True) for i, (filename, out) in enumerate(tasks)] + ), + ) + + +def install_ffmpeg(): + if os.path.exists(os.path.join(ROOT_DIR, "bin", "ffmpeg.exe")): + return + tmpdir = os.path.join(ROOT_DIR, "tmp") + url = ( + "https://www.gyan.dev/ffmpeg/builds/packages/ffmpeg-5.1.2-essentials_build.zip" + ) + out = os.path.join(tmpdir, "ffmpeg.zip") + os.makedirs(os.path.dirname(out), exist_ok=True) + download_file(url, out) + shutil.unpack_archive(out, os.path.join(tmpdir, "ffmpeg")) + shutil.copyfile( + os.path.join( + tmpdir, "ffmpeg", "ffmpeg-5.1.2-essentials_build", "bin", "ffmpeg.exe" + ), + os.path.join(ROOT_DIR, "bin", "ffmpeg.exe"), + ) + os.remove(os.path.join(tmpdir, "ffmpeg.zip")) + shutil.rmtree(os.path.join(tmpdir, "ffmpeg")) + + +def update_modelnames(): + for sr in ["32k", "40k", "48k"]: + files = [ + f"f0G{sr}", + f"f0D{sr}", + f"G{sr}", + f"D{sr}", + ] + for file in files: + filepath = os.path.join(MODELS_DIR, "pretrained", f"{file}.pth") + if os.path.exists(filepath): + os.rename( + filepath, + os.path.join(MODELS_DIR, "pretrained", f"{file}256.pth"), + ) + + if not os.path.exists(os.path.join(MODELS_DIR, "embeddings")): + os.makedirs(os.path.join(MODELS_DIR, "embeddings")) + + if os.path.exists(os.path.join(MODELS_DIR, "hubert_base.pt")): + os.rename( + os.path.join(MODELS_DIR, "hubert_base.pt"), + os.path.join(MODELS_DIR, "embeddings", "hubert_base.pt"), + ) + if os.path.exists(os.path.join(MODELS_DIR, "checkpoint_best_legacy_500.pt")): + os.rename( + os.path.join(MODELS_DIR, "checkpoint_best_legacy_500.pt"), + os.path.join(MODELS_DIR, "embeddings", "checkpoint_best_legacy_500.pt"), + ) + + +def preload(): + update_modelnames() + download_models() + if sys.platform == "win32": + install_ffmpeg() diff --git a/modules/merge.py b/modules/merge.py new file mode 100644 index 0000000000000000000000000000000000000000..6c7ea69bd6609b9cd10cf00bc7e5df01208fb4ba --- /dev/null +++ b/modules/merge.py @@ -0,0 +1,81 @@ +from collections import OrderedDict +from typing import * + +import torch +import tqdm + + +def merge( + path_a: str, + path_b: str, + path_c: str, + alpha: float, + weights: Dict[str, float], + method: str, +): + def extract(ckpt: Dict[str, Any]): + a = ckpt["model"] + opt = OrderedDict() + opt["weight"] = {} + for key in a.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = a[key] + return opt + + def load_weight(path: str): + print(f"Loading {path}...") + state_dict = torch.load(path, map_location="cpu") + if "model" in state_dict: + weight = extract(state_dict) + else: + weight = state_dict["weight"] + return weight, state_dict + + 
def get_alpha(key: str): + try: + filtered = sorted( + [x for x in weights.keys() if key.startswith(x)], key=len, reverse=True + ) + if len(filtered) < 1: + return alpha + return weights[filtered[0]] + except: + return alpha + + weight_a, state_dict = load_weight(path_a) + weight_b, _ = load_weight(path_b) + if path_c is not None: + weight_c, _ = load_weight(path_c) + + if sorted(list(weight_a.keys())) != sorted(list(weight_b.keys())): + raise RuntimeError("Failed to merge models.") + + merged = OrderedDict() + merged["weight"] = {} + + def merge_weight(a, b, c, alpha): + if method == "weight_sum": + return (1 - alpha) * a + alpha * b + elif method == "add_diff": + return a + (b - c) * alpha + + for key in tqdm.tqdm(weight_a.keys()): + a = get_alpha(key) + if path_c is not None: + merged["weight"][key] = merge_weight( + weight_a[key], weight_b[key], weight_c[key], a + ) + else: + merged["weight"][key] = merge_weight(weight_a[key], weight_b[key], None, a) + merged["config"] = state_dict["config"] + merged["params"] = state_dict["params"] if "params" in state_dict else None + merged["version"] = state_dict.get("version", "v1") + merged["sr"] = state_dict["sr"] + merged["f0"] = state_dict["f0"] + merged["info"] = state_dict["info"] + merged["embedder_name"] = ( + state_dict["embedder_name"] if "embedder_name" in state_dict else None + ) + merged["embedder_output_layer"] = state_dict.get("embedder_output_layer", "12") + return merged diff --git a/modules/models.py b/modules/models.py new file mode 100644 index 0000000000000000000000000000000000000000..fde714acf93239a39d48862fdab0b71ec19a1ca1 --- /dev/null +++ b/modules/models.py @@ -0,0 +1,266 @@ +import os +import re +from typing import * + +import torch +from fairseq import checkpoint_utils +from fairseq.models.hubert.hubert import HubertModel +from pydub import AudioSegment + +from lib.rvc.models import (SynthesizerTrnMs256NSFSid, + SynthesizerTrnMs256NSFSidNono) +from lib.rvc.pipeline import VocalConvertPipeline + +from .cmd_opts import opts +from .shared import ROOT_DIR, device, is_half +from .utils import load_audio + +AUDIO_OUT_DIR = opts.output_dir or os.path.join(ROOT_DIR, "outputs") + + +EMBEDDINGS_LIST = { + "hubert-base-japanese": ( + "rinna_hubert_base_jp.pt", + "hubert-base-japanese", + "local", + ), + "contentvec": ("checkpoint_best_legacy_500.pt", "contentvec", "local"), +} + + +def update_state_dict(state_dict): + if "params" in state_dict and state_dict["params"] is not None: + return + keys = [ + "spec_channels", + "segment_size", + "inter_channels", + "hidden_channels", + "filter_channels", + "n_heads", + "n_layers", + "kernel_size", + "p_dropout", + "resblock", + "resblock_kernel_sizes", + "resblock_dilation_sizes", + "upsample_rates", + "upsample_initial_channel", + "upsample_kernel_sizes", + "spk_embed_dim", + "gin_channels", + "emb_channels", + "sr", + ] + state_dict["params"] = {} + n = 0 + for i, key in enumerate(keys): + i = i - n + if len(state_dict["config"]) != 19 and key == "emb_channels": + # backward compat. + n += 1 + continue + state_dict["params"][key] = state_dict["config"][i] + + if not "emb_channels" in state_dict["params"]: + if state_dict.get("version", "v1") == "v1": + state_dict["params"]["emb_channels"] = 256 # for backward compat. + state_dict["embedder_output_layer"] = 9 + else: + state_dict["params"]["emb_channels"] = 768 # for backward compat. 
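+ # v2 checkpoints were trained on 768-dim embedder features read from output layer 12,
+ # so assume those defaults when the stored config has no emb_channels entry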
+ state_dict["embedder_output_layer"] = 12 + + +class VoiceConvertModel: + def __init__(self, model_name: str, state_dict: Dict[str, Any]) -> None: + update_state_dict(state_dict) + self.model_name = model_name + self.state_dict = state_dict + self.tgt_sr = state_dict["params"]["sr"] + f0 = state_dict.get("f0", 1) + state_dict["params"]["spk_embed_dim"] = state_dict["weight"][ + "emb_g.weight" + ].shape[0] + if not "emb_channels" in state_dict["params"]: + state_dict["params"]["emb_channels"] = 256 # for backward compat. + + if f0 == 1: + self.net_g = SynthesizerTrnMs256NSFSid( + **state_dict["params"], is_half=is_half + ) + else: + self.net_g = SynthesizerTrnMs256NSFSidNono(**state_dict["params"]) + + del self.net_g.enc_q + + self.net_g.load_state_dict(state_dict["weight"], strict=False) + self.net_g.eval().to(device) + + if is_half: + self.net_g = self.net_g.half() + else: + self.net_g = self.net_g.float() + + self.vc = VocalConvertPipeline(self.tgt_sr, device, is_half) + self.n_spk = state_dict["params"]["spk_embed_dim"] + + def single( + self, + sid: int, + input_audio: str, + embedder_model_name: str, + embedding_output_layer: str, + f0_up_key: int, + f0_file: str, + f0_method: str, + auto_load_index: bool, + faiss_index_file: str, + index_rate: float, + output_dir: str = AUDIO_OUT_DIR, + ): + if not input_audio: + raise Exception("You need to set Source Audio") + f0_up_key = int(f0_up_key) + audio = load_audio(input_audio, 16000) + + if embedder_model_name == "auto": + embedder_model_name = ( + self.state_dict["embedder_name"] + if "embedder_name" in self.state_dict + else "hubert_base" + ) + if embedder_model_name.endswith("768"): + embedder_model_name = embedder_model_name[:-3] + + if embedder_model_name == "hubert_base": + embedder_model_name = "contentvec" + + if not embedder_model_name in EMBEDDINGS_LIST.keys(): + raise Exception(f"Not supported embedder: {embedder_model_name}") + + if ( + embedder_model == None + or loaded_embedder_model != EMBEDDINGS_LIST[embedder_model_name][1] + ): + print(f"load {embedder_model_name} embedder") + embedder_filename, embedder_name, load_from = get_embedder( + embedder_model_name + ) + load_embedder(embedder_filename, embedder_name) + + if embedding_output_layer == "auto": + embedding_output_layer = ( + self.state_dict["embedding_output_layer"] + if "embedding_output_layer" in self.state_dict + else 12 + ) + else: + embedding_output_layer = int(embedding_output_layer) + + f0 = self.state_dict.get("f0", 1) + + if not faiss_index_file and auto_load_index: + faiss_index_file = self.get_index_path(sid) + + audio_opt = self.vc( + embedder_model, + embedding_output_layer, + self.net_g, + sid, + audio, + f0_up_key, + f0_method, + faiss_index_file, + index_rate, + f0, + f0_file=f0_file, + ) + + audio = AudioSegment( + audio_opt, + frame_rate=self.tgt_sr, + sample_width=2, + channels=1, + ) + os.makedirs(output_dir, exist_ok=True) + input_audio_splitext = os.path.splitext(os.path.basename(input_audio))[0] + model_splitext = os.path.splitext(self.model_name)[0] + index = 0 + existing_files = os.listdir(output_dir) + for existing_file in existing_files: + result = re.match(r"\d+", existing_file) + if result: + prefix_num = int(result.group(0)) + if index < prefix_num: + index = prefix_num + audio.export( + os.path.join( + output_dir, f"{index+1}-{model_splitext}-{input_audio_splitext}.wav" + ), + format="wav", + ) + return audio_opt + + def get_index_path(self, speaker_id: int): + basename = os.path.splitext(self.model_name)[0] + speaker_index_path = 
os.path.join( + MODELS_DIR, + "checkpoints", + f"{basename}_index", + f"{basename}.{speaker_id}.index", + ) + if os.path.exists(speaker_index_path): + return speaker_index_path + return os.path.join(MODELS_DIR, "checkpoints", f"{basename}.index") + + +MODELS_DIR = opts.models_dir or os.path.join(ROOT_DIR, "models") +vc_model: Optional[VoiceConvertModel] = None +embedder_model: Optional[HubertModel] = None +loaded_embedder_model = "" + + +def get_models(): + dir = os.path.join(ROOT_DIR, "models", "checkpoints") + os.makedirs(dir, exist_ok=True) + return [ + file + for file in os.listdir(dir) + if any([x for x in [".ckpt", ".pth"] if file.endswith(x)]) + ] + + +def get_embedder(embedder_name): + if embedder_name in EMBEDDINGS_LIST: + return EMBEDDINGS_LIST[embedder_name] + return None + + +def load_embedder(emb_file: str, emb_name: str): + global embedder_model, loaded_embedder_model + emb_file = os.path.join(MODELS_DIR, "embeddings", emb_file) + models, _, _ = checkpoint_utils.load_model_ensemble_and_task( + [emb_file], + suffix="", + ) + embedder_model = models[0] + embedder_model = embedder_model.to(device) + + if is_half: + embedder_model = embedder_model.half() + else: + embedder_model = embedder_model.float() + embedder_model.eval() + + loaded_embedder_model = emb_name + + +def get_vc_model(model_name: str): + model_path = os.path.join(MODELS_DIR, "checkpoints", model_name) + weight = torch.load(model_path, map_location="cpu") + return VoiceConvertModel(model_name, weight) + + +def load_model(model_name: str): + global vc_model + vc_model = get_vc_model(model_name) diff --git a/modules/separate.py b/modules/separate.py new file mode 100644 index 0000000000000000000000000000000000000000..8921d2bf4e873c0be7f22dc9ad25d42084a60c7c --- /dev/null +++ b/modules/separate.py @@ -0,0 +1,82 @@ +import os +from typing import * + +import tqdm +from pydub import AudioSegment +from pydub.silence import split_on_silence + + +def separate_audio( + input: str, + output: str, + silence_thresh: int, + min_silence_len: int = 1000, + keep_silence: int = 100, + margin: int = 0, + padding: bool = False, + min: Optional[int] = None, + max: Optional[int] = None, +): + if os.path.isfile(input): + input = [input] + elif os.path.isdir(input): + input = [os.path.join(input, f) for f in os.listdir(input)] + else: + raise ValueError("input must be a file or directory") + + os.makedirs(output, exist_ok=True) + + for file in input: + if os.path.splitext(file)[1] == ".mp3": + audio = AudioSegment.from_mp3(file) + elif os.path.splitext(file)[1] == ".wav": + audio = AudioSegment.from_wav(file) + elif os.path.splitext(file)[1] == ".flac": + audio = AudioSegment.from_file(file, "flac") + else: + raise ValueError( + "Invalid file format. Only MP3 and WAV files are supported." 
+ ) + + chunks = split_on_silence( + audio, + min_silence_len=min_silence_len, + silence_thresh=silence_thresh, + keep_silence=keep_silence, + ) + + output_chunks: List[AudioSegment] = [] + + so_short = None + + for chunk in tqdm.tqdm(chunks): + if so_short is not None: + chunk = so_short + chunk + so_short = None + if min is None or len(chunk) > min: + if max is not None and len(chunk) > max: + sub_chunks = [ + chunk[i : i + max + margin] + for i in range(0, len(chunk) - margin, max) + ] + + if len(sub_chunks[-1]) < min: + if padding and len(sub_chunks) > 2: + output_chunks.extend(sub_chunks[0:-2]) + output_chunks.append(sub_chunks[-2] + sub_chunks[-1]) + else: + output_chunks.extend(sub_chunks[0:-1]) + else: + output_chunks.extend(sub_chunks) + else: + output_chunks.append(chunk) + else: + if so_short is None: + so_short = chunk + else: + so_short += chunk + basename = os.path.splitext(os.path.basename(file))[0] + + for i, chunk in enumerate(output_chunks): + filepath = os.path.join(output, f"{basename}_{i}.wav") + chunk.export(filepath, format="wav") diff --git a/modules/server/model.py b/modules/server/model.py new file mode 100644 index 0000000000000000000000000000000000000000..2077a5052f5134f13f7155900c8dc48ced3bd6d9 --- /dev/null +++ b/modules/server/model.py @@ -0,0 +1,451 @@ +import os +import re +from typing import * + +import faiss +import numpy as np +import pyworld +import scipy.signal as signal +import torch +import torch.nn.functional as F +import torchaudio +import torchcrepe +from fairseq import checkpoint_utils +from fairseq.models.hubert.hubert import HubertModel +from pydub import AudioSegment +from torch import Tensor + +from lib.rvc.models import (SynthesizerTrnMs256NSFSid, + SynthesizerTrnMs256NSFSidNono) +from lib.rvc.pipeline import VocalConvertPipeline +from modules.cmd_opts import opts +from modules.models import (EMBEDDINGS_LIST, MODELS_DIR, get_embedder, + get_vc_model, update_state_dict) +from modules.shared import ROOT_DIR, device, is_half + +MODELS_DIR = opts.models_dir or os.path.join(ROOT_DIR, "models") +vc_model: Optional["VoiceServerModel"] = None +embedder_model: Optional[HubertModel] = None +loaded_embedder_model = "" + + +class VoiceServerModel: + def __init__(self, rvc_model_file: str, faiss_index_file: str) -> None: + # setting vram + global device, is_half + if isinstance(device, str): + device = torch.device(device) + if device.type == "cuda": + vram = torch.cuda.get_device_properties(device).total_memory / 1024**3 + else: + vram = None + if vram is not None and vram <= 4: + self.x_pad = 1 + self.x_query = 5 + self.x_center = 30 + self.x_max = 32 + elif vram is not None and vram <= 5: + self.x_pad = 1 + self.x_query = 6 + self.x_center = 38 + self.x_max = 41 + else: + self.x_pad = 3 + self.x_query = 10 + self.x_center = 60 + self.x_max = 65 + + # load_model + state_dict = torch.load(rvc_model_file, map_location="cpu") + update_state_dict(state_dict) + self.state_dict = state_dict + self.tgt_sr = state_dict["params"]["sr"] + self.f0 = state_dict.get("f0", 1) + state_dict["params"]["spk_embed_dim"] = state_dict["weight"][ + "emb_g.weight" + ].shape[0] + if not "emb_channels" in state_dict["params"]: + if state_dict.get("version", "v1") == "v1": + state_dict["params"]["emb_channels"] = 256 # for backward compat. + state_dict["embedder_output_layer"] = 9 + else: + state_dict["params"]["emb_channels"] = 768 # for backward compat. 
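+ # keep in sync with modules.models.update_state_dict: v2 checkpoints default to embedder output layer 12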
+ state_dict["embedder_output_layer"] = 12 + if self.f0 == 1: + self.net_g = SynthesizerTrnMs256NSFSid( + **state_dict["params"], is_half=is_half + ) + else: + self.net_g = SynthesizerTrnMs256NSFSidNono(**state_dict["params"]) + del self.net_g.enc_q + self.net_g.load_state_dict(state_dict["weight"], strict=False) + self.net_g.eval().to(device) + if is_half: + self.net_g = self.net_g.half() + else: + self.net_g = self.net_g.float() + + emb_name = state_dict.get("embedder_name", "contentvec") + if emb_name == "hubert_base": + emb_name = "contentvec" + emb_file = os.path.join(MODELS_DIR, "embeddings", EMBEDDINGS_LIST[emb_name][0]) + models, _, _ = checkpoint_utils.load_model_ensemble_and_task( + [emb_file], + suffix="", + ) + embedder_model = models[0] + embedder_model = embedder_model.to(device) + + if is_half: + embedder_model = embedder_model.half() + else: + embedder_model = embedder_model.float() + embedder_model.eval() + self.embedder_model = embedder_model + + self.embedder_output_layer = state_dict["embedder_output_layer"] + + self.index = None + if faiss_index_file != "" and os.path.exists(faiss_index_file): + self.index = faiss.read_index(faiss_index_file) + self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) + + self.n_spk = state_dict["params"]["spk_embed_dim"] + + self.sr = 16000 # hubert input sample rate + self.window = 160 # hubert input window + self.t_pad = self.sr * self.x_pad # padding time for each utterance + self.t_pad_tgt = self.tgt_sr * self.x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sr * self.x_query # query time before and after query point + self.t_center = self.sr * self.x_center # query cut point position + self.t_max = self.sr * self.x_max # max time for no query + self.device = device + self.is_half = is_half + + def __call__( + self, + audio: np.ndarray, + sr: int, + sid: int, + transpose: int, + f0_method: str, + index_rate: float, + ): + # bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) + # audio = signal.filtfilt(bh, ah, audio) + if sr != self.sr: + audio = torchaudio.functional.resample(torch.from_numpy(audio), sr, self.sr, rolloff=0.99).detach().cpu().numpy() + audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect" if audio.shape[0] > self.window // 2 else "constant") + + opt_ts = [] + if audio_pad.shape[0] > self.t_max: + audio_sum = np.zeros_like(audio) + for i in range(self.window): + audio_sum += audio_pad[i : i - self.window] + for t in range(self.t_center, audio.shape[0], self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + np.abs(audio_sum[t - self.t_query : t + self.t_query]) + == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() + )[0][0] + ) + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect" if audio.shape[0] > self.t_pad else "constant") + p_len = audio_pad.shape[0] // self.window + + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + pitch, pitchf = None, None + if self.f0 == 1: + pitch, pitchf = get_f0(audio_pad, self.sr, p_len, transpose, f0_method) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if self.device.type == "mps": + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + + audio_opt = [] + + s = 0 + t = None + + for t in opt_ts: + t = t // self.window * self.window + if self.f0 == 1: + audio_opt.append( + self._convert( + sid, + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, 
s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], + index_rate, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self._convert( + sid, + audio_pad[s : t + self.t_pad2 + self.window], + None, + None, + index_rate, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + s = t + if self.f0 == 1: + audio_opt.append( + self._convert( + sid, + audio_pad[t:], + pitch[:, t // self.window :] if t is not None else pitch, + pitchf[:, t // self.window :] if t is not None else pitchf, + index_rate, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self._convert( + sid, + audio_pad[t:], + None, + None, + index_rate, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + audio_opt = np.concatenate(audio_opt) + del pitch, pitchf, sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio_opt + + + def _convert( + self, + sid: int, + audio: np.ndarray, + pitch: Optional[np.ndarray], + pitchf: Optional[np.ndarray], + index_rate: float, + ): + feats = torch.from_numpy(audio) + if self.is_half: + feats = feats.half() + else: + feats = feats.float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + feats = feats.view(1, -1) + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + + half_support = ( + self.device.type == "cuda" + and torch.cuda.get_device_capability(self.device)[0] >= 5.3 + ) + is_feats_dim_768 = self.net_g.emb_channels == 768 + + if isinstance(self.embedder_model, tuple): + feats = self.embedder_model[0]( + feats.squeeze(0).squeeze(0).to(self.device), + return_tensors="pt", + sampling_rate=16000, + ) + if self.is_half: + feats = feats.input_values.to(self.device).half() + else: + feats = feats.input_values.to(self.device) + with torch.no_grad(): + if is_feats_dim_768: + feats = self.embedder_model[1](feats).last_hidden_state + else: + feats = self.embedder_model[1](feats).extract_features + else: + inputs = { + "source": feats.half().to(self.device) + if half_support + else feats.to(self.device), + "padding_mask": padding_mask.to(self.device), + "output_layer": self.embedder_output_layer, + } + + if not half_support: + self.embedder_model = self.embedder_model.float() + inputs["source"] = inputs["source"].float() + + with torch.no_grad(): + logits = self.embedder_model.extract_features(**inputs) + if is_feats_dim_768: + feats = logits[0] + else: + feats = self.embedder_model.final_proj(logits[0]) + + if ( + isinstance(self.index, type(None)) == False + and isinstance(self.big_npy, type(None)) == False + and index_rate != 0 + ): + npy = feats[0].cpu().numpy() + if self.is_half: + npy = npy.astype("float32") + + _, ix = self.index.search(npy, k=1) + npy = self.big_npy[ix[:, 0]] + + if self.is_half: + npy = npy.astype("float16") + feats = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + + (1 - index_rate) * feats + ) + + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + + p_len = audio.shape[0] // self.window + if feats.shape[1] < p_len: + p_len = feats.shape[1] + if pitch != None and pitchf != None: + pitch = pitch[:, :p_len] + pitchf = pitchf[:, :p_len] + p_len = torch.tensor([p_len], device=self.device).long() + with torch.no_grad(): + if pitch != None and pitchf != None: + audio1 = ( + (self.net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0] * 32768) + .data.cpu() + .float() + .numpy() + .astype(np.int16) + ) + else: + audio1 = ( + 
(self.net_g.infer(feats, p_len, sid)[0][0, 0] * 32768)
+ .data.cpu()
+ .float()
+ .numpy()
+ .astype(np.int16)
+ )
+ del feats, p_len, padding_mask
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()
+ return audio1
+
+
+# F0 computation
+def get_f0_crepe_computation(
+ x,
+ sr,
+ f0_min,
+ f0_max,
+ p_len,
+ model="full", # Either use crepe-tiny "tiny" or crepe "full". Default is full
+):
+ hop_length = sr // 100
+ x = x.astype(np.float32) # fixes the F.conv2D exception. We needed to convert double to float.
+ x /= np.quantile(np.abs(x), 0.999)
+ torch_device = device # this is a module-level function, so use the shared global device
+ audio = torch.from_numpy(x).to(torch_device, copy=True)
+ audio = torch.unsqueeze(audio, dim=0)
+ if audio.ndim == 2 and audio.shape[0] > 1:
+ audio = torch.mean(audio, dim=0, keepdim=True).detach()
+ audio = audio.detach()
+ print("Initiating prediction with a crepe_hop_length of: " + str(hop_length))
+ pitch: Tensor = torchcrepe.predict(
+ audio,
+ sr,
+ sr // 100,
+ f0_min,
+ f0_max,
+ model,
+ batch_size=hop_length * 2,
+ device=torch_device,
+ pad=True
+ )
+ p_len = p_len or x.shape[0] // hop_length
+ # Resize the pitch for final f0
+ source = np.array(pitch.squeeze(0).cpu().float().numpy())
+ source[source < 0.001] = np.nan
+ target = np.interp(
+ np.arange(0, len(source) * p_len, len(source)) / p_len,
+ np.arange(0, len(source)),
+ source
+ )
+ f0 = np.nan_to_num(target)
+ return f0 # Resized f0
+
+def get_f0_official_crepe_computation(
+ x,
+ sr,
+ f0_min,
+ f0_max,
+ model="full",
+):
+ # Pick a batch size that doesn't cause memory errors on your gpu
+ batch_size = 512
+ # Compute pitch using first gpu
+ audio = torch.tensor(np.copy(x))[None].float()
+ f0, pd = torchcrepe.predict(
+ audio,
+ sr,
+ sr // 100,
+ f0_min,
+ f0_max,
+ model,
+ batch_size=batch_size,
+ device=device,
+ return_periodicity=True,
+ )
+ pd = torchcrepe.filter.median(pd, 3)
+ f0 = torchcrepe.filter.mean(f0, 3)
+ f0[pd < 0.1] = 0
+ f0 = f0[0].cpu().numpy()
+ return f0
+
+def get_f0(
+ x: np.ndarray,
+ sr: int,
+ p_len: int,
+ f0_up_key: int,
+ f0_method: str,
+):
+ f0_min = 50
+ f0_max = 1100
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
+
+ if f0_method == "harvest":
+ f0, t = pyworld.harvest(
+ x.astype(np.double),
+ fs=sr,
+ f0_ceil=f0_max,
+ f0_floor=f0_min,
+ frame_period=10,
+ )
+ f0 = pyworld.stonemask(x.astype(np.double), f0, t, sr)
+ f0 = signal.medfilt(f0, 3)
+ elif f0_method == "dio":
+ f0, t = pyworld.dio(
+ x.astype(np.double),
+ fs=sr,
+ f0_ceil=f0_max,
+ f0_floor=f0_min,
+ frame_period=10,
+ )
+ f0 = pyworld.stonemask(x.astype(np.double), f0, t, sr)
+ f0 = signal.medfilt(f0, 3)
+ elif f0_method == "mangio-crepe":
+ f0 = get_f0_crepe_computation(x, sr, f0_min, f0_max, p_len, "full")
+ elif f0_method == "crepe":
+ f0 = get_f0_official_crepe_computation(x, sr, f0_min, f0_max, "full")
+
+ f0 *= pow(2, f0_up_key / 12)
+ f0bak = f0.copy()
+ f0_mel = 1127 * np.log(1 + f0 / 700)
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
+ f0_mel_max - f0_mel_min
+ ) + 1
+ f0_mel[f0_mel <= 1] = 1
+ f0_mel[f0_mel > 255] = 255
+ f0_coarse = np.rint(f0_mel).astype(np.int32)
+ return f0_coarse, f0bak # 1-0
\ No newline at end of file
diff --git a/modules/shared.py b/modules/shared.py new file mode 100644 index 0000000000000000000000000000000000000000..f2556b2b9c413b03bac798d2109f26ec9ddd2808 --- /dev/null +++ b/modules/shared.py @@ -0,0 +1,44 @@
+import os
+import sys
+
+import torch
+
+from modules.cmd_opts import opts
+
+ROOT_DIR =
os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +MODELS_DIR = os.path.join(ROOT_DIR, "models") + + +def has_mps(): + if sys.platform != "darwin": + return False + else: + if not getattr(torch, "has_mps", False): + return False + try: + torch.zeros(1).to(torch.device("mps")) + return True + except Exception: + return False + + +is_half = opts.precision == "fp16" +half_support = ( + torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 5.3 +) + +if not half_support: + print("WARNING: FP16 is not supported on this GPU") + is_half = False + +device = "cuda:0" + +if not torch.cuda.is_available(): + if has_mps(): + print("Using MPS") + device = "mps" + else: + print("Using CPU") + device = "cpu" + +device = torch.device(device) diff --git a/modules/tabs/inference.py b/modules/tabs/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..6c5a1c7f340bf167262d249b06d1f774bd67fe19 --- /dev/null +++ b/modules/tabs/inference.py @@ -0,0 +1,168 @@ +import glob +import os +import traceback + +import gradio as gr + +from modules import models, ui +from modules.ui import Tab + + +def inference_options_ui(show_out_dir=True): + with gr.Row(equal_height=False): + with gr.Column(): + source_audio = gr.Textbox(label="Source Audio") + out_dir = gr.Textbox( + label="Out folder", + visible=show_out_dir, + placeholder=models.AUDIO_OUT_DIR, + ) + with gr.Column(): + transpose = gr.Slider( + minimum=-20, maximum=20, value=0, step=1, label="Transpose" + ) + pitch_extraction_algo = gr.Radio( + choices=["dio", "harvest", "mangio-crepe", "crepe"], + value="crepe", + label="Pitch Extraction Algorithm", + ) + embedding_model = gr.Radio( + choices=["auto", *models.EMBEDDINGS_LIST.keys()], + value="auto", + label="Embedder Model", + ) + embedding_output_layer = gr.Radio( + choices=["auto", "9", "12"], + value="auto", + label="Embedder Output Layer", + ) + with gr.Column(): + auto_load_index = gr.Checkbox(value=False, label="Auto Load Index") + faiss_index_file = gr.Textbox(value="", label="Faiss Index File Path") + retrieval_feature_ratio = gr.Slider( + minimum=0, + maximum=1, + value=1, + step=0.01, + label="Retrieval Feature Ratio", + ) + with gr.Column(): + fo_curve_file = gr.File(label="F0 Curve File") + + return ( + source_audio, + out_dir, + transpose, + embedding_model, + embedding_output_layer, + pitch_extraction_algo, + auto_load_index, + faiss_index_file, + retrieval_feature_ratio, + fo_curve_file, + ) + + +class Inference(Tab): + def title(self): + return "Inference" + + def sort(self): + return 1 + + def ui(self, outlet): + def infer( + sid, + input_audio, + out_dir, + embedder_model, + embedding_output_layer, + f0_up_key, + f0_file, + f0_method, + auto_load_index, + faiss_index_file, + index_rate, + ): + model = models.vc_model + try: + yield "Infering...", None + if out_dir == "": + out_dir = models.AUDIO_OUT_DIR + + if "*" in input_audio: + assert ( + out_dir is not None + ), "Out folder is required for batch processing" + files = glob.glob(input_audio, recursive=True) + elif os.path.isdir(input_audio): + assert ( + out_dir is not None + ), "Out folder is required for batch processing" + files = glob.glob( + os.path.join(input_audio, "**", "*.wav"), recursive=True + ) + else: + files = [input_audio] + for file in files: + audio = model.single( + sid, + file, + embedder_model, + embedding_output_layer, + f0_up_key, + f0_file, + f0_method, + auto_load_index, + faiss_index_file, + index_rate, + output_dir=out_dir, + ) + yield "Success", (model.tgt_sr, audio) 
if len(files) == 1 else None + except: + yield "Error: " + traceback.format_exc(), None + + with gr.Group(): + with gr.Box(): + with gr.Column(): + _, speaker_id = ui.create_model_list_ui() + + ( + source_audio, + out_dir, + transpose, + embedder_model, + embedding_output_layer, + pitch_extraction_algo, + auto_load_index, + faiss_index_file, + retrieval_feature_ratio, + f0_curve_file, + ) = inference_options_ui() + + with gr.Row(equal_height=False): + with gr.Column(): + status = gr.Textbox(value="", label="Status") + output = gr.Audio(label="Output", interactive=False) + + with gr.Row(): + infer_button = gr.Button("Infer", variant="primary") + + infer_button.click( + infer, + inputs=[ + speaker_id, + source_audio, + out_dir, + embedder_model, + embedding_output_layer, + transpose, + f0_curve_file, + pitch_extraction_algo, + auto_load_index, + faiss_index_file, + retrieval_feature_ratio, + ], + outputs=[status, output], + queue=True, + ) diff --git a/modules/tabs/merge.py b/modules/tabs/merge.py new file mode 100644 index 0000000000000000000000000000000000000000..fff006a168f91a5cb40fd88c2e5a68503c439af1 --- /dev/null +++ b/modules/tabs/merge.py @@ -0,0 +1,365 @@ +import json +import os +from typing import * + +import gradio as gr +import torch + +from modules import models +from modules.merge import merge +from modules.tabs.inference import inference_options_ui +from modules.ui import Tab + +MERGE_METHODS = { + "weight_sum": "Weight sum:A*(1-alpha)+B*alpha", + "add_diff": "Add difference:A+(B-C)*alpha", +} + + +class Merge(Tab): + def title(self): + return "Merge" + + def sort(self): + return 3 + + def ui(self, outlet): + def merge_ckpt(model_a, model_b, model_c, weight_text, alpha, each_key, method): + model_a = model_a if type(model_a) != list and model_a != "" else None + model_b = model_b if type(model_b) != list and model_b != "" else None + model_c = model_c if type(model_c) != list and model_c != "" else None + + if each_key: + weights = json.loads(weight_text) + else: + weights = {} + + method = [k for k, v in MERGE_METHODS.items() if v == method][0] + return merge( + os.path.join(models.MODELS_DIR, "checkpoints", model_a), + os.path.join(models.MODELS_DIR, "checkpoints", model_b), + os.path.join(models.MODELS_DIR, "checkpoints", model_c) + if model_c + else None, + alpha, + weights, + method, + ) + + def merge_and_save( + model_a, model_b, model_c, alpha, each_key, weight_text, method, out_name + ): + print(each_key) + out_path = os.path.join(models.MODELS_DIR, "checkpoints", out_name) + if os.path.exists(out_path): + return "Model name already exists." 
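+ # merge_ckpt maps the UI label back to the method key and merges tensor by tensor:
+ # weight_sum -> A*(1-alpha)+B*alpha, add_diff -> A+(B-C)*alpha, using per-key alphas when "Each key merge" is enabled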
+ merged = merge_ckpt( + model_a, model_b, model_c, weight_text, alpha, each_key, method + ) + if not out_name.endswith(".pth"): + out_name += ".pth" + torch.save(merged, os.path.join(models.MODELS_DIR, "checkpoints", out_name)) + return "Success" + + def merge_and_gen( + model_a, + model_b, + model_c, + alpha, + each_key, + weight_text, + method, + speaker_id, + source_audio, + embedder_name, + embedding_output_layer, + transpose, + fo_curve_file, + pitch_extraction_algo, + auto_load_index, + faiss_index_file, + retrieval_feature_ratio, + ): + merged = merge_ckpt( + model_a, model_b, model_c, weight_text, alpha, each_key, method + ) + model = models.VoiceConvertModel("merge", merged) + audio = model.single( + speaker_id, + source_audio, + embedder_name, + embedding_output_layer, + transpose, + fo_curve_file, + pitch_extraction_algo, + auto_load_index, + faiss_index_file, + retrieval_feature_ratio, + ) + tgt_sr = model.tgt_sr + del merged + del model + torch.cuda.empty_cache() + return "Success", (tgt_sr, audio) + + def reload_model(): + model_list = models.get_models() + return ( + gr.Dropdown.update(choices=model_list), + gr.Dropdown.update(choices=model_list), + gr.Dropdown.update(choices=model_list), + ) + + def update_speaker_ids(model): + if model == "": + return gr.Slider.update( + maximum=0, + visible=False, + ) + model = torch.load( + os.path.join(models.MODELS_DIR, "checkpoints", model), + map_location="cpu", + ) + vc_model = models.VoiceConvertModel("merge", model) + max = vc_model.n_spk + del model + del vc_model + return gr.Slider.update( + maximum=max, + visible=True, + ) + + with gr.Group(): + with gr.Column(): + with gr.Row(equal_height=False): + model_a = gr.Dropdown(choices=models.get_models(), label="Model A") + model_b = gr.Dropdown(choices=models.get_models(), label="Model B") + model_c = gr.Dropdown(choices=models.get_models(), label="Model C") + reload_model_button = gr.Button("♻️") + reload_model_button.click( + reload_model, outputs=[model_a, model_b, model_c] + ) + with gr.Row(equal_height=False): + method = gr.Radio( + label="Merge method", + choices=list(MERGE_METHODS.values()), + value="Weight sum:A*(1-alpha)+B*alpha", + ) + output_name = gr.Textbox(label="Output name") + each_key = gr.Checkbox(label="Each key merge") + with gr.Row(equal_height=False): + base_alpha = gr.Slider( + label="Base alpha", minimum=0, maximum=1, value=0.5, step=0.01 + ) + + default_weights = {} + weights = {} + + def create_weight_ui(name: str, *keys_list: List[List[str]]): + with gr.Accordion(label=name, open=False): + with gr.Row(equal_height=False): + for keys in keys_list: + with gr.Column(): + for key in keys: + default_weights[key] = 0.5 + weights[key] = gr.Slider( + label=key, + minimum=0, + maximum=1, + step=0.01, + value=0.5, + ) + + with gr.Box(visible=False) as each_key_ui: + with gr.Column(): + create_weight_ui( + "enc_p", + [ + "enc_p.encoder.attn_layers.0", + "enc_p.encoder.attn_layers.1", + "enc_p.encoder.attn_layers.2", + "enc_p.encoder.attn_layers.3", + "enc_p.encoder.attn_layers.4", + "enc_p.encoder.attn_layers.5", + "enc_p.encoder.norm_layers_1.0", + "enc_p.encoder.norm_layers_1.1", + "enc_p.encoder.norm_layers_1.2", + "enc_p.encoder.norm_layers_1.3", + "enc_p.encoder.norm_layers_1.4", + "enc_p.encoder.norm_layers_1.5", + ], + [ + "enc_p.encoder.ffn_layers.0", + "enc_p.encoder.ffn_layers.1", + "enc_p.encoder.ffn_layers.2", + "enc_p.encoder.ffn_layers.3", + "enc_p.encoder.ffn_layers.4", + "enc_p.encoder.ffn_layers.5", + "enc_p.encoder.norm_layers_2.0", + 
"enc_p.encoder.norm_layers_2.1", + "enc_p.encoder.norm_layers_2.2", + "enc_p.encoder.norm_layers_2.3", + "enc_p.encoder.norm_layers_2.4", + "enc_p.encoder.norm_layers_2.5", + ], + [ + "enc_p.emb_phone", + "enc_p.emb_pitch", + ], + ) + + create_weight_ui( + "dec", + [ + "dec.noise_convs.0", + "dec.noise_convs.1", + "dec.noise_convs.2", + "dec.noise_convs.3", + "dec.noise_convs.4", + "dec.noise_convs.5", + "dec.ups.0", + "dec.ups.1", + "dec.ups.2", + "dec.ups.3", + ], + [ + "dec.resblocks.0", + "dec.resblocks.1", + "dec.resblocks.2", + "dec.resblocks.3", + "dec.resblocks.4", + "dec.resblocks.5", + "dec.resblocks.6", + "dec.resblocks.7", + "dec.resblocks.8", + "dec.resblocks.9", + "dec.resblocks.10", + "dec.resblocks.11", + ], + [ + "dec.m_source.l_linear", + "dec.conv_pre", + "dec.conv_post", + "dec.cond", + ], + ) + + create_weight_ui( + "flow", + [ + "flow.flows.0", + "flow.flows.1", + "flow.flows.2", + "flow.flows.3", + "flow.flows.4", + "flow.flows.5", + "flow.flows.6", + "emb_g.weight", + ], + ) + + with gr.Accordion(label="JSON", open=False): + weights_text = gr.TextArea( + value=json.dumps(default_weights), + ) + + with gr.Accordion(label="Inference options", open=False): + with gr.Row(equal_height=False): + speaker_id = gr.Slider( + minimum=0, + maximum=2333, + step=1, + label="Speaker ID", + value=0, + visible=True, + interactive=True, + ) + ( + source_audio, + _, + transpose, + embedder_name, + embedding_output_layer, + pitch_extraction_algo, + auto_load_index, + faiss_index_file, + retrieval_feature_ratio, + fo_curve_file, + ) = inference_options_ui(show_out_dir=False) + + with gr.Row(equal_height=False): + with gr.Column(): + status = gr.Textbox(value="", label="Status") + audio_output = gr.Audio(label="Output", interactive=False) + + with gr.Row(equal_height=False): + merge_and_save_button = gr.Button( + "Merge and save", variant="primary" + ) + merge_and_gen_button = gr.Button("Merge and gen", variant="primary") + + def each_key_on_change(each_key): + return gr.update(visible=each_key) + + each_key.change( + fn=each_key_on_change, + inputs=[each_key], + outputs=[each_key_ui], + ) + + def update_weights_text(data): + d = {} + for key in weights.keys(): + d[key] = data[weights[key]] + return json.dumps(d) + + for w in weights.values(): + w.change( + fn=update_weights_text, + inputs={*weights.values()}, + outputs=[weights_text], + ) + + merge_data = [ + model_a, + model_b, + model_c, + base_alpha, + each_key, + weights_text, + method, + ] + + inference_opts = [ + speaker_id, + source_audio, + embedder_name, + embedding_output_layer, + transpose, + fo_curve_file, + pitch_extraction_algo, + auto_load_index, + faiss_index_file, + retrieval_feature_ratio, + ] + + merge_and_save_button.click( + fn=merge_and_save, + inputs=[ + *merge_data, + output_name, + ], + outputs=[status], + ) + merge_and_gen_button.click( + fn=merge_and_gen, + inputs=[ + *merge_data, + *inference_opts, + ], + outputs=[status, audio_output], + ) + + model_a.change( + update_speaker_ids, inputs=[model_a], outputs=[speaker_id] + ) diff --git a/modules/tabs/server.py b/modules/tabs/server.py new file mode 100644 index 0000000000000000000000000000000000000000..462059d4ca755d7ba2ec5082c6f329c691ee0870 --- /dev/null +++ b/modules/tabs/server.py @@ -0,0 +1,159 @@ +import io +import json + +import gradio as gr +import requests +import soundfile as sf +import torch.multiprocessing as multiprocessing +from scipy.io.wavfile import write + +from modules.ui import Tab +from server import app + +proc = None + +def 
server_options_ui(show_out_dir=True): + with gr.Row().style(equal_height=False): + with gr.Row(): + host = gr.Textbox(value="127.0.0.1", label="host") + port = gr.Textbox(value="5001", label="port") + with gr.Row().style(equal_height=False): + with gr.Row(): + rvc_model_file = gr.Textbox(value="", label="RVC model file path") + faiss_index_file = gr.Textbox(value="", label="Faiss index file path") + with gr.Row().style(equal_height=False): + with gr.Row(): + input_voice_file = gr.Textbox(value="", label="input voice file path") + speaker_id = gr.Number( + value=0, + label="speaker_id", + ) + transpose = gr.Slider( + minimum=-20, maximum=20, value=0, step=1, label="transpose" + ) + pitch_extraction_algo = gr.Radio( + choices=["dio", "harvest", "mangio-crepe", "crepe"], + value="crepe", + label="pitch_extraction_algo", + ) + retrieval_feature_ratio = gr.Slider( + minimum=0, + maximum=1, + value=1, + step=0.01, + label="retrieval_feature_ratio", + ) + return ( + host, + port, + rvc_model_file, + faiss_index_file, + input_voice_file, + speaker_id, + transpose, + pitch_extraction_algo, + retrieval_feature_ratio, + ) + +def run(**kwargs): + app.run(**kwargs) + +class Server(Tab): + def title(self): + return "Server(experimental)" + + def sort(self): + return 6 + + def ui(self, outlet): + def start(host, port): + if multiprocessing.get_start_method() == 'fork': + multiprocessing.set_start_method('spawn', force=True) + proc = multiprocessing.Process(target = run, kwargs = {'host': host, 'port': port}) + proc.start() + yield "start server" + + def upload(host, port, rvc_model_file, faiss_index_file): + file_names = {"rvc_model_file": rvc_model_file, "faiss_index_file": faiss_index_file} + res = requests.post(f"http://{host}:{port}/upload_model", json=file_names) + yield res.text + + def convert(host, port, input_voice_file, speaker_id, transpose, pitch_extraction_algo, retrieval_feature_ratio): + params = { + "speaker_id": speaker_id, + "transpose": transpose, + "pitch_extraction_algo": pitch_extraction_algo, + "retrieval_feature_ratio": retrieval_feature_ratio + } + + audio, sr = sf.read(input_voice_file) + audio_buffer = io.BytesIO() + write(audio_buffer, rate=sr, data=audio) + json_buffer = io.BytesIO(json.dumps(params).encode('utf-8')) + files = { + "input_wav": audio_buffer, + "params": json_buffer + } + res = requests.post(f"http://{host}:{port}/convert_sound", files=files) + audio, sr = sf.read(io.BytesIO(res.content)) + yield "convert succeed", (sr, audio) + + with gr.Group(): + with gr.Box(): + with gr.Column(): + ( + host, + port, + rvc_model_file, + faiss_index_file, + input_voice_file, + speaker_id, + transpose, + pitch_extraction_algo, + retrieval_feature_ratio, + ) = server_options_ui() + + with gr.Row().style(equal_height=False): + with gr.Column(): + status = gr.Textbox(value="", label="Status") + output = gr.Audio(label="Output", interactive=False) + + with gr.Row(): + start_button = gr.Button("Start server", variant="primary") + upload_button = gr.Button("Upload Model") + convert_button = gr.Button("Convert Voice") + + start_button.click( + start, + inputs=[ + host, + port + ], + outputs=[status], + queue=True, + ) + upload_button.click( + upload, + inputs=[ + host, + port, + rvc_model_file, + faiss_index_file + ], + outputs=[status], + queue=True, + ) + convert_button.click( + convert, + inputs=[ + host, + port, + input_voice_file, + speaker_id, + transpose, + pitch_extraction_algo, + retrieval_feature_ratio + ], + outputs=[status, output], + queue=True, + ) diff --git 
a/modules/tabs/split.py b/modules/tabs/split.py new file mode 100644 index 0000000000000000000000000000000000000000..830bde4c14e0ff52d94fceaeb8ceea38bb9f1c6a --- /dev/null +++ b/modules/tabs/split.py @@ -0,0 +1,79 @@ +import gradio as gr + +from modules.separate import separate_audio +from modules.ui import Tab + + +class Split(Tab): + def title(self): + return "Split Audio" + + def sort(self): + return 5 + + def ui(self, outlet): + def separate( + input_audio, + output_dir, + silence_thresh, + min_silence_len, + keep_silence, + margin, + padding, + min, + max, + ): + min = None if min == 0 else min + max = None if max == 0 else max + separate_audio( + input_audio, + output_dir, + int(silence_thresh), + int(min_silence_len), + int(keep_silence), + int(margin), + padding, + int(min), + int(max), + ) + return "Success" + + with gr.Group(): + with gr.Column(): + with gr.Row(equal_height=False): + input_audio = gr.Textbox(label="Input Audio (File or Directory)") + output_dir = gr.Textbox(label="Output Directory") + + with gr.Row(equal_height=False): + silence_thresh = gr.Number(value=-40, label="Silence Threshold") + min_silence_len = gr.Number( + value=750, label="Minimum Silence Length" + ) + keep_silence = gr.Number(value=750, label="Keep Silence") + margin = gr.Number(value=0, label="Margin") + padding = gr.Checkbox(value=True, label="Padding") + + with gr.Row(equal_height=False): + min = gr.Number(value=1000, label="Minimum audio length") + max = gr.Number(value=5000, label="Maximum audio length") + + with gr.Row(equal_height=False): + status = gr.Textbox(value="", label="Status") + with gr.Row(equal_height=False): + separate_button = gr.Button("Separate", variant="primary") + + separate_button.click( + separate, + inputs=[ + input_audio, + output_dir, + silence_thresh, + min_silence_len, + keep_silence, + margin, + padding, + min, + max, + ], + outputs=[status], + ) diff --git a/modules/tabs/training.py b/modules/tabs/training.py new file mode 100644 index 0000000000000000000000000000000000000000..787939fbc955b501045a765742b459beb1bbaedf --- /dev/null +++ b/modules/tabs/training.py @@ -0,0 +1,496 @@ +import math +import os +import shutil +from multiprocessing import cpu_count + +import gradio as gr + +from lib.rvc.preprocessing import extract_f0, extract_feature, split +from lib.rvc.train import create_dataset_meta, glob_dataset, train_index, train_model +from modules import models, utils +from modules.shared import MODELS_DIR, device, half_support +from modules.ui import Tab + +SR_DICT = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class Training(Tab): + def title(self): + return "Training" + + def sort(self): + return 2 + + def ui(self, outlet): + def train_index_only( + model_name, + target_sr, + f0, + dataset_glob, + recursive, + multiple_speakers, + speaker_id, + gpu_id, + num_cpu_process, + norm_audio_when_preprocess, + pitch_extraction_algo, + run_train_index, + reduce_index_size, + maximum_index_size, + embedder_name, + embedding_channels, + embedding_output_layer, + ignore_cache, + ): + maximum_index_size = int(maximum_index_size) + f0 = f0 == "Yes" + norm_audio_when_preprocess = norm_audio_when_preprocess == "Yes" + run_train_index = run_train_index == "Yes" + reduce_index_size = reduce_index_size == "Yes" + training_dir = os.path.join(MODELS_DIR, "training", "models", model_name) + gpu_ids = [int(x.strip()) for x in gpu_id.split(",")] if gpu_id else [] + yield f"Training directory: {training_dir}" + + if os.path.exists(training_dir) and ignore_cache: + 
shutil.rmtree(training_dir) + + os.makedirs(training_dir, exist_ok=True) + + datasets = glob_dataset( + dataset_glob, + speaker_id, + multiple_speakers=multiple_speakers, + recursive=recursive, + ) + + if len(datasets) == 0: + raise Exception("No audio files found") + + yield "Preprocessing..." + split.preprocess_audio( + datasets, + SR_DICT[target_sr], + num_cpu_process, + training_dir, + norm_audio_when_preprocess, + os.path.join( + MODELS_DIR, + "training", + "mute", + "0_gt_wavs", + f"mute{target_sr}.wav", + ), + ) + + if f0: + yield "Extracting f0..." + extract_f0.run(training_dir, num_cpu_process, pitch_extraction_algo) + + yield "Extracting features..." + + embedder_filepath, _, embedder_load_from = models.get_embedder( + embedder_name + ) + + if embedder_load_from == "local": + embedder_filepath = os.path.join( + MODELS_DIR, "embeddings", embedder_filepath + ) + + extract_feature.run( + training_dir, + embedder_filepath, + embedder_load_from, + int(embedding_channels), + int(embedding_output_layer), + gpu_ids, + ) + + out_dir = os.path.join(MODELS_DIR, "checkpoints") + + yield "Training index..." + if run_train_index: + if not reduce_index_size: + maximum_index_size = None + train_index( + training_dir, + model_name, + out_dir, + int(embedding_channels), + num_cpu_process, + maximum_index_size, + ) + + yield "Training complete" + + def train_all( + model_name, + version, + sampling_rate_str, + f0, + dataset_glob, + recursive, + multiple_speakers, + speaker_id, + gpu_id, + num_cpu_process, + norm_audio_when_preprocess, + pitch_extraction_algo, + batch_size, + augment, + augment_from_pretrain, + augment_path, + speaker_info_path, + cache_batch, + num_epochs, + save_every_epoch, + save_wav_with_checkpoint, + fp16, + save_only_last, + pre_trained_bottom_model_g, + pre_trained_bottom_model_d, + run_train_index, + reduce_index_size, + maximum_index_size, + embedder_name, + embedding_channels, + embedding_output_layer, + ignore_cache, + ): + batch_size = int(batch_size) + num_epochs = int(num_epochs) + maximum_index_size = int(maximum_index_size) + f0 = f0 == "Yes" + norm_audio_when_preprocess = norm_audio_when_preprocess == "Yes" + run_train_index = run_train_index == "Yes" + reduce_index_size = reduce_index_size == "Yes" + training_dir = os.path.join(MODELS_DIR, "training", "models", model_name) + gpu_ids = [int(x.strip()) for x in gpu_id.split(",")] if gpu_id else [] + + if os.path.exists(training_dir) and ignore_cache: + shutil.rmtree(training_dir) + + os.makedirs(training_dir, exist_ok=True) + + yield f"Training directory: {training_dir}" + + datasets = glob_dataset( + dataset_glob, + speaker_id, + multiple_speakers=multiple_speakers, + recursive=recursive, + training_dir=training_dir, + ) + + if len(datasets) == 0: + raise Exception("No audio files found") + + yield "Preprocessing..." + split.preprocess_audio( + datasets, + SR_DICT[sampling_rate_str], + num_cpu_process, + training_dir, + norm_audio_when_preprocess, + os.path.join( + MODELS_DIR, + "training", + "mute", + "0_gt_wavs", + f"mute{sampling_rate_str}.wav", + ), + ) + + if f0: + yield "Extracting f0..." + extract_f0.run(training_dir, num_cpu_process, pitch_extraction_algo) + + yield "Extracting features..." 
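+ # resolve the selected phone embedder (e.g. contentvec -> checkpoint_best_legacy_500.pt)
+ # to a local checkpoint path, then extract features from the preprocessed audio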
+ + embedder_filepath, _, embedder_load_from = models.get_embedder( + embedder_name + ) + + if embedder_load_from == "local": + embedder_filepath = os.path.join( + MODELS_DIR, "embeddings", embedder_filepath + ) + + extract_feature.run( + training_dir, + embedder_filepath, + embedder_load_from, + int(embedding_channels), + int(embedding_output_layer), + gpu_ids, + None if len(gpu_ids) > 1 else device, + ) + + create_dataset_meta(training_dir, f0) + + yield "Training model..." + + print(f"train_all: emb_name: {embedder_name}") + + config = utils.load_config( + version, training_dir, sampling_rate_str, embedding_channels, fp16 + ) + out_dir = os.path.join(MODELS_DIR, "checkpoints") + + if not augment_from_pretrain: + augment_path = None + speaker_info_path = None + + train_model( + gpu_ids, + config, + training_dir, + model_name, + out_dir, + sampling_rate_str, + f0, + batch_size, + augment, + augment_path, + speaker_info_path, + cache_batch, + num_epochs, + save_every_epoch, + save_wav_with_checkpoint, + pre_trained_bottom_model_g, + pre_trained_bottom_model_d, + embedder_name, + int(embedding_output_layer), + save_only_last, + None if len(gpu_ids) > 1 else device, + ) + + yield "Training index..." + if run_train_index: + if not reduce_index_size: + maximum_index_size = None + train_index( + training_dir, + model_name, + out_dir, + int(embedding_channels), + num_cpu_process, + maximum_index_size, + ) + + yield "Training completed" + + with gr.Group(): + with gr.Box(): + with gr.Column(): + with gr.Row(): + with gr.Column(): + model_name = gr.Textbox(label="Model Name") + ignore_cache = gr.Checkbox(label="Ignore cache") + with gr.Column(): + dataset_glob = gr.Textbox( + label="Dataset glob", placeholder="data/**/*.wav" + ) + recursive = gr.Checkbox(label="Recursive", value=True) + multiple_speakers = gr.Checkbox( + label="Multiple speakers", value=False + ) + speaker_id = gr.Slider( + maximum=4, + minimum=0, + value=0, + step=1, + label="Speaker ID", + ) + + with gr.Row(equal_height=False): + version = gr.Radio( + choices=["v1", "v2"], + value="v2", + label="Model version", + ) + target_sr = gr.Radio( + choices=["32k", "40k", "48k"], + value="40k", + label="Target sampling rate", + ) + f0 = gr.Radio( + choices=["Yes", "No"], + value="Yes", + label="f0 Model", + ) + with gr.Row(equal_height=False): + embedding_name = gr.Radio( + choices=list(models.EMBEDDINGS_LIST.keys()), + value="contentvec", + label="Using phone embedder", + ) + embedding_channels = gr.Radio( + choices=["256", "768"], + value="768", + label="Embedding channels", + ) + embedding_output_layer = gr.Radio( + choices=["9", "12"], + value="12", + label="Embedding output layer", + ) + with gr.Row(equal_height=False): + gpu_id = gr.Textbox( + label="GPU ID", + value=", ".join([f"{x.index}" for x in utils.get_gpus()]), + ) + num_cpu_process = gr.Slider( + minimum=0, + maximum=cpu_count(), + step=1, + value=math.ceil(cpu_count() / 2), + label="Number of CPU processes", + ) + norm_audio_when_preprocess = gr.Radio( + choices=["Yes", "No"], + value="Yes", + label="Normalize audio volume when preprocess", + ) + pitch_extraction_algo = gr.Radio( + choices=["dio", "harvest", "mangio-crepe", "crepe"], + value="crepe", + label="Pitch extraction algorithm", + ) + with gr.Row(equal_height=False): + batch_size = gr.Number(value=4, label="Batch size") + num_epochs = gr.Number( + value=30, + label="Number of epochs", + ) + save_every_epoch = gr.Slider( + minimum=0, + maximum=100, + value=10, + step=1, + label="Save every epoch", + ) + 
save_wav_with_checkpoint = gr.Checkbox( + label="save_wav_with_checkpoint", value=False + ) + cache_batch = gr.Checkbox(label="Cache batch", value=True) + fp16 = gr.Checkbox( + label="FP16", value=half_support, disabled=not half_support + ) + save_only_last = gr.Checkbox( + label="Save only the latest G and D files", value=False + ) + with gr.Row(equal_height=False): + augment = gr.Checkbox(label="Augment", value=False) + augment_from_pretrain = gr.Checkbox( + label="Augment From Pretrain", value=False + ) + augment_path = gr.Textbox( + label="Pre trained generator path (pth)", + value="file is not prepared", + ) + speaker_info_path = gr.Textbox( + label="speaker info path (npy)", + value="file is not prepared", + ) + with gr.Row(equal_height=False): + pre_trained_generator = gr.Textbox( + label="Pre trained generator path", + value=os.path.join( + MODELS_DIR, "pretrained", "v2", "f0G40k.pth" + ), + ) + pre_trained_discriminator = gr.Textbox( + label="Pre trained discriminator path", + value=os.path.join( + MODELS_DIR, "pretrained", "v2", "f0D40k.pth" + ), + ) + with gr.Row(equal_height=False): + run_train_index = gr.Radio( + choices=["Yes", "No"], + value="Yes", + label="Train Index", + ) + reduce_index_size = gr.Radio( + choices=["Yes", "No"], + value="No", + label="Reduce index size with kmeans", + ) + maximum_index_size = gr.Number( + value=10000, label="maximum index size" + ) + + with gr.Row(equal_height=False): + status = gr.Textbox(value="", label="Status") + with gr.Row(equal_height=False): + train_index_button = gr.Button("Train Index", variant="primary") + train_all_button = gr.Button("Train", variant="primary") + + train_index_button.click( + train_index_only, + inputs=[ + model_name, + target_sr, + f0, + dataset_glob, + recursive, + multiple_speakers, + speaker_id, + gpu_id, + num_cpu_process, + norm_audio_when_preprocess, + pitch_extraction_algo, + run_train_index, + reduce_index_size, + maximum_index_size, + embedding_name, + embedding_channels, + embedding_output_layer, + ignore_cache, + ], + outputs=[status], + ) + + train_all_button.click( + train_all, + inputs=[ + model_name, + version, + target_sr, + f0, + dataset_glob, + recursive, + multiple_speakers, + speaker_id, + gpu_id, + num_cpu_process, + norm_audio_when_preprocess, + pitch_extraction_algo, + batch_size, + augment, + augment_from_pretrain, + augment_path, + speaker_info_path, + cache_batch, + num_epochs, + save_every_epoch, + save_wav_with_checkpoint, + fp16, + save_only_last, + pre_trained_generator, + pre_trained_discriminator, + run_train_index, + reduce_index_size, + maximum_index_size, + embedding_name, + embedding_channels, + embedding_output_layer, + ignore_cache, + ], + outputs=[status], + ) diff --git a/modules/ui.py b/modules/ui.py new file mode 100644 index 0000000000000000000000000000000000000000..8b25bca9089ce2a9ea9dd88935b5f40a57ef69d3 --- /dev/null +++ b/modules/ui.py @@ -0,0 +1,198 @@ +import importlib +import os +from typing import * + +import gradio as gr +import gradio.routes +import torch + +from . 
import models, shared +from .core import preload +from .shared import ROOT_DIR + + +class Tab: + TABS_DIR = os.path.join(ROOT_DIR, "modules", "tabs") + + def __init__(self, filepath: str) -> None: + self.filepath = filepath + + def sort(self): + return 1 + + def title(self): + return "" + + def ui(self, outlet: Callable): + pass + + def __call__(self): + children_dir = self.filepath[:-3] + children = [] + + if os.path.isdir(children_dir): + for file in os.listdir(children_dir): + if not file.endswith(".py"): + continue + module_name = file[:-3] + parent = os.path.relpath(Tab.TABS_DIR, Tab.TABS_DIR).replace("/", ".") + + if parent.startswith("."): + parent = parent[1:] + if parent.endswith("."): + parent = parent[:-1] + + children.append( + importlib.import_module(f"modules.tabs.{parent}.{module_name}") + ) + + children = sorted(children, key=lambda x: x.sort()) + + tabs = [] + + for child in children: + attrs = child.__dict__ + tab = [x for x in attrs.values() if issubclass(x, Tab)] + if len(tab) > 0: + tabs.append(tab[0]) + + def outlet(): + with gr.Tabs(): + for tab in tabs: + with gr.Tab(tab.title()): + tab() + + return self.ui(outlet) + + +def load_tabs() -> List[Tab]: + tabs = [] + files = os.listdir(os.path.join(ROOT_DIR, "modules", "tabs")) + + for file in files: + if not file.endswith(".py"): + continue + module_name = file[:-3] + module = importlib.import_module(f"modules.tabs.{module_name}") + attrs = module.__dict__ + TabClass = [ + x + for x in attrs.values() + if type(x) == type and issubclass(x, Tab) and not x == Tab + ] + if len(TabClass) > 0: + tabs.append((file, TabClass[0])) + + tabs = sorted([TabClass(file) for file, TabClass in tabs], key=lambda x: x.sort()) + return tabs + + +def webpath(fn): + if fn.startswith(ROOT_DIR): + web_path = os.path.relpath(fn, ROOT_DIR).replace("\\", "/") + else: + web_path = os.path.abspath(fn) + + return f"file={web_path}?{os.path.getmtime(fn)}" + + +def javascript_html(): + script_js = os.path.join(ROOT_DIR, "script.js") + head = f'<script type="text/javascript" src="{webpath(script_js)}"></script>\n' + + return head + + +def css_html(): + return f'<link rel="stylesheet" href="{webpath(os.path.join(ROOT_DIR, "styles.css"))}">' + + +def create_head(): + head = "" + head += css_html() + head += javascript_html() + + def template_response(*args, **kwargs): + res = shared.gradio_template_response_original(*args, **kwargs) + res.body = res.body.replace(b"</head>", f"{head}</head>".encode("utf8")) + res.init_headers() + return res + + gradio.routes.templates.TemplateResponse = template_response + + +def create_ui(): + preload() + block = gr.Blocks() + + with block: + with gr.Tabs(): + tabs = load_tabs() + for tab in tabs: + with gr.Tab(tab.title()): + tab() + + create_head() + + return block + + +def create_model_list_ui(speaker_id: bool = True, load: bool = True): + speaker_id_info = { + "visible": False, + "maximum": 10000, + } + + def reload_model(raw=False): + model_list = models.get_models() + if len(model_list) > 0: + models.load_model(model_list[0]) + + if models.vc_model is not None: + speaker_id_info["visible"] = True + speaker_id_info["maximum"] = models.vc_model.n_spk + + return model_list if raw else gr.Dropdown.update(choices=model_list) + + model_list = reload_model(raw=True) + + def load_model(model_name): + if load: + models.load_model(model_name) + speaker_id_info["visible"] = True + speaker_id_info["maximum"] = models.vc_model.n_spk + else: + model = models.get_vc_model(model_name) + speaker_id_info["visible"] = True + speaker_id_info["maximum"] = model.n_spk + del model + torch.cuda.empty_cache() + return gr.Slider.update( + maximum=speaker_id_info["maximum"],
visible=speaker_id_info["visible"] + ) + + with gr.Row(equal_height=False): + model = gr.Dropdown( + choices=model_list, + label="Model", + value=model_list[0] if len(model_list) > 0 else None, + ) + speaker_id = gr.Slider( + minimum=0, + maximum=speaker_id_info["maximum"], + step=1, + label="Speaker ID", + value=0, + visible=speaker_id and speaker_id_info["visible"], + interactive=True, + ) + reload_model_button = gr.Button("♻️") + + model.change(load_model, inputs=[model], outputs=[speaker_id]) + reload_model_button.click(reload_model, outputs=[model]) + + return model, speaker_id + + +if not hasattr(shared, "gradio_template_response_original"): + shared.gradio_template_response_original = gradio.routes.templates.TemplateResponse diff --git a/modules/utils.py b/modules/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..126728ac9065012fa8184ee210c2dafa83a51dce --- /dev/null +++ b/modules/utils.py @@ -0,0 +1,83 @@ +import os +from typing import * + +import ffmpeg +import numpy as np +import requests +import torch +from tqdm import tqdm + +from lib.rvc.config import TrainConfig +from modules.shared import ROOT_DIR + + +def load_audio(file: str, sr): + try: + # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 + # This launches a subprocess to decode audio while down-mixing and resampling as necessary. + # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. + file = ( + file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # Prevent small white copy path head and tail with spaces and " and return + out, _ = ( + ffmpeg.input(file, threads=0) + .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) + .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) + ) + except Exception as e: + raise RuntimeError(f"Failed to load audio: {e}") + + return np.frombuffer(out, np.float32).flatten() + + +def get_gpus(): + num_gpus = torch.cuda.device_count() + return [torch.device(f"cuda:{i}") for i in range(num_gpus)] + + +def download_file(url: str, out: str, position: int = 0, show: bool = True): + req = requests.get(url, stream=True, allow_redirects=True) + content_length = req.headers.get("content-length") + if show: + progress_bar = tqdm( + total=int(content_length) if content_length is not None else None, + leave=False, + unit="B", + unit_scale=True, + unit_divisor=1024, + position=position, + ) + + # with tqdm + with open(out, "wb") as f: + for chunk in req.iter_content(chunk_size=1024): + if chunk: + if show: + progress_bar.update(len(chunk)) + f.write(chunk) + + +def load_config( + version: Literal["v1", "v2"], + training_dir: str, + sample_rate: str, + emb_channels: int, + fp16: bool, +): + if emb_channels == 256: + config_path = os.path.join(ROOT_DIR, "configs", f"{sample_rate}.json") + else: + config_path = os.path.join( + ROOT_DIR, "configs", f"{sample_rate}-{emb_channels}.json" + ) + + config = TrainConfig.parse_file(config_path) + config.version = version + config.train.fp16_run = fp16 + + config_save_path = os.path.join(training_dir, "config.json") + + with open(config_save_path, "w") as f: + f.write(config.json()) + + return config diff --git a/outputs/.gitignore b/outputs/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d6b7ef32c8478a48c3994dcadc86837f4371184d --- /dev/null +++ b/outputs/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 
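The `Tab` base class and `load_tabs()` in `modules/ui.py` above define the plugin convention for this UI: every `modules/tabs/*.py` file is imported, its first `Tab` subclass is instantiated, tabs are ordered by `sort()`, and each one is rendered as a `gr.Tab` labelled with `title()`. A minimal sketch of such a tab module (the file name `hello.py` and everything in it are hypothetical, not part of this PR):

```python
# modules/tabs/hello.py (hypothetical example of the tab convention)
import gradio as gr

from modules.ui import Tab


class Hello(Tab):
    def sort(self):
        return 99  # tabs are ordered by this value, lowest first

    def title(self):
        return "Hello"  # label shown on the Gradio tab

    def ui(self, outlet):
        # outlet() renders any child tabs discovered for this tab.
        with gr.Column():
            name = gr.Textbox(label="Name")
            greet = gr.Button("Greet", variant="primary")
            result = gr.Textbox(label="Result")
            greet.click(lambda n: f"Hello, {n}!", inputs=[name], outputs=[result])
```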
0000000000000000000000000000000000000000..e07f847800402b755d06a1e33fa2d079798002f0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +-r requirements/main.txt \ No newline at end of file diff --git a/requirements/dev.txt b/requirements/dev.txt new file mode 100644 index 0000000000000000000000000000000000000000..948887f58de438addc3115f7a2a078856d5d5620 --- /dev/null +++ b/requirements/dev.txt @@ -0,0 +1,4 @@ +# -r main.txt + +black +isort \ No newline at end of file diff --git a/requirements/main.txt b/requirements/main.txt new file mode 100644 index 0000000000000000000000000000000000000000..19aeabc537e0d5bc05fdf6501b1427bcb3d6a1ad --- /dev/null +++ b/requirements/main.txt @@ -0,0 +1,20 @@ +gradio==3.36.1 +tqdm==4.65.0 +numpy==1.23.5 +faiss-cpu==1.7.3 +fairseq==0.12.2 +matplotlib==3.7.1 +scipy==1.9.3 +librosa==0.9.1 +pyworld==0.3.2 +soundfile==0.12.1 +ffmpeg-python==0.2.0 +pydub==0.25.1 +soxr==0.3.5 +transformers==4.28.1 +torchcrepe==0.0.20 +Flask==2.3.2 + +tensorboard +tensorboardX +requests \ No newline at end of file diff --git a/script.js b/script.js new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/server.py b/server.py new file mode 100644 index 0000000000000000000000000000000000000000..3e2e5c81cc3a65808f6c07a76cbac6cbf99b9e4f --- /dev/null +++ b/server.py @@ -0,0 +1,89 @@ +import io +import json +import os +import traceback +from typing import * + +import soundfile as sf +from flask import Flask, make_response, request, send_file +from scipy.io.wavfile import write + +from modules.server.model import VoiceServerModel + +model: Optional[VoiceServerModel] = None +app = Flask(__name__) + +@app.route('/ping') +def ping(): + return make_response("server is alive", 200) + +@app.route('/upload_model', methods=['POST']) +def upload_model(): + """ + input: + json: + rvc_model_file: str + specify rvc model's absolute path (.pt, .pth) + faiss_index_file: Optional[str] + specify faiss index'S absolute path (.index) + """ + global model + if request.method == "POST": + rvc_model_file = request.json["rvc_model_file"] + faiss_index_file =request.json["faiss_index_file"] if "faiss_index_file" in request.json else "" + try: + model = VoiceServerModel(rvc_model_file, faiss_index_file) + return make_response("model is load", 200) + except: + traceback.print_exc() + return make_response("model load error", 400) + else: + return make_response("use post method", 400) + +@app.route('/convert_sound', methods=['POST']) +def convert_sound(): + """ + input: + params: json + speaker_id: int + default: 0 + transpose: int + default: 0 + pitch_extraction_algo: str + default: dio + value: ["dio", "harvest", "mangio-crepe", "crepe"] + retrieval_feature_ratio: float + default: 0 + value: 0. ~ 1. 
+ input_wav: wav file + + output: + wavfile + """ + global model + if model is None: + return make_response("please upload model", 400) + print("start") + if request.method == "POST": + input_buffer = io.BytesIO(request.files["input_wav"].stream.read()) + audio, sr = sf.read(input_buffer) + + req_json = json.load(io.BytesIO(request.files["params"].stream.read())) + sid = int(req_json.get("speaker_id", 0)) + transpose = int(req_json.get("transpose", 0)) + pitch_extraction_algo = req_json.get("pitch_extraction_algo", "dio") + if not pitch_extraction_algo in ["dio", "harvest", "mangio-crepe", "crepe"]: + return make_response("bad pitch extraction algo", 400) + retrieval_feature_ratio = float(req_json.get("retrieval_feature_ratio", 0.)) + + out_audio = model(audio, sr, sid, transpose, pitch_extraction_algo, retrieval_feature_ratio) + output_buffer = io.BytesIO() + write(output_buffer, rate=model.tgt_sr, data=out_audio) + output_buffer.seek(0) + response = make_response(send_file(output_buffer, mimetype="audio/wav"), 200) + return response + else: + return make_response("use post method", 400) + +if __name__ == "__main__": + app.run() \ No newline at end of file diff --git a/styles.css b/styles.css new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/update.bat b/update.bat new file mode 100644 index 0000000000000000000000000000000000000000..eca8c5ef2a1d69d50afac4a4a5f27129d196135c --- /dev/null +++ b/update.bat @@ -0,0 +1,13 @@ +@echo off + +if exist ".git" ( + git fetch --prune + git reset --hard origin/main +) else ( + git init + git remote add origin https://github.com/ddPn08/rvc-webui.git + git fetch --prune + git reset --hard origin/main +) + +pause \ No newline at end of file diff --git a/update.sh b/update.sh new file mode 100644 index 0000000000000000000000000000000000000000..08175cc7475a9521ce72971e2c758330a2eff94c --- /dev/null +++ b/update.sh @@ -0,0 +1,9 @@ +#!/bin/bash +if [ -d .git ]; then + git fetch --prune + git reset --hard origin/main +else + git init + git remote add origin + git fetch --prune + git reset --hard origin/main \ No newline at end of file diff --git a/webui-macos-env.sh b/webui-macos-env.sh new file mode 100644 index 0000000000000000000000000000000000000000..2821aa8cbd0be3522a88060f1ca3a54c409e7bf5 --- /dev/null +++ b/webui-macos-env.sh @@ -0,0 +1,16 @@ +#!/bin/bash +#################################################################### +# macOS defaults # +# Please modify webui-user.sh to change these instead of this file # +#################################################################### + +if [[ -x "$(command -v python3.10)" ]] +then + python_cmd="python3.10" +fi + +export COMMANDLINE_ARGS="" +export TORCH_COMMAND="pip install torch torchvision torchaudio" +export PYTORCH_ENABLE_MPS_FALLBACK=1 + +#################################################################### \ No newline at end of file diff --git a/webui-user.bat b/webui-user.bat new file mode 100644 index 0000000000000000000000000000000000000000..3b1a7704da76fe7d70de2566ac6b7cf1a4d8a1b6 --- /dev/null +++ b/webui-user.bat @@ -0,0 +1,8 @@ +@echo off + +set PYTHON= +set GIT= +set VENV_DIR= +set COMMANDLINE_ARGS= + +call webui.bat \ No newline at end of file diff --git a/webui-user.sh b/webui-user.sh new file mode 100644 index 0000000000000000000000000000000000000000..3061cbb61e174414bb9aadfcdb1beb39dd6de9be --- /dev/null +++ b/webui-user.sh @@ -0,0 +1,27 @@ +#!/bin/bash +######################################################### +# 
Uncomment and change the variables below to your need:# +######################################################### + +# Commandline arguments for webui.py, for example: export COMMANDLINE_ARGS="--medvram --opt-split-attention" +#export COMMANDLINE_ARGS="" + +# python3 executable +#python_cmd="python3" + +# git executable +#export GIT="git" + +# python3 venv without trailing slash (defaults to ${install_dir}/${clone_dir}/venv) +#venv_dir="venv" + +# script to launch to start the app +#export LAUNCH_SCRIPT="launch.py" + +# install command for torch +#export TORCH_COMMAND="pip install torch --extra-index-url https://download.pytorch.org/whl/cu118" + +# Requirements file to use for stable-diffusion-webui +#export REQS_FILE="requirements_versions.txt" + +########################################### \ No newline at end of file diff --git a/webui.bat b/webui.bat new file mode 100644 index 0000000000000000000000000000000000000000..65051b1d3ecc1be8873ddecf0ce0cd9efac05185 --- /dev/null +++ b/webui.bat @@ -0,0 +1,70 @@ +@echo off + +if not defined PYTHON (set PYTHON=python) +if not defined VENV_DIR (set "VENV_DIR=%~dp0%venv") + + +set ERROR_REPORTING=FALSE + +mkdir tmp 2>NUL + +%PYTHON% -c "" >tmp/stdout.txt 2>tmp/stderr.txt +if %ERRORLEVEL% == 0 goto :check_pip +echo Couldn't launch python +goto :show_stdout_stderr + +:check_pip +%PYTHON% -mpip --help >tmp/stdout.txt 2>tmp/stderr.txt +if %ERRORLEVEL% == 0 goto :start_venv +if "%PIP_INSTALLER_LOCATION%" == "" goto :show_stdout_stderr +%PYTHON% "%PIP_INSTALLER_LOCATION%" >tmp/stdout.txt 2>tmp/stderr.txt +if %ERRORLEVEL% == 0 goto :start_venv +echo Couldn't install pip +goto :show_stdout_stderr + +:start_venv +if ["%VENV_DIR%"] == ["-"] goto :launch +if ["%SKIP_VENV%"] == ["1"] goto :launch + +dir "%VENV_DIR%\Scripts\Python.exe" >tmp/stdout.txt 2>tmp/stderr.txt +if %ERRORLEVEL% == 0 goto :activate_venv + +for /f "delims=" %%i in ('CALL %PYTHON% -c "import sys; print(sys.executable)"') do set PYTHON_FULLNAME="%%i" +echo Creating venv in directory %VENV_DIR% using python %PYTHON_FULLNAME% +%PYTHON_FULLNAME% -m venv "%VENV_DIR%" >tmp/stdout.txt 2>tmp/stderr.txt +if %ERRORLEVEL% == 0 goto :activate_venv +echo Unable to create venv in directory "%VENV_DIR%" +goto :show_stdout_stderr + +:activate_venv +set PYTHON="%VENV_DIR%\Scripts\Python.exe" +echo venv %PYTHON% + +:launch +%PYTHON% launch.py %* +pause +exit /b + +:show_stdout_stderr + +echo. +echo exit code: %errorlevel% + +for /f %%i in ("tmp\stdout.txt") do set size=%%~zi +if %size% equ 0 goto :show_stderr +echo. +echo stdout: +type tmp\stdout.txt + +:show_stderr +for /f %%i in ("tmp\stderr.txt") do set size=%%~zi +if %size% equ 0 goto :show_stderr +echo. +echo stderr: +type tmp\stderr.txt + +:endofscript + +echo. +echo Launch unsuccessful. Exiting. 
+pause \ No newline at end of file diff --git a/webui.py b/webui.py new file mode 100644 index 0000000000000000000000000000000000000000..138914ec49f7fe899eb52853af2357ac75f12328 --- /dev/null +++ b/webui.py @@ -0,0 +1,26 @@ +import os + +from modules import cmd_opts, ui + +# Ignore the ".DS_Store" files that somehow keep appearing out of nowhere. +# Not sure this is the right place for code like this, though... +_list_dir = os.listdir + +def listdir4mac(path): + return [file for file in _list_dir(path) if not file.startswith(".")] + +os.listdir = listdir4mac + + +def webui(): + app = ui.create_ui() + app.queue(64) + app, local_url, share_url = app.launch( + server_name=cmd_opts.opts.host, + server_port=cmd_opts.opts.port, + share=cmd_opts.opts.share, + ) + + +if __name__ == "__main__": + webui() diff --git a/webui.sh b/webui.sh new file mode 100644 index 0000000000000000000000000000000000000000..fa8129bd877435052dcd2788ac628fe6b7e79fb8 --- /dev/null +++ b/webui.sh @@ -0,0 +1,125 @@ +#!/usr/bin/env bash +################################################# +# Please do not make any changes to this file, # +# change the variables in webui-user.sh instead # +################################################# + +# If run from macOS, load defaults from webui-macos-env.sh +if [[ "$OSTYPE" == "darwin"* ]]; then + if [[ -f webui-macos-env.sh ]] +then + source ./webui-macos-env.sh + fi +fi + +# Read variables from webui-user.sh +# shellcheck source=/dev/null +if [[ -f webui-user.sh ]] +then + source ./webui-user.sh +fi + +# python3 executable +if [[ -z "${python_cmd}" ]] +then + python_cmd="python3" +fi + +# git executable +if [[ -z "${GIT}" ]] +then + export GIT="git" +fi + +# python3 venv without trailing slash (defaults to ${install_dir}/${clone_dir}/venv) +if [[ -z "${venv_dir}" ]] +then + venv_dir="venv" +fi + +if [[ -z "${LAUNCH_SCRIPT}" ]] +then + LAUNCH_SCRIPT="launch.py" +fi + +# this script cannot be run as root by default +can_run_as_root=0 + +# read any command line flags to the webui.sh script +while getopts "f" flag > /dev/null 2>&1 +do + case ${flag} in + f) can_run_as_root=1;; + *) break;; + esac +done + +# Disable sentry logging +export ERROR_REPORTING=FALSE + +# Do not reinstall existing pip packages on Debian/Ubuntu +export PIP_IGNORE_INSTALLED=0 + +# Pretty print +delimiter="################################################################" + +# Do not run as root +if [[ $(id -u) -eq 0 && can_run_as_root -eq 0 ]] +then + printf "\n%s\n" "${delimiter}" + printf "\e[1m\e[31mERROR: This script must not be launched as root, aborting...\e[0m" + printf "\n%s\n" "${delimiter}" + exit 1 +else + printf "\n%s\n" "${delimiter}" + printf "Running on \e[1m\e[32m%s\e[0m user" "$(whoami)" + printf "\n%s\n" "${delimiter}" +fi + +if echo "$gpu_info" | grep -q "AMD" && [[ -z "${TORCH_COMMAND}" ]] +then + export TORCH_COMMAND="pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/rocm5.2" +fi + +for preq in "${GIT}" "${python_cmd}" +do + if ! hash "${preq}" &>/dev/null + then + printf "\n%s\n" "${delimiter}" + printf "\e[1m\e[31mERROR: %s is not installed, aborting...\e[0m" "${preq}" + printf "\n%s\n" "${delimiter}" + exit 1 + fi +done + +if ! "${python_cmd}" -c "import venv" &>/dev/null +then + printf "\n%s\n" "${delimiter}" + printf "\e[1m\e[31mERROR: python3-venv is not installed, aborting...\e[0m" + printf "\n%s\n" "${delimiter}" + exit 1 +fi + +printf "\n%s\n" "${delimiter}" +printf "Create and activate python venv" +printf "\n%s\n" "${delimiter}" +if [[ !
-d "${venv_dir}" ]] +then + "${python_cmd}" -m venv "${venv_dir}" + first_launch=1 +fi +# shellcheck source=/dev/null +if [[ -f "${venv_dir}"/bin/activate ]] +then + source "${venv_dir}"/bin/activate +else + printf "\n%s\n" "${delimiter}" + printf "\e[1m\e[31mERROR: Cannot activate python venv, aborting...\e[0m" + printf "\n%s\n" "${delimiter}" + exit 1 +fi + +printf "\n%s\n" "${delimiter}" +printf "Launching launch.py..." +printf "\n%s\n" "${delimiter}" +exec "${python_cmd}" "${LAUNCH_SCRIPT}" "$@" \ No newline at end of file