Spaces:

mason369
/

AI-RVC

Running

App Files Files Community

mason369 committed on Mar 15

Commit

4204217

verified ·

1 Parent(s): 2a34648

sync: 同步GitHub最新代码到HF Space

Browse files

Files changed (19) hide show

README.md +6 -6
app.py +25 -28
configs/config.py +260 -0
configs/inuse/v1/32k.json +46 -0
configs/inuse/v1/40k.json +46 -0
configs/inuse/v1/48k.json +46 -0
configs/inuse/v2/32k.json +46 -0
configs/inuse/v2/48k.json +46 -0
configs/presets/balanced.json +20 -0
configs/presets/clarity_priority.json +20 -0
configs/presets/timbre_priority.json +20 -0
configs/v1/32k.json +46 -0
configs/v1/40k.json +46 -0
configs/v1/48k.json +46 -0
configs/v2/32k.json +46 -0
configs/v2/48k.json +46 -0
infer/cover_pipeline.py +35 -18
requirements.txt +39 -31
run.py +133 -0

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: AI-RVC 语音转换 & AI 翻唱
 emoji: 🎤
 colorFrom: blue
 colorTo: purple
@@ -10,15 +10,15 @@ pinned: false
 license: mit
 ---
-# 🎤 AI-RVC 语音转换 & AI 翻唱
-基于 RVC v2 + RMVPE 的高质量语音转换系统，支持一键 AI 翻唱功能。
 ## 功能特点
 - **AI 歌曲翻唱**：上传歌曲自动分离人声、转换音色、混合伴奏，一键生成翻唱
 - **人声分离**：默认 Mel-Band Roformer (KimberleyJensen)，在 MVSEP 公开 Multisong 指标中为 Vocals SDR 11.01 / Instrum SDR 17.32
-- **语音转换**：RVC v2 架构 + FAISS 检索增强流程
 - **RMVPE 音高提取**：高精度 F0 提取，噪声鲁棒性强
 - **角色模型**：内置 117 个可下载角色模型
 - **混音效果**：支持人声混响、音量调节、4 种混音预设
@@ -88,7 +88,7 @@ license: mit
               ↓
           人声分离 (Mel-Band Roformer)
               ↓
-          RVC 语音转换 (HuBERT + RMVPE + FAISS)
               ↓
           混音 (音量调节 + 混响)
               ↓
@@ -148,4 +148,4 @@ A: 建议选择与原唱性别、音色相近的角色，效果更自然。
 **License**: MIT
 **Version**: 2.0
-**Last Updated**: 2026-03-10

 ---
+title: AI-RVC 一键 AI 翻唱
 emoji: 🎤
 colorFrom: blue
 colorTo: purple
 license: mit
 ---
+# 🎤 AI-RVC 一键 AI 翻唱
+基于 RVC v2 的一键 AI 翻唱系统，自动完成人声分离、音色转换、混音合成全流程。
 ## 功能特点
 - **AI 歌曲翻唱**：上传歌曲自动分离人声、转换音色、混合伴奏，一键生成翻唱
 - **人声分离**：默认 Mel-Band Roformer (KimberleyJensen)，在 MVSEP 公开 Multisong 指标中为 Vocals SDR 11.01 / Instrum SDR 17.32
+- **音色转换**：RVC v2 架构 + FAISS 检索增强流程
 - **RMVPE 音高提取**：高精度 F0 提取，噪声鲁棒性强
 - **角色模型**：内置 117 个可下载角色模型
 - **混音效果**：支持人声混响、音量调节、4 种混音预设
               ↓
           人声分离 (Mel-Band Roformer)
               ↓
+          RVC 音色转换 (HuBERT + RMVPE + FAISS)
               ↓
           混音 (音量调节 + 混响)
               ↓
 **License**: MIT
 **Version**: 2.0
+**Last Updated**: 2026-03-15

app.py CHANGED Viewed

@@ -1,28 +1,25 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-"""
-Hugging Face Spaces 入口文件
-"""
-import os
-import sys
-from pathlib import Path
-# 添加项目根目录到路径
-ROOT_DIR = Path(__file__).parent
-sys.path.insert(0, str(ROOT_DIR))
-# 设置环境变量
-os.environ["GRADIO_SERVER_NAME"] = "0.0.0.0"
-os.environ["GRADIO_SERVER_PORT"] = "7860"
-# 导入并启动应用
-from ui.app import create_ui
-app = create_ui()
-app.queue()
-app.launch(
-    server_name="0.0.0.0",
-    server_port=7860,
-    share=False,
-    inbrowser=False,
-)

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+Hugging Face Spaces 入口文件
+"""
+import os
+import sys
+from pathlib import Path
# Put the directory that contains this file at the front of the import path
# so the project-local ``ui`` package can be resolved.
ROOT_DIR = Path(__file__).parent
sys.path.insert(0, str(ROOT_DIR))

# Publish the bind address and port through the environment as well; the same
# values are passed explicitly to ``launch()`` below.
os.environ.update(
    GRADIO_SERVER_NAME="0.0.0.0",
    GRADIO_SERVER_PORT="7860",
)

# Imported only after ``sys.path`` has been extended above.
from ui.app import create_ui

# Build the UI, enable its request queue, then start serving.
app = create_ui()
app.queue()
app.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=False,
    inbrowser=False,
)

configs/config.py ADDED Viewed

	@@ -0,0 +1,260 @@

+import argparse
+import os
+import sys
+import json
+import shutil
+from multiprocessing import cpu_count
+import torch
+try:
+    import intel_extension_for_pytorch as ipex  # pylint: disable=import-error, unused-import
+    if torch.xpu.is_available():
+        from infer.modules.ipex import ipex_init
+        ipex_init()
+except Exception:  # pylint: disable=broad-exception-caught
+    pass
+import logging
+logger = logging.getLogger(__name__)
+version_config_list = [
+    "v1/32k.json",
+    "v1/40k.json",
+    "v1/48k.json",
+    "v2/48k.json",
+    "v2/32k.json",
+]
def singleton_variable(func):
    """Decorate a factory so it runs at most once and its result is cached.

    The first call forwards its arguments to ``func`` and stores the result
    on the wrapper as ``wrapper.instance``. Later calls ignore their
    arguments and return the stored object.

    ``None`` marks "not created yet", so a factory that returns ``None`` is
    invoked again on the next call. Any other result is cached, including
    falsy ones such as an empty container.

    Args:
        func: Callable (function or class) that produces the shared object.

    Returns:
        The caching wrapper. It exposes the cached object as ``.instance``.
    """
    # Local import keeps this block self-contained within the file.
    import functools

    # ``updated=()`` copies only the name/doc metadata. When ``func`` is a
    # class, merging its ``__dict__`` into the wrapper would leak the class
    # members onto the wrapper function.
    @functools.wraps(func, updated=())
    def wrapper(*args, **kwargs):
        # Compare against None rather than testing truthiness, so a falsy
        # but valid instance is not rebuilt on every call.
        if wrapper.instance is None:
            wrapper.instance = func(*args, **kwargs)
        return wrapper.instance

    wrapper.instance = None
    return wrapper
@singleton_variable
class Config:
    """Runtime configuration that is resolved once per process.

    Construction loads the per-version JSON configs, parses the command-line
    flags and probes the available compute backend to choose the device, the
    precision flag and the ``x_pad`` / ``x_query`` / ``x_center`` / ``x_max``
    values. Because the class is wrapped by ``singleton_variable``, the name
    ``Config`` refers to the caching wrapper: every ``Config()`` call after
    the first returns the instance that was built first.
    """

    def __init__(self):
        self.device = "cuda:0"
        self.is_half = False
        self.use_jit = False
        self.n_cpu = 0
        self.gpu_name = None
        self.json_config = self.load_config_json()
        self.gpu_mem = None
        (
            self.python_cmd,
            self.listen_port,
            self.iscolab,
            self.noparallel,
            self.noautoopen,
            self.dml,
        ) = self.arg_parse()
        # Name of the backend used in place of CUDA; empty when none.
        self.instead = ""
        self.preprocess_per = 3.7
        # Must run last: device_config() reads self.dml and self.json_config.
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    @staticmethod
    def load_config_json() -> dict:
        """Load every config listed in ``version_config_list``.

        The working copy under ``configs/inuse/`` is used. When it is missing
        it is first seeded from the template under ``configs/``.

        Returns:
            Mapping of the relative config name (e.g. ``"v1/32k.json"``) to
            its parsed JSON content.
        """
        d = {}
        for config_file in version_config_list:
            p = f"configs/inuse/{config_file}"
            if not os.path.exists(p):
                # The per-version sub-directory may not exist yet, and
                # shutil.copy does not create parent directories.
                os.makedirs(os.path.dirname(p), exist_ok=True)
                shutil.copy(f"configs/{config_file}", p)
            with open(p, "r", encoding="utf-8") as f:
                d[config_file] = json.load(f)
        return d

    @staticmethod
    def arg_parse() -> tuple:
        """Parse the process command line.

        Returns:
            ``(pycmd, port, colab, noparallel, noautoopen, dml)``. A port
            outside ``0..65535`` is replaced by the default 7865.
        """
        exe = sys.executable or "python"
        parser = argparse.ArgumentParser()
        parser.add_argument("--port", type=int, default=7865, help="Listen port")
        parser.add_argument("--pycmd", type=str, default=exe, help="Python command")
        parser.add_argument("--colab", action="store_true", help="Launch in colab")
        parser.add_argument(
            "--noparallel", action="store_true", help="Disable parallel processing"
        )
        parser.add_argument(
            "--noautoopen",
            action="store_true",
            help="Do not open in browser automatically",
        )
        parser.add_argument(
            "--dml",
            action="store_true",
            help="torch_dml",
        )
        cmd_opts = parser.parse_args()

        cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865

        return (
            cmd_opts.pycmd,
            cmd_opts.port,
            cmd_opts.colab,
            cmd_opts.noparallel,
            cmd_opts.noautoopen,
            cmd_opts.dml,
        )

    # has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
    # check `getattr` and try it for compatibility
    @staticmethod
    def has_mps() -> bool:
        """Return True when an MPS device exists and a tensor can be moved to it."""
        # Older torch builds have no ``torch.backends.mps`` attribute at all.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is None or not mps_backend.is_available():
            return False
        try:
            torch.zeros(1).to(torch.device("mps"))
            return True
        except Exception:
            return False

    @staticmethod
    def has_xpu() -> bool:
        """Return True when this torch build exposes ``torch.xpu`` and it is available."""
        return bool(hasattr(torch, "xpu") and torch.xpu.is_available())

    def use_fp32_config(self):
        """Disable fp16 training in memory and in the ``configs/inuse`` files.

        Also lowers ``preprocess_per`` to 3.0.
        """
        import re

        # Rewrite only the fp16_run flag. A blanket "true" -> "false" text
        # replacement would also flip any other boolean in the file.
        fp16_flag = re.compile(r'("fp16_run"\s*:\s*)true\b')
        for config_file in version_config_list:
            self.json_config[config_file]["train"]["fp16_run"] = False
            path = f"configs/inuse/{config_file}"
            with open(path, "r", encoding="utf-8") as f:
                strr = fp16_flag.sub(r"\g<1>false", f.read())
            with open(path, "w", encoding="utf-8") as f:
                f.write(strr)
            logger.info("overwrite %s", config_file)
        self.preprocess_per = 3.0
        logger.info("overwrite preprocess_per to %d", self.preprocess_per)

    @staticmethod
    def _swap_onnxruntime(required_dll, park_as, activate_from):
        """Best-effort swap of the active onnxruntime package directory.

        Does nothing when ``required_dll`` already exists. Otherwise the
        active ``onnxruntime`` directory is renamed to ``park_as`` and
        ``activate_from`` is renamed into its place. A rename that fails is
        skipped, because the directories are optional.
        """
        if os.path.exists(required_dll):
            return
        active = r"runtime\Lib\site-packages\onnxruntime"
        for src, dst in ((active, park_as), (activate_from, active)):
            try:
                os.rename(src, dst)
            except OSError as err:
                logger.debug("onnxruntime swap skipped (%s -> %s): %s", src, dst, err)

    def device_config(self) -> tuple:
        """Select device and precision, and derive the chunking parameters.

        Side effects: updates ``device``, ``instead``, ``is_half``,
        ``gpu_name``, ``gpu_mem``, ``n_cpu`` and ``preprocess_per``. It may
        rewrite the in-use configs through ``use_fp32_config`` and may rename
        the onnxruntime directories under ``runtime``.

        Returns:
            ``(x_pad, x_query, x_center, x_max)``.
        """
        if torch.cuda.is_available():
            if self.has_xpu():
                self.device = self.instead = "xpu:0"
                self.is_half = False
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)
            gpu_upper = self.gpu_name.upper()
            # GPUs matched by name here are forced to fp32.
            if (
                ("16" in self.gpu_name and "V100" not in gpu_upper)
                or "P40" in gpu_upper
                or "P10" in gpu_upper
                or "1060" in self.gpu_name
                or "1070" in self.gpu_name
                or "1080" in self.gpu_name
            ):
                logger.info("Found GPU %s, force to fp32", self.gpu_name)
                self.is_half = False
                self.use_fp32_config()
            else:
                logger.info("Found GPU %s", self.gpu_name)
            # Total memory in GiB; the +0.4 rounds up values just below a
            # whole number before truncation.
            self.gpu_mem = int(
                torch.cuda.get_device_properties(i_device).total_memory
                / 1024
                / 1024
                / 1024
                + 0.4
            )
            if self.gpu_mem <= 4:
                self.preprocess_per = 3.0
        elif self.has_mps():
            logger.info("No supported Nvidia GPU found")
            self.device = self.instead = "mps"
            self.is_half = False
            self.use_fp32_config()
        else:
            logger.info("No supported Nvidia GPU found")
            self.device = self.instead = "cpu"
            self.is_half = False
            self.use_fp32_config()

        if self.n_cpu == 0:
            self.n_cpu = cpu_count()

        if self.gpu_mem is not None and self.gpu_mem >= 8:
            # 8 GB+ VRAM profile (also used for full fp32 precision)
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        elif self.is_half:
            # 6 GB VRAM profile. Nothing in this class sets is_half to True,
            # so this branch is only reachable if it is set from outside.
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        else:
            # 5 GB VRAM profile
            x_pad = 1
            x_query = 6
            x_center = 38
            x_max = 41

        if self.gpu_mem is not None and self.gpu_mem <= 4:
            x_pad = 1
            x_query = 5
            x_center = 30
            x_max = 32

        if self.dml:
            logger.info("Use DirectML instead")
            # Make the DirectML build the active onnxruntime package.
            self._swap_onnxruntime(
                r"runtime\Lib\site-packages\onnxruntime\capi\DirectML.dll",
                r"runtime\Lib\site-packages\onnxruntime-cuda",
                r"runtime\Lib\site-packages\onnxruntime-dml",
            )
            # if self.device != "cpu":
            import torch_directml

            self.device = torch_directml.device(torch_directml.default_device())
            self.is_half = False
        else:
            if self.instead:
                logger.info("Use %s instead", self.instead)
            # Make the CUDA build the active onnxruntime package.
            self._swap_onnxruntime(
                r"runtime\Lib\site-packages\onnxruntime\capi\onnxruntime_providers_cuda.dll",
                r"runtime\Lib\site-packages\onnxruntime-dml",
                r"runtime\Lib\site-packages\onnxruntime-cuda",
            )

        logger.info(
            "Half-precision floating-point: %s, device: %s",
            self.is_half,
            self.device,
        )
        return x_pad, x_query, x_center, x_max

configs/inuse/v1/32k.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 12800,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 32000,
+    "filter_length": 1024,
+    "hop_length": 320,
+    "win_length": 1024,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,4,2,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}

configs/inuse/v1/40k.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 12800,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 40000,
+    "filter_length": 2048,
+    "hop_length": 400,
+    "win_length": 2048,
+    "n_mel_channels": 125,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,10,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}

configs/inuse/v1/48k.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 11520,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 48000,
+    "filter_length": 2048,
+    "hop_length": 480,
+    "win_length": 2048,
+    "n_mel_channels": 128,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,6,2,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}

configs/inuse/v2/32k.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 12800,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 32000,
+    "filter_length": 1024,
+    "hop_length": 320,
+    "win_length": 1024,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,8,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [20,16,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}

configs/inuse/v2/48k.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": false,
+    "lr_decay": 0.999875,
+    "segment_size": 17280,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 48000,
+    "filter_length": 2048,
+    "hop_length": 480,
+    "win_length": 2048,
+    "n_mel_channels": 128,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [12,10,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [24,20,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}

configs/presets/balanced.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "name": "平衡型配置",
+  "description": "适合大多数歌曲，在音色转换和清晰度之间取得平衡",
+  "cover": {
+    "index_rate": 0.50,
+    "filter_radius": 3,
+    "rms_mix_rate": 0.50,
+    "protect": 0.40,
+    "f0_method": "hybrid",
+    "rmvpe_threshold": 0.005,
+    "f0_min": 80,
+    "f0_max": 1600,
+    "f0_stabilize": true,
+    "f0_stabilize_window": 3,
+    "f0_stabilize_max_semitones": 3.0,
+    "vc_preprocess_mode": "uvr_deecho",
+    "source_constraint_mode": "on",
+    "uvr5_agg": 10
+  }
+}

configs/presets/clarity_priority.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "name": "清晰度优先配置",
+  "description": "减少伪影和失真，保留更多源音频特征，适合复杂歌曲和高音多的情况",
+  "cover": {
+    "index_rate": 0.30,
+    "filter_radius": 1,
+    "rms_mix_rate": 0.75,
+    "protect": 0.55,
+    "f0_method": "hybrid",
+    "rmvpe_threshold": 0.008,
+    "f0_min": 60,
+    "f0_max": 1400,
+    "f0_stabilize": false,
+    "f0_stabilize_window": 2,
+    "f0_stabilize_max_semitones": 2.0,
+    "vc_preprocess_mode": "uvr_deecho",
+    "source_constraint_mode": "on",
+    "uvr5_agg": 8
+  }
+}

configs/presets/timbre_priority.json ADDED Viewed

	@@ -0,0 +1,20 @@

+{
+  "name": "音色优先配置",
+  "description": "彻底的音色转换，适合音色特征明显的角色，可能有轻微口齿模糊",
+  "cover": {
+    "index_rate": 0.80,
+    "filter_radius": 5,
+    "rms_mix_rate": 0.30,
+    "protect": 0.25,
+    "f0_method": "hybrid",
+    "rmvpe_threshold": 0.003,
+    "f0_min": 80,
+    "f0_max": 1800,
+    "f0_stabilize": true,
+    "f0_stabilize_window": 5,
+    "f0_stabilize_max_semitones": 4.0,
+    "vc_preprocess_mode": "uvr_deecho",
+    "source_constraint_mode": "on",
+    "uvr5_agg": 12
+  }
+}

configs/v1/32k.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 12800,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 32000,
+    "filter_length": 1024,
+    "hop_length": 320,
+    "win_length": 1024,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,4,2,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}

configs/v1/40k.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 12800,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 40000,
+    "filter_length": 2048,
+    "hop_length": 400,
+    "win_length": 2048,
+    "n_mel_channels": 125,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,10,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}

configs/v1/48k.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 11520,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 48000,
+    "filter_length": 2048,
+    "hop_length": 480,
+    "win_length": 2048,
+    "n_mel_channels": 128,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,6,2,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [16,16,4,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}

configs/v2/32k.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 12800,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 32000,
+    "filter_length": 1024,
+    "hop_length": 320,
+    "win_length": 1024,
+    "n_mel_channels": 80,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [10,8,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [20,16,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}

configs/v2/48k.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "train": {
+    "log_interval": 200,
+    "seed": 1234,
+    "epochs": 20000,
+    "learning_rate": 1e-4,
+    "betas": [0.8, 0.99],
+    "eps": 1e-9,
+    "batch_size": 4,
+    "fp16_run": true,
+    "lr_decay": 0.999875,
+    "segment_size": 17280,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0
+  },
+  "data": {
+    "max_wav_value": 32768.0,
+    "sampling_rate": 48000,
+    "filter_length": 2048,
+    "hop_length": 480,
+    "win_length": 2048,
+    "n_mel_channels": 128,
+    "mel_fmin": 0.0,
+    "mel_fmax": null
+  },
+  "model": {
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0,
+    "resblock": "1",
+    "resblock_kernel_sizes": [3,7,11],
+    "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+    "upsample_rates": [12,10,2,2],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [24,20,4,4],
+    "use_spectral_norm": false,
+    "gin_channels": 256,
+    "spk_embed_dim": 109
+  }
+}

infer/cover_pipeline.py CHANGED Viewed

@@ -1667,6 +1667,15 @@ class CoverPipeline:
         effective_karaoke_merge_backing = False if effective_official_mode else karaoke_merge_backing_into_accompaniment
         effective_use_official = True if effective_official_mode else use_official
         total_steps = 5 if effective_karaoke_separation else 4
         step_karaoke = 2 if effective_karaoke_separation else None
         step_convert = 3 if effective_karaoke_separation else 2
@@ -1695,7 +1704,7 @@ class CoverPipeline:
         log.config(f"说话人ID: {speaker_id}")
         log.config(f"VC管线模式: {normalized_vc_pipeline_mode}")
         if effective_official_mode:
-            log.config("官方模式: 强制使用官方UVR5分离 + 官方VC，不使用Karaoke二次分离")
         log.config(f"人声分离器: {effective_separator}")
         if effective_separator == "uvr5":
             log.config(f"UVR5模型: {uvr5_model or '自动选择'}")
@@ -1800,27 +1809,29 @@ class CoverPipeline:
             normalized_source_constraint_mode = str(source_constraint_mode or "auto").strip().lower()
             available_uvr_deecho_model = self._get_available_uvr_deecho_model()
             log.config(f"VC预处理模式: {normalized_vc_preprocess_mode}")
-            if normalized_vc_pipeline_mode == "current" and normalized_vc_preprocess_mode in {"auto", "uvr_deecho"}:
                 if available_uvr_deecho_model:
                     log.config(f"Mature DeEcho模型: {available_uvr_deecho_model}")
                 else:
                     log.config("Mature DeEcho模型: 未找到，将回退到主唱直通")
             log.config(f"源约束模式: {normalized_source_constraint_mode}")
             vc_input_path = vocals_path
             vc_preprocessed = False
-            if normalized_vc_pipeline_mode == "official":
-                self._last_vc_preprocess_mode = "direct"
-                log.detail("官方VC模式：跳过自定义VC预处理")
-                log.audio(f"官方VC输入: {Path(vc_input_path).name}")
-            else:
-                try:
-                    prepared_path = self._prepare_vocals_for_vc(vocals_path, session_dir, preprocess_mode=normalized_vc_preprocess_mode)
-                    vc_input_path = prepared_path
-                    vc_preprocessed = True
-                    log.audio(f"VC预处理输入: {Path(vc_input_path).name}")
-                except Exception as e:
-                    log.warning(f"VC预处理失败，回退原始输入: {e}")
             report_progress("正在转换人声...", step_convert)
             converted_vocals_path = str(session_dir / "converted_vocals.wav")
@@ -1846,7 +1857,7 @@ class CoverPipeline:
                     protect=protect,
                     speaker_id=speaker_id,
                 )
-                log.detail("内置官方模式已跳过自定义VC前后处理")
                 log.success("内置官方VC转换完成")
             elif normalized_vc_pipeline_mode == "official" and singing_repair:
                 log.detail("使用官方兼容唱歌修复链进行转换")
@@ -1959,8 +1970,14 @@ class CoverPipeline:
                     log.warning("VC preprocess unavailable, skipping source-guided reconstruction")
                 log.success("官方VC转换完成")
-            # 如果使用了advanced dereverb，重新应用原始混响
-            if hasattr(self, '_original_reverb_path') and self._original_reverb_path and Path(self._original_reverb_path).exists():
                 log.detail("重新应用原始混响到转换后的干声...")
                 import librosa
                 import soundfile as sf
@@ -1978,7 +1995,7 @@ class CoverPipeline:
                 sf.write(converted_vocals_path, wet_signal, sr)
                 log.detail(f"混响重应用完成: mix_ratio=0.8")
-            else:
                 # 使用自定义VC管道进行转换
                 log.detail("使用自定义VC管道进行转换")
                 self._init_rvc_pipeline()

         effective_karaoke_merge_backing = False if effective_official_mode else karaoke_merge_backing_into_accompaniment
         effective_use_official = True if effective_official_mode else use_official
+        # 官方模式：强制使用官方推荐参数，确保1:1纯净推理
+        if effective_official_mode:
+            if f0_method != "rmvpe":
+                log.warning(f"官方模式：F0方法从 {f0_method} 强制切换为 rmvpe（抗噪性最佳）")
+                f0_method = "rmvpe"
+            if protect != 0.33:
+                log.warning(f"官方模式：保护系数从 {protect} 强制设为 0.33（官方推荐值）")
+                protect = 0.33
         total_steps = 5 if effective_karaoke_separation else 4
         step_karaoke = 2 if effective_karaoke_separation else None
         step_convert = 3 if effective_karaoke_separation else 2
         log.config(f"说话人ID: {speaker_id}")
         log.config(f"VC管线模式: {normalized_vc_pipeline_mode}")
         if effective_official_mode:
+            log.config("官方模式: 强制UVR5分离 + 去混响预处理 + 官方VC (rmvpe, protect=0.33)")
         log.config(f"人声分离器: {effective_separator}")
         if effective_separator == "uvr5":
             log.config(f"UVR5模型: {uvr5_model or '自动选择'}")
             normalized_source_constraint_mode = str(source_constraint_mode or "auto").strip().lower()
             available_uvr_deecho_model = self._get_available_uvr_deecho_model()
             log.config(f"VC预处理模式: {normalized_vc_preprocess_mode}")
+            if normalized_vc_preprocess_mode in {"auto", "uvr_deecho"}:
                 if available_uvr_deecho_model:
                     log.config(f"Mature DeEcho模型: {available_uvr_deecho_model}")
                 else:
                     log.config("Mature DeEcho模型: 未找到，将回退到主唱直通")
             log.config(f"源约束模式: {normalized_source_constraint_mode}")
+            # 官方模式也必须经过去混响预处理，确保输入RVC的是纯净干声
+            # 官方模式下如果用户选了 direct，强制提升为 auto（带混响的人声会破坏F0提取）
+            effective_preprocess_mode = normalized_vc_preprocess_mode
+            if normalized_vc_pipeline_mode == "official" and effective_preprocess_mode == "direct":
+                effective_preprocess_mode = "auto"
+                log.warning("官方模式：direct预处理已提升为auto，确保去混响后再进入RVC推理")
             vc_input_path = vocals_path
             vc_preprocessed = False
+            try:
+                prepared_path = self._prepare_vocals_for_vc(vocals_path, session_dir, preprocess_mode=effective_preprocess_mode)
+                vc_input_path = prepared_path
+                vc_preprocessed = True
+                log.audio(f"VC预处理输入: {Path(vc_input_path).name}")
+            except Exception as e:
+                log.warning(f"VC预处理失败，回退原始输入: {e}")
             report_progress("正在转换人声...", step_convert)
             converted_vocals_path = str(session_dir / "converted_vocals.wav")
                     protect=protect,
                     speaker_id=speaker_id,
                 )
+                log.detail("内置官方模式：去混响干声 -> 官方RVC推理（纯净管道）")
                 log.success("内置官方VC转换完成")
             elif normalized_vc_pipeline_mode == "official" and singing_repair:
                 log.detail("使用官方兼容唱歌修复链进行转换")
                     log.warning("VC preprocess unavailable, skipping source-guided reconstruction")
                 log.success("官方VC转换完成")
+            # 如果使用了advanced dereverb，重新应用原始混响（仅非官方模式）
+            if (
+                not effective_official_mode
+                and not effective_use_official
+                and hasattr(self, '_original_reverb_path')
+                and self._original_reverb_path
+                and Path(self._original_reverb_path).exists()
+            ):
                 log.detail("重新应用原始混响到转换后的干声...")
                 import librosa
                 import soundfile as sf
                 sf.write(converted_vocals_path, wet_signal, sr)
                 log.detail(f"混响重应用完成: mix_ratio=0.8")
+            elif not effective_official_mode and not effective_use_official:
                 # 使用自定义VC管道进行转换
                 log.detail("使用自定义VC管道进行转换")
                 self._init_rvc_pipeline()

requirements.txt CHANGED Viewed

@@ -1,31 +1,39 @@
-# RVC AI 翻唱依赖 (Hugging Face Space - 最小化)
-# PyTorch
-torch>=2.0.0
-torchaudio>=2.0.0
-# Gradio 界面
-gradio==3.50.2
-# 音频处理
-librosa>=0.9.0
-soundfile>=0.12.0
-scipy>=1.10.0
-numpy>=1.23.0
-praat-parselmouth>=0.4.3
-torchcrepe>=0.0.20
-# 向量检索
-faiss-cpu>=1.7.4
-# 工具库
-tqdm>=4.65.0
-requests>=2.28.0
-python-dotenv>=1.0.0
-colorama>=0.4.6
-# AI 翻唱功能（核心）
-audio-separator
-huggingface_hub>=0.19.0
-pedalboard>=0.7.0
-ffmpeg-python>=0.2.0

+# RVC AI 翻唱依赖 (Hugging Face Space - CPU 精简版)
+# 注意：此文件用于 HF Space 部署，同步到 Space 时需重命名为 requirements.txt
+# 本地安装请使用 requirements.txt（包含完整 GPU 依赖）
+# PyTorch
+torch>=2.0.0
+torchaudio>=2.0.0
+# Gradio 界面
+gradio==3.50.2
+# 音频处理
+librosa>=0.9.0
+soundfile>=0.12.0
+scipy>=1.10.0
+numpy>=1.23.0
+praat-parselmouth>=0.4.3
+torchcrepe>=0.0.20
+# 向量检索
+faiss-cpu>=1.7.4
+# 工具库
+tqdm>=4.65.0
+requests>=2.28.0
+python-dotenv>=1.0.0
+colorama>=0.4.6
+# AI 翻唱功能（核心）
+audio-separator
+huggingface_hub>=0.19.0
+pedalboard>=0.7.0
+ffmpeg-python>=0.2.0
+# 以下包在 HF Space 构建环境中编译失败，改为运行时按需安装：
+# fairseq==0.12.2  (HuBERT 特征提取)
+# demucs>=4.0.0    (人声分离备选)
+# pyworld>=0.3.4   (F0 提取备选)
+# av>=10.0.0       (音频解码备选)

run.py ADDED Viewed

	@@ -0,0 +1,133 @@

+# -*- coding: utf-8 -*-
+"""
+RVC AI 翻唱 - 主入口
+"""
+import os
+import sys
+import argparse
+from pathlib import Path
+# 添加项目根目录到路径
+ROOT_DIR = Path(__file__).parent
+sys.path.insert(0, str(ROOT_DIR))
+from lib.logger import log
+def check_environment():
+    """检查运行环境"""
+    log.header("RVC AI 翻唱系统")
+    # 检查 Python 版本
+    py_version = sys.version_info
+    log.info(f"Python 版本: {py_version.major}.{py_version.minor}.{py_version.micro}")
+    if py_version.major < 3 or (py_version.major == 3 and py_version.minor < 8):
+        log.warning("建议使用 Python 3.8 或更高版本")
+    # 检查 PyTorch
+    try:
+        import torch
+        log.info(f"PyTorch 版本: {torch.__version__}")
+        from lib.device import get_device_info, _is_rocm, _has_xpu, _has_directml, _has_mps
+        info = get_device_info()
+        log.info(f"可用加速后端: {', '.join(info['backends'])}")
+        if torch.cuda.is_available():
+            backend = "ROCm" if _is_rocm() else "CUDA"
+            log.info(f"{backend} 版本: {torch.version.hip if _is_rocm() else torch.version.cuda}")
+            log.info(f"GPU: {torch.cuda.get_device_name(0)}")
+        elif _has_xpu():
+            log.info(f"Intel GPU: {torch.xpu.get_device_name(0)}")
+        elif _has_directml():
+            import torch_directml
+            log.info(f"DirectML 设备: {torch_directml.device_name(0)}")
+        elif _has_mps():
+            log.info("Apple MPS 加速可用")
+        else:
+            log.warning("未检测到 GPU 加速，将使用 CPU")
+    except ImportError:
+        log.error("未安装 PyTorch")
+        return False
+    return True
+def check_models():
+    """检查必需模型"""
+    from tools.download_models import check_model, REQUIRED_MODELS
+    missing = []
+    for name in REQUIRED_MODELS:
+        if not check_model(name):
+            missing.append(name)
+    if missing:
+        log.warning(f"缺少必需模型: {', '.join(missing)}")
+        log.info("正在下载...")
+        from tools.download_models import download_required_models
+        if not download_required_models():
+            log.error("模型下载失败，请检查网络连接")
+            return False
+    return True
+def main():
+    """主函数"""
+    parser = argparse.ArgumentParser(description="RVC AI 翻唱系统")
+    parser.add_argument(
+        "--host",
+        type=str,
+        default="127.0.0.1",
+        help="服务器地址 (默认: 127.0.0.1)"
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=7860,
+        help="服务器端口 (默认: 7860)"
+    )
+    parser.add_argument(
+        "--share",
+        action="store_true",
+        help="创建公共链接"
+    )
+    parser.add_argument(
+        "--skip-check",
+        action="store_true",
+        help="跳过环境检查"
+    )
+    parser.add_argument(
+        "--download-models",
+        action="store_true",
+        help="仅下载模型"
+    )
+    args = parser.parse_args()
+    # 仅下载模型
+    if args.download_models:
+        from tools.download_models import download_all_models
+        download_all_models()
+        return
+    # 环境检查
+    if not args.skip_check:
+        if not check_environment():
+            sys.exit(1)
+    # 模型检查
+    if not check_models():
+        log.info("提示: 可以使用 --skip-check 跳过检查")
+        sys.exit(1)
+    # 启动界面
+    log.info(f"启动 Gradio 界面: http://{args.host}:{args.port}")
+    from ui.app import launch
+    launch(host=args.host, port=args.port, share=args.share)
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()