mason369 committed on
Commit
4204217
·
verified ·
1 Parent(s): 2a34648

sync: 同步GitHub最新代码到HF Space

Browse files
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: AI-RVC 语音转换 & AI 翻唱
3
  emoji: 🎤
4
  colorFrom: blue
5
  colorTo: purple
@@ -10,15 +10,15 @@ pinned: false
10
  license: mit
11
  ---
12
 
13
- # 🎤 AI-RVC 语音转换 & AI 翻唱
14
 
15
- 基于 RVC v2 + RMVPE 高质量语音转换系统,支持一键 AI 翻唱功能
16
 
17
  ## 功能特点
18
 
19
  - **AI 歌曲翻唱**:上传歌曲自动分离人声、转换音色、混合伴奏,一键生成翻唱
20
  - **人声分离**:默认 Mel-Band Roformer (KimberleyJensen),在 MVSEP 公开 Multisong 指标中为 Vocals SDR 11.01 / Instrum SDR 17.32
21
- - **音转换**:RVC v2 架构 + FAISS 检索增强流程
22
  - **RMVPE 音高提取**:高精度 F0 提取,噪声鲁棒性强
23
  - **角色模型**:内置 117 个可下载角色模型
24
  - **混音效果**:支持人声混响、音量调节、4 种混音预设
@@ -88,7 +88,7 @@ license: mit
88
 
89
  人声分离 (Mel-Band Roformer)
90
 
91
- RVC 音转换 (HuBERT + RMVPE + FAISS)
92
 
93
  混音 (音量调节 + 混响)
94
 
@@ -148,4 +148,4 @@ A: 建议选择与原唱性别、音色相近的角色,效果更自然。
148
 
149
  **License**: MIT
150
  **Version**: 2.0
151
- **Last Updated**: 2026-03-10
 
1
  ---
2
+ title: AI-RVC 一键 AI 翻唱
3
  emoji: 🎤
4
  colorFrom: blue
5
  colorTo: purple
 
10
  license: mit
11
  ---
12
 
13
+ # 🎤 AI-RVC 一键 AI 翻唱
14
 
15
+ 基于 RVC v2 的一键 AI 翻唱系统,自动完成人声分离、音色转换、混音合成全流程
16
 
17
  ## 功能特点
18
 
19
  - **AI 歌曲翻唱**:上传歌曲自动分离人声、转换音色、混合伴奏,一键生成翻唱
20
  - **人声分离**:默认 Mel-Band Roformer (KimberleyJensen),在 MVSEP 公开 Multisong 指标中为 Vocals SDR 11.01 / Instrum SDR 17.32
21
+ - **音色转换**:RVC v2 架构 + FAISS 检索增强流程
22
  - **RMVPE 音高提取**:高精度 F0 提取,噪声鲁棒性强
23
  - **角色模型**:内置 117 个可下载角色模型
24
  - **混音效果**:支持人声混响、音量调节、4 种混音预设
 
88
 
89
  人声分离 (Mel-Band Roformer)
90
 
91
+ RVC 音色转换 (HuBERT + RMVPE + FAISS)
92
 
93
  混音 (音量调节 + 混响)
94
 
 
148
 
149
  **License**: MIT
150
  **Version**: 2.0
151
+ **Last Updated**: 2026-03-15
app.py CHANGED
@@ -1,28 +1,25 @@
1
- #!/usr/bin/env python
2
- # -*- coding: utf-8 -*-
3
- """
4
- Hugging Face Spaces 入口文件
5
- """
6
- import os
7
- import sys
8
- from pathlib import Path
9
-
10
- # 添加项目根目录到路径
11
- ROOT_DIR = Path(__file__).parent
12
- sys.path.insert(0, str(ROOT_DIR))
13
-
14
- # 设置环境变量
15
- os.environ["GRADIO_SERVER_NAME"] = "0.0.0.0"
16
- os.environ["GRADIO_SERVER_PORT"] = "7860"
17
-
18
- # 导入并启动应用
19
- from ui.app import create_ui
20
-
21
- app = create_ui()
22
- app.queue()
23
- app.launch(
24
- server_name="0.0.0.0",
25
- server_port=7860,
26
- share=False,
27
- inbrowser=False,
28
- )
 
1
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Hugging Face Spaces entry point: build the UI and serve it on port 7860."""
import os
import sys
from pathlib import Path

# Put the directory containing this file first on sys.path so that the
# project-local `ui` package below is importable.
ROOT_DIR = Path(__file__).parent
sys.path.insert(0, str(ROOT_DIR))

# Export the bind address/port before the UI module is imported.
os.environ.update(
    {
        "GRADIO_SERVER_NAME": "0.0.0.0",
        "GRADIO_SERVER_PORT": "7860",
    }
)

from ui.app import create_ui  # noqa: E402  (must follow the sys.path tweak)

app = create_ui()
app.queue()

# Same address/port as the environment variables above, passed explicitly.
launch_options = {
    "server_name": "0.0.0.0",
    "server_port": 7860,
    "share": False,
    "inbrowser": False,
}
app.launch(**launch_options)
 
 
 
configs/config.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import sys
4
+ import json
5
+ import shutil
6
+ from multiprocessing import cpu_count
7
+
8
+ import torch
9
+
10
+ try:
11
+ import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
12
+
13
+ if torch.xpu.is_available():
14
+ from infer.modules.ipex import ipex_init
15
+
16
+ ipex_init()
17
+ except Exception: # pylint: disable=broad-exception-caught
18
+ pass
19
+ import logging
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ version_config_list = [
25
+ "v1/32k.json",
26
+ "v1/40k.json",
27
+ "v1/48k.json",
28
+ "v2/48k.json",
29
+ "v2/32k.json",
30
+ ]
31
+
32
+
33
def singleton_variable(func):
    """Decorate ``func`` so it runs once and its result is reused afterwards.

    The first call forwards its arguments to ``func`` and stores the result on
    ``wrapper.instance``; every later call returns that stored object and
    ignores its arguments.  When applied to a class, the decorated name is
    rebound to the wrapper function, so calling it yields the shared instance.

    Args:
        func: Callable (or class) whose first result should be cached.

    Returns:
        A wrapper function exposing the cached value as ``wrapper.instance``
        (``None`` until the first call).
    """

    def wrapper(*args, **kwargs):
        # Test identity with None rather than truthiness: a valid result that
        # happens to be falsy (0, "", [], or an object defining
        # __bool__/__len__) must not be rebuilt on every call.
        if wrapper.instance is None:
            wrapper.instance = func(*args, **kwargs)
        return wrapper.instance

    wrapper.instance = None
    return wrapper
41
+
42
+
43
@singleton_variable
class Config:
    """Process-wide runtime configuration (device, precision, CLI options).

    Construction loads the JSON configs listed in ``version_config_list``,
    parses the command line, picks a compute device and derives the
    ``x_pad`` / ``x_query`` / ``x_center`` / ``x_max`` values.  Because the
    class is wrapped by ``singleton_variable``, the name ``Config`` is a
    factory function: class-level members must be reached through ``self``.
    """

    # Relative path of the onnxruntime package inside the bundled Windows
    # runtime; the "-cuda" / "-dml" siblings are the parked variants.
    _ORT_DIR = r"runtime\Lib\site-packages\onnxruntime"

    def __init__(self):
        self.device = "cuda:0"
        self.is_half = False
        self.use_jit = False
        self.n_cpu = 0
        self.gpu_name = None
        self.json_config = self.load_config_json()
        self.gpu_mem = None
        (
            self.python_cmd,
            self.listen_port,
            self.iscolab,
            self.noparallel,
            self.noautoopen,
            self.dml,
        ) = self.arg_parse()
        self.instead = ""
        self.preprocess_per = 3.7
        # device_config() reads self.dml and self.json_config, so it must run
        # after the assignments above.
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    @staticmethod
    def load_config_json() -> dict:
        """Load every config in ``version_config_list`` from ``configs/inuse``.

        A config missing from ``configs/inuse`` is first copied from its
        template under ``configs/``.

        Returns:
            Mapping of config file name (e.g. ``"v1/32k.json"``) to parsed JSON.
        """
        configs = {}
        for config_file in version_config_list:
            in_use_path = f"configs/inuse/{config_file}"
            if not os.path.exists(in_use_path):
                # The version sub-directory may not exist on a fresh checkout;
                # shutil.copy does not create parent directories.
                os.makedirs(os.path.dirname(in_use_path), exist_ok=True)
                shutil.copy(f"configs/{config_file}", in_use_path)
            with open(in_use_path, "r", encoding="utf-8") as f:
                configs[config_file] = json.load(f)
        return configs

    @staticmethod
    def arg_parse() -> tuple:
        """Parse the options this class understands from ``sys.argv``.

        Returns:
            ``(pycmd, port, colab, noparallel, noautoopen, dml)``.  A port
            outside 0..65535 is replaced by the default 7865.
        """
        exe = sys.executable or "python"
        parser = argparse.ArgumentParser()
        parser.add_argument("--port", type=int, default=7865, help="Listen port")
        parser.add_argument("--pycmd", type=str, default=exe, help="Python command")
        parser.add_argument("--colab", action="store_true", help="Launch in colab")
        parser.add_argument(
            "--noparallel", action="store_true", help="Disable parallel processing"
        )
        parser.add_argument(
            "--noautoopen",
            action="store_true",
            help="Do not open in browser automatically",
        )
        parser.add_argument(
            "--dml",
            action="store_true",
            help="torch_dml",
        )
        # This parser sees the whole process's argv, which may carry options
        # that belong to the launching script.  Tolerate them instead of
        # exiting the process with a usage error.
        cmd_opts, unknown = parser.parse_known_args()
        if unknown:
            logger.warning("Ignoring unrecognized command-line arguments: %s", unknown)

        if not 0 <= cmd_opts.port <= 65535:
            cmd_opts.port = 7865

        return (
            cmd_opts.pycmd,
            cmd_opts.port,
            cmd_opts.colab,
            cmd_opts.noparallel,
            cmd_opts.noautoopen,
            cmd_opts.dml,
        )

    # has_mps is only available in nightly pytorch (for now) and macOS 12.3+.
    # check `getattr` and try it for compatibility
    @staticmethod
    def has_mps() -> bool:
        """Return True when a tensor can actually be moved to the MPS device."""
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is None or not mps_backend.is_available():
            return False
        try:
            # Availability alone is not trusted; probe with a real transfer.
            torch.zeros(1).to(torch.device("mps"))
            return True
        except Exception:
            return False

    @staticmethod
    def has_xpu() -> bool:
        """Return True when ``torch.xpu`` exists and reports an available device."""
        return bool(hasattr(torch, "xpu") and torch.xpu.is_available())

    def use_fp32_config(self):
        """Disable ``fp16_run`` in memory and in the ``configs/inuse`` files.

        Also lowers ``preprocess_per`` to 3.0.
        """
        import re

        # Only the fp16_run flag is rewritten; other boolean settings in the
        # file keep their values.
        fp16_enabled = re.compile(r'("fp16_run"\s*:\s*)true')
        for config_file in version_config_list:
            self.json_config[config_file]["train"]["fp16_run"] = False
            in_use_path = f"configs/inuse/{config_file}"
            with open(in_use_path, "r", encoding="utf-8") as f:
                text = f.read()
            with open(in_use_path, "w", encoding="utf-8") as f:
                f.write(fp16_enabled.sub(r"\1false", text))
            logger.info("overwrite %s", config_file)
        self.preprocess_per = 3.0
        logger.info("overwrite preprocess_per to %d", self.preprocess_per)

    def device_config(self) -> tuple:
        """Select the device and derive the padding/window parameters.

        Side effects: updates ``device``, ``instead``, ``is_half``,
        ``gpu_name``, ``gpu_mem``, ``n_cpu`` and ``preprocess_per``; may
        rewrite the in-use JSON configs and rename onnxruntime directories.

        Returns:
            ``(x_pad, x_query, x_center, x_max)``.
        """
        self._detect_device()

        if self.n_cpu == 0:
            self.n_cpu = cpu_count()

        x_pad, x_query, x_center, x_max = self._padding_for_memory()
        self._select_onnxruntime_backend()

        logger.info(
            "Half-precision floating-point: %s, device: %s",
            self.is_half,
            self.device,
        )
        return x_pad, x_query, x_center, x_max

    def _detect_device(self):
        """Pick CUDA/XPU, MPS or CPU and record GPU name and memory."""
        if torch.cuda.is_available():
            if self.has_xpu():
                self.device = self.instead = "xpu:0"
                self.is_half = False
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)
            name_upper = self.gpu_name.upper()
            # Cards matched by these name fragments are forced to fp32.
            force_fp32 = ("16" in self.gpu_name and "V100" not in name_upper) or any(
                tag in name_upper for tag in ("P40", "P10", "1060", "1070", "1080")
            )
            if force_fp32:
                logger.info("Found GPU %s, force to fp32", self.gpu_name)
                self.is_half = False
                self.use_fp32_config()
            else:
                logger.info("Found GPU %s", self.gpu_name)
            # Total memory in GiB, biased by +0.4 before truncation.
            self.gpu_mem = int(
                torch.cuda.get_device_properties(i_device).total_memory
                / 1024
                / 1024
                / 1024
                + 0.4
            )
            if self.gpu_mem <= 4:
                self.preprocess_per = 3.0
        else:
            fallback = "mps" if self.has_mps() else "cpu"
            logger.info("No supported Nvidia GPU found")
            self.device = self.instead = fallback
            self.is_half = False
            self.use_fp32_config()

    def _padding_for_memory(self) -> tuple:
        """Return ``(x_pad, x_query, x_center, x_max)`` for the detected memory."""
        if (self.gpu_mem is not None and self.gpu_mem >= 8) or self.is_half:
            # 8 GB+ of GPU memory (including full fp32), or half precision
            # (the original "6 GB" profile): both use the same values.
            # NOTE(review): is_half is never set to True inside this class, so
            # the is_half case only applies if external code enables it.
            padding = (3, 10, 60, 65)
        else:
            # The original "5 GB" profile.
            padding = (1, 6, 38, 41)

        if self.gpu_mem is not None and self.gpu_mem <= 4:
            padding = (1, 5, 30, 32)
        return padding

    def _select_onnxruntime_backend(self):
        """Activate the DirectML or CUDA onnxruntime variant in the bundled runtime."""
        if self.dml:
            logger.info("Use DirectML instead")
            if not os.path.exists(self._ORT_DIR + r"\capi\DirectML.dll"):
                self._swap_onnxruntime(park_as="-cuda", activate="-dml")
            # if self.device != "cpu":
            import torch_directml

            self.device = torch_directml.device(torch_directml.default_device())
            self.is_half = False
        else:
            if self.instead:
                logger.info("Use %s instead", self.instead)
            if not os.path.exists(
                self._ORT_DIR + r"\capi\onnxruntime_providers_cuda.dll"
            ):
                self._swap_onnxruntime(park_as="-dml", activate="-cuda")

    def _swap_onnxruntime(self, park_as: str, activate: str):
        """Park the active onnxruntime directory and activate another variant.

        Both renames are best-effort and attempted independently, because the
        bundled runtime directories may not exist at all.
        """
        renames = (
            (self._ORT_DIR, self._ORT_DIR + park_as),
            (self._ORT_DIR + activate, self._ORT_DIR),
        )
        for src, dst in renames:
            try:
                os.rename(src, dst)
            except OSError as err:
                logger.debug("onnxruntime rename %s -> %s skipped: %s", src, dst, err)
configs/inuse/v1/32k.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": false,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,4,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
configs/inuse/v1/40k.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": false,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 40000,
21
+ "filter_length": 2048,
22
+ "hop_length": 400,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 125,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,10,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
configs/inuse/v1/48k.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": false,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 11520,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,6,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
configs/inuse/v2/32k.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": false,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,8,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [20,16,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
configs/inuse/v2/48k.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": false,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 17280,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [12,10,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [24,20,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
configs/presets/balanced.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "平衡型配置",
3
+ "description": "适合大多数歌曲,在音色转换和清晰度之间取得平衡",
4
+ "cover": {
5
+ "index_rate": 0.50,
6
+ "filter_radius": 3,
7
+ "rms_mix_rate": 0.50,
8
+ "protect": 0.40,
9
+ "f0_method": "hybrid",
10
+ "rmvpe_threshold": 0.005,
11
+ "f0_min": 80,
12
+ "f0_max": 1600,
13
+ "f0_stabilize": true,
14
+ "f0_stabilize_window": 3,
15
+ "f0_stabilize_max_semitones": 3.0,
16
+ "vc_preprocess_mode": "uvr_deecho",
17
+ "source_constraint_mode": "on",
18
+ "uvr5_agg": 10
19
+ }
20
+ }
configs/presets/clarity_priority.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "清晰度优先配置",
3
+ "description": "减少伪影和失真,保留更多源音频特征,适合复杂歌曲和高音多的情况",
4
+ "cover": {
5
+ "index_rate": 0.30,
6
+ "filter_radius": 1,
7
+ "rms_mix_rate": 0.75,
8
+ "protect": 0.55,
9
+ "f0_method": "hybrid",
10
+ "rmvpe_threshold": 0.008,
11
+ "f0_min": 60,
12
+ "f0_max": 1400,
13
+ "f0_stabilize": false,
14
+ "f0_stabilize_window": 2,
15
+ "f0_stabilize_max_semitones": 2.0,
16
+ "vc_preprocess_mode": "uvr_deecho",
17
+ "source_constraint_mode": "on",
18
+ "uvr5_agg": 8
19
+ }
20
+ }
configs/presets/timbre_priority.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "音色优先配置",
3
+ "description": "彻底的音色转换,适合音色特征明显的角色,可能有轻微口齿模糊",
4
+ "cover": {
5
+ "index_rate": 0.80,
6
+ "filter_radius": 5,
7
+ "rms_mix_rate": 0.30,
8
+ "protect": 0.25,
9
+ "f0_method": "hybrid",
10
+ "rmvpe_threshold": 0.003,
11
+ "f0_min": 80,
12
+ "f0_max": 1800,
13
+ "f0_stabilize": true,
14
+ "f0_stabilize_window": 5,
15
+ "f0_stabilize_max_semitones": 4.0,
16
+ "vc_preprocess_mode": "uvr_deecho",
17
+ "source_constraint_mode": "on",
18
+ "uvr5_agg": 12
19
+ }
20
+ }
configs/v1/32k.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,4,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
configs/v1/40k.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 40000,
21
+ "filter_length": 2048,
22
+ "hop_length": 400,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 125,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,10,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
configs/v1/48k.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 11520,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,6,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
configs/v2/32k.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,8,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [20,16,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
configs/v2/48k.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 17280,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [12,10,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [24,20,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
infer/cover_pipeline.py CHANGED
@@ -1667,6 +1667,15 @@ class CoverPipeline:
1667
  effective_karaoke_merge_backing = False if effective_official_mode else karaoke_merge_backing_into_accompaniment
1668
  effective_use_official = True if effective_official_mode else use_official
1669
 
 
 
 
 
 
 
 
 
 
1670
  total_steps = 5 if effective_karaoke_separation else 4
1671
  step_karaoke = 2 if effective_karaoke_separation else None
1672
  step_convert = 3 if effective_karaoke_separation else 2
@@ -1695,7 +1704,7 @@ class CoverPipeline:
1695
  log.config(f"说话人ID: {speaker_id}")
1696
  log.config(f"VC管线模式: {normalized_vc_pipeline_mode}")
1697
  if effective_official_mode:
1698
- log.config("官方模式: 强制使用官方UVR5分离 + 官方VC,不使用Karaoke二次分离")
1699
  log.config(f"人声分离器: {effective_separator}")
1700
  if effective_separator == "uvr5":
1701
  log.config(f"UVR5模型: {uvr5_model or '自动选择'}")
@@ -1800,27 +1809,29 @@ class CoverPipeline:
1800
  normalized_source_constraint_mode = str(source_constraint_mode or "auto").strip().lower()
1801
  available_uvr_deecho_model = self._get_available_uvr_deecho_model()
1802
  log.config(f"VC预处理模式: {normalized_vc_preprocess_mode}")
1803
- if normalized_vc_pipeline_mode == "current" and normalized_vc_preprocess_mode in {"auto", "uvr_deecho"}:
1804
  if available_uvr_deecho_model:
1805
  log.config(f"Mature DeEcho模型: {available_uvr_deecho_model}")
1806
  else:
1807
  log.config("Mature DeEcho模型: 未找到,将回退到主唱直通")
1808
  log.config(f"源约束模式: {normalized_source_constraint_mode}")
1809
 
 
 
 
 
 
 
 
1810
  vc_input_path = vocals_path
1811
  vc_preprocessed = False
1812
- if normalized_vc_pipeline_mode == "official":
1813
- self._last_vc_preprocess_mode = "direct"
1814
- log.detail("官方VC模式:跳过自定义VC预处理")
1815
- log.audio(f"官方VC输入: {Path(vc_input_path).name}")
1816
- else:
1817
- try:
1818
- prepared_path = self._prepare_vocals_for_vc(vocals_path, session_dir, preprocess_mode=normalized_vc_preprocess_mode)
1819
- vc_input_path = prepared_path
1820
- vc_preprocessed = True
1821
- log.audio(f"VC预处理输入: {Path(vc_input_path).name}")
1822
- except Exception as e:
1823
- log.warning(f"VC预处理失败,回退原始输入: {e}")
1824
 
1825
  report_progress("正在转换人声...", step_convert)
1826
  converted_vocals_path = str(session_dir / "converted_vocals.wav")
@@ -1846,7 +1857,7 @@ class CoverPipeline:
1846
  protect=protect,
1847
  speaker_id=speaker_id,
1848
  )
1849
- log.detail("内置官方模式已跳过自定义VC前后处理")
1850
  log.success("内置官方VC转换完成")
1851
  elif normalized_vc_pipeline_mode == "official" and singing_repair:
1852
  log.detail("使用官方兼容唱歌修复链进行转换")
@@ -1959,8 +1970,14 @@ class CoverPipeline:
1959
  log.warning("VC preprocess unavailable, skipping source-guided reconstruction")
1960
  log.success("官方VC转换完成")
1961
 
1962
- # 如果使用了advanced dereverb,重新应用原始混响
1963
- if hasattr(self, '_original_reverb_path') and self._original_reverb_path and Path(self._original_reverb_path).exists():
 
 
 
 
 
 
1964
  log.detail("重新应用原始混响到转换后的干声...")
1965
  import librosa
1966
  import soundfile as sf
@@ -1978,7 +1995,7 @@ class CoverPipeline:
1978
  sf.write(converted_vocals_path, wet_signal, sr)
1979
  log.detail(f"混响重应用完成: mix_ratio=0.8")
1980
 
1981
- else:
1982
  # 使用自定义VC管道进行转换
1983
  log.detail("使用自定义VC管道进行转换")
1984
  self._init_rvc_pipeline()
 
1667
  effective_karaoke_merge_backing = False if effective_official_mode else karaoke_merge_backing_into_accompaniment
1668
  effective_use_official = True if effective_official_mode else use_official
1669
 
1670
+ # 官方模式:强制使用官方推荐参数,确保1:1纯净推理
1671
+ if effective_official_mode:
1672
+ if f0_method != "rmvpe":
1673
+ log.warning(f"官方模式:F0方法从 {f0_method} 强制切换为 rmvpe(抗噪性最佳)")
1674
+ f0_method = "rmvpe"
1675
+ if protect != 0.33:
1676
+ log.warning(f"官方模式:保护系数从 {protect} 强制设为 0.33(官方推荐值)")
1677
+ protect = 0.33
1678
+
1679
  total_steps = 5 if effective_karaoke_separation else 4
1680
  step_karaoke = 2 if effective_karaoke_separation else None
1681
  step_convert = 3 if effective_karaoke_separation else 2
 
1704
  log.config(f"说话人ID: {speaker_id}")
1705
  log.config(f"VC管线模式: {normalized_vc_pipeline_mode}")
1706
  if effective_official_mode:
1707
+ log.config("官方模式: 强制UVR5分离 + 去混响预处理 + 官方VC (rmvpe, protect=0.33)")
1708
  log.config(f"人声分离器: {effective_separator}")
1709
  if effective_separator == "uvr5":
1710
  log.config(f"UVR5模型: {uvr5_model or '自动选择'}")
 
1809
  normalized_source_constraint_mode = str(source_constraint_mode or "auto").strip().lower()
1810
  available_uvr_deecho_model = self._get_available_uvr_deecho_model()
1811
  log.config(f"VC预处理模式: {normalized_vc_preprocess_mode}")
1812
+ if normalized_vc_preprocess_mode in {"auto", "uvr_deecho"}:
1813
  if available_uvr_deecho_model:
1814
  log.config(f"Mature DeEcho模型: {available_uvr_deecho_model}")
1815
  else:
1816
  log.config("Mature DeEcho模型: 未找到,将回退到主唱直通")
1817
  log.config(f"源约束模式: {normalized_source_constraint_mode}")
1818
 
1819
+ # 官方模式也必须经过去混响预处理,确保输入RVC的是纯净干声
1820
+ # 官方模式下如果用户选了 direct,强制提升为 auto(带混响的人声会破坏F0提取)
1821
+ effective_preprocess_mode = normalized_vc_preprocess_mode
1822
+ if normalized_vc_pipeline_mode == "official" and effective_preprocess_mode == "direct":
1823
+ effective_preprocess_mode = "auto"
1824
+ log.warning("官方模式:direct预处理已提升为auto,确保去混响后再进入RVC推理")
1825
+
1826
  vc_input_path = vocals_path
1827
  vc_preprocessed = False
1828
+ try:
1829
+ prepared_path = self._prepare_vocals_for_vc(vocals_path, session_dir, preprocess_mode=effective_preprocess_mode)
1830
+ vc_input_path = prepared_path
1831
+ vc_preprocessed = True
1832
+ log.audio(f"VC预处理输入: {Path(vc_input_path).name}")
1833
+ except Exception as e:
1834
+ log.warning(f"VC预处理失败,回退原始输入: {e}")
 
 
 
 
 
1835
 
1836
  report_progress("正在转换人声...", step_convert)
1837
  converted_vocals_path = str(session_dir / "converted_vocals.wav")
 
1857
  protect=protect,
1858
  speaker_id=speaker_id,
1859
  )
1860
+ log.detail("内置官方模式:去混响干声 -> 官方RVC推(纯净管道)")
1861
  log.success("内置官方VC转换完成")
1862
  elif normalized_vc_pipeline_mode == "official" and singing_repair:
1863
  log.detail("使用官方兼容唱歌修复链进行转换")
 
1970
  log.warning("VC preprocess unavailable, skipping source-guided reconstruction")
1971
  log.success("官方VC转换完成")
1972
 
1973
+ # 如果使用了advanced dereverb,重新应用原始混响(仅非官方模式)
1974
+ if (
1975
+ not effective_official_mode
1976
+ and not effective_use_official
1977
+ and hasattr(self, '_original_reverb_path')
1978
+ and self._original_reverb_path
1979
+ and Path(self._original_reverb_path).exists()
1980
+ ):
1981
  log.detail("重新应用原始混响到转换后的干声...")
1982
  import librosa
1983
  import soundfile as sf
 
1995
  sf.write(converted_vocals_path, wet_signal, sr)
1996
  log.detail(f"混响重应用完成: mix_ratio=0.8")
1997
 
1998
+ elif not effective_official_mode and not effective_use_official:
1999
  # 使用自定义VC管道进行转换
2000
  log.detail("使用自定义VC管道进行转换")
2001
  self._init_rvc_pipeline()
requirements.txt CHANGED
@@ -1,31 +1,39 @@
1
- # RVC AI 翻唱依赖 (Hugging Face Space - 最小化)
2
-
3
- # PyTorch
4
- torch>=2.0.0
5
- torchaudio>=2.0.0
6
-
7
- # Gradio 界面
8
- gradio==3.50.2
9
-
10
- # 音频处理
11
- librosa>=0.9.0
12
- soundfile>=0.12.0
13
- scipy>=1.10.0
14
- numpy>=1.23.0
15
- praat-parselmouth>=0.4.3
16
- torchcrepe>=0.0.20
17
-
18
- # 向量检索
19
- faiss-cpu>=1.7.4
20
-
21
- # 工具库
22
- tqdm>=4.65.0
23
- requests>=2.28.0
24
- python-dotenv>=1.0.0
25
- colorama>=0.4.6
26
-
27
- # AI 翻唱功能(核心)
28
- audio-separator
29
- huggingface_hub>=0.19.0
30
- pedalboard>=0.7.0
31
- ffmpeg-python>=0.2.0
 
 
 
 
 
 
 
 
 
1
+ # RVC AI 翻唱依赖 (Hugging Face Space - CPU 精简版)
2
+ # 注意:此文件用于 HF Space 部署,同步到 Space 时需重命名为 requirements.txt
3
+ # 本地安装请使用 requirements.txt(包含完整 GPU 依赖)
4
+
5
+ # PyTorch
6
+ torch>=2.0.0
7
+ torchaudio>=2.0.0
8
+
9
+ # Gradio 界面
10
+ gradio==3.50.2
11
+
12
+ # 音频处理
13
+ librosa>=0.9.0
14
+ soundfile>=0.12.0
15
+ scipy>=1.10.0
16
+ numpy>=1.23.0
17
+ praat-parselmouth>=0.4.3
18
+ torchcrepe>=0.0.20
19
+
20
+ # 向量检索
21
+ faiss-cpu>=1.7.4
22
+
23
+ # 工具库
24
+ tqdm>=4.65.0
25
+ requests>=2.28.0
26
+ python-dotenv>=1.0.0
27
+ colorama>=0.4.6
28
+
29
+ # AI 翻唱功能(核心)
30
+ audio-separator
31
+ huggingface_hub>=0.19.0
32
+ pedalboard>=0.7.0
33
+ ffmpeg-python>=0.2.0
34
+
35
+ # 以下包在 HF Space 构建环境中编译失败,改为运行时按需安装:
36
+ # fairseq==0.12.2 (HuBERT 特征提取)
37
+ # demucs>=4.0.0 (人声分离备选)
38
+ # pyworld>=0.3.4 (F0 提取备选)
39
+ # av>=10.0.0 (音频解码备选)
run.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ RVC AI 翻唱 - 主入口
4
+ """
5
+ import os
6
+ import sys
7
+ import argparse
8
+ from pathlib import Path
9
+
10
+ # 添加项目根目录到路径
11
+ ROOT_DIR = Path(__file__).parent
12
+ sys.path.insert(0, str(ROOT_DIR))
13
+
14
+ from lib.logger import log
15
+
16
+
17
def check_environment():
    """Log details of the runtime environment and verify PyTorch is importable.

    Logs the Python version, the PyTorch version, the acceleration backends
    listed by ``lib.device.get_device_info()`` and a description of the first
    accelerator found (CUDA/ROCm, Intel XPU, DirectML or Apple MPS).

    Returns:
        bool: ``False`` when PyTorch, or a module needed for device
        detection, cannot be imported; ``True`` otherwise. An outdated
        Python version only produces a warning.
    """
    log.header("RVC AI 翻唱系统")

    # Check the Python version (warning only, never fatal).
    py_version = sys.version_info
    log.info(f"Python 版本: {py_version.major}.{py_version.minor}.{py_version.micro}")

    if py_version < (3, 8):
        log.warning("建议使用 Python 3.8 或更高版本")

    # Keep this try body minimal so that only a genuinely missing PyTorch
    # is reported as "PyTorch not installed".
    try:
        import torch
    except ImportError:
        log.error("未安装 PyTorch")
        return False

    log.info(f"PyTorch 版本: {torch.__version__}")

    # Device detection relies on project helpers and optional backends; an
    # import failure here is reported with its real cause instead of being
    # mislabelled as a missing PyTorch installation.
    try:
        from lib.device import get_device_info, _is_rocm, _has_xpu, _has_directml, _has_mps
        info = get_device_info()
        log.info(f"可用加速后端: {', '.join(info['backends'])}")

        if torch.cuda.is_available():
            # ROCm builds of PyTorch also answer through the torch.cuda API.
            backend = "ROCm" if _is_rocm() else "CUDA"
            log.info(f"{backend} 版本: {torch.version.hip if _is_rocm() else torch.version.cuda}")
            log.info(f"GPU: {torch.cuda.get_device_name(0)}")
        elif _has_xpu():
            log.info(f"Intel GPU: {torch.xpu.get_device_name(0)}")
        elif _has_directml():
            import torch_directml
            log.info(f"DirectML 设备: {torch_directml.device_name(0)}")
        elif _has_mps():
            log.info("Apple MPS 加速可用")
        else:
            log.warning("未检测到 GPU 加速,将使用 CPU")
    except ImportError as e:
        log.error(f"设备检测模块导入失败: {e}")
        return False

    return True
55
+
56
+
57
def check_models():
    """Ensure every required model is present, downloading any that are missing.

    Returns:
        bool: ``True`` when all models listed in ``REQUIRED_MODELS`` pass
        ``check_model`` or the download of the required models succeeds;
        ``False`` when that download reports failure.
    """
    from tools.download_models import check_model, REQUIRED_MODELS

    absent = [model_name for model_name in REQUIRED_MODELS if not check_model(model_name)]

    # Nothing missing: no download needed.
    if not absent:
        return True

    log.warning(f"缺少必需模型: {', '.join(absent)}")
    log.info("正在下载...")

    # Imported lazily so the downloader is only loaded when it is needed.
    from tools.download_models import download_required_models
    downloaded = download_required_models()
    if not downloaded:
        log.error("模型下载失败,请检查网络连接")
        return False

    return True
75
+
76
+
77
def main():
    """Command-line entry point.

    Parses the CLI options, then either downloads all models and returns
    (``--download-models``) or runs the environment and model checks
    (unless ``--skip-check`` is given) and launches the Gradio interface.
    Exits with status 1 when a check fails.
    """
    cli = argparse.ArgumentParser(description="RVC AI 翻唱系统")
    cli.add_argument("--host", type=str, default="127.0.0.1", help="服务器地址 (默认: 127.0.0.1)")
    cli.add_argument("--port", type=int, default=7860, help="服务器端口 (默认: 7860)")
    cli.add_argument("--share", action="store_true", help="创建公共链接")
    cli.add_argument("--skip-check", action="store_true", help="跳过环境检查")
    cli.add_argument("--download-models", action="store_true", help="仅下载模型")

    opts = cli.parse_args()

    # Download-only mode: fetch the models and stop.
    if opts.download_models:
        from tools.download_models import download_all_models
        download_all_models()
        return

    run_checks = not opts.skip_check

    # Environment check.
    if run_checks and not check_environment():
        sys.exit(1)

    # Model check (only reached when the environment check passed).
    if run_checks and not check_models():
        log.info("提示: 可以使用 --skip-check 跳过检查")
        sys.exit(1)

    # Launch the UI; imported late so the checks above run first.
    log.info(f"启动 Gradio 界面: http://{opts.host}:{opts.port}")
    from ui.app import launch
    launch(host=opts.host, port=opts.port, share=opts.share)
130
+
131
+
132
+ if __name__ == "__main__":
133
+ main()