mason369 committed on
Commit
a9536c4
·
verified ·
1 Parent(s): 4204217

Release v1.2.1

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .env.example +15 -0
  2. .gitattributes +2 -0
  3. .github/workflows/build-executables.yml +373 -0
  4. .gitignore +80 -0
  5. AGENTS.md +126 -0
  6. AI-RVC.spec +71 -0
  7. AI_RVC_Colab.ipynb +380 -0
  8. CLAUDE.md +126 -0
  9. LICENSE +21 -0
  10. README_HF.md +6 -6
  11. app.py +16 -9
  12. assets/weights/characters/_version_notes.json +120 -0
  13. assets/weights/characters/mari/metadata.json +177 -0
  14. assets/weights/characters/tokai_teio/metadata.json +147 -0
  15. check_deecho_config.py +130 -0
  16. configs/config.json +15 -9
  17. configs/presets/balanced.json +2 -2
  18. configs/presets/clarity_priority.json +2 -2
  19. configs/presets/timbre_priority.json +2 -2
  20. docs/Colab演示.png +3 -0
  21. docs/Windows界面.png +3 -0
  22. docs/plans/2026-03-30-portable-ffmpeg-hardening.md +74 -0
  23. docs/reports/2026-03-24-cover-quality-final-root-cause-report.md +127 -0
  24. docs/superpowers/plans/2026-05-17-v1.2.1-language-release.md +90 -0
  25. docs/superpowers/specs/2026-05-17-v1.2.1-language-release-design.md +21 -0
  26. i18n/en_US.json +165 -0
  27. i18n/zh_CN.json +41 -10
  28. infer/__init__.py +17 -17
  29. infer/advanced_dereverb.py +0 -32
  30. infer/cover_pipeline.py +0 -0
  31. infer/f0_extractor.py +114 -127
  32. infer/modules/uvr5/modules.py +20 -4
  33. infer/modules/vc/pipeline.py +89 -30
  34. infer/official_adapter.py +114 -8
  35. infer/quality_policy.py +294 -0
  36. infer/separator.py +349 -123
  37. install.py +396 -0
  38. lib/audio.py +51 -51
  39. lib/audio_metrics.py +93 -0
  40. lib/ffmpeg_runtime.py +130 -0
  41. lib/logger.py +254 -254
  42. lib/mixer.py +22 -22
  43. lib/runtime_build.py +59 -0
  44. mcp/__init__.py +4 -0
  45. mcp/server.py +139 -0
  46. mcp/tools.py +233 -0
  47. models/__init__.py +7 -7
  48. models/rmvpe.py +439 -439
  49. models/synthesizer.py +853 -853
  50. requirements_hf.txt +39 -0
.env.example ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RVC 语音转换配置
2
+
3
+ # 设备设置 (cuda / cpu)
4
+ DEVICE=cuda
5
+
6
+ # Gradio 服务器设置
7
+ SERVER_HOST=127.0.0.1
8
+ SERVER_PORT=7860
9
+
10
+ # 模型路径
11
+ HUBERT_PATH=assets/hubert/hubert_base.pt
12
+ RMVPE_PATH=assets/rmvpe/rmvpe.pt
13
+
14
+ # 日志级别 (DEBUG / INFO / WARNING / ERROR)
15
+ LOG_LEVEL=INFO
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ docs/Colab演示.png filter=lfs diff=lfs merge=lfs -text
37
+ docs/Windows界面.png filter=lfs diff=lfs merge=lfs -text
.github/workflows/build-executables.yml ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Build Executables
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v*'
7
+ workflow_dispatch:
8
+ inputs:
9
+ tag:
10
+ description: '要上传资源的 Release 标签(如 v1.2.0),留空则上传到最新 Release'
11
+ required: false
12
+ default: ''
13
+
14
+ permissions:
15
+ contents: write
16
+
17
+ jobs:
18
+ build:
19
+ strategy:
20
+ fail-fast: false
21
+ matrix:
22
+ include:
23
+ - os: windows-latest
24
+ variant: CPU
25
+ platform: Windows
26
+ pytorch_url: https://download.pytorch.org/whl/cpu
27
+ sep: ";"
28
+ shell: pwsh
29
+ - os: windows-latest
30
+ variant: GPU
31
+ platform: Windows
32
+ pytorch_url: https://download.pytorch.org/whl/cu121
33
+ sep: ";"
34
+ shell: pwsh
35
+ - os: ubuntu-latest
36
+ variant: CPU
37
+ platform: Linux
38
+ pytorch_url: https://download.pytorch.org/whl/cpu
39
+ sep: ":"
40
+ shell: bash
41
+ - os: ubuntu-latest
42
+ variant: GPU
43
+ platform: Linux
44
+ pytorch_url: https://download.pytorch.org/whl/cu121
45
+ sep: ":"
46
+ shell: bash
47
+
48
+ runs-on: ${{ matrix.os }}
49
+ name: Build ${{ matrix.platform }}-${{ matrix.variant }}
50
+
51
+ steps:
52
+ - name: Checkout code
53
+ uses: actions/checkout@v4
54
+
55
+ - name: Set up Python
56
+ uses: actions/setup-python@v5
57
+ with:
58
+ python-version: '3.10'
59
+
60
+ - name: Install system dependencies (Linux)
61
+ if: runner.os == 'Linux'
62
+ run: |
63
+ sudo apt-get update
64
+ sudo apt-get install -y build-essential libsndfile1 ffmpeg portaudio19-dev
65
+
66
+ - name: Install FFmpeg (Windows)
67
+ if: runner.os == 'Windows'
68
+ run: choco install ffmpeg -y
69
+
70
+ - name: Install Python dependencies
71
+ run: |
72
+ python -m pip install --upgrade "pip<24.1"
73
+ pip install pyinstaller
74
+ pip install torch torchaudio --index-url ${{ matrix.pytorch_url }}
75
+ pip install omegaconf==2.0.6 --no-deps
76
+ pip install PyYAML antlr4-python3-runtime hydra-core
77
+ pip install fairseq==0.12.2 --no-deps
78
+ pip install "jinja2<3.1.5"
79
+ pip install gradio==3.50.2 librosa soundfile scipy numpy praat-parselmouth pyworld faiss-cpu tqdm requests python-dotenv colorama huggingface_hub pedalboard ffmpeg-python av imageio-ffmpeg
80
+ pip install demucs torchcrepe --no-deps
81
+ pip install julius dora-search lameenc openunmix treetable
82
+ python -c "import torch; print(f'PyTorch: {torch.__version__}, CUDA: {torch.cuda.is_available()}')"
83
+
84
+ - name: Install audio-separator
85
+ run: pip install audio-separator onnxruntime
86
+
87
+ - name: Bundle FFmpeg runtime
88
+ shell: bash
89
+ run: |
90
+ python - <<'PY'
91
+ import os
92
+ import shutil
93
+ import stat
94
+ import subprocess
95
+ from pathlib import Path
96
+
97
+ import imageio_ffmpeg
98
+
99
+ def check_executable(executable: Path) -> None:
100
+ result = subprocess.run([str(executable), "-version"], text=True, capture_output=True)
101
+ if result.returncode != 0:
102
+ details = "\n".join(
103
+ part.strip()
104
+ for part in (result.stdout, result.stderr)
105
+ if part and part.strip()
106
+ )
107
+ raise RuntimeError(f"{executable} failed -version:\n{details}")
108
+
109
+ def resolve_ffprobe() -> Path:
110
+ ffprobe_src = shutil.which("ffprobe")
111
+ if not ffprobe_src:
112
+ raise RuntimeError("ffprobe not found after installing FFmpeg")
113
+
114
+ ffprobe_path = Path(ffprobe_src)
115
+ if os.name == "nt":
116
+ chocolatey_root = Path(os.environ.get("ChocolateyInstall", r"C:\ProgramData\chocolatey"))
117
+ chocolatey_shim_dir = chocolatey_root / "bin"
118
+ if ffprobe_path.parent.resolve() == chocolatey_shim_dir.resolve():
119
+ real_ffprobe = chocolatey_root / "lib" / "ffmpeg" / "tools" / "ffmpeg" / "bin" / "ffprobe.exe"
120
+ if not real_ffprobe.exists():
121
+ raise RuntimeError(f"Chocolatey ffprobe shim found, but real binary is missing: {real_ffprobe}")
122
+ return real_ffprobe
123
+
124
+ return ffprobe_path
125
+
126
+ bundle_dir = Path("tools/ffmpeg/bin")
127
+ bundle_dir.mkdir(parents=True, exist_ok=True)
128
+
129
+ ffmpeg_src = Path(imageio_ffmpeg.get_ffmpeg_exe())
130
+ ffmpeg_name = "ffmpeg.exe" if os.name == "nt" else "ffmpeg"
131
+ ffmpeg_dest = bundle_dir / ffmpeg_name
132
+ shutil.copy2(ffmpeg_src, ffmpeg_dest)
133
+ ffmpeg_dest.chmod(ffmpeg_dest.stat().st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
134
+ check_executable(ffmpeg_dest)
135
+ print(f"Bundled ffmpeg: {ffmpeg_dest}")
136
+
137
+ ffprobe_src = resolve_ffprobe()
138
+ ffprobe_name = "ffprobe.exe" if os.name == "nt" else "ffprobe"
139
+ ffprobe_dest = bundle_dir / ffprobe_name
140
+ shutil.copy2(ffprobe_src, ffprobe_dest)
141
+ ffprobe_dest.chmod(ffprobe_dest.stat().st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
142
+ subprocess.run([str(ffprobe_dest), "-version"], text=True, capture_output=True, check=True)
143
+ print(f"Bundled ffprobe: {ffprobe_dest}")
144
+ PY
145
+
146
+ - name: Download all AI models
147
+ shell: bash
148
+ env:
149
+ PYTHONIOENCODING: utf-8
150
+ run: |
151
+ echo "=== Download all base models (HuBERT, RMVPE, UVR5, DeEcho, pretrained) ==="
152
+ python tools/download_models.py --all
153
+
154
+ echo "=== Download Roformer separator models ==="
155
+ python -c "
156
+ from audio_separator.separator import Separator
157
+ import os
158
+ model_dir = os.path.join('assets', 'separator_models')
159
+ os.makedirs(model_dir, exist_ok=True)
160
+ # Vocal separation model
161
+ sep = Separator(output_dir='.', model_file_dir=model_dir)
162
+ sep.load_model('model_bs_roformer_ep_317_sdr_12.9755.ckpt')
163
+ del sep
164
+ # Karaoke SOTA ensemble models
165
+ for karaoke_model in [
166
+ 'mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt',
167
+ 'mel_band_roformer_karaoke_gabox_v2.ckpt',
168
+ 'mel_band_roformer_karaoke_becruily.ckpt',
169
+ ]:
170
+ sep2 = Separator(output_dir='.', model_file_dir=model_dir)
171
+ sep2.load_model(karaoke_model)
172
+ del sep2
173
+ print('Roformer models downloaded.')
174
+ "
175
+
176
+ echo "=== Verify downloaded models ==="
177
+ find assets/ -name "*.pt" -o -name "*.pth" -o -name "*.ckpt" -o -name "*.onnx" | while read f; do
178
+ SIZE=$(stat -c%s "$f" 2>/dev/null || stat -f%z "$f")
179
+ echo " $f ($(( SIZE / 1048576 )) MB)"
180
+ done
181
+
182
+ - name: Build executable
183
+ shell: bash
184
+ run: |
185
+ SEP="${{ matrix.sep }}"
186
+ NAME="AI-RVC-${{ matrix.platform }}-${{ matrix.variant }}"
187
+ pyinstaller --name "${NAME}" \
188
+ --onedir \
189
+ --add-data "ui${SEP}ui" \
190
+ --add-data "infer${SEP}infer" \
191
+ --add-data "lib${SEP}lib" \
192
+ --add-data "models${SEP}models" \
193
+ --add-data "tools${SEP}tools" \
194
+ --add-data "i18n${SEP}i18n" \
195
+ --add-data "configs${SEP}configs" \
196
+ --add-data "assets/hubert${SEP}assets/hubert" \
197
+ --add-data "assets/rmvpe${SEP}assets/rmvpe" \
198
+ --add-data "assets/uvr5_weights${SEP}assets/uvr5_weights" \
199
+ --add-data "assets/pretrained_v2${SEP}assets/pretrained_v2" \
200
+ --add-data "assets/separator_models${SEP}assets/separator_models" \
201
+ --hidden-import=torch \
202
+ --hidden-import=torchaudio \
203
+ --hidden-import=gradio \
204
+ --hidden-import=librosa \
205
+ --hidden-import=soundfile \
206
+ --hidden-import=fairseq \
207
+ --hidden-import=audio_separator \
208
+ --hidden-import=demucs \
209
+ --hidden-import=pedalboard \
210
+ --collect-all torch \
211
+ --collect-all torchaudio \
212
+ --collect-all gradio \
213
+ --collect-all gradio_client \
214
+ run.py
215
+
216
+ - name: Create portable package
217
+ shell: bash
218
+ run: |
219
+ NAME="AI-RVC-${{ matrix.platform }}-${{ matrix.variant }}"
220
+ PKG="${NAME}-Portable"
221
+ mkdir -p "${PKG}"
222
+ # onedir 输出在 dist/NAME/ 目录下,复制全部内容
223
+ cp -r dist/${NAME}/* "${PKG}/"
224
+ cp README.md "${PKG}/"
225
+ [ -f LICENSE ] && cp LICENSE "${PKG}/"
226
+
227
+ if [ "${{ matrix.variant }}" = "GPU" ]; then
228
+ VARIANT_NOTE="GPU 版(CUDA 12.1),支持 NVIDIA 显卡加速
229
+ 如果没有 NVIDIA 显卡,程序会自动回退到 CPU 推理"
230
+ else
231
+ VARIANT_NOTE="CPU 版,无需显卡即可运行
232
+ 如需 GPU 加速,请下载 GPU 版本或使用本地安装方式:python install.py"
233
+ fi
234
+
235
+ if [ "${{ matrix.platform }}" = "Windows" ]; then
236
+ EXE_NAME="${NAME}.exe"
237
+ cat > "${PKG}/使用说明.txt" << HEREDOC
238
+ AI-RVC ${{ matrix.platform }} 便携版(${{ matrix.variant }})
239
+
240
+ 使用方法:
241
+ 双击 ${EXE_NAME} 启动
242
+ 浏览器访问 http://127.0.0.1:7860
243
+
244
+ ${VARIANT_NOTE}
245
+
246
+ AI 模型已内置,无需额外下载
247
+ 无需安装 Python,解压即用
248
+ HEREDOC
249
+ else
250
+ EXE_NAME="${NAME}"
251
+ chmod +x "${PKG}/${EXE_NAME}"
252
+ cat > "${PKG}/使用说明.txt" << HEREDOC
253
+ AI-RVC ${{ matrix.platform }} 便携版(${{ matrix.variant }})
254
+
255
+ 使用方法:
256
+ chmod +x ${EXE_NAME}
257
+ ./${EXE_NAME}
258
+ 浏览器访问 http://127.0.0.1:7860
259
+
260
+ ${VARIANT_NOTE}
261
+
262
+ AI 模型已内置,无需额外下载
263
+ 无需安装 Python,解压即用
264
+ HEREDOC
265
+ fi
266
+
267
+ - name: Compress package
268
+ shell: bash
269
+ run: |
270
+ NAME="AI-RVC-${{ matrix.platform }}-${{ matrix.variant }}"
271
+ PKG="${NAME}-Portable"
272
+
273
+ if [ "${{ matrix.platform }}" = "Windows" ]; then
274
+ # 先压缩成 zip
275
+ 7z a -tzip "${PKG}.zip" "${PKG}"
276
+ # 超过 1.9GB 时删除 zip 改用 7z(压缩率更高)
277
+ FILE_SIZE=$(stat -c%s "${PKG}.zip" 2>/dev/null || wc -c < "${PKG}.zip" | tr -d ' ')
278
+ if [ "$FILE_SIZE" -gt 1900000000 ]; then
279
+ rm "${PKG}.zip"
280
+ # 先尝试不分卷的 7z
281
+ 7z a "${PKG}.7z" "${PKG}"
282
+ SEVENZ_SIZE=$(stat -c%s "${PKG}.7z" 2>/dev/null || wc -c < "${PKG}.7z" | tr -d ' ')
283
+ if [ "$SEVENZ_SIZE" -gt 1900000000 ]; then
284
+ # 7z 也超限,改用分卷
285
+ rm "${PKG}.7z"
286
+ 7z a -v1900m "${PKG}.7z" "${PKG}"
287
+ fi
288
+ fi
289
+ else
290
+ tar -czf "${PKG}.tar.gz" "${PKG}"
291
+ # 超过 1.9GB 时拆分
292
+ FILE_SIZE=$(stat -c%s "${PKG}.tar.gz" 2>/dev/null || stat -f%z "${PKG}.tar.gz")
293
+ if [ "$FILE_SIZE" -gt 1900000000 ]; then
294
+ split -b 1900M "${PKG}.tar.gz" "${PKG}.tar.gz.part"
295
+ rm "${PKG}.tar.gz"
296
+ fi
297
+ fi
298
+
299
+ - name: Upload artifact
300
+ uses: actions/upload-artifact@v4
301
+ with:
302
+ name: AI-RVC-${{ matrix.platform }}-${{ matrix.variant }}
303
+ path: AI-RVC-${{ matrix.platform }}-${{ matrix.variant }}-Portable*
304
+
305
+ upload-release:
306
+ needs: [build]
307
+ runs-on: ubuntu-latest
308
+
309
+ steps:
310
+ - name: Download all artifacts
311
+ uses: actions/download-artifact@v4
312
+ with:
313
+ merge-multiple: true
314
+
315
+ - name: List artifacts
316
+ run: ls -lh AI-RVC-*
317
+
318
+ - name: Determine target tag
319
+ id: tag
320
+ run: |
321
+ if [ "${{ github.event_name }}" = "push" ]; then
322
+ echo "tag=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT
323
+ elif [ -n "${{ github.event.inputs.tag }}" ]; then
324
+ echo "tag=${{ github.event.inputs.tag }}" >> $GITHUB_OUTPUT
325
+ else
326
+ LATEST=$(gh release list --repo ${{ github.repository }} --limit 1 --json tagName -q '.[0].tagName')
327
+ echo "tag=${LATEST}" >> $GITHUB_OUTPUT
328
+ fi
329
+ env:
330
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
331
+
332
+ - name: Upload assets to release
333
+ run: |
334
+ TAG="${{ steps.tag.outputs.tag }}"
335
+ echo "上传资源到 Release: ${TAG}"
336
+ # 收集所有构建产物(含分卷文件)
337
+ mapfile -t FILES < <(find . -maxdepth 1 -name "AI-RVC-*-Portable*" -type f | sort)
338
+ upload_asset_with_retry() { # Upload file $2 to release tag $1; returns 0 on success, 1 after three failed attempts.
339
+ local tag="$1"
340
+ local file="$2"
341
+ local attempt
342
+ local exit_code
343
+ # Up to 3 attempts, each capped at 30 minutes by `timeout`; waits 20s, then 40s, between attempts.
344
+ for attempt in 1 2 3; do
345
+ echo "上传(${attempt}/3): ${file}"
346
+ if timeout 30m gh release upload "${tag}" "${file}" \
347
+ --repo ${{ github.repository }} \
348
+ --clobber; then
349
+ return 0
350
+ else
351
+ exit_code=$? # Capture inside the else branch: after `fi`, $? is the if statement's own status (0), not the upload's.
352
+ fi
353
+ echo "上传失败(退出码=${exit_code}): ${file}"
354
+ if [ "${attempt}" -lt 3 ]; then
355
+ sleep $(( attempt * 20 ))
356
+ fi
357
+ done
358
+
359
+ echo "上传最终失败: ${file}"
360
+ return 1
361
+ }
362
+ echo "找到以下文件:"
363
+ for f in "${FILES[@]}"; do
364
+ SIZE=$(stat -c%s "$f" 2>/dev/null || stat -f%z "$f")
365
+ echo " $f ($(( SIZE / 1048576 )) MB)"
366
+ done
367
+ # 逐个上传
368
+ for f in "${FILES[@]}"; do
369
+ upload_asset_with_retry "${TAG}" "$f"
370
+ done
371
+ echo "全部上传完成"
372
+ env:
373
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
.gitignore ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib64/
14
+ parts/
15
+ sdist/
16
+ var/
17
+ wheels/
18
+ *.egg-info/
19
+ .installed.cfg
20
+ *.egg
21
+
22
+ # 虚拟环境
23
+ venv/
24
+ ENV/
25
+ env/
26
+ .venv/
27
+
28
+ # IDE
29
+ .idea/
30
+ .vscode/
31
+ .claude/
32
+ *.swp
33
+ *.swo
34
+
35
+ # 环境变量
36
+ .env
37
+
38
+ # 模型文件 (太大,不提交)
39
+ assets/hubert/*.pt
40
+ assets/rmvpe/*.pt
41
+ assets/pretrained_v2/*.pth
42
+ assets/separator_models/
43
+ assets/uvr5_weights/
44
+ assets/weights/*.pth
45
+ assets/weights/*.index
46
+ assets/weights/characters/**/*.pth
47
+ assets/weights/characters/**/*.index
48
+ assets/weights/characters/**/ai_rvc_model.json
49
+ assets/weights/official_models/
50
+ assets/weights/official_indexes/
51
+
52
+ # 临时文件
53
+ temp/
54
+ *.tmp
55
+ *.temp
56
+ *.log
57
+ logs/
58
+
59
+ # 音频文件
60
+ *.wav
61
+ *.mp3
62
+ *.flac
63
+ *.ogg
64
+ !assets/test/
65
+
66
+ # Gradio
67
+ flagged/
68
+ /venv310
69
+
70
+ # 上游参考代码 (vendored, 独立 git repo)
71
+ _official_rvc/
72
+
73
+ # 输出文件
74
+ output/
75
+ outputs/
76
+ /.playwright-cli
77
+ /AI-RVC-Windows-GPU-Portable
78
+ /AI-RVC-Windows-GPU-Portable.7z.*
79
+ /tools/ffmpeg/
80
+ /scratch/
AGENTS.md ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AGENTS.md
2
+
3
+ This file provides guidance to Codex when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ RVC v2 voice conversion & AI cover system. Users upload a song, the pipeline separates vocals from accompaniment (Mel-Band Roformer), converts the vocal timbre via RVC v2 (HuBERT + RMVPE + FAISS), then mixes the result back with the accompaniment.
8
+
9
+ **Platform Support**: Windows / Linux / WSL2 / Google Colab
10
+
11
+ **Key Features**:
12
+ - AI song covers with automatic vocal separation and mixing
13
+ - 117 downloadable character models
14
+ - 4 mixing presets (universal, vocal-focused, accompaniment-focused, live)
15
+ - Karaoke mode (lead/backing vocal separation)
16
+ - 4 VC preprocessing modes (auto, direct, uvr_deecho, legacy)
17
+ - Dual VC pipeline (current implementation vs official RVC)
18
+ - Multi-backend GPU support (CUDA, ROCm, XPU, DirectML, MPS)
19
+
20
+ ## Commands
21
+
22
+ ```powershell
23
+ # Activate venv (Windows)
24
+ .\venv310\Scripts\Activate.ps1
25
+
26
+ # Activate venv (Linux/WSL2)
27
+ source venv310/bin/activate
28
+
29
+ # Install dependencies
30
+ python install.py # full install + launch
31
+ python install.py --check # check only
32
+ python install.py --cpu # CPU variant
33
+
34
+ # Run
35
+ python run.py # default: http://127.0.0.1:7860
36
+ python run.py --skip-check # skip env/model validation
37
+ python run.py --host 0.0.0.0 --port 8080 --share
38
+
39
+ # Download base models (HuBERT, RMVPE)
40
+ python tools/download_models.py
41
+
42
+ # Download character models
43
+ python -c "from tools.character_models import download_character_model; download_character_model('rin')"
44
+
45
+ # Quick CUDA check
46
+ python -c "import torch; print(torch.cuda.is_available())"
47
+
48
+ # Colab
49
+ # Open AI_RVC_Colab.ipynb in Google Colab, set runtime to GPU (T4), run cells sequentially
50
+ ```
51
+
52
+ ## Architecture
53
+
54
+ **Entry:** `run.py` → env check → model check → `ui/app.py:launch()`
55
+
56
+ **Pipeline flow** (`infer/cover_pipeline.py:CoverPipeline.process`):
57
+ 1. Vocal separation (`infer/separator.py`) — Roformer (default), Demucs, or UVR5
58
+ 2. RVC voice conversion (`infer/pipeline.py`) — HuBERT features → RMVPE F0 → RVC v2 inference with FAISS retrieval
59
+ 3. Mixing (`lib/mixer.py`) — volume adjust + reverb via pedalboard
60
+
61
+ **Character model system** (`tools/character_models.py`):
62
+ - 117 downloadable character models from HuggingFace (`trioskosmos/rvc_models`)
63
+ - Stored in `assets/weights/characters/`
64
+ - Version notes (epochs, sample rate) extracted from .pth metadata and cached in `_version_notes.json`
65
+ - Display name assembly: `_get_display_name()` appends `(500 epochs·40k)` style training info
66
+
67
+ **UI** (`ui/app.py`):
68
+ - Gradio 3.50.2, single-file ~2000 lines
69
+ - i18n via `i18n/zh_CN.json`, accessed through `t(key, section)` helper
70
+ - Three main tabs: song cover (full pipeline), model management, settings
71
+ - Cover tab features:
72
+ - Character model download/management with series filtering and keyword search
73
+ - 4 mixing presets (universal, vocal-focused, accompaniment-focused, live)
74
+ - Karaoke separation (lead/backing vocals)
75
+ - 4 VC preprocessing modes (auto, direct, uvr_deecho, legacy)
76
+ - Source constraint control (auto/off/on)
77
+ - Dual VC pipeline mode (current/official)
78
+ - Singing repair (official mode only)
79
+ - Real-time VC route status display
80
+ - Model management tab:
81
+ - Base model download (HuBERT, RMVPE)
82
+ - Mature DeEcho model download
83
+ - Model list table with refresh
84
+ - Settings tab:
85
+ - Device info display
86
+ - Backend selection (CUDA/ROCm/XPU/DirectML/MPS/CPU)
87
+ - Config save
88
+
89
+ **Config:** `configs/config.json` — device, F0 method, index rate, cover separator settings, path mappings
90
+
91
+ ## Key Conventions
92
+
93
+ - Python 3.10, UTF-8, 4-space indent
94
+ - `snake_case` functions/variables, `PascalCase` classes, `UPPER_SNAKE_CASE` constants
95
+ - User-facing text is bilingual Chinese/English
96
+ - Commit messages: short imperative subjects, Chinese/English mixed (e.g. `infer: fix CUDA OOM`)
97
+ - No automated test suite; verify changes by running one voice conversion + one cover through the UI
98
+ - `_official_rvc/` is vendored upstream reference — don't modify unless syncing
99
+
100
+ ## Important Paths
101
+
102
+ - `configs/config.json` — all runtime settings
103
+ - `infer/cover_pipeline.py` — orchestrates the full cover workflow
104
+ - `infer/pipeline.py` — RVC v2 inference core
105
+ - `infer/separator.py` — Roformer/Demucs vocal separation wrappers
106
+ - `tools/character_models.py` — character model registry (117 entries) + download logic
107
+ - `tools/download_models.py` — base model (HuBERT/RMVPE) + mature DeEcho downloader
108
+ - `lib/mixer.py` — audio mixing with volume/reverb
109
+ - `ui/app.py` — entire Gradio UI (~2000 lines)
110
+ - `mcp/server.py` + `mcp/tools.py` — MCP server integration for Codex
111
+ - `AI_RVC_Colab.ipynb` — Google Colab notebook with full feature parity
112
+ - `install.py` — cross-platform installation script (Windows/Linux)
113
+
114
+ ## Things to Watch
115
+
116
+ - `fairseq` is pinned to `0.12.2` — HuBERT loading breaks on other versions
117
+ - `audio-separator` must be installed with `[gpu]` extra for CUDA support
118
+ - Roformer model auto-downloads on first use to `assets/separator_models/`
119
+ - Gradio is pinned to `3.50.2`; the UI code uses v3 API patterns (not v4)
120
+ - Model weights (.pt, .pth) and audio files are gitignored — never commit them
121
+ - Path handling uses `pathlib.Path` for cross-platform compatibility (Windows/Linux)
122
+ - Virtual environment activation differs by platform: `Scripts/Activate.ps1` (Windows) vs `bin/activate` (Linux)
123
+ - `install.py` has hardcoded Windows Python paths in `PYTHON310_CANDIDATES` but falls back to `py -3.10` launcher
124
+ - Platform detection uses `os.name == "nt"` for Windows-specific logic (venv paths, etc.)
125
+ - All core functionality is platform-agnostic; audio libraries work better on Linux
126
+ - Colab notebook (`AI_RVC_Colab.ipynb`) provides full feature parity with Web UI
AI-RVC.spec ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- mode: python ; coding: utf-8 -*-
2
+
3
+ block_cipher = None
4
+
5
+ a = Analysis(
6
+ ['run.py'],
7
+ pathex=[],
8
+ binaries=[],
9
+ datas=[
10
+ ('ui', 'ui'),
11
+ ('infer', 'infer'),
12
+ ('lib', 'lib'),
13
+ ('models', 'models'),
14
+ ('tools', 'tools'),
15
+ ('i18n', 'i18n'),
16
+ ('configs', 'configs'),
17
+ ('assets/hubert', 'assets/hubert'),
18
+ ('assets/rmvpe', 'assets/rmvpe'),
19
+ ],
20
+ hiddenimports=[
21
+ 'torch',
22
+ 'torchaudio',
23
+ 'gradio',
24
+ 'librosa',
25
+ 'soundfile',
26
+ 'scipy',
27
+ 'numpy',
28
+ 'fairseq',
29
+ 'audio_separator',
30
+ 'demucs',
31
+ 'pedalboard',
32
+ 'praat-parselmouth',
33
+ 'pyworld',
34
+ 'torchcrepe',
35
+ 'faiss',
36
+ 'huggingface_hub',
37
+ 'ffmpeg',
38
+ ],
39
+ hookspath=[],
40
+ hooksconfig={},
41
+ runtime_hooks=[],
42
+ excludes=[],
43
+ win_no_prefer_redirects=False,
44
+ win_private_assemblies=False,
45
+ cipher=block_cipher,
46
+ noarchive=False,
47
+ )
48
+
49
+ pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
50
+
51
+ exe = EXE(
52
+ pyz,
53
+ a.scripts,
54
+ a.binaries,
55
+ a.zipfiles,
56
+ a.datas,
57
+ [],
58
+ name='AI-RVC',
59
+ debug=False,
60
+ bootloader_ignore_signals=False,
61
+ strip=False,
62
+ upx=True,
63
+ upx_exclude=[],
64
+ runtime_tmpdir=None,
65
+ console=True,
66
+ disable_windowed_traceback=False,
67
+ argv_emulation=False,
68
+ target_arch=None,
69
+ codesign_identity=None,
70
+ entitlements_file=None,
71
+ )
AI_RVC_Colab.ipynb ADDED
@@ -0,0 +1,380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# AI-RVC 一键 AI 翻唱 (Colab)\n",
8
+ "\n",
9
+ "[Open in Colab](https://colab.research.google.com/github/mason369/AI-RVC/blob/master/AI_RVC_Colab.ipynb)\n",
10
+ "\n",
11
+ "这是 AI-RVC 的 Google Colab 运行入口。Notebook 会创建独立 Python 3.10 环境,再运行项目安装脚本,避免 Colab 默认 Python 版本变化影响 `fairseq==0.12.2`。\n",
12
+ "\n",
13
+ "当前处理链路模型:\n",
14
+ "\n",
15
+ "| 环节 | 默认模型 |\n",
16
+ "|------|----------|\n",
17
+ "| 人声分离 | `ensemble:vocal_rvc` |\n",
18
+ "| 卡拉OK分离 | `ensemble:karaoke` |\n",
19
+ "| 去混响 / DeEcho | `dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt` |\n",
20
+ "| 内容特征 | `hubert_base.pt` |\n",
21
+ "| F0 音高 | `rmvpe.pt` |\n",
22
+ "| VC 推理 | RVC v2 `.pth` + 可选 FAISS `.index` |\n",
23
+ "\n",
24
+ "使用前请在 Colab 菜单选择:`代码执行程序 -> 更改运行时类型 -> T4 GPU`,然后按顺序运行每个单元格。"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "markdown",
29
+ "metadata": {},
30
+ "source": [
31
+ "## 1. 检查 Colab 运行时\n",
32
+ "\n",
33
+ "确认当前机器有 GPU,并显示 Colab 默认 Python。后续实际安装会使用独立 Python 3.10 环境。"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": null,
39
+ "metadata": {},
40
+ "outputs": [],
41
+ "source": [
42
+ "import sys\n",
43
+ "import subprocess\n",
44
+ "\n",
45
+ "print('Colab 默认 Python:', sys.version)\n",
46
+ "print('\\nGPU 信息:')\n",
47
+ "subprocess.run(['nvidia-smi'], check=True)"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "markdown",
52
+ "metadata": {},
53
+ "source": [
54
+ "## 2. 克隆或刷新仓库\n",
55
+ "\n",
56
+ "如果 `/content/AI-RVC` 已存在,会重置到 GitHub `master` 分支,确保 Colab 环境和仓库最新代码一致。"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": null,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "%%bash\n",
66
+ "set -euo pipefail\n",
67
+ "cd /content\n",
68
+ "if [ -d AI-RVC/.git ]; then\n",
69
+ " git -C AI-RVC fetch origin master\n",
70
+ " git -C AI-RVC reset --hard origin/master\n",
71
+ "else\n",
72
+ " git clone https://github.com/mason369/AI-RVC.git AI-RVC\n",
73
+ "fi\n",
74
+ "cd /content/AI-RVC\n",
75
+ "git rev-parse --short HEAD\n",
76
+ "ls -la"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "markdown",
81
+ "metadata": {},
82
+ "source": [
83
+ "## 3. 创建 Python 3.10 环境并安装依赖\n",
84
+ "\n",
85
+ "这个步骤会安装 `uv`,用它准备 Python 3.10,然后调用项目自带 `install.py --no-run`。如果任何依赖安装失败,单元格会直接停止并显示错误。"
86
+ ]
87
+ },
88
+ {
89
+ "cell_type": "code",
90
+ "execution_count": null,
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "%%bash\n",
95
+ "set -euo pipefail\n",
96
+ "cd /content/AI-RVC\n",
97
+ "\n",
98
+ "sudo apt-get update -y\n",
99
+ "sudo apt-get install -y ffmpeg build-essential python3-dev curl git\n",
100
+ "\n",
101
+ "if ! command -v uv >/dev/null 2>&1; then\n",
102
+ " curl -LsSf https://astral.sh/uv/install.sh | sh\n",
103
+ "fi\n",
104
+ "export PATH=\"$HOME/.local/bin:$PATH\"\n",
105
+ "uv --version\n",
106
+ "uv python install 3.10\n",
107
+ "uv venv --seed --python 3.10 venv310\n",
108
+ "PY=/content/AI-RVC/venv310/bin/python\n",
109
+ "$PY --version\n",
110
+ "$PY -m pip --version\n",
111
+ "$PY -m pip install --upgrade pip\n",
112
+ "$PY -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu126\n",
113
+ "$PY - <<'PY'\n",
114
+ "import torch\n",
115
+ "print('Torch:', torch.__version__)\n",
116
+ "print('CUDA available:', torch.cuda.is_available())\n",
117
+ "if not torch.cuda.is_available():\n",
118
+ " raise RuntimeError('GPU PyTorch install completed but CUDA is not available.')\n",
119
+ "print('CUDA:', torch.version.cuda)\n",
120
+ "print('GPU:', torch.cuda.get_device_name(0))\n",
121
+ "PY\n",
122
+ "uv run --python 3.10 python install.py --no-run"
123
+ ]
124
+ },
125
+ {
126
+ "cell_type": "markdown",
127
+ "metadata": {},
128
+ "source": [
129
+ "## 4. 严格验证依赖与默认模型配置\n",
130
+ "\n",
131
+ "这里会确认关键包、Python 版本、CUDA、Colab 默认分离模型、卡拉OK模型、DeEcho模型都符合当前项目配置。"
132
+ ]
133
+ },
134
+ {
135
+ "cell_type": "code",
136
+ "execution_count": null,
137
+ "metadata": {},
138
+ "outputs": [],
139
+ "source": [
140
+ "%%bash\n",
141
+ "set -euo pipefail\n",
142
+ "cd /content/AI-RVC\n",
143
+ "PY=/content/AI-RVC/venv310/bin/python\n",
144
+ "$PY - <<'PY'\n",
145
+ "import importlib.metadata as md\n",
146
+ "import sys\n",
147
+ "import torch\n",
148
+ "from infer.separator import (\n",
149
+ " ROFORMER_DEFAULT_MODEL,\n",
150
+ " KARAOKE_DEFAULT_MODEL,\n",
151
+ " ROFORMER_DEREVERB_DEFAULT_MODEL,\n",
152
+ ")\n",
153
+ "\n",
154
+ "assert sys.version_info[:2] == (3, 10), sys.version\n",
155
+ "print('Python:', sys.version)\n",
156
+ "print('Torch:', torch.__version__)\n",
157
+ "print('CUDA available:', torch.cuda.is_available())\n",
158
+ "if not torch.cuda.is_available():\n",
159
+ " raise RuntimeError('Colab runtime did not provide CUDA. Please switch runtime type to T4 GPU and rerun.')\n",
160
+ "\n",
161
+ "expected = {\n",
162
+ " 'audio-separator': '0.44.1',\n",
163
+ " 'fairseq': '0.12.2',\n",
164
+ " 'gradio': '3.50.2',\n",
165
+ "}\n",
166
+ "for dist, minimum in expected.items():\n",
167
+ " version = md.version(dist)\n",
168
+ " print(f'{dist}: {version}')\n",
169
+ "\n",
170
+ "assert ROFORMER_DEFAULT_MODEL == 'ensemble:vocal_rvc', ROFORMER_DEFAULT_MODEL\n",
171
+ "assert KARAOKE_DEFAULT_MODEL == 'ensemble:karaoke', KARAOKE_DEFAULT_MODEL\n",
172
+ "assert ROFORMER_DEREVERB_DEFAULT_MODEL == 'dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt', ROFORMER_DEREVERB_DEFAULT_MODEL\n",
173
+ "\n",
174
+ "import faiss # noqa: F401\n",
175
+ "from audio_separator.separator import Separator # noqa: F401\n",
176
+ "import fairseq # noqa: F401\n",
177
+ "print('Dependency and model default checks passed.')\n",
178
+ "PY"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "markdown",
183
+ "metadata": {},
184
+ "source": [
185
+ "## 5. 下载基础模型\n",
186
+ "\n",
187
+ "下载 HuBERT、RMVPE、UVR5 HP2 等必需模型。模型会保存到仓库的 `assets/` 目录。"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "code",
192
+ "execution_count": null,
193
+ "metadata": {},
194
+ "outputs": [],
195
+ "source": [
196
+ "%%bash\n",
197
+ "set -euo pipefail\n",
198
+ "cd /content/AI-RVC\n",
199
+ "PY=/content/AI-RVC/venv310/bin/python\n",
200
+ "$PY tools/download_models.py\n",
201
+ "$PY - <<'PY'\n",
202
+ "from tools.download_models import REQUIRED_MODELS, check_model\n",
203
+ "missing = [name for name in REQUIRED_MODELS if not check_model(name)]\n",
204
+ "if missing:\n",
205
+ " raise RuntimeError('Missing required models: ' + ', '.join(missing))\n",
206
+ "print('Required models are present:', ', '.join(REQUIRED_MODELS))\n",
207
+ "PY"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "markdown",
212
+ "metadata": {},
213
+ "source": [
214
+ "## 6. 可选:查看或下载声线模型\n",
215
+ "\n",
216
+ "处理链路模型已经在前面准备好。这里的 `.pth` 声线模型是 RVC 目标音色,不属于分离 / F0 / HuBERT 等处理模型。可以先下载一个示例模型,也可以跳过后在 Web UI 中下载。"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": null,
222
+ "metadata": {},
223
+ "outputs": [],
224
+ "source": [
225
+ "%%bash\n",
226
+ "set -euo pipefail\n",
227
+ "cd /content/AI-RVC\n",
228
+ "PY=/content/AI-RVC/venv310/bin/python\n",
229
+ "$PY - <<'PY'\n",
230
+ "from tools.character_models import list_available_characters, list_available_series\n",
231
+ "print('Available series:')\n",
232
+ "for series in list_available_series():\n",
233
+ " print(' -', series)\n",
234
+ "chars = list_available_characters()\n",
235
+ "print('\\nFirst 20 voice models:')\n",
236
+ "for char in chars[:20]:\n",
237
+ " print(f\" - {char['name']}: {char.get('display', char.get('description', ''))}\")\n",
238
+ "print('\\nTotal:', len(chars))\n",
239
+ "PY"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "execution_count": null,
245
+ "metadata": {},
246
+ "outputs": [],
247
+ "source": [
248
+ "# Optional: download one example voice model.\n",
249
+ "# Change character_name as needed, then run this cell.\n",
250
+ "\n",
251
+ "# %%bash\n",
252
+ "# set -euo pipefail\n",
253
+ "# cd /content/AI-RVC\n",
254
+ "# PY=/content/AI-RVC/venv310/bin/python\n",
255
+ "# $PY - <<'PY'\n",
256
+ "# from tools.character_models import download_character_model\n",
257
+ "# character_name = 'rin'\n",
258
+ "# if not download_character_model(character_name):\n",
259
+ "# raise RuntimeError(f'Failed to download voice model: {character_name}')\n",
260
+ "# print(f'Downloaded voice model: {character_name}')\n",
261
+ "# PY"
262
+ ]
263
+ },
264
+ {
265
+ "cell_type": "markdown",
266
+ "metadata": {},
267
+ "source": [
268
+ "## 7. 全模式处理矩阵验证\n",
269
+ "\n",
270
+ "上传一段含有人声的短音乐文件到 `/content/input_audio.wav`;如果使用其他路径,请同时修改下方的 `INPUT_AUDIO` 和矩阵运行单元格中的 `--input` 参数。此步骤会下载示例声线模型 `rin`,然后依次验证 RoFormer、Karaoke、Demucs、UVR5、官方 1:1、官方唱歌修复等处理模式。任何模式失败都会停止并在 JSON 里保留 traceback。\n"
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": null,
276
+ "metadata": {},
277
+ "outputs": [],
278
+ "source": [
279
+ "# Upload a short vocal music file, then set INPUT_AUDIO to its path.\n",
280
+ "# Example after upload: INPUT_AUDIO = '/content/input_audio.wav'\n",
281
+ "INPUT_AUDIO = '/content/input_audio.wav'\n",
282
+ "\n",
283
+ "from pathlib import Path\n",
284
+ "if not Path(INPUT_AUDIO).exists():\n",
285
+ " raise FileNotFoundError(\n",
286
+ " f'Missing test audio: {INPUT_AUDIO}. Upload a short vocal music file to this path before running the matrix.'\n",
287
+ " )\n",
288
+ "print('Matrix input:', INPUT_AUDIO, Path(INPUT_AUDIO).stat().st_size, 'bytes')\n"
289
+ ]
290
+ },
291
+ {
292
+ "cell_type": "code",
293
+ "execution_count": null,
294
+ "metadata": {},
295
+ "outputs": [],
296
+ "source": [
297
+ "%%bash\n",
298
+ "set -euo pipefail\n",
299
+ "cd /content/AI-RVC\n",
300
+ "PY=/content/AI-RVC/venv310/bin/python\n",
301
+ "$PY - <<'PY'\n",
302
+ "from tools.character_models import download_character_model\n",
303
+ "if not download_character_model('rin'):\n",
304
+ " raise RuntimeError('Failed to download voice model: rin')\n",
305
+ "print('Voice model ready: rin')\n",
306
+ "PY\n"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": null,
312
+ "metadata": {},
313
+ "outputs": [],
314
+ "source": [
315
+ "%%bash\n",
316
+ "set -euo pipefail\n",
317
+ "cd /content/AI-RVC\n",
318
+ "PY=/content/AI-RVC/venv310/bin/python\n",
319
+ "$PY tools/run_mode_matrix.py \\\n",
320
+ " --input /content/input_audio.wav \\\n",
321
+ " --output-dir /content/AI-RVC/outputs/mode_matrix_colab \\\n",
322
+ " --require-cuda\n"
323
+ ]
324
+ },
325
+ {
326
+ "cell_type": "markdown",
327
+ "metadata": {},
328
+ "source": [
329
+ "## 8. 启动 Web UI\n",
330
+ "\n",
331
+ "运行后 Colab 会输出 Gradio 公共链接。打开链接即可使用 AI-RVC Web UI。此单元格会再次检查环境和必需模型。"
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "code",
336
+ "execution_count": null,
337
+ "metadata": {},
338
+ "outputs": [],
339
+ "source": [
340
+ "%%bash\n",
341
+ "set -euo pipefail\n",
342
+ "cd /content/AI-RVC\n",
343
+ "PY=/content/AI-RVC/venv310/bin/python\n",
344
+ "$PY run.py --host 0.0.0.0 --port 7860 --share"
345
+ ]
346
+ },
347
+ {
348
+ "cell_type": "markdown",
349
+ "metadata": {},
350
+ "source": [
351
+ "## 常见问题\n",
352
+ "\n",
353
+ "| 问题 | 处理方式 |\n",
354
+ "|------|----------|\n",
355
+ "| `nvidia-smi` 失败 | 在 Colab 菜单选择 GPU 运行时后重新运行 |\n",
356
+ "| `fairseq` 安装失败 | 确认本 notebook 的第 3 步使用了 Python 3.10 环境 |\n",
357
+ "| `CUDA out of memory` | 换短音频,或在 UI 中降低处理负载 / 切换分离器 |\n",
358
+ "| 模型下载失败 | 重新运行第 5 步;如果 Hugging Face 限流,可稍后再试或配置 token |\n",
359
+ "\n",
360
+ "本项目仅供学习研究和个人娱乐用途,请勿用于商业用途、欺诈、冒充他人或侵犯权益的场景。"
361
+ ]
362
+ }
363
+ ],
364
+ "metadata": {
365
+ "accelerator": "GPU",
366
+ "colab": {
367
+ "gpuType": "T4",
368
+ "provenance": []
369
+ },
370
+ "kernelspec": {
371
+ "display_name": "Python 3",
372
+ "name": "python3"
373
+ },
374
+ "language_info": {
375
+ "name": "python"
376
+ }
377
+ },
378
+ "nbformat": 4,
379
+ "nbformat_minor": 0
380
+ }
CLAUDE.md ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ RVC v2 voice conversion & AI cover system. Users upload a song, the pipeline separates vocals from accompaniment (Mel-Band Roformer), converts the vocal timbre via RVC v2 (HuBERT + RMVPE + FAISS), then mixes the result back with the accompaniment.
8
+
9
+ **Platform Support**: Windows / Linux / WSL2 / Google Colab
10
+
11
+ **Key Features**:
12
+ - AI song covers with automatic vocal separation and mixing
13
+ - 117 downloadable character models
14
+ - 4 mixing presets (universal, vocal-focused, accompaniment-focused, live)
15
+ - Karaoke mode (lead/backing vocal separation)
16
+ - 4 VC preprocessing modes (auto, direct, uvr_deecho, legacy)
17
+ - Dual VC pipeline (current implementation vs official RVC)
18
+ - Multi-backend GPU support (CUDA, ROCm, XPU, DirectML, MPS)
19
+
20
+ ## Commands
21
+
22
+ ```powershell
23
+ # Activate venv (Windows)
24
+ .\venv310\Scripts\Activate.ps1
25
+
26
+ # Activate venv (Linux/WSL2)
27
+ source venv310/bin/activate
28
+
29
+ # Install dependencies
30
+ python install.py # full install + launch
31
+ python install.py --check # check only
32
+ python install.py --cpu # CPU variant
33
+
34
+ # Run
35
+ python run.py # default: http://127.0.0.1:7860
36
+ python run.py --skip-check # skip env/model validation
37
+ python run.py --host 0.0.0.0 --port 8080 --share
38
+
39
+ # Download base models (HuBERT, RMVPE)
40
+ python tools/download_models.py
41
+
42
+ # Download character models
43
+ python -c "from tools.character_models import download_character_model; download_character_model('rin')"
44
+
45
+ # Quick CUDA check
46
+ python -c "import torch; print(torch.cuda.is_available())"
47
+
48
+ # Colab
49
+ # Open AI_RVC_Colab.ipynb in Google Colab, set runtime to GPU (T4), run cells sequentially
50
+ ```
51
+
52
+ ## Architecture
53
+
54
+ **Entry:** `run.py` → env check → model check → `ui/app.py:launch()`
55
+
56
+ **Pipeline flow** (`infer/cover_pipeline.py:CoverPipeline.process`):
57
+ 1. Vocal separation (`infer/separator.py`) — Roformer (default), Demucs, or UVR5
58
+ 2. RVC voice conversion (`infer/pipeline.py`) — HuBERT features → RMVPE F0 → RVC v2 inference with FAISS retrieval
59
+ 3. Mixing (`lib/mixer.py`) — volume adjust + reverb via pedalboard
60
+
61
+ **Character model system** (`tools/character_models.py`):
62
+ - 117 downloadable character models from HuggingFace (`trioskosmos/rvc_models`)
63
+ - Stored in `assets/weights/characters/`
64
+ - Version notes (epochs, sample rate) extracted from .pth metadata and cached in `_version_notes.json`
65
+ - Display name assembly: `_get_display_name()` appends `(500 epochs·40k)` style training info
66
+
67
+ **UI** (`ui/app.py`):
68
+ - Gradio 3.50.2, single-file ~2000 lines
69
+ - i18n via `i18n/zh_CN.json` and `i18n/en_US.json`, accessed through `t(key, section)` helper
70
+ - Three main tabs: song cover (full pipeline), model management, settings
71
+ - Cover tab features:
72
+ - Character model download/management with series filtering and keyword search
73
+ - 4 mixing presets (universal, vocal-focused, accompaniment-focused, live)
74
+ - Karaoke separation (lead/backing vocals)
75
+ - 4 VC preprocessing modes (auto, direct, uvr_deecho, legacy)
76
+ - Source constraint control (auto/off/on)
77
+ - Dual VC pipeline mode (current/official)
78
+ - Singing repair (official mode only)
79
+ - Real-time VC route status display
80
+ - Model management tab:
81
+ - Base model download (HuBERT, RMVPE)
82
+ - Mature DeEcho model download
83
+ - Model list table with refresh
84
+ - Settings tab:
85
+ - Device info display
86
+ - Backend selection (CUDA/ROCm/XPU/DirectML/MPS/CPU)
87
+ - Config save
88
+
89
+ **Config:** `configs/config.json` — device, F0 method, index rate, cover separator settings, path mappings
90
+
91
+ ## Key Conventions
92
+
93
+ - Python 3.10, UTF-8, 4-space indent
94
+ - `snake_case` functions/variables, `PascalCase` classes, `UPPER_SNAKE_CASE` constants
95
+ - User-facing text is bilingual Chinese/English
96
+ - Commit messages: short imperative subjects, Chinese/English mixed (e.g. `infer: fix CUDA OOM`)
97
+ - No automated test suite; verify changes by running one voice conversion + one cover through the UI
98
+ - `_official_rvc/` is vendored upstream reference — don't modify unless syncing
99
+
100
+ ## Important Paths
101
+
102
+ - `configs/config.json` — all runtime settings
103
+ - `infer/cover_pipeline.py` — orchestrates the full cover workflow
104
+ - `infer/pipeline.py` — RVC v2 inference core
105
+ - `infer/separator.py` — Roformer/Demucs/UVR5 vocal separation wrappers
106
+ - `tools/character_models.py` — character model registry (117 entries) + download logic
107
+ - `tools/download_models.py` — base model (HuBERT/RMVPE) + mature DeEcho downloader
108
+ - `lib/mixer.py` — audio mixing with volume/reverb
109
+ - `ui/app.py` — entire Gradio UI (~2000 lines)
110
+ - `mcp/server.py` + `mcp/tools.py` — MCP server integration for Claude Code
111
+ - `AI_RVC_Colab.ipynb` — Google Colab notebook with full feature parity
112
+ - `install.py` — cross-platform installation script (Windows/Linux)
113
+
114
+ ## Things to Watch
115
+
116
+ - `fairseq` is pinned to `0.12.2` — HuBERT loading breaks on other versions
117
+ - `audio-separator` must be installed with `[gpu]` extra for CUDA support
118
+ - Roformer model auto-downloads on first use to `assets/separator_models/`
119
+ - Gradio is pinned to `3.50.2`; the UI code uses v3 API patterns (not v4)
120
+ - Model weights (.pt, .pth) and audio files are gitignored — never commit them
121
+ - Path handling uses `pathlib.Path` for cross-platform compatibility (Windows/Linux)
122
+ - Virtual environment activation differs by platform: `Scripts/Activate.ps1` (Windows) vs `bin/activate` (Linux)
123
+ - `install.py` has hardcoded Windows Python paths in `PYTHON310_CANDIDATES` but falls back to `py -3.10` launcher
124
+ - Platform detection uses `os.name == "nt"` for Windows-specific logic (venv paths, etc.)
125
+ - All core functionality is platform-agnostic; audio libraries work better on Linux
126
+ - Colab notebook (`AI_RVC_Colab.ipynb`) provides full feature parity with Web UI
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README_HF.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: AI-RVC 语音转换 & AI 翻唱
3
  emoji: 🎤
4
  colorFrom: blue
5
  colorTo: purple
@@ -10,15 +10,15 @@ pinned: false
10
  license: mit
11
  ---
12
 
13
- # 🎤 AI-RVC 语音转换 & AI 翻唱
14
 
15
- 基于 RVC v2 + RMVPE 高质量语音转换系统,支持一键 AI 翻唱功能
16
 
17
  ## 功能特点
18
 
19
  - **AI 歌曲翻唱**:上传歌曲自动分离人声、转换音色、混合伴奏,一键生成翻唱
20
  - **人声分离**:默认 Mel-Band Roformer (KimberleyJensen),在 MVSEP 公开 Multisong 指标中为 Vocals SDR 11.01 / Instrum SDR 17.32
21
- - **音转换**:RVC v2 架构 + FAISS 检索增强流程
22
  - **RMVPE 音高提取**:高精度 F0 提取,噪声鲁棒性强
23
  - **角色模型**:内置 117 个可下载角色模型
24
  - **混音效果**:支持人声混响、音量调节、4 种混音预设
@@ -88,7 +88,7 @@ license: mit
88
 
89
  人声分离 (Mel-Band Roformer)
90
 
91
- RVC 音转换 (HuBERT + RMVPE + FAISS)
92
 
93
  混音 (音量调节 + 混响)
94
 
@@ -148,4 +148,4 @@ A: 建议选择与原唱性别、音色相近的角色,效果更自然。
148
 
149
  **License**: MIT
150
  **Version**: 2.0
151
- **Last Updated**: 2026-03-10
 
1
  ---
2
+ title: AI-RVC 一键 AI 翻唱
3
  emoji: 🎤
4
  colorFrom: blue
5
  colorTo: purple
 
10
  license: mit
11
  ---
12
 
13
+ # 🎤 AI-RVC 一键 AI 翻唱
14
 
15
+ 基于 RVC v2 的一键 AI 翻唱系统,自动完成人声分离、音色转换、混音合成全流程
16
 
17
  ## 功能特点
18
 
19
  - **AI 歌曲翻唱**:上传歌曲自动分离人声、转换音色、混合伴奏,一键生成翻唱
20
  - **人声分离**:默认 Mel-Band Roformer (KimberleyJensen),在 MVSEP 公开 Multisong 指标中为 Vocals SDR 11.01 / Instrum SDR 17.32
21
+ - **音色转换**:RVC v2 架构 + FAISS 检索增强流程
22
  - **RMVPE 音高提取**:高精度 F0 提取,噪声鲁棒性强
23
  - **角色模型**:内置 117 个可下载角色模型
24
  - **混音效果**:支持人声混响、音量调节、4 种混音预设
 
88
 
89
  人声分离 (Mel-Band Roformer)
90
 
91
+ RVC 音色转换 (HuBERT + RMVPE + FAISS)
92
 
93
  混音 (音量调节 + 混响)
94
 
 
148
 
149
  **License**: MIT
150
  **Version**: 2.0
151
+ **Last Updated**: 2026-03-15
app.py CHANGED
@@ -7,19 +7,26 @@ import os
7
  import sys
8
  from pathlib import Path
9
 
 
10
  ROOT_DIR = Path(__file__).parent
11
  sys.path.insert(0, str(ROOT_DIR))
12
 
 
 
 
 
 
13
  os.environ["GRADIO_SERVER_NAME"] = "0.0.0.0"
14
  os.environ["GRADIO_SERVER_PORT"] = "7860"
15
 
16
- from ui.app import create_ui
 
 
17
 
18
- app = create_ui()
19
- app.queue()
20
- app.launch(
21
- server_name="0.0.0.0",
22
- server_port=7860,
23
- share=False,
24
- inbrowser=False,
25
- )
 
7
  import sys
8
  from pathlib import Path
9
 
10
+ # 添加项目根目录到路径
11
  ROOT_DIR = Path(__file__).parent
12
  sys.path.insert(0, str(ROOT_DIR))
13
 
14
+ from lib.ffmpeg_runtime import configure_ffmpeg_runtime
15
+
16
+ configure_ffmpeg_runtime()
17
+
18
+ # 设置环境变量
19
  os.environ["GRADIO_SERVER_NAME"] = "0.0.0.0"
20
  os.environ["GRADIO_SERVER_PORT"] = "7860"
21
 
22
+ # 导入并启动应用
23
+ if __name__ == "__main__":
24
+ from ui.app import launch
25
 
26
+ # 启动 Gradio 界面
27
+ launch(
28
+ server_name="0.0.0.0",
29
+ server_port=7860,
30
+ share=False,
31
+ inbrowser=False
32
+ )
 
assets/weights/characters/_version_notes.json ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Aimi": "200 epochs · 48k",
3
+ "hanayo": "250 epochs · 40k",
4
+ "rin": "250 epochs · 40k",
5
+ "umi": "250 epochs · 40k",
6
+ "nozomi": "250 epochs · 48k",
7
+ "dia": "250 epochs · 40k",
8
+ "ayumu": "250 epochs · 40k",
9
+ "chika": "500 epochs · 40k",
10
+ "riko": "500 epochs · 40k",
11
+ "mari": "400 epochs · RVC v2",
12
+ "yohane": "500 epochs · 48k",
13
+ "nico": "300 epochs · 40k",
14
+ "kanata": "250 epochs · 40k",
15
+ "setsuna_yuki_nana": "300 epochs · 48k",
16
+ "hanamaru_zurakichi": "195 epochs · 40k",
17
+ "kanan": null,
18
+ "yu_takasaki": null,
19
+ "shizuku_osaka": null,
20
+ "sarah_kazuno": "500 epochs · 40k",
21
+ "leah_kazuno": "500 epochs · 40k",
22
+ "eli": "350 epochs · 40k",
23
+ "you": "400 epochs · 40k",
24
+ "honoka": "1000 epochs · 48k",
25
+ "kotori": "309 epochs · 48k",
26
+ "maki": "135 epochs · 40k",
27
+ "ruby": "207 epochs · 48k",
28
+ "kasumi": "414 epochs · 48k",
29
+ "karin": "1000 epochs · 48k",
30
+ "rina": "248 epochs · 48k",
31
+ "lanzhu": "248 epochs · 48k",
32
+ "keke": "151 epochs · 48k",
33
+ "ai_miyashita": "400 epochs · 32k",
34
+ "emma_verde": "450 epochs · 32k",
35
+ "shioriko_mifune": "500 epochs · 32k",
36
+ "chisato_arashi": "450 epochs · 32k",
37
+ "ren_hazuki": "400 epochs · 32k",
38
+ "sumire_heanna": "400 epochs · 32k",
39
+ "kinako_sakurakoji": "400 epochs · 32k",
40
+ "mei_yoneme": "400 epochs · 32k",
41
+ "shiki_wakana": "400 epochs · 32k",
42
+ "natsumi_onitsuka": "450 epochs · 32k",
43
+ "sayaka_murano": "400 epochs · 32k",
44
+ "tsuzuri_yugiri": "400 epochs · 32k",
45
+ "hanamaru": "410 epochs · 40k",
46
+ "setsuna_yuki_og": "910 epochs · 40k",
47
+ "setsuna_yuki_coco": "555 epochs · 48k",
48
+ "hanayo_pre_anime_v0": "222 epochs · 40k",
49
+ "hanayo_pre_anime_v1": "286 epochs · 48k",
50
+ "kotori_pre_anime": "216 epochs · 48k",
51
+ "kotori_pre_anime_v2": "401 epochs · 40k",
52
+ "wien_margarete": "453 epochs · 40k",
53
+ "mia_taylor": "285 epochs · 40k",
54
+ "anju_yuki_arise": "446 epochs · 40k",
55
+ "erena_todo_arise": "480 epochs · 40k",
56
+ "tsubasa_kira_arise": "729 epochs · 40k",
57
+ "kanon": "150 epochs · 40k",
58
+ "setsuna": "504 epochs · 48k",
59
+ "setsuna_v2": "302 epochs · 48k",
60
+ "chika_v2": "453 epochs · 48k",
61
+ "dia_v2": "250 epochs · 40k",
62
+ "hanamaru_v2": "229 epochs · 48k",
63
+ "honoka_v2": "214 epochs · 48k",
64
+ "ayumu_v2": "214 epochs · 48k",
65
+ "ayumu_v3": "250 epochs · 40k",
66
+ "ayumu_v4": "250 epochs · 40k",
67
+ "karin_v2": "301 epochs · 48k",
68
+ "keke_v2": "102 epochs · 48k",
69
+ "riko_v2": "346 epochs · 48k",
70
+ "umi_v2": "250 epochs · 40k",
71
+ "wien_v2": "257 epochs · 48k",
72
+ "yohane_trios": "155 epochs · 48k",
73
+ "yoshiko_trios": "20 epochs · 48k",
74
+ "yoshiko_v2": "211 epochs · 48k",
75
+ "tokai_teio": "570 epochs · RVC v2",
76
+ "focalors": "940 epochs · 48k",
77
+ "essex": "925 epochs · 40k",
78
+ "ellie": "40k",
79
+ "furina": "170 epochs · 40k",
80
+ "ayaka": "300 epochs · 40k",
81
+ "takane_shijou": "565 epochs · 40k",
82
+ "kobo": "500 epochs · 48k",
83
+ "kaela": "400 epochs · 48k",
84
+ "pekora": "600 epochs · 48k",
85
+ "kizuna_ai": "300 epochs · 40k",
86
+ "fuwawa": "250 epochs · 48k",
87
+ "mococo": "250 epochs · 48k",
88
+ "vo_furina_kr": "300 epochs · 48k",
89
+ "silverwolf_kr": "300 epochs · 48k",
90
+ "sparkle_e40": "40 epochs · 40k",
91
+ "ranko": "250 epochs · 40k",
92
+ "yumemiriamu": "250 epochs · 40k",
93
+ "hatsune_miku": null,
94
+ "nahida": null,
95
+ "nilou": null,
96
+ "herta_hsr": null,
97
+ "the_herta": null,
98
+ "firefly_hsr": null,
99
+ "tingyun_hsr": null,
100
+ "yunli_hsr": null,
101
+ "tribbie_hsr": null,
102
+ "huohuo_hsr": null,
103
+ "castorice_hsr": null,
104
+ "seele_hi3": null,
105
+ "herrscher_ego_hi3": null,
106
+ "miyabi_zzz": null,
107
+ "nene_kusanagi": null,
108
+ "miko_sakura": null,
109
+ "subaru_oozora": null,
110
+ "moona_hoshinova": null,
111
+ "risu_ayunda": null,
112
+ "reine_pavolia": null,
113
+ "zeta_vestia": null,
114
+ "anya_melfissa": null,
115
+ "luna_himemori": null,
116
+ "boothill": null,
117
+ "shiroko_rosmontis": null,
118
+ "rice_shower": null,
119
+ "suwawa": "211 epochs · 40k"
120
+ }
assets/weights/characters/mari/metadata.json ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "title": "Mari Ohara (Love Live! Sunshine!!) - RVC V2 (400 Epoch)",
3
+ "author": {
4
+ "name": "natehitsvtec",
5
+ "discordUserId": "641787156863385610"
6
+ },
7
+ "md5": "ff4bd8e126ab82a8145793f02db0cfee",
8
+ "uploadedAt": "2023-06-11T01:22:26.820Z",
9
+ "weightsLink": "https://www.weights.gg/models/clm72zu941aj1cctco2oj26p1",
10
+ "id": "clm72zu941aj1cctco2oj26p1",
11
+ "type": "v2",
12
+ "tags": [
13
+ "Anime Character",
14
+ "Fictional Character",
15
+ "Artist",
16
+ "RVC v2"
17
+ ],
18
+ "description": "400 Epoch\nhttps://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view?usp=sharing\n\nTested languages: Japanese, Korean, English\n\nNew version trained up to 400 epoch, revised the dataset to 25 songs down from 33, but made it much clearer and removed excessive silence and other parts with echo or reverb. I did not use SIFAS audio for my dataset because it ended up being too high pitched, I believe she speaks with a lot of exaggeration in the game. Unsure if I will train it more but I probably will eventually.\n\nPrevious 200 Epoch (old dataset)\nhttps://drive.google.com/file/d/1uBmoKluknMOWKivjfHcbu7YkOMtjoclH/view?usp=sharing",
19
+ "samples": [],
20
+ "files": [
21
+ {
22
+ "name": "model.index",
23
+ "size": 812116059,
24
+ "md5": "be135619c9fc2e04d0d9bf384f9eef11"
25
+ },
26
+ {
27
+ "name": "model.pth",
28
+ "size": 55227869,
29
+ "md5": "ff4bd8e126ab82a8145793f02db0cfee"
30
+ }
31
+ ],
32
+ "torchMetadata": {
33
+ "config": {
34
+ "spec_channels": 1025,
35
+ "segment_size": 32,
36
+ "inter_channels": 192,
37
+ "hidden_channels": 192,
38
+ "filter_channels": 768,
39
+ "n_heads": 2,
40
+ "n_layers": 6,
41
+ "kernel_size": 3,
42
+ "p_dropout": 0,
43
+ "resblock": "1",
44
+ "resblock_kernel_sizes": [
45
+ 3,
46
+ 7,
47
+ 11
48
+ ],
49
+ "resblock_dilation_sizes": [
50
+ [
51
+ 1,
52
+ 3,
53
+ 5
54
+ ],
55
+ [
56
+ 1,
57
+ 3,
58
+ 5
59
+ ],
60
+ [
61
+ 1,
62
+ 3,
63
+ 5
64
+ ]
65
+ ],
66
+ "upsample_rates": [
67
+ 10,
68
+ 10,
69
+ 2,
70
+ 2
71
+ ],
72
+ "upsample_initial_channel": 512,
73
+ "upsample_kernel_sizes": [
74
+ 16,
75
+ 16,
76
+ 4,
77
+ 4
78
+ ],
79
+ "emb_channels": null,
80
+ "spk_embed_dim": 109,
81
+ "gin_channels": 256,
82
+ "sr": 40000
83
+ },
84
+ "f0": 1,
85
+ "version": "v2",
86
+ "extra_info": {
87
+ "config": [
88
+ 1025,
89
+ 32,
90
+ 192,
91
+ 192,
92
+ 768,
93
+ 2,
94
+ 6,
95
+ 3,
96
+ 0,
97
+ "1",
98
+ [
99
+ 3,
100
+ 7,
101
+ 11
102
+ ],
103
+ [
104
+ [
105
+ 1,
106
+ 3,
107
+ 5
108
+ ],
109
+ [
110
+ 1,
111
+ 3,
112
+ 5
113
+ ],
114
+ [
115
+ 1,
116
+ 3,
117
+ 5
118
+ ]
119
+ ],
120
+ [
121
+ 10,
122
+ 10,
123
+ 2,
124
+ 2
125
+ ],
126
+ 512,
127
+ [
128
+ 16,
129
+ 16,
130
+ 4,
131
+ 4
132
+ ],
133
+ 109,
134
+ 256,
135
+ 40000
136
+ ],
137
+ "info": "400epoch",
138
+ "sr": "40k",
139
+ "f0": 1,
140
+ "version": "v2"
141
+ }
142
+ },
143
+ "url": "https://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view",
144
+ "urls": [
145
+ "https://huggingface.co/realmongx2/m0delz/resolve/main/mariohara_e400.zip?download=true",
146
+ "https://drive.google.com/file/d/1x5aZjlYea_HfaHNHnZCpqaKmU4N5J-1X/view?usp=sharing",
147
+ "https://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view?usp=sharing",
148
+ "https://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view?usp=drive_link",
149
+ "https://drive.google.com/file/d/1rJ3a4NJ-3MQ8Vv240-hhaP2bO1irbA9h/view?usp=sharing",
150
+ "https://drive.google.com/file/d/1rJ3a4NJ-3MQ8Vv240-hhaP2bO1irbA9h/view?usp=drive_link",
151
+ "https://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view?usp=drivesdk",
152
+ "https://drive.google.com/file/d/1Pklo3gYLRM8_wG0XZvwf_lJJnsDgZhry/view?usp=drive_link",
153
+ "https://drive.google.com/file/d/1tKgz422yovSPvuKHDuSETNYRQNq2wIoT/view?usp=sharing",
154
+ "https://drive.google.com/file/d/1AVYkFKf3SLkUhQqiJYWiqluc7KAnR2cC/view?usp=sharing",
155
+ "https://drive.google.com/file/d/1AVYkFKf3SLkUhQqiJYWiqluc7KAnR2cC/view?usp=drive_link",
156
+ "https://drive.google.com/file/d/1YLJ2GXYvbGQ2mCyE5KjF2z_6EAI8gE2E/view?usp=sharing",
157
+ "https://drive.google.com/file/d/1uBmoKluknMOWKivjfHcbu7YkOMtjoclH/view?usp=sharing",
158
+ "https://drive.google.com/file/d/1YLJ2GXYvbGQ2mCyE5KjF2z_6EAI8gE2E/view?usp=drive_link",
159
+ "https://drive.google.com/file/d/1hRcWKrDz1gEbEbmSjyVzthsJG0dIbJi8/view?usp=sharing",
160
+ "https://drive.google.com/file/d/1hRcWKrDz1gEbEbmSjyVzthsJG0dIbJi8/view?usp=drive_link",
161
+ "https://drive.google.com/file/d/1ou8SVFKN0ifqljKgufOAqZdjq7qvSk2e/view?usp=drive_link",
162
+ "https://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view",
163
+ "https://drive.google.com/file/d/1uBmoKluknMOWKivjfHcbu7YkOMtjoclH/view",
164
+ "https://drive.google.com/file/d/1YLJ2GXYvbGQ2mCyE5KjF2z_6EAI8gE2E/view",
165
+ "https://drive.google.com/file/d/1hRcWKrDz1gEbEbmSjyVzthsJG0dIbJi8/view",
166
+ "https://drive.google.com/file/d/1ou8SVFKN0ifqljKgufOAqZdjq7qvSk2e/view",
167
+ "https://drive.google.com/uc?id=1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk",
168
+ "https://drive.google.com/file/d/1rJ3a4NJ-3MQ8Vv240-hhaP2bO1irbA9h/view",
169
+ "https://drive.google.com/file/d/1Pklo3gYLRM8_wG0XZvwf_lJJnsDgZhry/view",
170
+ "https://drive.google.com/file/d/1tKgz422yovSPvuKHDuSETNYRQNq2wIoT/view",
171
+ "https://drive.google.com/file/d/1AVYkFKf3SLkUhQqiJYWiqluc7KAnR2cC/view"
172
+ ],
173
+ "originalFileList": [
174
+ "mariohara_e400/added_IVF6591_Flat_nprobe_1_v2.index",
175
+ "mariohara_e400/mariohara_v3_e400.pth"
176
+ ]
177
+ }
assets/weights/characters/tokai_teio/metadata.json ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "title": "Tokai Teio (Uma Musume) (RVC V2) (570 epochs)",
3
+ "author": {
4
+ "name": "realhijack",
5
+ "discordUserId": "903192512234127360"
6
+ },
7
+ "md5": "27ac7aee0860b447ed98553b84de43a7",
8
+ "uploadedAt": "2023-07-21T11:01:53.209Z",
9
+ "weightsLink": "https://www.weights.gg/models/clm72thvr0rf8cctcy1jn4no4",
10
+ "id": "clm72thvr0rf8cctcy1jn4no4",
11
+ "type": "v2",
12
+ "tags": [
13
+ "Anime Character",
14
+ "Fictional Character",
15
+ "RVC v2"
16
+ ],
17
+ "description": "https://huggingface.co/SANSSWEEP/rvc/resolve/main/TOKAITEIO.zip\nMADE BY <@299907871640911872> !!!\ncomm me",
18
+ "samples": [],
19
+ "files": [
20
+ {
21
+ "name": "model.index",
22
+ "size": 557899019,
23
+ "md5": "6bc29a4dc19d5f44cec8ee4cfa59f6b7"
24
+ },
25
+ {
26
+ "name": "model.pth",
27
+ "size": 57581999,
28
+ "md5": "27ac7aee0860b447ed98553b84de43a7"
29
+ }
30
+ ],
31
+ "torchMetadata": {
32
+ "config": {
33
+ "spec_channels": 1025,
34
+ "segment_size": 32,
35
+ "inter_channels": 192,
36
+ "hidden_channels": 192,
37
+ "filter_channels": 768,
38
+ "n_heads": 2,
39
+ "n_layers": 6,
40
+ "kernel_size": 3,
41
+ "p_dropout": 0,
42
+ "resblock": "1",
43
+ "resblock_kernel_sizes": [
44
+ 3,
45
+ 7,
46
+ 11
47
+ ],
48
+ "resblock_dilation_sizes": [
49
+ [
50
+ 1,
51
+ 3,
52
+ 5
53
+ ],
54
+ [
55
+ 1,
56
+ 3,
57
+ 5
58
+ ],
59
+ [
60
+ 1,
61
+ 3,
62
+ 5
63
+ ]
64
+ ],
65
+ "upsample_rates": [
66
+ 12,
67
+ 10,
68
+ 2,
69
+ 2
70
+ ],
71
+ "upsample_initial_channel": 512,
72
+ "upsample_kernel_sizes": [
73
+ 24,
74
+ 20,
75
+ 4,
76
+ 4
77
+ ],
78
+ "emb_channels": null,
79
+ "spk_embed_dim": 109,
80
+ "gin_channels": 256,
81
+ "sr": 48000
82
+ },
83
+ "f0": 1,
84
+ "version": "v2",
85
+ "extra_info": {
86
+ "config": [
87
+ 1025,
88
+ 32,
89
+ 192,
90
+ 192,
91
+ 768,
92
+ 2,
93
+ 6,
94
+ 3,
95
+ 0,
96
+ "1",
97
+ [
98
+ 3,
99
+ 7,
100
+ 11
101
+ ],
102
+ [
103
+ [
104
+ 1,
105
+ 3,
106
+ 5
107
+ ],
108
+ [
109
+ 1,
110
+ 3,
111
+ 5
112
+ ],
113
+ [
114
+ 1,
115
+ 3,
116
+ 5
117
+ ]
118
+ ],
119
+ [
120
+ 12,
121
+ 10,
122
+ 2,
123
+ 2
124
+ ],
125
+ 512,
126
+ [
127
+ 24,
128
+ 20,
129
+ 4,
130
+ 4
131
+ ],
132
+ 109,
133
+ 256,
134
+ 48000
135
+ ],
136
+ "info": "570epoch",
137
+ "sr": "48k",
138
+ "f0": 1,
139
+ "version": "v2"
140
+ }
141
+ },
142
+ "url": "https://huggingface.co/SANSSWEEP/rvc/resolve/main/TOKAITEIO.zip",
143
+ "originalFileList": [
144
+ "New folder (3)/TokaiTeio_e570_s37620.pth",
145
+ "New folder (3)/added_IVF4528_Flat_nprobe_1_TokaiTeio_v2.index"
146
+ ]
147
+ }
check_deecho_config.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ 回声处理配置验证脚本
5
+
6
+ 检查当前配置是否正确启用了激进的去回声处理
7
+ """
8
+
9
+ import json
10
+ import os
11
+ from pathlib import Path
12
+
13
+ def check_deecho_models():
14
+ """检查 DeEcho 模型是否存在"""
15
+ print("=" * 60)
16
+ print("检查 DeEcho 模型")
17
+ print("=" * 60)
18
+
19
+ models_dir = Path("assets/uvr5_weights")
20
+ required_models = [
21
+ "VR-DeEchoDeReverb.pth",
22
+ "VR-DeEchoAggressive.pth",
23
+ "VR-DeEchoNormal.pth",
24
+ "onnx_dereverb_By_FoxJoy"
25
+ ]
26
+
27
+ all_found = True
28
+ for model in required_models:
29
+ model_path = models_dir / model
30
+ exists = model_path.exists()
31
+ status = "[OK]" if exists else "[NO]"
32
+ size = ""
33
+ if exists:
34
+ if model_path.is_file():
35
+ size_mb = model_path.stat().st_size / (1024 * 1024)
36
+ size = f" ({size_mb:.1f} MB)"
37
+ else:
38
+ size = " (目录)"
39
+
40
+ print(f"{status} {model}{size}")
41
+ if not exists:
42
+ all_found = False
43
+
44
+ print()
45
+ if all_found:
46
+ print("[OK] All DeEcho models ready")
47
+ else:
48
+ print("[NO] Missing models, run: python tools/download_models.py")
49
+
50
+ return all_found
51
+
52
+ def check_config():
53
+ """检查配置文件"""
54
+ print("\n" + "=" * 60)
55
+ print("检查配置文件")
56
+ print("=" * 60)
57
+
58
+ config_path = Path("configs/config.json")
59
+ with open(config_path, 'r', encoding='utf-8') as f:
60
+ config = json.load(f)
61
+
62
+ cover_config = config.get("cover", {})
63
+
64
+ # 检查关键配置
65
+ checks = [
66
+ ("VC 预处理模式", "vc_preprocess_mode", "auto", cover_config.get("vc_preprocess_mode")),
67
+ ("源约束模式", "source_constraint_mode", "auto", cover_config.get("source_constraint_mode")),
68
+ ("Karaoke 分离", "karaoke_separation", True, cover_config.get("karaoke_separation")),
69
+ ("索引率", "index_rate", 0.50, cover_config.get("index_rate")),
70
+ ("保护系数", "protect", 0.33, cover_config.get("protect")),
71
+ ]
72
+
73
+ all_correct = True
74
+ for name, key, expected, actual in checks:
75
+ match = actual == expected
76
+ status = "[OK]" if match else "[NO]"
77
+ print(f"{status} {name} ({key}): {actual} {'==' if match else '!='} {expected}")
78
+ if not match:
79
+ all_correct = False
80
+
81
+ print()
82
+ if all_correct:
83
+ print("[OK] Config optimized for aggressive deecho")
84
+ else:
85
+ print("[WARN] Some configs not optimized")
86
+
87
+ return all_correct
88
+
89
+ def print_recommendations():
90
+ """打印使用建议"""
91
+ print("\n" + "=" * 60)
92
+ print("使用建议")
93
+ print("=" * 60)
94
+
95
+ print("""
96
+ 1. 当前配置使用自动模式:
97
+ - 优先使用 UVR DeEcho 模型,缺模型时回退到算法去混响
98
+ - DeEcho 质量好时跳过 blend 直接使用,避免混回原始回音
99
+ - 源约束仅在去过回音的预处理下自动启用
100
+
101
+ 2. 如果回声仍然明显,可以尝试:
102
+ - 在 UI 中调整"索引率"(降低到 0.2-0.3)
103
+ - 在 UI 中调整"保护系数"(降低到 0.2-0.25)
104
+ - 使用更高质量的输入音频
105
+
106
+ 3. 处理流程:
107
+ 原始音频 → Karaoke 分离 → UVR DeEcho(或算法去混响) → RVC 转换 → 输出
108
+
109
+ 4. 测试建议:
110
+ - 选择一首有明显回声的歌曲
111
+ - 查看日志中 DeEcho quality 指标
112
+ - 对比处理前后的回声强度
113
+ """)
114
+
115
+ def main():
116
+ print("\n回声处理配置验证\n")
117
+
118
+ models_ok = check_deecho_models()
119
+ config_ok = check_config()
120
+ print_recommendations()
121
+
122
+ print("\n" + "=" * 60)
123
+ if models_ok and config_ok:
124
+ print("[OK] System ready for testing")
125
+ else:
126
+ print("[WARN] Please fix issues above")
127
+ print("=" * 60 + "\n")
128
+
129
+ if __name__ == "__main__":
130
+ main()
configs/config.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "device": "cuda",
3
  "sample_rate": 48000,
4
  "hop_length": 160,
@@ -29,8 +30,9 @@
29
  },
30
  "cover": {
31
  "separator": "roformer",
 
32
  "karaoke_separation": true,
33
- "karaoke_model": "mel_band_roformer_karaoke_gabox.ckpt",
34
  "karaoke_merge_backing_into_accompaniment": true,
35
  "uvr5_model": "HP2_all_vocals",
36
  "uvr5_agg": 10,
@@ -42,10 +44,10 @@
42
  "demucs_split": true,
43
  "f0_method": "hybrid",
44
  "disable_chunking": false,
45
- "index_rate": 0.70,
46
  "filter_radius": 3,
47
  "rms_mix_rate": 0.0,
48
- "protect": 0.50,
49
  "speaker_id": 0,
50
  "hubert_layer": 12,
51
  "silence_gate": false,
@@ -60,18 +62,20 @@
60
  "f0_min": 50,
61
  "f0_max": 1100,
62
  "f0_energy_threshold_db": -50,
63
- "f0_hybrid_mode": "off",
64
  "crepe_pd_threshold": 0.05,
65
- "crepe_force_ratio": 0.0,
66
  "crepe_replace_semitones": 3.0,
 
 
67
  "f0_stabilize": false,
68
  "f0_stabilize_window": 2,
69
  "f0_stabilize_max_semitones": 6.0,
70
  "f0_stabilize_octave": true,
71
  "f0_rate_limit": false,
72
  "f0_rate_limit_semitones": 12.0,
73
- "vc_preprocess_mode": "uvr_deecho",
74
- "source_constraint_mode": "on",
75
  "vc_pipeline_mode": "current",
76
  "singing_repair": false,
77
  "reverb_reapply": true,
@@ -81,10 +85,12 @@
81
  "f0_min": 50,
82
  "f0_max": 1100,
83
  "f0_energy_threshold_db": -60,
84
- "f0_hybrid_mode": "off",
85
  "crepe_pd_threshold": 0.1,
86
- "crepe_force_ratio": 0.05,
87
  "crepe_replace_semitones": 0.0,
 
 
88
  "f0_stabilize": false,
89
  "f0_stabilize_window": 2,
90
  "f0_stabilize_max_semitones": 4.0,
 
1
  {
2
+ "language": "zh_CN",
3
  "device": "cuda",
4
  "sample_rate": 48000,
5
  "hop_length": 160,
 
30
  },
31
  "cover": {
32
  "separator": "roformer",
33
+ "roformer_model": "ensemble:vocal_rvc",
34
  "karaoke_separation": true,
35
+ "karaoke_model": "ensemble:karaoke",
36
  "karaoke_merge_backing_into_accompaniment": true,
37
  "uvr5_model": "HP2_all_vocals",
38
  "uvr5_agg": 10,
 
44
  "demucs_split": true,
45
  "f0_method": "hybrid",
46
  "disable_chunking": false,
47
+ "index_rate": 0.50,
48
  "filter_radius": 3,
49
  "rms_mix_rate": 0.0,
50
+ "protect": 0.33,
51
  "speaker_id": 0,
52
  "hubert_layer": 12,
53
  "silence_gate": false,
 
62
  "f0_min": 50,
63
  "f0_max": 1100,
64
  "f0_energy_threshold_db": -50,
65
+ "f0_hybrid_mode": "fallback",
66
  "crepe_pd_threshold": 0.05,
67
+ "crepe_force_ratio": 1.0,
68
  "crepe_replace_semitones": 3.0,
69
+ "unvoiced_feature_gate_floor": 0.28,
70
+ "breath_active_margin_db": 52.0,
71
  "f0_stabilize": false,
72
  "f0_stabilize_window": 2,
73
  "f0_stabilize_max_semitones": 6.0,
74
  "f0_stabilize_octave": true,
75
  "f0_rate_limit": false,
76
  "f0_rate_limit_semitones": 12.0,
77
+ "vc_preprocess_mode": "auto",
78
+ "source_constraint_mode": "auto",
79
  "vc_pipeline_mode": "current",
80
  "singing_repair": false,
81
  "reverb_reapply": true,
 
85
  "f0_min": 50,
86
  "f0_max": 1100,
87
  "f0_energy_threshold_db": -60,
88
+ "f0_hybrid_mode": "fallback",
89
  "crepe_pd_threshold": 0.1,
90
+ "crepe_force_ratio": 1.0,
91
  "crepe_replace_semitones": 0.0,
92
+ "unvoiced_feature_gate_floor": 0.28,
93
+ "breath_active_margin_db": 52.0,
94
  "f0_stabilize": false,
95
  "f0_stabilize_window": 2,
96
  "f0_stabilize_max_semitones": 4.0,
configs/presets/balanced.json CHANGED
@@ -13,8 +13,8 @@
13
  "f0_stabilize": true,
14
  "f0_stabilize_window": 3,
15
  "f0_stabilize_max_semitones": 3.0,
16
- "vc_preprocess_mode": "uvr_deecho",
17
- "source_constraint_mode": "on",
18
  "uvr5_agg": 10
19
  }
20
  }
 
13
  "f0_stabilize": true,
14
  "f0_stabilize_window": 3,
15
  "f0_stabilize_max_semitones": 3.0,
16
+ "vc_preprocess_mode": "auto",
17
+ "source_constraint_mode": "auto",
18
  "uvr5_agg": 10
19
  }
20
  }
configs/presets/clarity_priority.json CHANGED
@@ -13,8 +13,8 @@
13
  "f0_stabilize": false,
14
  "f0_stabilize_window": 2,
15
  "f0_stabilize_max_semitones": 2.0,
16
- "vc_preprocess_mode": "uvr_deecho",
17
- "source_constraint_mode": "on",
18
  "uvr5_agg": 8
19
  }
20
  }
 
13
  "f0_stabilize": false,
14
  "f0_stabilize_window": 2,
15
  "f0_stabilize_max_semitones": 2.0,
16
+ "vc_preprocess_mode": "auto",
17
+ "source_constraint_mode": "auto",
18
  "uvr5_agg": 8
19
  }
20
  }
configs/presets/timbre_priority.json CHANGED
@@ -13,8 +13,8 @@
13
  "f0_stabilize": true,
14
  "f0_stabilize_window": 5,
15
  "f0_stabilize_max_semitones": 4.0,
16
- "vc_preprocess_mode": "uvr_deecho",
17
- "source_constraint_mode": "on",
18
  "uvr5_agg": 12
19
  }
20
  }
 
13
  "f0_stabilize": true,
14
  "f0_stabilize_window": 5,
15
  "f0_stabilize_max_semitones": 4.0,
16
+ "vc_preprocess_mode": "auto",
17
+ "source_constraint_mode": "auto",
18
  "uvr5_agg": 12
19
  }
20
  }
docs/Colab演示.png ADDED

Git LFS Details

  • SHA256: afed65f6707b2b83f4e0a30a159f3c1f3e74ed9d3b9809819ec633dfc5181338
  • Pointer size: 131 Bytes
  • Size of remote file: 236 kB
docs/Windows界面.png ADDED

Git LFS Details

  • SHA256: f3f221fd5ffe4b646f8fa4e19108b09f104918db96849cf5b4a5981e8b89d352
  • Pointer size: 131 Bytes
  • Size of remote file: 225 kB
docs/plans/2026-03-30-portable-ffmpeg-hardening.md ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Portable FFmpeg Hardening Implementation Plan
2
+
3
+ > **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
4
+
5
+ **Goal:** Make portable builds self-contained with bundled ffmpeg and ensure runtime code prefers the bundled binary instead of requiring a system installation.
6
+
7
+ **Architecture:** Add a dedicated runtime helper that resolves bundled ffmpeg paths, prepends them to `PATH`, and sets explicit environment variables. Update GitHub build workflows to copy platform ffmpeg binaries into the packaged tree, and add regression tests that lock both the runtime helper and packaging contract.
8
+
9
+ **Tech Stack:** Python, GitHub Actions, PyInstaller, pytest
10
+
11
+ ---
12
+
13
+ ### Task 1: Add failing tests for portable ffmpeg runtime
14
+
15
+ **Files:**
16
+ - Create: `tests/test_ffmpeg_runtime.py`
17
+
18
+ **Step 1: Write the failing test**
19
+ - Assert a bundled ffmpeg directory is preferred over system lookup.
20
+ - Assert runtime setup prepends the bundled directory to `PATH`.
21
+
22
+ **Step 2: Run test to verify it fails**
23
+ - Run: `pytest -q tests/test_ffmpeg_runtime.py`
24
+
25
+ **Step 3: Write minimal implementation**
26
+ - Add the helper module and call it from `run.py`.
27
+
28
+ **Step 4: Run test to verify it passes**
29
+ - Run: `pytest -q tests/test_ffmpeg_runtime.py`
30
+
31
+ **Step 5: Commit**
32
+ - Commit helper + test once green.
33
+
34
+ ### Task 2: Remove remaining hard dependency on `ffprobe`
35
+
36
+ **Files:**
37
+ - Modify: `infer/modules/uvr5/modules.py`
38
+
39
+ **Step 1: Write the failing test**
40
+ - Assert the module no longer calls `ffmpeg.probe(..., cmd="ffprobe")`.
41
+
42
+ **Step 2: Run test to verify it fails**
43
+ - Run: `pytest -q tests/test_ffmpeg_runtime.py`
44
+
45
+ **Step 3: Write minimal implementation**
46
+ - Replace the probe path with a library-based metadata read that does not shell out to `ffprobe`.
47
+
48
+ **Step 4: Run tests to verify they pass**
49
+ - Run: `pytest -q tests/test_ffmpeg_runtime.py`
50
+
51
+ **Step 5: Commit**
52
+ - Commit metadata-path cleanup.
53
+
54
+ ### Task 3: Bundle ffmpeg in GitHub release builds
55
+
56
+ **Files:**
57
+ - Modify: `.github/workflows/build-executables.yml`
58
+ - Modify: `AI-RVC.spec`
59
+
60
+ **Step 1: Write the failing test**
61
+ - Assert that the workflow steps copy a bundled ffmpeg directory and package it.
62
+
63
+ **Step 2: Run test to verify it fails**
64
+ - Run: `pytest -q tests/test_ffmpeg_runtime.py`
65
+
66
+ **Step 3: Write minimal implementation**
67
+ - Add a workflow step to copy `ffmpeg` into `tools/ffmpeg/bin`.
68
+ - Add the ffmpeg directory to packaged data inputs for both workflow and spec.
69
+
70
+ **Step 4: Run tests to verify they pass**
71
+ - Run: `pytest -q tests/test_ffmpeg_runtime.py`
72
+
73
+ **Step 5: Commit**
74
+ - Commit workflow/spec hardening.
docs/reports/2026-03-24-cover-quality-final-root-cause-report.md ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 2026-03-24 Cover Quality Final Root Cause Report
2
+
3
+ ## 结论
4
+
5
+ 这次问题不是单一模块失效,而是三层策略在同一条链路上叠加失衡:
6
+
7
+ 1. `hybrid` 在官方 VC 路径里实际被解释成激进的 `RMVPE + CREPE` 混合。
8
+ 2. 源约束在活跃唱段对高能量回声压制不足。
9
+ 3. 后处理又把低能量无声段、呼吸段、换气段压得过重。
10
+
11
+ 最终结果就是:
12
+
13
+ - `SILENT PAIN` 这类回声更重、主唱和和声交叠更强的输入,会在活跃唱段残留明显回声。
14
+ - 两首歌都会在呼吸、回气、擦音附近出现电音感、机械感,因为这些段落先被错误赋音高,再被后处理继续削薄。
15
+
16
+ ## 严格根因
17
+
18
+ ### 1. `hybrid` 路由过于激进
19
+
20
+ 已确认官方 VC 路径里的 `hybrid` 请求会进入 `rmvpe+crepe` 方向,而不是“仅在短掉线时保守兜底”。
21
+
22
+ 影响:
23
+
24
+ - RMVPE 本来判为无声或无调性的段落,CREPE 仍可能给出“看起来有音高”的结果。
25
+ - 呼吸、换气、擦音、气声尾音会被错误写入 F0,随后进入 RVC 合成器,听起来就是电音化、机械化。
26
+
27
+ 这条根因比“RMVPE 本身坏了”更准确。RMVPE 不是主问题,激进混合策略才是主问题。
28
+
29
+ ### 2. 活跃唱段的源约束压制不够
30
+
31
+ 之前的源约束主公式本质上更偏向“低活动段清理”,对活跃唱段内的高能量回声缺少持续压力。
32
+
33
+ 影响:
34
+
35
+ - 低能量尾部回声能被抑制。
36
+ - 但高音量、贴着主唱本体的回声伪影更容易穿过去。
37
+
38
+ 这也解释了为什么 `SILENT PAIN` 更差:它不是单纯“DeEcho 没工作”,而是输入本身更容易在活跃唱段把残留回声带进 VC,再被较宽松的后续预算放过去。
39
+
40
+ ### 3. 后处理对呼吸段过度清理
41
+
42
+ 后处理链路里存在两次对低能量段的再压制:
43
+
44
+ - silence gate / unvoiced cleanup
45
+ - source gap suppression
46
+
47
+ 原策略过于激进时,会把真正的呼吸、回气、气声辅音也看成“应被抑制的无声残留”。
48
+
49
+ 影响:
50
+
51
+ - 如果前面 F0 已经有误判,这里会把真实呼吸进一步压没,只剩下 VC 合成残留。
52
+ - 听感上就会更像电音噪点,而不是自然的呼吸。
53
+
54
+ ## 两份旧诊断中需要收紧的地方
55
+
56
+ 以下判断方向是对的,但需要更严格表述:
57
+
58
+ - “高音量回声去不掉”并不等于 UVR DeEcho 完全失效。
59
+ 实际上,DeEcho 对低能量尾部仍有效,主要问题在于后续源约束对活跃唱段压力不足。
60
+
61
+ - “呼吸电音”也不应归结为单一的 silence gate。
62
+ 更准确的说法是:激进 F0 混合先污染了呼吸段,后处理再把真实气声继续削弱,二者叠加后才变成明显机械感。
63
+
64
+ ## 已实施修复
65
+
66
+ ### A. 将 `hybrid` 改为保守策略
67
+
68
+ 已新增统一策略层 `infer/quality_policy.py`,并接入:
69
+
70
+ - `infer/official_adapter.py`
71
+ - `infer/f0_extractor.py`
72
+ - `infer/cover_pipeline.py`
73
+
74
+ 修复后:
75
+
76
+ - 官方 VC 请求 `hybrid` 时,不再直接走激进混合。
77
+ - 主转换统一路由到 `RMVPE`。
78
+ - 只有在短、局部、可信、且处于有声上下文的掉线帧上,才允许 CREPE 参与修补。
79
+ - 后处理门控对 `hybrid` 也统一按 `RMVPE` 语义执行,避免前后语义不一致。
80
+
81
+ ### B. 强化活跃唱段的源约束
82
+
83
+ 已把源约束从“只对低活动段强约束”改成“在活跃且疑似回声重叠的帧上,也保留有效替换压力”。
84
+
85
+ 同时收紧了后续预算:
86
+
87
+ - 活跃段不再允许旧逻辑那样过宽的能量放行。
88
+ - 全局活动 RMS 也做了更对称、更保守的重新裁剪。
89
+
90
+ 目标不是把人声压瘪,而是减少活跃唱段里本不该存在的额外回声包络。
91
+
92
+ ### C. 放松对呼吸段的误杀
93
+
94
+ 已降低后处理对低能量段的攻击性:
95
+
96
+ - `source gap suppression` 触发更谨慎。
97
+ - 衰减深度减轻。
98
+ - `uvr_deecho` 路由下的二次 silence gate 更保守,且加入保护系数。
99
+
100
+ 目标是保留自然呼吸、回气和轻辅音,不再把它们当成必须清掉的伪影。
101
+
102
+ ## 预期效果
103
+
104
+ 修复后预期是:
105
+
106
+ - `SILENT PAIN` 这类高回声输入,在主唱活跃区的残留回声会明显减轻,但不会靠极端静音门去“硬切”。
107
+ - `Far far away` 这类本来底子更好的输入,呼吸和回气会更自然,机械感应下降。
108
+ - 如果模型本身训练音色或索引质量有限,仍可能保留一部分合成质感;这不是本次根因链的核心部分。
109
+
110
+ ## 已完成验证
111
+
112
+ 已完成本地轻量验证:
113
+
114
+ - `python -m unittest tests.test_quality_policy`
115
+ - `python -m py_compile infer/quality_policy.py infer/f0_extractor.py infer/official_adapter.py infer/cover_pipeline.py tests/test_quality_policy.py`
116
+
117
+ 结果:
118
+
119
+ - 单测通过。
120
+ - 语法编译通过。
121
+
122
+ ## 尚未完成的验证
123
+
124
+ 尚未在当前 shell 环境执行重型端到端音频回归,因为当前会话未发现可直接复用的项目虚拟环境解释器。
125
+
126
+ 因此,本报告对“代码修复已落地”是确定的;
127
+ 对“具体两首歌的最终听感改善幅度”仍建议在用户实际运行环境里再做一次 A/B 回归确认。
docs/superpowers/plans/2026-05-17-v1.2.1-language-release.md ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # v1.2.1 Language Release Implementation Plan
2
+
3
+ > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
4
+
5
+ **Goal:** Add a Chinese/English UI language selector and publish v1.2.1 from current local `master`.
6
+
7
+ **Architecture:** Reuse the existing `i18n/*.json` pattern and `ui.app:t()` lookup. Locale is read from `configs/config.json` during startup and saved through a Gradio dropdown with an explicit restart-required status.
8
+
9
+ **Tech Stack:** Python 3.10, Gradio 3.50.2, unittest/pytest-compatible tests, GitHub CLI, Hugging Face CLI.
10
+
11
+ ---
12
+
13
+ ### Task 1: Locale Configuration Tests
14
+
15
+ **Files:**
16
+ - Create: `tests/test_ui_language.py`
17
+ - Modify: `ui/app.py`
18
+ - Create: `i18n/en_US.json`
19
+
20
+ - [ ] **Step 1: Write the failing tests**
21
+
22
+ Create tests that import `ui.app`, assert default locale selection, reject unsupported locales, save language to a temp config file, and require English translation keys.
23
+
24
+ - [ ] **Step 2: Run tests to verify they fail**
25
+
26
+ Run: `python -m pytest tests/test_ui_language.py -q`
27
+ Expected: fail because helper functions and English pack are not implemented.
28
+
29
+ - [ ] **Step 3: Implement minimal locale support**
30
+
31
+ Add `SUPPORTED_LANGUAGES`, config-driven `get_configured_language`, `save_language_setting`, and load `i18n` from the configured language. Add `i18n/en_US.json`.
32
+
33
+ - [ ] **Step 4: Run tests to verify they pass**
34
+
35
+ Run: `python -m pytest tests/test_ui_language.py -q`
36
+ Expected: pass.
37
+
38
+ ### Task 2: Gradio UI Selector
39
+
40
+ **Files:**
41
+ - Modify: `ui/app.py`
42
+ - Modify: `i18n/zh_CN.json`
43
+ - Modify: `i18n/en_US.json`
44
+
45
+ - [ ] **Step 1: Add selector labels to tests**
46
+
47
+ Extend `tests/test_ui_language.py` to assert both language packs include `language`, `save_language`, and `language_saved_restart` settings keys.
48
+
49
+ - [ ] **Step 2: Run tests to verify they fail**
50
+
51
+ Run: `python -m pytest tests/test_ui_language.py -q`
52
+ Expected: fail until translation keys exist.
53
+
54
+ - [ ] **Step 3: Add selector UI**
55
+
56
+ Place a compact language dropdown near the top and a matching Settings control. Saving updates config and returns restart-required text.
57
+
58
+ - [ ] **Step 4: Run tests and import check**
59
+
60
+ Run: `python -m pytest tests/test_ui_language.py tests/test_quality_policy.py::SourceRegressionTests::test_ui_only_exposes_strict_sota_vc_preprocess_modes -q`
61
+ Run: `python -c "import ui.app; print(ui.app.get_configured_language())"`
62
+ Expected: both commands exit 0.
63
+
64
+ ### Task 3: Release
65
+
66
+ **Files:**
67
+ - Git metadata and remote release state.
68
+
69
+ - [ ] **Step 1: Inspect diff and status**
70
+
71
+ Run: `git status -sb` and `git diff --stat`.
72
+
73
+ - [ ] **Step 2: Commit language changes**
74
+
75
+ Run: `git add docs/superpowers/specs/2026-05-17-v1.2.1-language-release-design.md docs/superpowers/plans/2026-05-17-v1.2.1-language-release.md tests/test_ui_language.py i18n/en_US.json i18n/zh_CN.json ui/app.py configs/config.json`
76
+ Run: `git commit -m "ui: add language switcher"`
77
+
78
+ - [ ] **Step 3: Push and release**
79
+
80
+ Run: `git push origin master`
81
+ Run: `gh release create v1.2.1 --target master --title "AI-RVC v1.2.1" --notes-file <release-notes-file>`
82
+
83
+ - [ ] **Step 4: Sync Hugging Face Space**
84
+
85
+ Run: `hf upload mason369/AI-RVC . --repo-type space --commit-message "Release v1.2.1" --exclude ...`
86
+
87
+ - [ ] **Step 5: Check package workflow**
88
+
89
+ Run: `gh run list --workflow "Build Executables" --limit 5`
90
+ Wait for the v1.2.1 workflow and report release assets status.
docs/superpowers/specs/2026-05-17-v1.2.1-language-release-design.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # v1.2.1 Language Release Design
2
+
3
+ ## Goal
4
+
5
+ Release AI-RVC v1.2.1 from the current local `master` branch, adding a visible Chinese/English language selector and publishing matching GitHub and Hugging Face Space versions.
6
+
7
+ ## Approach
8
+
9
+ The UI already centralizes most labels through `i18n/zh_CN.json` and `ui/app.py:t()`. Add `i18n/en_US.json`, store the selected locale in `configs/config.json`, and load that locale at startup. Because Gradio 3 builds component labels statically, changing the selector saves the preference and shows an explicit restart-required message instead of pretending to hot-swap all labels.
10
+
11
+ ## UI Behavior
12
+
13
+ Add a language selector near the top of the app and in Settings. Choices are `中文` and `English`. Saving writes `language` to `configs/config.json`; unsupported locale values fail explicitly and do not silently fall back.
14
+
15
+ ## Release Behavior
16
+
17
+ Publish from the current local `master`: run targeted tests, commit language changes, push `master`, create GitHub Release/tag `v1.2.1`, let the existing `Build Executables` workflow build portable packages, and sync the same source to Hugging Face Space `mason369/AI-RVC`.
18
+
19
+ ## Verification
20
+
21
+ Use test-first coverage for locale config loading, language option validation, English pack existence, and save behavior. After implementation, run the targeted tests plus a `ui.app` import check before committing and publishing.
i18n/en_US.json ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "app_title": "RVC AI Cover",
3
+ "app_description": "AI song cover system based on RVC v2",
4
+ "tabs": {
5
+ "models": "Model Management",
6
+ "cover": "Song Cover",
7
+ "settings": "Settings"
8
+ },
9
+ "ui": {
10
+ "cover_usage": "**One-click AI Cover**: upload a song → separate vocals → convert the voice → mix the accompaniment → export the cover\n\n**Steps:**\n1. Download a character model first (open \"Download Character Model\" below)\n2. Upload a song file (MP3/WAV/FLAC)\n3. Choose a downloaded character\n4. Adjust parameters and click \"Start Cover\"\n\n> ⚠️ First launch downloads the Mel-Band Roformer vocal separation model automatically (about 200 MB). Please wait patiently.",
11
+ "series_filter": "Series / Category",
12
+ "keyword_search": "Keyword Search",
13
+ "keyword_placeholder": "Character or series name",
14
+ "character_choice_info": "The list shows version tags, character ownership, and model source. In versions, epochs means training epochs, while 40k/48k means sample rate.",
15
+ "download_character_info": "The list shows version tags, character ownership, and source repositories so you can distinguish different models for the same character.",
16
+ "refresh_models": "Refresh Model List",
17
+ "download_selected_character": "Download Selected Character",
18
+ "download_series_all": "Download This Category",
19
+ "download_all_characters": "Download All Character Models",
20
+ "download_status": "Download Status",
21
+ "model_name": "Model Name",
22
+ "model_path": "Model Path",
23
+ "index_path": "Index Path",
24
+ "no_models": "(No models)",
25
+ "positive_pitch_info": "Positive raises pitch, negative lowers it",
26
+ "normal_volume_info": "100% keeps original volume",
27
+ "reverb_info": "Adds reverb to the vocals"
28
+ },
29
+ "conversion": {
30
+ "model_settings": "Model Settings",
31
+ "select_model": "Select Model",
32
+ "refresh": "Refresh",
33
+ "load_model": "Load Model",
34
+ "status": "Status",
35
+ "audio_input": "Audio Input",
36
+ "upload_audio": "Upload Audio",
37
+ "conversion_params": "Conversion Parameters",
38
+ "pitch_shift": "Pitch Shift (semitones)",
39
+ "pitch_shift_info": "Positive raises pitch, negative lowers it. Male to female often uses +12; female to male often uses -12",
40
+ "f0_method": "F0 Extraction Method",
41
+ "f0_method_info": "rmvpe gives the highest quality (recommended)",
42
+ "index_rate": "Index Rate",
43
+ "index_rate_info": "Higher values sound closer to the training voice, but may reduce clarity",
44
+ "filter_radius": "Median Filter Radius",
45
+ "filter_radius_info": "Helps reduce breath noise",
46
+ "rms_mix_rate": "Loudness Envelope Mix",
47
+ "rms_mix_rate_info": "Closer to 1 uses more of the output loudness envelope",
48
+ "protect": "Unvoiced Protection",
49
+ "protect_info": "Protects consonants and breath sounds; 0.5 means no protection",
50
+ "start_conversion": "Start Conversion",
51
+ "conversion_result": "Conversion Result",
52
+ "conversion_status": "Conversion Status"
53
+ },
54
+ "cover": {
55
+ "song_cover": "AI Song Cover",
56
+ "cover_description": "Upload a song, choose a character, and generate an AI cover. The pipeline separates vocals, converts the voice, and mixes the accompaniment automatically.",
57
+ "upload_song": "Upload Song",
58
+ "input_song": "Input Song (MP3/WAV/FLAC)",
59
+ "select_character": "Select Character",
60
+ "character": "Character",
61
+ "conversion_settings": "Conversion Settings",
62
+ "pitch_shift": "Pitch Shift (semitones)",
63
+ "index_rate": "Index Rate (%)",
64
+ "index_rate_info": "Recommended: 30-45%; too low sounds more synthetic, too high may blur articulation",
65
+ "speaker_id": "Speaker ID",
66
+ "speaker_id_info": "Single-character models usually use 0; try other IDs for multi-speaker models",
67
+ "mix_settings": "Mix Settings",
68
+ "mix_preset": "Mix Preset",
69
+ "mix_preset_info": "Apply a mix profile suitable for most songs",
70
+ "mix_preset_universal": "Universal (recommended)",
71
+ "mix_preset_vocal": "Vocal Focus",
72
+ "mix_preset_accompaniment": "Accompaniment Focus",
73
+ "mix_preset_live": "Live Feel (more reverb)",
74
+ "vocals_volume": "Vocal Volume (%)",
75
+ "accompaniment_volume": "Accompaniment Volume (%)",
76
+ "vocals_reverb": "Vocal Reverb (%)",
77
+ "rms_mix_rate": "Loudness Envelope Match (%)",
78
+ "rms_mix_rate_info": "Moderately follows the original vocal loudness contour; recommended: 10-30%",
79
+ "backing_mix": "Original Lead Vocal Blend (%)",
80
+ "backing_mix_info": "Blend a small amount of original lead vocal to restore thickness and articulation; too high leaks the source timbre",
81
+ "karaoke_separation": "Harmony Separation",
82
+ "karaoke_separation_info": "Separate lead and backing vocals, convert only the lead, and preserve original harmonies. Useful for songs with backing vocals",
83
+ "karaoke_merge_backing": "Blend Backing Vocals Into Accompaniment",
84
+ "karaoke_merge_backing_info": "Mix separated original backing vocals back into the accompaniment. Enabled by default to preserve harmony layers",
85
+ "start_cover": "Start Cover",
86
+ "progress": "Progress",
87
+ "results": "Cover Results",
88
+ "final_cover": "Final Cover",
89
+ "converted_vocals": "Converted Vocals",
90
+ "original_vocals": "Original Vocals (Separated)",
91
+ "lead_vocals": "Lead Vocals",
92
+ "backing_vocals": "Backing Vocals",
93
+ "accompaniment": "Accompaniment",
94
+ "download_character": "Download Character Model",
95
+ "select_to_download": "Select Character to Download",
96
+ "download": "Download",
97
+ "vc_preprocess_mode": "VC Preprocess Strategy",
98
+ "vc_preprocess_mode_info": "Strictly uses RoFormer De-Reverb SOTA; if unavailable, processing stops instead of degrading",
99
+ "vc_preprocess_auto": "Auto (recommended)",
100
+ "vc_preprocess_uvr_deecho": "Strict RoFormer De-Reverb",
101
+ "source_constraint_mode": "Source Constraint Strategy",
102
+ "source_constraint_mode_info": "Auto (recommended) enables this only for strict RoFormer De-Reverb preprocessing; if unavailable, processing stops instead of degrading",
103
+ "source_constraint_auto": "Auto (recommended)",
104
+ "source_constraint_off": "Off",
105
+ "source_constraint_on": "Always On",
106
+ "vc_preprocess_status": "Current VC Preprocess Route",
107
+ "vc_preprocess_status_info": "Shows the actual route this machine will use in auto mode",
108
+ "vc_pipeline_mode": "VC Pipeline Mode",
109
+ "vc_pipeline_mode_info": "Current implementation keeps this project's existing path; official mode calls the bundled official RVC directly and skips custom VC pre/post-processing",
110
+ "vc_pipeline_mode_current": "Current Implementation (recommended)",
111
+ "vc_pipeline_mode_official": "Official Implementation (1:1)",
112
+ "singing_repair": "Singing Repair",
113
+ "singing_repair_info": "For high-note dropouts, tearing, metallic artifacts, and sibilance. When enabled, this exits strict 1:1 mode and uses the official-compatible repair path (FP32 + hybrid F0 + F0 stabilization/rate limiting)"
114
+ },
115
+ "models": {
116
+ "base_models": "Base Models",
117
+ "base_models_desc": "These models are required to run RVC and must be downloaded before first use.",
118
+ "check_status": "Check Model Status",
119
+ "download_required": "Download Required Models",
120
+ "model_status": "Model Status",
121
+ "voice_models": "Voice Models",
122
+ "voice_models_desc": "Put .pth model files into assets/weights/, then refresh the model list.",
123
+ "mature_deecho_models": "Mature DeEcho Models",
124
+ "mature_deecho_models_desc": "Current covers strictly use RoFormer De-Reverb SOTA; if unavailable, processing stops instead of degrading to older models or algorithm chains.",
125
+ "download_mature_deecho": "Download Mature DeEcho Model",
126
+ "mature_deecho_status": "Mature DeEcho Status",
127
+ "mature_deecho_check": "Check Mature DeEcho Status"
128
+ },
129
+ "settings": {
130
+ "language_settings": "Language Settings",
131
+ "language": "Interface Language",
132
+ "save_language": "Save Language",
133
+ "language_saved_restart": "✅ Interface language saved: {language}. Please restart the app for it to take effect.",
134
+ "language_restart_info": "The language choice is saved to the config file. Because Gradio 3 builds UI labels at startup, restart the app to apply it.",
135
+ "runtime_settings": "Runtime Settings",
136
+ "compute_device": "Compute Device",
137
+ "save_settings": "Save Settings",
138
+ "settings_saved_restart": "✅ Settings saved. Restart the app for changes to take effect.",
139
+ "status": "Status",
140
+ "cpu_slow": "CPU (slow)",
141
+ "about_body": "**RVC AI Cover System**\n\n- Built on RVC v2 + Mel-Band Roformer\n- Uses RMVPE for high-quality F0 extraction\n- Supports CUDA GPU acceleration\n\n[GitHub](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)",
142
+ "model_sources": "### 📥 Character Model Sources\n\nThese are the Hugging Face repositories used by the built-in character model list. You can also download models manually and place them under `assets/weights/characters/<character_name>/`:\n\n**Love Live! Series**\n- [trioskosmos/rvc_models](https://huggingface.co/trioskosmos/rvc_models) — μ's / Aqours / Nijigasaki / Liella! characters\n- [Icchan/LoveLive](https://huggingface.co/Icchan/LoveLive) — Chika, Riko, Eli, You\n- [0xMifune/LoveLive](https://huggingface.co/0xMifune/LoveLive) — Nijigasaki / Liella! / Hasunosora\n- [Swordsmagus/Love-Live-RVC](https://huggingface.co/Swordsmagus/Love-Live-RVC) — Hanamaru, Setsuna, Kotori, A-RISE, and more\n- [Zurakichi/RVC](https://huggingface.co/Zurakichi/RVC) — Nico, Kanata, Setsuna, Hanamaru\n- [Phos252/RVCmodels](https://huggingface.co/Phos252/RVCmodels) — Shibuya Kanon\n- [ChocoKat/Mari_Ohara](https://huggingface.co/ChocoKat/Mari_Ohara) — Mari Ohara\n- [HarunaKasuga/YoshikoTsushima](https://huggingface.co/HarunaKasuga/YoshikoTsushima) — Yoshiko Tsushima\n- [thebuddyadrian/RVC_Models](https://huggingface.co/thebuddyadrian/RVC_Models) — Kazuno sisters\n\n**Genshin / Honkai / Zenless Zone Zero (HoYoverse)**\n- [makiligon/RVC-Models](https://huggingface.co/makiligon/RVC-Models) — Furina, Ayaka, Focalors\n- [kohaku12/RVC-MODELS](https://huggingface.co/kohaku12/RVC-MODELS) — Nahida, Herta, Firefly, Tingyun, Hoshimi Miyabi, and more\n- [jarari/RVC-v2](https://huggingface.co/jarari/RVC-v2) — Furina (Korean), Silver Wolf (Korean)\n- [mrmocciai/genshin-impact](https://huggingface.co/mrmocciai/genshin-impact) — 50+ Genshin characters (manual download)\n\n**VOCALOID**\n- [javinfamous/infamous_miku_v2](https://huggingface.co/javinfamous/infamous_miku_v2) — Hatsune Miku (1000 epochs)\n\n**Hololive / VTuber**\n- [megaaziib/my-rvc-models-collection](https://huggingface.co/megaaziib/my-rvc-models-collection) — Pekora, Sakura Miko, Subaru, Kobo, Kaela, and more\n- [Kit-Lemonfoot/kitlemonfoot_rvc_models](https://huggingface.co/Kit-Lemonfoot/kitlemonfoot_rvc_models) — Hololive JP/EN characters\n\n**THE IDOLM@STER / Uma Musume**\n- [trioskosmos/rvc_models](https://huggingface.co/trioskosmos/rvc_models) — Ranko Kanzaki, Riamu Yumemi\n- [makiligon/RVC-Models](https://huggingface.co/makiligon/RVC-Models) — Takane Shijou, Rice Shower\n\n**Project SEKAI**\n- [kohaku12/RVC-MODELS](https://huggingface.co/kohaku12/RVC-MODELS) — Nene Kusanagi\n\n> 💡 After manual download, put `.pth` and `.index` files under `assets/weights/characters/<character_name>/`, then refresh the list.",
143
+ "device_info": "Device Information",
144
+ "current_device": "Current Device",
145
+ "refresh_device": "Refresh Device Information",
146
+ "about": "About"
147
+ },
148
+ "messages": {
149
+ "model_loaded": "Model loaded",
150
+ "model_not_found": "Model not found",
151
+ "load_failed": "Load failed",
152
+ "please_select_model": "Please select a model",
153
+ "please_load_model": "Please load a model first",
154
+ "please_upload_audio": "Please upload an audio file",
155
+ "conversion_complete": "Conversion complete",
156
+ "conversion_failed": "Conversion failed",
157
+ "download_complete": "Download complete",
158
+ "download_failed": "Download failed",
159
+ "cover_complete": "Cover complete",
160
+ "cover_failed": "Cover failed",
161
+ "separating_vocals": "Separating vocals...",
162
+ "converting_vocals": "Converting vocals...",
163
+ "mixing_audio": "Mixing audio..."
164
+ }
165
+ }
i18n/zh_CN.json CHANGED
@@ -6,6 +6,26 @@
6
  "cover": "歌曲翻唱",
7
  "settings": "设置"
8
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  "conversion": {
10
  "model_settings": "模型设置",
11
  "select_model": "选择模型",
@@ -75,13 +95,11 @@
75
  "select_to_download": "选择要下载的角色",
76
  "download": "下载",
77
  "vc_preprocess_mode": "VC预处理策略",
78
- "vc_preprocess_mode_info": "参考成熟项目:优先学习型 DeEcho/DeReverb,否则主唱直通 RVC",
79
  "vc_preprocess_auto": "自动(推荐)",
80
- "vc_preprocess_direct": "主唱直通",
81
- "vc_preprocess_uvr_deecho": "官方DeEcho优先",
82
- "vc_preprocess_legacy": "旧版手工链",
83
  "source_constraint_mode": "源约束策略",
84
- "source_constraint_mode_info": "自动(推荐)下仅旧版手工链启用;学习型 DeEcho 或主唱直通默认不再追加自定义源约束",
85
  "source_constraint_auto": "自动(推荐)",
86
  "source_constraint_off": "关闭",
87
  "source_constraint_on": "始终开启",
@@ -89,10 +107,10 @@
89
  "vc_preprocess_status_info": "显示本机当前自动模式会走的实际路径",
90
  "vc_pipeline_mode": "VC管线模式",
91
  "vc_pipeline_mode_info": "当前实现保留本项目现有链路;官方实现会直接调用内置官方 RVC,并跳过自定义 VC 前后处理",
92
- "vc_pipeline_mode_current": "当前实现(推荐)",
93
- "vc_pipeline_mode_official": "官方实现(一比一)",
94
- "singing_repair": "唱歌修复",
95
- "singing_repair_info": "用于高音断音、撕裂、电音、齿音;开启后会退出严格一比一,改用官方兼容修复链(FP32 + 混合F0 + F0稳定/限速)"
96
  },
97
  "models": {
98
  "base_models": "基础模型",
@@ -103,12 +121,25 @@
103
  "voice_models": "语音模型",
104
  "voice_models_desc": "将 .pth 模型文件放入 assets/weights/ 目录,然后刷新模型列表。",
105
  "mature_deecho_models": "成熟 DeEcho 模型",
106
- "mature_deecho_models_desc": "用于成熟项目常见的学习型去回声/去混响流程。未下载时,翻唱页的自动模式会回退为主唱直通 RVC。",
107
  "download_mature_deecho": "下载成熟 DeEcho 模型",
108
  "mature_deecho_status": "成熟 DeEcho 状态",
109
  "mature_deecho_check": "检查成熟 DeEcho 状态"
110
  },
111
  "settings": {
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  "device_info": "设备信息",
113
  "current_device": "当前设备",
114
  "refresh_device": "刷新设备信息",
 
6
  "cover": "歌曲翻唱",
7
  "settings": "设置"
8
  },
9
+ "ui": {
10
+ "cover_usage": "**一键 AI 翻唱**:上传歌曲 → 自动分离人声 → 转换音色 → 混合伴奏 → 输出翻唱\n\n**使用步骤:**\n1. 先下载角色模型(展开下方「下载角色模型」)\n2. 上传歌曲文件(支持 MP3/WAV/FLAC)\n3. 选择已下载的角色\n4. 调整参数后点击「开始翻唱」\n\n> ⚠️ 首次运行会自动下载 Mel-Band Roformer 人声分离模型(约 200MB),请耐心等待",
11
+ "series_filter": "作品/分类",
12
+ "keyword_search": "关键词搜索",
13
+ "keyword_placeholder": "输入角色名/作品名",
14
+ "character_choice_info": "列表会显示版本标识、角色归属和模型来源;版本里常见的 epochs 表示训练轮数,40k/48k 表示采样率。",
15
+ "download_character_info": "列表会显示版本标识、角色归属和来源仓库,便于区分同角色的不同模型。",
16
+ "refresh_models": "刷新模型列表",
17
+ "download_selected_character": "下载选中角色",
18
+ "download_series_all": "下载该分类全部",
19
+ "download_all_characters": "下载全部角色模型",
20
+ "download_status": "下载状态",
21
+ "model_name": "模型名称",
22
+ "model_path": "模型路径",
23
+ "index_path": "索引路径",
24
+ "no_models": "(无模型)",
25
+ "positive_pitch_info": "正数升调,负数降调",
26
+ "normal_volume_info": "100% 为原始音量",
27
+ "reverb_info": "为人声添加混响效果"
28
+ },
29
  "conversion": {
30
  "model_settings": "模型设置",
31
  "select_model": "选择模型",
 
95
  "select_to_download": "选择要下载的角色",
96
  "download": "下载",
97
  "vc_preprocess_mode": "VC预处理策略",
98
+ "vc_preprocess_mode_info": "严格使用 RoFormer De-Reverb SOTA;不可用时停止,不降级",
99
  "vc_preprocess_auto": "自动(推荐)",
100
+ "vc_preprocess_uvr_deecho": "严格RoFormer De-Reverb",
 
 
101
  "source_constraint_mode": "源约束策略",
102
+ "source_constraint_mode_info": "自动(推荐)下仅严格 RoFormer De-Reverb 预处理启用;不可用时停止,不降级",
103
  "source_constraint_auto": "自动(推荐)",
104
  "source_constraint_off": "关闭",
105
  "source_constraint_on": "始终开启",
 
107
  "vc_preprocess_status_info": "显示本机当前自动模式会走的实际路径",
108
  "vc_pipeline_mode": "VC管线模式",
109
  "vc_pipeline_mode_info": "当前实现保留本项目现有链路;官方实现会直接调用内置官方 RVC,并跳过自定义 VC 前后处理",
110
+ "vc_pipeline_mode_current": "当前实现(推荐)",
111
+ "vc_pipeline_mode_official": "官方实现(一比一)",
112
+ "singing_repair": "唱歌修复",
113
+ "singing_repair_info": "用于高音断音、撕裂、电音、齿音;开启后会退出严格一比一,改用官方兼容修复链(FP32 + 混合F0 + F0稳定/限速)"
114
  },
115
  "models": {
116
  "base_models": "基础模型",
 
121
  "voice_models": "语音模型",
122
  "voice_models_desc": "将 .pth 模型文件放入 assets/weights/ 目录,然后刷新模型列表。",
123
  "mature_deecho_models": "成熟 DeEcho 模型",
124
+ "mature_deecho_models_desc": "当前翻唱严格使用 RoFormer De-Reverb SOTA;不可用时停止处理,不降级到旧模型或算法链。",
125
  "download_mature_deecho": "下载成熟 DeEcho 模型",
126
  "mature_deecho_status": "成熟 DeEcho 状态",
127
  "mature_deecho_check": "检查成熟 DeEcho 状态"
128
  },
129
  "settings": {
130
+ "language_settings": "语言设置",
131
+ "language": "界面语言",
132
+ "save_language": "保存语言",
133
+ "language_saved_restart": "✅ 已保存界面语言:{language}。请重启应用后生效。",
134
+ "language_restart_info": "切换语言会保存到配置文件;由于 Gradio 3 的界面文案在启动时生成,重启后生效。",
135
+ "runtime_settings": "运行设置",
136
+ "compute_device": "计算设备",
137
+ "save_settings": "保存设置",
138
+ "settings_saved_restart": "✅ 设置已保存,重启后生效",
139
+ "status": "状态",
140
+ "cpu_slow": "CPU (较慢)",
141
+ "about_body": "**RVC AI 翻唱系统**\n\n- 基于 RVC v2 + Mel-Band Roformer\n- 使用 RMVPE 进行高质量 F0 提取\n- 支持 CUDA GPU 加速\n\n[GitHub](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)",
142
+ "model_sources": "### 📥 角色模型来源\n\n以下是本项目角色模型的 HuggingFace 仓库来源,你也可以手动下载模型后放入 `assets/weights/characters/<角色名>/` 目录使用:\n\n**Love Live! 系列**\n- [trioskosmos/rvc_models](https://huggingface.co/trioskosmos/rvc_models) — μ's / Aqours / 虹咲 / Liella! 多角色\n- [Icchan/LoveLive](https://huggingface.co/Icchan/LoveLive) — 千歌、梨子、绘里、曜\n- [0xMifune/LoveLive](https://huggingface.co/0xMifune/LoveLive) — 虹咲 / Liella! / 莲之空\n- [Swordsmagus/Love-Live-RVC](https://huggingface.co/Swordsmagus/Love-Live-RVC) — 花丸、雪菜、小鸟、A-RISE 等\n- [Zurakichi/RVC](https://huggingface.co/Zurakichi/RVC) — 妮可、彼方、雪菜、花丸\n- [Phos252/RVCmodels](https://huggingface.co/Phos252/RVCmodels) — 涩谷香音\n- [ChocoKat/Mari_Ohara](https://huggingface.co/ChocoKat/Mari_Ohara) — 小原鞠莉\n- [HarunaKasuga/YoshikoTsushima](https://huggingface.co/HarunaKasuga/YoshikoTsushima) — 津岛善子\n- [thebuddyadrian/RVC_Models](https://huggingface.co/thebuddyadrian/RVC_Models) — 鹿角姐妹\n\n**原神 / 崩坏 / 绝区零 (米哈游)**\n- [makiligon/RVC-Models](https://huggingface.co/makiligon/RVC-Models) — 芙宁娜、绫华、芙卡洛斯\n- [kohaku12/RVC-MODELS](https://huggingface.co/kohaku12/RVC-MODELS) — 纳西妲、黑塔、流萤、停云、星见雅 等\n- [jarari/RVC-v2](https://huggingface.co/jarari/RVC-v2) — 芙宁娜(韩语)、银狼(韩语)\n- [mrmocciai/genshin-impact](https://huggingface.co/mrmocciai/genshin-impact) — 原神 50+ 角色(需手动下载)\n\n**VOCALOID**\n- [javinfamous/infamous_miku_v2](https://huggingface.co/javinfamous/infamous_miku_v2) — 初音未来 (1000 epochs)\n\n**Hololive / VTuber**\n- [megaaziib/my-rvc-models-collection](https://huggingface.co/megaaziib/my-rvc-models-collection) — 佩克拉、樱巫女、大空昴、Kobo、Kaela 等\n- [Kit-Lemonfoot/kitlemonfoot_rvc_models](https://huggingface.co/Kit-Lemonfoot/kitlemonfoot_rvc_models) — Hololive JP/EN 多角色\n\n**偶像大师 / 赛马娘**\n- [trioskosmos/rvc_models](https://huggingface.co/trioskosmos/rvc_models) — 神崎兰子、梦见莉亚梦\n- [makiligon/RVC-Models](https://huggingface.co/makiligon/RVC-Models) — 四条贵音、米浴\n\n**Project SEKAI**\n- [kohaku12/RVC-MODELS](https://huggingface.co/kohaku12/RVC-MODELS) — 草薙宁宁\n\n> 💡 手动下载后,将 `.pth` 和 
`.index` 文件放入 `assets/weights/characters/<角色名>/` 目录,刷新即可使用。",
143
  "device_info": "设备信息",
144
  "current_device": "当前设备",
145
  "refresh_device": "刷新设备信息",
infer/__init__.py CHANGED
@@ -1,17 +1,17 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- 推理模块
4
- """
5
- from .f0_extractor import (
6
- F0Extractor,
7
- get_f0_extractor,
8
- shift_f0,
9
- F0Method
10
- )
11
-
12
- __all__ = [
13
- "F0Extractor",
14
- "get_f0_extractor",
15
- "shift_f0",
16
- "F0Method"
17
- ]
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 推理模块
4
+ """
5
+ from .f0_extractor import (
6
+ F0Extractor,
7
+ get_f0_extractor,
8
+ shift_f0,
9
+ F0Method
10
+ )
11
+
12
+ __all__ = [
13
+ "F0Extractor",
14
+ "get_f0_extractor",
15
+ "shift_f0",
16
+ "F0Method"
17
+ ]
infer/advanced_dereverb.py CHANGED
@@ -213,38 +213,6 @@ def advanced_dereverb(
213
 
214
  return dry_signal.astype(np.float32), reverb_tail.astype(np.float32)
215
 
216
-
217
- def apply_reverb_to_converted(
218
- converted_dry: np.ndarray,
219
- original_reverb: np.ndarray,
220
- mix_ratio: float = 0.8
221
- ) -> np.ndarray:
222
- """
223
- 将原始混响重新应用到转换后的干声上
224
-
225
- Args:
226
- converted_dry: 转换后的干声
227
- original_reverb: 原始混响尾巴
228
- mix_ratio: 混响混合比例 (0-1)
229
-
230
- Returns:
231
- wet_signal: 带混响的转换结果
232
- """
233
- # 对齐长度
234
- min_len = min(len(converted_dry), len(original_reverb))
235
- converted_dry = converted_dry[:min_len]
236
- original_reverb = original_reverb[:min_len]
237
-
238
- # 混合
239
- wet_signal = converted_dry + mix_ratio * original_reverb
240
-
241
- # 软限幅
242
- from lib.audio import soft_clip
243
- wet_signal = soft_clip(wet_signal, threshold=0.9, ceiling=0.99)
244
-
245
- return wet_signal.astype(np.float32)
246
-
247
-
248
  if __name__ == "__main__":
249
  # 测试
250
  print("Testing advanced dereverberation...")
 
213
 
214
  return dry_signal.astype(np.float32), reverb_tail.astype(np.float32)
215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
216
  if __name__ == "__main__":
217
  # 测试
218
  print("Testing advanced dereverberation...")
infer/cover_pipeline.py CHANGED
The diff for this file is too large to render. See raw diff
 
infer/f0_extractor.py CHANGED
@@ -1,17 +1,23 @@
1
  # -*- coding: utf-8 -*-
2
  """
3
- F0 (基频) 提取模块 - 支持多种提取方法
4
  """
 
 
 
 
 
5
  import numpy as np
6
  import torch
7
- from typing import Optional, Literal
8
 
9
- # F0 提取方法类型
 
 
10
  F0Method = Literal["rmvpe", "pm", "harvest", "crepe", "hybrid"]
11
 
12
 
13
  class F0Extractor:
14
- """F0 提取器基类"""
15
 
16
  def __init__(self, sample_rate: int = 16000, hop_length: int = 160):
17
  self.sample_rate = sample_rate
@@ -20,34 +26,30 @@ class F0Extractor:
20
  self.f0_max = 1100
21
 
22
  def extract(self, audio: np.ndarray) -> np.ndarray:
23
- """提取 F0,子类需实现此方法"""
24
  raise NotImplementedError
25
 
26
 
27
  class PMExtractor(F0Extractor):
28
- """Parselmouth (Praat) F0 提取器 - 速度快"""
29
 
30
  def extract(self, audio: np.ndarray) -> np.ndarray:
31
  import parselmouth
32
 
33
  time_step = self.hop_length / self.sample_rate
34
  sound = parselmouth.Sound(audio, self.sample_rate)
35
-
36
  pitch = sound.to_pitch_ac(
37
  time_step=time_step,
38
  voicing_threshold=0.6,
39
  pitch_floor=self.f0_min,
40
- pitch_ceiling=self.f0_max
41
  )
42
-
43
  f0 = pitch.selected_array["frequency"]
44
  f0[f0 == 0] = np.nan
45
-
46
  return f0
47
 
48
 
49
  class HarvestExtractor(F0Extractor):
50
- """PyWorld Harvest F0 提取器 - 质量较好"""
51
 
52
  def extract(self, audio: np.ndarray) -> np.ndarray:
53
  import pyworld
@@ -58,26 +60,27 @@ class HarvestExtractor(F0Extractor):
58
  self.sample_rate,
59
  f0_floor=self.f0_min,
60
  f0_ceil=self.f0_max,
61
- frame_period=self.hop_length / self.sample_rate * 1000
62
  )
63
-
64
  return f0
65
 
66
 
67
  class CrepeExtractor(F0Extractor):
68
- """TorchCrepe F0 提取器 - 深度学习方法"""
69
-
70
- def __init__(self, sample_rate: int = 16000, hop_length: int = 160,
71
- device: str = "cuda"):
 
 
 
 
72
  super().__init__(sample_rate, hop_length)
73
  self.device = device
74
 
75
  def extract(self, audio: np.ndarray) -> np.ndarray:
76
  import torchcrepe
77
 
78
- audio_tensor = torch.from_numpy(audio).float().unsqueeze(0)
79
- audio_tensor = audio_tensor.to(self.device)
80
-
81
  f0, _ = torchcrepe.predict(
82
  audio_tensor,
83
  self.sample_rate,
@@ -87,111 +90,120 @@ class CrepeExtractor(F0Extractor):
87
  model="full",
88
  batch_size=512,
89
  device=self.device,
90
- return_periodicity=True
91
  )
92
-
93
- f0 = f0.squeeze(0).cpu().numpy()
94
- return f0
95
 
96
 
97
  class RMVPEExtractor(F0Extractor):
98
- """RMVPE F0 提取器 - 质量最高 (推荐)"""
99
-
100
- def __init__(self, model_path: str, sample_rate: int = 16000,
101
- hop_length: int = 160, device: str = "cuda"):
 
 
 
 
 
102
  super().__init__(sample_rate, hop_length)
103
  self.device = device
104
  self.model = None
105
  self.model_path = model_path
106
 
107
- def load_model(self):
108
- """加载 RMVPE 模型"""
109
  if self.model is not None:
110
  return
111
 
112
  from models.rmvpe import RMVPE
113
 
114
  self.model = RMVPE(self.model_path, device=self.device)
115
- print(f"RMVPE 模型已加载: {self.device}")
116
 
117
  def extract(self, audio: np.ndarray) -> np.ndarray:
118
  self.load_model()
 
119
 
120
- # RMVPE 需要 16kHz 输入
121
- f0 = self.model.infer_from_audio(audio, thred=0.01)
122
-
123
- return f0
124
-
125
-
126
- def get_f0_extractor(method: F0Method, device: str = "cuda",
127
- rmvpe_path: str = None, crepe_threshold: float = 0.05) -> F0Extractor:
128
- """
129
- 获取 F0 提取器实例
130
 
131
- Args:
132
- method: 提取方法 ("rmvpe", "pm", "harvest", "crepe", "hybrid")
133
- device: 计算设备
134
- rmvpe_path: RMVPE 模型路径 (rmvpe/hybrid 方法需要)
135
- crepe_threshold: CREPE置信度阈值 (仅hybrid方法使用)
 
 
136
 
137
- Returns:
138
- F0Extractor: 提取器实例
139
- """
140
  if method == "rmvpe":
141
  if rmvpe_path is None:
142
- raise ValueError("RMVPE 方法需要指定模型路径")
143
  return RMVPEExtractor(rmvpe_path, device=device)
144
- elif method == "hybrid":
145
  if rmvpe_path is None:
146
- raise ValueError("Hybrid 方法需要指定RMVPE模型路径")
147
- return HybridF0Extractor(rmvpe_path, device=device, crepe_threshold=crepe_threshold)
148
- elif method == "pm":
 
 
 
 
149
  return PMExtractor()
150
- elif method == "harvest":
151
  return HarvestExtractor()
152
- elif method == "crepe":
153
  return CrepeExtractor(device=device)
154
- else:
155
- raise ValueError(f"未知的 F0 提取方法: {method}")
156
 
157
 
158
  class HybridF0Extractor(F0Extractor):
159
- """混合F0提取器 - RMVPE主导 + CREPE高精度补充"""
 
 
 
 
 
160
 
161
- def __init__(self, rmvpe_path: str, sample_rate: int = 16000,
162
- hop_length: int = 160, device: str = "cuda",
163
- crepe_threshold: float = 0.05):
 
 
 
 
 
 
 
 
164
  super().__init__(sample_rate, hop_length)
165
  self.device = device
166
  self.rmvpe = RMVPEExtractor(rmvpe_path, sample_rate, hop_length, device)
167
- self.crepe = None # 延迟加载
168
- self.crepe_threshold = crepe_threshold
169
-
170
- def _load_crepe(self):
171
- """延迟加载CREPE模型"""
172
- if self.crepe is None:
173
- try:
174
- self.crepe = CrepeExtractor(self.sample_rate, self.hop_length, self.device)
175
- except ImportError:
176
- print("警告: torchcrepe未安装,混合F0将仅使用RMVPE")
177
- self.crepe = False
 
 
 
 
 
 
 
 
178
 
179
  def extract(self, audio: np.ndarray) -> np.ndarray:
180
- """
181
- 混合提取F0:
182
- 1. 使用RMVPE作为主要方法(快速、稳定)
183
- 2. 在RMVPE不稳定的区域使用CREPE补充(高精度)
184
- """
185
- # 提取RMVPE F0
186
  f0_rmvpe = self.rmvpe.extract(audio)
187
 
188
- # 如果CREPE不可用,直接返回RMVPE结果
189
  self._load_crepe()
190
  if self.crepe is False:
191
  return f0_rmvpe
192
 
193
- # 提取CREPE F0和置信度
194
  import torchcrepe
 
195
  audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
196
  f0_crepe, confidence = torchcrepe.predict(
197
  audio_tensor,
@@ -202,64 +214,39 @@ class HybridF0Extractor(F0Extractor):
202
  model="full",
203
  batch_size=512,
204
  device=self.device,
205
- return_periodicity=True
206
  )
207
  f0_crepe = f0_crepe.squeeze(0).cpu().numpy()
208
  confidence = confidence.squeeze(0).cpu().numpy()
209
 
210
- # 对齐长度
211
  min_len = min(len(f0_rmvpe), len(f0_crepe), len(confidence))
212
- f0_rmvpe = f0_rmvpe[:min_len]
213
- f0_crepe = f0_crepe[:min_len]
214
- confidence = confidence[:min_len]
215
-
216
- # 检测RMVPE不稳定区域
217
- # 1. F0跳变过大(超过3个半音)
218
- f0_diff = np.abs(np.diff(f0_rmvpe, prepend=f0_rmvpe[0]))
219
- semitone_diff = np.abs(12 * np.log2((f0_rmvpe + 1e-6) / (np.roll(f0_rmvpe, 1) + 1e-6)))
220
- semitone_diff[0] = 0
221
- unstable_jump = semitone_diff > 3.0
222
-
223
- # 2. CREPE置信度高但RMVPE给出F0=0
224
- unstable_unvoiced = (f0_rmvpe < 1e-3) & (confidence > self.crepe_threshold)
225
-
226
- # 3. RMVPE和CREPE差异过大(超过2个半音)且CREPE置信度高
227
- f0_ratio = (f0_crepe + 1e-6) / (f0_rmvpe + 1e-6)
228
- semitone_gap = np.abs(12 * np.log2(f0_ratio))
229
- unstable_diverge = (semitone_gap > 2.0) & (confidence > self.crepe_threshold * 1.5)
230
 
231
- # 合并不稳定区域
232
- unstable_mask = unstable_jump | unstable_unvoiced | unstable_diverge
 
 
 
 
 
 
 
 
 
 
 
233
 
234
- # 扩展不稳定区域(前后各2帧)以平滑过渡
235
- kernel = np.ones(5, dtype=bool)
236
- unstable_mask = np.convolve(unstable_mask, kernel, mode='same')
237
 
238
- # 混合F0:不稳定区域使用CREPE,其他区域使用RMVPE
239
  f0_hybrid = f0_rmvpe.copy()
240
- f0_hybrid[unstable_mask] = f0_crepe[unstable_mask]
241
-
242
- # 平滑过渡边界
243
- for i in range(1, len(f0_hybrid) - 1):
244
- if unstable_mask[i] != unstable_mask[i-1]:
245
- # 边界处使用加权平均
246
- w = 0.5
247
- f0_hybrid[i] = w * f0_rmvpe[i] + (1-w) * f0_crepe[i]
248
-
249
  return f0_hybrid
250
 
251
 
252
  def shift_f0(f0: np.ndarray, semitones: float) -> np.ndarray:
253
- """
254
- 音调偏移
255
 
256
- Args:
257
- f0: 原始 F0
258
- semitones: 偏移半音数 (正数升调,负数降调)
259
-
260
- Returns:
261
- np.ndarray: 偏移后的 F0
262
- """
263
  factor = 2 ** (semitones / 12)
264
- f0_shifted = f0 * factor
265
- return f0_shifted
 
1
  # -*- coding: utf-8 -*-
2
  """
3
+ F0 extraction helpers used by the local VC pipeline.
4
  """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Literal
9
+
10
  import numpy as np
11
  import torch
 
12
 
13
+ from infer.quality_policy import build_conservative_crepe_fill_mask
14
+
15
+
16
  F0Method = Literal["rmvpe", "pm", "harvest", "crepe", "hybrid"]
17
 
18
 
19
  class F0Extractor:
20
+ """Base F0 extractor."""
21
 
22
  def __init__(self, sample_rate: int = 16000, hop_length: int = 160):
23
  self.sample_rate = sample_rate
 
26
  self.f0_max = 1100
27
 
28
  def extract(self, audio: np.ndarray) -> np.ndarray:
 
29
  raise NotImplementedError
30
 
31
 
32
  class PMExtractor(F0Extractor):
33
+ """Praat/Parselmouth extractor."""
34
 
35
  def extract(self, audio: np.ndarray) -> np.ndarray:
36
  import parselmouth
37
 
38
  time_step = self.hop_length / self.sample_rate
39
  sound = parselmouth.Sound(audio, self.sample_rate)
 
40
  pitch = sound.to_pitch_ac(
41
  time_step=time_step,
42
  voicing_threshold=0.6,
43
  pitch_floor=self.f0_min,
44
+ pitch_ceiling=self.f0_max,
45
  )
 
46
  f0 = pitch.selected_array["frequency"]
47
  f0[f0 == 0] = np.nan
 
48
  return f0
49
 
50
 
51
  class HarvestExtractor(F0Extractor):
52
+ """PyWorld harvest extractor."""
53
 
54
  def extract(self, audio: np.ndarray) -> np.ndarray:
55
  import pyworld
 
60
  self.sample_rate,
61
  f0_floor=self.f0_min,
62
  f0_ceil=self.f0_max,
63
+ frame_period=self.hop_length / self.sample_rate * 1000,
64
  )
 
65
  return f0
66
 
67
 
68
  class CrepeExtractor(F0Extractor):
69
+ """TorchCrepe extractor."""
70
+
71
+ def __init__(
72
+ self,
73
+ sample_rate: int = 16000,
74
+ hop_length: int = 160,
75
+ device: str = "cuda",
76
+ ):
77
  super().__init__(sample_rate, hop_length)
78
  self.device = device
79
 
80
  def extract(self, audio: np.ndarray) -> np.ndarray:
81
  import torchcrepe
82
 
83
+ audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
 
 
84
  f0, _ = torchcrepe.predict(
85
  audio_tensor,
86
  self.sample_rate,
 
90
  model="full",
91
  batch_size=512,
92
  device=self.device,
93
+ return_periodicity=True,
94
  )
95
+ return f0.squeeze(0).cpu().numpy()
 
 
96
 
97
 
98
  class RMVPEExtractor(F0Extractor):
99
+ """RMVPE extractor."""
100
+
101
+ def __init__(
102
+ self,
103
+ model_path: str,
104
+ sample_rate: int = 16000,
105
+ hop_length: int = 160,
106
+ device: str = "cuda",
107
+ ):
108
  super().__init__(sample_rate, hop_length)
109
  self.device = device
110
  self.model = None
111
  self.model_path = model_path
112
 
113
+ def load_model(self) -> None:
 
114
  if self.model is not None:
115
  return
116
 
117
  from models.rmvpe import RMVPE
118
 
119
  self.model = RMVPE(self.model_path, device=self.device)
120
+ print(f"RMVPE model loaded: {self.device}")
121
 
122
  def extract(self, audio: np.ndarray) -> np.ndarray:
123
  self.load_model()
124
+ return self.model.infer_from_audio(audio, thred=0.01)
125
 
 
 
 
 
 
 
 
 
 
 
126
 
127
+ def get_f0_extractor(
128
+ method: F0Method,
129
+ device: str = "cuda",
130
+ rmvpe_path: str | None = None,
131
+ crepe_threshold: float = 0.05,
132
+ ) -> F0Extractor:
133
+ """Create an F0 extractor."""
134
 
 
 
 
135
  if method == "rmvpe":
136
  if rmvpe_path is None:
137
+ raise ValueError("RMVPE requires a model path")
138
  return RMVPEExtractor(rmvpe_path, device=device)
139
+ if method == "hybrid":
140
  if rmvpe_path is None:
141
+ raise ValueError("Hybrid requires an RMVPE model path")
142
+ return HybridF0Extractor(
143
+ rmvpe_path,
144
+ device=device,
145
+ crepe_threshold=crepe_threshold,
146
+ )
147
+ if method == "pm":
148
  return PMExtractor()
149
+ if method == "harvest":
150
  return HarvestExtractor()
151
+ if method == "crepe":
152
  return CrepeExtractor(device=device)
153
+ raise ValueError(f"Unknown F0 method: {method}")
 
154
 
155
 
156
  class HybridF0Extractor(F0Extractor):
157
+ """
158
+ Conservative hybrid extractor.
159
+
160
+ RMVPE remains the primary estimator. CREPE is only allowed to repair short,
161
+ high-confidence dropouts that sit inside already-voiced context.
162
+ """
163
 
164
+ def __init__(
165
+ self,
166
+ rmvpe_path: str,
167
+ sample_rate: int = 16000,
168
+ hop_length: int = 160,
169
+ device: str = "cuda",
170
+ crepe_threshold: float = 0.05,
171
+ max_fill_ratio: float = 0.02,
172
+ max_fill_frames: int = 320,
173
+ context_radius: int = 6,
174
+ ):
175
  super().__init__(sample_rate, hop_length)
176
  self.device = device
177
  self.rmvpe = RMVPEExtractor(rmvpe_path, sample_rate, hop_length, device)
178
+ self.crepe = None
179
+ self.crepe_threshold = float(crepe_threshold)
180
+ self.max_fill_ratio = float(max_fill_ratio)
181
+ self.max_fill_frames = int(max_fill_frames)
182
+ self.context_radius = int(context_radius)
183
+
184
+ def _load_crepe(self) -> None:
185
+ if self.crepe is not None:
186
+ return
187
+
188
+ try:
189
+ self.crepe = CrepeExtractor(
190
+ self.sample_rate,
191
+ self.hop_length,
192
+ self.device,
193
+ )
194
+ except ImportError:
195
+ print("Warning: torchcrepe is unavailable, hybrid falls back to RMVPE")
196
+ self.crepe = False
197
 
198
  def extract(self, audio: np.ndarray) -> np.ndarray:
 
 
 
 
 
 
199
  f0_rmvpe = self.rmvpe.extract(audio)
200
 
 
201
  self._load_crepe()
202
  if self.crepe is False:
203
  return f0_rmvpe
204
 
 
205
  import torchcrepe
206
+
207
  audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
208
  f0_crepe, confidence = torchcrepe.predict(
209
  audio_tensor,
 
214
  model="full",
215
  batch_size=512,
216
  device=self.device,
217
+ return_periodicity=True,
218
  )
219
  f0_crepe = f0_crepe.squeeze(0).cpu().numpy()
220
  confidence = confidence.squeeze(0).cpu().numpy()
221
 
 
222
  min_len = min(len(f0_rmvpe), len(f0_crepe), len(confidence))
223
+ if min_len <= 0:
224
+ return f0_rmvpe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
+ f0_rmvpe = np.asarray(f0_rmvpe[:min_len], dtype=np.float32)
227
+ f0_crepe = np.asarray(f0_crepe[:min_len], dtype=np.float32)
228
+ confidence = np.asarray(confidence[:min_len], dtype=np.float32)
229
+
230
+ fill_mask = build_conservative_crepe_fill_mask(
231
+ f0_rmvpe=f0_rmvpe,
232
+ f0_crepe=f0_crepe,
233
+ confidence=confidence,
234
+ confidence_threshold=self.crepe_threshold,
235
+ max_ratio=self.max_fill_ratio,
236
+ max_frames=self.max_fill_frames,
237
+ context_radius=self.context_radius,
238
+ )
239
 
240
+ if not np.any(fill_mask):
241
+ return f0_rmvpe
 
242
 
 
243
  f0_hybrid = f0_rmvpe.copy()
244
+ f0_hybrid[fill_mask] = f0_crepe[fill_mask]
 
 
 
 
 
 
 
 
245
  return f0_hybrid
246
 
247
 
248
  def shift_f0(f0: np.ndarray, semitones: float) -> np.ndarray:
249
+ """Shift F0 by semitones."""
 
250
 
 
 
 
 
 
 
 
251
  factor = 2 ** (semitones / 12)
252
+ return f0 * factor
 
infer/modules/uvr5/modules.py CHANGED
@@ -4,7 +4,7 @@ import logging
4
 
5
  logger = logging.getLogger(__name__)
6
 
7
- import ffmpeg
8
  import torch
9
 
10
  from configs.config import Config
@@ -20,6 +20,24 @@ except ImportError:
20
  config = Config()
21
 
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
24
  infos = []
25
  try:
@@ -74,9 +92,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
74
  need_reformat = 1
75
  done = 0
76
  try:
77
- info = ffmpeg.probe(inp_path, cmd="ffprobe")
78
- channels = info["streams"][0]["channels"]
79
- sample_rate = info["streams"][0]["sample_rate"]
80
  if log:
81
  log.audio(f"音频信息: {channels}声道, {sample_rate}Hz")
82
 
 
4
 
5
  logger = logging.getLogger(__name__)
6
 
7
+ import av
8
  import torch
9
 
10
  from configs.config import Config
 
20
  config = Config()
21
 
22
 
23
+ def _read_audio_metadata(path: str) -> tuple[int, str]:
24
+ with av.open(path) as container:
25
+ stream = next((candidate for candidate in container.streams if candidate.type == "audio"), None)
26
+ if stream is None:
27
+ raise RuntimeError(f"No audio stream found in {path}")
28
+
29
+ codec_context = stream.codec_context
30
+ channels = getattr(codec_context, "channels", None)
31
+ if not channels and getattr(stream, "layout", None) is not None:
32
+ channels = getattr(stream.layout, "nb_channels", None)
33
+
34
+ sample_rate = getattr(codec_context, "sample_rate", None) or getattr(stream, "sample_rate", None)
35
+ if channels is None or sample_rate is None:
36
+ raise RuntimeError(f"Unable to read audio metadata from {path}")
37
+
38
+ return int(channels), str(sample_rate)
39
+
40
+
41
  def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
42
  infos = []
43
  try:
 
92
  need_reformat = 1
93
  done = 0
94
  try:
95
+ channels, sample_rate = _read_audio_metadata(inp_path)
 
 
96
  if log:
97
  log.audio(f"音频信息: {channels}声道, {sample_rate}Hz")
98
 
infer/modules/vc/pipeline.py CHANGED
@@ -29,6 +29,11 @@ except ImportError:
29
  log = None
30
 
31
  from lib.audio import soft_clip
 
 
 
 
 
32
 
33
  bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
34
 
@@ -372,6 +377,12 @@ class Pipeline(object):
372
  self.crepe_pd_threshold = float(getattr(config, "crepe_pd_threshold", 0.1))
373
  self.crepe_force_ratio = float(getattr(config, "crepe_force_ratio", 0.05))
374
  self.crepe_replace_semitones = float(getattr(config, "crepe_replace_semitones", 0.0))
 
 
 
 
 
 
375
  self.f0_fallback_context_radius = int(getattr(config, "f0_fallback_context_radius", 24))
376
  self.f0_fallback_repair_gap = int(getattr(config, "f0_fallback_repair_gap", 12))
377
  self.f0_fallback_post_gap = int(getattr(config, "f0_fallback_post_gap", 10))
@@ -394,6 +405,12 @@ class Pipeline(object):
394
  self.crepe_pd_threshold = 0.0
395
  if self.crepe_replace_semitones < 0:
396
  self.crepe_replace_semitones = 0.0
 
 
 
 
 
 
397
  if self.f0_fallback_context_radius < 1:
398
  self.f0_fallback_context_radius = 1
399
  if self.f0_fallback_repair_gap < 0:
@@ -421,6 +438,10 @@ class Pipeline(object):
421
  f"F0混合: {self.f0_hybrid_mode}, CREPE阈值: {self.crepe_pd_threshold}, "
422
  f"强制比率: {self.crepe_force_ratio}, 替换阈值(半音): {self.crepe_replace_semitones}"
423
  )
 
 
 
 
424
  log.detail(
425
  f"F0兜底: 上下文半径={self.f0_fallback_context_radius}, "
426
  f"预修补长度={self.f0_fallback_repair_gap}, 后修补长度={self.f0_fallback_post_gap}, "
@@ -466,12 +487,12 @@ class Pipeline(object):
466
  log.detail(f"时间步长: {time_step:.2f}ms, F0范围: {f0_min}-{f0_max}Hz")
467
  log.detail(f"音频长度: {len(x)} 样本, p_len: {p_len}")
468
 
469
- # hybrid映射到rmvpe+crepe模式
470
  if f0_method == "hybrid":
471
  f0_method = "rmvpe"
472
- # 临时设置hybrid模式
473
  original_hybrid_mode = self.f0_hybrid_mode
474
- self.f0_hybrid_mode = "rmvpe+crepe"
 
475
  restore_hybrid_mode = True
476
  else:
477
  restore_hybrid_mode = False
@@ -704,7 +725,19 @@ class Pipeline(object):
704
  else:
705
  f0_fb = f0_fb[: len(f0)]
706
 
707
- fill_mask = need_fill & (f0_fb > 0)
 
 
 
 
 
 
 
 
 
 
 
 
708
  f0[fill_mask] = f0_fb[fill_mask]
709
 
710
  need_fill2 = (f0 <= 0) & energy_mask & voiced_context
@@ -923,40 +956,58 @@ class Pipeline(object):
923
 
924
  _energy_db = 20.0 * np.log10(_frame_rms + 1e-8)
925
  _ref = energy_ref_db if energy_ref_db is not None else float(np.percentile(_energy_db, 95))
926
- # Soft gate: sigmoid curve centered at ref-45dB with 6dB transition width.
927
- # Frames well above threshold → gain≈1; frames well below → gain≈0.05
928
- # (keep a small floor to avoid zero-feature shock to the network).
929
- _silence_center = _ref - 45.0
930
- _transition_width = 6.0 # dB for the sigmoid ramp
931
- _energy_gate = 1.0 / (1.0 + np.exp(-(_energy_db - _silence_center) / (_transition_width / 4.0)))
932
- # Apply floor: never fully zero features (network handles near-zero better than hard zero)
933
- _energy_gate = np.clip(_energy_gate, 0.05, 1.0)
 
 
 
 
 
 
 
 
 
 
934
  # Smooth temporally
935
  _sm = np.array([1, 2, 3, 2, 1], dtype=np.float32)
936
  _sm /= _sm.sum()
937
- _energy_gate = np.convolve(_energy_gate, _sm, mode='same')[:_p_len_val]
938
- _energy_gate = np.clip(_energy_gate, 0.05, 1.0)
 
 
 
 
 
 
 
 
939
 
940
  # Apply soft gate to features
941
  _feat_len = feats.shape[1]
942
- if len(_energy_gate) > _feat_len:
943
- _feat_gate = _energy_gate[:_feat_len]
944
- elif len(_energy_gate) < _feat_len:
945
- _feat_gate = np.pad(_energy_gate, (0, _feat_len - len(_energy_gate)), mode='constant', constant_values=1.0)
946
  else:
947
- _feat_gate = _energy_gate
948
  _gate_t = torch.from_numpy(_feat_gate.astype(np.float32)).to(feats.device).unsqueeze(0).unsqueeze(-1)
949
  feats = feats * _gate_t
950
 
951
  # F0 soft gating: consistently soft-attenuate both pitch confidence and pitch value
952
  if pitch is not None and pitchf is not None:
953
  _pitch_len = pitch.shape[1]
954
- if len(_energy_gate) > _pitch_len:
955
- _f0_gate = _energy_gate[:_pitch_len]
956
- elif len(_energy_gate) < _pitch_len:
957
- _f0_gate = np.pad(_energy_gate, (0, _pitch_len - len(_energy_gate)), mode='constant', constant_values=1.0)
958
  else:
959
- _f0_gate = _energy_gate
960
  _f0_gate_t = torch.from_numpy(_f0_gate.astype(np.float32)).to(pitch.device).unsqueeze(0)
961
  pitchf = pitchf * _f0_gate_t
962
  # Soft-blend pitch toward silence bin (1) instead of hard switch
@@ -1075,6 +1126,9 @@ class Pipeline(object):
1075
  )
1076
  if log:
1077
  log.detail(f"分段数量: {len(opt_ts) + 1}")
 
 
 
1078
  else:
1079
  if log:
1080
  if self.disable_chunking:
@@ -1140,10 +1194,14 @@ class Pipeline(object):
1140
  segment_count = len(opt_ts) + 1
1141
  current_segment = 0
1142
 
1143
- # Crossfade length at target rate (~12ms). Each boundary segment
1144
- # keeps this many extra samples from the normally-trimmed padding
1145
- # region. The overlap between adjacent segments is 2 * _xfade_tgt.
1146
- _xfade_tgt = min(int(0.012 * tgt_sr), self.t_pad_tgt // 4) if len(opt_ts) > 0 else 0
 
 
 
 
1147
 
1148
  def _trim_segment(raw, is_first, is_last):
1149
  """Trim padding from vc() output, keeping crossfade overlap."""
@@ -1241,8 +1299,9 @@ class Pipeline(object):
1241
  for seg in audio_opt[1:]:
1242
  xf = min(overlap, len(result), len(seg))
1243
  if xf > 1:
1244
- fade_out = np.linspace(1.0, 0.0, xf, dtype=np.float32)
1245
- fade_in = 1.0 - fade_out
 
1246
  blended = result[-xf:] * fade_out + seg[:xf] * fade_in
1247
  result = np.concatenate([result[:-xf], blended, seg[xf:]])
1248
  else:
 
29
  log = None
30
 
31
  from lib.audio import soft_clip
32
+ from infer.quality_policy import (
33
+ build_conservative_harvest_fill_mask,
34
+ compute_breath_preserving_energy_gates,
35
+ compute_chunk_crossfade_samples,
36
+ )
37
 
38
  bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
39
 
 
377
  self.crepe_pd_threshold = float(getattr(config, "crepe_pd_threshold", 0.1))
378
  self.crepe_force_ratio = float(getattr(config, "crepe_force_ratio", 0.05))
379
  self.crepe_replace_semitones = float(getattr(config, "crepe_replace_semitones", 0.0))
380
+ self.unvoiced_feature_gate_floor = float(
381
+ getattr(config, "unvoiced_feature_gate_floor", 0.28)
382
+ )
383
+ self.breath_active_margin_db = float(
384
+ getattr(config, "breath_active_margin_db", 52.0)
385
+ )
386
  self.f0_fallback_context_radius = int(getattr(config, "f0_fallback_context_radius", 24))
387
  self.f0_fallback_repair_gap = int(getattr(config, "f0_fallback_repair_gap", 12))
388
  self.f0_fallback_post_gap = int(getattr(config, "f0_fallback_post_gap", 10))
 
405
  self.crepe_pd_threshold = 0.0
406
  if self.crepe_replace_semitones < 0:
407
  self.crepe_replace_semitones = 0.0
408
+ if self.unvoiced_feature_gate_floor < 0.05:
409
+ self.unvoiced_feature_gate_floor = 0.05
410
+ if self.unvoiced_feature_gate_floor > 1.0:
411
+ self.unvoiced_feature_gate_floor = 1.0
412
+ if self.breath_active_margin_db < 1.0:
413
+ self.breath_active_margin_db = 1.0
414
  if self.f0_fallback_context_radius < 1:
415
  self.f0_fallback_context_radius = 1
416
  if self.f0_fallback_repair_gap < 0:
 
438
  f"F0混合: {self.f0_hybrid_mode}, CREPE阈值: {self.crepe_pd_threshold}, "
439
  f"强制比率: {self.crepe_force_ratio}, 替换阈值(半音): {self.crepe_replace_semitones}"
440
  )
441
+ log.detail(
442
+ f"气声门控: 特征地板={self.unvoiced_feature_gate_floor:.2f}, "
443
+ f"激活边界=ref-{self.breath_active_margin_db:.1f}dB"
444
+ )
445
  log.detail(
446
  f"F0兜底: 上下文半径={self.f0_fallback_context_radius}, "
447
  f"预修补长度={self.f0_fallback_repair_gap}, 后修补长度={self.f0_fallback_post_gap}, "
 
487
  log.detail(f"时间步长: {time_step:.2f}ms, F0范围: {f0_min}-{f0_max}Hz")
488
  log.detail(f"音频长度: {len(x)} 样本, p_len: {p_len}")
489
 
490
+ # Normalize direct hybrid requests to the conservative RMVPE fallback path.
491
  if f0_method == "hybrid":
492
  f0_method = "rmvpe"
 
493
  original_hybrid_mode = self.f0_hybrid_mode
494
+ if self.f0_hybrid_mode not in self.rmvpe_fallback_modes:
495
+ self.f0_hybrid_mode = "fallback"
496
  restore_hybrid_mode = True
497
  else:
498
  restore_hybrid_mode = False
 
725
  else:
726
  f0_fb = f0_fb[: len(f0)]
727
 
728
+ fill_mask = build_conservative_harvest_fill_mask(
729
+ reference_f0=f0,
730
+ fallback_f0=f0_fb,
731
+ dropout_mask=need_fill,
732
+ max_run=10,
733
+ local_radius=4,
734
+ max_semitones=4.0,
735
+ )
736
+ rejected_fill_count = int(np.sum(need_fill)) - int(np.sum(fill_mask))
737
+ if rejected_fill_count > 0 and log:
738
+ log.detail(
739
+ f"Harvest兜底已拒绝 {rejected_fill_count} 帧长间隙/离群音高,避免呼吸与回气误赋音高"
740
+ )
741
  f0[fill_mask] = f0_fb[fill_mask]
742
 
743
  need_fill2 = (f0 <= 0) & energy_mask & voiced_context
 
956
 
957
  _energy_db = 20.0 * np.log10(_frame_rms + 1e-8)
958
  _ref = energy_ref_db if energy_ref_db is not None else float(np.percentile(_energy_db, 95))
959
+ _unvoiced_mask = None
960
+ if pitchf is not None:
961
+ _pitchf_np = pitchf[0].detach().float().cpu().numpy()
962
+ if len(_pitchf_np) > _p_len_val:
963
+ _pitchf_np = _pitchf_np[:_p_len_val]
964
+ elif len(_pitchf_np) < _p_len_val:
965
+ _pitchf_np = np.pad(_pitchf_np, (0, _p_len_val - len(_pitchf_np)), mode="edge")
966
+ _unvoiced_mask = _pitchf_np <= 1e-4
967
+
968
+ _feat_gate_curve, _f0_gate_curve = compute_breath_preserving_energy_gates(
969
+ energy_db=_energy_db,
970
+ ref_db=_ref,
971
+ unvoiced_mask=_unvoiced_mask,
972
+ quiet_floor=0.05,
973
+ breath_floor=self.unvoiced_feature_gate_floor,
974
+ breath_active_margin_db=self.breath_active_margin_db,
975
+ transition_width_db=6.0,
976
+ )
977
  # Smooth temporally
978
  _sm = np.array([1, 2, 3, 2, 1], dtype=np.float32)
979
  _sm /= _sm.sum()
980
+ _feat_gate_curve = np.convolve(_feat_gate_curve, _sm, mode='same')[:_p_len_val]
981
+ _f0_gate_curve = np.convolve(_f0_gate_curve, _sm, mode='same')[:_p_len_val]
982
+ _feat_gate_curve = np.clip(_feat_gate_curve, 0.05, 1.0)
983
+ _f0_gate_curve = np.clip(_f0_gate_curve, 0.05, 1.0)
984
+ if log and _unvoiced_mask is not None:
985
+ _breath_frames = int(np.sum(_feat_gate_curve > (_f0_gate_curve + 1e-4)))
986
+ if _breath_frames > 0:
987
+ log.detail(
988
+ f"气声保护门控: { _breath_frames }/{ _p_len_val } 帧保留更高特征底噪"
989
+ )
990
 
991
  # Apply soft gate to features
992
  _feat_len = feats.shape[1]
993
+ if len(_feat_gate_curve) > _feat_len:
994
+ _feat_gate = _feat_gate_curve[:_feat_len]
995
+ elif len(_feat_gate_curve) < _feat_len:
996
+ _feat_gate = np.pad(_feat_gate_curve, (0, _feat_len - len(_feat_gate_curve)), mode='constant', constant_values=1.0)
997
  else:
998
+ _feat_gate = _feat_gate_curve
999
  _gate_t = torch.from_numpy(_feat_gate.astype(np.float32)).to(feats.device).unsqueeze(0).unsqueeze(-1)
1000
  feats = feats * _gate_t
1001
 
1002
  # F0 soft gating: consistently soft-attenuate both pitch confidence and pitch value
1003
  if pitch is not None and pitchf is not None:
1004
  _pitch_len = pitch.shape[1]
1005
+ if len(_f0_gate_curve) > _pitch_len:
1006
+ _f0_gate = _f0_gate_curve[:_pitch_len]
1007
+ elif len(_f0_gate_curve) < _pitch_len:
1008
+ _f0_gate = np.pad(_f0_gate_curve, (0, _pitch_len - len(_f0_gate_curve)), mode='constant', constant_values=1.0)
1009
  else:
1010
+ _f0_gate = _f0_gate_curve
1011
  _f0_gate_t = torch.from_numpy(_f0_gate.astype(np.float32)).to(pitch.device).unsqueeze(0)
1012
  pitchf = pitchf * _f0_gate_t
1013
  # Soft-blend pitch toward silence bin (1) instead of hard switch
 
1126
  )
1127
  if log:
1128
  log.detail(f"分段数量: {len(opt_ts) + 1}")
1129
+ if opt_ts:
1130
+ boundary_times = [round(float(t) / 16000.0, 3) for t in opt_ts[:12]]
1131
+ log.detail(f"分段边界(秒): {boundary_times}")
1132
  else:
1133
  if log:
1134
  if self.disable_chunking:
 
1194
  segment_count = len(opt_ts) + 1
1195
  current_segment = 0
1196
 
1197
+ # Keep a wider overlap than the legacy 12ms crossfade. Adjacent
1198
+ # chunks are synthesized independently and benefit from a longer,
1199
+ # equal-power merge to suppress tearing at the boundaries.
1200
+ _xfade_tgt = compute_chunk_crossfade_samples(
1201
+ tgt_sr=tgt_sr,
1202
+ t_pad_tgt=self.t_pad_tgt,
1203
+ segment_count=segment_count,
1204
+ )
1205
 
1206
  def _trim_segment(raw, is_first, is_last):
1207
  """Trim padding from vc() output, keeping crossfade overlap."""
 
1299
  for seg in audio_opt[1:]:
1300
  xf = min(overlap, len(result), len(seg))
1301
  if xf > 1:
1302
+ theta = np.linspace(0.0, np.pi / 2.0, xf, dtype=np.float32)
1303
+ fade_out = np.cos(theta)
1304
+ fade_in = np.sin(theta)
1305
  blended = result[-xf:] * fade_out + seg[:xf] * fade_in
1306
  result = np.concatenate([result[:-xf], blended, seg[xf:]])
1307
  else:
infer/official_adapter.py CHANGED
@@ -16,9 +16,22 @@ from typing import Optional, Tuple
16
  import soundfile as sf
17
 
18
  from configs.config import Config as OfficialConfig
 
19
  from lib.logger import log
20
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def _load_app_config(root_dir: Path) -> dict:
23
  config_path = root_dir / "configs" / "config.json"
24
  if config_path.exists():
@@ -41,6 +54,17 @@ def _to_float(value, default):
41
  return float(default)
42
 
43
 
 
 
 
 
 
 
 
 
 
 
 
44
  def _resolve_index_path(model_path: Path, index_path: Optional[str]) -> Optional[Path]:
45
  """Best-effort resolve of the matching FAISS index for a model."""
46
  if index_path:
@@ -189,13 +213,15 @@ def separate_uvr5(
189
  fmt: str = "wav"
190
  ) -> Tuple[str, str]:
191
  """Run UVR5 separation and return vocals/ins paths."""
 
192
  log.progress("开始UVR5人声分离...")
193
  log.detail(f"输入音频: {input_audio}")
194
  log.detail(f"临时目录: {temp_dir}")
195
  log.config(f"激进度: {agg}, 输出格式: {fmt}")
196
 
197
  setup_official_env(Path(__file__).parent.parent)
198
- from infer.modules.uvr5.modules import uvr as uvr5_run
 
199
  try:
200
  import ffmpeg # noqa: F401
201
  except Exception as e:
@@ -246,6 +272,26 @@ def separate_uvr5(
246
  if not vocal_files or not ins_files:
247
  raise RuntimeError("UVR5 分离失败,未生成输出文件")
248
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  log.success(f"UVR5分离完成")
250
  log.audio(f"人声文件: {vocal_files[-1].name}")
251
  log.audio(f"伴奏文件: {ins_files[-1].name}")
@@ -271,6 +317,14 @@ def convert_vocals_official(
271
  rms_mix_rate = float(max(0.0, min(1.0, rms_mix_rate)))
272
  # Official vc pipeline uses the opposite convention: 1=off, 0=strongest.
273
  official_rms_mix_rate = 1.0 - rms_mix_rate
 
 
 
 
 
 
 
 
274
 
275
  log.progress("开始官方VC人声转换...")
276
  log.detail(f"输入人声: {vocals_path}")
@@ -280,6 +334,14 @@ def convert_vocals_official(
280
  log.model(f"索引文件: {Path(index_path).name}")
281
 
282
  log.config(f"F0方法: {f0_method}")
 
 
 
 
 
 
 
 
283
  log.config(f"音调偏移: {pitch_shift} 半音")
284
  log.config(f"索引率: {index_rate}")
285
  log.config(f"滤波半径: {filter_radius}")
@@ -287,8 +349,6 @@ def convert_vocals_official(
287
  log.config(f"保护系数: {protect}")
288
  if repair_profile:
289
  log.config("唱歌修复: 开启")
290
-
291
- root_dir = Path(__file__).parent.parent
292
  env_paths = setup_official_env(root_dir)
293
 
294
  log.detail("导入官方VC模块...")
@@ -306,7 +366,6 @@ def convert_vocals_official(
306
 
307
  log.detail("初始化官方配置...")
308
  config = OfficialConfig()
309
- app_cfg = _load_app_config(root_dir)
310
  config.disable_chunking = bool(app_cfg.get("disable_chunking", False))
311
  if "cover" in app_cfg and isinstance(app_cfg["cover"], dict):
312
  config.disable_chunking = bool(app_cfg["cover"].get("disable_chunking", config.disable_chunking))
@@ -317,7 +376,7 @@ def convert_vocals_official(
317
  # Keep RMVPE extraction aligned with RVC training pitch embedding range.
318
  # Allowing much wider ranges (e.g. 1600Hz) often tracks higher harmonics
319
  # instead of the fundamental and introduces synthetic buzzing artifacts.
320
- if f0_method == "rmvpe":
321
  if config.f0_min != 50.0 or config.f0_max != 1100.0:
322
  log.warning(
323
  "检测到RMVPE F0范围偏离RVC训练范围,已强制使用 50-1100Hz 以避免误跟踪高次谐波"
@@ -328,7 +387,7 @@ def convert_vocals_official(
328
  config.f0_energy_threshold_db = _to_float(
329
  _get_cfg_value(app_cfg, "f0_energy_threshold_db", -50), -50
330
  )
331
- config.f0_hybrid_mode = str(_get_cfg_value(app_cfg, "f0_hybrid_mode", "off"))
332
  config.crepe_pd_threshold = _to_float(
333
  _get_cfg_value(app_cfg, "crepe_pd_threshold", 0.1), 0.1
334
  )
@@ -338,6 +397,12 @@ def convert_vocals_official(
338
  config.crepe_replace_semitones = _to_float(
339
  _get_cfg_value(app_cfg, "crepe_replace_semitones", 0.0), 0.0
340
  )
 
 
 
 
 
 
341
  config.f0_stabilize = bool(_get_cfg_value(app_cfg, "f0_stabilize", False))
342
  config.f0_stabilize_window = int(_get_cfg_value(app_cfg, "f0_stabilize_window", 2))
343
  config.f0_stabilize_max_semitones = _to_float(
@@ -348,10 +413,32 @@ def convert_vocals_official(
348
  config.f0_rate_limit_semitones = _to_float(
349
  _get_cfg_value(app_cfg, "f0_rate_limit_semitones", 8.0), 8.0
350
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
  if repair_profile:
352
  config.is_half = False
353
  config.f0_hybrid_mode = "fallback"
354
  config.f0_energy_threshold_db = -42.0
 
 
 
355
  config.f0_fallback_context_radius = 12
356
  config.f0_fallback_repair_gap = 6
357
  config.f0_fallback_post_gap = 4
@@ -365,6 +452,10 @@ def convert_vocals_official(
365
  log.config(f"F0范围: {config.f0_min}-{config.f0_max}Hz")
366
  log.config(f"RMVPE阈值: {config.rmvpe_threshold}")
367
  log.config(f"F0能量阈值: {config.f0_energy_threshold_db}dB")
 
 
 
 
368
  log.config(
369
  f"F0混合: {config.f0_hybrid_mode}, CREPE阈值: {config.crepe_pd_threshold}, "
370
  f"强制比率: {config.crepe_force_ratio}, 替换阈值(半音): {config.crepe_replace_semitones}"
@@ -402,7 +493,7 @@ def convert_vocals_official(
402
  vocals_path,
403
  pitch_shift,
404
  None,
405
- f0_method,
406
  official_index or "",
407
  "",
408
  index_rate,
@@ -513,6 +604,13 @@ def convert_vocals_official_upstream(
513
  ) -> str:
514
  """Run vendored upstream official RVC in an isolated subprocess."""
515
  root_dir = Path(__file__).parent.parent
 
 
 
 
 
 
 
516
  env_paths = setup_upstream_official_env(root_dir)
517
 
518
  sid, official_index = export_model_to_official(
@@ -537,7 +635,7 @@ def convert_vocals_official_upstream(
537
  "--output-path",
538
  str(output_path),
539
  "--f0-method",
540
- str(f0_method),
541
  "--pitch-shift",
542
  str(int(pitch_shift)),
543
  "--index-path",
@@ -558,6 +656,14 @@ def convert_vocals_official_upstream(
558
  log.detail(f"官方模型SID: {sid}")
559
  if official_index:
560
  log.detail(f"官方索引路径: {official_index}")
 
 
 
 
 
 
 
 
561
  log.detail(f"官方RMS混合率: {official_rms_mix_rate}")
562
 
563
  try:
 
16
  import soundfile as sf
17
 
18
  from configs.config import Config as OfficialConfig
19
+ from infer.quality_policy import resolve_cover_f0_policy
20
  from lib.logger import log
21
 
22
 
23
class _IsolatedArgv:
    """Context manager that hides this process' CLI arguments.

    While the ``with`` body runs, ``sys.argv`` is reduced to (at most) the
    program name so vendored official modules imported inside the body do
    not parse this process' own command line. The previous argument list is
    restored on exit, whether or not the body raised.
    """

    def __enter__(self):
        # Store a copy so in-place edits to the live list made inside the
        # body cannot corrupt what gets restored.
        self._saved_argv = sys.argv[:]
        # Slice rather than index: ``sys.argv[0]`` would raise IndexError if
        # argv is empty (possible when the interpreter is embedded).
        sys.argv = sys.argv[:1]
        return self

    def __exit__(self, exc_type, exc, tb):
        sys.argv = self._saved_argv
        # Returning False lets any exception from the body propagate.
        return False
33
+
34
+
35
  def _load_app_config(root_dir: Path) -> dict:
36
  config_path = root_dir / "configs" / "config.json"
37
  if config_path.exists():
 
54
  return float(default)
55
 
56
 
57
def _get_audio_activity_stats(audio_path: Path) -> Tuple[float, float, int]:
    """Return ``(rms, peak, nonzero_count)`` for the audio file at *audio_path*.

    The file is read as float32 with channels on axis 1 and averaged down to
    a single channel before measuring. An empty file yields ``(0.0, 0.0, 0)``.
    ``nonzero_count`` is the number of samples whose magnitude exceeds 1e-8.
    """
    samples, _sample_rate = sf.read(str(audio_path), dtype="float32", always_2d=True)
    if samples.size == 0:
        return 0.0, 0.0, 0

    downmix = samples.mean(axis=1)
    magnitude = abs(downmix)
    # Accumulate the mean square in float64 to limit rounding error.
    mean_square = (downmix.astype("float64") ** 2).mean()
    return (
        float(mean_square ** 0.5),
        float(magnitude.max()),
        int((magnitude > 1e-8).sum()),
    )
66
+
67
+
68
  def _resolve_index_path(model_path: Path, index_path: Optional[str]) -> Optional[Path]:
69
  """Best-effort resolve of the matching FAISS index for a model."""
70
  if index_path:
 
213
  fmt: str = "wav"
214
  ) -> Tuple[str, str]:
215
  """Run UVR5 separation and return vocals/ins paths."""
216
+ temp_dir = Path(temp_dir).resolve()
217
  log.progress("开始UVR5人声分离...")
218
  log.detail(f"输入音频: {input_audio}")
219
  log.detail(f"临时目录: {temp_dir}")
220
  log.config(f"激进度: {agg}, 输出格式: {fmt}")
221
 
222
  setup_official_env(Path(__file__).parent.parent)
223
+ with _IsolatedArgv():
224
+ from infer.modules.uvr5.modules import uvr as uvr5_run
225
  try:
226
  import ffmpeg # noqa: F401
227
  except Exception as e:
 
272
  if not vocal_files or not ins_files:
273
  raise RuntimeError("UVR5 分离失败,未生成输出文件")
274
 
275
+ vocal_rms, vocal_peak, vocal_nonzero = _get_audio_activity_stats(vocal_files[-1])
276
+ ins_rms, ins_peak, ins_nonzero = _get_audio_activity_stats(ins_files[-1])
277
+ log.detail(
278
+ "UVR5输出统计: "
279
+ f"vocals(rms={vocal_rms:.8f}, peak={vocal_peak:.8f}, nonzero={vocal_nonzero}), "
280
+ f"ins(rms={ins_rms:.8f}, peak={ins_peak:.8f}, nonzero={ins_nonzero})"
281
+ )
282
+ if vocal_peak <= 1e-6 or vocal_nonzero == 0:
283
+ raise RuntimeError(
284
+ "UVR5 分离失败:人声输出为空或静音,"
285
+ f"模型={model_name}, vocal={vocal_files[-1]}, "
286
+ f"rms={vocal_rms:.8f}, peak={vocal_peak:.8f}, nonzero={vocal_nonzero}"
287
+ )
288
+ if ins_peak <= 1e-6 or ins_nonzero == 0:
289
+ raise RuntimeError(
290
+ "UVR5 分离失败:伴奏输出为空或静音,"
291
+ f"模型={model_name}, instrumental={ins_files[-1]}, "
292
+ f"rms={ins_rms:.8f}, peak={ins_peak:.8f}, nonzero={ins_nonzero}"
293
+ )
294
+
295
  log.success(f"UVR5分离完成")
296
  log.audio(f"人声文件: {vocal_files[-1].name}")
297
  log.audio(f"伴奏文件: {ins_files[-1].name}")
 
317
  rms_mix_rate = float(max(0.0, min(1.0, rms_mix_rate)))
318
  # Official vc pipeline uses the opposite convention: 1=off, 0=strongest.
319
  official_rms_mix_rate = 1.0 - rms_mix_rate
320
+ root_dir = Path(__file__).parent.parent
321
+ app_cfg = _load_app_config(root_dir)
322
+ f0_policy = resolve_cover_f0_policy(
323
+ requested_method=f0_method,
324
+ configured_hybrid_mode=str(_get_cfg_value(app_cfg, "f0_hybrid_mode", "off")),
325
+ repair_profile=repair_profile,
326
+ )
327
+ effective_f0_method = f0_policy.vc_method
328
 
329
  log.progress("开始官方VC人声转换...")
330
  log.detail(f"输入人声: {vocals_path}")
 
334
  log.model(f"索引文件: {Path(index_path).name}")
335
 
336
  log.config(f"F0方法: {f0_method}")
337
+ if effective_f0_method != str(f0_method).strip().lower():
338
+ log.detail(
339
+ "F0路由解析: "
340
+ f"requested={f0_policy.requested_method}, "
341
+ f"vc={f0_policy.vc_method}, "
342
+ f"hybrid_mode={f0_policy.hybrid_mode}, "
343
+ f"gate={f0_policy.gate_method}"
344
+ )
345
  log.config(f"音调偏移: {pitch_shift} 半音")
346
  log.config(f"索引率: {index_rate}")
347
  log.config(f"滤波半径: {filter_radius}")
 
349
  log.config(f"保护系数: {protect}")
350
  if repair_profile:
351
  log.config("唱歌修复: 开启")
 
 
352
  env_paths = setup_official_env(root_dir)
353
 
354
  log.detail("导入官方VC模块...")
 
366
 
367
  log.detail("初始化官方配置...")
368
  config = OfficialConfig()
 
369
  config.disable_chunking = bool(app_cfg.get("disable_chunking", False))
370
  if "cover" in app_cfg and isinstance(app_cfg["cover"], dict):
371
  config.disable_chunking = bool(app_cfg["cover"].get("disable_chunking", config.disable_chunking))
 
376
  # Keep RMVPE extraction aligned with RVC training pitch embedding range.
377
  # Allowing much wider ranges (e.g. 1600Hz) often tracks higher harmonics
378
  # instead of the fundamental and introduces synthetic buzzing artifacts.
379
+ if effective_f0_method == "rmvpe":
380
  if config.f0_min != 50.0 or config.f0_max != 1100.0:
381
  log.warning(
382
  "检测到RMVPE F0范围偏离RVC训练范围,已强制使用 50-1100Hz 以避免误跟踪高次谐波"
 
387
  config.f0_energy_threshold_db = _to_float(
388
  _get_cfg_value(app_cfg, "f0_energy_threshold_db", -50), -50
389
  )
390
+ config.f0_hybrid_mode = f0_policy.hybrid_mode
391
  config.crepe_pd_threshold = _to_float(
392
  _get_cfg_value(app_cfg, "crepe_pd_threshold", 0.1), 0.1
393
  )
 
397
  config.crepe_replace_semitones = _to_float(
398
  _get_cfg_value(app_cfg, "crepe_replace_semitones", 0.0), 0.0
399
  )
400
+ config.unvoiced_feature_gate_floor = _to_float(
401
+ _get_cfg_value(app_cfg, "unvoiced_feature_gate_floor", 0.28), 0.28
402
+ )
403
+ config.breath_active_margin_db = _to_float(
404
+ _get_cfg_value(app_cfg, "breath_active_margin_db", 52.0), 52.0
405
+ )
406
  config.f0_stabilize = bool(_get_cfg_value(app_cfg, "f0_stabilize", False))
407
  config.f0_stabilize_window = int(_get_cfg_value(app_cfg, "f0_stabilize_window", 2))
408
  config.f0_stabilize_max_semitones = _to_float(
 
413
  config.f0_rate_limit_semitones = _to_float(
414
  _get_cfg_value(app_cfg, "f0_rate_limit_semitones", 8.0), 8.0
415
  )
416
+ if f0_policy.requested_method == "hybrid":
417
+ config.f0_fallback_context_radius = 12
418
+ config.f0_fallback_repair_gap = 6
419
+ config.f0_fallback_post_gap = 4
420
+ config.f0_fallback_use_crepe = True
421
+ config.f0_fallback_crepe_max_ratio = 0.006
422
+ config.f0_fallback_crepe_max_frames = 160
423
+ if not repair_profile:
424
+ config.f0_stabilize = True
425
+ config.f0_rate_limit = True
426
+ config.f0_stabilize_max_semitones = min(
427
+ 5.0,
428
+ float(config.f0_stabilize_max_semitones),
429
+ )
430
+ config.f0_rate_limit_semitones = min(
431
+ 7.0,
432
+ float(config.f0_rate_limit_semitones),
433
+ )
434
+ log.detail("Hybrid唱歌护栏已启用: F0稳定器 + F0限速")
435
  if repair_profile:
436
  config.is_half = False
437
  config.f0_hybrid_mode = "fallback"
438
  config.f0_energy_threshold_db = -42.0
439
+ config.unvoiced_feature_gate_floor = max(
440
+ 0.32, float(config.unvoiced_feature_gate_floor)
441
+ )
442
  config.f0_fallback_context_radius = 12
443
  config.f0_fallback_repair_gap = 6
444
  config.f0_fallback_post_gap = 4
 
452
  log.config(f"F0范围: {config.f0_min}-{config.f0_max}Hz")
453
  log.config(f"RMVPE阈值: {config.rmvpe_threshold}")
454
  log.config(f"F0能量阈值: {config.f0_energy_threshold_db}dB")
455
+ log.config(
456
+ f"无声气声特征地板: {config.unvoiced_feature_gate_floor:.2f}, "
457
+ f"呼吸激活边界: ref-{config.breath_active_margin_db:.1f}dB"
458
+ )
459
  log.config(
460
  f"F0混合: {config.f0_hybrid_mode}, CREPE阈值: {config.crepe_pd_threshold}, "
461
  f"强制比率: {config.crepe_force_ratio}, 替换阈值(半音): {config.crepe_replace_semitones}"
 
493
  vocals_path,
494
  pitch_shift,
495
  None,
496
+ effective_f0_method,
497
  official_index or "",
498
  "",
499
  index_rate,
 
604
  ) -> str:
605
  """Run vendored upstream official RVC in an isolated subprocess."""
606
  root_dir = Path(__file__).parent.parent
607
+ app_cfg = _load_app_config(root_dir)
608
+ f0_policy = resolve_cover_f0_policy(
609
+ requested_method=f0_method,
610
+ configured_hybrid_mode=str(_get_cfg_value(app_cfg, "f0_hybrid_mode", "off")),
611
+ repair_profile=False,
612
+ )
613
+ effective_f0_method = f0_policy.vc_method
614
  env_paths = setup_upstream_official_env(root_dir)
615
 
616
  sid, official_index = export_model_to_official(
 
635
  "--output-path",
636
  str(output_path),
637
  "--f0-method",
638
+ str(effective_f0_method),
639
  "--pitch-shift",
640
  str(int(pitch_shift)),
641
  "--index-path",
 
656
  log.detail(f"官方模型SID: {sid}")
657
  if official_index:
658
  log.detail(f"官方索引路径: {official_index}")
659
+ if effective_f0_method != str(f0_method).strip().lower():
660
+ log.detail(
661
+ "官方F0路由解析: "
662
+ f"requested={f0_policy.requested_method}, "
663
+ f"vc={f0_policy.vc_method}, "
664
+ f"hybrid_mode={f0_policy.hybrid_mode}, "
665
+ f"gate={f0_policy.gate_method}"
666
+ )
667
  log.detail(f"官方RMS混合率: {official_rms_mix_rate}")
668
 
669
  try:
infer/quality_policy.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+ import numpy as np
6
+
7
+
8
# Hybrid-mode spellings that resolve_cover_f0_policy keeps unchanged for a
# "hybrid" request; any other configured value is replaced by "fallback".
FALLBACK_HYBRID_MODES = {
    "fallback", "smart",
    "rmvpe+fallback", "rmvpe_fallback", "rmvpe-fallback",
    "hybrid_fallback", "hybrid-fallback",
}
17
+
18
+
19
@dataclass(frozen=True)
class F0RoutingPolicy:
    """Immutable record describing how a requested F0 method was routed."""

    # Requested F0 method after stripping and lower-casing.
    requested_method: str
    # F0 method name the voice-conversion step should actually run.
    vc_method: str
    # Hybrid/fallback mode string that accompanies ``vc_method``.
    hybrid_mode: str
    # F0 method associated with gating; NOTE(review): in the visible callers
    # this value is only logged -- confirm how downstream code consumes it.
    gate_method: str
    # Human-readable explanation of the routing decision.
    description: str
26
+
27
+
28
def resolve_cover_f0_policy(
    requested_method: str,
    configured_hybrid_mode: str = "off",
    repair_profile: bool = False,
) -> F0RoutingPolicy:
    """Resolve a requested F0 method into a concrete routing policy.

    Args:
        requested_method: F0 method name; falsy values default to "rmvpe".
            Compared after stripping whitespace and lower-casing.
        configured_hybrid_mode: Hybrid mode from configuration; falsy values
            default to "off". Normalized the same way.
        repair_profile: When true, a "hybrid" request always resolves to the
            "fallback" hybrid mode regardless of configuration.

    Returns:
        For any method other than "hybrid", a policy that passes the method
        and configured hybrid mode straight through. For "hybrid", a policy
        that runs "rmvpe" with a hybrid mode drawn from
        ``FALLBACK_HYBRID_MODES`` (or "fallback" if the configured value is
        not in that set).
    """
    method = str(requested_method or "rmvpe").strip().lower()
    mode = str(configured_hybrid_mode or "off").strip().lower()

    if method == "hybrid":
        # Keep the configured mode only when it is a recognised fallback
        # spelling and the repair profile is not forcing "fallback".
        keep_configured = (not repair_profile) and mode in FALLBACK_HYBRID_MODES
        return F0RoutingPolicy(
            requested_method="hybrid",
            vc_method="rmvpe",
            hybrid_mode=mode if keep_configured else "fallback",
            gate_method="rmvpe",
            description="hybrid request routed to RMVPE with conservative fallback; post-gate uses RMVPE only.",
        )

    # Non-hybrid requests are forwarded untouched.
    return F0RoutingPolicy(
        requested_method=method,
        vc_method=method,
        hybrid_mode=mode if mode else "off",
        gate_method=method,
        description=f"{method} uses the configured routing directly.",
    )
56
+
57
+
58
def build_conservative_crepe_fill_mask(
    f0_rmvpe: np.ndarray,
    f0_crepe: np.ndarray,
    confidence: np.ndarray,
    confidence_threshold: float,
    max_ratio: float = 0.02,
    max_frames: int = 320,
    context_radius: int = 6,
    energy_mask: np.ndarray | None = None,
) -> np.ndarray:
    """Select frames where a CREPE pitch may fill an RMVPE dropout.

    A frame is selected only if RMVPE reports no pitch (``<= 0``), CREPE
    reports a pitch (``> 0``) with confidence at or above the threshold, the
    energy mask is true, and an RMVPE-voiced frame exists within
    ``context_radius`` frames on BOTH sides. If the selected frames exceed
    ``max_frames`` or ``max_ratio`` of all frames, nothing is filled.

    Args:
        f0_rmvpe: RMVPE pitch track; flattened to 1-D.
        f0_crepe: CREPE pitch track; flattened to 1-D.
        confidence: Per-frame CREPE confidence; flattened to 1-D.
        confidence_threshold: Minimum confidence (negative values act as 0).
        max_ratio: Maximum allowed fraction of filled frames.
        max_frames: Maximum allowed number of filled frames.
        context_radius: Neighbourhood size in frames (at least 1).
        energy_mask: Optional boolean mask of frames eligible for filling.
            ``None`` or an empty mask means every frame is eligible; a
            shorter mask is extended with its last value, a longer one is
            truncated.

    Returns:
        Boolean mask whose length is the shortest of the three input tracks.
    """
    f0_rmvpe = np.asarray(f0_rmvpe, dtype=np.float32).reshape(-1)
    f0_crepe = np.asarray(f0_crepe, dtype=np.float32).reshape(-1)
    confidence = np.asarray(confidence, dtype=np.float32).reshape(-1)
    n = min(len(f0_rmvpe), len(f0_crepe), len(confidence))
    if n == 0:
        return np.zeros(0, dtype=bool)

    f0_rmvpe = f0_rmvpe[:n]
    f0_crepe = f0_crepe[:n]
    confidence = confidence[:n]
    threshold = max(0.0, float(confidence_threshold))
    context_radius = max(1, int(context_radius))
    max_ratio = max(0.0, float(max_ratio))
    max_frames = max(0, int(max_frames))

    if energy_mask is not None:
        energy_mask = np.asarray(energy_mask, dtype=bool).reshape(-1)
    if energy_mask is None or energy_mask.size == 0:
        # np.pad(mode="edge") cannot extend an empty array, so an empty mask
        # is treated the same as "no mask supplied".
        energy_mask = np.ones(n, dtype=bool)
    elif len(energy_mask) < n:
        energy_mask = np.pad(energy_mask, (0, n - len(energy_mask)), mode="edge")
    else:
        energy_mask = energy_mask[:n]

    voiced_seed = f0_rmvpe > 0
    if not np.any(voiced_seed):
        # Without any RMVPE-voiced frame there is no context to trust.
        return np.zeros(n, dtype=bool)

    # For every frame, find the nearest voiced frame index to the left and to
    # the right; the large sentinels make "none found" fail the radius test.
    idx = np.arange(n)
    left_seen = np.where(voiced_seed, idx, -10**9)
    left_seen = np.maximum.accumulate(left_seen)
    right_seen = np.where(voiced_seed, idx, 10**9)
    right_seen = np.minimum.accumulate(right_seen[::-1])[::-1]
    voiced_context = ((idx - left_seen) <= context_radius) & ((right_seen - idx) <= context_radius)

    fill_mask = (
        (f0_rmvpe <= 0)
        & (f0_crepe > 0)
        & (confidence >= threshold)
        & energy_mask
        & voiced_context
    )

    fill_count = int(np.sum(fill_mask))
    if fill_count == 0:
        return fill_mask

    # All-or-nothing budget: too many candidate fills means reject them all.
    fill_ratio = float(fill_count) / float(n)
    if fill_count > max_frames or fill_ratio > max_ratio:
        return np.zeros(n, dtype=bool)

    return fill_mask
120
+
121
+
122
def build_conservative_harvest_fill_mask(
    reference_f0: np.ndarray,
    fallback_f0: np.ndarray,
    dropout_mask: np.ndarray,
    max_run: int = 10,
    local_radius: int = 4,
    max_semitones: float = 4.0,
    min_neighbors: int = 2,
) -> np.ndarray:
    """Select dropout frames that may safely take the fallback pitch.

    Each contiguous run of true values in ``dropout_mask`` is judged on its
    own. A run is rejected outright when it is longer than ``max_run``, has
    no positive fallback pitch, or has fewer than ``min_neighbors`` positive
    reference pitches within ``local_radius`` frames of its edges. Inside an
    accepted run, a frame is selected when its fallback pitch is positive
    and lies within ``max_semitones`` of the median of those neighbours.

    Args:
        reference_f0: Primary pitch track; flattened to 1-D.
        fallback_f0: Fallback pitch track; flattened to 1-D.
        dropout_mask: Boolean mask of frames that need filling.
        max_run: Longest run (in frames) eligible for filling (at least 1).
        local_radius: Frames inspected on each side of a run (at least 1).
        max_semitones: Largest allowed distance from the local median.
        min_neighbors: Minimum positive neighbours required (at least 1).

    Returns:
        Boolean mask whose length is the shortest of the three inputs.
    """
    reference_f0 = np.asarray(reference_f0, dtype=np.float32).reshape(-1)
    fallback_f0 = np.asarray(fallback_f0, dtype=np.float32).reshape(-1)
    dropout_mask = np.asarray(dropout_mask, dtype=bool).reshape(-1)

    n = min(reference_f0.size, fallback_f0.size, dropout_mask.size)
    if n <= 0:
        return np.zeros(0, dtype=bool)

    reference_f0 = reference_f0[:n]
    fallback_f0 = fallback_f0[:n]
    dropout_mask = dropout_mask[:n]
    accepted = np.zeros(n, dtype=bool)

    max_run = max(1, int(max_run))
    local_radius = max(1, int(local_radius))
    max_semitones = max(0.0, float(max_semitones))
    min_neighbors = max(1, int(min_neighbors))

    # Pad with zeros so runs touching either end still yield a rising (+1)
    # and a falling (-1) edge; ``ends`` are exclusive indices.
    padded = np.pad(dropout_mask.astype(np.int8), (1, 1), mode="constant")
    edges = np.diff(padded)
    starts = np.where(edges == 1)[0]
    ends = np.where(edges == -1)[0]
    eps = 1e-6

    for start, end in zip(starts, ends):
        run_len = end - start
        if run_len <= 0 or run_len > max_run:
            continue

        run_fallback = fallback_f0[start:end]
        voiced_run = run_fallback > 0
        if not np.any(voiced_run):
            continue

        left = reference_f0[max(0, start - local_radius) : start]
        right = reference_f0[end : min(n, end + local_radius)]
        neighbors = np.concatenate([left[left > 0], right[right > 0]])
        if neighbors.size < min_neighbors:
            continue

        local_median = float(np.median(neighbors))
        if local_median <= 0:
            continue

        # Evaluate the log only on voiced frames: non-positive fallback
        # values can never be accepted and would otherwise make log2 emit
        # "invalid value" RuntimeWarnings.
        semitone_diff = np.abs(
            12.0 * np.log2((run_fallback[voiced_run] + eps) / (local_median + eps))
        )
        within_range = np.zeros(run_len, dtype=bool)
        within_range[voiced_run] = semitone_diff <= max_semitones
        accepted[start:end] = within_range

    return accepted
182
+
183
+
184
def compute_chunk_crossfade_samples(
    tgt_sr: int,
    t_pad_tgt: int,
    segment_count: int,
) -> int:
    """Return the crossfade length, in target-rate samples, between chunks.

    The length starts at 18 ms and grows by 2 ms for every segment beyond
    the second. It is clamped between 12 ms and the larger of 12 ms and one
    third of the padding, and finally capped at ``t_pad_tgt`` itself.

    Args:
        tgt_sr: Target sample rate in Hz.
        t_pad_tgt: Padding length in target-rate samples.
        segment_count: Number of chunks being joined.

    Returns:
        0 when there is nothing to crossfade (non-positive sample rate or
        padding, or at most one segment); otherwise a sample count that
        never exceeds ``t_pad_tgt``.
    """
    tgt_sr = int(max(tgt_sr, 0))
    t_pad_tgt = int(max(t_pad_tgt, 0))
    segment_count = int(max(segment_count, 0))
    if tgt_sr <= 0 or t_pad_tgt <= 0 or segment_count <= 1:
        return 0

    base = int(round(tgt_sr * 0.018))
    extra = int(round(tgt_sr * 0.002 * max(0, segment_count - 2)))
    min_crossfade = int(round(tgt_sr * 0.012))
    max_crossfade = max(min_crossfade, t_pad_tgt // 3)
    crossfade = int(np.clip(base + extra, min_crossfade, max_crossfade))
    # The 12 ms floor can exceed a very short padding. The overlap is
    # presumably taken from the padded region (TODO confirm against the
    # segment-trimming code), so never ask for more than the padding holds.
    return min(crossfade, t_pad_tgt)
200
+
201
+
202
def compute_active_source_replace(
    activity: np.ndarray,
    soft_mask: np.ndarray,
    echo_ratio: np.ndarray,
    direct_ratio: np.ndarray,
    max_replace: float = 0.82,
) -> np.ndarray:
    """Compute a per-bin replacement weight from activity and echo measures.

    ``activity`` and ``direct_ratio`` are flattened per-frame vectors;
    ``soft_mask`` and ``echo_ratio`` keep their shape (1-D inputs gain a
    leading axis) with frames on the last axis. All inputs are truncated to
    the shortest common frame count. The weight is the sum of an
    inactive-frame term and an active-frame echo term, both scaled by
    ``1 - soft_mask``, clipped to ``[0, max_replace]``.

    Returns:
        float32 array shaped like the (truncated) ``soft_mask``; if no
        common frame exists, zeros shaped like ``soft_mask``.
    """
    act = np.asarray(activity, dtype=np.float32).reshape(-1)
    direct = np.asarray(direct_ratio, dtype=np.float32).reshape(-1)
    mask = np.asarray(soft_mask, dtype=np.float32)
    echo = np.asarray(echo_ratio, dtype=np.float32)

    # Promote 1-D inputs to a single-row 2-D layout.
    if mask.ndim == 1:
        mask = mask[np.newaxis, :]
    if echo.ndim == 1:
        echo = echo[np.newaxis, :]

    frames = min(mask.shape[-1], echo.shape[-1], len(act), len(direct))
    if frames <= 0:
        return np.zeros_like(mask, dtype=np.float32)

    mask = mask[..., :frames]
    echo = echo[..., :frames]
    act = act[:frames][np.newaxis, :]
    direct = direct[:frames][np.newaxis, :]

    # Echo counts more where the direct-sound share is low.
    echo_presence = np.clip(echo * (0.35 + 0.65 * (1.0 - direct)), 0.0, 1.0)
    inactive_term = 0.85 * (1.0 - act) * (1.0 - mask)
    active_term = 0.65 * act * echo_presence * (1.0 - mask)

    combined = np.clip(inactive_term + active_term, 0.0, float(max_replace))
    return combined.astype(np.float32)
237
+
238
+
239
def compute_source_cleanup_budget(
    energy_guard: np.ndarray,
    phrase_activity: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """Map guard/activity curves to ``(allowed_boost, cleanup_floor)``.

    Both inputs are converted to float32 and clipped to ``[0, 1]``.
    ``allowed_boost`` then spans 0.35..1.35 linearly in ``energy_guard`` and
    ``cleanup_floor`` spans 0.62..0.78 linearly in ``phrase_activity``.

    Returns:
        Two float32 arrays shaped like their respective inputs.
    """
    guard = np.asarray(energy_guard, dtype=np.float32)
    guard = np.clip(guard, 0.0, 1.0)
    phrase = np.asarray(phrase_activity, dtype=np.float32)
    phrase = np.clip(phrase, 0.0, 1.0)

    boost = (0.35 + 1.00 * guard).astype(np.float32)
    floor = (0.62 + 0.16 * phrase).astype(np.float32)
    return boost, floor
248
+
249
+
250
def compute_breath_preserving_energy_gates(
    energy_db: np.ndarray,
    ref_db: float,
    unvoiced_mask: np.ndarray | None,
    quiet_floor: float = 0.05,
    breath_floor: float = 0.28,
    breath_active_margin_db: float = 52.0,
    transition_width_db: float = 6.0,
) -> tuple[np.ndarray, np.ndarray]:
    """Build per-frame soft gates for features and for F0 from frame energy.

    The base gate is a logistic curve of ``energy_db`` centred 45 dB below
    ``ref_db``, clipped to ``[quiet_floor, 1]``. The F0 gate is that base
    gate. The feature gate equals the base gate except on unvoiced frames,
    where it is raised to a floor that ramps from ``quiet_floor`` up to
    ``breath_floor`` over the 10 dB above
    ``ref_db - breath_active_margin_db``.

    Args:
        energy_db: Per-frame energy in dB; flattened to 1-D.
        ref_db: Reference level in dB.
        unvoiced_mask: Boolean mask of unvoiced frames. ``None`` or an empty
            mask disables the breath floor; a shorter mask is extended with
            its last value, a longer one is truncated.
        quiet_floor: Minimum gate value, clipped to ``[0, 1]``.
        breath_floor: Feature-gate floor for active unvoiced frames, clipped
            to ``[quiet_floor, 1]``.
        breath_active_margin_db: Distance below ``ref_db`` at which the
            breath floor starts to rise (at least 1 dB).
        transition_width_db: Width of the logistic transition (at least
            0.5 dB).

    Returns:
        ``(feature_gate, f0_gate)`` as two independent float32 arrays with
        the same length as ``energy_db``.
    """
    energy_db = np.asarray(energy_db, dtype=np.float32).reshape(-1)
    if energy_db.size == 0:
        # Return two separate arrays so callers never share a buffer.
        return np.zeros(0, dtype=np.float32), np.zeros(0, dtype=np.float32)

    quiet_floor = float(np.clip(quiet_floor, 0.0, 1.0))
    breath_floor = float(np.clip(breath_floor, quiet_floor, 1.0))
    breath_active_margin_db = float(max(1.0, breath_active_margin_db))
    transition_width_db = float(max(0.5, transition_width_db))
    ref_db = float(ref_db)

    silence_center = ref_db - 45.0
    slope = transition_width_db / 4.0
    # For very quiet frames exp() may overflow to inf, which still yields the
    # intended gate of 0 before clipping; only the warning is suppressed.
    with np.errstate(over="ignore"):
        base_gate = 1.0 / (1.0 + np.exp(-((energy_db - silence_center) / slope)))
    base_gate = np.clip(base_gate, quiet_floor, 1.0).astype(np.float32)

    if unvoiced_mask is not None:
        unvoiced_mask = np.asarray(unvoiced_mask, dtype=bool).reshape(-1)
    if unvoiced_mask is None or unvoiced_mask.size == 0:
        # No voicing information: both gates are the plain energy gate.
        # (np.pad(mode="edge") cannot extend an empty mask.)
        return base_gate, base_gate.copy()

    if len(unvoiced_mask) < len(base_gate):
        unvoiced_mask = np.pad(unvoiced_mask, (0, len(base_gate) - len(unvoiced_mask)), mode="edge")
    else:
        unvoiced_mask = unvoiced_mask[: len(base_gate)]

    # 0 at/below the activation boundary, 1 once 10 dB above it.
    breath_activity = np.clip(
        (energy_db - (ref_db - breath_active_margin_db)) / 10.0,
        0.0,
        1.0,
    ).astype(np.float32)
    feature_floor = quiet_floor + (breath_floor - quiet_floor) * breath_activity

    feature_gate = base_gate.copy()
    feature_gate[unvoiced_mask] = np.maximum(feature_gate[unvoiced_mask], feature_floor[unvoiced_mask])
    feature_gate = np.clip(feature_gate, quiet_floor, 1.0).astype(np.float32)
    return feature_gate, base_gate.copy()
infer/separator.py CHANGED
@@ -3,13 +3,13 @@
3
  人声分离模块 - 支持 Demucs 和 Mel-Band Roformer (audio-separator)
4
  """
5
  import os
6
- import gc
7
- import shutil
8
- import torch
9
- import numpy as np
10
- import soundfile as sf
11
- from pathlib import Path
12
- from typing import Tuple, Optional, Callable
13
 
14
  from lib.logger import log
15
  from lib.device import get_device, empty_device_cache
@@ -34,12 +34,82 @@ except ImportError:
34
  AUDIO_SEPARATOR_AVAILABLE = False
35
 
36
 
37
- # Mel-Band Roformer 默认模型
38
- ROFORMER_DEFAULT_MODEL = "vocals_mel_band_roformer.ckpt"
39
- KARAOKE_DEFAULT_MODEL = "mel_band_roformer_karaoke_gabox.ckpt"
40
- KARAOKE_FALLBACK_MODELS = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  "mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt",
 
 
42
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
 
45
  def _resolve_output_files(output_files, output_dir: Path) -> list[str]:
@@ -49,31 +119,59 @@ def _resolve_output_files(output_files, output_dir: Path) -> list[str]:
49
  file_path = Path(file_name)
50
  if not file_path.is_absolute():
51
  file_path = output_dir / file_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  resolved_files.append(str(file_path))
53
  return resolved_files
54
 
55
 
56
- def _safe_move(src_path: str, dst_path: str) -> None:
57
- """Move file with overwrite."""
58
- if src_path == dst_path:
59
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  dst = Path(dst_path)
61
  if dst.exists():
62
  dst.unlink()
63
- shutil.move(src_path, dst_path)
64
-
65
-
66
- def _get_audio_activity_stats(audio_path: str) -> tuple[float, float, int]:
67
- """Return simple activity stats for validating separator outputs."""
68
- audio, _ = sf.read(audio_path, dtype="float32", always_2d=True)
69
- if audio.size == 0:
70
- return 0.0, 0.0, 0
71
-
72
- mono = np.mean(audio, axis=1, dtype=np.float32)
73
- rms = float(np.sqrt(np.mean(np.square(mono), dtype=np.float64) + 1e-12))
74
- peak = float(np.max(np.abs(mono)))
75
- nonzero = int(np.count_nonzero(np.abs(mono) > 1e-6))
76
- return rms, peak, nonzero
77
 
78
 
79
  class RoformerSeparator:
@@ -81,19 +179,21 @@ class RoformerSeparator:
81
 
82
  def __init__(
83
  self,
84
- model_filename: str = ROFORMER_DEFAULT_MODEL,
85
  device: str = "cuda",
86
  ):
87
  if not AUDIO_SEPARATOR_AVAILABLE:
88
  raise ImportError(
89
  "请安装 audio-separator: pip install audio-separator[gpu]"
90
- )
91
  self.model_filename = model_filename
 
92
  self.device = str(get_device(device))
93
  self.separator = None
 
94
 
95
  def load_model(self, output_dir: str = ""):
96
- """加载 Roformer 模型"""
97
  model_dir = str(
98
  Path(__file__).parent.parent / "assets" / "separator_models"
99
  )
@@ -113,16 +213,23 @@ class RoformerSeparator:
113
  self.separator = None
114
  gc.collect()
115
 
116
- log.info(f"正在加载 Mel-Band Roformer 模型: {self.model_filename}")
117
-
118
- self.separator = Separator(
119
- log_level=_logging.WARNING,
 
 
 
120
  output_dir=target_dir,
121
- model_file_dir=model_dir,
122
  )
 
123
  self._init_output_dir = target_dir
124
- self.separator.load_model(self.model_filename)
125
- log.info("Mel-Band Roformer 模型已加载")
 
 
 
126
 
127
  def separate(
128
  self,
@@ -142,14 +249,12 @@ class RoformerSeparator:
142
  if progress_callback:
143
  progress_callback("正在加载 Roformer 模型...", 0.1)
144
 
145
- self.load_model(output_dir=str(output_path))
 
146
 
 
147
  # audio-separator 需要 output_dir 在实例上设置
148
  self.separator.output_dir = str(output_path)
149
-
150
- if progress_callback:
151
- progress_callback("正在使用 Mel-Band Roformer 分离人声...", 0.3)
152
-
153
  output_files = self.separator.separate(audio_path)
154
 
155
  # audio-separator 返回的可能是纯文件名,需要拼上 output_dir
@@ -160,7 +265,7 @@ class RoformerSeparator:
160
  p = output_path / p
161
  resolved_files.append(str(p))
162
 
163
- # Fallback: if resolved files don't exist, search the output dir
164
  # for freshly created files. This handles cases where audio-separator
165
  # writes to a slightly different path (e.g. after output_dir update
166
  # on a reused Separator instance).
@@ -233,6 +338,7 @@ class RoformerSeparator:
233
  if self.separator is not None:
234
  del self.separator
235
  self.separator = None
 
236
  gc.collect()
237
  empty_device_cache()
238
 
@@ -242,7 +348,7 @@ class KaraokeSeparator:
242
 
243
  def __init__(
244
  self,
245
- model_filename: str = KARAOKE_DEFAULT_MODEL,
246
  device: str = "cuda",
247
  ):
248
  if not AUDIO_SEPARATOR_AVAILABLE:
@@ -252,15 +358,11 @@ class KaraokeSeparator:
252
  self.device = str(get_device(device))
253
  self.separator = None
254
  self.active_model = None
255
-
256
- models = [model_filename]
257
- for fallback in KARAOKE_FALLBACK_MODELS:
258
- if fallback not in models:
259
- models.append(fallback)
260
- self.model_candidates = models
261
 
262
  def load_model(self, output_dir: str = ""):
263
- """加载 Karaoke 模型(主模型失败时自动回退)"""
264
  model_dir = str(Path(__file__).parent.parent / "assets" / "separator_models")
265
  Path(model_dir).mkdir(parents=True, exist_ok=True)
266
 
@@ -277,26 +379,23 @@ class KaraokeSeparator:
277
  self.active_model = None
278
  gc.collect()
279
 
280
- last_error = None
281
- for model_name in self.model_candidates:
282
- try:
283
- log.info(f"正在加载 Karaoke 模型: {model_name}")
284
- separator = Separator(
285
- log_level=_logging.WARNING,
286
- output_dir=target_dir,
287
- model_file_dir=model_dir,
288
- )
289
- separator.load_model(model_name)
290
- self.separator = separator
291
- self._init_output_dir = target_dir
292
- self.active_model = model_name
293
- log.info("Karaoke 模型已加载")
294
- return
295
- except Exception as exc:
296
- last_error = exc
297
- log.warning(f"Karaoke 模型加载失败: {model_name} ({exc})")
298
-
299
- raise RuntimeError(f"无法加载 Karaoke 模型: {last_error}")
300
 
301
  @staticmethod
302
  def _classify_stem(file_name: str) -> Optional[str]:
@@ -332,36 +431,37 @@ class KaraokeSeparator:
332
  return "backing"
333
  return None
334
 
335
- def separate(self, audio_path: str, output_dir: str) -> Tuple[str, str]:
336
- """
337
- 分离主唱和和声
338
-
339
- Returns:
340
  Tuple[lead_vocals_path, backing_vocals_path]
341
  """
342
  output_path = Path(output_dir)
343
  output_path.mkdir(parents=True, exist_ok=True)
344
 
345
- self.load_model(output_dir=str(output_path))
346
- self.separator.output_dir = str(output_path)
347
- output_files = self.separator.separate(audio_path)
348
- resolved_files = _resolve_output_files(output_files, output_path)
349
- log.detail(
350
- f"Karaoke分离器输出文件: {[Path(file_path).name for file_path in resolved_files]}"
351
- )
352
-
353
- lead_vocals_path = None
354
- backing_vocals_path = None
355
- for file_path in resolved_files:
356
- stem_role = self._classify_stem(Path(file_path).name)
357
- log.detail(
358
- f" {Path(file_path).name} -> 分类为: {stem_role or 'unknown'}"
359
- )
360
- if stem_role == "lead" and lead_vocals_path is None:
361
- lead_vocals_path = file_path
362
- elif stem_role == "backing" and backing_vocals_path is None:
363
- backing_vocals_path = file_path
364
-
 
365
  if lead_vocals_path is None and resolved_files:
366
  lead_vocals_path = resolved_files[0]
367
  if backing_vocals_path is None:
@@ -374,30 +474,30 @@ class KaraokeSeparator:
374
  raise FileNotFoundError(
375
  f"Karaoke主唱轨未找到,输出文件: {[Path(p).name for p in resolved_files]}"
376
  )
377
- if not backing_vocals_path or not Path(backing_vocals_path).exists():
378
- raise FileNotFoundError(
379
- f"Karaoke和声轨未找到,输出文件: {[Path(p).name for p in resolved_files]}"
380
- )
381
-
382
- lead_rms, lead_peak, lead_nonzero = _get_audio_activity_stats(lead_vocals_path)
383
- backing_rms, backing_peak, backing_nonzero = _get_audio_activity_stats(backing_vocals_path)
384
- log.detail(
385
- "Karaoke输出能量检测: "
386
- f"lead_rms={lead_rms:.6f}, lead_peak={lead_peak:.6f}, lead_nonzero={lead_nonzero}; "
387
- f"backing_rms={backing_rms:.6f}, backing_peak={backing_peak:.6f}, backing_nonzero={backing_nonzero}"
388
- )
389
-
390
- lead_is_nearly_silent = lead_nonzero == 0 or (lead_rms < 1e-5 and lead_peak < 1e-4)
391
- backing_has_content = backing_nonzero > 0 and (backing_rms >= 5e-5 or backing_peak >= 5e-4)
392
- if lead_is_nearly_silent and backing_has_content:
393
- log.warning("Karaoke主唱轨几乎静音,检测到输出疑似反转,已自动交换主唱/和声")
394
- lead_vocals_path, backing_vocals_path = backing_vocals_path, lead_vocals_path
395
-
396
- final_lead = str(output_path / "lead_vocals.wav")
397
- final_backing = str(output_path / "backing_vocals.wav")
398
- _safe_move(lead_vocals_path, final_lead)
399
- _safe_move(backing_vocals_path, final_backing)
400
-
401
  return final_lead, final_backing
402
 
403
  def unload_model(self):
@@ -410,6 +510,132 @@ class KaraokeSeparator:
410
  empty_device_cache()
411
 
412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
  class VocalSeparator:
414
  """人声分离器 - 基于 Demucs"""
415
 
@@ -575,7 +801,7 @@ def get_available_models() -> list:
575
  if AUDIO_SEPARATOR_AVAILABLE:
576
  models.append({
577
  "name": "roformer",
578
- "description": "Mel-Band Roformer (Kimberley Jensen) - 高质量人声分离"
579
  })
580
  if DEMUCS_AVAILABLE:
581
  models.extend([
 
3
  人声分离模块 - 支持 Demucs 和 Mel-Band Roformer (audio-separator)
4
  """
5
  import os
6
+ import gc
7
+ import shutil
8
+ import torch
9
+ import numpy as np
10
+ import soundfile as sf
11
+ from pathlib import Path
12
+ from typing import Tuple, Optional, Callable, Union
13
 
14
  from lib.logger import log
15
  from lib.device import get_device, empty_device_cache
 
34
  AUDIO_SEPARATOR_AVAILABLE = False
35
 
36
 
37
+ ModelSpec = Union[str, list[str], tuple[str, ...]]
38
+
39
+
40
+ # Public scored SOTA defaults from audio-separator 0.44.1's model table.
41
+ # Keep the cover pipeline unchanged; only the separator model choices change.
42
+ ENSEMBLE_PRESET_PREFIX = "ensemble:"
43
+
44
+ ROFORMER_LEGACY_SINGLE_MODEL = "vocals_mel_band_roformer.ckpt"
45
+ ROFORMER_SOTA_PRESET = "vocal_rvc"
46
+ ROFORMER_DEFAULT_MODEL = f"{ENSEMBLE_PRESET_PREFIX}{ROFORMER_SOTA_PRESET}"
47
+ ROFORMER_SOTA_MODEL = ROFORMER_DEFAULT_MODEL
48
+ ROFORMER_SOTA_MODELS = [
49
+ "melband_roformer_big_beta6x.ckpt",
50
+ "mel_band_roformer_vocals_fv4_gabox.ckpt",
51
+ ]
52
+
53
+ KARAOKE_LEGACY_SINGLE_MODEL = "mel_band_roformer_karaoke_gabox.ckpt"
54
+ KARAOKE_SOTA_PRESET = "karaoke"
55
+ KARAOKE_DEFAULT_MODEL = f"{ENSEMBLE_PRESET_PREFIX}{KARAOKE_SOTA_PRESET}"
56
+ KARAOKE_SOTA_MODEL = KARAOKE_DEFAULT_MODEL
57
+ KARAOKE_SOTA_MODELS = [
58
  "mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt",
59
+ "mel_band_roformer_karaoke_gabox_v2.ckpt",
60
+ "mel_band_roformer_karaoke_becruily.ckpt",
61
  ]
62
+ KARAOKE_EXPERIMENTAL_MODELS = [
63
+ "mel_band_roformer_karaoke_gabox_v2.ckpt",
64
+ "mel_band_roformer_karaoke_becruily.ckpt",
65
+ ]
66
+
67
+ ROFORMER_DEREVERB_DEFAULT_MODEL = "dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt"
68
+
69
+
70
+ def _model_spec_key(model_spec: ModelSpec) -> tuple[str, ...]:
71
+ if isinstance(model_spec, (list, tuple)):
72
+ return tuple(str(item) for item in model_spec)
73
+ return (str(model_spec),)
74
+
75
+
76
+ def _model_spec_label(model_spec: ModelSpec) -> str:
77
+ if isinstance(model_spec, (list, tuple)):
78
+ return "ensemble[" + ", ".join(str(item) for item in model_spec) + "]"
79
+ return str(model_spec)
80
+
81
+
82
def _parse_ensemble_preset(model_spec: ModelSpec) -> Optional[str]:
    """Extract the preset name from an ``ensemble:<preset>`` string spec.

    Returns ``None`` for non-string specs, for strings that do not start with
    the ensemble prefix (compared case-insensitively after stripping
    whitespace), and for a prefix followed by nothing but whitespace.
    """
    if isinstance(model_spec, str):
        stripped = model_spec.strip()
        if stripped.lower().startswith(ENSEMBLE_PRESET_PREFIX):
            name = stripped[len(ENSEMBLE_PRESET_PREFIX):].strip()
            if name:
                return name
    return None
90
+
91
+
92
def _load_audio_separator_model(
    *,
    model_spec: ModelSpec,
    output_dir: str,
    model_dir: str,
) -> "Separator":
    """Create an audio-separator ``Separator`` and load the requested model.

    Args:
        model_spec: Either an ``ensemble:<preset>`` string, a single model
            file name, or a list/tuple of model file names.
        output_dir: Directory handed to the separator as ``output_dir``.
        model_dir: Directory handed to the separator as ``model_file_dir``.

    Returns:
        The ``Separator`` instance with its model loaded.

    The return annotation is a string so that defining this function does not
    need ``Separator`` to be bound. NOTE(review): the visible
    ``except ImportError`` branch only sets ``AUDIO_SEPARATOR_AVAILABLE``;
    confirm against the full import block.
    """
    preset_name = _parse_ensemble_preset(model_spec)
    separator_kwargs = {
        "log_level": _logging.WARNING,
        "output_dir": output_dir,
        "model_file_dir": model_dir,
    }
    if preset_name:
        separator_kwargs["ensemble_preset"] = preset_name

    separator = Separator(**separator_kwargs)
    if preset_name:
        # The preset was passed to the constructor, so no model name is given.
        separator.load_model()
    else:
        # Tuples are converted to lists before being handed to load_model.
        separator.load_model(list(model_spec) if isinstance(model_spec, tuple) else model_spec)
    return separator
113
 
114
 
115
  def _resolve_output_files(output_files, output_dir: Path) -> list[str]:
 
119
  file_path = Path(file_name)
120
  if not file_path.is_absolute():
121
  file_path = output_dir / file_path
122
+ if file_path.exists():
123
+ resolved_files.append(str(file_path))
124
+ continue
125
+
126
+ role = _classify_common_stem_role(file_path.name)
127
+ if role:
128
+ candidates = [
129
+ candidate
130
+ for candidate in output_dir.glob("*.wav")
131
+ if _classify_common_stem_role(candidate.name) == role
132
+ ]
133
+ if len(candidates) == 1:
134
+ resolved_files.append(str(candidates[0]))
135
+ continue
136
+
137
  resolved_files.append(str(file_path))
138
  return resolved_files
139
 
140
 
141
+ def _classify_common_stem_role(file_name: str) -> Optional[str]:
142
+ lower_name = file_name.lower()
143
+ if any(marker in lower_name for marker in ("(noreverb)", "(no_reverb)", "(no reverb)", "(dry)")):
144
+ return "dry"
145
+ if any(marker in lower_name for marker in ("(reverb)", "(echo)", "(wet)")):
146
+ return "wet"
147
+ if any(marker in lower_name for marker in ("(instrumental)", "(other)", "(backing)")):
148
+ return "backing"
149
+ if any(marker in lower_name for marker in ("(vocals)", "(lead)", "(main_vocal)", "(main vocals)")):
150
+ return "lead"
151
+ return None
152
+
153
+
154
+ def _safe_move(src_path: str, dst_path: str) -> None:
155
+ """Move file with overwrite."""
156
+ if src_path == dst_path:
157
+ return
158
  dst = Path(dst_path)
159
  if dst.exists():
160
  dst.unlink()
161
+ shutil.move(src_path, dst_path)
162
+
163
+
164
def _get_audio_activity_stats(audio_path: str) -> tuple[float, float, int]:
    """Measure how much signal an audio file contains.

    The file is read as float32, averaged across channels to mono, and
    summarised as ``(rms, peak, nonzero)`` where ``nonzero`` counts samples
    whose magnitude exceeds 1e-6. An empty file yields ``(0.0, 0.0, 0)``.
    """
    samples, _sample_rate = sf.read(audio_path, dtype="float32", always_2d=True)
    if not samples.size:
        return 0.0, 0.0, 0

    downmix = np.mean(samples, axis=1, dtype=np.float32)
    magnitude = np.abs(downmix)
    # Mean square is accumulated in float64; the epsilon keeps sqrt away from 0.
    mean_square = np.mean(np.square(downmix), dtype=np.float64)
    return (
        float(np.sqrt(mean_square + 1e-12)),
        float(np.max(magnitude)),
        int(np.count_nonzero(magnitude > 1e-6)),
    )
175
 
176
 
177
  class RoformerSeparator:
 
179
 
180
  def __init__(
181
  self,
182
+ model_filename: ModelSpec = ROFORMER_DEFAULT_MODEL,
183
  device: str = "cuda",
184
  ):
185
  if not AUDIO_SEPARATOR_AVAILABLE:
186
  raise ImportError(
187
  "请安装 audio-separator: pip install audio-separator[gpu]"
188
+ )
189
  self.model_filename = model_filename
190
+ self.model_candidates = [model_filename]
191
  self.device = str(get_device(device))
192
  self.separator = None
193
+ self.active_model = None
194
 
195
  def load_model(self, output_dir: str = ""):
196
+ """加载指定 RoFormer 模型;严格 SOTA 模式下不自动降级。"""
197
  model_dir = str(
198
  Path(__file__).parent.parent / "assets" / "separator_models"
199
  )
 
213
  self.separator = None
214
  gc.collect()
215
 
216
+ model_name = self.model_filename
217
+ log.info(
218
+ "正在加载公开 SOTA RoFormer 分离模型: "
219
+ f"{_model_spec_label(model_name)}"
220
+ )
221
+ separator = _load_audio_separator_model(
222
+ model_spec=model_name,
223
  output_dir=target_dir,
224
+ model_dir=model_dir,
225
  )
226
+ self.separator = separator
227
  self._init_output_dir = target_dir
228
+ self.active_model = model_name
229
+ log.info(
230
+ "RoFormer 分离模型已加载: "
231
+ f"{_model_spec_label(model_name)}"
232
+ )
233
 
234
  def separate(
235
  self,
 
249
  if progress_callback:
250
  progress_callback("正在加载 Roformer 模型...", 0.1)
251
 
252
+ if progress_callback:
253
+ progress_callback("正在使用 RoFormer 分离人声...", 0.3)
254
 
255
+ self.load_model(output_dir=str(output_path))
256
  # audio-separator 需要 output_dir 在实例上设置
257
  self.separator.output_dir = str(output_path)
 
 
 
 
258
  output_files = self.separator.separate(audio_path)
259
 
260
  # audio-separator 返回的可能是纯文件名,需要拼上 output_dir
 
265
  p = output_path / p
266
  resolved_files.append(str(p))
267
 
268
+ # Recovery: if resolved files don't exist, search the output dir
269
  # for freshly created files. This handles cases where audio-separator
270
  # writes to a slightly different path (e.g. after output_dir update
271
  # on a reused Separator instance).
 
338
  if self.separator is not None:
339
  del self.separator
340
  self.separator = None
341
+ self.active_model = None
342
  gc.collect()
343
  empty_device_cache()
344
 
 
348
 
349
  def __init__(
350
  self,
351
+ model_filename: ModelSpec = KARAOKE_DEFAULT_MODEL,
352
  device: str = "cuda",
353
  ):
354
  if not AUDIO_SEPARATOR_AVAILABLE:
 
358
  self.device = str(get_device(device))
359
  self.separator = None
360
  self.active_model = None
361
+ self.model_filename = model_filename
362
+ self.model_candidates = [model_filename]
 
 
 
 
363
 
364
  def load_model(self, output_dir: str = ""):
365
+ """加载指定 Karaoke 模型;严格 SOTA 模式下不自动降级。"""
366
  model_dir = str(Path(__file__).parent.parent / "assets" / "separator_models")
367
  Path(model_dir).mkdir(parents=True, exist_ok=True)
368
 
 
379
  self.active_model = None
380
  gc.collect()
381
 
382
+ model_name = self.model_filename
383
+ log.info(
384
+ "正在加载公开 SOTA Karaoke 模型: "
385
+ f"{_model_spec_label(model_name)}"
386
+ )
387
+ separator = _load_audio_separator_model(
388
+ model_spec=model_name,
389
+ output_dir=target_dir,
390
+ model_dir=model_dir,
391
+ )
392
+ self.separator = separator
393
+ self._init_output_dir = target_dir
394
+ self.active_model = model_name
395
+ log.info(
396
+ "Karaoke 模型已加载: "
397
+ f"{_model_spec_label(model_name)}"
398
+ )
 
 
 
399
 
400
  @staticmethod
401
  def _classify_stem(file_name: str) -> Optional[str]:
 
431
  return "backing"
432
  return None
433
 
434
+ def separate(self, audio_path: str, output_dir: str) -> Tuple[str, str]:
435
+ """
436
+ 分离主唱和和声
437
+
438
+ Returns:
439
  Tuple[lead_vocals_path, backing_vocals_path]
440
  """
441
  output_path = Path(output_dir)
442
  output_path.mkdir(parents=True, exist_ok=True)
443
 
444
+ self.load_model(output_dir=str(output_path))
445
+ self.separator.output_dir = str(output_path)
446
+ output_files = self.separator.separate(audio_path)
447
+
448
+ resolved_files = _resolve_output_files(output_files, output_path)
449
+ log.detail(
450
+ f"Karaoke分离器输出文件: {[Path(file_path).name for file_path in resolved_files]}"
451
+ )
452
+
453
+ lead_vocals_path = None
454
+ backing_vocals_path = None
455
+ for file_path in resolved_files:
456
+ stem_role = self._classify_stem(Path(file_path).name)
457
+ log.detail(
458
+ f" {Path(file_path).name} -> 分类为: {stem_role or 'unknown'}"
459
+ )
460
+ if stem_role == "lead" and lead_vocals_path is None:
461
+ lead_vocals_path = file_path
462
+ elif stem_role == "backing" and backing_vocals_path is None:
463
+ backing_vocals_path = file_path
464
+
465
  if lead_vocals_path is None and resolved_files:
466
  lead_vocals_path = resolved_files[0]
467
  if backing_vocals_path is None:
 
474
  raise FileNotFoundError(
475
  f"Karaoke主唱轨未找到,输出文件: {[Path(p).name for p in resolved_files]}"
476
  )
477
+ if not backing_vocals_path or not Path(backing_vocals_path).exists():
478
+ raise FileNotFoundError(
479
+ f"Karaoke和声轨未找到,输出文件: {[Path(p).name for p in resolved_files]}"
480
+ )
481
+
482
+ lead_rms, lead_peak, lead_nonzero = _get_audio_activity_stats(lead_vocals_path)
483
+ backing_rms, backing_peak, backing_nonzero = _get_audio_activity_stats(backing_vocals_path)
484
+ log.detail(
485
+ "Karaoke输出能量检测: "
486
+ f"lead_rms={lead_rms:.6f}, lead_peak={lead_peak:.6f}, lead_nonzero={lead_nonzero}; "
487
+ f"backing_rms={backing_rms:.6f}, backing_peak={backing_peak:.6f}, backing_nonzero={backing_nonzero}"
488
+ )
489
+
490
+ lead_is_nearly_silent = lead_nonzero == 0 or (lead_rms < 1e-5 and lead_peak < 1e-4)
491
+ backing_has_content = backing_nonzero > 0 and (backing_rms >= 5e-5 or backing_peak >= 5e-4)
492
+ if lead_is_nearly_silent and backing_has_content:
493
+ log.warning("Karaoke主唱轨几乎静音,检测到输出疑似反转,已自动交换主唱/和声")
494
+ lead_vocals_path, backing_vocals_path = backing_vocals_path, lead_vocals_path
495
+
496
+ final_lead = str(output_path / "lead_vocals.wav")
497
+ final_backing = str(output_path / "backing_vocals.wav")
498
+ _safe_move(lead_vocals_path, final_lead)
499
+ _safe_move(backing_vocals_path, final_backing)
500
+
501
  return final_lead, final_backing
502
 
503
  def unload_model(self):
 
510
  empty_device_cache()
511
 
512
 
513
class RoformerDereverbSeparator:
    """Learned RoFormer de-reverb / de-echo separator that yields a drier vocal for VC."""

    def __init__(
        self,
        model_filename: str = ROFORMER_DEREVERB_DEFAULT_MODEL,
        device: str = "cuda",
    ):
        """Store the model choice; the model itself is loaded by ``load_model``.

        Raises:
            ImportError: If audio-separator is not available.
        """
        if not AUDIO_SEPARATOR_AVAILABLE:
            raise ImportError(
                "请安装 audio-separator: pip install audio-separator[gpu]"
            )
        self.device = str(get_device(device))
        self.separator = None
        self.active_model = None
        self.model_filename = model_filename
        self.model_candidates = [model_filename]

    def load_model(self, output_dir: str = ""):
        """Load the de-reverb model, reusing the instance for the same output dir.

        When a separator is already loaded for a different output directory
        it is dropped and a fresh one is created.
        """
        model_dir = str(Path(__file__).parent.parent / "assets" / "separator_models")
        Path(model_dir).mkdir(parents=True, exist_ok=True)

        target_dir = output_dir or str(
            Path(__file__).parent.parent / "temp" / "separator"
        )

        if self.separator is not None:
            if getattr(self, "_init_output_dir", None) == target_dir:
                return
            del self.separator
            self.separator = None
            self.active_model = None
            gc.collect()

        model_name = self.model_filename
        log.info(f"正在加载 RoFormer De-Reverb 模型: {model_name}")
        separator = _load_audio_separator_model(
            model_spec=model_name,
            output_dir=target_dir,
            model_dir=model_dir,
        )
        self.separator = separator
        self._init_output_dir = target_dir
        self.active_model = model_name
        log.info(f"RoFormer De-Reverb 模型已加载: {model_name}")

    @staticmethod
    def _classify_stem(file_name: str) -> Optional[str]:
        """Classify an output file name as ``"dry"``, ``"wet"`` or ``None``.

        Parenthesised markers are checked first, wet before dry. If none
        match, a looser substring fallback is applied.
        """
        lower_name = file_name.lower()

        dry_markers = [
            "(dry)",
            "(noreverb)",
            "(no_reverb)",
            "(no reverb)",
            "(dereverb)",
            "(de-reverb)",
            "(vocals)",
            "(primary)",
        ]
        wet_markers = [
            "(no dry)",
            "(no_dry)",
            "(reverb)",
            "(echo)",
            "(wet)",
            "(secondary)",
            "(instrumental)",
            "(other)",
        ]

        if any(marker in lower_name for marker in wet_markers):
            return "wet"
        if any(marker in lower_name for marker in dry_markers):
            return "dry"

        # "no dry" contains "dry", so it has to be tested before the dry
        # keywords; otherwise this branch can never be reached and a
        # "no dry" stem is reported as dry.
        if "no dry" in lower_name or "no_dry" in lower_name:
            return "wet"
        # The dry keywords stay ahead of "reverb" because "noreverb" itself
        # contains "reverb".
        if "dry" in lower_name or "noreverb" in lower_name or "vocal" in lower_name:
            return "dry"
        if "reverb" in lower_name or "echo" in lower_name:
            return "wet"
        return None

    def separate_dry(self, audio_path: str, output_dir: str) -> str:
        """Run de-reverb on ``audio_path`` and return the path of the dry stem.

        The first output classified as dry is moved to
        ``<output_dir>/roformer_deecho_vocals.wav``.

        Raises:
            FileNotFoundError: If no existing output file is classified as dry.
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        self.load_model(output_dir=str(output_path))
        self.separator.output_dir = str(output_path)
        output_files = self.separator.separate(audio_path)

        resolved_files = _resolve_output_files(output_files, output_path)
        log.detail(
            "RoFormer De-Reverb 输出文件: "
            f"{[Path(file_path).name for file_path in resolved_files]}"
        )

        dry_path = None
        for file_path in resolved_files:
            stem_role = self._classify_stem(Path(file_path).name)
            log.detail(
                f" {Path(file_path).name} -> 分类为: {stem_role or 'unknown'}"
            )
            if stem_role == "dry":
                dry_path = file_path
                break

        if not dry_path or not Path(dry_path).exists():
            raise FileNotFoundError(
                f"RoFormer De-Reverb dry轨未找到,输出文件: {[Path(p).name for p in resolved_files]}"
            )

        final_dry = str(output_path / "roformer_deecho_vocals.wav")
        _safe_move(dry_path, final_dry)
        return final_dry

    def unload_model(self):
        """Drop the loaded separator (if any) and release cached device memory."""
        if self.separator is not None:
            del self.separator
            self.separator = None
            self.active_model = None
        gc.collect()
        empty_device_cache()
637
+
638
+
639
  class VocalSeparator:
640
  """人声分离器 - 基于 Demucs"""
641
 
 
801
  if AUDIO_SEPARATOR_AVAILABLE:
802
  models.append({
803
  "name": "roformer",
804
+ "description": "audio-separator public scored SOTA - 高质量人声/伴奏分离"
805
  })
806
  if DEMUCS_AVAILABLE:
807
  models.extend([
install.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ RVC 安装脚本
4
+ 自动创建虚拟环境 → 安装依赖 → 启动应用
5
+
6
+ 用法:
7
+ python install.py # 完整安装并启动
8
+ python install.py --check # 仅检查依赖
9
+ python install.py --no-run # 安装但不启动
10
+ python install.py --cpu # 安装 CPU 版本
11
+ """
12
+ import subprocess
13
+ import sys
14
+ import os
15
+ from pathlib import Path
16
+
17
+ ROOT_DIR = Path(__file__).parent
18
+ VENV_DIR = ROOT_DIR / "venv310"
19
+
20
+ PYTHON310_CANDIDATES = [
21
+ r"C:\Users\Administrator\AppData\Local\Programs\Python\Python310\python.exe",
22
+ r"C:\Python310\python.exe",
23
+ r"C:\Program Files\Python310\python.exe",
24
+ r"C:\Program Files (x86)\Python310\python.exe",
25
+ ]
26
+
27
+ PACKAGES = {
28
+ "torch": {"import": "torch", "name": "PyTorch", "pip": "torch"},
29
+ "torchaudio": {"import": "torchaudio", "name": "torchaudio", "pip": "torchaudio"},
30
+ "gradio": {"import": "gradio", "name": "Gradio", "pip": "gradio==3.50.2"},
31
+ "librosa": {"import": "librosa", "name": "librosa", "pip": "librosa"},
32
+ "soundfile": {"import": "soundfile", "name": "soundfile", "pip": "soundfile"},
33
+ "av": {"import": "av", "name": "PyAV", "pip": "av"},
34
+ "scipy": {"import": "scipy", "name": "scipy", "pip": "scipy"},
35
+ "numpy": {
36
+ "import": "numpy",
37
+ "name": "numpy",
38
+ "pip": "numpy<2,>=1.23.0",
39
+ "dist": "numpy",
40
+ "min_version": "1.23.0",
41
+ "max_exclusive_version": "2.0.0",
42
+ },
43
+ "parselmouth": {"import": "parselmouth", "name": "praat-parselmouth", "pip": "praat-parselmouth"},
44
+ "pyworld": {"import": "pyworld", "name": "pyworld", "pip": "pyworld"},
45
+ "torchcrepe": {"import": "torchcrepe", "name": "torchcrepe", "pip": "torchcrepe"},
46
+ "faiss": {"import": "faiss", "name": "faiss-cpu", "pip": "faiss-cpu"},
47
+ "tqdm": {"import": "tqdm", "name": "tqdm", "pip": "tqdm"},
48
+ "requests": {"import": "requests", "name": "requests", "pip": "requests"},
49
+ "dotenv": {"import": "dotenv", "name": "python-dotenv", "pip": "python-dotenv"},
50
+ "colorama": {"import": "colorama", "name": "colorama", "pip": "colorama"},
51
+ "mcp": {"import": "mcp", "name": "mcp", "pip": "mcp"},
52
+ "demucs": {"import": "demucs", "name": "demucs", "pip": "demucs"},
53
+ "audio_separator": {
54
+ "import": "audio_separator",
55
+ "name": "audio-separator",
56
+ "pip": "audio-separator",
57
+ "dist": "audio-separator",
58
+ "min_version": "0.44.1",
59
+ },
60
+ "huggingface_hub": {"import": "huggingface_hub", "name": "huggingface_hub", "pip": "huggingface_hub"},
61
+ "pedalboard": {"import": "pedalboard", "name": "pedalboard", "pip": "pedalboard"},
62
+ "ffmpeg": {"import": "ffmpeg", "name": "ffmpeg-python", "pip": "ffmpeg-python"},
63
+ "fairseq": {"import": "fairseq", "name": "fairseq", "pip": "fairseq==0.12.2"},
64
+ }
65
+
66
+ # === 虚拟环境 ===
67
+
68
def find_python310():
    """Locate a Python 3.10 interpreter.

    Tries, in order: the running interpreter, the hard-coded candidate paths,
    and the ``py -3.10`` launcher. Returns the interpreter path, or ``None``
    when nothing is found.
    """
    if sys.version_info[:2] == (3, 10):
        return sys.executable

    known_path = next(
        (candidate for candidate in PYTHON310_CANDIDATES if os.path.isfile(candidate)),
        None,
    )
    if known_path is not None:
        return known_path

    launcher_cmd = ["py", "-3.10", "-c", "import sys; print(sys.executable)"]
    try:
        launcher = subprocess.run(
            launcher_cmd, capture_output=True, text=True, timeout=10,
        )
    except (FileNotFoundError, subprocess.TimeoutExpired):
        # No launcher on this system, or it hung.
        return None
    if launcher.returncode == 0:
        return launcher.stdout.strip()
    return None
85
+
86
+
87
def get_venv_python():
    """Return the path of the virtual environment's Python executable.

    Uses ``Scripts/python.exe`` when ``os.name`` is ``"nt"`` and
    ``bin/python`` otherwise.
    """
    relative = ("Scripts", "python.exe") if os.name == "nt" else ("bin", "python")
    return str(VENV_DIR.joinpath(*relative))
92
+
93
+
94
def create_venv():
    """Create the Python 3.10 virtual environment if it is not usable yet.

    Returns ``True`` when a 3.10 environment already exists or was created,
    and ``False`` when no Python 3.10 was found or ``venv`` creation failed.
    """
    venv_py = get_venv_python()

    # Reuse an existing environment only if its interpreter reports 3.10.
    if os.path.isfile(venv_py):
        probe = subprocess.run([venv_py, "--version"], capture_output=True, text=True)
        reports_310 = probe.returncode == 0 and "3.10" in probe.stdout
        if reports_310:
            print(f" [OK] 虚拟环境已存在: {VENV_DIR}")
            return True

    py310 = find_python310()
    if not py310:
        print(" [错误] 未找到 Python 3.10")
        print(" 下载: https://www.python.org/downloads/release/python-31011/")
        return False

    print(f" 使用 Python: {py310}")
    print(f" 创建虚拟环境: {VENV_DIR}")
    creation = subprocess.run(
        [py310, "-m", "venv", str(VENV_DIR)],
        capture_output=True,
        text=True,
    )
    if creation.returncode != 0:
        print(f" [错误] 创建失败:\n{creation.stderr}")
        return False

    print(" [OK] 虚拟环境创建成功")
    print(" 升级 pip ...")
    # The result of the pip upgrade is not checked.
    upgrade_cmd = [venv_py, "-m", "pip", "install", "--upgrade", "pip"]
    subprocess.run(upgrade_cmd, capture_output=True, text=True)
    return True
121
+
122
+ # === 依赖检查与安装 ===
123
+
124
def check_package(venv_py, import_name):
    """Report whether ``import_name`` can be imported by the given interpreter.

    Runs ``<venv_py> -c "import <import_name>"`` and returns ``True`` when the
    process exits with status 0.
    """
    probe_cmd = [venv_py, "-c", f"import {import_name}"]
    completed = subprocess.run(probe_cmd, capture_output=True, text=True)
    return completed.returncode == 0
131
+
132
+
133
def get_installed_version(venv_py, distribution_name):
    """Return the installed version of a distribution, or ``None``.

    The lookup runs ``importlib.metadata.version`` inside the given
    interpreter. ``None`` is returned when the subprocess fails (for example
    because the distribution is not installed) or prints nothing.
    """
    script_lines = [
        "from importlib.metadata import PackageNotFoundError, version",
        f"dist = {distribution_name!r}",
        "try:",
        " print(version(dist))",
        "except PackageNotFoundError:",
        " raise SystemExit(1)",
    ]
    code = "\n".join(script_lines) + "\n"
    completed = subprocess.run([venv_py, "-c", code], capture_output=True, text=True)
    if completed.returncode != 0:
        return None
    reported = completed.stdout.strip()
    return reported if reported else None
147
+
148
+
149
+ def _version_parts(version_text):
150
+ parts = []
151
+ for part in str(version_text or "").split("."):
152
+ digits = ""
153
+ for char in part:
154
+ if not char.isdigit():
155
+ break
156
+ digits += char
157
+ parts.append(int(digits or 0))
158
+ return tuple(parts)
159
+
160
+
161
def _version_at_least(installed, required):
    """Return ``True`` when ``installed`` is greater than or equal to ``required``.

    Both versions are parsed with ``_version_parts`` and zero-padded to the
    same length before comparison.
    """
    have = list(_version_parts(installed))
    need = list(_version_parts(required))
    have.extend([0] * (len(need) - len(have)))
    need.extend([0] * (len(have) - len(need)))
    return have >= need
168
+
169
+
170
def _version_less_than(installed, upper_bound):
    """Return ``True`` when ``installed`` is strictly below ``upper_bound``.

    Both versions are parsed with ``_version_parts`` and zero-padded to the
    same length before comparison.
    """
    have = list(_version_parts(installed))
    limit = list(_version_parts(upper_bound))
    have.extend([0] * (len(limit) - len(have)))
    limit.extend([0] * (len(have) - len(limit)))
    return have < limit
177
+
178
+
179
def detect_cuda_version():
    """Detect the system CUDA version and map it to a PyTorch wheel index URL.

    ``nvidia-smi`` is first probed with a driver-version query; when that
    succeeds, the plain ``nvidia-smi`` banner is searched for
    ``CUDA Version: X.Y``.

    Returns:
        str | None: ``https://download.pytorch.org/whl/cuNNN`` for the highest
        supported tag not exceeding the detected version, or None when
        nvidia-smi is unavailable, fails, times out, reports no CUDA version,
        or reports one older than 11.8.
    """
    import re

    try:
        probe = subprocess.run(
            ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"],
            capture_output=True, text=True, timeout=10,
        )
        if probe.returncode != 0:
            return None
        banner = subprocess.run(
            ["nvidia-smi"],
            capture_output=True, text=True, timeout=10,
        )
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers FileNotFoundError plus PermissionError and other
        # launch failures, which should all mean "no usable GPU tooling".
        return None

    match = re.search(r"CUDA Version:\s*(\d+)\.(\d+)", banner.stdout)
    if not match:
        return None

    detected = (int(match.group(1)), int(match.group(2)))
    # Ordered newest first: the first threshold met wins.
    wheel_tags = (
        ((12, 6), "cu126"),
        ((12, 4), "cu124"),
        ((12, 1), "cu121"),
        ((11, 8), "cu118"),
    )
    for minimum, tag in wheel_tags:
        if detected >= minimum:
            return f"https://download.pytorch.org/whl/{tag}"
    return None
211
+
212
+
213
def pip_install(venv_py, package, extra="", index_url=None, no_deps=False, version_spec=""):
    """Install one requirement with the venv's pip.

    Args:
        venv_py: Path of the venv interpreter (run as ``venv_py -m pip``).
        package: Distribution name.
        extra: Optional extras name, rendered as ``package[extra]``.
        index_url: Optional value for ``--index-url``.
        no_deps: When True, pass ``--no-deps``.
        version_spec: Optional version specifier appended to the target,
            e.g. ``">=0.31"``.

    Returns:
        bool: True when pip exits with status 0, otherwise False.
    """
    target = f"{package}[{extra}]{version_spec}" if extra else f"{package}{version_spec}"
    print(f" 安装 {target} ...")
    cmd = [venv_py, "-m", "pip", "install", target]
    if index_url:
        cmd.extend(["--index-url", index_url])
    if no_deps:
        cmd.append("--no-deps")

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode == 0:
        print(f" [完成] {target}")
        return True

    # On a dependency conflict, retry once without dependency resolution.
    if not no_deps and "ResolutionImpossible" in result.stderr:
        print(f" [依赖冲突] 尝试 --no-deps 安装 {target} ...")
        # version_spec must be forwarded; without it the retry would install
        # an unconstrained version of the package.
        return pip_install(
            venv_py,
            package,
            extra=extra,
            index_url=index_url,
            no_deps=True,
            version_spec=version_spec,
        )

    print(f" [失败] {target}")
    error_lines = result.stderr.strip().splitlines()
    if error_lines:
        # Only the last stderr line is shown to keep the report short.
        print(f" {error_lines[-1]}")
    return False
235
+
236
+
237
def check_all(venv_py):
    """Print a dependency report and return the entries needing installation.

    Every ``PACKAGES`` entry is import-probed with the venv interpreter. For
    an importable entry that declares ``min_version`` and/or
    ``max_exclusive_version``, the installed distribution version is read and
    compared against those bounds.

    Args:
        venv_py: Path of the venv interpreter.

    Returns:
        list: ``PACKAGES`` values that are not importable, whose version
        cannot be read, or whose version is outside the declared bounds.
    """
    banner = "=" * 50
    print(banner)
    print("RVC 依赖检查")
    print(banner)

    missing = []
    for info in PACKAGES.values():
        satisfied = check_package(venv_py, info["import"])
        status = "OK" if satisfied else "未安装"
        min_version = info.get("min_version")
        max_exclusive_version = info.get("max_exclusive_version")

        if satisfied and (min_version or max_exclusive_version):
            # Human-readable bounds, used by both failure messages below.
            bounds = []
            if min_version:
                bounds.append(f">= {min_version}")
            if max_exclusive_version:
                bounds.append(f"< {max_exclusive_version}")
            requirement_text = " 且 ".join(bounds)

            installed_version = get_installed_version(
                venv_py,
                info.get("dist", info["pip"]),
            )
            if not installed_version:
                satisfied = False
                status = f"需更新 (无法读取版本,要求 {requirement_text})"
            else:
                within_bounds = (
                    not min_version
                    or _version_at_least(installed_version, min_version)
                ) and (
                    not max_exclusive_version
                    or _version_less_than(installed_version, max_exclusive_version)
                )
                if within_bounds:
                    status = f"OK ({installed_version})"
                else:
                    satisfied = False
                    status = f"需更新 ({installed_version} 不满足 {requirement_text})"

        mark = "[v]" if satisfied else "[x]"
        print(f" {mark} {info['name']:30s} {status}")
        if not satisfied:
            missing.append(info)

    print("-" * 50)
    if missing:
        print(f"缺少 {len(missing)} 个依赖包")
    else:
        print("所有依赖已安装")
    return missing
287
+
288
+
289
def install_all(venv_py, gpu=True):
    """Install every dependency that ``check_all`` reports as missing.

    Args:
        venv_py: Path of the venv interpreter.
        gpu: When True, a supported CUDA version must be detected and the
            matching PyTorch wheel index is used; when False, the CPU wheel
            index is used for torch/torchaudio.

    Returns:
        bool: True when nothing was missing or every install succeeded;
        False when CUDA detection fails in GPU mode or any install fails.
    """
    missing = check_all(venv_py)
    if not missing:
        print("\n无需安装,所有依赖已就绪。")
        return True

    # Resolve the PyTorch wheel index up front; GPU mode aborts without CUDA.
    cuda_index_url = None
    if gpu:
        cuda_index_url = detect_cuda_version()
        if not cuda_index_url:
            print("\n [错误] 未检测到支持的 CUDA,GPU 安装停止。")
            print(" 如需 CPU 版 PyTorch,请显式使用 --cpu。")
            return False
        print(f"\n 检测到 CUDA,使用 PyTorch 源: {cuda_index_url}")

    def _install_entry(info):
        """Install a single PACKAGES entry and return pip's success flag."""
        pip_name = info["pip"]

        if pip_name in ("torch", "torchaudio"):
            if gpu and cuda_index_url:
                return pip_install(venv_py, pip_name, index_url=cuda_index_url)
            return pip_install(
                venv_py, pip_name, index_url="https://download.pytorch.org/whl/cpu"
            )

        if pip_name == "audio-separator":
            version_spec = f">={info['min_version']}" if info.get("min_version") else ""
            installed = pip_install(
                venv_py,
                pip_name,
                extra="gpu" if gpu else "cpu",
                version_spec=version_spec,
            )
            if not installed:
                return installed
            # audio-separator 0.31+ declares numpy>=2, while the current
            # Gradio 3.x UI stack is pinned to numpy 1.x. The separator
            # model table and runtime imports work with numpy 1.26, so
            # restore the app-compatible numpy line after separator install.
            return pip_install(venv_py, "numpy<2,>=1.23.0")

        return pip_install(venv_py, pip_name)

    print(f"\n开始安装 {len(missing)} 个缺失的依赖...\n")
    failed = []
    for info in missing:
        if not _install_entry(info):
            failed.append(info["name"])

    print("\n" + "=" * 50)
    if failed:
        print(f"安装完成,{len(failed)} 个包失败: {', '.join(failed)}")
        return False
    print("所有依赖安装成功!")
    return True
341
+
342
+
343
def launch_app(venv_py):
    """Run ``run.py`` from the project root with the venv interpreter.

    Blocks until the child process ends. A KeyboardInterrupt during the run
    is caught and reported instead of propagating.

    Args:
        venv_py: Path of the venv interpreter.
    """
    project_root = str(ROOT_DIR)
    run_script = str(ROOT_DIR / "run.py")
    print(f"\n启动应用: {run_script}")
    print("=" * 50)
    launch_cmd = [venv_py, run_script]
    try:
        subprocess.run(launch_cmd, cwd=project_root)
    except KeyboardInterrupt:
        print("\n已停止")
352
+
353
+ # === 主入口 ===
354
+
355
def main():
    """Command-line entry point: ensure the venv, install dependencies, launch.

    Flags:
        --cpu: install CPU builds instead of requiring CUDA.
        --check: only print the dependency report, then return.
        --no-run: install but do not start the application.

    Exits with status 1 when the venv cannot be created or an install fails.
    """
    import argparse

    parser = argparse.ArgumentParser(description="RVC 安装脚本")
    flag_help = (
        ("--cpu", "安装 CPU 版本"),
        ("--check", "仅检查依赖"),
        ("--no-run", "安装后不启动"),
    )
    for flag, help_text in flag_help:
        parser.add_argument(flag, action="store_true", help=help_text)
    args = parser.parse_args()

    banner = "=" * 50
    print(banner)
    print("RVC 安装程序")
    print(banner)

    # Step 1: virtual environment.
    print("\n[1/3] 检查虚拟环境")
    if not create_venv():
        sys.exit(1)
    venv_py = get_venv_python()

    # Step 2: dependencies (report only when --check is given).
    print("\n[2/3] 检查依赖")
    if args.check:
        check_all(venv_py)
        return

    installed = install_all(venv_py, gpu=not args.cpu)
    if not installed:
        print("\n部分依赖安装失败,可尝试手动安装。")
        sys.exit(1)

    # Step 3: launch, unless the caller opted out.
    if args.no_run:
        print("\n安装完成。运行方式:")
        print(f" {venv_py} run.py")
        return

    print("\n[3/3] 启动应用")
    launch_app(venv_py)
393
+
394
+
395
+ if __name__ == "__main__":
396
+ main()
lib/audio.py CHANGED
@@ -2,10 +2,10 @@
2
  """
3
  音频处理模块 - 加载、保存和处理音频文件
4
  """
5
- import numpy as np
6
- import librosa
7
- import soundfile as sf
8
- from typing import Tuple, Optional
9
 
10
 
11
  def load_audio(path: str, sr: int = 16000) -> np.ndarray:
@@ -27,7 +27,7 @@ def load_audio(path: str, sr: int = 16000) -> np.ndarray:
27
  return audio.astype(np.float32)
28
 
29
 
30
- def save_audio(path: str, audio: np.ndarray, sr: int = 48000):
31
  """
32
  保存音频到文件
33
 
@@ -37,52 +37,52 @@ def save_audio(path: str, audio: np.ndarray, sr: int = 48000):
37
  sr: 采样率 (默认 48000)
38
  """
39
  # 确保音频在 [-1, 1] 范围内
40
- audio = np.clip(audio, -1.0, 1.0)
41
- sf.write(path, audio, sr)
42
-
43
-
44
- def soft_clip(
45
- audio: np.ndarray,
46
- threshold: float = 0.9,
47
- ceiling: float = 0.99,
48
- ) -> np.ndarray:
49
- """
50
- 使用平滑软削波抑制峰值,尽量保留主体响度。
51
-
52
- Args:
53
- audio: 输入音频
54
- threshold: 开始压缩的阈值
55
- ceiling: 软削波上限
56
-
57
- Returns:
58
- np.ndarray: 处理后的音频
59
- """
60
- audio = np.asarray(audio, dtype=np.float32)
61
-
62
- if threshold <= 0:
63
- raise ValueError("threshold 必须大于 0")
64
- if ceiling <= threshold:
65
- raise ValueError("ceiling 必须大于 threshold")
66
-
67
- result = audio.copy()
68
- abs_audio = np.abs(result)
69
- mask = abs_audio > threshold
70
- if not np.any(mask):
71
- return result
72
-
73
- overshoot = (abs_audio[mask] - threshold) / (ceiling - threshold + 1e-8)
74
- compressed = threshold + (ceiling - threshold) * np.tanh(overshoot)
75
- result[mask] = np.sign(result[mask]) * compressed
76
- return result.astype(np.float32, copy=False)
77
-
78
-
79
- def soft_clip_array(
80
- audio: np.ndarray,
81
- threshold: float = 0.9,
82
- ceiling: float = 0.99,
83
- ) -> np.ndarray:
84
- """软削波数组版本,支持单声道/多声道。"""
85
- return soft_clip(audio, threshold=threshold, ceiling=ceiling)
86
 
87
 
88
  def get_audio_info(path: str) -> dict:
 
2
  """
3
  音频处理模块 - 加载、保存和处理音频文件
4
  """
5
+ import numpy as np
6
+ import librosa
7
+ import soundfile as sf
8
+ from typing import Tuple, Optional
9
 
10
 
11
  def load_audio(path: str, sr: int = 16000) -> np.ndarray:
 
27
  return audio.astype(np.float32)
28
 
29
 
30
+ def save_audio(path: str, audio: np.ndarray, sr: int = 48000):
31
  """
32
  保存音频到文件
33
 
 
37
  sr: 采样率 (默认 48000)
38
  """
39
  # 确保音频在 [-1, 1] 范围内
40
+ audio = np.clip(audio, -1.0, 1.0)
41
+ sf.write(path, audio, sr)
42
+
43
+
44
def soft_clip(
    audio: np.ndarray,
    threshold: float = 0.9,
    ceiling: float = 0.99,
) -> np.ndarray:
    """Tame peaks with a smooth tanh soft clip, leaving quieter samples as-is.

    Samples whose magnitude exceeds ``threshold`` are remapped to
    ``threshold + (ceiling - threshold) * tanh(overshoot)`` with their sign
    preserved; all other samples are copied unchanged.

    Args:
        audio: Input samples; converted to a float32 array.
        threshold: Magnitude above which compression starts (must be > 0).
        ceiling: Upper limit of the compressed magnitude (must be > threshold).

    Returns:
        np.ndarray: A new float32 array; the input is not modified.

    Raises:
        ValueError: If ``threshold <= 0`` or ``ceiling <= threshold``.
    """
    samples = np.asarray(audio, dtype=np.float32)

    if threshold <= 0:
        raise ValueError("threshold 必须大于 0")
    if ceiling <= threshold:
        raise ValueError("ceiling 必须大于 threshold")

    clipped = samples.copy()
    magnitude = np.abs(clipped)
    over = magnitude > threshold
    if np.any(over):
        # The small epsilon guards the division; tanh keeps the result
        # below the ceiling.
        excess = (magnitude[over] - threshold) / (ceiling - threshold + 1e-8)
        squashed = threshold + (ceiling - threshold) * np.tanh(excess)
        clipped[over] = np.sign(clipped[over]) * squashed
        clipped = clipped.astype(np.float32, copy=False)
    return clipped
77
+
78
+
79
def soft_clip_array(
    audio: np.ndarray,
    threshold: float = 0.9,
    ceiling: float = 0.99,
) -> np.ndarray:
    """Array-oriented alias of ``soft_clip`` for mono or multi-channel audio.

    Forwards all arguments unchanged and returns ``soft_clip``'s result.
    """
    clip_settings = {"threshold": threshold, "ceiling": ceiling}
    return soft_clip(audio, **clip_settings)
86
 
87
 
88
  def get_audio_info(path: str) -> dict:
lib/audio_metrics.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Reference-based audio metrics for separation/cover evaluation."""
3
+ from __future__ import annotations
4
+
5
+ from typing import Mapping
6
+
7
+ import numpy as np
8
+
9
+
10
+ EPS = 1e-10
11
+
12
+
13
+ def _as_mono_float(audio: np.ndarray) -> np.ndarray:
14
+ arr = np.asarray(audio, dtype=np.float64)
15
+ if arr.ndim == 2:
16
+ arr = np.mean(arr, axis=1)
17
+ return arr.reshape(-1)
18
+
19
+
20
def _align_pair(reference: np.ndarray, estimate: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Return mono versions of both signals truncated to their common length.

    Raises:
        ValueError: If either signal is empty after mono conversion.
    """
    mono_ref, mono_est = _as_mono_float(reference), _as_mono_float(estimate)
    common = min(mono_ref.size, mono_est.size)
    if common <= 0:
        raise ValueError("Audio metric received empty reference or estimate.")
    return mono_ref[:common], mono_est[:common]
27
+
28
+
29
+ def _power(audio: np.ndarray) -> float:
30
+ arr = np.asarray(audio, dtype=np.float64).reshape(-1)
31
+ return float(np.sum(arr * arr))
32
+
33
+
34
def _db_ratio(signal_power: float, noise_power: float) -> float:
    """Return ``10 * log10(signal / noise)`` with both powers floored at EPS.

    The floor keeps the ratio finite when either power is zero.
    """
    floored_signal, floored_noise = (
        max(float(power), EPS) for power in (signal_power, noise_power)
    )
    ratio = floored_signal / floored_noise
    return float(10.0 * np.log10(ratio))
38
+
39
+
40
def signal_distortion_ratio(reference: np.ndarray, estimate: np.ndarray) -> float:
    """Scale-dependent SDR in dB: 10 log10(||s||^2 / ||s - shat||^2).

    Both signals are reduced to mono and truncated to a common length first.
    """
    aligned_ref, aligned_est = _align_pair(reference, estimate)
    error = aligned_ref - aligned_est
    return _db_ratio(_power(aligned_ref), _power(error))
44
+
45
+
46
def scale_invariant_signal_distortion_ratio(reference: np.ndarray, estimate: np.ndarray) -> float:
    """SI-SDR in dB between a reference and an estimate.

    Both signals are aligned and mean-centred; the estimate is then split
    into its projection onto the reference (target) and the remainder
    (residual), and the ratio of their energies is returned in dB.

    Raises:
        ValueError: If the centred reference has energy <= EPS.
    """
    aligned_ref, aligned_est = _align_pair(reference, estimate)
    centered_ref = aligned_ref - float(np.mean(aligned_ref))
    centered_est = aligned_est - float(np.mean(aligned_est))

    reference_energy = _power(centered_ref)
    if reference_energy <= EPS:
        raise ValueError("SI-SDR reference is silent.")

    # Optimal gain that projects the estimate onto the reference.
    gain = float(np.dot(centered_est, centered_ref) / (reference_energy + EPS))
    projection = gain * centered_ref
    remainder = centered_est - projection
    return _db_ratio(_power(projection), _power(remainder))
58
+
59
+
60
def signal_to_noise_ratio(reference: np.ndarray, estimate: np.ndarray) -> float:
    """Reconstruction SNR in dB; an alias of ``signal_distortion_ratio``."""
    snr_db = signal_distortion_ratio(reference, estimate)
    return snr_db
63
+
64
+
65
def evaluate_reference_stems(
    references: Mapping[str, np.ndarray],
    estimates: Mapping[str, np.ndarray],
) -> dict:
    """Compute reference-based metrics for every stem in ``references``.

    The caller must supply time-aligned reference stems; without references,
    SI-SDR/SDR cannot be interpreted as source-separation quality.

    Args:
        references: Stem name -> reference audio.
        estimates: Stem name -> estimated audio; must contain every
            reference stem name.

    Returns:
        dict: ``mean_si_sdr``, ``mean_sdr`` and ``mean_snr`` averaged over
        stems, plus ``stems`` mapping each stem name to its own
        ``si_sdr`` / ``sdr`` / ``snr`` values.

    Raises:
        KeyError: If a reference stem has no matching estimate.
        ValueError: If ``references`` is empty.
    """
    scorers = (
        ("si_sdr", scale_invariant_signal_distortion_ratio),
        ("sdr", signal_distortion_ratio),
        ("snr", signal_to_noise_ratio),
    )

    per_stem: dict[str, dict[str, float]] = {}
    for stem_name, reference_audio in references.items():
        if stem_name not in estimates:
            raise KeyError(f"Missing estimated stem for reference: {stem_name}")
        estimate_audio = estimates[stem_name]
        per_stem[stem_name] = {
            label: scorer(reference_audio, estimate_audio)
            for label, scorer in scorers
        }

    if not per_stem:
        raise ValueError("No reference stems were provided.")

    def _average(label):
        """Mean of one metric across all evaluated stems."""
        return float(np.mean([scores[label] for scores in per_stem.values()]))

    return {
        "mean_si_sdr": _average("si_sdr"),
        "mean_sdr": _average("sdr"),
        "mean_snr": _average("snr"),
        "stems": per_stem,
    }
lib/ffmpeg_runtime.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import shutil
5
+ import subprocess
6
+ import sys
7
+ from pathlib import Path
8
+ from typing import MutableMapping, Optional
9
+
10
+
11
+ ROOT_DIR = Path(__file__).resolve().parent.parent
12
+
13
+
14
+ def get_runtime_root(root_dir: Path | str | None = None) -> Path:
15
+ if root_dir is not None:
16
+ return Path(root_dir).expanduser()
17
+ if getattr(sys, "frozen", False):
18
+ return Path(sys.executable).resolve().parent
19
+ return ROOT_DIR
20
+
21
+
22
+ def _normalize_path_key(path: Path | str) -> str:
23
+ return os.path.normcase(os.path.normpath(str(path)))
24
+
25
+
26
def _iter_ffmpeg_bin_candidates(root_dir: Path) -> list[Path]:
    """List directories that may contain ffmpeg, in priority order, deduplicated.

    Order: ``tools/ffmpeg/bin`` and ``tools/ffmpeg`` under ``root_dir``, the
    same two under ``sys._MEIPASS`` when that attribute is set, then the
    directory of the binary reported by ``imageio_ffmpeg`` if it is usable.
    """
    search_roots = [root_dir]
    meipass = getattr(sys, "_MEIPASS", None)
    if meipass:
        search_roots.append(Path(meipass))

    candidates = []
    for base in search_roots:
        ffmpeg_home = base / "tools" / "ffmpeg"
        candidates += [ffmpeg_home / "bin", ffmpeg_home]

    try:
        import imageio_ffmpeg

        candidates.append(Path(imageio_ffmpeg.get_ffmpeg_exe()).resolve().parent)
    except Exception:
        # Best effort: any failure to import or query imageio_ffmpeg is ignored.
        pass

    # setdefault keeps the first candidate seen for each normalized key.
    first_seen = {}
    for candidate in candidates:
        first_seen.setdefault(_normalize_path_key(candidate), candidate)
    return list(first_seen.values())
56
+
57
+
58
def get_ffmpeg_bin_dir(root_dir: Path | str | None = None) -> Optional[Path]:
    """Locate the directory holding the ffmpeg executable.

    Candidate directories are checked first; if none contains ffmpeg, the
    system ``PATH`` is searched.

    Returns:
        Optional[Path]: The containing directory, or None when ffmpeg is
        not found anywhere.
    """
    runtime_root = get_runtime_root(root_dir)
    ffmpeg_name = "ffmpeg.exe" if os.name == "nt" else "ffmpeg"

    bundled_dir = next(
        (
            candidate
            for candidate in _iter_ffmpeg_bin_candidates(runtime_root)
            if (candidate / ffmpeg_name).exists()
        ),
        None,
    )
    if bundled_dir is not None:
        return bundled_dir

    on_path = shutil.which("ffmpeg")
    return Path(on_path).resolve().parent if on_path else None
71
+
72
+
73
+ def _check_executable_runs(executable: Path, label: str) -> None:
74
+ try:
75
+ result = subprocess.run(
76
+ [str(executable), "-version"],
77
+ capture_output=True,
78
+ text=True,
79
+ timeout=10,
80
+ )
81
+ except Exception as exc:
82
+ raise RuntimeError(f"{label} 无法启动: {executable} ({exc})") from exc
83
+
84
+ if result.returncode != 0:
85
+ details = "\n".join(
86
+ part.strip()
87
+ for part in (result.stdout, result.stderr)
88
+ if part and part.strip()
89
+ )
90
+ if not details:
91
+ details = f"退出码: {result.returncode}"
92
+ raise RuntimeError(f"{label} 无法正常运行: {executable}\n{details}")
93
+
94
+
95
def configure_ffmpeg_runtime(
    root_dir: Path | str | None = None,
    env: MutableMapping[str, str] | None = None,
) -> Optional[Path]:
    """Make ffmpeg/ffprobe discoverable through an environment mapping.

    The ffmpeg directory is prepended to ``PATH`` unless already listed.
    ``FFMPEG_BINARY``, ``IMAGEIO_FFMPEG_EXE`` and ``FFPROBE_BINARY`` are set
    only when not already present, after each binary is verified to run.

    Args:
        root_dir: Optional runtime root passed to ``get_ffmpeg_bin_dir``.
        env: Mapping to update; defaults to ``os.environ``.

    Returns:
        Optional[Path]: The ffmpeg directory, or None when ffmpeg is not found.

    Raises:
        RuntimeError: If ffprobe cannot be found, or a located binary fails
            its run check.
    """
    target_env = os.environ if env is None else env
    bin_dir = get_ffmpeg_bin_dir(root_dir=root_dir)
    if bin_dir is None:
        return None

    bin_dir_text = str(bin_dir)
    existing_path = target_env.get("PATH", "")
    wanted_key = _normalize_path_key(bin_dir_text)
    already_listed = any(
        _normalize_path_key(entry) == wanted_key
        for entry in existing_path.split(os.pathsep)
        if entry
    )
    if not already_listed:
        if existing_path:
            target_env["PATH"] = bin_dir_text + os.pathsep + existing_path
        else:
            target_env["PATH"] = bin_dir_text

    on_windows = os.name == "nt"

    ffmpeg_path = bin_dir / ("ffmpeg.exe" if on_windows else "ffmpeg")
    if ffmpeg_path.exists():
        _check_executable_runs(ffmpeg_path, "ffmpeg")
        for variable in ("FFMPEG_BINARY", "IMAGEIO_FFMPEG_EXE"):
            target_env.setdefault(variable, str(ffmpeg_path))

    ffprobe_path = bin_dir / ("ffprobe.exe" if on_windows else "ffprobe")
    if ffprobe_path.exists():
        _check_executable_runs(ffprobe_path, "ffprobe")
        target_env.setdefault("FFPROBE_BINARY", str(ffprobe_path))
        return bin_dir

    # No ffprobe beside ffmpeg: search the (possibly updated) PATH instead.
    resolved_ffprobe = shutil.which("ffprobe", path=target_env.get("PATH", ""))
    if resolved_ffprobe is None:
        raise RuntimeError("未找到 ffprobe。请安装 ffmpeg/ffprobe,或将 ffprobe 放入 tools/ffmpeg/bin。")
    _check_executable_runs(Path(resolved_ffprobe), "ffprobe")
    target_env.setdefault("FFPROBE_BINARY", str(Path(resolved_ffprobe)))
    return bin_dir
lib/logger.py CHANGED
@@ -1,254 +1,254 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- 日志工具模块 - 支持时间戳和颜色输出
4
- """
5
- import sys
6
- import logging
7
- from datetime import datetime
8
-
9
- try:
10
- from colorama import init, Fore, Style, Back
11
- init(autoreset=True) # 初始化 colorama (Windows 兼容), autoreset确保每行重置
12
- COLORAMA_AVAILABLE = True
13
- except ImportError:
14
- COLORAMA_AVAILABLE = False
15
- # 定义空的占位符
16
- class Fore:
17
- LIGHTBLACK_EX = GREEN = YELLOW = RED = CYAN = BLUE = MAGENTA = WHITE = LIGHTGREEN_EX = LIGHTCYAN_EX = LIGHTYELLOW_EX = LIGHTMAGENTA_EX = ""
18
- class Style:
19
- RESET_ALL = BRIGHT = DIM = ""
20
- class Back:
21
- pass
22
-
23
-
24
- class Logger:
25
- """统一日志工具"""
26
-
27
- SAFE_CHAR_MAP = {
28
- "✓": "[OK] ",
29
- "✗": "[X] ",
30
- "→": "->",
31
- "◆": "*",
32
- }
33
-
34
- COLORS = {
35
- "DEBUG": Fore.LIGHTBLACK_EX,
36
- "INFO": Fore.GREEN,
37
- "SUCCESS": Fore.LIGHTGREEN_EX,
38
- "WARNING": Fore.YELLOW,
39
- "ERROR": Fore.RED,
40
- "STEP": Fore.CYAN,
41
- "DETAIL": Fore.LIGHTCYAN_EX,
42
- "PROGRESS": Fore.MAGENTA,
43
- "MODEL": Fore.LIGHTMAGENTA_EX,
44
- "AUDIO": Fore.BLUE,
45
- "CONFIG": Fore.LIGHTYELLOW_EX,
46
- }
47
-
48
- RESET = Style.RESET_ALL
49
- BRIGHT = Style.BRIGHT
50
- DIM = Style.DIM
51
-
52
- # 详细日志开关
53
- verbose = True
54
-
55
- @staticmethod
56
- def _sanitize_console_text(text: str) -> str:
57
- """将不兼容当前终端编码的字符替换为安全文本。"""
58
- sanitized = text
59
- for src, dst in Logger.SAFE_CHAR_MAP.items():
60
- sanitized = sanitized.replace(src, dst)
61
- return sanitized
62
-
63
- @staticmethod
64
- def _emit(text: str):
65
- """安全输出到终端,避免 Windows/GBK 控制台因 Unicode 崩溃。"""
66
- try:
67
- print(text, flush=True)
68
- return
69
- except UnicodeEncodeError:
70
- pass
71
-
72
- fallback = Logger._sanitize_console_text(text)
73
- encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
74
- try:
75
- print(
76
- fallback.encode(encoding, errors="replace").decode(encoding),
77
- flush=True,
78
- )
79
- except Exception:
80
- print(
81
- fallback.encode("ascii", errors="replace").decode("ascii"),
82
- flush=True,
83
- )
84
-
85
- @staticmethod
86
- def _log(level: str, msg: str, force_print: bool = True):
87
- """部日志方法"""
88
- timestamp = datetime.now().strftime("%H:%M:%S")
89
- color = Logger.COLORS.get(level, "")
90
- reset = Logger.RESET
91
-
92
- # 根据级别决定前缀
93
- if level in ("INFO", "STEP", "SUCCESS"):
94
- prefix = ""
95
- elif level == "DETAIL":
96
- prefix = " → "
97
- elif level == "PROGRESS":
98
- prefix = " ◆ "
99
- elif level == "MODEL":
100
- prefix = "[模型] "
101
- elif level == "AUDIO":
102
- prefix = "[音频] "
103
- elif level == "CONFIG":
104
- prefix = "[配置] "
105
- else:
106
- prefix = f"[{level}] "
107
-
108
- output = f"{color}[{timestamp}]{prefix}{msg}{reset}"
109
- Logger._emit(output)
110
-
111
- @staticmethod
112
- def debug(msg: str):
113
- """调试日志 (灰色) - 仅在verbose模式下显示"""
114
- if Logger.verbose:
115
- Logger._log("DEBUG", msg)
116
-
117
- @staticmethod
118
- def info(msg: str):
119
- """信息日志 (绿色)"""
120
- Logger._log("INFO", msg)
121
-
122
- @staticmethod
123
- def success(msg: str):
124
- """成功日志 (亮绿色)"""
125
- Logger._log("SUCCESS", f"✓ {msg}")
126
-
127
- @staticmethod
128
- def warning(msg: str):
129
- """警告日志 (黄色)"""
130
- Logger._log("WARNING", msg)
131
-
132
- @staticmethod
133
- def error(msg: str):
134
- """错误日志 (红色)"""
135
- Logger._log("ERROR", msg)
136
-
137
- @staticmethod
138
- def step(current: int, total: int, msg: str):
139
- """步骤日志 (青色)"""
140
- timestamp = datetime.now().strftime("%H:%M:%S")
141
- color = Logger.COLORS.get("STEP", "")
142
- reset = Logger.RESET
143
- Logger._emit(f"{color}[{timestamp}][{current}/{total}] {msg}{reset}")
144
-
145
- @staticmethod
146
- def detail(msg: str):
147
- """详细日志 (浅青色) - 用于显示处理细节"""
148
- if Logger.verbose:
149
- Logger._log("DETAIL", msg)
150
-
151
- @staticmethod
152
- def progress(msg: str):
153
- """进度日志 (紫色) - 用于显示处理进度"""
154
- Logger._log("PROGRESS", msg)
155
-
156
- @staticmethod
157
- def model(msg: str):
158
- """模型日志 (浅紫色) - 用于模型加载/卸载信息"""
159
- Logger._log("MODEL", msg)
160
-
161
- @staticmethod
162
- def audio(msg: str):
163
- """音频日志 (蓝色) - 用于音频处理信息"""
164
- Logger._log("AUDIO", msg)
165
-
166
- @staticmethod
167
- def config(msg: str):
168
- """配置日志 (浅黄色) - 用于配置信息"""
169
- if Logger.verbose:
170
- Logger._log("CONFIG", msg)
171
-
172
- @staticmethod
173
- def header(msg: str):
174
- """标题日志 (带分隔线)"""
175
- timestamp = datetime.now().strftime("%H:%M:%S")
176
- color = Logger.COLORS.get("INFO", "")
177
- reset = Logger.RESET
178
- Logger._emit(f"{color}[{timestamp}] {'=' * 50}{reset}")
179
- Logger._emit(f"{color}[{timestamp}] {msg}{reset}")
180
- Logger._emit(f"{color}[{timestamp}] {'=' * 50}{reset}")
181
-
182
- @staticmethod
183
- def separator(char: str = "-", length: int = 40):
184
- """分隔线"""
185
- timestamp = datetime.now().strftime("%H:%M:%S")
186
- color = Logger.COLORS.get("DEBUG", "")
187
- reset = Logger.RESET
188
- Logger._emit(f"{color}[{timestamp}] {char * length}{reset}")
189
-
190
- @staticmethod
191
- def set_verbose(enabled: bool):
192
- """设置详细日志模式"""
193
- Logger.verbose = enabled
194
-
195
-
196
- # 便捷实例
197
- log = Logger()
198
-
199
-
200
- # ============ 配置标准 logging 模块使用颜色 ============
201
-
202
- class ColoredFormatter(logging.Formatter):
203
- """为标准logging模块添加颜色支持"""
204
-
205
- LEVEL_COLORS = {
206
- logging.DEBUG: Fore.LIGHTBLACK_EX,
207
- logging.INFO: Fore.GREEN,
208
- logging.WARNING: Fore.YELLOW,
209
- logging.ERROR: Fore.RED,
210
- logging.CRITICAL: Fore.RED + Style.BRIGHT,
211
- }
212
-
213
- def format(self, record):
214
- # 获取颜色
215
- color = self.LEVEL_COLORS.get(record.levelno, "")
216
- reset = Style.RESET_ALL
217
-
218
- # 格式化时间
219
- timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
220
-
221
- # 构建消息
222
- level_name = record.levelname
223
- module_name = record.name
224
-
225
- # 格式化输出
226
- formatted = f"{color}{timestamp} | {level_name} | {module_name} | {record.getMessage()}{reset}"
227
- return formatted
228
-
229
-
230
- def setup_colored_logging(level=logging.INFO):
231
- """配置全局logging使用颜色输出"""
232
- # 获取根logger
233
- root_logger = logging.getLogger()
234
- root_logger.setLevel(level)
235
-
236
- # 移除现有的handlers
237
- for handler in root_logger.handlers[:]:
238
- root_logger.removeHandler(handler)
239
-
240
- # 添加带颜色的handler
241
- console_handler = logging.StreamHandler(sys.stdout)
242
- console_handler.setLevel(level)
243
- console_handler.setFormatter(ColoredFormatter())
244
- root_logger.addHandler(console_handler)
245
-
246
- return root_logger
247
-
248
-
249
- # 自动配置logging颜色
250
- setup_colored_logging(logging.INFO)
251
-
252
- # 抑制第三方库的英文日志
253
- logging.getLogger("faiss").setLevel(logging.WARNING)
254
- logging.getLogger("audio_separator").setLevel(logging.WARNING)
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 日志工具模块 - 支持时间戳和颜色输出
4
+ """
5
+ import sys
6
+ import logging
7
+ from datetime import datetime
8
+
9
+ try:
10
+ from colorama import init, Fore, Style, Back
11
+ init(autoreset=True) # 初始化 colorama (Windows 兼容), autoreset确保每行重置
12
+ COLORAMA_AVAILABLE = True
13
+ except ImportError:
14
+ COLORAMA_AVAILABLE = False
15
+ # 定义空的占位符
16
+ class Fore:
17
+ LIGHTBLACK_EX = GREEN = YELLOW = RED = CYAN = BLUE = MAGENTA = WHITE = LIGHTGREEN_EX = LIGHTCYAN_EX = LIGHTYELLOW_EX = LIGHTMAGENTA_EX = ""
18
+ class Style:
19
+ RESET_ALL = BRIGHT = DIM = ""
20
+ class Back:
21
+ pass
22
+
23
+
24
class Logger:
    """Unified console logger with timestamps, colours and encoding-safe output.

    All methods are static; ``verbose`` is a class-wide switch that gates the
    debug, detail and config channels.
    """

    # Glyphs swapped for ASCII stand-ins when the console cannot encode them.
    SAFE_CHAR_MAP = {
        "✓": "[OK] ",
        "✗": "[X] ",
        "→": "->",
        "◆": "*",
    }

    # Colour prefix per log level.
    COLORS = {
        "DEBUG": Fore.LIGHTBLACK_EX,
        "INFO": Fore.GREEN,
        "SUCCESS": Fore.LIGHTGREEN_EX,
        "WARNING": Fore.YELLOW,
        "ERROR": Fore.RED,
        "STEP": Fore.CYAN,
        "DETAIL": Fore.LIGHTCYAN_EX,
        "PROGRESS": Fore.MAGENTA,
        "MODEL": Fore.LIGHTMAGENTA_EX,
        "AUDIO": Fore.BLUE,
        "CONFIG": Fore.LIGHTYELLOW_EX,
    }

    RESET = Style.RESET_ALL
    BRIGHT = Style.BRIGHT
    DIM = Style.DIM

    # Verbose-logging switch.
    verbose = True

    @staticmethod
    def _sanitize_console_text(text: str) -> str:
        """Replace every glyph listed in SAFE_CHAR_MAP with its ASCII stand-in."""
        cleaned = text
        for glyph, substitute in Logger.SAFE_CHAR_MAP.items():
            cleaned = cleaned.replace(glyph, substitute)
        return cleaned

    @staticmethod
    def _emit(text: str):
        """Print ``text``, degrading gracefully when the console rejects Unicode.

        On UnicodeEncodeError the text is sanitized and re-encoded with the
        stdout encoding (unencodable characters replaced); if that also
        fails, an ASCII-only rendering is printed.
        """
        try:
            print(text, flush=True)
        except UnicodeEncodeError:
            pass
        else:
            return

        fallback = Logger._sanitize_console_text(text)
        encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
        try:
            safe_text = fallback.encode(encoding, errors="replace").decode(encoding)
            print(safe_text, flush=True)
        except Exception:
            ascii_text = fallback.encode("ascii", errors="replace").decode("ascii")
            print(ascii_text, flush=True)

    @staticmethod
    def _log(level: str, msg: str, force_print: bool = True):
        """Internal logging method: format one line for ``level`` and emit it.

        ``force_print`` is accepted but not used by this implementation.
        """
        timestamp = datetime.now().strftime("%H:%M:%S")
        color = Logger.COLORS.get(level, "")
        reset = Logger.RESET

        # Level-specific prefix; unlisted levels fall back to "[LEVEL] ".
        fixed_prefixes = {
            "INFO": "",
            "STEP": "",
            "SUCCESS": "",
            "DETAIL": " → ",
            "PROGRESS": " ◆ ",
            "MODEL": "[模型] ",
            "AUDIO": "[音频] ",
            "CONFIG": "[配置] ",
        }
        prefix = fixed_prefixes.get(level, f"[{level}] ")

        Logger._emit(f"{color}[{timestamp}]{prefix}{msg}{reset}")

    @staticmethod
    def debug(msg: str):
        """Debug log (grey); shown only in verbose mode."""
        if not Logger.verbose:
            return
        Logger._log("DEBUG", msg)

    @staticmethod
    def info(msg: str):
        """Info log (green)."""
        Logger._log("INFO", msg)

    @staticmethod
    def success(msg: str):
        """Success log (bright green), prefixed with a check mark."""
        Logger._log("SUCCESS", f"✓ {msg}")

    @staticmethod
    def warning(msg: str):
        """Warning log (yellow)."""
        Logger._log("WARNING", msg)

    @staticmethod
    def error(msg: str):
        """Error log (red)."""
        Logger._log("ERROR", msg)

    @staticmethod
    def step(current: int, total: int, msg: str):
        """Step log (cyan), rendered as ``[current/total] msg``."""
        timestamp = datetime.now().strftime("%H:%M:%S")
        color = Logger.COLORS.get("STEP", "")
        reset = Logger.RESET
        line = f"{color}[{timestamp}][{current}/{total}] {msg}{reset}"
        Logger._emit(line)

    @staticmethod
    def detail(msg: str):
        """Detail log (light cyan) for processing details; verbose mode only."""
        if not Logger.verbose:
            return
        Logger._log("DETAIL", msg)

    @staticmethod
    def progress(msg: str):
        """Progress log (magenta)."""
        Logger._log("PROGRESS", msg)

    @staticmethod
    def model(msg: str):
        """Model log (light magenta) for model load/unload messages."""
        Logger._log("MODEL", msg)

    @staticmethod
    def audio(msg: str):
        """Audio log (blue) for audio-processing messages."""
        Logger._log("AUDIO", msg)

    @staticmethod
    def config(msg: str):
        """Config log (light yellow); verbose mode only."""
        if not Logger.verbose:
            return
        Logger._log("CONFIG", msg)

    @staticmethod
    def header(msg: str):
        """Title log: the message framed by two 50-character '=' rules."""
        timestamp = datetime.now().strftime("%H:%M:%S")
        color = Logger.COLORS.get("INFO", "")
        reset = Logger.RESET
        rule = f"{color}[{timestamp}] {'=' * 50}{reset}"
        Logger._emit(rule)
        Logger._emit(f"{color}[{timestamp}] {msg}{reset}")
        Logger._emit(rule)

    @staticmethod
    def separator(char: str = "-", length: int = 40):
        """Emit a separator line of ``char`` repeated ``length`` times."""
        timestamp = datetime.now().strftime("%H:%M:%S")
        color = Logger.COLORS.get("DEBUG", "")
        reset = Logger.RESET
        Logger._emit(f"{color}[{timestamp}] {char * length}{reset}")

    @staticmethod
    def set_verbose(enabled: bool):
        """Enable or disable verbose logging for all callers."""
        Logger.verbose = enabled
194
+
195
+
196
+ # 便捷实例
197
+ log = Logger()
198
+
199
+
200
+ # ============ 配置标准 logging 模块使用颜色 ============
201
+
202
class ColoredFormatter(logging.Formatter):
    """Formatter that adds colour to standard ``logging`` output.

    Lines are rendered as ``<timestamp> | <level> | <logger> | <message>``,
    wrapped in the level's colour code and a trailing reset code.
    """

    # Colour prefix per logging level number.
    LEVEL_COLORS = {
        logging.DEBUG: Fore.LIGHTBLACK_EX,
        logging.INFO: Fore.GREEN,
        logging.WARNING: Fore.YELLOW,
        logging.ERROR: Fore.RED,
        logging.CRITICAL: Fore.RED + Style.BRIGHT,
    }

    def format(self, record):
        """Return the coloured text for ``record``.

        The timestamp is taken from ``record.created`` so it reflects when
        the event was logged rather than when it was formatted. Exception
        and stack information attached to the record are appended on
        following lines, as the base ``logging.Formatter`` does.
        """
        color = self.LEVEL_COLORS.get(record.levelno, "")
        reset = Style.RESET_ALL

        timestamp = datetime.fromtimestamp(record.created).strftime("%Y-%m-%d %H:%M:%S")
        text = f"{timestamp} | {record.levelname} | {record.name} | {record.getMessage()}"

        # Cache the traceback text on the record, mirroring logging.Formatter.
        if record.exc_info and not record.exc_text:
            record.exc_text = self.formatException(record.exc_info)
        if record.exc_text:
            text = f"{text}\n{record.exc_text}"
        if record.stack_info:
            text = f"{text}\n{self.formatStack(record.stack_info)}"

        return f"{color}{text}{reset}"
228
+
229
+
230
def setup_colored_logging(level=logging.INFO):
    """Route root-logger output through one coloured stdout handler.

    All handlers currently attached to the root logger are removed and
    replaced by a ``StreamHandler`` on ``sys.stdout`` using
    ``ColoredFormatter``.

    Args:
        level: Level applied to both the root logger and the new handler.

    Returns:
        logging.Logger: The configured root logger.
    """
    root = logging.getLogger()
    root.setLevel(level)

    # Iterate over a snapshot: removeHandler mutates root.handlers.
    for stale_handler in list(root.handlers):
        root.removeHandler(stale_handler)

    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setLevel(level)
    stdout_handler.setFormatter(ColoredFormatter())
    root.addHandler(stdout_handler)

    return root
247
+
248
+
249
+ # 自动配置logging颜色
250
+ setup_colored_logging(logging.INFO)
251
+
252
+ # 抑制第三方库的英文日志
253
+ logging.getLogger("faiss").setLevel(logging.WARNING)
254
+ logging.getLogger("audio_separator").setLevel(logging.WARNING)
lib/mixer.py CHANGED
@@ -2,13 +2,13 @@
2
  """
3
  混音模块 - 人声与伴奏混合
4
  """
5
- import numpy as np
6
- import librosa
7
- import soundfile as sf
8
- from pathlib import Path
9
- from typing import Optional
10
-
11
- from lib.audio import soft_clip_array
12
 
13
  try:
14
  from lib.logger import log
@@ -152,12 +152,12 @@ def mix_vocals_and_accompaniment(
152
  elif reverb_amount > 0 and log:
153
  log.warning("Pedalboard 不可用,跳过混响")
154
 
155
- vocals = soft_clip_array(vocals * vocals_volume, threshold=0.85, ceiling=0.95)
156
- accompaniment = soft_clip_array(
157
- accompaniment * accompaniment_volume,
158
- threshold=0.85,
159
- ceiling=0.95,
160
- )
161
 
162
  vocals_len = vocals.shape[-1]
163
  accompaniment_len = accompaniment.shape[-1]
@@ -174,18 +174,18 @@ def mix_vocals_and_accompaniment(
174
  vocals = adjust_audio_length(vocals, target_len)
175
  accompaniment = adjust_audio_length(accompaniment, target_len)
176
 
177
- if log:
178
- log.progress("混合音轨...")
179
- mixed = vocals + accompaniment
180
-
181
- max_val = float(np.max(np.abs(mixed)))
182
  if log:
183
  log.detail(f"混合后峰值: {max_val:.4f}")
184
 
185
- mixed = soft_clip_array(mixed, threshold=0.90, ceiling=0.98)
186
- if log:
187
- final_peak = float(np.max(np.abs(mixed)))
188
- log.detail(f"软削波后峰值: {final_peak:.4f}")
189
 
190
  if mixed.ndim == 2:
191
  mixed = mixed.T
 
2
  """
3
  混音模块 - 人声与伴奏混合
4
  """
5
+ import numpy as np
6
+ import librosa
7
+ import soundfile as sf
8
+ from pathlib import Path
9
+ from typing import Optional
10
+
11
+ from lib.audio import soft_clip_array
12
 
13
  try:
14
  from lib.logger import log
 
152
  elif reverb_amount > 0 and log:
153
  log.warning("Pedalboard 不可用,跳过混响")
154
 
155
+ vocals = soft_clip_array(vocals * vocals_volume, threshold=0.85, ceiling=0.95)
156
+ accompaniment = soft_clip_array(
157
+ accompaniment * accompaniment_volume,
158
+ threshold=0.85,
159
+ ceiling=0.95,
160
+ )
161
 
162
  vocals_len = vocals.shape[-1]
163
  accompaniment_len = accompaniment.shape[-1]
 
174
  vocals = adjust_audio_length(vocals, target_len)
175
  accompaniment = adjust_audio_length(accompaniment, target_len)
176
 
177
+ if log:
178
+ log.progress("混合音轨...")
179
+ mixed = vocals + accompaniment
180
+
181
+ max_val = float(np.max(np.abs(mixed)))
182
  if log:
183
  log.detail(f"混合后峰值: {max_val:.4f}")
184
 
185
+ mixed = soft_clip_array(mixed, threshold=0.90, ceiling=0.98)
186
+ if log:
187
+ final_peak = float(np.max(np.abs(mixed)))
188
+ log.detail(f"软削波后峰值: {final_peak:.4f}")
189
 
190
  if mixed.ndim == 2:
191
  mixed = mixed.T
lib/runtime_build.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+ from typing import Dict
6
+
7
+
8
+ ROOT_DIR = Path(__file__).resolve().parent.parent
9
+ WATCHED_FILES = (
10
+ "run.py",
11
+ "app.py",
12
+ "lib/ffmpeg_runtime.py",
13
+ "ui/app.py",
14
+ "infer/cover_pipeline.py",
15
+ "infer/official_adapter.py",
16
+ "infer/quality_policy.py",
17
+ )
18
+
19
+
20
def _existing_watched_files() -> list[Path]:
    """Return the watched files that currently exist under ``ROOT_DIR``.

    Each entry of ``WATCHED_FILES`` is joined onto ``ROOT_DIR``; entries whose
    path does not exist are left out. The order of ``WATCHED_FILES`` is kept.
    """
    candidates = [ROOT_DIR / relative_path for relative_path in WATCHED_FILES]
    return [candidate for candidate in candidates if candidate.exists()]
27
+
28
+
29
def _compute_runtime_build_info() -> Dict[str, str]:
    """Describe the most recently modified watched file.

    Returns:
        A dict with two string entries:

        * ``"timestamp"``: modification time of the newest watched file,
          formatted as ``YYYY-MM-DD HH:MM:SS``.
        * ``"source"``: that file's path relative to ``ROOT_DIR`` in POSIX form,
          or its full path when it is not located under ``ROOT_DIR``.

        Both entries are ``"unknown"`` when no watched file can be inspected.
    """
    # Read each mtime exactly once. A file may disappear or become unreadable
    # between the existence check and stat(); that must not raise here.
    stamped = []
    for path in _existing_watched_files():
        try:
            stamped.append((path.stat().st_mtime, path))
        except OSError:
            continue

    if not stamped:
        return {
            "timestamp": "unknown",
            "source": "unknown",
        }

    # max() keeps the first entry on ties, matching the WATCHED_FILES order.
    latest_mtime, latest = max(stamped, key=lambda entry: entry[0])
    timestamp = datetime.fromtimestamp(latest_mtime).strftime("%Y-%m-%d %H:%M:%S")
    try:
        source = latest.relative_to(ROOT_DIR).as_posix()
    except ValueError:
        # relative_to() raises when the file is not under ROOT_DIR.
        source = str(latest)
    return {
        "timestamp": timestamp,
        "source": source,
    }
47
+
48
+
49
+ RUNTIME_BUILD_INFO = _compute_runtime_build_info()
50
+
51
+
52
def get_runtime_build_label() -> str:
    """Return the full, prefixed label built from ``RUNTIME_BUILD_INFO``."""
    stamp = RUNTIME_BUILD_INFO["timestamp"]
    origin = RUNTIME_BUILD_INFO["source"]
    return f"当前运行代码标记: {stamp} ({origin})"
55
+
56
+
57
def get_runtime_build_short_label() -> str:
    """Return the label from ``RUNTIME_BUILD_INFO`` without the text prefix."""
    stamp = RUNTIME_BUILD_INFO["timestamp"]
    origin = RUNTIME_BUILD_INFO["source"]
    return f"{stamp} ({origin})"
mcp/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ MCP 模块
4
+ """
mcp/server.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ MCP 服务器 - 为 Claude Code 提供 AI 翻唱工具
4
+ """
5
+ import asyncio
6
+ import json
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from mcp.server import Server
11
+ from mcp.server.stdio import stdio_server
12
+ from mcp.types import Tool, TextContent
13
+
14
+ from mcp.tools import (
15
+ list_models,
16
+ convert_voice,
17
+ download_model,
18
+ get_model_status
19
+ )
20
+
21
+ # 创建 MCP 服务器
22
+ server = Server("rvc-voice-conversion")
23
+
24
+
25
@server.list_tools()
async def list_tools() -> list[Tool]:
    """Describe the tools this server exposes.

    Returns:
        One ``Tool`` per tool name handled by ``call_tool``. The
        ``convert_voice`` schema lists every argument that ``call_tool``
        forwards, including the optional tuning parameters.
    """

    def no_arguments() -> dict[str, Any]:
        # Build a fresh schema for each tool so no dict is shared between them.
        return {
            "type": "object",
            "properties": {},
            "required": []
        }

    return [
        Tool(
            name="list_voice_models",
            description="列出所有可用的 RVC 语音模型",
            inputSchema=no_arguments()
        ),
        Tool(
            name="convert_voice",
            description="使用 RVC 模型进行 AI 翻唱",
            inputSchema={
                "type": "object",
                "properties": {
                    "input_path": {
                        "type": "string",
                        "description": "输入音频文件的绝对路径"
                    },
                    "output_path": {
                        "type": "string",
                        "description": "输出音频文件的绝对路径"
                    },
                    "model_name": {
                        "type": "string",
                        "description": "要使用的语音模型名称"
                    },
                    "pitch_shift": {
                        "type": "number",
                        "description": "音调偏移 (半音),正数升调,负数降调",
                        "default": 0
                    },
                    "index_ratio": {
                        "type": "number",
                        "description": "索引混合比率 (0-1)",
                        "default": 0.5
                    },
                    # The three entries below are read by call_tool with these
                    # same defaults; advertising them lets clients discover them.
                    "filter_radius": {
                        "type": "integer",
                        "description": "中值滤波半径",
                        "default": 3
                    },
                    "rms_mix_rate": {
                        "type": "number",
                        "description": "响度混合比率",
                        "default": 0.25
                    },
                    "protect": {
                        "type": "number",
                        "description": "保护系数",
                        "default": 0.33
                    }
                },
                "required": ["input_path", "output_path", "model_name"]
            }
        ),
        Tool(
            name="download_base_models",
            description="下载 RVC 所需的基础模型 (HuBERT, RMVPE)",
            inputSchema=no_arguments()
        ),
        Tool(
            name="get_model_status",
            description="获取基础模型的下载状态",
            inputSchema=no_arguments()
        )
    ]
89
+
90
+
91
@server.call_tool()
async def call_tool(name: str, arguments: dict[str, Any]) -> list[TextContent]:
    """Run the tool called ``name`` and return its result as text content.

    Args:
        name: Tool name, as advertised by ``list_tools``.
        arguments: Arguments supplied by the client for that tool.

    Returns:
        A single ``TextContent`` item. For known tools its text is the JSON
        encoding of the tool result; for an unknown tool it is a plain message.
    """

    def as_json_text(payload: Any) -> list[TextContent]:
        # ensure_ascii=False keeps the Chinese messages readable in the output.
        return [TextContent(type="text", text=json.dumps(payload, ensure_ascii=False, indent=2))]

    arguments = arguments or {}

    if name == "list_voice_models":
        models = list_models()
        return as_json_text({
            "models": models,
            "count": len(models)
        })

    if name == "convert_voice":
        # Report missing required arguments in the same shape convert_voice
        # uses for its own failures, instead of letting a bare KeyError escape.
        missing = [
            key for key in ("input_path", "output_path", "model_name")
            if key not in arguments
        ]
        if missing:
            return as_json_text({
                "success": False,
                "output_path": None,
                "error": f"缺少必需参数: {', '.join(missing)}"
            })

        result = convert_voice(
            input_path=arguments["input_path"],
            output_path=arguments["output_path"],
            model_name=arguments["model_name"],
            pitch_shift=arguments.get("pitch_shift", 0),
            index_ratio=arguments.get("index_ratio", 0.5),
            filter_radius=arguments.get("filter_radius", 3),
            rms_mix_rate=arguments.get("rms_mix_rate", 0.25),
            protect=arguments.get("protect", 0.33)
        )
        return as_json_text(result)

    if name == "download_base_models":
        return as_json_text(download_model())

    if name == "get_model_status":
        return as_json_text(get_model_status())

    return [TextContent(type="text", text=f"未知工具: {name}")]
126
+
127
+
128
async def main():
    """Serve the MCP server over the stdio transport until it stops."""
    async with stdio_server() as streams:
        reader, writer = streams
        init_options = server.create_initialization_options()
        await server.run(reader, writer, init_options)
136
+
137
+
138
+ if __name__ == "__main__":
139
+ asyncio.run(main())
mcp/tools.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ MCP 工具函数 - 提供 AI 翻唱功能的工具接口
4
+ """
5
+ import os
6
+ import json
7
+ from pathlib import Path
8
+ from typing import Optional, List, Dict, Any
9
+
10
+ # 项目根目录
11
+ ROOT_DIR = Path(__file__).parent.parent
12
+
13
+
14
def get_config() -> dict:
    """Load ``configs/config.json`` from the project root.

    Returns:
        The parsed configuration after ``_normalize_config``, or an empty dict
        when the file does not exist.
    """
    config_path = ROOT_DIR / "configs" / "config.json"
    if not config_path.exists():
        return {}

    with open(config_path, "r", encoding="utf-8") as handle:
        loaded = json.load(handle)
    return _normalize_config(loaded)
22
+
23
+
24
+ def _normalize_config(config: dict) -> dict:
25
+ """Normalize legacy path keys to top-level entries."""
26
+ if not config:
27
+ return {}
28
+
29
+ paths = config.get("paths", {})
30
+ if "hubert_path" not in config and "hubert" in paths:
31
+ config["hubert_path"] = paths["hubert"]
32
+ if "rmvpe_path" not in config and "rmvpe" in paths:
33
+ config["rmvpe_path"] = paths["rmvpe"]
34
+ if "weights_dir" not in config and "weights" in paths:
35
+ config["weights_dir"] = paths["weights"]
36
+ if "output_dir" not in config and "outputs" in paths:
37
+ config["output_dir"] = paths["outputs"]
38
+ if "temp_dir" not in config and "temp" in paths:
39
+ config["temp_dir"] = paths["temp"]
40
+
41
+ return config
42
+
43
+
44
def list_models() -> List[Dict[str, Any]]:
    """List every voice model found in the configured weights directory.

    The directory is the ``weights_dir`` config entry (default
    ``assets/weights``) joined onto ``ROOT_DIR``.

    Returns:
        Whatever ``infer.pipeline.list_voice_models`` returns for that directory.
    """
    # Imported here rather than at module import time.
    from infer.pipeline import list_voice_models

    weights_dir = ROOT_DIR / get_config().get("weights_dir", "assets/weights")
    return list_voice_models(str(weights_dir))
57
+
58
+
59
def convert_voice(
    input_path: str,
    output_path: str,
    model_name: str,
    pitch_shift: float = 0,
    index_ratio: float = 0.5,
    filter_radius: int = 3,
    rms_mix_rate: float = 0.25,
    protect: float = 0.33
) -> Dict[str, Any]:
    """Convert an audio file with the named voice model.

    When the ``use_official_vc`` config entry is true (the default) the work
    is delegated to ``convert_vocals_official``; otherwise a
    ``VoiceConversionPipeline`` is assembled and run here.

    Args:
        input_path: Path of the audio file to convert.
        output_path: Path the converted audio is written to.
        model_name: Name of a model returned by ``list_voice_models``.
        pitch_shift: Pitch offset in semitones; truncated to ``int`` on the
            official path.
        index_ratio: Index mix ratio.
        filter_radius: Median filter radius.
        rms_mix_rate: Loudness mix ratio.
        protect: Protection coefficient.

    Returns:
        A dict with ``success``, ``output_path`` and ``error``. Any exception
        raised along the way is reported through ``error``, not propagated.
    """

    def failed(message: str) -> Dict[str, Any]:
        return {
            "success": False,
            "output_path": None,
            "error": message
        }

    def succeeded(path) -> Dict[str, Any]:
        return {
            "success": True,
            "output_path": path,
            "error": None
        }

    try:
        from infer.pipeline import VoiceConversionPipeline, list_voice_models

        config = get_config()
        use_official_vc = config.get("use_official_vc", True)

        # Find the requested model among those in the weights directory.
        weights_dir = ROOT_DIR / config.get("weights_dir", "assets/weights")
        available = list_voice_models(str(weights_dir))
        model_info = next(
            (entry for entry in available if entry["name"] == model_name),
            None,
        )
        if model_info is None:
            return failed(f"模型不存在: {model_name}")

        if use_official_vc:
            from infer.official_adapter import convert_vocals_official

            official_result = convert_vocals_official(
                vocals_path=input_path,
                output_path=output_path,
                model_path=model_info["model_path"],
                index_path=model_info.get("index_path"),
                f0_method=config.get("f0_method", "rmvpe"),
                pitch_shift=int(pitch_shift),
                index_rate=index_ratio,
                filter_radius=filter_radius,
                rms_mix_rate=rms_mix_rate,
                protect=protect,
            )
            return succeeded(official_result)

        # Built-in pipeline: create it, then load each component in turn.
        pipeline = VoiceConversionPipeline(device=config.get("device", "cuda"))
        pipeline.hubert_layer = config.get("hubert_layer", 12)

        hubert_path = ROOT_DIR / config.get("hubert_path", "assets/hubert/hubert_base.pt")
        if not hubert_path.exists():
            return failed("HuBERT 模型不存在,请先下载基础模型")
        pipeline.load_hubert(str(hubert_path))

        rmvpe_path = ROOT_DIR / config.get("rmvpe_path", "assets/rmvpe/rmvpe.pt")
        if not rmvpe_path.exists():
            return failed("RMVPE 模型不存在,请先下载基础模型")
        pipeline.load_f0_extractor("rmvpe", str(rmvpe_path))

        pipeline.load_voice_model(model_info["model_path"])

        # The retrieval index is optional.
        if model_info.get("index_path"):
            pipeline.load_index(model_info["index_path"])

        converted_path = pipeline.convert(
            audio_path=input_path,
            output_path=output_path,
            pitch_shift=pitch_shift,
            index_ratio=index_ratio,
            filter_radius=filter_radius,
            rms_mix_rate=rms_mix_rate,
            protect=protect
        )
        return succeeded(converted_path)

    except Exception as e:
        return failed(str(e))
184
+
185
+
186
def download_model(model_name: str = None) -> Dict[str, Any]:
    """Download one model by name, or every required model.

    Args:
        model_name: Key of the model in ``tools.download_models.MODELS``. When
            it is ``None`` or empty, all required models are downloaded.

    Returns:
        A dict with ``success`` and ``error``. Any exception is reported
        through ``error`` instead of being raised.
    """
    try:
        from tools.download_models import (
            download_model as dl_model,
            download_required_models,
            MODELS
        )

        if not model_name:
            succeeded = download_required_models()
        elif model_name in MODELS:
            succeeded = dl_model(model_name)
        else:
            return {
                "success": False,
                "error": f"未知模型: {model_name}"
            }

        error_message = None if succeeded else "下载失败"
        return {
            "success": succeeded,
            "error": error_message
        }

    except Exception as e:
        return {
            "success": False,
            "error": str(e)
        }
223
+
224
+
225
def get_model_status() -> Dict[str, bool]:
    """Report which models are already downloaded.

    Returns:
        The result of ``tools.download_models.check_all_models``; the original
        docstring describes it as a mapping of model name to downloaded flag.
    """
    # Imported here rather than at module import time.
    from tools.download_models import check_all_models as check_models

    status = check_models()
    return status
models/__init__.py CHANGED
@@ -1,7 +1,7 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- 模型定义模块
4
- """
5
- from .rmvpe import RMVPE
6
-
7
- __all__ = ["RMVPE"]
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 模型定义模块
4
+ """
5
+ from .rmvpe import RMVPE
6
+
7
+ __all__ = ["RMVPE"]
models/rmvpe.py CHANGED
@@ -1,439 +1,439 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- RMVPE 模型 - 用于高质量 F0 提取
4
- """
5
- import torch
6
- import torch.nn as nn
7
- import torch.nn.functional as F
8
- import numpy as np
9
- from typing import Optional
10
-
11
-
12
- class BiGRU(nn.Module):
13
- """双向 GRU 层"""
14
-
15
- def __init__(self, input_features: int, hidden_features: int, num_layers: int):
16
- super().__init__()
17
- self.gru = nn.GRU(
18
- input_features,
19
- hidden_features,
20
- num_layers=num_layers,
21
- batch_first=True,
22
- bidirectional=True
23
- )
24
-
25
- def forward(self, x):
26
- return self.gru(x)[0]
27
-
28
-
29
- class ConvBlockRes(nn.Module):
30
- """残差卷积块"""
31
-
32
- def __init__(self, in_channels: int, out_channels: int, momentum: float = 0.01,
33
- force_shortcut: bool = False):
34
- super().__init__()
35
- self.conv = nn.Sequential(
36
- nn.Conv2d(in_channels, out_channels, 3, 1, 1, bias=False),
37
- nn.BatchNorm2d(out_channels, momentum=momentum),
38
- nn.ReLU(),
39
- nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False),
40
- nn.BatchNorm2d(out_channels, momentum=momentum),
41
- nn.ReLU()
42
- )
43
-
44
- # 当通道数不同或强制使用时才创建 shortcut
45
- if in_channels != out_channels or force_shortcut:
46
- self.shortcut = nn.Conv2d(in_channels, out_channels, 1)
47
- self.has_shortcut = True
48
- else:
49
- self.has_shortcut = False
50
-
51
- def forward(self, x):
52
- if self.has_shortcut:
53
- return self.conv(x) + self.shortcut(x)
54
- else:
55
- return self.conv(x) + x
56
-
57
-
58
- class EncoderBlock(nn.Module):
59
- """编码器块 - 包含多个 ConvBlockRes 和一个池化层"""
60
-
61
- def __init__(self, in_channels: int, out_channels: int, kernel_size: int,
62
- n_blocks: int, momentum: float = 0.01):
63
- super().__init__()
64
- self.conv = nn.ModuleList()
65
- self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
66
- for _ in range(n_blocks - 1):
67
- self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
68
- self.pool = nn.AvgPool2d(kernel_size)
69
-
70
- def forward(self, x):
71
- for block in self.conv:
72
- x = block(x)
73
- # 返回池化前的张量用于 skip connection
74
- return self.pool(x), x
75
-
76
-
77
- class Encoder(nn.Module):
78
- """RMVPE 编码器"""
79
-
80
- def __init__(self, in_channels: int, in_size: int, n_encoders: int,
81
- kernel_size: int, n_blocks: int, out_channels: int = 16,
82
- momentum: float = 0.01):
83
- super().__init__()
84
-
85
- self.n_encoders = n_encoders
86
- self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
87
- self.layers = nn.ModuleList()
88
- self.latent_channels = []
89
-
90
- for i in range(n_encoders):
91
- self.layers.append(
92
- EncoderBlock(
93
- in_channels if i == 0 else out_channels * (2 ** (i - 1)),
94
- out_channels * (2 ** i),
95
- kernel_size,
96
- n_blocks,
97
- momentum
98
- )
99
- )
100
- self.latent_channels.append(out_channels * (2 ** i))
101
-
102
- def forward(self, x):
103
- x = self.bn(x)
104
- concat_tensors = []
105
- for layer in self.layers:
106
- x, skip = layer(x)
107
- concat_tensors.append(skip)
108
- return x, concat_tensors
109
-
110
-
111
- class Intermediate(nn.Module):
112
- """中间层"""
113
-
114
- def __init__(self, in_channels: int, out_channels: int, n_inters: int,
115
- n_blocks: int, momentum: float = 0.01):
116
- super().__init__()
117
-
118
- self.layers = nn.ModuleList()
119
- for i in range(n_inters):
120
- if i == 0:
121
- # 第一层: in_channels -> out_channels (256 -> 512)
122
- self.layers.append(
123
- IntermediateBlock(in_channels, out_channels, n_blocks, momentum, first_block_shortcut=True)
124
- )
125
- else:
126
- # 后续层: out_channels -> out_channels (512 -> 512)
127
- self.layers.append(
128
- IntermediateBlock(out_channels, out_channels, n_blocks, momentum, first_block_shortcut=False)
129
- )
130
-
131
- def forward(self, x):
132
- for layer in self.layers:
133
- x = layer(x)
134
- return x
135
-
136
-
137
- class IntermediateBlock(nn.Module):
138
- """中间层块"""
139
-
140
- def __init__(self, in_channels: int, out_channels: int, n_blocks: int,
141
- momentum: float = 0.01, first_block_shortcut: bool = False):
142
- super().__init__()
143
- self.conv = nn.ModuleList()
144
- # 第一个块可能需要强制使用 shortcut
145
- self.conv.append(ConvBlockRes(in_channels, out_channels, momentum, force_shortcut=first_block_shortcut))
146
- for _ in range(n_blocks - 1):
147
- self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
148
-
149
- def forward(self, x):
150
- for block in self.conv:
151
- x = block(x)
152
- return x
153
-
154
-
155
- class DecoderBlock(nn.Module):
156
- """解码器块"""
157
-
158
- def __init__(self, in_channels: int, out_channels: int, stride: int,
159
- n_blocks: int, momentum: float = 0.01):
160
- super().__init__()
161
- # conv1: 转置卷积 + BatchNorm (kernel_size=3, stride=stride, padding=1, output_padding=1)
162
- self.conv1 = nn.Sequential(
163
- nn.ConvTranspose2d(in_channels, out_channels, 3, stride, padding=1, output_padding=1, bias=False),
164
- nn.BatchNorm2d(out_channels, momentum=momentum)
165
- )
166
- # conv2: ConvBlockRes 列表
167
- # 第一个块: in_channels = out_channels * 2 (concat 后), out_channels = out_channels
168
- # 后续块: in_channels = out_channels, out_channels = out_channels
169
- self.conv2 = nn.ModuleList()
170
- self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
171
- for _ in range(n_blocks - 1):
172
- self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
173
-
174
- def forward(self, x, concat_tensor):
175
- x = self.conv1(x)
176
- # 处理尺寸不匹配:填充较小的张量使其匹配较大的
177
- diff_h = concat_tensor.size(2) - x.size(2)
178
- diff_w = concat_tensor.size(3) - x.size(3)
179
- if diff_h != 0 or diff_w != 0:
180
- # 填充 x 使其与 concat_tensor 尺寸匹配
181
- x = F.pad(x, [0, diff_w, 0, diff_h])
182
- x = torch.cat([x, concat_tensor], dim=1)
183
- for block in self.conv2:
184
- x = block(x)
185
- return x
186
-
187
-
188
- class Decoder(nn.Module):
189
- """RMVPE 解码器"""
190
-
191
- def __init__(self, in_channels: int, n_decoders: int, stride: int,
192
- n_blocks: int, out_channels: int = 16, momentum: float = 0.01):
193
- super().__init__()
194
-
195
- self.layers = nn.ModuleList()
196
- for i in range(n_decoders):
197
- out_ch = out_channels * (2 ** (n_decoders - 1 - i))
198
- in_ch = in_channels if i == 0 else out_channels * (2 ** (n_decoders - i))
199
- self.layers.append(
200
- DecoderBlock(in_ch, out_ch, stride, n_blocks, momentum)
201
- )
202
-
203
- def forward(self, x, concat_tensors):
204
- for i, layer in enumerate(self.layers):
205
- x = layer(x, concat_tensors[-1 - i])
206
- return x
207
-
208
-
209
- class DeepUnet(nn.Module):
210
- """Deep U-Net 架构"""
211
-
212
- def __init__(self, kernel_size: int, n_blocks: int, en_de_layers: int = 5,
213
- inter_layers: int = 4, in_channels: int = 1, en_out_channels: int = 16):
214
- super().__init__()
215
-
216
- # Encoder 输出通道: en_out_channels * 2^(en_de_layers-1) = 16 * 16 = 256
217
- encoder_out_channels = en_out_channels * (2 ** (en_de_layers - 1))
218
- # Intermediate 输出通道: encoder_out_channels * 2 = 512
219
- intermediate_out_channels = encoder_out_channels * 2
220
-
221
- self.encoder = Encoder(
222
- in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
223
- )
224
- self.intermediate = Intermediate(
225
- encoder_out_channels,
226
- intermediate_out_channels,
227
- inter_layers, n_blocks
228
- )
229
- self.decoder = Decoder(
230
- intermediate_out_channels,
231
- en_de_layers, kernel_size, n_blocks, en_out_channels
232
- )
233
-
234
- def forward(self, x):
235
- x, concat_tensors = self.encoder(x)
236
- x = self.intermediate(x)
237
- x = self.decoder(x, concat_tensors)
238
- return x
239
-
240
-
241
- class E2E(nn.Module):
242
- """端到端 RMVPE 模型"""
243
-
244
- def __init__(self, n_blocks: int, n_gru: int, kernel_size: int,
245
- en_de_layers: int = 5, inter_layers: int = 4,
246
- in_channels: int = 1, en_out_channels: int = 16):
247
- super().__init__()
248
-
249
- self.unet = DeepUnet(
250
- kernel_size, n_blocks, en_de_layers, inter_layers,
251
- in_channels, en_out_channels
252
- )
253
- self.cnn = nn.Conv2d(en_out_channels, 3, 3, 1, 1)
254
-
255
- if n_gru:
256
- self.fc = nn.Sequential(
257
- BiGRU(3 * 128, 256, n_gru),
258
- nn.Linear(512, 360),
259
- nn.Dropout(0.25),
260
- nn.Sigmoid()
261
- )
262
- else:
263
- self.fc = nn.Sequential(
264
- nn.Linear(3 * 128, 360),
265
- nn.Dropout(0.25),
266
- nn.Sigmoid()
267
- )
268
-
269
- def forward(self, mel):
270
- # 输入 mel: [B, 128, T] 或 [B, 1, 128, T]
271
- # 官方实现期望 [B, 1, T, 128],即 time 在 height,mel bins 在 width
272
- if mel.dim() == 3:
273
- # [B, 128, T] -> [B, T, 128] -> [B, 1, T, 128]
274
- mel = mel.transpose(-1, -2).unsqueeze(1)
275
- elif mel.dim() == 4 and mel.shape[1] == 1:
276
- # [B, 1, 128, T] -> [B, 1, T, 128]
277
- mel = mel.transpose(-1, -2)
278
-
279
- x = self.unet(mel)
280
- x = self.cnn(x)
281
- # x shape: (batch, 3, T, 128)
282
- # 转换为 (batch, T, 384) 其中 384 = 3 * 128
283
- x = x.transpose(1, 2).flatten(-2) # (batch, T, 384)
284
- x = self.fc(x)
285
- return x
286
-
287
-
288
- class MelSpectrogram(nn.Module):
289
- """Mel 频谱提取"""
290
-
291
- def __init__(self, n_mel: int = 128, n_fft: int = 1024, win_size: int = 1024,
292
- hop_length: int = 160, sample_rate: int = 16000,
293
- fmin: int = 30, fmax: int = 8000):
294
- super().__init__()
295
-
296
- self.n_fft = n_fft
297
- self.hop_length = hop_length
298
- self.win_size = win_size
299
- self.sample_rate = sample_rate
300
- self.n_mel = n_mel
301
-
302
- # 创建 Mel 滤波器组
303
- mel_basis = self._mel_filterbank(sample_rate, n_fft, n_mel, fmin, fmax)
304
- self.register_buffer("mel_basis", mel_basis)
305
- self.register_buffer("window", torch.hann_window(win_size))
306
-
307
- def _mel_filterbank(self, sr, n_fft, n_mels, fmin, fmax):
308
- """创建 Mel 滤波器组"""
309
- import librosa
310
- # 必须使用 htk=True,与官方 RVC RMVPE 保持一致
311
- mel = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=True)
312
- return torch.from_numpy(mel).float()
313
-
314
- def forward(self, audio):
315
- # STFT
316
- spec = torch.stft(
317
- audio,
318
- self.n_fft,
319
- hop_length=self.hop_length,
320
- win_length=self.win_size,
321
- window=self.window,
322
- center=True,
323
- pad_mode="reflect",
324
- normalized=False,
325
- onesided=True,
326
- return_complex=True
327
- )
328
- # 使用功率谱(幅度的平方),与官方 RMVPE 一致
329
- spec = torch.abs(spec) ** 2
330
-
331
- # Mel 变换
332
- mel = torch.matmul(self.mel_basis, spec)
333
- mel = torch.log(torch.clamp(mel, min=1e-5))
334
-
335
- return mel
336
-
337
-
338
- class RMVPE:
339
- """RMVPE F0 提取器封装类"""
340
-
341
- def __init__(self, model_path: str, device: str = "cuda"):
342
- self.device = device
343
-
344
- # 加载模型
345
- self.model = E2E(n_blocks=4, n_gru=1, kernel_size=2)
346
- ckpt = torch.load(model_path, map_location="cpu", weights_only=False)
347
- self.model.load_state_dict(ckpt)
348
- self.model = self.model.to(device).eval()
349
-
350
- # Mel 频谱提取器
351
- self.mel_extractor = MelSpectrogram().to(device)
352
-
353
- # 频率映射
354
- cents_mapping = 20 * np.arange(360) + 1997.3794084376191
355
- self.cents_mapping = np.pad(cents_mapping, (4, 4))
356
-
357
- @torch.no_grad()
358
- def infer_from_audio(self, audio: np.ndarray, thred: float = 0.03) -> np.ndarray:
359
- """
360
- 从音频提取 F0
361
-
362
- Args:
363
- audio: 16kHz 音频数据
364
- thred: 置信度阈值
365
-
366
- Returns:
367
- np.ndarray: F0 序列
368
- """
369
- # 转换为张量
370
- audio = torch.from_numpy(audio).float().to(self.device)
371
- if audio.dim() == 1:
372
- audio = audio.unsqueeze(0)
373
-
374
- # 提取 Mel 频谱: [B, 128, T]
375
- mel = self.mel_extractor(audio)
376
-
377
- # 记录原始帧数
378
- n_frames = mel.shape[-1]
379
-
380
- # 填充时间维度使其可被 32 整除(5 层池化,每层 /2)
381
- n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
382
- if n_pad > 0:
383
- mel = F.pad(mel, (0, n_pad), mode='constant', value=0)
384
-
385
- # 模型推理 - E2E.forward 会处理 transpose
386
- hidden = self.model(mel)
387
-
388
- # 移除填充部分,只保留原始帧数
389
- hidden = hidden[:, :n_frames, :]
390
- hidden = hidden.squeeze(0).cpu().numpy()
391
-
392
- # 解码 F0
393
- f0 = self._decode(hidden, thred)
394
-
395
- return f0
396
-
397
- def _decode(self, hidden: np.ndarray, thred: float) -> np.ndarray:
398
- """解码隐藏状态为 F0 - 使用官方 RVC 算法"""
399
- # 使用官方的 to_local_average_cents 算法
400
- cents = self._to_local_average_cents(hidden, thred)
401
-
402
- # 转换 cents 到 Hz
403
- f0 = 10 * (2 ** (cents / 1200))
404
- f0[f0 == 10] = 0 # cents=0 时 f0=10,需要置零
405
-
406
- return f0
407
-
408
- def _to_local_average_cents(self, salience: np.ndarray, thred: float) -> np.ndarray:
409
- """官方 RVC 的 to_local_average_cents 算法"""
410
- # Step 1: 找到每帧的峰值 bin
411
- center = np.argmax(salience, axis=1) # [T]
412
-
413
- # Step 2: 对 salience 进行 padding
414
- salience = np.pad(salience, ((0, 0), (4, 4))) # [T, 368]
415
- center += 4 # 调整 center 索引
416
-
417
- # Step 3: 提取峰值附近 9 个 bin 的窗口并计算加权平均
418
- todo_salience = []
419
- todo_cents_mapping = []
420
- starts = center - 4
421
- ends = center + 5
422
-
423
- for idx in range(salience.shape[0]):
424
- todo_salience.append(salience[idx, starts[idx]:ends[idx]])
425
- todo_cents_mapping.append(self.cents_mapping[starts[idx]:ends[idx]])
426
-
427
- todo_salience = np.array(todo_salience) # [T, 9]
428
- todo_cents_mapping = np.array(todo_cents_mapping) # [T, 9]
429
-
430
- # Step 4: 加权平均
431
- product_sum = np.sum(todo_salience * todo_cents_mapping, axis=1)
432
- weight_sum = np.sum(todo_salience, axis=1) + 1e-9
433
- cents = product_sum / weight_sum
434
-
435
- # Step 5: 阈值���滤 - 使用原始 salience 的最大值
436
- maxx = np.max(salience, axis=1)
437
- cents[maxx <= thred] = 0
438
-
439
- return cents
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ RMVPE 模型 - 用于高质量 F0 提取
4
+ """
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import numpy as np
9
+ from typing import Optional
10
+
11
+
12
class BiGRU(nn.Module):
    """Bidirectional, batch-first GRU that returns only its output sequence."""

    def __init__(self, input_features: int, hidden_features: int, num_layers: int):
        super().__init__()
        recurrent_options = {
            "num_layers": num_layers,
            "batch_first": True,
            "bidirectional": True,
        }
        self.gru = nn.GRU(input_features, hidden_features, **recurrent_options)

    def forward(self, x):
        # nn.GRU returns (output, h_n); the final hidden state is discarded.
        output, _final_hidden = self.gru(x)
        return output
27
+
28
+
29
class ConvBlockRes(nn.Module):
    """Residual block of two 3x3 conv + batch-norm + ReLU stages.

    The skip path is a 1x1 convolution when the channel count changes or when
    ``force_shortcut`` is set; otherwise the input is added back unchanged.
    """

    def __init__(self, in_channels: int, out_channels: int, momentum: float = 0.01,
                 force_shortcut: bool = False):
        super().__init__()

        # Both stages output out_channels; only the first one reads in_channels.
        stages = []
        for stage_in in (in_channels, out_channels):
            stages.append(nn.Conv2d(stage_in, out_channels, 3, 1, 1, bias=False))
            stages.append(nn.BatchNorm2d(out_channels, momentum=momentum))
            stages.append(nn.ReLU())
        self.conv = nn.Sequential(*stages)

        # The projection is created only when needed, so no "shortcut"
        # parameters are registered for plain identity skips.
        self.has_shortcut = bool(in_channels != out_channels or force_shortcut)
        if self.has_shortcut:
            self.shortcut = nn.Conv2d(in_channels, out_channels, 1)

    def forward(self, x):
        main_path = self.conv(x)
        skip_path = self.shortcut(x) if self.has_shortcut else x
        return main_path + skip_path
56
+
57
+
58
class EncoderBlock(nn.Module):
    """Encoder stage: a stack of ``ConvBlockRes`` followed by average pooling."""

    def __init__(self, in_channels: int, out_channels: int, kernel_size: int,
                 n_blocks: int, momentum: float = 0.01):
        super().__init__()
        # The first block changes the channel count; any further blocks keep it.
        block_inputs = [in_channels] + [out_channels] * (n_blocks - 1)
        self.conv = nn.ModuleList(
            [ConvBlockRes(block_in, out_channels, momentum) for block_in in block_inputs]
        )
        self.pool = nn.AvgPool2d(kernel_size)

    def forward(self, x):
        features = x
        for block in self.conv:
            features = block(features)
        # Also return the un-pooled features, used as the skip connection.
        return self.pool(features), features
75
+
76
+
77
class Encoder(nn.Module):
    """RMVPE encoder: input batch norm followed by ``n_encoders`` encoder blocks.

    Block ``i`` outputs ``out_channels * 2 ** i`` channels. ``in_size`` is
    accepted but not used by this implementation.
    """

    def __init__(self, in_channels: int, in_size: int, n_encoders: int,
                 kernel_size: int, n_blocks: int, out_channels: int = 16,
                 momentum: float = 0.01):
        super().__init__()

        self.n_encoders = n_encoders
        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
        self.layers = nn.ModuleList()
        self.latent_channels = []

        stage_in = in_channels
        for depth in range(n_encoders):
            stage_out = out_channels * (2 ** depth)
            self.layers.append(
                EncoderBlock(stage_in, stage_out, kernel_size, n_blocks, momentum)
            )
            self.latent_channels.append(stage_out)
            # The next block consumes what this one produced.
            stage_in = stage_out

    def forward(self, x):
        x = self.bn(x)
        skips = []
        for layer in self.layers:
            x, before_pool = layer(x)
            skips.append(before_pool)
        return x, skips
109
+
110
+
111
class Intermediate(nn.Module):
    """Bottleneck made of ``n_inters`` sequential ``IntermediateBlock`` layers.

    Only the first layer maps ``in_channels`` to ``out_channels`` and requests
    a forced shortcut; the remaining layers keep ``out_channels``.
    """

    def __init__(self, in_channels: int, out_channels: int, n_inters: int,
                 n_blocks: int, momentum: float = 0.01):
        super().__init__()

        self.layers = nn.ModuleList()
        for index in range(n_inters):
            is_first = index == 0
            layer_in = in_channels if is_first else out_channels
            self.layers.append(
                IntermediateBlock(
                    layer_in, out_channels, n_blocks, momentum,
                    first_block_shortcut=is_first,
                )
            )

    def forward(self, x):
        hidden = x
        for layer in self.layers:
            hidden = layer(hidden)
        return hidden
135
+
136
+
137
class IntermediateBlock(nn.Module):
    """One bottleneck layer: ``n_blocks`` residual conv blocks applied in sequence."""

    def __init__(self, in_channels: int, out_channels: int, n_blocks: int,
                 momentum: float = 0.01, first_block_shortcut: bool = False):
        super().__init__()
        # The leading block performs the channel change and may be asked to
        # always use a shortcut projection; the rest keep the channel count.
        blocks = [
            ConvBlockRes(in_channels, out_channels, momentum,
                         force_shortcut=first_block_shortcut)
        ]
        blocks.extend(
            ConvBlockRes(out_channels, out_channels, momentum)
            for _ in range(n_blocks - 1)
        )
        self.conv = nn.ModuleList(blocks)

    def forward(self, x):
        """Run the residual blocks one after another."""
        hidden = x
        for res_block in self.conv:
            hidden = res_block(hidden)
        return hidden
153
+
154
+
155
class DecoderBlock(nn.Module):
    """One decoder stage: transposed-conv upsampling, skip concat, residual convs."""

    def __init__(self, in_channels: int, out_channels: int, stride: int,
                 n_blocks: int, momentum: float = 0.01):
        super().__init__()
        # conv1: transposed convolution (kernel 3, padding 1, output_padding 1)
        # followed by batch normalisation.
        upsample = nn.ConvTranspose2d(
            in_channels, out_channels, 3, stride,
            padding=1, output_padding=1, bias=False,
        )
        self.conv1 = nn.Sequential(
            upsample,
            nn.BatchNorm2d(out_channels, momentum=momentum),
        )
        # conv2: the first block sees the concatenation of the upsampled
        # tensor and the skip tensor (2 * out_channels); later blocks keep
        # out_channels.
        self.conv2 = nn.ModuleList()
        self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
        for _ in range(n_blocks - 1):
            self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))

    def forward(self, x, concat_tensor):
        """Upsample ``x``, align it with ``concat_tensor``, concatenate and refine."""
        up = self.conv1(x)
        # Spatial sizes can disagree after upsampling; pad ``up`` on the
        # bottom/right by the difference so it matches the skip tensor.
        extra_h = concat_tensor.size(2) - up.size(2)
        extra_w = concat_tensor.size(3) - up.size(3)
        if not (extra_h == 0 and extra_w == 0):
            up = F.pad(up, [0, extra_w, 0, extra_h])
        merged = torch.cat([up, concat_tensor], dim=1)
        for res_block in self.conv2:
            merged = res_block(merged)
        return merged
186
+
187
+
188
class Decoder(nn.Module):
    """RMVPE decoder: a stack of ``DecoderBlock`` stages that halve the channels each step."""

    def __init__(self, in_channels: int, n_decoders: int, stride: int,
                 n_blocks: int, out_channels: int = 16, momentum: float = 0.01):
        super().__init__()

        self.layers = nn.ModuleList()
        for step in range(n_decoders):
            stage_out = out_channels * (2 ** (n_decoders - 1 - step))
            # After the first stage, each stage consumes the previous stage's
            # output, which has twice as many channels as it produces.
            stage_in = in_channels if step == 0 else stage_out * 2
            self.layers.append(
                DecoderBlock(stage_in, stage_out, stride, n_blocks, momentum)
            )

    def forward(self, x, concat_tensors):
        """Decode ``x`` using the encoder skips, consumed deepest-first."""
        hidden = x
        for step, stage in enumerate(self.layers):
            # ``~step`` == ``-1 - step``: walk the skip list from the end.
            hidden = stage(hidden, concat_tensors[~step])
        return hidden
207
+
208
+
209
class DeepUnet(nn.Module):
    """Deep U-Net: encoder -> intermediate bottleneck -> decoder with skip connections."""

    def __init__(self, kernel_size: int, n_blocks: int, en_de_layers: int = 5,
                 inter_layers: int = 4, in_channels: int = 1, en_out_channels: int = 16):
        super().__init__()

        # Channels leaving the encoder: en_out_channels * 2**(en_de_layers - 1)
        # (256 with the defaults); the bottleneck doubles that (512).
        bottleneck_in = en_out_channels * (2 ** (en_de_layers - 1))
        bottleneck_out = bottleneck_in * 2

        self.encoder = Encoder(
            in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
        )
        self.intermediate = Intermediate(
            bottleneck_in, bottleneck_out, inter_layers, n_blocks
        )
        # ``kernel_size`` doubles as the decoder's upsampling stride.
        self.decoder = Decoder(
            bottleneck_out, en_de_layers, kernel_size, n_blocks, en_out_channels
        )

    def forward(self, x):
        """Run the full U-Net and return the decoded feature map."""
        encoded, skips = self.encoder(x)
        bottleneck = self.intermediate(encoded)
        return self.decoder(bottleneck, skips)
239
+
240
+
241
class E2E(nn.Module):
    """End-to-end RMVPE network: U-Net, 3-channel conv head, then a per-frame classifier.

    The classifier emits 360 sigmoid activations per frame. When ``n_gru`` is
    truthy a ``BiGRU`` precedes the linear layer; otherwise the linear layer
    is applied directly to the flattened conv output.
    """

    def __init__(self, n_blocks: int, n_gru: int, kernel_size: int,
                 en_de_layers: int = 5, inter_layers: int = 4,
                 in_channels: int = 1, en_out_channels: int = 16):
        super().__init__()

        self.unet = DeepUnet(
            kernel_size, n_blocks, en_de_layers, inter_layers,
            in_channels, en_out_channels
        )
        self.cnn = nn.Conv2d(en_out_channels, 3, 3, 1, 1)

        if n_gru:
            head = [BiGRU(3 * 128, 256, n_gru), nn.Linear(512, 360)]
        else:
            head = [nn.Linear(3 * 128, 360)]
        self.fc = nn.Sequential(*head, nn.Dropout(0.25), nn.Sigmoid())

    def forward(self, mel):
        """Map a mel spectrogram to per-frame salience.

        Accepts ``[B, 128, T]`` or ``[B, 1, 128, T]``; both are rearranged to
        ``[B, 1, T, 128]`` (time on the height axis, mel bins on the width
        axis) before entering the U-Net. Other layouts are passed through
        unchanged.
        """
        if mel.dim() == 3:
            # [B, 128, T] -> [B, T, 128] -> [B, 1, T, 128]
            mel = mel.transpose(-1, -2).unsqueeze(1)
        elif mel.dim() == 4 and mel.shape[1] == 1:
            # [B, 1, 128, T] -> [B, 1, T, 128]
            mel = mel.transpose(-1, -2)

        features = self.cnn(self.unet(mel))
        # (batch, 3, T, 128) -> (batch, T, 3, 128) -> (batch, T, 384)
        per_frame = features.transpose(1, 2).flatten(-2)
        return self.fc(per_frame)
286
+
287
+
288
class MelSpectrogram(nn.Module):
    """Log-mel spectrogram front end built on ``torch.stft``.

    The mel filter bank and the Hann window are registered as buffers so they
    move with the module across devices.
    """

    def __init__(self, n_mel: int = 128, n_fft: int = 1024, win_size: int = 1024,
                 hop_length: int = 160, sample_rate: int = 16000,
                 fmin: int = 30, fmax: int = 8000):
        super().__init__()

        self.n_fft = n_fft
        self.hop_length = hop_length
        self.win_size = win_size
        self.sample_rate = sample_rate
        self.n_mel = n_mel

        # Precompute the mel filter bank and analysis window once.
        self.register_buffer(
            "mel_basis",
            self._mel_filterbank(sample_rate, n_fft, n_mel, fmin, fmax),
        )
        self.register_buffer("window", torch.hann_window(win_size))

    def _mel_filterbank(self, sr, n_fft, n_mels, fmin, fmax):
        """Build the mel filter bank as a float32 tensor via librosa."""
        import librosa
        # htk=True is required; the original author notes this keeps the
        # filter bank aligned with the official RVC RMVPE.
        bank = librosa.filters.mel(
            sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=True
        )
        return torch.from_numpy(bank).float()

    def forward(self, audio):
        """Return ``log(clamp(mel_basis @ |STFT(audio)|**2, min=1e-5))``."""
        stft = torch.stft(
            audio,
            self.n_fft,
            hop_length=self.hop_length,
            win_length=self.win_size,
            window=self.window,
            center=True,
            pad_mode="reflect",
            normalized=False,
            onesided=True,
            return_complex=True
        )
        # Power spectrum (squared magnitude).
        power = torch.abs(stft) ** 2

        # Project onto the mel bands, then take a floored logarithm.
        mel_power = torch.matmul(self.mel_basis, power)
        return torch.log(torch.clamp(mel_power, min=1e-5))
336
+
337
+
338
class RMVPE:
    """Convenience wrapper that turns 16 kHz audio into an F0 contour with RMVPE."""

    def __init__(self, model_path: str, device: str = "cuda"):
        """Load the network weights and prepare the mel front end.

        Args:
            model_path: Path to the checkpoint holding the ``E2E`` state dict.
            device: Torch device string used for the model and mel extractor.
        """
        self.device = device

        # Build the network and load its weights.
        self.model = E2E(n_blocks=4, n_gru=1, kernel_size=2)
        # SECURITY NOTE: weights_only=False lets torch.load unpickle arbitrary
        # objects, so only load checkpoints from a trusted source.
        ckpt = torch.load(model_path, map_location="cpu", weights_only=False)
        self.model.load_state_dict(ckpt)
        self.model = self.model.to(device).eval()

        # Mel spectrogram extractor.
        self.mel_extractor = MelSpectrogram().to(device)

        # Cents value of each of the 360 salience bins, zero-padded by 4 on
        # both sides so a 9-bin window around any bin stays in range.
        cents_mapping = 20 * np.arange(360) + 1997.3794084376191
        self.cents_mapping = np.pad(cents_mapping, (4, 4))

    @torch.no_grad()
    def infer_from_audio(self, audio: np.ndarray, thred: float = 0.03) -> np.ndarray:
        """Extract F0 from audio.

        Args:
            audio: 16 kHz audio samples.
            thred: Confidence threshold; frames whose peak salience does not
                exceed it are reported as 0 Hz.

        Returns:
            np.ndarray: F0 sequence in Hz, one value per mel frame.
        """
        # Move the samples to the target device as a [B, N] float tensor.
        audio = torch.from_numpy(audio).float().to(self.device)
        if audio.dim() == 1:
            audio = audio.unsqueeze(0)

        # Mel spectrogram: [B, 128, T]
        mel = self.mel_extractor(audio)

        # Remember the real frame count so padding can be trimmed afterwards.
        n_frames = mel.shape[-1]

        # Pad time up to a multiple of 32 (5 pooling stages, each halving).
        n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
        if n_pad > 0:
            mel = F.pad(mel, (0, n_pad), mode='constant', value=0)

        # E2E.forward takes care of the axis transposition.
        hidden = self.model(mel)

        # Drop the padded frames and hand off to NumPy.
        hidden = hidden[:, :n_frames, :]
        hidden = hidden.squeeze(0).cpu().numpy()

        return self._decode(hidden, thred)

    def _decode(self, hidden: np.ndarray, thred: float) -> np.ndarray:
        """Convert per-frame salience into F0 in Hz (0 for below-threshold frames)."""
        cents = self._to_local_average_cents(hidden, thred)

        # cents -> Hz
        f0 = 10 * (2 ** (cents / 1200))
        f0[f0 == 10] = 0  # cents == 0 maps to exactly 10 Hz; report it as 0

        return f0

    def _to_local_average_cents(self, salience: np.ndarray, thred: float) -> np.ndarray:
        """Salience-weighted average of cents over a 9-bin window around each frame's peak.

        Vectorised with fancy indexing instead of a per-frame Python loop; it
        also returns an empty array for a zero-frame input, where the loop
        version raised on ``np.sum(..., axis=1)``.

        Args:
            salience: Array of shape ``[T, 360]``.
            thred: Frames whose maximum salience is ``<= thred`` get 0 cents.

        Returns:
            np.ndarray: Array of shape ``[T]`` with the estimated cents.
        """
        # Peak bin per frame: [T]
        center = np.argmax(salience, axis=1)

        # Zero-pad the bin axis by 4 each side: [T, 368]. In padded
        # coordinates the window [peak - 4, peak + 4] starts at ``center``.
        padded = np.pad(salience, ((0, 0), (4, 4)))
        window_idx = center[:, None] + np.arange(9)  # [T, 9]

        window_salience = np.take_along_axis(padded, window_idx, axis=1)  # [T, 9]
        window_cents = self.cents_mapping[window_idx]  # [T, 9]

        # Weighted average; the epsilon guards against an all-zero window.
        product_sum = np.sum(window_salience * window_cents, axis=1)
        weight_sum = np.sum(window_salience, axis=1) + 1e-9
        cents = product_sum / weight_sum

        # Zero out frames whose peak salience does not clear the threshold.
        maxx = np.max(padded, axis=1)
        cents[maxx <= thred] = 0

        return cents
models/synthesizer.py CHANGED
@@ -1,853 +1,853 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- RVC v2 合成器模型定义
4
- """
5
- import math
6
- import torch
7
- import torch.nn as nn
8
- import torch.nn.functional as F
9
- from typing import Optional, Tuple
10
- import numpy as np
11
-
12
-
13
class LayerNorm(nn.Module):
    """Layer normalisation over the channel axis of channels-first tensors."""

    def __init__(self, channels: int, eps: float = 1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps
        # Learnable per-channel scale and shift.
        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Normalise ``x`` ([B, C, T]) across C and return it in the same layout."""
        channels_last = torch.transpose(x, 1, -1)  # [B, T, C]
        normed = F.layer_norm(
            channels_last, (self.channels,), self.gamma, self.beta, self.eps
        )
        return torch.transpose(normed, 1, -1)  # [B, C, T]
28
-
29
-
30
class MultiHeadAttention(nn.Module):
    """Multi-head attention over channels-first sequences.

    Optional features: windowed relative-position embeddings
    (``window_size``), a proximal bias favouring nearby positions
    (``proximal_bias``), and a band mask (``block_length``). The most recent
    attention weights are stored on ``self.attn``.
    """

    def __init__(self, channels: int, out_channels: int, n_heads: int,
                 p_dropout: float = 0.0, window_size: Optional[int] = None,
                 heads_share: bool = True, block_length: Optional[int] = None,
                 proximal_bias: bool = False, proximal_init: bool = False):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # One shared table, or one table per head.
            rel_heads = 1 if heads_share else n_heads
            init_std = self.k_channels ** -0.5
            table_shape = (rel_heads, window_size * 2 + 1, self.k_channels)
            self.emb_rel_k = nn.Parameter(torch.randn(*table_shape) * init_std)
            self.emb_rel_v = nn.Parameter(torch.randn(*table_shape) * init_std)

        for projection in (self.conv_q, self.conv_k, self.conv_v):
            nn.init.xavier_uniform_(projection.weight)
        if proximal_init:
            # Start with keys identical to queries.
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys/values); returns [B, out_channels, T]."""
        queries = self.conv_q(x)
        keys = self.conv_k(c)
        values = self.conv_v(c)

        attended, self.attn = self.attention(queries, keys, values, mask=attn_mask)

        return self.conv_o(attended)

    def attention(self, query, key, value, mask=None):
        """Scaled dot-product attention on [B, C, T] inputs; returns ``(output, weights)``."""
        b, d, t_s = key.size()
        t_t = query.size(2)

        def split_heads(tensor, length):
            # [B, C, T] -> [B, heads, T, C / heads]
            return tensor.view(b, self.n_heads, self.k_channels, length).transpose(2, 3)

        query = split_heads(query, t_t)
        key = split_heads(key, t_s)
        value = split_heads(value, t_s)

        scaled_query = query / math.sqrt(self.k_channels)
        scores = torch.matmul(scaled_query, key.transpose(-2, -1))

        if self.window_size is not None:
            assert t_s == t_t, "Relative attention only for self-attention"
            rel_keys = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(scaled_query, rel_keys)
            scores = scores + self._relative_position_to_absolute_position(rel_logits)

        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias only for self-attention"
            scores = scores + self._attention_bias_proximal(t_s).to(
                device=scores.device, dtype=scores.dtype
            )

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert t_s == t_t, "Block length only for self-attention"
                # Keep only a band of width ``block_length`` around the diagonal.
                band = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(band == 0, -1e4)

        p_attn = self.drop(F.softmax(scores, dim=-1))
        output = torch.matmul(p_attn, value)

        if self.window_size is not None:
            rel_weights = self._absolute_position_to_relative_position(p_attn)
            rel_values = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(rel_weights, rel_values)

        # [B, heads, T, C / heads] -> [B, C, T]
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """Multiply relative attention weights by the relative value table."""
        return torch.matmul(x, y.unsqueeze(0))

    def _matmul_with_relative_keys(self, x, y):
        """Multiply queries by the transposed relative key table."""
        return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Pad and slice the embedding table to ``2 * length - 1`` relative positions."""
        half_span = self.window_size + 1
        pad_length = max(length - half_span, 0)
        start = max(half_span - length, 0)
        stop = start + 2 * length - 1
        table = relative_embeddings
        if pad_length > 0:
            table = F.pad(table, (0, 0, pad_length, pad_length, 0, 0))
        return table[:, start:stop]

    def _relative_position_to_absolute_position(self, x):
        """Re-index [B, H, L, 2L-1] relative logits as [B, H, L, L] absolute logits."""
        batch, heads, length, _ = x.size()
        # Append one column, flatten, pad, and reshape so that each row is
        # shifted by one relative to the previous one.
        widened = F.pad(x, (0, 1, 0, 0, 0, 0, 0, 0))
        flat = widened.view(batch, heads, length * 2 * length)
        flat = F.pad(flat, (0, length - 1, 0, 0, 0, 0))
        skewed = flat.view(batch, heads, length + 1, 2 * length - 1)
        return skewed[:, :, :length, length - 1:]

    def _absolute_position_to_relative_position(self, x):
        """Re-index [B, H, L, L] absolute weights as [B, H, L, 2L-1] relative weights."""
        batch, heads, length, _ = x.size()
        widened = F.pad(x, (0, length - 1, 0, 0, 0, 0, 0, 0))
        flat = widened.view(batch, heads, length ** 2 + length * (length - 1))
        # Leading padding introduces the per-row skew after reshaping.
        flat = F.pad(flat, (length, 0, 0, 0, 0, 0))
        return flat.view(batch, heads, length, 2 * length)[:, :, :, 1:]

    def _attention_bias_proximal(self, length):
        """Return a [1, 1, L, L] bias of ``-log1p(|i - j|)``."""
        positions = torch.arange(length, dtype=torch.float32)
        distance = torch.unsqueeze(positions, 0) - torch.unsqueeze(positions, 1)
        bias = -torch.log1p(torch.abs(distance))
        return torch.unsqueeze(torch.unsqueeze(bias, 0), 0)
170
-
171
-
172
class FFN(nn.Module):
    """Two-layer convolutional feed-forward network with same or causal padding."""

    def __init__(self, in_channels: int, out_channels: int, filter_channels: int,
                 kernel_size: int, p_dropout: float = 0.0, activation: str = None,
                 causal: bool = False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # Pick the padding strategy once; both convolutions share it.
        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply conv -> activation -> dropout -> conv, then mask the result."""
        hidden = self.conv_1(self.padding(x))
        if self.activation == "gelu":
            # Sigmoid approximation of GELU.
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        hidden = self.conv_2(self.padding(hidden))
        return hidden * x_mask

    def _causal_padding(self, x):
        """Left-pad by ``kernel_size - 1`` so no output depends on future frames."""
        if self.kernel_size == 1:
            return x
        left = self.kernel_size - 1
        return F.pad(x, (left, 0, 0, 0, 0, 0))

    def _same_padding(self, x):
        """Pad both sides so the convolution preserves the sequence length."""
        if self.kernel_size == 1:
            return x
        left = (self.kernel_size - 1) // 2
        right = self.kernel_size // 2
        return F.pad(x, (left, right, 0, 0, 0, 0))
219
-
220
-
221
class Encoder(nn.Module):
    """Transformer encoder: ``n_layers`` of self-attention + FFN, each followed by LayerNorm."""

    def __init__(self, hidden_channels: int, filter_channels: int, n_heads: int,
                 n_layers: int, kernel_size: int = 1, p_dropout: float = 0.0,
                 window_size: int = 10):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()

        for _ in range(n_layers):
            attention = MultiHeadAttention(
                hidden_channels, hidden_channels, n_heads,
                p_dropout=p_dropout, window_size=window_size
            )
            self.attn_layers.append(attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            feed_forward = FFN(
                hidden_channels, hidden_channels, filter_channels,
                kernel_size, p_dropout=p_dropout
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Encode ``x`` under ``x_mask``; the output is masked before returning."""
        # Pairwise mask: position i may attend to j only if both are valid.
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        hidden = x * x_mask
        for layer in range(self.n_layers):
            # Self-attention sub-layer with residual connection and norm.
            attended = self.drop(self.attn_layers[layer](hidden, hidden, attn_mask))
            hidden = self.norm_layers_1[layer](hidden + attended)

            # Feed-forward sub-layer with residual connection and norm.
            transformed = self.drop(self.ffn_layers[layer](hidden, x_mask))
            hidden = self.norm_layers_2[layer](hidden + transformed)
        return hidden * x_mask
269
-
270
-
271
class TextEncoder(nn.Module):
    """Text encoder for RVC: embeds phone features (plus optional pitch) and encodes them.

    NOTE(review): upstream RVC appears to scale the summed embeddings by
    ``sqrt(hidden_channels)`` and apply LeakyReLU(0.1) before the transformer;
    neither happens here. Confirm that the omission is intentional.
    """

    def __init__(self, out_channels: int, hidden_channels: int, filter_channels: int,
                 n_heads: int, n_layers: int, kernel_size: int, p_dropout: float,
                 f0: bool = True):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.f0 = f0

        # Phone embedding: linear projection from 768-dim HuBERT features.
        self.emb_phone = nn.Linear(768, hidden_channels)

        # Pitch embedding (only if f0 is enabled).
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)

        # Transformer encoder.
        self.encoder = Encoder(
            hidden_channels, filter_channels, n_heads, n_layers,
            kernel_size, p_dropout
        )

        # Output projection to mean and log-variance.
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """Encode phone (and optionally pitch) features.

        Args:
            phone: [B, 768, T] phone features from HuBERT (channels first).
            pitch: [B, T] pitch indices (clamped to 0-255), or ``None`` to
                skip the pitch embedding.
            lengths: [B] sequence lengths.

        Returns:
            m: [B, out_channels, T] mean.
            logs: [B, out_channels, T] log-variance.
            x_mask: [B, 1, T] mask.
        """
        import logging
        log = logging.getLogger(__name__)
        # The diagnostics call ``.item()``, which forces a device sync, so
        # they are only built when DEBUG logging is actually enabled.
        debug = log.isEnabledFor(logging.DEBUG)

        if debug:
            log.debug(f"[TextEncoder] 输入 phone: shape={phone.shape}")
            # ``pitch`` may legitimately be None; never dereference it blindly.
            if pitch is not None:
                log.debug(f"[TextEncoder] 输入 pitch: shape={pitch.shape}, max={pitch.max().item()}, min={pitch.min().item()}")
            log.debug(f"[TextEncoder] 输入 lengths: {lengths}")

        # Transpose phone from [B, C, T] to [B, T, C] for the linear layer.
        phone = phone.transpose(1, 2)  # [B, T, 768]
        if debug:
            log.debug(f"[TextEncoder] 转置后 phone: shape={phone.shape}")

        # Create mask.
        x_mask = torch.unsqueeze(
            self._sequence_mask(lengths, phone.size(1)), 1
        ).to(phone.dtype)
        if debug:
            log.debug(f"[TextEncoder] x_mask: shape={x_mask.shape}, sum={x_mask.sum().item()}")

        # Phone embedding.
        x = self.emb_phone(phone)  # [B, T, hidden_channels]
        if debug:
            log.debug(f"[TextEncoder] emb_phone 输出: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}")

        # Add pitch embedding if enabled.
        if self.f0 and pitch is not None:
            # Clamp pitch to the embedding table's valid range.
            pitch_clamped = torch.clamp(pitch, 0, 255)
            pitch_emb = self.emb_pitch(pitch_clamped)
            if debug:
                log.debug(f"[TextEncoder] emb_pitch 输出: shape={pitch_emb.shape}, max={pitch_emb.abs().max().item():.4f}")
            x = x + pitch_emb

        # Transpose for conv layers: [B, hidden_channels, T].
        x = x.transpose(1, 2)
        if debug:
            log.debug(f"[TextEncoder] 转置后 x: shape={x.shape}")

        # Apply mask.
        x = x * x_mask

        # Transformer encoder.
        x = self.encoder(x, x_mask)
        if debug:
            log.debug(f"[TextEncoder] Transformer 输出: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}")

        # Project to mean and log-variance.
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        if debug:
            log.debug(f"[TextEncoder] 最终输出 m: shape={m.shape}, max={m.abs().max().item():.4f}")
            log.debug(f"[TextEncoder] 最终输出 logs: shape={logs.shape}, max={logs.max().item():.4f}, min={logs.min().item():.4f}")

        return m, logs, x_mask

    def _sequence_mask(self, length, max_length=None):
        """Return a [B, max_length] boolean mask that is True for positions < length."""
        if max_length is None:
            max_length = length.max()
        x = torch.arange(max_length, dtype=length.dtype, device=length.device)
        return x.unsqueeze(0) < length.unsqueeze(1)
368
-
369
-
370
class ResidualCouplingBlock(nn.Module):
    """Normalising-flow block: ``n_flows`` pairs of (ResidualCouplingLayer, Flip)."""

    def __init__(self, channels: int, hidden_channels: int, kernel_size: int,
                 dilation_rate: int, n_layers: int, n_flows: int = 4,
                 gin_channels: int = 0):
        super().__init__()
        self.flows = nn.ModuleList()

        for _ in range(n_flows):
            self.flows.append(
                ResidualCouplingLayer(
                    channels, hidden_channels, kernel_size,
                    dilation_rate, n_layers, gin_channels=gin_channels
                )
            )
            self.flows.append(Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Run the flows forward, or in reverse order when ``reverse`` is True.

        Args:
            x: Input tensor.
            x_mask: Mask passed through to every flow.
            g: Optional conditioning tensor passed through to every flow.
            reverse: When True, apply the flows last-to-first in inverse mode.

        Returns:
            The transformed tensor.
        """
        flows = reversed(self.flows) if reverse else self.flows
        for flow in flows:
            result = flow(x, x_mask, g=g, reverse=reverse)
            # Coupling layers return ``(x, logdet)`` in the forward direction
            # while Flip returns a bare tensor. Blindly unpacking two values
            # from a tensor would split it along the batch axis, so accept
            # both shapes explicitly.
            x = result[0] if isinstance(result, tuple) else result
        return x
396
-
397
-
398
class ResidualCouplingLayer(nn.Module):
    """Mean-only affine coupling layer: the first channel half predicts a shift for the second."""

    def __init__(self, channels: int, hidden_channels: int, kernel_size: int,
                 dilation_rate: int, n_layers: int, mean_only: bool = True,
                 gin_channels: int = 0):
        super().__init__()
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels)
        self.post = nn.Conv1d(hidden_channels, self.half_channels, 1)
        # Zero-initialise the output projection so the layer starts as identity.
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the coupling; returns ``(x, None)`` forward and just ``x`` in reverse."""
        first_half, second_half = torch.split(x, [self.half_channels] * 2, dim=1)
        hidden = self.pre(first_half) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        # Only a mean (shift) is predicted; there is no scale term.
        shift = self.post(hidden) * x_mask

        if reverse:
            second_half = (second_half - shift) * x_mask
            return torch.cat([first_half, second_half], dim=1)

        second_half = shift + second_half * x_mask
        return torch.cat([first_half, second_half], dim=1), None
429
-
430
-
431
class Flip(nn.Module):
    """Flow step that reverses the channel axis; extra arguments are ignored."""

    def forward(self, x, *args, reverse=False, **kwargs):
        # Flipping is its own inverse, so both directions do the same thing.
        return x.flip(1)
437
-
438
-
439
class WN(nn.Module):
    """WaveNet-style stack of gated dilated convolutions (weight-normalised)."""

    def __init__(self, hidden_channels: int, kernel_size: int,
                 dilation_rate: int, n_layers: int, gin_channels: int = 0,
                 p_dropout: float = 0):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_channels = hidden_channels
        self.gin_channels = gin_channels

        self.in_layers = nn.ModuleList()
        self.res_skip_layers = nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels > 0:
            # A single projection produces the conditioning for all layers.
            self.cond_layer = nn.utils.weight_norm(
                nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
            )

        last_layer = n_layers - 1
        for layer in range(n_layers):
            dilation = dilation_rate ** layer
            padding = (kernel_size * dilation - dilation) // 2
            gated_conv = nn.Conv1d(
                hidden_channels, 2 * hidden_channels, kernel_size,
                dilation=dilation, padding=padding
            )
            self.in_layers.append(nn.utils.weight_norm(gated_conv))

            # All but the last layer emit residual + skip (2 * hidden);
            # the last layer emits only hidden channels.
            res_skip_channels = (
                2 * hidden_channels if layer < last_layer else hidden_channels
            )
            self.res_skip_layers.append(
                nn.utils.weight_norm(
                    nn.Conv1d(hidden_channels, res_skip_channels, 1)
                )
            )

    def forward(self, x, x_mask, g=None):
        """Return the masked sum of skip outputs for input ``x``."""
        hidden = self.hidden_channels
        output = torch.zeros_like(x)

        if g is not None and self.gin_channels > 0:
            g = self.cond_layer(g)

        for layer in range(self.n_layers):
            x_in = self.in_layers[layer](x)
            if g is not None:
                # Slice out this layer's share of the conditioning.
                offset = layer * 2 * hidden
                x_in = x_in + g[:, offset:offset + 2 * hidden, :]

            # Gated activation unit: tanh branch times sigmoid branch.
            acts = torch.tanh(x_in[:, :hidden]) * torch.sigmoid(x_in[:, hidden:])
            acts = self.drop(acts)
            res_skip = self.res_skip_layers[layer](acts)

            if layer < self.n_layers - 1:
                # First half updates the residual stream, second half is skip.
                x = (x + res_skip[:, :hidden]) * x_mask
                output = output + res_skip[:, hidden:]
            else:
                # Final layer: the whole tensor feeds both streams.
                x = (x + res_skip) * x_mask
                output = output + res_skip

        return output * x_mask
506
-
507
-
508
class PosteriorEncoder(nn.Module):
    """Posterior encoder: WaveNet stack producing a sampled latent plus its statistics."""

    def __init__(self, in_channels: int, out_channels: int, hidden_channels: int,
                 kernel_size: int, dilation_rate: int, n_layers: int,
                 gin_channels: int = 0):
        super().__init__()
        self.out_channels = out_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Return ``(z, m, logs, x_mask)`` for input ``x`` of shape [B, C, T]."""
        valid = self._sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(valid, 1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        # Reparameterised sample: mean + noise * exp(logs).
        noise = torch.randn_like(m)
        z = (m + noise * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def _sequence_mask(self, length, max_length=None):
        """Return a [B, max_length] boolean mask that is True for positions < length."""
        if max_length is None:
            max_length = length.max()
        positions = torch.arange(max_length, dtype=length.dtype, device=length.device)
        return positions.unsqueeze(0) < length.unsqueeze(1)
538
-
539
-
540
- class Generator(nn.Module):
541
- """NSF-HiFi-GAN 生成器 (带权重归一化)"""
542
-
543
- def __init__(self, initial_channel: int, resblock_kernel_sizes: list,
544
- resblock_dilation_sizes: list, upsample_rates: list,
545
- upsample_initial_channel: int, upsample_kernel_sizes: list,
546
- gin_channels: int = 0, sr: int = 40000, is_half: bool = False):
547
- super().__init__()
548
- self.num_kernels = len(resblock_kernel_sizes)
549
- self.num_upsamples = len(upsample_rates)
550
- self.sr = sr
551
- self.is_half = is_half
552
-
553
- # 计算上采样因子
554
- self.upp = int(np.prod(upsample_rates))
555
-
556
- self.conv_pre = nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, 3)
557
-
558
- # NSF 源模块
559
- self.m_source = SourceModuleHnNSF(sample_rate=sr, harmonic_num=0)
560
-
561
- # 噪声卷积层
562
- self.noise_convs = nn.ModuleList()
563
-
564
- self.ups = nn.ModuleList()
565
- for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
566
- c_cur = upsample_initial_channel // (2 ** (i + 1))
567
- self.ups.append(
568
- nn.utils.weight_norm(
569
- nn.ConvTranspose1d(
570
- upsample_initial_channel // (2 ** i),
571
- c_cur,
572
- k, u, (k - u) // 2
573
- )
574
- )
575
- )
576
- # 噪声卷积
577
- if i + 1 < len(upsample_rates):
578
- stride_f0 = int(np.prod(upsample_rates[i + 1:]))
579
- self.noise_convs.append(
580
- nn.Conv1d(1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)
581
- )
582
- else:
583
- self.noise_convs.append(nn.Conv1d(1, c_cur, kernel_size=1))
584
-
585
- self.resblocks = nn.ModuleList()
586
- for i in range(len(self.ups)):
587
- ch = upsample_initial_channel // (2 ** (i + 1))
588
- for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
589
- self.resblocks.append(ResBlock(ch, k, d))
590
-
591
- self.conv_post = nn.Conv1d(ch, 1, 7, 1, 3, bias=False)
592
-
593
- if gin_channels > 0:
594
- self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
595
-
596
def forward(self, x, f0, g=None):
    """Run the NSF-HiFi-GAN generator on latent frames and an F0 contour.

    Args:
        x: Latent tensor fed to ``conv_pre``.
        f0: F0 contour handed to ``self.m_source`` together with ``self.upp``.
        g: Optional conditioning tensor; when given, ``self.cond(g)`` is
            added to the ``conv_pre`` output.

    Returns:
        The ``tanh`` of the ``conv_post`` output.
    """
    import logging
    log = logging.getLogger(__name__)
    # Every debug message below calls ``.item()`` on a tensor reduction. An
    # f-string is evaluated before ``log.debug`` can discard it, so the
    # messages are only built when DEBUG is actually enabled.
    debug = log.isEnabledFor(logging.DEBUG)

    if debug:
        log.debug(f"[Generator] 输入 x: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}")
        log.debug(f"[Generator] 输入 f0: shape={f0.shape}, max={f0.max().item():.1f}, min={f0.min().item():.1f}")
        if g is not None:
            log.debug(f"[Generator] 输入 g: shape={g.shape}, max={g.abs().max().item():.4f}")

    # Build the NSF excitation signal
    har_source, _, _ = self.m_source(f0, self.upp)
    har_source = har_source.transpose(1, 2)  # [B, 1, T*upp]
    if debug:
        log.debug(f"[Generator] NSF har_source: shape={har_source.shape}, max={har_source.abs().max().item():.4f}")

    x = self.conv_pre(x)
    if debug:
        log.debug(f"[Generator] conv_pre 输出: shape={x.shape}, max={x.abs().max().item():.4f}")

    if g is not None:
        x = x + self.cond(g)
        if debug:
            log.debug(f"[Generator] 加入条件后: max={x.abs().max().item():.4f}")

    for i in range(self.num_upsamples):
        x = F.leaky_relu(x, 0.1)
        x = self.ups[i](x)

        # Mix in the excitation, brought to this stage's resolution
        x_source = self.noise_convs[i](har_source)
        x = x + x_source

        # Average the outputs of this stage's residual blocks
        xs = None
        for j in range(self.num_kernels):
            if xs is None:
                xs = self.resblocks[i * self.num_kernels + j](x)
            else:
                xs += self.resblocks[i * self.num_kernels + j](x)
        x = xs / self.num_kernels
        if debug:
            log.debug(f"[Generator] 上采样层 {i}: shape={x.shape}, max={x.abs().max().item():.4f}")

    x = F.leaky_relu(x)
    x = self.conv_post(x)
    if debug:
        log.debug(f"[Generator] conv_post 输出: shape={x.shape}, max={x.abs().max().item():.4f}")
    x = torch.tanh(x)
    if debug:
        log.debug(f"[Generator] tanh 输出: shape={x.shape}, max={x.abs().max().item():.4f}")

    return x
641
-
642
def remove_weight_norm(self):
    """Strip weight normalization from the upsampling layers and residual blocks."""
    for upsample_layer in self.ups:
        nn.utils.remove_weight_norm(upsample_layer)
    # Residual blocks expose their own remove_weight_norm method.
    for residual_block in self.resblocks:
        residual_block.remove_weight_norm()
647
-
648
-
649
class ResBlock(nn.Module):
    """Residual block built from weight-normalized 1-D convolutions.

    For every entry in ``dilation`` the block holds one dilated convolution
    (``convs1``) and one undilated convolution (``convs2``); each pair is
    applied with LeakyReLU activations and a residual connection.
    """

    def __init__(self, channels: int, kernel_size: int = 3, dilation: tuple = (1, 3, 5)):
        super().__init__()
        dilated = []
        for d in dilation:
            # Padding keeps the sequence length unchanged for odd kernel sizes.
            conv = nn.Conv1d(channels, channels, kernel_size, 1,
                             (kernel_size * d - d) // 2, dilation=d)
            dilated.append(nn.utils.weight_norm(conv))
        self.convs1 = nn.ModuleList(dilated)

        plain = []
        for _ in dilation:
            conv = nn.Conv1d(channels, channels, kernel_size, 1,
                             (kernel_size - 1) // 2)
            plain.append(nn.utils.weight_norm(conv))
        self.convs2 = nn.ModuleList(plain)

    def forward(self, x):
        """Apply each (dilated, plain) convolution pair with a skip connection."""
        for dilated_conv, plain_conv in zip(self.convs1, self.convs2):
            branch = dilated_conv(F.leaky_relu(x, 0.1))
            branch = plain_conv(F.leaky_relu(branch, 0.1))
            x = branch + x
        return x

    def remove_weight_norm(self):
        """Remove weight normalization from every convolution in the block."""
        for conv_list in (self.convs1, self.convs2):
            for conv in conv_list:
                nn.utils.remove_weight_norm(conv)
683
-
684
-
685
class SineGenerator(nn.Module):
    """Sine-wave excitation generator, the core component of the NSF source.

    Produces ``harmonic_num + 1`` sine channels (the fundamental and its
    integer harmonics) for voiced samples and Gaussian noise elsewhere.

    Args:
        sample_rate: Sampling rate used to turn F0 in Hz into phase steps.
        harmonic_num: Number of harmonics generated above the fundamental.
        sine_amp: Amplitude of the sine waves.
        noise_std: Standard deviation of the noise used in unvoiced regions.
        voiced_threshold: F0 values above this are treated as voiced.
    """

    def __init__(self, sample_rate: int, harmonic_num: int = 0,
                 sine_amp: float = 0.1, noise_std: float = 0.003,
                 voiced_threshold: float = 10):
        super().__init__()
        self.sample_rate = sample_rate
        self.harmonic_num = harmonic_num
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.voiced_threshold = voiced_threshold
        self.dim = harmonic_num + 1

    def forward(self, f0: torch.Tensor, upp: int):
        """Generate the sine excitation signal.

        Args:
            f0: F0 tensor of shape [B, T].
            upp: Upsampling factor.

        Returns:
            Excitation of shape [B, T*upp, harmonic_num + 1]; with the
            default ``harmonic_num=0`` this is [B, T*upp, 1].
        """
        with torch.no_grad():
            # Upsample F0 to the sample rate by repetition
            f0 = f0.unsqueeze(1)  # [B, 1, T]
            f0_up = F.interpolate(f0, scale_factor=upp, mode='nearest')
            f0_up = f0_up.transpose(1, 2)  # [B, T*upp, 1]

            # One channel per harmonic: k * f0 for k = 1 .. dim. With
            # dim == 1 the multiplier is exactly 1, so the fundamental-only
            # behaviour is unchanged.
            harmonics = torch.arange(
                1, self.dim + 1, dtype=f0_up.dtype, device=f0_up.device
            )
            rad = (f0_up / self.sample_rate) * harmonics  # normalized frequency
            rad_acc = torch.cumsum(rad, dim=1) % 1  # accumulated phase in cycles
            sine_wave = torch.sin(2 * np.pi * rad_acc) * self.sine_amp

            # Unvoiced samples are replaced with noise; voicing is decided
            # on the fundamental and broadcast over the harmonic channels.
            voiced_mask = (f0_up > self.voiced_threshold).float()
            noise = torch.randn_like(sine_wave) * self.noise_std
            sine_wave = sine_wave * voiced_mask + noise * (1 - voiced_mask)

        return sine_wave
727
-
728
-
729
class SourceModuleHnNSF(nn.Module):
    """Harmonic-plus-noise source module.

    Wraps a ``SineGenerator`` and merges its channels into a single
    excitation signal with a linear layer followed by ``tanh``.

    Args:
        sample_rate: Sampling rate forwarded to the sine generator.
        harmonic_num: Number of harmonics; also fixes the linear layer input
            width (``harmonic_num + 1``).
        sine_amp: Sine amplitude forwarded to the sine generator.
        noise_std: Unvoiced noise level forwarded to the sine generator.
        add_noise_std: Standard deviation of the separate noise tensor
            returned by ``forward``.
    """

    def __init__(self, sample_rate: int, harmonic_num: int = 0,
                 sine_amp: float = 0.1, noise_std: float = 0.003,
                 add_noise_std: float = 0.003):
        super().__init__()
        # Previously this argument was ignored and 0.003 was hard-coded.
        self.add_noise_std = add_noise_std
        self.sine_generator = SineGenerator(
            sample_rate, harmonic_num, sine_amp, noise_std
        )
        self.l_linear = nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = nn.Tanh()

    def forward(self, f0: torch.Tensor, upp: int):
        """Return ``(sine, noise, None)`` for the given F0 contour.

        Three values are returned to match the interface expected by the
        caller; the third is always ``None``.
        """
        sine = self.sine_generator(f0, upp)  # [B, T*upp, 1]
        sine = self.l_tanh(self.l_linear(sine))
        noise = torch.randn_like(sine) * self.add_noise_std
        return sine, noise, None
747
-
748
-
749
class SynthesizerTrnMs768NSFsid(nn.Module):
    """RVC v2 synthesizer (768-dim HuBERT features + NSF decoder + speaker id).

    Combines a ``TextEncoder`` prior (``enc_p``), a ``ResidualCouplingBlock``
    flow, an NSF ``Generator`` decoder (``dec``) and a speaker embedding
    table (``emb_g``).
    """

    def __init__(self, spec_channels: int, segment_size: int,
                 inter_channels: int, hidden_channels: int, filter_channels: int,
                 n_heads: int, n_layers: int, kernel_size: int, p_dropout: float,
                 resblock: str, resblock_kernel_sizes: list,
                 resblock_dilation_sizes: list, upsample_rates: list,
                 upsample_initial_channel: int, upsample_kernel_sizes: list,
                 spk_embed_dim: int, gin_channels: int, sr: int):
        super().__init__()

        # Keep the full configuration on the instance.
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.sr = sr

        # Prior encoder (a TextEncoder is used instead of a PosteriorEncoder)
        self.enc_p = TextEncoder(
            inter_channels, hidden_channels, filter_channels,
            n_heads, n_layers, kernel_size, p_dropout, f0=True
        )

        # Decoder / generator (NSF-HiFiGAN; it owns the m_source module)
        self.dec = Generator(
            inter_channels, resblock_kernel_sizes, resblock_dilation_sizes,
            upsample_rates, upsample_initial_channel, upsample_kernel_sizes,
            gin_channels, sr=sr
        )

        # Normalizing flow
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )

        # Speaker embedding
        self.emb_g = nn.Embedding(spk_embed_dim, gin_channels)

    def forward(self, phone, phone_lengths, pitch, nsff0, sid, skip_head=0, return_length=0):
        """Forward pass: encode, sample, run the flow forwards and decode.

        ``skip_head`` and ``return_length`` are accepted but not used here.
        """
        g = self.emb_g(sid).unsqueeze(-1)

        # The TextEncoder returns mean and log-variance
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)

        # Sampling happens outside the encoder
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask

        # Flow in the forward direction
        z = self.flow(z_p, x_mask, g=g)

        # Generate audio (f0 is passed through to the decoder)
        o = self.dec(z, nsff0, g=g)

        return o

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=1.0):
        """Inference: encode, sample, run the flow in reverse and decode.

        ``rate`` is accepted but not used here.

        Returns:
            Tuple ``(o, x_mask)`` of the decoder output and the encoder mask.
        """
        import logging
        log = logging.getLogger(__name__)
        # The debug messages call ``.item()`` on tensor reductions and are
        # f-strings, so they would be evaluated even with DEBUG disabled.
        # Only build them when they will actually be emitted.
        debug = log.isEnabledFor(logging.DEBUG)

        if debug:
            log.debug(f"[infer] 输入 phone: shape={phone.shape}, dtype={phone.dtype}")
            log.debug(f"[infer] 输入 phone 统计: max={phone.abs().max().item():.4f}, mean={phone.abs().mean().item():.4f}")
            log.debug(f"[infer] 输入 phone_lengths: {phone_lengths}")
            log.debug(f"[infer] 输入 pitch: shape={pitch.shape}, max={pitch.max().item()}, min={pitch.min().item()}")
            log.debug(f"[infer] 输入 nsff0: shape={nsff0.shape}, max={nsff0.max().item():.1f}, min={nsff0.min().item():.1f}")
            log.debug(f"[infer] 输入 sid: {sid}")

        g = self.emb_g(sid).unsqueeze(-1)
        if debug:
            log.debug(f"[infer] 说话人嵌入 g: shape={g.shape}, max={g.abs().max().item():.4f}")

        # The TextEncoder returns mean and log-variance
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        if debug:
            log.debug(f"[infer] TextEncoder 输出:")
            log.debug(f"[infer] m_p: shape={m_p.shape}, max={m_p.abs().max().item():.4f}, mean={m_p.abs().mean().item():.4f}")
            log.debug(f"[infer] logs_p: shape={logs_p.shape}, max={logs_p.max().item():.4f}, min={logs_p.min().item():.4f}")
            log.debug(f"[infer] x_mask: shape={x_mask.shape}, sum={x_mask.sum().item()}")

        # Sample outside the encoder (noise scaled by 0.66666 for more stable output)
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        if debug:
            log.debug(f"[infer] 采样后 z_p: shape={z_p.shape}, max={z_p.abs().max().item():.4f}, mean={z_p.abs().mean().item():.4f}")

        # Flow in the reverse direction
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        if debug:
            log.debug(f"[infer] Flow 输出 z: shape={z.shape}, max={z.abs().max().item():.4f}, mean={z.abs().mean().item():.4f}")

        # Generate audio (f0 is passed in; the Generator builds the NSF excitation)
        o = self.dec(z * x_mask, nsff0, g=g)
        if debug:
            log.debug(f"[infer] Generator 输出 o: shape={o.shape}, max={o.abs().max().item():.4f}, mean={o.abs().mean().item():.4f}")

        return o, x_mask
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ RVC v2 合成器模型定义
4
+ """
5
+ import math
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from typing import Optional, Tuple
10
+ import numpy as np
11
+
12
+
13
class LayerNorm(nn.Module):
    """Layer normalization over the channel axis of channels-first tensors.

    Args:
        channels: Size of the channel dimension being normalized.
        eps: Epsilon passed to ``F.layer_norm``.
    """

    def __init__(self, channels: int, eps: float = 1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps
        # Learnable scale and shift, one value per channel.
        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Normalize ``x`` of shape [B, C, T] and return the same layout."""
        # F.layer_norm works on the trailing dimension, so move channels last.
        channels_last = x.transpose(1, -1)  # [B, T, C]
        normalized = F.layer_norm(
            channels_last, (self.channels,), self.gamma, self.beta, self.eps
        )
        return normalized.transpose(1, -1)  # [B, C, T]
28
+
29
+
30
class MultiHeadAttention(nn.Module):
    """Multi-head attention with optional windowed relative-position terms.

    Args:
        channels: Input channel count; must be divisible by ``n_heads``.
        out_channels: Output channel count of the final projection.
        n_heads: Number of attention heads.
        p_dropout: Dropout probability applied to the attention weights.
        window_size: If set, learned relative-position embeddings covering
            ``2 * window_size + 1`` offsets are added (self-attention only).
        heads_share: Share one relative embedding table across all heads.
        block_length: If set, attention is restricted to a band of this
            half-width around the diagonal.
        proximal_bias: Add a bias that favours nearby positions.
        proximal_init: Initialize the key projection as a copy of the query
            projection.
    """

    def __init__(self, channels: int, out_channels: int, n_heads: int,
                 p_dropout: float = 0.0, window_size: Optional[int] = None,
                 heads_share: bool = True, block_length: Optional[int] = None,
                 proximal_bias: bool = False, proximal_init: bool = False):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        # Attention weights of the most recent forward call.
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels ** -0.5
            self.emb_rel_k = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev
            )
            self.emb_rel_v = nn.Parameter(
                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev
            )

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys/values); both [B, C, T]."""
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        x, self.attn = self.attention(q, k, v, mask=attn_mask)

        x = self.conv_o(x)
        return x

    def attention(self, query, key, value, mask=None):
        """Scaled dot-product attention.

        Args:
            query, key, value: Tensors of shape [B, C, T].
            mask: Optional mask; positions where it equals 0 are suppressed.

        Returns:
            Tuple ``(output, p_attn)`` of the attended values reshaped to
            [B, C, T_query] and the attention weights.
        """
        b, d, t_s = key.size()
        t_t = query.size(2)

        # Split channels into heads: [B, heads, T, k_channels]
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        # The scaled query feeds both the content scores and the relative
        # position scores, so compute it once.
        scaled_query = query / math.sqrt(self.k_channels)
        scores = torch.matmul(scaled_query, key.transpose(-2, -1))

        if self.window_size is not None:
            assert t_s == t_t, "Relative attention only for self-attention"
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(scaled_query, key_relative_embeddings)
            scores_local = self._relative_position_to_absolute_position(rel_logits)
            scores = scores + scores_local

        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias only for self-attention"
            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            # NOTE(review): the block-length branch is nested under the mask
            # check as in the reference VITS attention; the flattened source
            # does not show its indentation - confirm.
            if self.block_length is not None:
                assert t_s == t_t, "Block length only for self-attention"
                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(block_mask == 0, -1e4)

        p_attn = F.softmax(scores, dim=-1)
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)

        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)

        # Merge heads back: [B, heads, T, k] -> [B, C, T]
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """Multiply relative weights ``x`` by relative value embeddings ``y``."""
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """Multiply queries ``x`` by transposed relative key embeddings ``y``."""
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Pad or slice the embedding table to ``2 * length - 1`` offsets."""
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                (0, 0, pad_length, pad_length, 0, 0)
            )
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """Convert [B, H, L, 2L-1] relative logits to [B, H, L, L] absolute."""
        batch, heads, length, _ = x.size()
        # Pad one column, flatten, pad again and reshape so each row ends up
        # shifted by one position relative to the previous row.
        x = F.pad(x, (0, 1, 0, 0, 0, 0, 0, 0))
        x_flat = x.view(batch, heads, length * 2 * length)
        x_flat = F.pad(x_flat, (0, length - 1, 0, 0, 0, 0))
        x_final = x_flat.view(batch, heads, length + 1, 2 * length - 1)[:, :, :length, length - 1:]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """Convert [B, H, L, L] absolute weights to [B, H, L, 2L-1] relative."""
        batch, heads, length, _ = x.size()
        x = F.pad(x, (0, length - 1, 0, 0, 0, 0, 0, 0))
        x_flat = x.view(batch, heads, length ** 2 + length * (length - 1))
        x_flat = F.pad(x_flat, (length, 0, 0, 0, 0, 0))
        x_final = x_flat.view(batch, heads, length, 2 * length)[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        """Return a [1, 1, L, L] bias of ``-log(1 + |i - j|)``."""
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
170
+
171
+
172
class FFN(nn.Module):
    """Two-layer convolutional feed-forward network.

    Inputs are padded explicitly before each convolution, either causally
    (left only) or symmetrically, selected by ``causal``.

    Args:
        in_channels: Input channel count.
        out_channels: Output channel count.
        filter_channels: Hidden channel count between the two convolutions.
        kernel_size: Kernel size of both convolutions.
        p_dropout: Dropout probability applied after the activation.
        activation: ``"gelu"`` selects the sigmoid approximation
            ``x * sigmoid(1.702 * x)``; anything else selects ReLU.
        causal: Use left-only padding when true.
    """

    def __init__(self, in_channels: int, out_channels: int, filter_channels: int,
                 kernel_size: int, p_dropout: float = 0.0, activation: str = None,
                 causal: bool = False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # Bind the padding strategy once so forward() stays branch-free.
        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply conv -> activation -> dropout -> conv and mask the result."""
        hidden = self.conv_1(self.padding(x))
        if self.activation == "gelu":
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        projected = self.conv_2(self.padding(hidden))
        return projected * x_mask

    def _causal_padding(self, x):
        """Pad ``kernel_size - 1`` frames on the left only."""
        if self.kernel_size == 1:
            return x
        left, right = self.kernel_size - 1, 0
        return F.pad(x, (left, right, 0, 0, 0, 0))

    def _same_padding(self, x):
        """Pad both sides so the convolution keeps the sequence length."""
        if self.kernel_size == 1:
            return x
        left = (self.kernel_size - 1) // 2
        right = self.kernel_size // 2
        return F.pad(x, (left, right, 0, 0, 0, 0))
219
+
220
+
221
class Encoder(nn.Module):
    """Stack of self-attention + feed-forward layers with post-layer-norm.

    Args:
        hidden_channels: Channel count of the input and of every layer.
        filter_channels: Hidden width of each feed-forward network.
        n_heads: Number of attention heads.
        n_layers: Number of attention/FFN layer pairs.
        kernel_size: Kernel size of the feed-forward convolutions.
        p_dropout: Dropout probability.
        window_size: Relative-attention window passed to each attention layer.
    """

    def __init__(self, hidden_channels: int, filter_channels: int, n_heads: int,
                 n_layers: int, kernel_size: int = 1, p_dropout: float = 0.0,
                 window_size: int = 10):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()

        for _ in range(n_layers):
            attention = MultiHeadAttention(
                hidden_channels, hidden_channels, n_heads,
                p_dropout=p_dropout, window_size=window_size
            )
            self.attn_layers.append(attention)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            feed_forward = FFN(
                hidden_channels, hidden_channels, filter_channels,
                kernel_size, p_dropout=p_dropout
            )
            self.ffn_layers.append(feed_forward)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Encode ``x`` under ``x_mask`` and return the masked result."""
        # Pairwise mask: the product of the mask with itself along a new axis.
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        hidden = x * x_mask
        for layer_idx in range(self.n_layers):
            attended = self.attn_layers[layer_idx](hidden, hidden, attn_mask)
            hidden = self.norm_layers_1[layer_idx](hidden + self.drop(attended))

            transformed = self.ffn_layers[layer_idx](hidden, x_mask)
            hidden = self.norm_layers_2[layer_idx](hidden + self.drop(transformed))
        return hidden * x_mask
269
+
270
+
271
class TextEncoder(nn.Module):
    """Prior encoder for RVC: embeds phone features and pitch indices.

    The summed embeddings pass through a Transformer ``Encoder`` and are
    projected to a mean and a log-scale tensor.

    Args:
        out_channels: Channel count of each of the two outputs.
        hidden_channels: Width of the embeddings and of the encoder.
        filter_channels: Hidden width of the encoder feed-forward networks.
        n_heads: Number of attention heads.
        n_layers: Number of encoder layers.
        kernel_size: Kernel size of the feed-forward convolutions.
        p_dropout: Dropout probability.
        f0: Whether a pitch embedding is created and used.
    """

    def __init__(self, out_channels: int, hidden_channels: int, filter_channels: int,
                 n_heads: int, n_layers: int, kernel_size: int, p_dropout: float,
                 f0: bool = True):
        super().__init__()
        self.out_channels = out_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.f0 = f0

        # Phone embedding: linear projection from 768-dim HuBERT features
        self.emb_phone = nn.Linear(768, hidden_channels)

        # Pitch embedding (only if f0 is enabled)
        if f0:
            self.emb_pitch = nn.Embedding(256, hidden_channels)

        # Transformer encoder
        self.encoder = Encoder(
            hidden_channels, filter_channels, n_heads, n_layers,
            kernel_size, p_dropout
        )

        # Output projection to mean and log-variance
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, phone, pitch, lengths):
        """
        Args:
            phone: [B, 768, T] phone features from HuBERT (channels first)
            pitch: [B, T] pitch indices (0-255)
            lengths: [B] sequence lengths

        Returns:
            m: [B, out_channels, T] mean
            logs: [B, out_channels, T] log-variance
            x_mask: [B, 1, T] mask
        """
        import logging
        log = logging.getLogger(__name__)
        # The debug messages are f-strings that call ``.item()`` on tensor
        # reductions, so they would be evaluated on every call even with
        # DEBUG disabled. Build them only when they will be emitted.
        debug = log.isEnabledFor(logging.DEBUG)

        if debug:
            log.debug(f"[TextEncoder] 输入 phone: shape={phone.shape}")
            log.debug(f"[TextEncoder] 输入 pitch: shape={pitch.shape}, max={pitch.max().item()}, min={pitch.min().item()}")
            log.debug(f"[TextEncoder] 输入 lengths: {lengths}")

        # Transpose phone from [B, C, T] to [B, T, C] for the linear layer
        phone = phone.transpose(1, 2)  # [B, T, 768]
        if debug:
            log.debug(f"[TextEncoder] 转置后 phone: shape={phone.shape}")

        # Create mask
        x_mask = torch.unsqueeze(
            self._sequence_mask(lengths, phone.size(1)), 1
        ).to(phone.dtype)
        if debug:
            log.debug(f"[TextEncoder] x_mask: shape={x_mask.shape}, sum={x_mask.sum().item()}")

        # Phone embedding
        x = self.emb_phone(phone)  # [B, T, hidden_channels]
        if debug:
            log.debug(f"[TextEncoder] emb_phone 输出: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}")

        # Add pitch embedding if enabled
        if self.f0 and pitch is not None:
            # Clamp pitch to the embedding table's index range
            pitch_clamped = torch.clamp(pitch, 0, 255)
            pitch_emb = self.emb_pitch(pitch_clamped)
            if debug:
                log.debug(f"[TextEncoder] emb_pitch 输出: shape={pitch_emb.shape}, max={pitch_emb.abs().max().item():.4f}")
            x = x + pitch_emb

        # NOTE(review): the reference RVC TextEncoder scales the summed
        # embedding by sqrt(hidden_channels) and applies LeakyReLU before the
        # encoder; confirm that omitting both here is intentional.

        # Transpose for conv layers: [B, hidden_channels, T]
        x = x.transpose(1, 2)
        if debug:
            log.debug(f"[TextEncoder] 转置后 x: shape={x.shape}")

        # Apply mask
        x = x * x_mask

        # Transformer encoder
        x = self.encoder(x, x_mask)
        if debug:
            log.debug(f"[TextEncoder] Transformer 输出: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}")

        # Project to mean and log-variance
        stats = self.proj(x) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        if debug:
            log.debug(f"[TextEncoder] 最终输出 m: shape={m.shape}, max={m.abs().max().item():.4f}")
            log.debug(f"[TextEncoder] 最终输出 logs: shape={logs.shape}, max={logs.max().item():.4f}, min={logs.min().item():.4f}")

        return m, logs, x_mask

    def _sequence_mask(self, length, max_length=None):
        """Return a boolean [B, max_length] mask that is True below each length."""
        if max_length is None:
            max_length = length.max()
        x = torch.arange(max_length, dtype=length.dtype, device=length.device)
        return x.unsqueeze(0) < length.unsqueeze(1)
368
+
369
+
370
class ResidualCouplingBlock(nn.Module):
    """Residual coupling block: ``n_flows`` coupling layers, each followed by a Flip.

    Args:
        channels: Channel count of the tensor being transformed.
        hidden_channels: Hidden width of each coupling layer.
        kernel_size: Kernel size forwarded to each coupling layer.
        dilation_rate: Dilation rate forwarded to each coupling layer.
        n_layers: Number of WaveNet layers in each coupling layer.
        n_flows: Number of (coupling layer, Flip) pairs.
        gin_channels: Conditioning channel count forwarded to each layer.
    """

    def __init__(self, channels: int, hidden_channels: int, kernel_size: int,
                 dilation_rate: int, n_layers: int, n_flows: int = 4,
                 gin_channels: int = 0):
        super().__init__()
        self.flows = nn.ModuleList()

        for _ in range(n_flows):
            coupling = ResidualCouplingLayer(
                channels, hidden_channels, kernel_size,
                dilation_rate, n_layers, gin_channels=gin_channels
            )
            self.flows.append(coupling)
            self.flows.append(Flip())

    def forward(self, x, x_mask, g=None, reverse=False):
        """Apply the flows in order, or in reversed order when ``reverse``.

        In forward mode each flow's result is unpacked as a two-element
        tuple; in reverse mode each flow must return the tensor itself.
        """
        if reverse:
            for step in reversed(self.flows):
                x = step(x, x_mask, g=g, reverse=reverse)
            return x

        for step in self.flows:
            x, _ = step(x, x_mask, g=g, reverse=reverse)
        return x
396
+
397
+
398
class ResidualCouplingLayer(nn.Module):
    """Residual (additive) coupling layer.

    The first half of the channels passes through unchanged and drives a
    WaveNet-style network whose output shifts the second half.

    Args:
        channels: Total channel count; split into two equal halves.
        hidden_channels: Hidden width of the internal network.
        kernel_size: Kernel size of the internal network.
        dilation_rate: Dilation rate of the internal network.
        n_layers: Number of layers of the internal network.
        mean_only: Stored on the instance; the layer only ever predicts a
            shift.
        gin_channels: Conditioning channel count.
    """

    def __init__(self, channels: int, hidden_channels: int, kernel_size: int,
                 dilation_rate: int, n_layers: int, mean_only: bool = True,
                 gin_channels: int = 0):
        super().__init__()
        self.half_channels = channels // 2
        self.mean_only = mean_only

        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels)
        self.post = nn.Conv1d(hidden_channels, self.half_channels, 1)
        # Zero-initialize the output projection so the layer starts as identity.
        self.post.weight.data.zero_()
        self.post.bias.data.zero_()

    def forward(self, x, x_mask, g=None, reverse=False):
        """Shift the second channel half by a function of the first half.

        Returns ``(x, None)`` in forward mode and the bare tensor in reverse
        mode.
        """
        kept, shifted = torch.split(x, [self.half_channels] * 2, dim=1)
        hidden = self.enc(self.pre(kept) * x_mask, x_mask, g=g)
        shift = self.post(hidden) * x_mask

        if reverse:
            shifted = (shifted - shift) * x_mask
            return torch.cat([kept, shifted], dim=1)

        shifted = shift + shifted * x_mask
        return torch.cat([kept, shifted], dim=1), None
429
+
430
+
431
class Flip(nn.Module):
    """Flow step that reverses the channel order."""

    def forward(self, x, *args, reverse=False, **kwargs):
        """Flip ``x`` along dimension 1.

        Returns ``(x, None)`` in forward mode and the bare tensor in reverse
        mode. This matches ``ResidualCouplingLayer`` and the
        ``x, _ = flow(...)`` unpacking in ``ResidualCouplingBlock.forward``.
        Returning a bare tensor in forward mode made that unpacking iterate
        over the batch dimension instead.
        """
        x = torch.flip(x, [1])
        if not reverse:
            return x, None
        return x
437
+
438
+
439
class WN(nn.Module):
    """WaveNet-style stack of gated dilated convolutions (weight-normalized).

    Args:
        hidden_channels: Channel count of the input and of every layer.
        kernel_size: Kernel size of the dilated convolutions.
        dilation_rate: Base of the per-layer dilation (``dilation_rate ** i``).
        n_layers: Number of gated layers.
        gin_channels: Conditioning channel count; 0 disables conditioning.
        p_dropout: Dropout probability applied to the gated activations.
    """

    def __init__(self, hidden_channels: int, kernel_size: int,
                 dilation_rate: int, n_layers: int, gin_channels: int = 0,
                 p_dropout: float = 0):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_channels = hidden_channels
        self.gin_channels = gin_channels

        self.in_layers = nn.ModuleList()
        self.res_skip_layers = nn.ModuleList()
        self.drop = nn.Dropout(p_dropout)

        if gin_channels > 0:
            # One projection produces the conditioning for all layers at once.
            self.cond_layer = nn.utils.weight_norm(
                nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
            )

        final_layer = n_layers - 1
        for layer_idx in range(n_layers):
            dilation = dilation_rate ** layer_idx
            padding = (kernel_size * dilation - dilation) // 2
            gated_conv = nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
                                   dilation=dilation, padding=padding)
            self.in_layers.append(nn.utils.weight_norm(gated_conv))

            # All but the last layer emit residual + skip (2 * hidden_channels);
            # the last layer emits hidden_channels only.
            res_skip_channels = (
                2 * hidden_channels if layer_idx < final_layer else hidden_channels
            )
            self.res_skip_layers.append(
                nn.utils.weight_norm(nn.Conv1d(hidden_channels, res_skip_channels, 1))
            )

    def forward(self, x, x_mask, g=None):
        """Run the gated stack and return the masked sum of skip outputs."""
        width = self.hidden_channels
        skip_sum = torch.zeros_like(x)

        if g is not None and self.gin_channels > 0:
            g = self.cond_layer(g)

        for layer_idx in range(self.n_layers):
            pre_gate = self.in_layers[layer_idx](x)
            if g is not None:
                # Slice out this layer's share of the conditioning projection.
                start = layer_idx * 2 * width
                pre_gate = pre_gate + g[:, start:start + 2 * width, :]

            gated = torch.tanh(pre_gate[:, :width]) * torch.sigmoid(pre_gate[:, width:])
            gated = self.drop(gated)
            res_skip = self.res_skip_layers[layer_idx](gated)

            if layer_idx < self.n_layers - 1:
                # First n-1 layers: first half is residual, second half is skip
                x = (x + res_skip[:, :width]) * x_mask
                skip_sum = skip_sum + res_skip[:, width:]
            else:
                # Last layer: the single output feeds both residual and skip
                x = (x + res_skip) * x_mask
                skip_sum = skip_sum + res_skip

        return skip_sum * x_mask
506
+
507
+
508
class PosteriorEncoder(nn.Module):
    """Posterior encoder: WaveNet-style network followed by a sampling step.

    Args:
        in_channels: Channel count of the input.
        out_channels: Channel count of the sampled latent.
        hidden_channels: Hidden width of the internal network.
        kernel_size: Kernel size of the internal network.
        dilation_rate: Dilation rate of the internal network.
        n_layers: Number of layers of the internal network.
        gin_channels: Conditioning channel count.
    """

    def __init__(self, in_channels: int, out_channels: int, hidden_channels: int,
                 kernel_size: int, dilation_rate: int, n_layers: int,
                 gin_channels: int = 0):
        super().__init__()
        self.out_channels = out_channels

        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels)
        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

    def forward(self, x, x_lengths, g=None):
        """Encode ``x`` and draw a sample.

        Returns:
            Tuple ``(z, m, logs, x_mask)``: the sample, the two halves of the
            projection it was drawn from, and the length mask.
        """
        valid = self._sequence_mask(x_lengths, x.size(2))
        x_mask = torch.unsqueeze(valid, 1).to(x.dtype)

        hidden = self.pre(x) * x_mask
        hidden = self.enc(hidden, x_mask, g=g)
        stats = self.proj(hidden) * x_mask
        m, logs = torch.split(stats, self.out_channels, dim=1)
        # Reparameterized sample: mean plus noise scaled by exp(logs).
        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
        return z, m, logs, x_mask

    def _sequence_mask(self, length, max_length=None):
        """Return a boolean [B, max_length] mask that is True below each length."""
        if max_length is None:
            max_length = length.max()
        positions = torch.arange(max_length, dtype=length.dtype, device=length.device)
        return positions.unsqueeze(0) < length.unsqueeze(1)
538
+
539
+
540
class Generator(nn.Module):
    """NSF-HiFi-GAN generator (weight-normalized upsampling stack).

    Args:
        initial_channel: Channel count of the latent input.
        resblock_kernel_sizes: Kernel size of each residual block per stage.
        resblock_dilation_sizes: Dilation tuple of each residual block.
        upsample_rates: Stride of each transposed-convolution stage.
        upsample_initial_channel: Channel count after ``conv_pre``; halved at
            every upsampling stage.
        upsample_kernel_sizes: Kernel size of each upsampling stage.
        gin_channels: Conditioning channel count; 0 disables ``cond``.
        sr: Sample rate handed to the NSF source module.
        is_half: Stored on the instance; not otherwise used by this class.
    """

    def __init__(self, initial_channel: int, resblock_kernel_sizes: list,
                 resblock_dilation_sizes: list, upsample_rates: list,
                 upsample_initial_channel: int, upsample_kernel_sizes: list,
                 gin_channels: int = 0, sr: int = 40000, is_half: bool = False):
        super().__init__()
        self.num_kernels = len(resblock_kernel_sizes)
        self.num_upsamples = len(upsample_rates)
        self.sr = sr
        self.is_half = is_half

        # Total upsampling factor
        self.upp = int(np.prod(upsample_rates))

        self.conv_pre = nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, 3)

        # NSF source module
        self.m_source = SourceModuleHnNSF(sample_rate=sr, harmonic_num=0)

        # Convolutions that bring the excitation to each stage's resolution
        self.noise_convs = nn.ModuleList()

        self.ups = nn.ModuleList()
        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
            c_cur = upsample_initial_channel // (2 ** (i + 1))
            self.ups.append(
                nn.utils.weight_norm(
                    nn.ConvTranspose1d(
                        upsample_initial_channel // (2 ** i),
                        c_cur,
                        k, u, (k - u) // 2
                    )
                )
            )
            # Excitation convolution for this stage
            if i + 1 < len(upsample_rates):
                # Downsample the full-rate excitation by the remaining factor
                stride_f0 = int(np.prod(upsample_rates[i + 1:]))
                self.noise_convs.append(
                    nn.Conv1d(1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)
                )
            else:
                self.noise_convs.append(nn.Conv1d(1, c_cur, kernel_size=1))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = upsample_initial_channel // (2 ** (i + 1))
            for k, d in zip(resblock_kernel_sizes, resblock_dilation_sizes):
                self.resblocks.append(ResBlock(ch, k, d))

        # Channel count after the last stage, computed explicitly instead of
        # relying on the loop variable leaking out of the loop above (which
        # is undefined when there are no upsampling stages).
        final_channels = upsample_initial_channel // (2 ** len(self.ups))
        self.conv_post = nn.Conv1d(final_channels, 1, 7, 1, 3, bias=False)

        if gin_channels > 0:
            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)

    def forward(self, x, f0, g=None):
        """Synthesize a waveform from latent frames and an F0 contour.

        Args:
            x: Latent tensor fed to ``conv_pre``.
            f0: F0 contour handed to ``self.m_source`` with ``self.upp``.
            g: Optional conditioning tensor; when given, ``self.cond(g)`` is
                added to the ``conv_pre`` output.

        Returns:
            The ``tanh`` of the ``conv_post`` output.
        """
        import logging
        log = logging.getLogger(__name__)
        # The debug messages are f-strings that call ``.item()`` on tensor
        # reductions, so they would be evaluated on every call even with
        # DEBUG disabled. Build them only when they will be emitted.
        debug = log.isEnabledFor(logging.DEBUG)

        if debug:
            log.debug(f"[Generator] 输入 x: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}")
            log.debug(f"[Generator] 输入 f0: shape={f0.shape}, max={f0.max().item():.1f}, min={f0.min().item():.1f}")
            if g is not None:
                log.debug(f"[Generator] 输入 g: shape={g.shape}, max={g.abs().max().item():.4f}")

        # Build the NSF excitation signal
        har_source, _, _ = self.m_source(f0, self.upp)
        har_source = har_source.transpose(1, 2)  # [B, 1, T*upp]
        if debug:
            log.debug(f"[Generator] NSF har_source: shape={har_source.shape}, max={har_source.abs().max().item():.4f}")

        x = self.conv_pre(x)
        if debug:
            log.debug(f"[Generator] conv_pre 输出: shape={x.shape}, max={x.abs().max().item():.4f}")

        if g is not None:
            x = x + self.cond(g)
            if debug:
                log.debug(f"[Generator] 加入条件后: max={x.abs().max().item():.4f}")

        for i in range(self.num_upsamples):
            x = F.leaky_relu(x, 0.1)
            x = self.ups[i](x)

            # Mix in the excitation
            x_source = self.noise_convs[i](har_source)
            x = x + x_source

            # Average the outputs of this stage's residual blocks
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i * self.num_kernels + j](x)
                else:
                    xs += self.resblocks[i * self.num_kernels + j](x)
            x = xs / self.num_kernels
            if debug:
                log.debug(f"[Generator] 上采样层 {i}: shape={x.shape}, max={x.abs().max().item():.4f}")

        x = F.leaky_relu(x)
        x = self.conv_post(x)
        if debug:
            log.debug(f"[Generator] conv_post 输出: shape={x.shape}, max={x.abs().max().item():.4f}")
        x = torch.tanh(x)
        if debug:
            log.debug(f"[Generator] tanh 输出: shape={x.shape}, max={x.abs().max().item():.4f}")

        return x

    def remove_weight_norm(self):
        """Remove weight normalization from the upsamplers and residual blocks."""
        for l in self.ups:
            nn.utils.remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
647
+
648
+
649
class ResBlock(nn.Module):
    """Residual block (weight-normalized).

    Holds one dilated and one undilated convolution per entry of
    ``dilation``; every pair is wrapped in a residual connection.
    """

    def __init__(self, channels: int, kernel_size: int = 3, dilation: tuple = (1, 3, 5)):
        super().__init__()

        def _dilated(rate):
            # Padding chosen so an odd kernel keeps the sequence length.
            return nn.utils.weight_norm(
                nn.Conv1d(channels, channels, kernel_size, 1,
                          (kernel_size * rate - rate) // 2, dilation=rate)
            )

        def _plain():
            return nn.utils.weight_norm(
                nn.Conv1d(channels, channels, kernel_size, 1,
                          (kernel_size - 1) // 2)
            )

        self.convs1 = nn.ModuleList([_dilated(rate) for rate in dilation])
        self.convs2 = nn.ModuleList([_plain() for _ in dilation])

    def forward(self, x):
        """Run every convolution pair, adding its output back onto the input."""
        out = x
        for first, second in zip(self.convs1, self.convs2):
            residual = first(F.leaky_relu(out, 0.1))
            residual = second(F.leaky_relu(residual, 0.1))
            out = residual + out
        return out

    def remove_weight_norm(self):
        """Remove weight normalization from all convolutions of the block."""
        for conv in self.convs1:
            nn.utils.remove_weight_norm(conv)
        for conv in self.convs2:
            nn.utils.remove_weight_norm(conv)
683
+
684
+
685
class SineGenerator(nn.Module):
    """Sine-wave excitation generator, the core component of the NSF source."""

    def __init__(self, sample_rate: int, harmonic_num: int = 0,
                 sine_amp: float = 0.1, noise_std: float = 0.003,
                 voiced_threshold: float = 10):
        super().__init__()
        self.sample_rate = sample_rate
        self.harmonic_num = harmonic_num
        self.sine_amp = sine_amp
        self.noise_std = noise_std
        self.voiced_threshold = voiced_threshold
        # Number of output channels: the fundamental plus its overtones.
        self.dim = harmonic_num + 1

    def forward(self, f0: torch.Tensor, upp: int):
        """
        Generate the sine excitation signal.

        Args:
            f0: Fundamental-frequency tensor [B, T]
            upp: Upsampling factor

        Returns:
            Excitation signal [B, T*upp, harmonic_num + 1]; with the default
            ``harmonic_num=0`` this is [B, T*upp, 1].
        """
        with torch.no_grad():
            # Upsample F0 to the audio rate (nearest neighbour).
            f0 = f0.unsqueeze(1)  # [B, 1, T]
            f0_up = F.interpolate(f0, scale_factor=upp, mode='nearest')
            f0_up = f0_up.transpose(1, 2)  # [B, T*upp, 1]

            out_dtype = f0_up.dtype
            # A float32 running sum loses phase accuracy as the signal gets
            # longer, so accumulate in float64 on cpu/cuda. Other backends
            # keep the input dtype (float64 support there is not assumed).
            if f0_up.device.type in ("cpu", "cuda"):
                acc_dtype = torch.float64
            else:
                acc_dtype = out_dtype

            # One channel per harmonic: multiples 1..dim of the fundamental.
            harmonics = torch.arange(
                1, self.dim + 1, device=f0_up.device, dtype=acc_dtype
            )
            rad = f0_up.to(acc_dtype) * harmonics / self.sample_rate  # cycles per sample
            rad_acc = torch.cumsum(rad, dim=1) % 1  # accumulated phase in [0, 1)
            sine_wave = torch.sin(2 * np.pi * rad_acc).to(out_dtype) * self.sine_amp

            # Frames at or below the voiced threshold carry noise instead of sine.
            voiced_mask = (f0_up > self.voiced_threshold).float()
            noise = torch.randn_like(sine_wave) * self.noise_std
            sine_wave = sine_wave * voiced_mask + noise * (1 - voiced_mask)

        return sine_wave
727
+
728
+
729
class SourceModuleHnNSF(nn.Module):
    """Harmonic-plus-noise source module."""

    def __init__(self, sample_rate: int, harmonic_num: int = 0,
                 sine_amp: float = 0.1, noise_std: float = 0.003,
                 add_noise_std: float = 0.003):
        super().__init__()
        # Scale of the separate noise branch returned by forward().
        self.add_noise_std = add_noise_std
        self.sine_generator = SineGenerator(
            sample_rate, harmonic_num, sine_amp, noise_std
        )
        # Merges the sine channels into a single excitation channel.
        self.l_linear = nn.Linear(harmonic_num + 1, 1)
        self.l_tanh = nn.Tanh()

    def forward(self, f0: torch.Tensor, upp: int):
        """Build the excitation for an F0 contour.

        Args:
            f0: Fundamental-frequency tensor passed to the sine generator.
            upp: Upsampling factor passed to the sine generator.

        Returns:
            A 3-tuple ``(sine, noise, None)``: the merged sine excitation,
            Gaussian noise of the same shape scaled by ``add_noise_std``,
            and a ``None`` placeholder so callers can unpack three values.
        """
        sine = self.sine_generator(f0, upp)  # [B, T*upp, 1]
        sine = self.l_tanh(self.l_linear(sine))
        noise = torch.randn_like(sine) * self.add_noise_std
        return sine, noise, None  # three values to match the expected interface
747
+
748
+
749
class SynthesizerTrnMs768NSFsid(nn.Module):
    """RVC v2 synthesizer (768-dim HuBERT features + NSF + speaker ID)."""

    def __init__(self, spec_channels: int, segment_size: int,
                 inter_channels: int, hidden_channels: int, filter_channels: int,
                 n_heads: int, n_layers: int, kernel_size: int, p_dropout: float,
                 resblock: str, resblock_kernel_sizes: list,
                 resblock_dilation_sizes: list, upsample_rates: list,
                 upsample_initial_channel: int, upsample_kernel_sizes: list,
                 spk_embed_dim: int, gin_channels: int, sr: int):
        super().__init__()

        # Keep the full hyper-parameter set on the instance.
        self.spec_channels = spec_channels
        self.inter_channels = inter_channels
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.resblock = resblock
        self.resblock_kernel_sizes = resblock_kernel_sizes
        self.resblock_dilation_sizes = resblock_dilation_sizes
        self.upsample_rates = upsample_rates
        self.upsample_initial_channel = upsample_initial_channel
        self.upsample_kernel_sizes = upsample_kernel_sizes
        self.segment_size = segment_size
        self.gin_channels = gin_channels
        self.spk_embed_dim = spk_embed_dim
        self.sr = sr

        # Text encoder (a TextEncoder is used instead of a PosteriorEncoder).
        self.enc_p = TextEncoder(
            inter_channels, hidden_channels, filter_channels,
            n_heads, n_layers, kernel_size, p_dropout, f0=True
        )

        # Decoder / generator (NSF-HiFiGAN; it owns the m_source module).
        self.dec = Generator(
            inter_channels, resblock_kernel_sizes, resblock_dilation_sizes,
            upsample_rates, upsample_initial_channel, upsample_kernel_sizes,
            gin_channels, sr=sr
        )

        # Normalizing flow.
        self.flow = ResidualCouplingBlock(
            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
        )

        # Speaker embedding table.
        self.emb_g = nn.Embedding(spk_embed_dim, gin_channels)

    def forward(self, phone, phone_lengths, pitch, nsff0, sid, skip_head=0, return_length=0):
        """Run the encoder, the flow in its forward direction, and the decoder.

        ``skip_head`` and ``return_length`` are accepted but not used by this
        implementation.

        Returns:
            The decoder output.
        """
        g = self.emb_g(sid).unsqueeze(-1)

        # The text encoder yields the mean and log-scale of the prior.
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)

        # Sample outside the encoder.
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask

        # Flow, forward direction.
        z = self.flow(z_p, x_mask, g=g)

        # Decode to audio (F0 is passed through for the NSF source).
        o = self.dec(z, nsff0, g=g)

        return o

    def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=1.0):
        """Convert content features and pitch into audio for speaker ``sid``.

        ``rate`` is accepted but not used by this implementation.

        Returns:
            A tuple ``(o, x_mask)``: the decoder output and the mask returned
            by the text encoder.
        """
        import logging

        log = logging.getLogger(__name__)
        # The diagnostics below run tensor reductions and ``.item()`` calls.
        # f-strings are evaluated before ``log.debug`` can discard them, so
        # the work is only worth doing when DEBUG output is actually enabled.
        debug = log.isEnabledFor(logging.DEBUG)

        if debug:
            log.debug(f"[infer] 输入 phone: shape={phone.shape}, dtype={phone.dtype}")
            log.debug(f"[infer] 输入 phone 统计: max={phone.abs().max().item():.4f}, mean={phone.abs().mean().item():.4f}")
            log.debug(f"[infer] 输入 phone_lengths: {phone_lengths}")
            log.debug(f"[infer] 输入 pitch: shape={pitch.shape}, max={pitch.max().item()}, min={pitch.min().item()}")
            log.debug(f"[infer] 输入 nsff0: shape={nsff0.shape}, max={nsff0.max().item():.1f}, min={nsff0.min().item():.1f}")
            log.debug(f"[infer] 输入 sid: {sid}")

        g = self.emb_g(sid).unsqueeze(-1)
        if debug:
            log.debug(f"[infer] 说话人嵌入 g: shape={g.shape}, max={g.abs().max().item():.4f}")

        # The text encoder yields the mean and log-scale of the prior.
        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
        if debug:
            log.debug("[infer] TextEncoder 输出:")
            log.debug(f"[infer] m_p: shape={m_p.shape}, max={m_p.abs().max().item():.4f}, mean={m_p.abs().mean().item():.4f}")
            log.debug(f"[infer] logs_p: shape={logs_p.shape}, max={logs_p.max().item():.4f}, min={logs_p.min().item():.4f}")
            log.debug(f"[infer] x_mask: shape={x_mask.shape}, sum={x_mask.sum().item()}")

        # Sample outside the encoder; the noise is scaled by 0.66666 for a
        # more stable output.
        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
        if debug:
            log.debug(f"[infer] 采样后 z_p: shape={z_p.shape}, max={z_p.abs().max().item():.4f}, mean={z_p.abs().mean().item():.4f}")

        # Flow, reverse direction.
        z = self.flow(z_p, x_mask, g=g, reverse=True)
        if debug:
            log.debug(f"[infer] Flow 输出 z: shape={z.shape}, max={z.abs().max().item():.4f}, mean={z.abs().mean().item():.4f}")

        # Decode to audio; the generator builds the NSF excitation from F0.
        o = self.dec(z * x_mask, nsff0, g=g)
        if debug:
            log.debug(f"[infer] Generator 输出 o: shape={o.shape}, max={o.abs().max().item():.4f}, mean={o.abs().mean().item():.4f}")

        return o, x_mask
requirements_hf.txt ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RVC AI 翻唱依赖 (Hugging Face Space - CPU 精简版)
2
+ # 注意:此文件用于 HF Space 部署,同步到 Space 时需重命名为 requirements.txt
3
+ # 本地安装请使用 requirements.txt(包含完整 GPU 依赖)
4
+
5
+ # PyTorch
6
+ torch>=2.0.0
7
+ torchaudio>=2.0.0
8
+
9
+ # Gradio 界面
10
+ gradio==3.50.2
11
+
12
+ # 音频处理
13
+ librosa>=0.9.0
14
+ soundfile>=0.12.0
15
+ scipy>=1.10.0
16
+ numpy>=1.23.0
17
+ praat-parselmouth>=0.4.3
18
+ torchcrepe>=0.0.20
19
+
20
+ # 向量检索
21
+ faiss-cpu>=1.7.4
22
+
23
+ # 工具库
24
+ tqdm>=4.65.0
25
+ requests>=2.28.0
26
+ python-dotenv>=1.0.0
27
+ colorama>=0.4.6
28
+
29
+ # AI 翻唱功能(核心)
30
+ audio-separator
31
+ huggingface_hub>=0.19.0
32
+ pedalboard>=0.7.0
33
+ ffmpeg-python>=0.2.0
34
+
35
+ # 以下包在 HF Space 构建环境中编译失败,改为运行时按需安装:
36
+ # fairseq==0.12.2 (HuBERT 特征提取)
37
+ # demucs>=4.0.0 (人声分离备选)
38
+ # pyworld>=0.3.4 (F0 提取备选)
39
+ # av>=10.0.0 (音频解码备选)