diff --git a/.env.example b/.env.example new file mode 100644 index 0000000000000000000000000000000000000000..5d7a456273388d736f7d8926cf94bc86212168fd --- /dev/null +++ b/.env.example @@ -0,0 +1,15 @@ +# RVC 语音转换配置 + +# 设备设置 (cuda / cpu) +DEVICE=cuda + +# Gradio 服务器设置 +SERVER_HOST=127.0.0.1 +SERVER_PORT=7860 + +# 模型路径 +HUBERT_PATH=assets/hubert/hubert_base.pt +RMVPE_PATH=assets/rmvpe/rmvpe.pt + +# 日志级别 (DEBUG / INFO / WARNING / ERROR) +LOG_LEVEL=INFO diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..ff8517ba801b9aedda2c42b89ca5ac6c1a0d9ca8 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +docs/Colab演示.png filter=lfs diff=lfs merge=lfs -text +docs/Windows界面.png filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/build-executables.yml b/.github/workflows/build-executables.yml new file mode 100644 index 0000000000000000000000000000000000000000..d90e2454f380ebf50c468d402f71c8b12a13717e --- /dev/null +++ b/.github/workflows/build-executables.yml @@ -0,0 +1,373 @@ +name: Build Executables + +on: + push: + tags: + - 'v*' + workflow_dispatch: + inputs: + tag: + description: '要上传资源的 Release 标签(如 v1.2.0),留空则上传到最新 Release' + required: false + default: '' + +permissions: + contents: write + +jobs: + build: + strategy: + fail-fast: false + matrix: + include: + - os: windows-latest + variant: CPU + platform: Windows + pytorch_url: https://download.pytorch.org/whl/cpu + sep: ";" + shell: pwsh + - os: windows-latest + variant: GPU + platform: Windows + pytorch_url: https://download.pytorch.org/whl/cu121 + sep: ";" + shell: pwsh + - os: ubuntu-latest + variant: CPU + platform: Linux + pytorch_url: https://download.pytorch.org/whl/cpu + sep: ":" + shell: bash + - os: ubuntu-latest + variant: GPU + platform: Linux + pytorch_url: 
https://download.pytorch.org/whl/cu121 + sep: ":" + shell: bash + + runs-on: ${{ matrix.os }} + name: Build ${{ matrix.platform }}-${{ matrix.variant }} + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install system dependencies (Linux) + if: runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install -y build-essential libsndfile1 ffmpeg portaudio19-dev + + - name: Install FFmpeg (Windows) + if: runner.os == 'Windows' + run: choco install ffmpeg -y + + - name: Install Python dependencies + run: | + python -m pip install --upgrade "pip<24.1" + pip install pyinstaller + pip install torch torchaudio --index-url ${{ matrix.pytorch_url }} + pip install omegaconf==2.0.6 --no-deps + pip install PyYAML antlr4-python3-runtime hydra-core + pip install fairseq==0.12.2 --no-deps + pip install "jinja2<3.1.5" + pip install gradio==3.50.2 librosa soundfile scipy numpy praat-parselmouth pyworld faiss-cpu tqdm requests python-dotenv colorama huggingface_hub pedalboard ffmpeg-python av imageio-ffmpeg + pip install demucs torchcrepe --no-deps + pip install julius dora-search lameenc openunmix treetable + python -c "import torch; print(f'PyTorch: {torch.__version__}, CUDA: {torch.cuda.is_available()}')" + + - name: Install audio-separator + run: pip install audio-separator onnxruntime + + - name: Bundle FFmpeg runtime + shell: bash + run: | + python - <<'PY' + import os + import shutil + import stat + import subprocess + from pathlib import Path + + import imageio_ffmpeg + + def check_executable(executable: Path) -> None: + result = subprocess.run([str(executable), "-version"], text=True, capture_output=True) + if result.returncode != 0: + details = "\n".join( + part.strip() + for part in (result.stdout, result.stderr) + if part and part.strip() + ) + raise RuntimeError(f"{executable} failed -version:\n{details}") + + def resolve_ffprobe() -> Path: + 
ffprobe_src = shutil.which("ffprobe") + if not ffprobe_src: + raise RuntimeError("ffprobe not found after installing FFmpeg") + + ffprobe_path = Path(ffprobe_src) + if os.name == "nt": + chocolatey_root = Path(os.environ.get("ChocolateyInstall", r"C:\ProgramData\chocolatey")) + chocolatey_shim_dir = chocolatey_root / "bin" + if ffprobe_path.parent.resolve() == chocolatey_shim_dir.resolve(): + real_ffprobe = chocolatey_root / "lib" / "ffmpeg" / "tools" / "ffmpeg" / "bin" / "ffprobe.exe" + if not real_ffprobe.exists(): + raise RuntimeError(f"Chocolatey ffprobe shim found, but real binary is missing: {real_ffprobe}") + return real_ffprobe + + return ffprobe_path + + bundle_dir = Path("tools/ffmpeg/bin") + bundle_dir.mkdir(parents=True, exist_ok=True) + + ffmpeg_src = Path(imageio_ffmpeg.get_ffmpeg_exe()) + ffmpeg_name = "ffmpeg.exe" if os.name == "nt" else "ffmpeg" + ffmpeg_dest = bundle_dir / ffmpeg_name + shutil.copy2(ffmpeg_src, ffmpeg_dest) + ffmpeg_dest.chmod(ffmpeg_dest.stat().st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) + check_executable(ffmpeg_dest) + print(f"Bundled ffmpeg: {ffmpeg_dest}") + + ffprobe_src = resolve_ffprobe() + ffprobe_name = "ffprobe.exe" if os.name == "nt" else "ffprobe" + ffprobe_dest = bundle_dir / ffprobe_name + shutil.copy2(ffprobe_src, ffprobe_dest) + ffprobe_dest.chmod(ffprobe_dest.stat().st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH) + check_executable(ffprobe_dest)  # same helper as ffmpeg, so a failing binary reports its stdout/stderr + print(f"Bundled ffprobe: {ffprobe_dest}") + PY + + - name: Download all AI models + shell: bash + env: + PYTHONIOENCODING: utf-8 + run: | + echo "=== Download all base models (HuBERT, RMVPE, UVR5, DeEcho, pretrained) ===" + python tools/download_models.py --all + + echo "=== Download Roformer separator models ===" + python -c " + from audio_separator.separator import Separator + import os + model_dir = os.path.join('assets', 'separator_models') + os.makedirs(model_dir, exist_ok=True) + # Vocal 
separation model + sep = Separator(output_dir='.', model_file_dir=model_dir) + sep.load_model('model_bs_roformer_ep_317_sdr_12.9755.ckpt') + del sep + # Karaoke SOTA ensemble models + for karaoke_model in [ + 'mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt', + 'mel_band_roformer_karaoke_gabox_v2.ckpt', + 'mel_band_roformer_karaoke_becruily.ckpt', + ]: + sep2 = Separator(output_dir='.', model_file_dir=model_dir) + sep2.load_model(karaoke_model) + del sep2 + print('Roformer models downloaded.') + " + + echo "=== Verify downloaded models ===" + find assets/ -name "*.pt" -o -name "*.pth" -o -name "*.ckpt" -o -name "*.onnx" | while read f; do + SIZE=$(stat -c%s "$f" 2>/dev/null || stat -f%z "$f") + echo " $f ($(( SIZE / 1048576 )) MB)" + done + + - name: Build executable + shell: bash + run: | + SEP="${{ matrix.sep }}" + NAME="AI-RVC-${{ matrix.platform }}-${{ matrix.variant }}" + pyinstaller --name "${NAME}" \ + --onedir \ + --add-data "ui${SEP}ui" \ + --add-data "infer${SEP}infer" \ + --add-data "lib${SEP}lib" \ + --add-data "models${SEP}models" \ + --add-data "tools${SEP}tools" \ + --add-data "i18n${SEP}i18n" \ + --add-data "configs${SEP}configs" \ + --add-data "assets/hubert${SEP}assets/hubert" \ + --add-data "assets/rmvpe${SEP}assets/rmvpe" \ + --add-data "assets/uvr5_weights${SEP}assets/uvr5_weights" \ + --add-data "assets/pretrained_v2${SEP}assets/pretrained_v2" \ + --add-data "assets/separator_models${SEP}assets/separator_models" \ + --hidden-import=torch \ + --hidden-import=torchaudio \ + --hidden-import=gradio \ + --hidden-import=librosa \ + --hidden-import=soundfile \ + --hidden-import=fairseq \ + --hidden-import=audio_separator \ + --hidden-import=demucs \ + --hidden-import=pedalboard \ + --collect-all torch \ + --collect-all torchaudio \ + --collect-all gradio \ + --collect-all gradio_client \ + run.py + + - name: Create portable package + shell: bash + run: | + NAME="AI-RVC-${{ matrix.platform }}-${{ matrix.variant }}" + PKG="${NAME}-Portable" + 
mkdir -p "${PKG}" + # onedir 输出在 dist/NAME/ 目录下,复制全部内容 + cp -r dist/${NAME}/* "${PKG}/" + cp README.md "${PKG}/" + [ -f LICENSE ] && cp LICENSE "${PKG}/" + + if [ "${{ matrix.variant }}" = "GPU" ]; then + VARIANT_NOTE="GPU 版(CUDA 12.1),支持 NVIDIA 显卡加速 + 如果没有 NVIDIA 显卡,程序会自动回退到 CPU 推理" + else + VARIANT_NOTE="CPU 版,无需显卡即可运行 + 如需 GPU 加速,请下载 GPU 版本或使用本地安装方式:python install.py" + fi + + if [ "${{ matrix.platform }}" = "Windows" ]; then + EXE_NAME="${NAME}.exe" + cat > "${PKG}/使用说明.txt" << HEREDOC + AI-RVC ${{ matrix.platform }} 便携版(${{ matrix.variant }}) + + 使用方法: + 双击 ${EXE_NAME} 启动 + 浏览器访问 http://127.0.0.1:7860 + + ${VARIANT_NOTE} + + AI 模型已内置,无需额外下载 + 无需安装 Python,解压即用 + HEREDOC + else + EXE_NAME="${NAME}" + chmod +x "${PKG}/${EXE_NAME}" + cat > "${PKG}/使用说明.txt" << HEREDOC + AI-RVC ${{ matrix.platform }} 便携版(${{ matrix.variant }}) + + 使用方法: + chmod +x ${EXE_NAME} + ./${EXE_NAME} + 浏览器访问 http://127.0.0.1:7860 + + ${VARIANT_NOTE} + + AI 模型已内置,无需额外下载 + 无需安装 Python,解压即用 + HEREDOC + fi + + - name: Compress package + shell: bash + run: | + NAME="AI-RVC-${{ matrix.platform }}-${{ matrix.variant }}" + PKG="${NAME}-Portable" + + if [ "${{ matrix.platform }}" = "Windows" ]; then + # 先压缩成 zip + 7z a -tzip "${PKG}.zip" "${PKG}" + # 超过 1.9GB 时删除 zip 改用 7z(压缩率更高) + FILE_SIZE=$(stat -c%s "${PKG}.zip" 2>/dev/null || wc -c < "${PKG}.zip" | tr -d ' ') + if [ "$FILE_SIZE" -gt 1900000000 ]; then + rm "${PKG}.zip" + # 先尝试不分卷的 7z + 7z a "${PKG}.7z" "${PKG}" + SEVENZ_SIZE=$(stat -c%s "${PKG}.7z" 2>/dev/null || wc -c < "${PKG}.7z" | tr -d ' ') + if [ "$SEVENZ_SIZE" -gt 1900000000 ]; then + # 7z 也超限,改用分卷 + rm "${PKG}.7z" + 7z a -v1900m "${PKG}.7z" "${PKG}" + fi + fi + else + tar -czf "${PKG}.tar.gz" "${PKG}" + # 超过 1.9GB 时拆分 + FILE_SIZE=$(stat -c%s "${PKG}.tar.gz" 2>/dev/null || stat -f%z "${PKG}.tar.gz") + if [ "$FILE_SIZE" -gt 1900000000 ]; then + split -b 1900M "${PKG}.tar.gz" "${PKG}.tar.gz.part" + rm "${PKG}.tar.gz" + fi + fi + + - name: Upload artifact + uses: actions/upload-artifact@v4 
+ with: + name: AI-RVC-${{ matrix.platform }}-${{ matrix.variant }} + path: AI-RVC-${{ matrix.platform }}-${{ matrix.variant }}-Portable* + + upload-release: + needs: [build] + runs-on: ubuntu-latest + + steps: + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + merge-multiple: true + + - name: List artifacts + run: ls -lh AI-RVC-* + + - name: Determine target tag + id: tag + run: | + if [ "${{ github.event_name }}" = "push" ]; then + echo "tag=${GITHUB_REF#refs/tags/}" >> $GITHUB_OUTPUT + elif [ -n "${{ github.event.inputs.tag }}" ]; then + echo "tag=${{ github.event.inputs.tag }}" >> $GITHUB_OUTPUT + else + LATEST=$(gh release list --repo ${{ github.repository }} --limit 1 --json tagName -q '.[0].tagName') + echo "tag=${LATEST}" >> $GITHUB_OUTPUT + fi + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + - name: Upload assets to release + run: | + TAG="${{ steps.tag.outputs.tag }}" + echo "上传资源到 Release: ${TAG}" + # Collect every build artifact (including split-volume parts) + mapfile -t FILES < <(find . -maxdepth 1 -name "AI-RVC-*-Portable*" -type f | sort) + upload_asset_with_retry() { + local tag="$1" + local file="$2" + local attempt + local exit_code + + for attempt in 1 2 3; do + echo "上传(${attempt}/3): ${file}" + if timeout 30m gh release upload "${tag}" "${file}" \ + --repo ${{ github.repository }} \ + --clobber; then + return 0 + else + exit_code=$? # read here: after fi, a failed if without else reports status 0 + fi 
+ echo "上传失败(退出码=${exit_code}): ${file}" + if [ "${attempt}" -lt 3 ]; then + sleep $(( attempt * 20 )) + fi + done + + echo "上传最终失败: ${file}" + return 1 + } + echo "找到以下文件:" + for f in "${FILES[@]}"; do + SIZE=$(stat -c%s "$f" 2>/dev/null || stat -f%z "$f") + echo " $f ($(( SIZE / 1048576 )) MB)" + done + # 逐个上传 + for f in "${FILES[@]}"; do + upload_asset_with_retry "${TAG}" "$f" + done + echo "全部上传完成" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..0934bde22db48d298170a60541b019e0a232fae9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,80 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# 虚拟环境 +venv/ +ENV/ +env/ +.venv/ + +# IDE +.idea/ +.vscode/ +.claude/ +*.swp +*.swo + +# 环境变量 +.env + +# 模型文件 (太大,不提交) +assets/hubert/*.pt +assets/rmvpe/*.pt +assets/pretrained_v2/*.pth +assets/separator_models/ +assets/uvr5_weights/ +assets/weights/*.pth +assets/weights/*.index +assets/weights/characters/**/*.pth +assets/weights/characters/**/*.index +assets/weights/characters/**/ai_rvc_model.json +assets/weights/official_models/ +assets/weights/official_indexes/ + +# 临时文件 +temp/ +*.tmp +*.temp +*.log +logs/ + +# 音频文件 +*.wav +*.mp3 +*.flac +*.ogg +!assets/test/ + +# Gradio +flagged/ +/venv310 + +# 上游参考代码 (vendored, 独立 git repo) +_official_rvc/ + +# 输出文件 +output/ +outputs/ +/.playwright-cli +/AI-RVC-Windows-GPU-Portable +/AI-RVC-Windows-GPU-Portable.7z.* +/tools/ffmpeg/ +/scratch/ diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000000000000000000000000000000000000..b8f86ca797e7f91327405284a84188615bfb3dc7 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,126 @@ +# AGENTS.md + +This file provides guidance to Codex when working with code in this repository. 
+ +## Project Overview + +RVC v2 voice conversion & AI cover system. Users upload a song, the pipeline separates vocals from accompaniment (Mel-Band Roformer), converts the vocal timbre via RVC v2 (HuBERT + RMVPE + FAISS), then mixes the result back with the accompaniment. + +**Platform Support**: Windows / Linux / WSL2 / Google Colab + +**Key Features**: +- AI song covers with automatic vocal separation and mixing +- 117 downloadable character models +- 4 mixing presets (universal, vocal-focused, accompaniment-focused, live) +- Karaoke mode (lead/backing vocal separation) +- 4 VC preprocessing modes (auto, direct, uvr_deecho, legacy) +- Dual VC pipeline (current implementation vs official RVC) +- Multi-backend GPU support (CUDA, ROCm, XPU, DirectML, MPS) + +## Commands + +```powershell +# Activate venv (Windows) +.\venv310\Scripts\Activate.ps1 + +# Activate venv (Linux/WSL2) +source venv310/bin/activate + +# Install dependencies +python install.py # full install + launch +python install.py --check # check only +python install.py --cpu # CPU variant + +# Run +python run.py # default: http://127.0.0.1:7860 +python run.py --skip-check # skip env/model validation +python run.py --host 0.0.0.0 --port 8080 --share + +# Download base models (HuBERT, RMVPE) +python tools/download_models.py + +# Download character models +python -c "from tools.character_models import download_character_model; download_character_model('rin')" + +# Quick CUDA check +python -c "import torch; print(torch.cuda.is_available())" + +# Colab +# Open AI_RVC_Colab.ipynb in Google Colab, set runtime to GPU (T4), run cells sequentially +``` + +## Architecture + +**Entry:** `run.py` → env check → model check → `ui/app.py:launch()` + +**Pipeline flow** (`infer/cover_pipeline.py:CoverPipeline.process`): +1. Vocal separation (`infer/separator.py`) — Roformer (default), Demucs, or UVR5 +2. RVC voice conversion (`infer/pipeline.py`) — HuBERT features → RMVPE F0 → RVC v2 inference with FAISS retrieval +3. 
Mixing (`lib/mixer.py`) — volume adjust + reverb via pedalboard + +**Character model system** (`tools/character_models.py`): +- 117 downloadable character models from HuggingFace (`trioskosmos/rvc_models`) +- Stored in `assets/weights/characters/` +- Version notes (epochs, sample rate) extracted from .pth metadata and cached in `_version_notes.json` +- Display name assembly: `_get_display_name()` appends `(500 epochs·40k)` style training info + +**UI** (`ui/app.py`): +- Gradio 3.50.2, single-file ~2000 lines +- i18n via `i18n/zh_CN.json`, accessed through `t(key, section)` helper +- Three main tabs: song cover (full pipeline), model management, settings +- Cover tab features: + - Character model download/management with series filtering and keyword search + - 4 mixing presets (universal, vocal-focused, accompaniment-focused, live) + - Karaoke separation (lead/backing vocals) + - 4 VC preprocessing modes (auto, direct, uvr_deecho, legacy) + - Source constraint control (auto/off/on) + - Dual VC pipeline mode (current/official) + - Singing repair (official mode only) + - Real-time VC route status display +- Model management tab: + - Base model download (HuBERT, RMVPE) + - Mature DeEcho model download + - Model list table with refresh +- Settings tab: + - Device info display + - Backend selection (CUDA/ROCm/XPU/DirectML/MPS/CPU) + - Config save + +**Config:** `configs/config.json` — device, F0 method, index rate, cover separator settings, path mappings + +## Key Conventions + +- Python 3.10, UTF-8, 4-space indent +- `snake_case` functions/variables, `PascalCase` classes, `UPPER_SNAKE_CASE` constants +- User-facing text is bilingual Chinese/English +- Commit messages: short imperative subjects, Chinese/English mixed (e.g. 
`infer: fix CUDA OOM`) +- No automated test suite; verify changes by running one voice conversion + one cover through the UI +- `_official_rvc/` is vendored upstream reference — don't modify unless syncing + +## Important Paths + +- `configs/config.json` — all runtime settings +- `infer/cover_pipeline.py` — orchestrates the full cover workflow +- `infer/pipeline.py` — RVC v2 inference core +- `infer/separator.py` — Roformer/Demucs vocal separation wrappers +- `tools/character_models.py` — character model registry (117 entries) + download logic +- `tools/download_models.py` — base model (HuBERT/RMVPE) + mature DeEcho downloader +- `lib/mixer.py` — audio mixing with volume/reverb +- `ui/app.py` — entire Gradio UI (~2000 lines) +- `mcp/server.py` + `mcp/tools.py` — MCP server integration for Codex +- `AI_RVC_Colab.ipynb` — Google Colab notebook with full feature parity +- `install.py` — cross-platform installation script (Windows/Linux) + +## Things to Watch + +- `fairseq` is pinned to `0.12.2` — HuBERT loading breaks on other versions +- `audio-separator` must be installed with `[gpu]` extra for CUDA support +- Roformer model auto-downloads on first use to `assets/separator_models/` +- Gradio is pinned to `3.50.2`; the UI code uses v3 API patterns (not v4) +- Model weights (.pt, .pth) and audio files are gitignored — never commit them +- Path handling uses `pathlib.Path` for cross-platform compatibility (Windows/Linux) +- Virtual environment activation differs by platform: `Scripts/Activate.ps1` (Windows) vs `bin/activate` (Linux) +- `install.py` has hardcoded Windows Python paths in `PYTHON310_CANDIDATES` but falls back to `py -3.10` launcher +- Platform detection uses `os.name == "nt"` for Windows-specific logic (venv paths, etc.) 
+- All core functionality is platform-agnostic; audio libraries work better on Linux +- Colab notebook (`AI_RVC_Colab.ipynb`) provides full feature parity with Web UI diff --git a/AI-RVC.spec b/AI-RVC.spec new file mode 100644 index 0000000000000000000000000000000000000000..2f990d0a618d7cb9a05d7581c154da6e8909c036 --- /dev/null +++ b/AI-RVC.spec @@ -0,0 +1,71 @@ +# -*- mode: python ; coding: utf-8 -*- + +block_cipher = None + +a = Analysis( + ['run.py'], + pathex=[], + binaries=[], + datas=[ + ('ui', 'ui'), + ('infer', 'infer'), + ('lib', 'lib'), + ('models', 'models'), + ('tools', 'tools'), + ('i18n', 'i18n'), + ('configs', 'configs'), + ('assets/hubert', 'assets/hubert'), + ('assets/rmvpe', 'assets/rmvpe'), + ], + hiddenimports=[ + 'torch', + 'torchaudio', + 'gradio', + 'librosa', + 'soundfile', + 'scipy', + 'numpy', + 'fairseq', + 'audio_separator', + 'demucs', + 'pedalboard', + 'praat-parselmouth', + 'pyworld', + 'torchcrepe', + 'faiss', + 'huggingface_hub', + 'ffmpeg', + ], + hookspath=[], + hooksconfig={}, + runtime_hooks=[], + excludes=[], + win_no_prefer_redirects=False, + win_private_assemblies=False, + cipher=block_cipher, + noarchive=False, +) + +pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher) + +exe = EXE( + pyz, + a.scripts, + a.binaries, + a.zipfiles, + a.datas, + [], + name='AI-RVC', + debug=False, + bootloader_ignore_signals=False, + strip=False, + upx=True, + upx_exclude=[], + runtime_tmpdir=None, + console=True, + disable_windowed_traceback=False, + argv_emulation=False, + target_arch=None, + codesign_identity=None, + entitlements_file=None, +) diff --git a/AI_RVC_Colab.ipynb b/AI_RVC_Colab.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..c3413bc6ea519b7f645d30e8a3f9e0f6b70ee793 --- /dev/null +++ b/AI_RVC_Colab.ipynb @@ -0,0 +1,380 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# AI-RVC 一键 AI 翻唱 (Colab)\n", + "\n", + "[Open in 
Colab](https://colab.research.google.com/github/mason369/AI-RVC/blob/master/AI_RVC_Colab.ipynb)\n", + "\n", + "这是 AI-RVC 的 Google Colab 运行入口。Notebook 会创建独立 Python 3.10 环境,再运行项目安装脚本,避免 Colab 默认 Python 版本变化影响 `fairseq==0.12.2`。\n", + "\n", + "当前处理链路模型:\n", + "\n", + "| 环节 | 默认模型 |\n", + "|------|----------|\n", + "| 人声分离 | `ensemble:vocal_rvc` |\n", + "| 卡拉OK分离 | `ensemble:karaoke` |\n", + "| 去混响 / DeEcho | `dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt` |\n", + "| 内容特征 | `hubert_base.pt` |\n", + "| F0 音高 | `rmvpe.pt` |\n", + "| VC 推理 | RVC v2 `.pth` + 可选 FAISS `.index` |\n", + "\n", + "使用前请在 Colab 菜单选择:`代码执行程序 -> 更改运行时类型 -> T4 GPU`,然后按顺序运行每个单元格。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 检查 Colab 运行时\n", + "\n", + "确认当前机器有 GPU,并显示 Colab 默认 Python。后续实际安装会使用独立 Python 3.10 环境。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import subprocess\n", + "\n", + "print('Colab 默认 Python:', sys.version)\n", + "print('\\nGPU 信息:')\n", + "subprocess.run(['nvidia-smi'], check=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. 克隆或刷新仓库\n", + "\n", + "如果 `/content/AI-RVC` 已存在,会重置到 GitHub `master` 分支,确保 Colab 环境和仓库最新代码一致。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "set -euo pipefail\n", + "cd /content\n", + "if [ -d AI-RVC/.git ]; then\n", + " git -C AI-RVC fetch origin master\n", + " git -C AI-RVC reset --hard origin/master\n", + "else\n", + " git clone https://github.com/mason369/AI-RVC.git AI-RVC\n", + "fi\n", + "cd /content/AI-RVC\n", + "git rev-parse --short HEAD\n", + "ls -la" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
创建 Python 3.10 环境并安装依赖\n", + "\n", + "这个步骤会安装 `uv`,用它准备 Python 3.10,然后调用项目自带 `install.py --no-run`。如果任何依赖安装失败,单元格会直接停止并显示错误。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "set -euo pipefail\n", + "cd /content/AI-RVC\n", + "\n", + "sudo apt-get update -y\n", + "sudo apt-get install -y ffmpeg build-essential python3-dev curl git\n", + "\n", + "if ! command -v uv >/dev/null 2>&1; then\n", + " curl -LsSf https://astral.sh/uv/install.sh | sh\n", + "fi\n", + "export PATH=\"$HOME/.local/bin:$PATH\"\n", + "uv --version\n", + "uv python install 3.10\n", + "uv venv --seed --python 3.10 venv310\n", + "PY=/content/AI-RVC/venv310/bin/python\n", + "$PY --version\n", + "$PY -m pip --version\n", + "$PY -m pip install --upgrade pip\n", + "$PY -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu126\n", + "$PY - <<'PY'\n", + "import torch\n", + "print('Torch:', torch.__version__)\n", + "print('CUDA available:', torch.cuda.is_available())\n", + "if not torch.cuda.is_available():\n", + " raise RuntimeError('GPU PyTorch install completed but CUDA is not available.')\n", + "print('CUDA:', torch.version.cuda)\n", + "print('GPU:', torch.cuda.get_device_name(0))\n", + "PY\n", + "uv run --python 3.10 python install.py --no-run" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
严格验证依赖与默认模型配置\n", + "\n", + "这里会确认关键包、Python 版本、CUDA、Colab 默认分离模型、卡拉OK模型、DeEcho模型都符合当前项目配置。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "set -euo pipefail\n", + "cd /content/AI-RVC\n", + "PY=/content/AI-RVC/venv310/bin/python\n", + "$PY - <<'PY'\n", + "import importlib.metadata as md\n", + "import sys\n", + "import torch\n", + "from infer.separator import (\n", + " ROFORMER_DEFAULT_MODEL,\n", + " KARAOKE_DEFAULT_MODEL,\n", + " ROFORMER_DEREVERB_DEFAULT_MODEL,\n", + ")\n", + "\n", + "assert sys.version_info[:2] == (3, 10), sys.version\n", + "print('Python:', sys.version)\n", + "print('Torch:', torch.__version__)\n", + "print('CUDA available:', torch.cuda.is_available())\n", + "if not torch.cuda.is_available():\n", + " raise RuntimeError('Colab runtime did not provide CUDA. Please switch runtime type to T4 GPU and rerun.')\n", + "\n", + "expected = {\n", + " 'audio-separator': '0.44.1',\n", + " 'fairseq': '0.12.2',\n", + " 'gradio': '3.50.2',\n", + "}\n", + "for dist, minimum in expected.items():\n", + " version = md.version(dist)\n", + " print(f'{dist}: {version}')\n", + "\n", + "assert ROFORMER_DEFAULT_MODEL == 'ensemble:vocal_rvc', ROFORMER_DEFAULT_MODEL\n", + "assert KARAOKE_DEFAULT_MODEL == 'ensemble:karaoke', KARAOKE_DEFAULT_MODEL\n", + "assert ROFORMER_DEREVERB_DEFAULT_MODEL == 'dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt', ROFORMER_DEREVERB_DEFAULT_MODEL\n", + "\n", + "import faiss # noqa: F401\n", + "from audio_separator.separator import Separator # noqa: F401\n", + "import fairseq # noqa: F401\n", + "print('Dependency and model default checks passed.')\n", + "PY" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
下载基础模型\n", + "\n", + "下载 HuBERT、RMVPE、UVR5 HP2 等必需模型。模型会保存到仓库的 `assets/` 目录。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "set -euo pipefail\n", + "cd /content/AI-RVC\n", + "PY=/content/AI-RVC/venv310/bin/python\n", + "$PY tools/download_models.py\n", + "$PY - <<'PY'\n", + "from tools.download_models import REQUIRED_MODELS, check_model\n", + "missing = [name for name in REQUIRED_MODELS if not check_model(name)]\n", + "if missing:\n", + " raise RuntimeError('Missing required models: ' + ', '.join(missing))\n", + "print('Required models are present:', ', '.join(REQUIRED_MODELS))\n", + "PY" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. 可选:查看或下载声线模型\n", + "\n", + "处理链路模型已经在前面准备好。这里的 `.pth` 声线模型是 RVC 目标音色,不属于分离 / F0 / HuBERT 等处理模型。可以先下载一个示例模型,也可以跳过后在 Web UI 中下载。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "set -euo pipefail\n", + "cd /content/AI-RVC\n", + "PY=/content/AI-RVC/venv310/bin/python\n", + "$PY - <<'PY'\n", + "from tools.character_models import list_available_characters, list_available_series\n", + "print('Available series:')\n", + "for series in list_available_series():\n", + " print(' -', series)\n", + "chars = list_available_characters()\n", + "print('\\nFirst 20 voice models:')\n", + "for char in chars[:20]:\n", + " print(f\" - {char['name']}: {char.get('display', char.get('description', ''))}\")\n", + "print('\\nTotal:', len(chars))\n", + "PY" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optional: download one example voice model.\n", + "# Change character_name as needed, then run this cell.\n", + "\n", + "# %%bash\n", + "# set -euo pipefail\n", + "# cd /content/AI-RVC\n", + "# PY=/content/AI-RVC/venv310/bin/python\n", + "# $PY - <<'PY'\n", + "# from tools.character_models import 
download_character_model\n", + "# character_name = 'rin'\n", + "# if not download_character_model(character_name):\n", + "# raise RuntimeError(f'Failed to download voice model: {character_name}')\n", + "# print(f'Downloaded voice model: {character_name}')\n", + "# PY" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. 全模式处理矩阵验证\n", + "\n", + "上传一段含有人声的短音乐文件到 `/content/input_audio.*`,或把自己的文件路径填到 `INPUT_AUDIO`。此步骤会下载示例声线模型 `rin`,然后依次验证 RoFormer、Karaoke、Demucs、UVR5、官方 1:1、官方唱歌修复等处理模式。任何模式失败都会停止并在 JSON 里保留 traceback。\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Upload a short vocal music file, then set INPUT_AUDIO to its path.\n", + "# Example after upload: INPUT_AUDIO = '/content/input_audio.wav'\n", + "INPUT_AUDIO = '/content/input_audio.wav'\n", + "\n", + "from pathlib import Path\n", + "if not Path(INPUT_AUDIO).exists():\n", + " raise FileNotFoundError(\n", + " f'Missing test audio: {INPUT_AUDIO}. 
Upload a short vocal music file to this path before running the matrix.'\n", + " )\n", + "print('Matrix input:', INPUT_AUDIO, Path(INPUT_AUDIO).stat().st_size, 'bytes')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "set -euo pipefail\n", + "cd /content/AI-RVC\n", + "PY=/content/AI-RVC/venv310/bin/python\n", + "$PY - <<'PY'\n", + "from tools.character_models import download_character_model\n", + "if not download_character_model('rin'):\n", + " raise RuntimeError('Failed to download voice model: rin')\n", + "print('Voice model ready: rin')\n", + "PY\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "set -euo pipefail\n", + "cd /content/AI-RVC\n", + "PY=/content/AI-RVC/venv310/bin/python\n", + "$PY tools/run_mode_matrix.py \\\n", + " --input /content/input_audio.wav \\\n", + " --output-dir /content/AI-RVC/outputs/mode_matrix_colab \\\n", + " --require-cuda\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 8. 
启动 Web UI\n", + "\n", + "运行后 Colab 会输出 Gradio 公共链接。打开链接即可使用 AI-RVC Web UI。此单元格会再次检查环境和必需模型。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "set -euo pipefail\n", + "cd /content/AI-RVC\n", + "PY=/content/AI-RVC/venv310/bin/python\n", + "$PY run.py --host 0.0.0.0 --port 7860 --share" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 常见问题\n", + "\n", + "| 问题 | 处理方式 |\n", + "|------|----------|\n", + "| `nvidia-smi` 失败 | 在 Colab 菜单选择 GPU 运行时后重新运行 |\n", + "| `fairseq` 安装失败 | 确认本 notebook 的第 3 步使用了 Python 3.10 环境 |\n", + "| `CUDA out of memory` | 换短音频,或在 UI 中降低处理负载 / 切换分离器 |\n", + "| 模型下载失败 | 重新运行第 5 步;如果 Hugging Face 限流,可稍后再试或配置 token |\n", + "\n", + "本项目仅供学习研究和个人娱乐用途,请勿用于商业用途、欺诈、冒充他人或侵犯权益的场景。" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000000000000000000000000000000000000..76c32da3f935e9502f72d8dcaa510316c5b9cc38 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,126 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +RVC v2 voice conversion & AI cover system. Users upload a song, the pipeline separates vocals from accompaniment (Mel-Band Roformer), converts the vocal timbre via RVC v2 (HuBERT + RMVPE + FAISS), then mixes the result back with the accompaniment. 
+ +**Platform Support**: Windows / Linux / WSL2 / Google Colab + +**Key Features**: +- AI song covers with automatic vocal separation and mixing +- 117 downloadable character models +- 4 mixing presets (universal, vocal-focused, accompaniment-focused, live) +- Karaoke mode (lead/backing vocal separation) +- 4 VC preprocessing modes (auto, direct, uvr_deecho, legacy) +- Dual VC pipeline (current implementation vs official RVC) +- Multi-backend GPU support (CUDA, ROCm, XPU, DirectML, MPS) + +## Commands + +```powershell +# Activate venv (Windows) +.\venv310\Scripts\Activate.ps1 + +# Activate venv (Linux/WSL2) +source venv310/bin/activate + +# Install dependencies +python install.py # full install + launch +python install.py --check # check only +python install.py --cpu # CPU variant + +# Run +python run.py # default: http://127.0.0.1:7860 +python run.py --skip-check # skip env/model validation +python run.py --host 0.0.0.0 --port 8080 --share + +# Download base models (HuBERT, RMVPE) +python tools/download_models.py + +# Download character models +python -c "from tools.character_models import download_character_model; download_character_model('rin')" + +# Quick CUDA check +python -c "import torch; print(torch.cuda.is_available())" + +# Colab +# Open AI_RVC_Colab.ipynb in Google Colab, set runtime to GPU (T4), run cells sequentially +``` + +## Architecture + +**Entry:** `run.py` → env check → model check → `ui/app.py:launch()` + +**Pipeline flow** (`infer/cover_pipeline.py:CoverPipeline.process`): +1. Vocal separation (`infer/separator.py`) — Roformer (default), Demucs, or UVR5 +2. RVC voice conversion (`infer/pipeline.py`) — HuBERT features → RMVPE F0 → RVC v2 inference with FAISS retrieval +3. 
Mixing (`lib/mixer.py`) — volume adjust + reverb via pedalboard + +**Character model system** (`tools/character_models.py`): +- 117 downloadable character models from HuggingFace (`trioskosmos/rvc_models`) +- Stored in `assets/weights/characters/` +- Version notes (epochs, sample rate) extracted from .pth metadata and cached in `_version_notes.json` +- Display name assembly: `_get_display_name()` appends `(500 epochs·40k)` style training info + +**UI** (`ui/app.py`): +- Gradio 3.50.2, single-file ~2000 lines +- i18n via `i18n/zh_CN.json`, accessed through `t(key, section)` helper +- Three main tabs: song cover (full pipeline), model management, settings +- Cover tab features: + - Character model download/management with series filtering and keyword search + - 4 mixing presets (universal, vocal-focused, accompaniment-focused, live) + - Karaoke separation (lead/backing vocals) + - 4 VC preprocessing modes (auto, direct, uvr_deecho, legacy) + - Source constraint control (auto/off/on) + - Dual VC pipeline mode (current/official) + - Singing repair (official mode only) + - Real-time VC route status display +- Model management tab: + - Base model download (HuBERT, RMVPE) + - Mature DeEcho model download + - Model list table with refresh +- Settings tab: + - Device info display + - Backend selection (CUDA/ROCm/XPU/DirectML/MPS/CPU) + - Config save + +**Config:** `configs/config.json` — device, F0 method, index rate, cover separator settings, path mappings + +## Key Conventions + +- Python 3.10, UTF-8, 4-space indent +- `snake_case` functions/variables, `PascalCase` classes, `UPPER_SNAKE_CASE` constants +- User-facing text is bilingual Chinese/English +- Commit messages: short imperative subjects, Chinese/English mixed (e.g. 
`infer: fix CUDA OOM`) +- No automated test suite; verify changes by running one voice conversion + one cover through the UI +- `_official_rvc/` is vendored upstream reference — don't modify unless syncing + +## Important Paths + +- `configs/config.json` — all runtime settings +- `infer/cover_pipeline.py` — orchestrates the full cover workflow +- `infer/pipeline.py` — RVC v2 inference core +- `infer/separator.py` — Roformer/Demucs vocal separation wrappers +- `tools/character_models.py` — character model registry (117 entries) + download logic +- `tools/download_models.py` — base model (HuBERT/RMVPE) + mature DeEcho downloader +- `lib/mixer.py` — audio mixing with volume/reverb +- `ui/app.py` — entire Gradio UI (~2000 lines) +- `mcp/server.py` + `mcp/tools.py` — MCP server integration for Claude Code +- `AI_RVC_Colab.ipynb` — Google Colab notebook with full feature parity +- `install.py` — cross-platform installation script (Windows/Linux) + +## Things to Watch + +- `fairseq` is pinned to `0.12.2` — HuBERT loading breaks on other versions +- `audio-separator` must be installed with `[gpu]` extra for CUDA support +- Roformer model auto-downloads on first use to `assets/separator_models/` +- Gradio is pinned to `3.50.2`; the UI code uses v3 API patterns (not v4) +- Model weights (.pt, .pth) and audio files are gitignored — never commit them +- Path handling uses `pathlib.Path` for cross-platform compatibility (Windows/Linux) +- Virtual environment activation differs by platform: `Scripts/Activate.ps1` (Windows) vs `bin/activate` (Linux) +- `install.py` has hardcoded Windows Python paths in `PYTHON310_CANDIDATES` but falls back to `py -3.10` launcher +- Platform detection uses `os.name == "nt"` for Windows-specific logic (venv paths, etc.) 
+- All core functionality is platform-agnostic; audio libraries work better on Linux +- Colab notebook (`AI_RVC_Colab.ipynb`) provides full feature parity with Web UI diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..7c6dbbd3836d321b0dacc20943e189727c5e4c92 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README_HF.md b/README_HF.md index a6a94b65ae0c7c82e8199435e05e92e11292194d..71833c954495cf043b2b9509243d8a87156a6ba9 100644 --- a/README_HF.md +++ b/README_HF.md @@ -1,5 +1,5 @@ --- -title: AI-RVC 语音转换 & AI 翻唱 +title: AI-RVC 一键 AI 翻唱 emoji: 🎤 colorFrom: blue colorTo: purple @@ -10,15 +10,15 @@ pinned: false license: mit --- -# 🎤 AI-RVC 语音转换 & AI 翻唱 +# 🎤 AI-RVC 一键 AI 翻唱 -基于 RVC v2 + RMVPE 的高质量语音转换系统,支持一键 AI 翻唱功能。 +基于 RVC v2 的一键 AI 翻唱系统,自动完成人声分离、音色转换、混音合成全流程。 ## 功能特点 - **AI 歌曲翻唱**:上传歌曲自动分离人声、转换音色、混合伴奏,一键生成翻唱 - **人声分离**:默认 Mel-Band Roformer (KimberleyJensen),在 MVSEP 公开 Multisong 指标中为 Vocals SDR 11.01 / Instrum SDR 17.32 -- **语音转换**:RVC v2 架构 + FAISS 检索增强流程 +- **音色转换**:RVC v2 架构 + FAISS 检索增强流程 - **RMVPE 音高提取**:高精度 F0 提取,噪声鲁棒性强 - **角色模型**:内置 117 个可下载角色模型 - **混音效果**:支持人声混响、音量调节、4 种混音预设 @@ -88,7 +88,7 @@ license: mit ↓ 人声分离 (Mel-Band Roformer) ↓ - RVC 语音转换 (HuBERT + RMVPE + FAISS) + RVC 音色转换 (HuBERT + RMVPE + FAISS) ↓ 混音 (音量调节 + 混响) ↓ @@ -148,4 +148,4 @@ A: 建议选择与原唱性别、音色相近的角色,效果更自然。 **License**: MIT **Version**: 2.0 -**Last Updated**: 2026-03-10 +**Last Updated**: 2026-03-15 diff --git a/app.py b/app.py index cda3912ce8201a332a2cc7f05b024bbc0822d7f0..275c5ffff748eb201d4a411c7d6e0a40e02f38fe 100644 --- a/app.py +++ b/app.py @@ -7,19 +7,26 @@ import os import sys from pathlib import Path +# 添加项目根目录到路径 ROOT_DIR = Path(__file__).parent sys.path.insert(0, str(ROOT_DIR)) +from lib.ffmpeg_runtime import configure_ffmpeg_runtime + +configure_ffmpeg_runtime() + +# 设置环境变量 os.environ["GRADIO_SERVER_NAME"] = "0.0.0.0" os.environ["GRADIO_SERVER_PORT"] = "7860" -from ui.app import create_ui +# 导入并启动应用 +if __name__ == "__main__": + from ui.app import launch -app = create_ui() -app.queue() -app.launch( - server_name="0.0.0.0", - server_port=7860, - share=False, - inbrowser=False, -) + # 启动 Gradio 界面 + launch( + server_name="0.0.0.0", + server_port=7860, + share=False, + inbrowser=False + ) diff --git a/assets/weights/characters/_version_notes.json 
b/assets/weights/characters/_version_notes.json new file mode 100644 index 0000000000000000000000000000000000000000..7265828a3a0cafece460fe7bb7fedaf2d1a6bdbc --- /dev/null +++ b/assets/weights/characters/_version_notes.json @@ -0,0 +1,120 @@ +{ + "Aimi": "200 epochs · 48k", + "hanayo": "250 epochs · 40k", + "rin": "250 epochs · 40k", + "umi": "250 epochs · 40k", + "nozomi": "250 epochs · 48k", + "dia": "250 epochs · 40k", + "ayumu": "250 epochs · 40k", + "chika": "500 epochs · 40k", + "riko": "500 epochs · 40k", + "mari": "400 epochs · RVC v2", + "yohane": "500 epochs · 48k", + "nico": "300 epochs · 40k", + "kanata": "250 epochs · 40k", + "setsuna_yuki_nana": "300 epochs · 48k", + "hanamaru_zurakichi": "195 epochs · 40k", + "kanan": null, + "yu_takasaki": null, + "shizuku_osaka": null, + "sarah_kazuno": "500 epochs · 40k", + "leah_kazuno": "500 epochs · 40k", + "eli": "350 epochs · 40k", + "you": "400 epochs · 40k", + "honoka": "1000 epochs · 48k", + "kotori": "309 epochs · 48k", + "maki": "135 epochs · 40k", + "ruby": "207 epochs · 48k", + "kasumi": "414 epochs · 48k", + "karin": "1000 epochs · 48k", + "rina": "248 epochs · 48k", + "lanzhu": "248 epochs · 48k", + "keke": "151 epochs · 48k", + "ai_miyashita": "400 epochs · 32k", + "emma_verde": "450 epochs · 32k", + "shioriko_mifune": "500 epochs · 32k", + "chisato_arashi": "450 epochs · 32k", + "ren_hazuki": "400 epochs · 32k", + "sumire_heanna": "400 epochs · 32k", + "kinako_sakurakoji": "400 epochs · 32k", + "mei_yoneme": "400 epochs · 32k", + "shiki_wakana": "400 epochs · 32k", + "natsumi_onitsuka": "450 epochs · 32k", + "sayaka_murano": "400 epochs · 32k", + "tsuzuri_yugiri": "400 epochs · 32k", + "hanamaru": "410 epochs · 40k", + "setsuna_yuki_og": "910 epochs · 40k", + "setsuna_yuki_coco": "555 epochs · 48k", + "hanayo_pre_anime_v0": "222 epochs · 40k", + "hanayo_pre_anime_v1": "286 epochs · 48k", + "kotori_pre_anime": "216 epochs · 48k", + "kotori_pre_anime_v2": "401 epochs · 40k", + "wien_margarete": "453 
epochs · 40k", + "mia_taylor": "285 epochs · 40k", + "anju_yuki_arise": "446 epochs · 40k", + "erena_todo_arise": "480 epochs · 40k", + "tsubasa_kira_arise": "729 epochs · 40k", + "kanon": "150 epochs · 40k", + "setsuna": "504 epochs · 48k", + "setsuna_v2": "302 epochs · 48k", + "chika_v2": "453 epochs · 48k", + "dia_v2": "250 epochs · 40k", + "hanamaru_v2": "229 epochs · 48k", + "honoka_v2": "214 epochs · 48k", + "ayumu_v2": "214 epochs · 48k", + "ayumu_v3": "250 epochs · 40k", + "ayumu_v4": "250 epochs · 40k", + "karin_v2": "301 epochs · 48k", + "keke_v2": "102 epochs · 48k", + "riko_v2": "346 epochs · 48k", + "umi_v2": "250 epochs · 40k", + "wien_v2": "257 epochs · 48k", + "yohane_trios": "155 epochs · 48k", + "yoshiko_trios": "20 epochs · 48k", + "yoshiko_v2": "211 epochs · 48k", + "tokai_teio": "570 epochs · RVC v2", + "focalors": "940 epochs · 48k", + "essex": "925 epochs · 40k", + "ellie": "40k", + "furina": "170 epochs · 40k", + "ayaka": "300 epochs · 40k", + "takane_shijou": "565 epochs · 40k", + "kobo": "500 epochs · 48k", + "kaela": "400 epochs · 48k", + "pekora": "600 epochs · 48k", + "kizuna_ai": "300 epochs · 40k", + "fuwawa": "250 epochs · 48k", + "mococo": "250 epochs · 48k", + "vo_furina_kr": "300 epochs · 48k", + "silverwolf_kr": "300 epochs · 48k", + "sparkle_e40": "40 epochs · 40k", + "ranko": "250 epochs · 40k", + "yumemiriamu": "250 epochs · 40k", + "hatsune_miku": null, + "nahida": null, + "nilou": null, + "herta_hsr": null, + "the_herta": null, + "firefly_hsr": null, + "tingyun_hsr": null, + "yunli_hsr": null, + "tribbie_hsr": null, + "huohuo_hsr": null, + "castorice_hsr": null, + "seele_hi3": null, + "herrscher_ego_hi3": null, + "miyabi_zzz": null, + "nene_kusanagi": null, + "miko_sakura": null, + "subaru_oozora": null, + "moona_hoshinova": null, + "risu_ayunda": null, + "reine_pavolia": null, + "zeta_vestia": null, + "anya_melfissa": null, + "luna_himemori": null, + "boothill": null, + "shiroko_rosmontis": null, + "rice_shower": null, + 
"suwawa": "211 epochs · 40k" +} \ No newline at end of file diff --git a/assets/weights/characters/mari/metadata.json b/assets/weights/characters/mari/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..8164dc33207fd70b5bace0543ab3f0205865bf11 --- /dev/null +++ b/assets/weights/characters/mari/metadata.json @@ -0,0 +1,177 @@ +{ + "title": "Mari Ohara (Love Live! Sunshine!!) - RVC V2 (400 Epoch)", + "author": { + "name": "natehitsvtec", + "discordUserId": "641787156863385610" + }, + "md5": "ff4bd8e126ab82a8145793f02db0cfee", + "uploadedAt": "2023-06-11T01:22:26.820Z", + "weightsLink": "https://www.weights.gg/models/clm72zu941aj1cctco2oj26p1", + "id": "clm72zu941aj1cctco2oj26p1", + "type": "v2", + "tags": [ + "Anime Character", + "Fictional Character", + "Artist", + "RVC v2" + ], + "description": "400 Epoch\nhttps://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view?usp=sharing\n\nTested languages: Japanese, Korean, English\n\nNew version trained up to 400 epoch, revised the dataset to 25 songs down from 33, but made it much clearer and removed excessive silence and other parts with echo or reverb. I did not use SIFAS audio for my dataset because it ended up being too high pitched, I believe she speaks with a lot of exaggeration in the game. 
Unsure if I will train it more but I probably will eventually.\n\nPrevious 200 Epoch (old dataset)\nhttps://drive.google.com/file/d/1uBmoKluknMOWKivjfHcbu7YkOMtjoclH/view?usp=sharing", + "samples": [], + "files": [ + { + "name": "model.index", + "size": 812116059, + "md5": "be135619c9fc2e04d0d9bf384f9eef11" + }, + { + "name": "model.pth", + "size": 55227869, + "md5": "ff4bd8e126ab82a8145793f02db0cfee" + } + ], + "torchMetadata": { + "config": { + "spec_channels": 1025, + "segment_size": 32, + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "upsample_rates": [ + 10, + 10, + 2, + 2 + ], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 16, + 16, + 4, + 4 + ], + "emb_channels": null, + "spk_embed_dim": 109, + "gin_channels": 256, + "sr": 40000 + }, + "f0": 1, + "version": "v2", + "extra_info": { + "config": [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [ + 3, + 7, + 11 + ], + [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + [ + 10, + 10, + 2, + 2 + ], + 512, + [ + 16, + 16, + 4, + 4 + ], + 109, + 256, + 40000 + ], + "info": "400epoch", + "sr": "40k", + "f0": 1, + "version": "v2" + } + }, + "url": "https://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view", + "urls": [ + "https://huggingface.co/realmongx2/m0delz/resolve/main/mariohara_e400.zip?download=true", + "https://drive.google.com/file/d/1x5aZjlYea_HfaHNHnZCpqaKmU4N5J-1X/view?usp=sharing", + "https://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view?usp=sharing", + "https://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view?usp=drive_link", + "https://drive.google.com/file/d/1rJ3a4NJ-3MQ8Vv240-hhaP2bO1irbA9h/view?usp=sharing", + 
"https://drive.google.com/file/d/1rJ3a4NJ-3MQ8Vv240-hhaP2bO1irbA9h/view?usp=drive_link", + "https://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view?usp=drivesdk", + "https://drive.google.com/file/d/1Pklo3gYLRM8_wG0XZvwf_lJJnsDgZhry/view?usp=drive_link", + "https://drive.google.com/file/d/1tKgz422yovSPvuKHDuSETNYRQNq2wIoT/view?usp=sharing", + "https://drive.google.com/file/d/1AVYkFKf3SLkUhQqiJYWiqluc7KAnR2cC/view?usp=sharing", + "https://drive.google.com/file/d/1AVYkFKf3SLkUhQqiJYWiqluc7KAnR2cC/view?usp=drive_link", + "https://drive.google.com/file/d/1YLJ2GXYvbGQ2mCyE5KjF2z_6EAI8gE2E/view?usp=sharing", + "https://drive.google.com/file/d/1uBmoKluknMOWKivjfHcbu7YkOMtjoclH/view?usp=sharing", + "https://drive.google.com/file/d/1YLJ2GXYvbGQ2mCyE5KjF2z_6EAI8gE2E/view?usp=drive_link", + "https://drive.google.com/file/d/1hRcWKrDz1gEbEbmSjyVzthsJG0dIbJi8/view?usp=sharing", + "https://drive.google.com/file/d/1hRcWKrDz1gEbEbmSjyVzthsJG0dIbJi8/view?usp=drive_link", + "https://drive.google.com/file/d/1ou8SVFKN0ifqljKgufOAqZdjq7qvSk2e/view?usp=drive_link", + "https://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view", + "https://drive.google.com/file/d/1uBmoKluknMOWKivjfHcbu7YkOMtjoclH/view", + "https://drive.google.com/file/d/1YLJ2GXYvbGQ2mCyE5KjF2z_6EAI8gE2E/view", + "https://drive.google.com/file/d/1hRcWKrDz1gEbEbmSjyVzthsJG0dIbJi8/view", + "https://drive.google.com/file/d/1ou8SVFKN0ifqljKgufOAqZdjq7qvSk2e/view", + "https://drive.google.com/uc?id=1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk", + "https://drive.google.com/file/d/1rJ3a4NJ-3MQ8Vv240-hhaP2bO1irbA9h/view", + "https://drive.google.com/file/d/1Pklo3gYLRM8_wG0XZvwf_lJJnsDgZhry/view", + "https://drive.google.com/file/d/1tKgz422yovSPvuKHDuSETNYRQNq2wIoT/view", + "https://drive.google.com/file/d/1AVYkFKf3SLkUhQqiJYWiqluc7KAnR2cC/view" + ], + "originalFileList": [ + "mariohara_e400/added_IVF6591_Flat_nprobe_1_v2.index", + "mariohara_e400/mariohara_v3_e400.pth" + ] +} \ No newline at end of file diff --git 
a/assets/weights/characters/tokai_teio/metadata.json b/assets/weights/characters/tokai_teio/metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..fd546b7a8420bcb0940804d82ac60bd1d8e5770b --- /dev/null +++ b/assets/weights/characters/tokai_teio/metadata.json @@ -0,0 +1,147 @@ +{ + "title": "Tokai Teio (Uma Musume) (RVC V2) (570 epochs)", + "author": { + "name": "realhijack", + "discordUserId": "903192512234127360" + }, + "md5": "27ac7aee0860b447ed98553b84de43a7", + "uploadedAt": "2023-07-21T11:01:53.209Z", + "weightsLink": "https://www.weights.gg/models/clm72thvr0rf8cctcy1jn4no4", + "id": "clm72thvr0rf8cctcy1jn4no4", + "type": "v2", + "tags": [ + "Anime Character", + "Fictional Character", + "RVC v2" + ], + "description": "https://huggingface.co/SANSSWEEP/rvc/resolve/main/TOKAITEIO.zip\nMADE BY <@299907871640911872> !!!\ncomm me", + "samples": [], + "files": [ + { + "name": "model.index", + "size": 557899019, + "md5": "6bc29a4dc19d5f44cec8ee4cfa59f6b7" + }, + { + "name": "model.pth", + "size": 57581999, + "md5": "27ac7aee0860b447ed98553b84de43a7" + } + ], + "torchMetadata": { + "config": { + "spec_channels": 1025, + "segment_size": 32, + "inter_channels": 192, + "hidden_channels": 192, + "filter_channels": 768, + "n_heads": 2, + "n_layers": 6, + "kernel_size": 3, + "p_dropout": 0, + "resblock": "1", + "resblock_kernel_sizes": [ + 3, + 7, + 11 + ], + "resblock_dilation_sizes": [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + "upsample_rates": [ + 12, + 10, + 2, + 2 + ], + "upsample_initial_channel": 512, + "upsample_kernel_sizes": [ + 24, + 20, + 4, + 4 + ], + "emb_channels": null, + "spk_embed_dim": 109, + "gin_channels": 256, + "sr": 48000 + }, + "f0": 1, + "version": "v2", + "extra_info": { + "config": [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [ + 3, + 7, + 11 + ], + [ + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ], + [ + 1, + 3, + 5 + ] + ], + [ + 12, + 10, + 2, + 2 + ], + 512, + [ + 
24, + 20, + 4, + 4 + ], + 109, + 256, + 48000 + ], + "info": "570epoch", + "sr": "48k", + "f0": 1, + "version": "v2" + } + }, + "url": "https://huggingface.co/SANSSWEEP/rvc/resolve/main/TOKAITEIO.zip", + "originalFileList": [ + "New folder (3)/TokaiTeio_e570_s37620.pth", + "New folder (3)/added_IVF4528_Flat_nprobe_1_TokaiTeio_v2.index" + ] +} \ No newline at end of file diff --git a/check_deecho_config.py b/check_deecho_config.py new file mode 100644 index 0000000000000000000000000000000000000000..97b9d4611dc2fa3ff236b73eafa6335f55694e87 --- /dev/null +++ b/check_deecho_config.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +回声处理配置验证脚本 + +检查当前配置是否正确启用了激进的去回声处理 +""" + +import json +import os +from pathlib import Path + +def check_deecho_models(): + """检查 DeEcho 模型是否存在""" + print("=" * 60) + print("检查 DeEcho 模型") + print("=" * 60) + + models_dir = Path("assets/uvr5_weights") + required_models = [ + "VR-DeEchoDeReverb.pth", + "VR-DeEchoAggressive.pth", + "VR-DeEchoNormal.pth", + "onnx_dereverb_By_FoxJoy" + ] + + all_found = True + for model in required_models: + model_path = models_dir / model + exists = model_path.exists() + status = "[OK]" if exists else "[NO]" + size = "" + if exists: + if model_path.is_file(): + size_mb = model_path.stat().st_size / (1024 * 1024) + size = f" ({size_mb:.1f} MB)" + else: + size = " (目录)" + + print(f"{status} {model}{size}") + if not exists: + all_found = False + + print() + if all_found: + print("[OK] All DeEcho models ready") + else: + print("[NO] Missing models, run: python tools/download_models.py") + + return all_found + +def check_config(): + """检查配置文件""" + print("\n" + "=" * 60) + print("检查配置文件") + print("=" * 60) + + config_path = Path("configs/config.json") + with open(config_path, 'r', encoding='utf-8') as f: + config = json.load(f) + + cover_config = config.get("cover", {}) + + # 检查关键配置 + checks = [ + ("VC 预处理模式", "vc_preprocess_mode", "auto", cover_config.get("vc_preprocess_mode")), + ("源约束模式", 
"source_constraint_mode", "auto", cover_config.get("source_constraint_mode")), + ("Karaoke 分离", "karaoke_separation", True, cover_config.get("karaoke_separation")), + ("索引率", "index_rate", 0.50, cover_config.get("index_rate")), + ("保护系数", "protect", 0.33, cover_config.get("protect")), + ] + + all_correct = True + for name, key, expected, actual in checks: + match = actual == expected + status = "[OK]" if match else "[NO]" + print(f"{status} {name} ({key}): {actual} {'==' if match else '!='} {expected}") + if not match: + all_correct = False + + print() + if all_correct: + print("[OK] Config optimized for aggressive deecho") + else: + print("[WARN] Some configs not optimized") + + return all_correct + +def print_recommendations(): + """打印使用建议""" + print("\n" + "=" * 60) + print("使用建议") + print("=" * 60) + + print(""" +1. 当前配置使用自动模式: + - 优先使用 UVR DeEcho 模型,缺模型时回退到算法去混响 + - DeEcho 质量好时跳过 blend 直接使用,避免混回原始回音 + - 源约束仅在去过回音的预处理下自动启用 + +2. 如果回声仍然明显,可以尝试: + - 在 UI 中调整"索引率"(降低到 0.2-0.3) + - 在 UI 中调整"保护系数"(降低到 0.2-0.25) + - 使用更高质量的输入音频 + +3. 处理流程: + 原始音频 → Karaoke 分离 → UVR DeEcho(或算法去混响) → RVC 转换 → 输出 + +4. 
测试建议: + - 选择一首有明显回声的歌曲 + - 查看日志中 DeEcho quality 指标 + - 对比处理前后的回声强度 +""") + +def main(): + print("\n回声处理配置验证\n") + + models_ok = check_deecho_models() + config_ok = check_config() + print_recommendations() + + print("\n" + "=" * 60) + if models_ok and config_ok: + print("[OK] System ready for testing") + else: + print("[WARN] Please fix issues above") + print("=" * 60 + "\n") + +if __name__ == "__main__": + main() diff --git a/configs/config.json b/configs/config.json index dc0229d50ac4c0b4776ef63fe307aec13dfbdeb5..15d1f7509505df9d3d53b71f5bb088a42bd655f7 100644 --- a/configs/config.json +++ b/configs/config.json @@ -1,4 +1,5 @@ { + "language": "zh_CN", "device": "cuda", "sample_rate": 48000, "hop_length": 160, @@ -29,8 +30,9 @@ }, "cover": { "separator": "roformer", + "roformer_model": "ensemble:vocal_rvc", "karaoke_separation": true, - "karaoke_model": "mel_band_roformer_karaoke_gabox.ckpt", + "karaoke_model": "ensemble:karaoke", "karaoke_merge_backing_into_accompaniment": true, "uvr5_model": "HP2_all_vocals", "uvr5_agg": 10, @@ -42,10 +44,10 @@ "demucs_split": true, "f0_method": "hybrid", "disable_chunking": false, - "index_rate": 0.70, + "index_rate": 0.50, "filter_radius": 3, "rms_mix_rate": 0.0, - "protect": 0.50, + "protect": 0.33, "speaker_id": 0, "hubert_layer": 12, "silence_gate": false, @@ -60,18 +62,20 @@ "f0_min": 50, "f0_max": 1100, "f0_energy_threshold_db": -50, - "f0_hybrid_mode": "off", + "f0_hybrid_mode": "fallback", "crepe_pd_threshold": 0.05, - "crepe_force_ratio": 0.0, + "crepe_force_ratio": 1.0, "crepe_replace_semitones": 3.0, + "unvoiced_feature_gate_floor": 0.28, + "breath_active_margin_db": 52.0, "f0_stabilize": false, "f0_stabilize_window": 2, "f0_stabilize_max_semitones": 6.0, "f0_stabilize_octave": true, "f0_rate_limit": false, "f0_rate_limit_semitones": 12.0, - "vc_preprocess_mode": "uvr_deecho", - "source_constraint_mode": "on", + "vc_preprocess_mode": "auto", + "source_constraint_mode": "auto", "vc_pipeline_mode": "current", 
"singing_repair": false, "reverb_reapply": true, @@ -81,10 +85,12 @@ "f0_min": 50, "f0_max": 1100, "f0_energy_threshold_db": -60, - "f0_hybrid_mode": "off", + "f0_hybrid_mode": "fallback", "crepe_pd_threshold": 0.1, - "crepe_force_ratio": 0.05, + "crepe_force_ratio": 1.0, "crepe_replace_semitones": 0.0, + "unvoiced_feature_gate_floor": 0.28, + "breath_active_margin_db": 52.0, "f0_stabilize": false, "f0_stabilize_window": 2, "f0_stabilize_max_semitones": 4.0, diff --git a/configs/presets/balanced.json b/configs/presets/balanced.json index 5b897f35f62787b18277c249a42a4017e8ace18f..a3a3997a44efb6d916ea475cb5653c3ca7481348 100644 --- a/configs/presets/balanced.json +++ b/configs/presets/balanced.json @@ -13,8 +13,8 @@ "f0_stabilize": true, "f0_stabilize_window": 3, "f0_stabilize_max_semitones": 3.0, - "vc_preprocess_mode": "uvr_deecho", - "source_constraint_mode": "on", + "vc_preprocess_mode": "auto", + "source_constraint_mode": "auto", "uvr5_agg": 10 } } diff --git a/configs/presets/clarity_priority.json b/configs/presets/clarity_priority.json index 6c27b33d9ff820d62645a0cdc608f626ce62e6a1..71779d3895d3a31fc098375f0a59dd17a6164435 100644 --- a/configs/presets/clarity_priority.json +++ b/configs/presets/clarity_priority.json @@ -13,8 +13,8 @@ "f0_stabilize": false, "f0_stabilize_window": 2, "f0_stabilize_max_semitones": 2.0, - "vc_preprocess_mode": "uvr_deecho", - "source_constraint_mode": "on", + "vc_preprocess_mode": "auto", + "source_constraint_mode": "auto", "uvr5_agg": 8 } } diff --git a/configs/presets/timbre_priority.json b/configs/presets/timbre_priority.json index be795d356523dc68d47cb1042dbca88d227ad7e8..fa6da8d860088c75ea1467cd3866553defce55c9 100644 --- a/configs/presets/timbre_priority.json +++ b/configs/presets/timbre_priority.json @@ -13,8 +13,8 @@ "f0_stabilize": true, "f0_stabilize_window": 5, "f0_stabilize_max_semitones": 4.0, - "vc_preprocess_mode": "uvr_deecho", - "source_constraint_mode": "on", + "vc_preprocess_mode": "auto", + 
"source_constraint_mode": "auto", "uvr5_agg": 12 } } diff --git "a/docs/Colab\346\274\224\347\244\272.png" "b/docs/Colab\346\274\224\347\244\272.png" new file mode 100644 index 0000000000000000000000000000000000000000..f03ff7185e99edc7bb77f76fd3936be881a02e30 --- /dev/null +++ "b/docs/Colab\346\274\224\347\244\272.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afed65f6707b2b83f4e0a30a159f3c1f3e74ed9d3b9809819ec633dfc5181338 +size 236173 diff --git "a/docs/Windows\347\225\214\351\235\242.png" "b/docs/Windows\347\225\214\351\235\242.png" new file mode 100644 index 0000000000000000000000000000000000000000..4d8fc51f2ad21a22ae24bf4e8523c0b45af91453 --- /dev/null +++ "b/docs/Windows\347\225\214\351\235\242.png" @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3f221fd5ffe4b646f8fa4e19108b09f104918db96849cf5b4a5981e8b89d352 +size 224800 diff --git a/docs/plans/2026-03-30-portable-ffmpeg-hardening.md b/docs/plans/2026-03-30-portable-ffmpeg-hardening.md new file mode 100644 index 0000000000000000000000000000000000000000..94a05e199e33b010c497858f5a3ee8b451107492 --- /dev/null +++ b/docs/plans/2026-03-30-portable-ffmpeg-hardening.md @@ -0,0 +1,74 @@ +# Portable FFmpeg Hardening Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Make portable builds self-contained with bundled ffmpeg and ensure runtime code prefers the bundled binary instead of requiring a system installation. + +**Architecture:** Add a dedicated runtime helper that resolves bundled ffmpeg paths, prepends them to `PATH`, and sets explicit environment variables. Update GitHub build workflows to copy platform ffmpeg binaries into the packaged tree, and add regression tests that lock both the runtime helper and packaging contract. 
+ +**Tech Stack:** Python, GitHub Actions, PyInstaller, pytest + +--- + +### Task 1: Add failing tests for portable ffmpeg runtime + +**Files:** +- Create: `tests/test_ffmpeg_runtime.py` + +**Step 1: Write the failing test** +- Assert a bundled ffmpeg directory is preferred over system lookup. +- Assert runtime setup prepends the bundled directory to `PATH`. + +**Step 2: Run test to verify it fails** +- Run: `pytest -q tests/test_ffmpeg_runtime.py` + +**Step 3: Write minimal implementation** +- Add the helper module and call it from `run.py`. + +**Step 4: Run test to verify it passes** +- Run: `pytest -q tests/test_ffmpeg_runtime.py` + +**Step 5: Commit** +- Commit helper + test once green. + +### Task 2: Remove remaining hard dependency on `ffprobe` + +**Files:** +- Modify: `infer/modules/uvr5/modules.py` + +**Step 1: Write the failing test** +- Assert the module no longer calls `ffmpeg.probe(..., cmd="ffprobe")`. + +**Step 2: Run test to verify it fails** +- Run: `pytest -q tests/test_ffmpeg_runtime.py` + +**Step 3: Write minimal implementation** +- Replace the probe path with a library-based metadata read that does not shell out to `ffprobe`. + +**Step 4: Run tests to verify they pass** +- Run: `pytest -q tests/test_ffmpeg_runtime.py` + +**Step 5: Commit** +- Commit metadata-path cleanup. + +### Task 3: Bundle ffmpeg in GitHub release builds + +**Files:** +- Modify: `.github/workflows/build-executables.yml` +- Modify: `AI-RVC.spec` + +**Step 1: Write the failing test** +- Assert the workflow stages copy a bundled ffmpeg directory and package it. + +**Step 2: Run test to verify it fails** +- Run: `pytest -q tests/test_ffmpeg_runtime.py` + +**Step 3: Write minimal implementation** +- Add a workflow step to copy `ffmpeg` into `tools/ffmpeg/bin`. +- Add the ffmpeg directory to packaged data inputs for both workflow and spec. 
+ +**Step 4: Run tests to verify they pass** +- Run: `pytest -q tests/test_ffmpeg_runtime.py` + +**Step 5: Commit** +- Commit workflow/spec hardening. diff --git a/docs/reports/2026-03-24-cover-quality-final-root-cause-report.md b/docs/reports/2026-03-24-cover-quality-final-root-cause-report.md new file mode 100644 index 0000000000000000000000000000000000000000..78bdad05bcdf8a24a4069d60c2af8ecb2057b72c --- /dev/null +++ b/docs/reports/2026-03-24-cover-quality-final-root-cause-report.md @@ -0,0 +1,127 @@ +# 2026-03-24 Cover Quality Final Root Cause Report + +## 结论 + +这次问题不是单一模块失效,而是三层策略在同一条链路上叠加失衡: + +1. `hybrid` 在官方 VC 路径里实际被解释成激进的 `RMVPE + CREPE` 混合。 +2. 源约束在活跃唱段对高能量回声压制不足。 +3. 后处理又把低能量无声段、呼吸段、换气段压得过重。 + +最终结果就是: + +- `SILENT PAIN` 这类回声更重、主唱和和声交叠更强的输入,会在活跃唱段残留明显回声。 +- 两首歌都会在呼吸、回气、擦音附近出现电音感、机械感,因为这些段落先被错误赋音高,再被后处理继续削薄。 + +## 严格根因 + +### 1. `hybrid` 路由过于激进 + +已确认官方 VC 路径里的 `hybrid` 请求会进入 `rmvpe+crepe` 方向,而不是“仅在短掉线时保守兜底”。 + +影响: + +- RMVPE 本来判为无声或无调性的段落,CREPE 仍可能给出“看起来有音高”的结果。 +- 呼吸、换气、擦音、气声尾音会被错误写入 F0,随后进入 RVC 合成器,听起来就是电音化、机械化。 + +这条根因比“RMVPE 本身坏了”更准确。RMVPE 不是主问题,激进混合策略才是主问题。 + +### 2. 活跃唱段的源约束压制不够 + +之前的源约束主公式本质上更偏向“低活动段清理”,对活跃唱段内的高能量回声缺少持续压力。 + +影响: + +- 低能量尾部回声能被抑制。 +- 但高音量、贴着主唱本体的回声伪影更容易穿过去。 + +这也解释了为什么 `SILENT PAIN` 更差:它不是单纯“DeEcho 没工作”,而是输入本身更容易在活跃唱段把残留回声带进 VC,再被较宽松的后续预算放过去。 + +### 3. 后处理对呼吸段过度清理 + +后处理链路里存在两次对低能量段的再压制: + +- silence gate / unvoiced cleanup +- source gap suppression + +原策略过于激进时,会把真正的呼吸、回气、气声辅音也看成“应被抑制的无声残留”。 + +影响: + +- 如果前面 F0 已经有误判,这里会把真实呼吸进一步压没,只剩下 VC 合成残留。 +- 听感上就会更像电音噪点,而不是自然的呼吸。 + +## 两份旧诊断中需要收紧的地方 + +以下判断方向是对的,但需要更严格表述: + +- “高音量回声去不掉”并不等于 UVR DeEcho 完全失效。 + 实际上,DeEcho 对低能量尾部仍有效,主要问题在于后续源约束对活跃唱段压力不足。 + +- “呼吸电音”也不应归结为单一的 silence gate。 + 更准确的说法是:激进 F0 混合先污染了呼吸段,后处理再把真实气声继续削弱,二者叠加后才变成明显机械感。 + +## 已实施修复 + +### A. 
将 `hybrid` 改为保守策略 + +已新增统一策略层 `infer/quality_policy.py`,并接入: + +- `infer/official_adapter.py` +- `infer/f0_extractor.py` +- `infer/cover_pipeline.py` + +修复后: + +- 官方 VC 请求 `hybrid` 时,不再直接走激进混合。 +- 主转换统一路由到 `RMVPE`。 +- 只有在短、局部、可信、且处于有声上下文的掉线帧上,才允许 CREPE 参与修补。 +- 后处理门控对 `hybrid` 也统一按 `RMVPE` 语义执行,避免前后语义不一致。 + +### B. 强化活跃唱段的源约束 + +已把源约束从“只对低活动段强约束”改成“在活跃且疑似回声重叠的帧上,也保留有效替换压力”。 + +同时收紧了后续预算: + +- 活跃段不再允许旧逻辑那样过宽的能量放行。 +- 全局活动 RMS 也做了更对称、更保守的重新裁剪。 + +目标不是把人声压瘪,而是减少活跃唱段里本不该存在的额外回声包络。 + +### C. 放松对呼吸段的误杀 + +已降低后处理对低能量段的攻击性: + +- `source gap suppression` 触发更谨慎。 +- 衰减深度减轻。 +- `uvr_deecho` 路由下的二次 silence gate 更保守,且加入保护系数。 + +目标是保留自然呼吸、回气和轻辅音,不再把它们当成必须清掉的伪影。 + +## 预期效果 + +修复后预期是: + +- `SILENT PAIN` 这类高回声输入,在主唱活跃区的残留回声会明显减轻,但不会靠极端静音门去“硬切”。 +- `Far far away` 这类本来底子更好的输入,呼吸和回气会更自然,机械感应下降。 +- 如果模型本身训练音色或索引质量有限,仍可能保留一部分合成质感;这不是本次根因链的核心部分。 + +## 已完成验证 + +已完成本地轻量验证: + +- `python -m unittest tests.test_quality_policy` +- `python -m py_compile infer\\quality_policy.py infer\\f0_extractor.py infer\\official_adapter.py infer\\cover_pipeline.py tests\\test_quality_policy.py` + +结果: + +- 单测通过。 +- 语法编译通过。 + +## 尚未完成的验证 + +尚未在当前 shell 环境执行重型端到端音频回归,因为当前会话未发现可直接复用的项目虚拟环境解释器。 + +因此,本报告对“代码修复已落地”是确定的; +对“具体两首歌的最终听感改善幅度”仍建议在用户实际运行环境里再做一次 A/B 回归确认。 diff --git a/docs/superpowers/plans/2026-05-17-v1.2.1-language-release.md b/docs/superpowers/plans/2026-05-17-v1.2.1-language-release.md new file mode 100644 index 0000000000000000000000000000000000000000..051699f4301b5284de200df09f10d29b41752df8 --- /dev/null +++ b/docs/superpowers/plans/2026-05-17-v1.2.1-language-release.md @@ -0,0 +1,90 @@ +# v1.2.1 Language Release Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a Chinese/English UI language selector and publish v1.2.1 from current local `master`. 
+ +**Architecture:** Reuse the existing `i18n/*.json` pattern and `ui.app:t()` lookup. Locale is read from `configs/config.json` during startup and saved through a Gradio dropdown with an explicit restart-required status. + +**Tech Stack:** Python 3.10, Gradio 3.50.2, unittest/pytest-compatible tests, GitHub CLI, Hugging Face CLI. + +--- + +### Task 1: Locale Configuration Tests + +**Files:** +- Create: `tests/test_ui_language.py` +- Modify: `ui/app.py` +- Create: `i18n/en_US.json` + +- [ ] **Step 1: Write the failing tests** + +Create tests that import `ui.app`, assert default locale selection, reject unsupported locales, save language to a temp config file, and require English translation keys. + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `python -m pytest tests/test_ui_language.py -q` +Expected: fail because helper functions and English pack are not implemented. + +- [ ] **Step 3: Implement minimal locale support** + +Add `SUPPORTED_LANGUAGES`, config-driven `get_configured_language`, `save_language_setting`, and load `i18n` from the configured language. Add `i18n/en_US.json`. + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `python -m pytest tests/test_ui_language.py -q` +Expected: pass. + +### Task 2: Gradio UI Selector + +**Files:** +- Modify: `ui/app.py` +- Modify: `i18n/zh_CN.json` +- Modify: `i18n/en_US.json` + +- [ ] **Step 1: Add selector labels to tests** + +Extend `tests/test_ui_language.py` to assert both language packs include `language`, `save_language`, and `language_saved_restart` settings keys. + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `python -m pytest tests/test_ui_language.py -q` +Expected: fail until translation keys exist. + +- [ ] **Step 3: Add selector UI** + +Place a compact language dropdown near the top and a matching Settings control. Saving updates config and returns restart-required text. 
+ +- [ ] **Step 4: Run tests and import check** + +Run: `python -m pytest tests/test_ui_language.py tests/test_quality_policy.py::SourceRegressionTests::test_ui_only_exposes_strict_sota_vc_preprocess_modes -q` +Run: `python -c "import ui.app; print(ui.app.get_configured_language())"` +Expected: both commands exit 0. + +### Task 3: Release + +**Files:** +- Git metadata and remote release state. + +- [ ] **Step 1: Inspect diff and status** + +Run: `git status -sb` and `git diff --stat`. + +- [ ] **Step 2: Commit language changes** + +Run: `git add docs/superpowers/specs/2026-05-17-v1.2.1-language-release-design.md docs/superpowers/plans/2026-05-17-v1.2.1-language-release.md tests/test_ui_language.py i18n/en_US.json i18n/zh_CN.json ui/app.py configs/config.json` +Run: `git commit -m "ui: add language switcher"` + +- [ ] **Step 3: Push and release** + +Run: `git push origin master` +Run: `gh release create v1.2.1 --target master --title "AI-RVC v1.2.1" --notes-file <notes-file>` + +- [ ] **Step 4: Sync Hugging Face Space** + +Run: `hf upload mason369/AI-RVC . --repo-type space --commit-message "Release v1.2.1" --exclude ...` + +- [ ] **Step 5: Check package workflow** + +Run: `gh run list --workflow "Build Executables" --limit 5` +Wait for the v1.2.1 workflow and report release assets status. diff --git a/docs/superpowers/specs/2026-05-17-v1.2.1-language-release-design.md b/docs/superpowers/specs/2026-05-17-v1.2.1-language-release-design.md new file mode 100644 index 0000000000000000000000000000000000000000..b89e26dc9e592f21bb5c463b632bec7edde4c8ef --- /dev/null +++ b/docs/superpowers/specs/2026-05-17-v1.2.1-language-release-design.md @@ -0,0 +1,21 @@ +# v1.2.1 Language Release Design + +## Goal + +Release AI-RVC v1.2.1 from the current local `master` branch, adding a visible Chinese/English language selector and publishing matching GitHub and Hugging Face Space versions. + +## Approach + +The UI already centralizes most labels through `i18n/zh_CN.json` and `ui/app.py:t()`. 
Add `i18n/en_US.json`, store the selected locale in `configs/config.json`, and load that locale at startup. Because Gradio 3 builds component labels statically, changing the selector saves the preference and shows an explicit restart-required message instead of pretending to hot-swap all labels. + +## UI Behavior + +Add a language selector near the top of the app and in Settings. Choices are `中文` and `English`. Saving writes `language` to `configs/config.json`; unsupported locale values fail explicitly and do not silently fall back. + +## Release Behavior + +Publish from the current local `master`: run targeted tests, commit language changes, push `master`, create GitHub Release/tag `v1.2.1`, let the existing `Build Executables` workflow build portable packages, and sync the same source to Hugging Face Space `mason369/AI-RVC`. + +## Verification + +Use test-first coverage for locale config loading, language option validation, English pack existence, and save behavior. After implementation, run the targeted tests plus a `ui.app` import check before committing and publishing. diff --git a/i18n/en_US.json b/i18n/en_US.json new file mode 100644 index 0000000000000000000000000000000000000000..8e2ceab3e46a89ffd4f7faf73d1cc4700a6fec73 --- /dev/null +++ b/i18n/en_US.json @@ -0,0 +1,165 @@ +{ + "app_title": "RVC AI Cover", + "app_description": "AI song cover system based on RVC v2", + "tabs": { + "models": "Model Management", + "cover": "Song Cover", + "settings": "Settings" + }, + "ui": { + "cover_usage": "**One-click AI Cover**: upload a song → separate vocals → convert the voice → mix the accompaniment → export the cover\n\n**Steps:**\n1. Download a character model first (open \"Download Character Model\" below)\n2. Upload a song file (MP3/WAV/FLAC)\n3. Choose a downloaded character\n4. Adjust parameters and click \"Start Cover\"\n\n> ⚠️ First launch downloads the Mel-Band Roformer vocal separation model automatically (about 200 MB). 
Please wait patiently.", + "series_filter": "Series / Category", + "keyword_search": "Keyword Search", + "keyword_placeholder": "Character or series name", + "character_choice_info": "The list shows version tags, character ownership, and model source. In versions, epochs means training epochs, while 40k/48k means sample rate.", + "download_character_info": "The list shows version tags, character ownership, and source repositories so you can distinguish different models for the same character.", + "refresh_models": "Refresh Model List", + "download_selected_character": "Download Selected Character", + "download_series_all": "Download This Category", + "download_all_characters": "Download All Character Models", + "download_status": "Download Status", + "model_name": "Model Name", + "model_path": "Model Path", + "index_path": "Index Path", + "no_models": "(No models)", + "positive_pitch_info": "Positive raises pitch, negative lowers it", + "normal_volume_info": "100% keeps original volume", + "reverb_info": "Adds reverb to the vocals" + }, + "conversion": { + "model_settings": "Model Settings", + "select_model": "Select Model", + "refresh": "Refresh", + "load_model": "Load Model", + "status": "Status", + "audio_input": "Audio Input", + "upload_audio": "Upload Audio", + "conversion_params": "Conversion Parameters", + "pitch_shift": "Pitch Shift (semitones)", + "pitch_shift_info": "Positive raises pitch, negative lowers it. 
Male to female often uses +12; female to male often uses -12", + "f0_method": "F0 Extraction Method", + "f0_method_info": "rmvpe gives the highest quality (recommended)", + "index_rate": "Index Rate", + "index_rate_info": "Higher values sound closer to the training voice, but may reduce clarity", + "filter_radius": "Median Filter Radius", + "filter_radius_info": "Helps reduce breath noise", + "rms_mix_rate": "Loudness Envelope Mix", + "rms_mix_rate_info": "Closer to 1 uses more of the output loudness envelope", + "protect": "Unvoiced Protection", + "protect_info": "Protects consonants and breath sounds; 0.5 means no protection", + "start_conversion": "Start Conversion", + "conversion_result": "Conversion Result", + "conversion_status": "Conversion Status" + }, + "cover": { + "song_cover": "AI Song Cover", + "cover_description": "Upload a song, choose a character, and generate an AI cover. The pipeline separates vocals, converts the voice, and mixes the accompaniment automatically.", + "upload_song": "Upload Song", + "input_song": "Input Song (MP3/WAV/FLAC)", + "select_character": "Select Character", + "character": "Character", + "conversion_settings": "Conversion Settings", + "pitch_shift": "Pitch Shift (semitones)", + "index_rate": "Index Rate (%)", + "index_rate_info": "Recommended: 30-45%; too low sounds more synthetic, too high may blur articulation", + "speaker_id": "Speaker ID", + "speaker_id_info": "Single-character models usually use 0; try other IDs for multi-speaker models", + "mix_settings": "Mix Settings", + "mix_preset": "Mix Preset", + "mix_preset_info": "Apply a mix profile suitable for most songs", + "mix_preset_universal": "Universal (recommended)", + "mix_preset_vocal": "Vocal Focus", + "mix_preset_accompaniment": "Accompaniment Focus", + "mix_preset_live": "Live Feel (more reverb)", + "vocals_volume": "Vocal Volume (%)", + "accompaniment_volume": "Accompaniment Volume (%)", + "vocals_reverb": "Vocal Reverb (%)", + "rms_mix_rate": "Loudness 
Envelope Match (%)", + "rms_mix_rate_info": "Moderately follows the original vocal loudness contour; recommended: 10-30%", + "backing_mix": "Original Lead Vocal Blend (%)", + "backing_mix_info": "Blend a small amount of original lead vocal to restore thickness and articulation; too high leaks the source timbre", + "karaoke_separation": "Harmony Separation", + "karaoke_separation_info": "Separate lead and backing vocals, convert only the lead, and preserve original harmonies. Useful for songs with backing vocals", + "karaoke_merge_backing": "Blend Backing Vocals Into Accompaniment", + "karaoke_merge_backing_info": "Mix separated original backing vocals back into the accompaniment. Enabled by default to preserve harmony layers", + "start_cover": "Start Cover", + "progress": "Progress", + "results": "Cover Results", + "final_cover": "Final Cover", + "converted_vocals": "Converted Vocals", + "original_vocals": "Original Vocals (Separated)", + "lead_vocals": "Lead Vocals", + "backing_vocals": "Backing Vocals", + "accompaniment": "Accompaniment", + "download_character": "Download Character Model", + "select_to_download": "Select Character to Download", + "download": "Download", + "vc_preprocess_mode": "VC Preprocess Strategy", + "vc_preprocess_mode_info": "Strictly uses RoFormer De-Reverb SOTA; if unavailable, processing stops instead of degrading", + "vc_preprocess_auto": "Auto (recommended)", + "vc_preprocess_uvr_deecho": "Strict RoFormer De-Reverb", + "source_constraint_mode": "Source Constraint Strategy", + "source_constraint_mode_info": "Auto (recommended) enables this only for strict RoFormer De-Reverb preprocessing; if unavailable, processing stops instead of degrading", + "source_constraint_auto": "Auto (recommended)", + "source_constraint_off": "Off", + "source_constraint_on": "Always On", + "vc_preprocess_status": "Current VC Preprocess Route", + "vc_preprocess_status_info": "Shows the actual route this machine will use in auto mode", + "vc_pipeline_mode": "VC 
Pipeline Mode", + "vc_pipeline_mode_info": "Current implementation keeps this project's existing path; official mode calls the bundled official RVC directly and skips custom VC pre/post-processing", + "vc_pipeline_mode_current": "Current Implementation (recommended)", + "vc_pipeline_mode_official": "Official Implementation (1:1)", + "singing_repair": "Singing Repair", + "singing_repair_info": "For high-note dropouts, tearing, metallic artifacts, and sibilance. When enabled, this exits strict 1:1 mode and uses the official-compatible repair path (FP32 + hybrid F0 + F0 stabilization/rate limiting)" + }, + "models": { + "base_models": "Base Models", + "base_models_desc": "These models are required to run RVC and must be downloaded before first use.", + "check_status": "Check Model Status", + "download_required": "Download Required Models", + "model_status": "Model Status", + "voice_models": "Voice Models", + "voice_models_desc": "Put .pth model files into assets/weights/, then refresh the model list.", + "mature_deecho_models": "Mature DeEcho Models", + "mature_deecho_models_desc": "Current covers strictly use RoFormer De-Reverb SOTA; if unavailable, processing stops instead of degrading to older models or algorithm chains.", + "download_mature_deecho": "Download Mature DeEcho Model", + "mature_deecho_status": "Mature DeEcho Status", + "mature_deecho_check": "Check Mature DeEcho Status" + }, + "settings": { + "language_settings": "Language Settings", + "language": "Interface Language", + "save_language": "Save Language", + "language_saved_restart": "✅ Interface language saved: {language}. Please restart the app for it to take effect.", + "language_restart_info": "The language choice is saved to the config file. Because Gradio 3 builds UI labels at startup, restart the app to apply it.", + "runtime_settings": "Runtime Settings", + "compute_device": "Compute Device", + "save_settings": "Save Settings", + "settings_saved_restart": "✅ Settings saved. 
Restart the app for changes to take effect.", + "status": "Status", + "cpu_slow": "CPU (slow)", + "about_body": "**RVC AI Cover System**\n\n- Built on RVC v2 + Mel-Band Roformer\n- Uses RMVPE for high-quality F0 extraction\n- Supports CUDA GPU acceleration\n\n[GitHub](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)", + "model_sources": "### 📥 Character Model Sources\n\nThese are the Hugging Face repositories used by the built-in character model list. You can also download models manually and place them under `assets/weights/characters/<character_name>/`:\n\n**Love Live! Series**\n- [trioskosmos/rvc_models](https://huggingface.co/trioskosmos/rvc_models) — μ's / Aqours / Nijigasaki / Liella! characters\n- [Icchan/LoveLive](https://huggingface.co/Icchan/LoveLive) — Chika, Riko, Eli, You\n- [0xMifune/LoveLive](https://huggingface.co/0xMifune/LoveLive) — Nijigasaki / Liella! / Hasunosora\n- [Swordsmagus/Love-Live-RVC](https://huggingface.co/Swordsmagus/Love-Live-RVC) — Hanamaru, Setsuna, Kotori, A-RISE, and more\n- [Zurakichi/RVC](https://huggingface.co/Zurakichi/RVC) — Nico, Kanata, Setsuna, Hanamaru\n- [Phos252/RVCmodels](https://huggingface.co/Phos252/RVCmodels) — Shibuya Kanon\n- [ChocoKat/Mari_Ohara](https://huggingface.co/ChocoKat/Mari_Ohara) — Mari Ohara\n- [HarunaKasuga/YoshikoTsushima](https://huggingface.co/HarunaKasuga/YoshikoTsushima) — Yoshiko Tsushima\n- [thebuddyadrian/RVC_Models](https://huggingface.co/thebuddyadrian/RVC_Models) — Kazuno sisters\n\n**Genshin / Honkai / Zenless Zone Zero (HoYoverse)**\n- [makiligon/RVC-Models](https://huggingface.co/makiligon/RVC-Models) — Furina, Ayaka, Focalors\n- [kohaku12/RVC-MODELS](https://huggingface.co/kohaku12/RVC-MODELS) — Nahida, Herta, Firefly, Tingyun, Hoshimi Miyabi, and more\n- [jarari/RVC-v2](https://huggingface.co/jarari/RVC-v2) — Furina (Korean), Silver Wolf (Korean)\n- [mrmocciai/genshin-impact](https://huggingface.co/mrmocciai/genshin-impact) — 50+ Genshin characters (manual 
download)\n\n**VOCALOID**\n- [javinfamous/infamous_miku_v2](https://huggingface.co/javinfamous/infamous_miku_v2) — Hatsune Miku (1000 epochs)\n\n**Hololive / VTuber**\n- [megaaziib/my-rvc-models-collection](https://huggingface.co/megaaziib/my-rvc-models-collection) — Pekora, Sakura Miko, Subaru, Kobo, Kaela, and more\n- [Kit-Lemonfoot/kitlemonfoot_rvc_models](https://huggingface.co/Kit-Lemonfoot/kitlemonfoot_rvc_models) — Hololive JP/EN characters\n\n**THE IDOLM@STER / Uma Musume**\n- [trioskosmos/rvc_models](https://huggingface.co/trioskosmos/rvc_models) — Ranko Kanzaki, Riamu Yumemi\n- [makiligon/RVC-Models](https://huggingface.co/makiligon/RVC-Models) — Takane Shijou, Rice Shower\n\n**Project SEKAI**\n- [kohaku12/RVC-MODELS](https://huggingface.co/kohaku12/RVC-MODELS) — Nene Kusanagi\n\n> 💡 After manual download, put `.pth` and `.index` files under `assets/weights/characters/<character_name>/`, then refresh the list.", + "device_info": "Device Information", + "current_device": "Current Device", + "refresh_device": "Refresh Device Information", + "about": "About" + }, + "messages": { + "model_loaded": "Model loaded", + "model_not_found": "Model not found", + "load_failed": "Load failed", + "please_select_model": "Please select a model", + "please_load_model": "Please load a model first", + "please_upload_audio": "Please upload an audio file", + "conversion_complete": "Conversion complete", + "conversion_failed": "Conversion failed", + "download_complete": "Download complete", + "download_failed": "Download failed", + "cover_complete": "Cover complete", + "cover_failed": "Cover failed", + "separating_vocals": "Separating vocals...", + "converting_vocals": "Converting vocals...", + "mixing_audio": "Mixing audio..." 
+ } +} diff --git a/i18n/zh_CN.json b/i18n/zh_CN.json index 12cc95e2a6af8fcc70046409c872e030c57dcccc..6e1a6e69f41d1a198cad96248c8a9d0dbbe8ca73 100644 --- a/i18n/zh_CN.json +++ b/i18n/zh_CN.json @@ -6,6 +6,26 @@ "cover": "歌曲翻唱", "settings": "设置" }, + "ui": { + "cover_usage": "**一键 AI 翻唱**:上传歌曲 → 自动分离人声 → 转换音色 → 混合伴奏 → 输出翻唱\n\n**使用步骤:**\n1. 先下载角色模型(展开下方「下载角色模型」)\n2. 上传歌曲文件(支持 MP3/WAV/FLAC)\n3. 选择已下载的角色\n4. 调整参数后点击「开始翻唱」\n\n> ⚠️ 首次运行会自动下载 Mel-Band Roformer 人声分离模型(约 200MB),请耐心等待", + "series_filter": "作品/分类", + "keyword_search": "关键词搜索", + "keyword_placeholder": "输入角色名/作品名", + "character_choice_info": "列表会显示版本标识、角色归属和模型来源;版本里常见的 epochs 表示训练轮数,40k/48k 表示采样率。", + "download_character_info": "列表会显示版本标识、角色归属和来源仓库,便于区分同角色的不同模型。", + "refresh_models": "刷新模型列表", + "download_selected_character": "下载选中角色", + "download_series_all": "下载该分类全部", + "download_all_characters": "下载全部角色模型", + "download_status": "下载状态", + "model_name": "模型名称", + "model_path": "模型路径", + "index_path": "索引路径", + "no_models": "(无模型)", + "positive_pitch_info": "正数升调,负数降调", + "normal_volume_info": "100% 为原始音量", + "reverb_info": "为人声添加混响效果" + }, "conversion": { "model_settings": "模型设置", "select_model": "选择模型", @@ -75,13 +95,11 @@ "select_to_download": "选择要下载的角色", "download": "下载", "vc_preprocess_mode": "VC预处理策略", - "vc_preprocess_mode_info": "参考成熟项目:优先学习型 DeEcho/DeReverb,否则主唱直通 RVC", + "vc_preprocess_mode_info": "严格使用 RoFormer De-Reverb SOTA;不可用时停止,不降级", "vc_preprocess_auto": "自动(推荐)", - "vc_preprocess_direct": "主唱直通", - "vc_preprocess_uvr_deecho": "官方DeEcho优先", - "vc_preprocess_legacy": "旧版手工链", + "vc_preprocess_uvr_deecho": "严格RoFormer De-Reverb", "source_constraint_mode": "源约束策略", - "source_constraint_mode_info": "自动(推荐)下仅旧版手工链启用;学习型 DeEcho 或主唱直通默认不再追加自定义源约束", + "source_constraint_mode_info": "自动(推荐)下仅严格 RoFormer De-Reverb 预处理启用;不可用时停止,不降级", "source_constraint_auto": "自动(推荐)", "source_constraint_off": "关闭", "source_constraint_on": "始终开启", @@ -89,10 +107,10 @@ "vc_preprocess_status_info": "显示本机当前自动模式会走的实际路径", 
"vc_pipeline_mode": "VC管线模式", "vc_pipeline_mode_info": "当前实现保留本项目现有链路;官方实现会直接调用内置官方 RVC,并跳过自定义 VC 前后处理", - "vc_pipeline_mode_current": "当前实现(推荐)", - "vc_pipeline_mode_official": "官方实现(一比一)", - "singing_repair": "唱歌修复", - "singing_repair_info": "用于高音断音、撕裂、电音、齿音;开启后会退出严格一比一,改用官方兼容修复链(FP32 + 混合F0 + F0稳定/限速)" + "vc_pipeline_mode_current": "当前实现(推荐)", + "vc_pipeline_mode_official": "官方实现(一比一)", + "singing_repair": "唱歌修复", + "singing_repair_info": "用于高音断音、撕裂、电音、齿音;开启后会退出严格一比一,改用官方兼容修复链(FP32 + 混合F0 + F0稳定/限速)" }, "models": { "base_models": "基础模型", @@ -103,12 +121,25 @@ "voice_models": "语音模型", "voice_models_desc": "将 .pth 模型文件放入 assets/weights/ 目录,然后刷新模型列表。", "mature_deecho_models": "成熟 DeEcho 模型", - "mature_deecho_models_desc": "用于成熟项目常见的学习型去回声/去混响流程。未下载时,翻唱页的自动模式会回退为主唱直通 RVC。", + "mature_deecho_models_desc": "当前翻唱严格使用 RoFormer De-Reverb SOTA;不可用时停止处理,不降级到旧模型或算法链。", "download_mature_deecho": "下载成熟 DeEcho 模型", "mature_deecho_status": "成熟 DeEcho 状态", "mature_deecho_check": "检查成熟 DeEcho 状态" }, "settings": { + "language_settings": "语言设置", + "language": "界面语言", + "save_language": "保存语言", + "language_saved_restart": "✅ 已保存界面语言:{language}。请重启应用后生效。", + "language_restart_info": "切换语言会保存到配置文件;由于 Gradio 3 的界面文案在启动时生成,重启后生效。", + "runtime_settings": "运行设置", + "compute_device": "计算设备", + "save_settings": "保存设置", + "settings_saved_restart": "✅ 设置已保存,重启后生效", + "status": "状态", + "cpu_slow": "CPU (较慢)", + "about_body": "**RVC AI 翻唱系统**\n\n- 基于 RVC v2 + Mel-Band Roformer\n- 使用 RMVPE 进行高质量 F0 提取\n- 支持 CUDA GPU 加速\n\n[GitHub](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)", + "model_sources": "### 📥 角色模型来源\n\n以下是本项目角色模型的 HuggingFace 仓库来源,你也可以手动下载模型后放入 `assets/weights/characters/<角色名>/` 目录使用:\n\n**Love Live! 系列**\n- [trioskosmos/rvc_models](https://huggingface.co/trioskosmos/rvc_models) — μ's / Aqours / 虹咲 / Liella! 
多角色\n- [Icchan/LoveLive](https://huggingface.co/Icchan/LoveLive) — 千歌、梨子、绘里、曜\n- [0xMifune/LoveLive](https://huggingface.co/0xMifune/LoveLive) — 虹咲 / Liella! / 莲之空\n- [Swordsmagus/Love-Live-RVC](https://huggingface.co/Swordsmagus/Love-Live-RVC) — 花丸、雪菜、小鸟、A-RISE 等\n- [Zurakichi/RVC](https://huggingface.co/Zurakichi/RVC) — 妮可、彼方、雪菜、花丸\n- [Phos252/RVCmodels](https://huggingface.co/Phos252/RVCmodels) — 涩谷香音\n- [ChocoKat/Mari_Ohara](https://huggingface.co/ChocoKat/Mari_Ohara) — 小原鞠莉\n- [HarunaKasuga/YoshikoTsushima](https://huggingface.co/HarunaKasuga/YoshikoTsushima) — 津岛善子\n- [thebuddyadrian/RVC_Models](https://huggingface.co/thebuddyadrian/RVC_Models) — 鹿角姐妹\n\n**原神 / 崩坏 / 绝区零 (米哈游)**\n- [makiligon/RVC-Models](https://huggingface.co/makiligon/RVC-Models) — 芙宁娜、绫华、芙卡洛斯\n- [kohaku12/RVC-MODELS](https://huggingface.co/kohaku12/RVC-MODELS) — 纳西妲、黑塔、流萤、停云、星见雅 等\n- [jarari/RVC-v2](https://huggingface.co/jarari/RVC-v2) — 芙宁娜(韩语)、银狼(韩语)\n- [mrmocciai/genshin-impact](https://huggingface.co/mrmocciai/genshin-impact) — 原神 50+ 角色(需手动下载)\n\n**VOCALOID**\n- [javinfamous/infamous_miku_v2](https://huggingface.co/javinfamous/infamous_miku_v2) — 初音未来 (1000 epochs)\n\n**Hololive / VTuber**\n- [megaaziib/my-rvc-models-collection](https://huggingface.co/megaaziib/my-rvc-models-collection) — 佩克拉、樱巫女、大空昴、Kobo、Kaela 等\n- [Kit-Lemonfoot/kitlemonfoot_rvc_models](https://huggingface.co/Kit-Lemonfoot/kitlemonfoot_rvc_models) — Hololive JP/EN 多角色\n\n**偶像大师 / 赛马娘**\n- [trioskosmos/rvc_models](https://huggingface.co/trioskosmos/rvc_models) — 神崎兰子、梦见莉亚梦\n- [makiligon/RVC-Models](https://huggingface.co/makiligon/RVC-Models) — 四条贵音、米浴\n\n**Project SEKAI**\n- [kohaku12/RVC-MODELS](https://huggingface.co/kohaku12/RVC-MODELS) — 草薙宁宁\n\n> 💡 手动下载后,将 `.pth` 和 `.index` 文件放入 `assets/weights/characters/<角色名>/` 目录,刷新即可使用。", "device_info": "设备信息", "current_device": "当前设备", "refresh_device": "刷新设备信息", diff --git a/infer/__init__.py b/infer/__init__.py index 
4db79bd0416ee76bb36e7238e9e374d6596ed425..63bd1ef5f77df48769c180b039c09aa4e9ef4ccc 100644 --- a/infer/__init__.py +++ b/infer/__init__.py @@ -1,17 +1,17 @@ -# -*- coding: utf-8 -*- -""" -推理模块 -""" -from .f0_extractor import ( - F0Extractor, - get_f0_extractor, - shift_f0, - F0Method -) - -__all__ = [ - "F0Extractor", - "get_f0_extractor", - "shift_f0", - "F0Method" -] +# -*- coding: utf-8 -*- +""" +推理模块 +""" +from .f0_extractor import ( + F0Extractor, + get_f0_extractor, + shift_f0, + F0Method +) + +__all__ = [ + "F0Extractor", + "get_f0_extractor", + "shift_f0", + "F0Method" +] diff --git a/infer/advanced_dereverb.py b/infer/advanced_dereverb.py index c08143bf4108b5f9c14f342e9c00365695c0ca53..ec154d366fb38e545d66bf94f1ba0bcabf1151e3 100644 --- a/infer/advanced_dereverb.py +++ b/infer/advanced_dereverb.py @@ -213,38 +213,6 @@ def advanced_dereverb( return dry_signal.astype(np.float32), reverb_tail.astype(np.float32) - -def apply_reverb_to_converted( - converted_dry: np.ndarray, - original_reverb: np.ndarray, - mix_ratio: float = 0.8 -) -> np.ndarray: - """ - 将原始混响重新应用到转换后的干声上 - - Args: - converted_dry: 转换后的干声 - original_reverb: 原始混响尾巴 - mix_ratio: 混响混合比例 (0-1) - - Returns: - wet_signal: 带混响的转换结果 - """ - # 对齐长度 - min_len = min(len(converted_dry), len(original_reverb)) - converted_dry = converted_dry[:min_len] - original_reverb = original_reverb[:min_len] - - # 混合 - wet_signal = converted_dry + mix_ratio * original_reverb - - # 软限幅 - from lib.audio import soft_clip - wet_signal = soft_clip(wet_signal, threshold=0.9, ceiling=0.99) - - return wet_signal.astype(np.float32) - - if __name__ == "__main__": # 测试 print("Testing advanced dereverberation...") diff --git a/infer/cover_pipeline.py b/infer/cover_pipeline.py index 1784e90eaf823e83cc494070cc0219fa9e57d25c..214222ad89b8b6ab3a15e212b2caefc17be80534 100644 --- a/infer/cover_pipeline.py +++ b/infer/cover_pipeline.py @@ -5,6 +5,7 @@ import os import gc import re +import json import uuid import shutil import torch @@ 
-15,8 +16,10 @@ from typing import Optional, Callable, Dict, Tuple, List from infer.separator import ( VocalSeparator, RoformerSeparator, + RoformerDereverbSeparator, KaraokeSeparator, ROFORMER_DEFAULT_MODEL, + ROFORMER_DEREVERB_DEFAULT_MODEL, KARAOKE_DEFAULT_MODEL, check_demucs_available, check_roformer_available, @@ -29,7 +32,12 @@ from infer.official_adapter import ( convert_vocals_official, convert_vocals_official_upstream, ) -from infer.advanced_dereverb import advanced_dereverb, apply_reverb_to_converted +from infer.advanced_dereverb import advanced_dereverb +from infer.quality_policy import ( + compute_active_source_replace, + compute_source_cleanup_budget, + resolve_cover_f0_policy, +) from lib.audio import soft_clip from lib.mixer import mix_vocals_and_accompaniment from lib.logger import log @@ -77,7 +85,7 @@ class CoverPipeline: self.karaoke_separator = None self.rvc_pipeline = None self.temp_dir = Path(__file__).parent.parent / "temp" / "cover" - self._last_vc_preprocess_mode = "direct" + self._last_vc_preprocess_mode = "strict_deecho" def _get_session_dir(self, session_id: str = None) -> Path: """获取会话临时目录""" @@ -88,62 +96,47 @@ class CoverPipeline: return session_dir @staticmethod - def _get_available_uvr_deecho_model() -> Optional[str]: - """优先使用学习型 DeEcho / DeReverb,而不是手工频谱去回声。""" - root = Path(__file__).parent.parent / "assets" / "uvr5_weights" - candidates = [ - ("VR-DeEchoDeReverb", root / "VR-DeEchoDeReverb.pth"), - ("onnx_dereverb_By_FoxJoy", root / "onnx_dereverb_By_FoxJoy" / "vocals.onnx"), - ("VR-DeEchoNormal", root / "VR-DeEchoNormal.pth"), - ("VR-DeEchoAggressive", root / "VR-DeEchoAggressive.pth"), - ] - for model_name, model_path in candidates: - if model_path.exists(): - return model_name + def _get_preferred_deecho_model_label() -> Optional[str]: + """Return the highest-quality learned deecho route available to this install.""" + if check_roformer_available(): + return f"RoFormer:{ROFORMER_DEREVERB_DEFAULT_MODEL}" return None - def 
_apply_uvr_deecho_for_vc(self, vocals_path: str, session_dir: Path) -> Optional[str]: - """如果本地已有 UVR DeEcho 模型,则优先用学习型方法清理回声。""" - model_name = self._get_available_uvr_deecho_model() - if not model_name: - return None - - from infer.modules.uvr5.modules import uvr - - root = Path(__file__).parent.parent - os.environ["weight_uvr5_root"] = str(root / "assets" / "uvr5_weights") - - input_dir = session_dir / "vc_deecho_input" - vocal_dir = session_dir / "vc_deecho_vocal" - ins_dir = session_dir / "vc_deecho_ins" - input_dir.mkdir(parents=True, exist_ok=True) - vocal_dir.mkdir(parents=True, exist_ok=True) - ins_dir.mkdir(parents=True, exist_ok=True) - - input_file = input_dir / Path(vocals_path).name - shutil.copy2(vocals_path, input_file) - - log.model(f"VC预处理使用UVR DeEcho模型: {model_name}") - for _ in uvr(model_name, str(input_dir), str(vocal_dir), [], str(ins_dir), 10, "wav"): - pass + def _apply_roformer_deecho_for_vc(self, vocals_path: str, session_dir: Path) -> str: + """Use the public RoFormer dereverb/deecho model. 
Strict SOTA mode does not degrade.""" + if not check_roformer_available(): + raise RuntimeError( + "严格SOTA模式需要 audio-separator[gpu] 才能运行 RoFormer De-Reverb" + ) - candidate_files = sorted( - list(vocal_dir.glob("*.wav")) + list(ins_dir.glob("*.wav")), - key=lambda path: path.stat().st_mtime, - ) - if not candidate_files: - log.warning("UVR DeEcho produced no usable vocal output; falling back to direct lead input") - return None + deecho_dir = session_dir / "vc_roformer_deecho" + separator = RoformerDereverbSeparator(device=self.device) - selected_file = self._select_best_uvr_deecho_output(vocals_path, candidate_files) - if selected_file is None: - selected_file = candidate_files[-1] - log.audio(f"UVR DeEcho selected vocal output: {selected_file.name}") - return str(selected_file) + try: + log.model( + "VC预处理使用 RoFormer De-Reverb SOTA 模型: " + f"{ROFORMER_DEREVERB_DEFAULT_MODEL}" + ) + dry_path = separator.separate_dry(vocals_path, str(deecho_dir)) + scored = self._score_deecho_candidate(vocals_path, Path(dry_path)) + if scored is not None: + _, metrics = scored + self._deecho_metrics = metrics + log.detail( + "RoFormer De-Reverb candidate: " + f"{Path(dry_path).name}, score={metrics['score']:.2f}, " + f"sep={metrics['separation_db']:.2f}dB, corr={metrics['corr']:.3f}, " + f"ratio={metrics['active_ratio']:.3f}, " + f"reduction={metrics['reduction_ratio']:.3f}" + ) + log.audio(f"RoFormer De-Reverb selected dry vocal output: {Path(dry_path).name}") + return dry_path + finally: + separator.unload_model() @staticmethod - def _score_uvr_deecho_candidate(reference_path: str, candidate_path: Path) -> Optional[Tuple[float, Dict[str, float]]]: - """Score UVR DeEcho candidate for VC: keep direct lead, minimize quiet residuals.""" + def _score_deecho_candidate(reference_path: str, candidate_path: Path) -> Optional[Tuple[float, Dict[str, float]]]: + """Score learned DeEcho candidate for VC: keep direct lead, minimize quiet residuals.""" import librosa try: @@ -212,6 +205,10 @@ 
class CoverPipeline: separation_db = float(20.0 * np.log10((active_rms + 1e-12) / (quiet_rms + 1e-12))) active_ratio = float(active_rms / (ref_active_rms + 1e-12)) ratio_penalty = abs(float(np.log2(max(active_ratio, 1e-4)))) + active_diff_rms = float( + np.sqrt(np.mean(np.square(reference_audio[active_mask] - candidate_audio[active_mask])) + 1e-12) + ) + reduction_ratio = float(active_diff_rms / (ref_active_rms + 1e-12)) score = separation_db + 18.0 * corr - 6.0 * ratio_penalty return score, { @@ -219,37 +216,376 @@ class CoverPipeline: "separation_db": separation_db, "corr": corr, "active_ratio": active_ratio, + "reduction_ratio": reduction_ratio, } - def _select_best_uvr_deecho_output(self, reference_path: str, candidate_files: List[Path]) -> Optional[Path]: - """Pick the UVR DeEcho branch best suited for VC input.""" - best_path = None - best_score = None + @staticmethod + def _save_debug_audio_snapshot( + session_dir: Path, + audio_path: str, + label: str, + ) -> Optional[str]: + source_path = Path(audio_path) + if not source_path.exists(): + return None + suffix = source_path.suffix or ".wav" + snapshot_path = session_dir / f"{label}{suffix}" + shutil.copy2(source_path, snapshot_path) + return str(snapshot_path) - for candidate_path in candidate_files: - scored = self._score_uvr_deecho_candidate(reference_path, candidate_path) - if scored is None: - continue + @staticmethod + def _append_quality_debug_entry(session_dir: Path, entry: Dict[str, object]) -> None: + report_path = session_dir / "quality_debug.json" + payload: Dict[str, object] = {"stages": []} + if report_path.exists(): + try: + payload = json.loads(report_path.read_text(encoding="utf-8")) + except Exception: + payload = {"stages": []} + stages = payload.get("stages") + if not isinstance(stages, list): + stages = [] + stages.append(entry) + payload["stages"] = stages + report_path.write_text( + json.dumps(payload, ensure_ascii=False, indent=2), + encoding="utf-8", + ) - score, metrics = scored - 
log.detail( - "UVR DeEcho candidate: " - f"{candidate_path.name}, score={metrics['score']:.2f}, " - f"sep={metrics['separation_db']:.2f}dB, corr={metrics['corr']:.3f}, " - f"ratio={metrics['active_ratio']:.3f}" + @staticmethod + def _export_suspect_transition_clips( + session_dir: Path, + stage: str, + candidate_path: str, + reference_path: Optional[str], + suspect_times: List[float], + max_clips: int = 6, + clip_duration_sec: float = 1.2, + ) -> List[str]: + import soundfile as sf + + if not suspect_times: + return [] + + clip_dir = session_dir / "debug_clips" / stage + clip_dir.mkdir(parents=True, exist_ok=True) + + exported: List[str] = [] + + def _export_one(prefix: str, path: str, time_sec: float) -> Optional[str]: + if not path or not Path(path).exists(): + return None + audio, sr = sf.read(path, always_2d=True) + audio = np.asarray(audio, dtype=np.float32) + clip_samples = int(max(1, round(float(clip_duration_sec) * sr))) + center = int(round(float(time_sec) * sr)) + start = max(0, center - clip_samples // 2) + end = min(audio.shape[0], start + clip_samples) + start = max(0, end - clip_samples) + clip = audio[start:end] + out_path = clip_dir / f"{prefix}_{float(time_sec):07.3f}s.wav" + sf.write(str(out_path), clip, sr) + return str(out_path) + + for time_sec in suspect_times[: max(1, int(max_clips))]: + candidate_clip = _export_one("candidate", candidate_path, float(time_sec)) + reference_clip = _export_one("reference", reference_path, float(time_sec)) if reference_path else None + if candidate_clip: + exported.append(candidate_clip) + if reference_clip: + exported.append(reference_clip) + + return exported + + @staticmethod + def _analyze_quality_stage( + candidate_path: str, + reference_path: Optional[str] = None, + ) -> Dict[str, object]: + import librosa + import soundfile as sf + + def _load_mono(path: str) -> Tuple[np.ndarray, int]: + audio, sr = sf.read(path, always_2d=True) + audio = np.asarray(audio, dtype=np.float32) + mono = audio.mean(axis=1) + 
return mono.astype(np.float32), int(sr) + + def _safe_rms(values: np.ndarray, mask: np.ndarray) -> float: + values = np.asarray(values, dtype=np.float32) + mask = np.asarray(mask, dtype=bool) + if values.size == 0 or mask.size == 0 or not np.any(mask): + return 0.0 + return float(np.sqrt(np.mean(np.square(values[mask])) + 1e-12)) + + def _preemphasis_rms(audio: np.ndarray, frame_length: int, hop_length: int) -> np.ndarray: + audio = np.asarray(audio, dtype=np.float32).reshape(-1) + if audio.size == 0: + return np.zeros(0, dtype=np.float32) + residual = np.empty_like(audio) + residual[0] = audio[0] + residual[1:] = audio[1:] - 0.97 * audio[:-1] + return librosa.feature.rms( + y=residual, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + + candidate_audio, candidate_sr = _load_mono(candidate_path) + aligned_len = int(candidate_audio.size) + peak = float(np.max(np.abs(candidate_audio))) if aligned_len > 0 else 0.0 + clip_samples = int(np.sum(np.abs(candidate_audio) >= 0.995)) if aligned_len > 0 else 0 + analysis: Dict[str, object] = { + "sample_rate": int(candidate_sr), + "duration_sec": float(aligned_len / max(candidate_sr, 1)), + "global_rms": float(np.sqrt(np.mean(np.square(candidate_audio)) + 1e-12)) if aligned_len > 0 else 0.0, + "peak": peak, + "clip_samples": clip_samples, + "clip_ratio": float(clip_samples / max(aligned_len, 1)), + } + + if not reference_path: + return analysis + + reference_audio, reference_sr = _load_mono(reference_path) + if reference_sr != candidate_sr: + reference_audio = librosa.resample( + reference_audio, + orig_sr=reference_sr, + target_sr=candidate_sr, + ).astype(np.float32) + + aligned_len = min(reference_audio.size, candidate_audio.size) + if aligned_len <= 2048: + return analysis + + reference_audio = reference_audio[:aligned_len] + candidate_audio = candidate_audio[:aligned_len] + + frame_length = 2048 + hop_length = 512 + eps = 1e-8 + + ref_rms = librosa.feature.rms( + 
y=reference_audio, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + cand_rms = librosa.feature.rms( + y=candidate_audio, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + ref_hf = _preemphasis_rms(reference_audio, frame_length, hop_length) + cand_hf = _preemphasis_rms(candidate_audio, frame_length, hop_length) + frame_count = min(ref_rms.size, cand_rms.size, ref_hf.size, cand_hf.size) + if frame_count <= 4: + return analysis + + ref_rms = ref_rms[:frame_count] + cand_rms = cand_rms[:frame_count] + ref_hf = ref_hf[:frame_count] + cand_hf = cand_hf[:frame_count] + + ref_db = 20.0 * np.log10(ref_rms + eps) + ref_db_peak = float(np.percentile(ref_db, 95)) + active_mask = ref_db >= (ref_db_peak - 24.0) + quiet_mask = ref_db <= (ref_db_peak - 38.0) + midquiet_mask = (ref_db > (ref_db_peak - 38.0)) & (ref_db <= (ref_db_peak - 28.0)) + + corr_active = 0.0 + if np.sum(active_mask) > 8: + corr_val = np.corrcoef(ref_rms[active_mask], cand_rms[active_mask])[0, 1] + if np.isfinite(corr_val): + corr_active = float(np.clip(corr_val, -1.0, 1.0)) + + cand_delta = np.abs(np.diff(cand_rms)) + ref_delta = np.abs(np.diff(ref_rms)) + spike_threshold = 0.010 + 1.8 * ref_delta + spike_mask = ( + (cand_delta > spike_threshold) + & (np.maximum(cand_rms[:-1], cand_rms[1:]) > float(np.percentile(cand_rms, 60))) + ) + suspect_frames = np.where(spike_mask)[0] + suspect_times = [ + round(float(idx * hop_length / candidate_sr), 3) + for idx in suspect_frames[:12] + ] + + low_mid_energy = np.clip((ref_db_peak - 18.0 - ref_db) / 22.0, 0.0, 1.0) + not_sustained_gap = 1.0 - np.clip((ref_db_peak - 42.0 - ref_db) / 8.0, 0.0, 1.0) + breath_excess = low_mid_energy * not_sustained_gap * np.maximum( + np.clip((cand_rms - 1.15 * ref_rms) / (cand_rms + eps), 0.0, 1.0), + np.clip((cand_hf - 1.10 * ref_hf) / (cand_hf + eps), 0.0, 1.0), + ) + + analysis.update( + { + "reference_duration_sec": 
float(reference_audio.size / max(candidate_sr, 1)), + "active_rms_ratio": float(_safe_rms(cand_rms, active_mask) / (_safe_rms(ref_rms, active_mask) + 1e-12)), + "active_hf_ratio": float(_safe_rms(cand_hf, active_mask) / (_safe_rms(ref_hf, active_mask) + 1e-12)), + "quiet_rms_ratio": float(_safe_rms(cand_rms, quiet_mask) / (_safe_rms(ref_rms, quiet_mask) + 1e-12)), + "midquiet_rms_ratio": float(_safe_rms(cand_rms, midquiet_mask) / (_safe_rms(ref_rms, midquiet_mask) + 1e-12)), + "quiet_hf_ratio": float(_safe_rms(cand_hf, quiet_mask) / (_safe_rms(ref_hf, quiet_mask) + 1e-12)), + "midquiet_hf_ratio": float(_safe_rms(cand_hf, midquiet_mask) / (_safe_rms(ref_hf, midquiet_mask) + 1e-12)), + "corr_active": corr_active, + "transition_spike_ratio": float(np.percentile(cand_delta, 95) / (np.percentile(ref_delta, 95) + 1e-12)), + "transition_spike_frames": int(suspect_frames.size), + "transition_spike_times_sec": suspect_times, + "synthetic_breath_frames": int(np.sum(breath_excess > 0.20)), + } + ) + return analysis + + def _record_quality_debug( + self, + session_dir: Path, + stage: str, + candidate_path: str, + reference_path: Optional[str] = None, + snapshot_label: Optional[str] = None, + extra: Optional[Dict[str, object]] = None, + ) -> None: + try: + snapshot_path = None + if snapshot_label: + snapshot_path = self._save_debug_audio_snapshot( + session_dir=session_dir, + audio_path=candidate_path, + label=snapshot_label, + ) + analysis = self._analyze_quality_stage( + candidate_path=snapshot_path or candidate_path, + reference_path=reference_path, ) - if best_score is None or score > best_score: - best_score = score - best_path = candidate_path + entry: Dict[str, object] = { + "stage": stage, + "candidate_path": str(snapshot_path or candidate_path), + "reference_path": str(reference_path) if reference_path else None, + "preprocess_mode": self._last_vc_preprocess_mode, + "analysis": analysis, + } + if extra: + entry["extra"] = extra + suspect_times = 
analysis.get("transition_spike_times_sec", []) + if isinstance(suspect_times, list) and suspect_times: + exported_clips = self._export_suspect_transition_clips( + session_dir=session_dir, + stage=stage, + candidate_path=str(snapshot_path or candidate_path), + reference_path=reference_path, + suspect_times=[float(t) for t in suspect_times], + ) + if exported_clips: + entry["suspect_clips"] = exported_clips + self._append_quality_debug_entry(session_dir, entry) + + if reference_path: + log.detail( + f"Quality[{stage}]: " + f"active={analysis.get('active_rms_ratio', 0.0):.3f}, " + f"active_hf={analysis.get('active_hf_ratio', 0.0):.3f}, " + f"quiet={analysis.get('quiet_rms_ratio', 0.0):.3f}, " + f"midquiet={analysis.get('midquiet_rms_ratio', 0.0):.3f}, " + f"quiet_hf={analysis.get('quiet_hf_ratio', 0.0):.3f}, " + f"spike={analysis.get('transition_spike_ratio', 0.0):.3f}, " + f"breath_frames={analysis.get('synthetic_breath_frames', 0)}" + ) + suspect_times = analysis.get("transition_spike_times_sec", []) + if isinstance(suspect_times, list) and suspect_times: + times_text = ", ".join(f"{float(t):.2f}s" for t in suspect_times[:8]) + log.detail(f"Quality[{stage}] suspect transitions: {times_text}") + else: + log.detail( + f"Quality[{stage}]: " + f"peak={analysis.get('peak', 0.0):.3f}, " + f"clip_ratio={analysis.get('clip_ratio', 0.0):.6f}, " + f"rms={analysis.get('global_rms', 0.0):.4f}" + ) + except Exception as e: + log.warning(f"Quality debug capture failed at {stage}: {e}") + + @staticmethod + def _load_quality_stage_analysis( + session_dir: Path, + stage: str, + ) -> Optional[Dict[str, object]]: + report_path = session_dir / "quality_debug.json" + if not report_path.exists(): + return None + try: + with open(report_path, "r", encoding="utf-8") as f: + payload = json.load(f) + except Exception: + return None + + stages = payload.get("stages", []) + if not isinstance(stages, list): + return None + + for entry in reversed(stages): + if not isinstance(entry, dict): + 
continue + if entry.get("stage") != stage: + continue + analysis = entry.get("analysis") + if isinstance(analysis, dict): + return analysis + return None + + def _maybe_log_diagnostic_hint( + self, + session_dir: Path, + model_path: str, + index_path: Optional[str], + ) -> None: + final_analysis = self._load_quality_stage_analysis(session_dir, "vc_final_state") + raw_analysis = self._load_quality_stage_analysis(session_dir, "vc_raw") + analysis = final_analysis or raw_analysis + if not analysis: + return + + quiet_rms = float(analysis.get("quiet_rms_ratio", 0.0) or 0.0) + quiet_hf = float(analysis.get("quiet_hf_ratio", 0.0) or 0.0) + spike = float(analysis.get("transition_spike_ratio", 0.0) or 0.0) + breath_frames = int(analysis.get("synthetic_breath_frames", 0) or 0) + should_hint = ( + quiet_rms >= 3.0 + or quiet_hf >= 2.5 + or spike >= 1.6 + or breath_frames >= 350 + ) + if not should_hint: + return + + command = ( + f'python tools/diagnose_vc_session.py --session-dir "{session_dir}" ' + f'--model-path "{model_path}"' + ) + if index_path: + command += f' --index-path "{index_path}"' - return best_path + log.warning( + "Persistent VC artifacts detected after final cleanup; " + "run the diagnostic matrix to compare short A/B clips." 
+ ) + log.warning( + "Diagnostic summary: " + f"quiet={quiet_rms:.3f}, quiet_hf={quiet_hf:.3f}, " + f"spike={spike:.3f}, breath_frames={breath_frames}" + ) + log.detail(f"Diagnostic command: {command}") def _init_separator( self, model_name: str = "htdemucs", shifts: int = 2, overlap: float = 0.25, - split: bool = True + split: bool = True, + roformer_model: str = ROFORMER_DEFAULT_MODEL, ): """初始化人声分离器 (Demucs 或 Roformer)""" # Roformer 模式 @@ -261,12 +597,16 @@ class CoverPipeline: if ( self.separator is not None and isinstance(self.separator, RoformerSeparator) + and getattr(self.separator, "model_filename", None) == roformer_model ): return if self.separator is not None: self.separator.unload_model() self.separator = None - self.separator = RoformerSeparator(device=self.device) + self.separator = RoformerSeparator( + model_filename=roformer_model, + device=self.device, + ) return # Demucs 模式 @@ -275,10 +615,7 @@ class CoverPipeline: available = {m["name"] for m in get_available_models() if m["name"] != "roformer"} if model_name not in available: - log.warning( - f"未知的 Demucs 模型 '{model_name}',回退到 'htdemucs'" - ) - model_name = "htdemucs" + raise ValueError(f"未知的 Demucs 模型 '{model_name}'") if ( self.separator is not None @@ -310,7 +647,7 @@ class CoverPipeline: if ( self.karaoke_separator is not None and isinstance(self.karaoke_separator, KaraokeSeparator) - and model_name in getattr(self.karaoke_separator, "model_candidates", []) + and getattr(self.karaoke_separator, "model_candidates", [None])[0] == model_name ): return @@ -528,25 +865,36 @@ class CoverPipeline: eps = 1e-8 orig_rms_db = 20.0 * np.log10(orig_rms + eps) + derev_rms_db = 20.0 * np.log10(derev_rms + eps) ref_db = float(np.percentile(orig_rms_db, 95)) + derev_ref_db = float(np.percentile(derev_rms_db, 95)) attenuation_ratio = derev_rms / (orig_rms + eps) - vocal_activity = np.clip((orig_rms_db - (ref_db - 30.0)) / 18.0, 0.0, 1.0) - hold_frames = max(1, int(0.28 * sr / hop_length)) - vocal_activity = 
CoverPipeline._hold_activity_curve(vocal_activity, hold_frames) + direct_activity = np.clip((derev_rms_db - (derev_ref_db - 30.0)) / 18.0, 0.0, 1.0) + direct_activity = CoverPipeline._hold_activity_curve( + direct_activity, + max(1, int(0.20 * sr / hop_length)), + ) - # Mark frames: original is quiet (echo tail) AND dereverb removed a lot - quiet_mask = ( - (orig_rms_db < (ref_db - 40.0)) - & (attenuation_ratio < 0.25) - & (vocal_activity < 0.15) + removed_strength = np.clip((0.62 - attenuation_ratio) / 0.48, 0.0, 1.0) + direct_absence = np.clip((0.30 - direct_activity) / 0.30, 0.0, 1.0) + quiet_tail_strength = ( + np.clip(((ref_db - 36.0) - orig_rms_db) / 12.0, 0.0, 1.0) + * removed_strength + * direct_absence + ) + loud_echo_strength = ( + np.clip((orig_rms_db - (ref_db - 30.0)) / 18.0, 0.0, 1.0) + * removed_strength + * direct_absence ) + echo_strength = np.maximum(quiet_tail_strength, loud_echo_strength) # Enforce minimum duration of 100ms min_frames = max(1, int(0.1 * sr / hop_length)) # Dilate: only keep runs >= min_frames - gate = quiet_mask.astype(np.float32) + gate = (echo_strength > 0.20).astype(np.float32) # Simple run-length filter filtered = np.zeros_like(gate) run_start = 0 @@ -567,7 +915,7 @@ class CoverPipeline: # 50ms sigmoid transition transition_frames = max(1, int(0.05 * sr / hop_length)) kernel = np.ones(transition_frames, dtype=np.float32) / transition_frames - filtered = np.convolve(filtered, kernel, mode="same") + filtered = np.convolve(filtered * echo_strength, kernel, mode="same") filtered = np.clip(filtered, 0.0, 1.0) # Apply: gated frames attenuated to 0.18x,保留更多尾音避免不自然断裂 @@ -669,53 +1017,926 @@ class CoverPipeline: frame_db = 20.0 * np.log10(frame_rms + eps) ref_db = float(np.percentile(frame_db, 95)) - activity = np.clip((frame_db - (ref_db - 30.0)) / 18.0, 0.0, 1.0) - kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) - kernel /= np.sum(kernel) - activity = np.convolve(activity, kernel, mode="same") - activity = 
CoverPipeline._hold_activity_curve( - activity, - max(1, int(0.24 * sr / hop_length)), + activity = np.clip((frame_db - (ref_db - 30.0)) / 18.0, 0.0, 1.0) + kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) + kernel /= np.sum(kernel) + activity = np.convolve(activity, kernel, mode="same") + activity = CoverPipeline._hold_activity_curve( + activity, + max(1, int(0.24 * sr / hop_length)), + ) + frame_weights = np.clip(activity * activity, 0.0, 1.0) + return CoverPipeline._frame_curve_to_sample_gain( + frame_weights, + len(reference_audio), + hop_length, + ) + + @staticmethod + def _weighted_rms(audio: np.ndarray, weights: np.ndarray) -> float: + """Compute RMS under sample-domain weights.""" + audio = np.asarray(audio, dtype=np.float32).reshape(-1) + weights = np.asarray(weights, dtype=np.float32).reshape(-1) + if audio.size == 0 or weights.size == 0: + return 0.0 + + aligned_len = min(audio.size, weights.size) + if aligned_len <= 0: + return 0.0 + + audio = audio[:aligned_len] + weights = np.clip(weights[:aligned_len], 0.0, 1.0) + total = float(np.sum(weights)) + if total <= 1e-6: + return 0.0 + return float(np.sqrt(np.sum((audio * audio) * weights) / total + 1e-12)) + + def _apply_source_gap_suppression( + self, + source_vocals_path: str, + converted_vocals_path: str, + ) -> None: + """Suppress hallucinated noise in sustained no-vocal gaps only.""" + import librosa + import soundfile as sf + + source_audio, source_sr = librosa.load(source_vocals_path, sr=None, mono=True) + converted_audio, converted_sr = sf.read(converted_vocals_path) + if converted_audio.ndim > 1: + converted_audio = converted_audio.mean(axis=1) + source_audio = np.asarray(source_audio, dtype=np.float32) + converted_audio = np.asarray(converted_audio, dtype=np.float32) + + if source_sr != converted_sr: + source_audio = librosa.resample( + source_audio, + orig_sr=source_sr, + target_sr=converted_sr, + ).astype(np.float32) + + aligned_len = min(len(source_audio), len(converted_audio)) + if 
aligned_len <= 0: + return + + source_audio = source_audio[:aligned_len] + converted_main = converted_audio[:aligned_len] + gain, gated_frames, total_frames = self._compute_quiet_gap_sample_gain( + source_audio, + converted_sr, + ) + gain = np.clip(gain[:aligned_len], 0.0, 1.0).astype(np.float32) + suppressed = converted_main * gain + + attenuated_samples = int(np.sum(gain < 0.50)) + if attenuated_samples > 0: + log.detail( + f"Source gap suppression: attenuated {attenuated_samples}/{aligned_len} samples in no-vocal regions" + ) + if gated_frames > 0: + log.detail( + f"Source gap suppression: detected {gated_frames}/{total_frames} sustained quiet frames" + ) + + if len(converted_audio) > aligned_len: + tail = converted_audio[aligned_len:] * 0.0 + converted_audio = np.concatenate([suppressed, tail.astype(np.float32)]) + else: + converted_audio = suppressed + + sf.write(converted_vocals_path, converted_audio.astype(np.float32), converted_sr) + + def _apply_source_breath_cleanup( + self, + source_vocals_path: str, + converted_vocals_path: str, + ) -> None: + """Blend dry source breaths back where converted low-energy regions sound too synthetic.""" + import librosa + import soundfile as sf + + def _preemphasis_rms(audio: np.ndarray, frame_length: int, hop_length: int) -> np.ndarray: + audio = np.asarray(audio, dtype=np.float32).reshape(-1) + if audio.size == 0: + return np.zeros(0, dtype=np.float32) + residual = np.empty_like(audio) + residual[0] = audio[0] + residual[1:] = audio[1:] - 0.97 * audio[:-1] + return librosa.feature.rms( + y=residual, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0] + + def _spectral_flatness(audio: np.ndarray, frame_length: int, hop_length: int) -> np.ndarray: + audio = np.asarray(audio, dtype=np.float32).reshape(-1) + if audio.size == 0: + return np.zeros(0, dtype=np.float32) + return librosa.feature.spectral_flatness( + y=audio + 1e-8, + n_fft=frame_length, + hop_length=hop_length, + center=True, + 
)[0].astype(np.float32) + + source_audio, source_sr = librosa.load(source_vocals_path, sr=None, mono=True) + converted_audio, converted_sr = sf.read(converted_vocals_path) + if converted_audio.ndim > 1: + converted_audio = converted_audio.mean(axis=1) + source_audio = np.asarray(source_audio, dtype=np.float32) + converted_audio = np.asarray(converted_audio, dtype=np.float32) + + if source_sr != converted_sr: + source_audio = librosa.resample( + source_audio, + orig_sr=source_sr, + target_sr=converted_sr, + ).astype(np.float32) + + aligned_len = min(len(source_audio), len(converted_audio)) + if aligned_len <= 0: + return + + source_main = source_audio[:aligned_len] + converted_main = converted_audio[:aligned_len] + + frame_length = 2048 + hop_length = 512 + eps = 1e-8 + + source_rms = librosa.feature.rms( + y=source_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0] + converted_rms = librosa.feature.rms( + y=converted_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0] + source_hf = _preemphasis_rms(source_main, frame_length, hop_length) + converted_hf = _preemphasis_rms(converted_main, frame_length, hop_length) + source_flatness = _spectral_flatness(source_main, frame_length, hop_length) + converted_flatness = _spectral_flatness(converted_main, frame_length, hop_length) + + frame_count = min( + source_rms.size, + converted_rms.size, + source_hf.size, + converted_hf.size, + source_flatness.size, + converted_flatness.size, + ) + if frame_count <= 0: + return + + source_rms = source_rms[:frame_count].astype(np.float32) + converted_rms = converted_rms[:frame_count].astype(np.float32) + source_hf = source_hf[:frame_count].astype(np.float32) + converted_hf = converted_hf[:frame_count].astype(np.float32) + source_flatness = source_flatness[:frame_count].astype(np.float32) + converted_flatness = converted_flatness[:frame_count].astype(np.float32) + + source_db = 20.0 * np.log10(source_rms + eps) + ref_db = 
float(np.percentile(source_db, 95)) + + low_mid_energy = np.clip((ref_db - 18.0 - source_db) / 22.0, 0.0, 1.0) + not_sustained_gap = 1.0 - np.clip((ref_db - 42.0 - source_db) / 8.0, 0.0, 1.0) + breath_like = ( + not_sustained_gap + * np.clip((source_flatness - 0.22) / 0.22, 0.0, 1.0) + * np.clip(source_hf / (float(np.percentile(source_hf, 72)) + eps), 0.0, 1.25) + ) + excess_rms = np.clip( + (converted_rms - 1.15 * source_rms) / (converted_rms + eps), + 0.0, + 1.0, + ) + excess_hf = np.clip( + (converted_hf - 1.10 * source_hf) / (converted_hf + eps), + 0.0, + 1.0, + ) + tonalized_breath = breath_like * np.clip( + (source_flatness - converted_flatness) / 0.18, + 0.0, + 1.0, + ) + hf_ratio = converted_hf / (source_hf + eps) + low_energy_tonal_hf = ( + np.sqrt(low_mid_energy) + * not_sustained_gap + * excess_hf + * np.clip((hf_ratio - 1.02) / 1.80, 0.0, 1.0) + ) + + blend_curve = np.maximum( + np.maximum( + low_mid_energy * not_sustained_gap * np.maximum(excess_rms, excess_hf), + 1.35 * low_energy_tonal_hf, + ), + 0.88 + * tonalized_breath + * np.maximum( + np.clip((converted_rms - 0.92 * source_rms) / (converted_rms + eps), 0.0, 1.0), + np.clip((converted_hf - 0.95 * source_hf) / (converted_hf + eps), 0.0, 1.0), + ), + ) + smooth_kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) + smooth_kernel /= np.sum(smooth_kernel) + blend_curve = np.convolve(blend_curve, smooth_kernel, mode="same") + blend_curve = self._hold_activity_curve( + blend_curve, + max(1, int(0.05 * converted_sr / hop_length)), + ) + blend_curve = np.clip(blend_curve, 0.0, 1.0).astype(np.float32) + + blended_frames = int(np.sum(blend_curve > 0.20)) + if blended_frames <= 0: + return + mechanical_frames = int(np.sum(tonalized_breath > 0.20)) + + sample_blend = 0.82 * self._frame_curve_to_sample_gain( + blend_curve, + aligned_len, + hop_length, + ) + sample_blend = np.clip(sample_blend[:aligned_len], 0.0, 0.82).astype(np.float32) + blended_main = ( + converted_main * (1.0 - sample_blend) + + 
source_main * sample_blend + ).astype(np.float32) + + if len(converted_audio) > aligned_len: + converted_audio = np.concatenate([blended_main, converted_audio[aligned_len:]]) + else: + converted_audio = blended_main + + sf.write(converted_vocals_path, converted_audio.astype(np.float32), converted_sr) + log.detail( + "Source breath cleanup: " + f"blended {blended_frames}/{frame_count} low-mid-energy frames toward dry source " + f"(mechanical={mechanical_frames})" + ) + + def _apply_source_transition_cleanup( + self, + source_vocals_path: str, + converted_vocals_path: str, + ) -> None: + """Blend dry source into glitch-prone breaths and transition spikes.""" + import librosa + import soundfile as sf + + def _preemphasis_rms(audio: np.ndarray, frame_length: int, hop_length: int) -> np.ndarray: + audio = np.asarray(audio, dtype=np.float32).reshape(-1) + if audio.size == 0: + return np.zeros(0, dtype=np.float32) + residual = np.empty_like(audio) + residual[0] = audio[0] + residual[1:] = audio[1:] - 0.97 * audio[:-1] + return librosa.feature.rms( + y=residual, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + + source_audio, source_sr = librosa.load(source_vocals_path, sr=None, mono=True) + converted_audio, converted_sr = sf.read(converted_vocals_path) + if converted_audio.ndim > 1: + converted_audio = converted_audio.mean(axis=1) + source_audio = np.asarray(source_audio, dtype=np.float32) + converted_audio = np.asarray(converted_audio, dtype=np.float32) + + if source_sr != converted_sr: + source_audio = librosa.resample( + source_audio, + orig_sr=source_sr, + target_sr=converted_sr, + ).astype(np.float32) + + aligned_len = min(len(source_audio), len(converted_audio)) + if aligned_len <= 0: + return + + source_main = source_audio[:aligned_len] + converted_main = converted_audio[:aligned_len] + + frame_length = 2048 + hop_length = 512 + eps = 1e-8 + + source_rms = librosa.feature.rms( + y=source_main, + 
frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + converted_rms = librosa.feature.rms( + y=converted_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + source_hf = _preemphasis_rms(source_main, frame_length, hop_length) + converted_hf = _preemphasis_rms(converted_main, frame_length, hop_length) + + frame_count = min( + source_rms.size, + converted_rms.size, + source_hf.size, + converted_hf.size, + ) + if frame_count <= 4: + return + + source_rms = source_rms[:frame_count] + converted_rms = converted_rms[:frame_count] + source_hf = source_hf[:frame_count] + converted_hf = converted_hf[:frame_count] + + source_db = 20.0 * np.log10(source_rms + eps) + ref_db = float(np.percentile(source_db, 95)) + low_mid_energy = np.clip((ref_db - 18.0 - source_db) / 22.0, 0.0, 1.0) + not_sustained_gap = 1.0 - np.clip((ref_db - 42.0 - source_db) / 8.0, 0.0, 1.0) + excess_rms = np.clip( + (converted_rms - 1.10 * source_rms) / (converted_rms + eps), + 0.0, + 1.0, + ) + excess_hf = np.clip( + (converted_hf - 1.07 * source_hf) / (converted_hf + eps), + 0.0, + 1.0, + ) + + breath_curve = low_mid_energy * not_sustained_gap * np.maximum(excess_rms, excess_hf) + cand_delta = np.abs(np.diff(converted_rms, prepend=converted_rms[:1])) + ref_delta = np.abs(np.diff(source_rms, prepend=source_rms[:1])) + spike_excess = np.clip( + (cand_delta - (0.007 + 1.35 * ref_delta)) / (cand_delta + eps), + 0.0, + 1.0, + ) + activity = np.clip((source_db - (ref_db - 28.0)) / 14.0, 0.0, 1.0) + transition_curve = spike_excess * np.maximum(activity, 0.35 * low_mid_energy) + quiet_curve = np.clip((ref_db - 35.0 - source_db) / 12.0, 0.0, 1.0) * np.maximum( + excess_rms, + excess_hf, + ) + hf_ratio = converted_hf / (source_hf + eps) + tonal_hf_curve = ( + np.sqrt(low_mid_energy) + * not_sustained_gap + * excess_hf + * np.clip((hf_ratio - 1.02) / 1.80, 0.0, 1.0) + ) + overshoot_energy = np.maximum( + 
np.clip((converted_rms - 1.09 * source_rms) / (converted_rms + eps), 0.0, 1.0), + np.clip((converted_hf - 1.07 * source_hf) / (converted_hf + eps), 0.0, 1.0), + ) + overshoot_focus = np.clip( + np.maximum( + 1.15 * spike_excess, + 0.55 * low_mid_energy * np.maximum(excess_rms, excess_hf), + ), + 0.0, + 1.0, + ) + overshoot_curve = activity * overshoot_energy * overshoot_focus + + smooth_kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) + smooth_kernel /= np.sum(smooth_kernel) + for _ in range(2): + breath_curve = np.convolve(breath_curve, smooth_kernel, mode="same") + transition_curve = np.convolve(transition_curve, smooth_kernel, mode="same") + quiet_curve = np.convolve(quiet_curve, smooth_kernel, mode="same") + tonal_hf_curve = np.convolve(tonal_hf_curve, smooth_kernel, mode="same") + overshoot_curve = np.convolve(overshoot_curve, smooth_kernel, mode="same") + + breath_curve = self._hold_activity_curve( + breath_curve, + max(1, int(0.07 * converted_sr / hop_length)), + ) + transition_curve = self._hold_activity_curve( + transition_curve, + max(1, int(0.07 * converted_sr / hop_length)), + ) + quiet_curve = self._hold_activity_curve( + quiet_curve, + max(1, int(0.08 * converted_sr / hop_length)), + ) + tonal_hf_curve = self._hold_activity_curve( + tonal_hf_curve, + max(1, int(0.06 * converted_sr / hop_length)), + ) + overshoot_curve = self._hold_activity_curve( + overshoot_curve, + max(1, int(0.04 * converted_sr / hop_length)), + ) + + blend_curve = np.maximum.reduce( + [ + 0.72 * np.clip(breath_curve, 0.0, 1.0), + 0.74 * np.clip(transition_curve, 0.0, 1.0), + 0.50 * np.clip(quiet_curve, 0.0, 1.0), + 0.76 * np.clip(tonal_hf_curve, 0.0, 1.0), + 0.40 * np.clip(overshoot_curve, 0.0, 1.0), + ] + ) + blend_curve = np.clip(blend_curve, 0.0, 0.88).astype(np.float32) + + blended_frames = int(np.sum(blend_curve > 0.20)) + if blended_frames <= 0: + return + + sample_blend = self._frame_curve_to_sample_gain( + blend_curve, + aligned_len, + hop_length, + )[:aligned_len] + 
repaired_main = ( + converted_main * (1.0 - sample_blend) + + source_main * sample_blend + ).astype(np.float32) + trim_curve = np.clip( + np.maximum( + 0.72 * overshoot_curve, + 0.55 * transition_curve, + 0.50 * tonal_hf_curve, + ), + 0.0, + 1.0, + ).astype(np.float32) + trim_weights = self._frame_curve_to_sample_gain( + trim_curve, + aligned_len, + hop_length, + )[:aligned_len] + trim_weights = ( + trim_weights + * self._compute_activity_sample_weights(source_main, converted_sr)[:aligned_len] + ).astype(np.float32) + trim_weights = np.clip(trim_weights, 0.0, 1.0) + + ref_active_rms = self._weighted_rms(source_main, trim_weights) + out_active_rms = self._weighted_rms(repaired_main, trim_weights) + gain = 1.0 + if ( + float(np.sum(trim_weights)) > 1e-3 + and ref_active_rms > 1e-6 + and out_active_rms > ref_active_rms * 1.05 + ): + gain = float( + np.clip( + (1.04 * ref_active_rms) / (out_active_rms + eps), + 0.92, + 1.0, + ) + ) + if gain < 0.999: + repaired_main = self._apply_weighted_gain( + repaired_main, + trim_weights, + gain, + ).astype(np.float32) + + if len(converted_audio) > aligned_len: + converted_audio = np.concatenate([repaired_main, converted_audio[aligned_len:]]) + else: + converted_audio = repaired_main + + sf.write(converted_vocals_path, converted_audio.astype(np.float32), converted_sr) + log.detail( + "Source transition cleanup: " + f"blended {blended_frames}/{frame_count} frames " + f"(breath={int(np.sum(breath_curve > 0.20))}, " + f"spike={int(np.sum(transition_curve > 0.20))}, " + f"quiet={int(np.sum(quiet_curve > 0.20))}, " + f"overshoot={int(np.sum(overshoot_curve > 0.20))})" + ) + if gain < 0.999: + log.detail( + "Source transition cleanup: " + f"hotspot RMS trim ref={ref_active_rms:.6f}, " + f"out={out_active_rms:.6f}, gain={gain:.3f}" + ) + + def _restore_active_vocal_loudness( + self, + reference_vocals_path: str, + converted_vocals_path: str, + target_ratio: float = 0.86, + max_gain: float = 1.85, + ) -> None: + """Restore vocal body 
loudness without boosting breaths and unstable frames.""" + import librosa + import soundfile as sf + + def _preemphasis_rms(audio: np.ndarray, frame_length: int, hop_length: int) -> np.ndarray: + audio = np.asarray(audio, dtype=np.float32).reshape(-1) + if audio.size == 0: + return np.zeros(0, dtype=np.float32) + residual = np.empty_like(audio) + residual[0] = audio[0] + residual[1:] = audio[1:] - 0.97 * audio[:-1] + return librosa.feature.rms( + y=residual, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + + reference_audio, reference_sr = librosa.load(reference_vocals_path, sr=None, mono=True) + converted_audio, converted_sr = sf.read(converted_vocals_path) + if converted_audio.ndim > 1: + converted_audio = converted_audio.mean(axis=1) + reference_audio = np.asarray(reference_audio, dtype=np.float32) + converted_audio = np.asarray(converted_audio, dtype=np.float32) + + if reference_sr != converted_sr: + reference_audio = librosa.resample( + reference_audio, + orig_sr=reference_sr, + target_sr=converted_sr, + ).astype(np.float32) + + aligned_len = min(len(reference_audio), len(converted_audio)) + if aligned_len <= 0: + return + + reference_main = reference_audio[:aligned_len] + converted_main = converted_audio[:aligned_len] + + frame_length = 2048 + hop_length = 512 + eps = 1e-8 + + reference_rms = librosa.feature.rms( + y=reference_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + converted_rms = librosa.feature.rms( + y=converted_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + reference_hf = _preemphasis_rms(reference_main, frame_length, hop_length) + converted_hf = _preemphasis_rms(converted_main, frame_length, hop_length) + + frame_count = min( + reference_rms.size, + converted_rms.size, + reference_hf.size, + converted_hf.size, + ) + if frame_count <= 4: + return + + reference_rms = 
reference_rms[:frame_count] + converted_rms = converted_rms[:frame_count] + reference_hf = reference_hf[:frame_count] + converted_hf = converted_hf[:frame_count] + + reference_db = 20.0 * np.log10(reference_rms + eps) + ref_db = float(np.percentile(reference_db, 95)) + body_curve = np.square(np.clip((reference_db - (ref_db - 18.0)) / 9.0, 0.0, 1.0)) + smooth_kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) + smooth_kernel /= np.sum(smooth_kernel) + body_curve = np.convolve(body_curve, smooth_kernel, mode="same") + body_curve = self._hold_activity_curve( + body_curve, + max(1, int(0.08 * converted_sr / hop_length)), + ) + under_target = np.clip( + (0.95 * reference_rms - converted_rms) / (0.95 * reference_rms + eps), + 0.0, + 1.0, + ) + artifact_guard = 1.0 - np.maximum( + np.clip((converted_hf - 1.12 * reference_hf) / (converted_hf + eps), 0.0, 1.0), + np.clip((converted_rms - 1.10 * reference_rms) / (converted_rms + eps), 0.0, 1.0), + ) + boost_curve = np.clip(body_curve * under_target * artifact_guard, 0.0, 1.0).astype(np.float32) + + boosted_frames = int(np.sum(boost_curve > 0.20)) + if boosted_frames <= 0: + return + + sample_weights = self._frame_curve_to_sample_gain( + boost_curve, + aligned_len, + hop_length, + )[:aligned_len] + ref_body_rms = self._weighted_rms(reference_main, sample_weights) + out_body_rms = self._weighted_rms(converted_main, sample_weights) + if ref_body_rms <= 1e-6 or out_body_rms <= 1e-6: + return + + target_body_rms = target_ratio * ref_body_rms + if out_body_rms >= target_body_rms * 0.98: + return + + gain = float(np.clip(target_body_rms / (out_body_rms + eps), 1.0, max_gain)) + if gain <= 1.001: + return + + restored_main = self._apply_weighted_gain( + converted_main, + sample_weights, + gain, + ).astype(np.float32) + restored_main = soft_clip(restored_main, threshold=0.92, ceiling=0.985) + + if len(converted_audio) > aligned_len: + converted_audio = np.concatenate([restored_main, converted_audio[aligned_len:]]) + else: + 
converted_audio = restored_main + + sf.write(converted_vocals_path, converted_audio.astype(np.float32), converted_sr) + log.detail( + "Source loudness restore: " + f"boosted {boosted_frames}/{frame_count} body frames, " + f"ref={ref_body_rms:.6f}, out={out_body_rms:.6f}, gain={gain:.3f}" + ) + + def _restore_voiced_body_from_raw( + self, + raw_vocals_path: str, + source_vocals_path: str, + converted_vocals_path: str, + max_blend: float = 0.18, + ) -> None: + """Restore target timbre body from raw VC only on stable voiced phrases.""" + import librosa + import soundfile as sf + + def _preemphasis_rms(audio: np.ndarray, frame_length: int, hop_length: int) -> np.ndarray: + audio = np.asarray(audio, dtype=np.float32).reshape(-1) + if audio.size == 0: + return np.zeros(0, dtype=np.float32) + residual = np.empty_like(audio) + residual[0] = audio[0] + residual[1:] = audio[1:] - 0.97 * audio[:-1] + return librosa.feature.rms( + y=residual, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + + def _spectral_flatness(audio: np.ndarray, frame_length: int, hop_length: int) -> np.ndarray: + audio = np.asarray(audio, dtype=np.float32).reshape(-1) + if audio.size == 0: + return np.zeros(0, dtype=np.float32) + return librosa.feature.spectral_flatness( + y=audio + 1e-8, + n_fft=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + + raw_audio, raw_sr = sf.read(raw_vocals_path) + source_audio, source_sr = librosa.load(source_vocals_path, sr=None, mono=True) + converted_audio, converted_sr = sf.read(converted_vocals_path) + + if raw_audio.ndim > 1: + raw_audio = raw_audio.mean(axis=1) + if converted_audio.ndim > 1: + converted_audio = converted_audio.mean(axis=1) + + raw_audio = np.asarray(raw_audio, dtype=np.float32) + source_audio = np.asarray(source_audio, dtype=np.float32) + converted_audio = np.asarray(converted_audio, dtype=np.float32) + + if raw_sr != converted_sr: + raw_audio = librosa.resample( + 
raw_audio, + orig_sr=raw_sr, + target_sr=converted_sr, + ).astype(np.float32) + if source_sr != converted_sr: + source_audio = librosa.resample( + source_audio, + orig_sr=source_sr, + target_sr=converted_sr, + ).astype(np.float32) + + aligned_len = min(len(raw_audio), len(source_audio), len(converted_audio)) + if aligned_len <= 0: + return + + raw_main = raw_audio[:aligned_len] + source_main = source_audio[:aligned_len] + converted_main = converted_audio[:aligned_len] + + frame_length = 2048 + hop_length = 512 + eps = 1e-8 + + raw_rms = librosa.feature.rms( + y=raw_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + source_rms = librosa.feature.rms( + y=source_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + converted_rms = librosa.feature.rms( + y=converted_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + raw_hf = _preemphasis_rms(raw_main, frame_length, hop_length) + source_hf = _preemphasis_rms(source_main, frame_length, hop_length) + converted_hf = _preemphasis_rms(converted_main, frame_length, hop_length) + raw_flatness = _spectral_flatness(raw_main, frame_length, hop_length) + source_flatness = _spectral_flatness(source_main, frame_length, hop_length) + converted_flatness = _spectral_flatness(converted_main, frame_length, hop_length) + + frame_count = min( + raw_rms.size, + source_rms.size, + converted_rms.size, + raw_hf.size, + source_hf.size, + converted_hf.size, + raw_flatness.size, + source_flatness.size, + converted_flatness.size, + ) + if frame_count <= 4: + return + + raw_rms = raw_rms[:frame_count] + source_rms = source_rms[:frame_count] + converted_rms = converted_rms[:frame_count] + raw_hf = raw_hf[:frame_count] + source_hf = source_hf[:frame_count] + converted_hf = converted_hf[:frame_count] + raw_flatness = raw_flatness[:frame_count] + source_flatness = source_flatness[:frame_count] + 
converted_flatness = converted_flatness[:frame_count] + + source_db = 20.0 * np.log10(source_rms + eps) + ref_db = float(np.percentile(source_db, 95)) + activity = np.square(np.clip((source_db - (ref_db - 20.0)) / 10.0, 0.0, 1.0)) + low_mid_energy = np.clip((ref_db - 18.0 - source_db) / 22.0, 0.0, 1.0) + breath_like = ( + low_mid_energy + * np.clip((source_flatness - 0.22) / 0.22, 0.0, 1.0) + * np.clip(source_hf / (float(np.percentile(source_hf, 72)) + eps), 0.0, 1.25) + ) + + body_need = np.maximum( + np.clip((0.93 * source_rms - converted_rms) / (0.93 * source_rms + eps), 0.0, 1.0), + np.clip((0.90 * source_hf - converted_hf) / (0.90 * source_hf + eps), 0.0, 1.0), + ) + source_delta = np.abs(np.diff(source_rms, prepend=source_rms[:1])) + raw_delta = np.abs(np.diff(raw_rms, prepend=raw_rms[:1])) + converted_delta = np.abs(np.diff(converted_rms, prepend=converted_rms[:1])) + glitch_curve = np.clip( + (converted_delta - (0.008 + 1.40 * source_delta)) / (converted_delta + eps), + 0.0, + 1.0, + ) + body_gap = np.maximum( + np.clip((0.90 * raw_rms - converted_rms) / (0.90 * raw_rms + eps), 0.0, 1.0), + np.clip((0.84 * raw_hf - converted_hf) / (0.84 * raw_hf + eps), 0.0, 1.0), + ) + raw_overshoot = np.maximum( + np.clip((raw_rms - 1.18 * source_rms) / (raw_rms + eps), 0.0, 1.0), + np.clip((raw_hf - 1.18 * source_hf) / (raw_hf + eps), 0.0, 1.0), + ) + raw_roughness = np.maximum.reduce( + [ + np.clip((raw_flatness - source_flatness - 0.015) / 0.085, 0.0, 1.0), + np.clip((raw_hf - 1.08 * source_hf) / (raw_hf + eps), 0.0, 1.0), + np.clip((raw_delta - (0.006 + 1.18 * source_delta)) / (raw_delta + eps), 0.0, 1.0), + np.clip((raw_flatness - converted_flatness - 0.020) / 0.080, 0.0, 1.0), + ] + ) + stable_curve = ( + activity + * (1.0 - 0.90 * breath_like) + * (1.0 - 0.75 * glitch_curve) + * (1.0 - 0.65 * raw_overshoot) + * (1.0 - 0.82 * raw_roughness) + ) + blend_curve = np.clip(stable_curve * body_gap * body_need, 0.0, 1.0).astype(np.float32) + + smooth_kernel = np.array([1, 
2, 3, 2, 1], dtype=np.float32) + smooth_kernel /= np.sum(smooth_kernel) + for _ in range(2): + blend_curve = np.convolve(blend_curve, smooth_kernel, mode="same") + blend_curve = self._hold_activity_curve( + blend_curve, + max(1, int(0.06 * converted_sr / hop_length)), ) - frame_weights = np.clip(activity * activity, 0.0, 1.0) - return CoverPipeline._frame_curve_to_sample_gain( - frame_weights, - len(reference_audio), + blend_curve = np.clip(blend_curve, 0.0, 1.0).astype(np.float32) + + need_curve = np.clip(activity * body_need * (1.0 - 0.75 * raw_roughness), 0.0, 1.0).astype(np.float32) + if float(np.mean(need_curve)) < 0.035: + return + + restored_frames = int(np.sum(blend_curve > 0.20)) + if restored_frames <= 0: + return + + sample_blend = max_blend * self._frame_curve_to_sample_gain( + blend_curve, + aligned_len, hop_length, - ) + )[:aligned_len] + sample_blend = np.clip(sample_blend, 0.0, max_blend).astype(np.float32) + restored_main = ( + converted_main * (1.0 - sample_blend) + + raw_main * sample_blend + ).astype(np.float32) - @staticmethod - def _weighted_rms(audio: np.ndarray, weights: np.ndarray) -> float: - """Compute RMS under sample-domain weights.""" - audio = np.asarray(audio, dtype=np.float32).reshape(-1) - weights = np.asarray(weights, dtype=np.float32).reshape(-1) - if audio.size == 0 or weights.size == 0: - return 0.0 + trim_weights = self._frame_curve_to_sample_gain( + np.clip(stable_curve, 0.0, 1.0), + aligned_len, + hop_length, + )[:aligned_len] + ref_body_rms = self._weighted_rms(raw_main, trim_weights) + out_body_rms = self._weighted_rms(restored_main, trim_weights) + gain = 1.0 + if ref_body_rms > 1e-6 and out_body_rms > ref_body_rms * 0.99: + gain = float(np.clip((0.99 * ref_body_rms) / (out_body_rms + eps), 0.92, 1.0)) + if gain < 0.999: + restored_main = self._apply_weighted_gain( + restored_main, + trim_weights, + gain, + ).astype(np.float32) + + restored_main = soft_clip(restored_main, threshold=0.92, ceiling=0.985) - aligned_len = 
min(audio.size, weights.size) - if aligned_len <= 0: - return 0.0 + if len(converted_audio) > aligned_len: + converted_audio = np.concatenate([restored_main, converted_audio[aligned_len:]]) + else: + converted_audio = restored_main - audio = audio[:aligned_len] - weights = np.clip(weights[:aligned_len], 0.0, 1.0) - total = float(np.sum(weights)) - if total <= 1e-6: - return 0.0 - return float(np.sqrt(np.sum((audio * audio) * weights) / total + 1e-12)) + sf.write(converted_vocals_path, converted_audio.astype(np.float32), converted_sr) + log.detail( + "Raw body restore: " + f"blended {restored_frames}/{frame_count} stable voiced frames from raw VC " + f"(max_blend={max_blend:.2f}, gain={gain:.3f}, " + f"rough_frames={int(np.sum(raw_roughness > 0.20))})" + ) - def _apply_source_gap_suppression( + def _apply_artifact_segment_rescue( self, source_vocals_path: str, converted_vocals_path: str, + max_source_blend: float = 0.86, ) -> None: - """Suppress hallucinated noise in sustained no-vocal gaps only.""" + """Blend toward dry source only on short segments where final VC still sounds unstable.""" import librosa import soundfile as sf + def _preemphasis_rms(audio: np.ndarray, frame_length: int, hop_length: int) -> np.ndarray: + audio = np.asarray(audio, dtype=np.float32).reshape(-1) + if audio.size == 0: + return np.zeros(0, dtype=np.float32) + residual = np.empty_like(audio) + residual[0] = audio[0] + residual[1:] = audio[1:] - 0.97 * audio[:-1] + return librosa.feature.rms( + y=residual, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + + def _spectral_flatness(audio: np.ndarray, frame_length: int, hop_length: int) -> np.ndarray: + audio = np.asarray(audio, dtype=np.float32).reshape(-1) + if audio.size == 0: + return np.zeros(0, dtype=np.float32) + return librosa.feature.spectral_flatness( + y=audio + 1e-8, + n_fft=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + source_audio, source_sr = 
librosa.load(source_vocals_path, sr=None, mono=True) converted_audio, converted_sr = sf.read(converted_vocals_path) if converted_audio.ndim > 1: converted_audio = converted_audio.mean(axis=1) + source_audio = np.asarray(source_audio, dtype=np.float32) converted_audio = np.asarray(converted_audio, dtype=np.float32) @@ -730,32 +1951,216 @@ class CoverPipeline: if aligned_len <= 0: return - source_audio = source_audio[:aligned_len] + source_main = source_audio[:aligned_len] converted_main = converted_audio[:aligned_len] - gain, gated_frames, total_frames = self._compute_quiet_gap_sample_gain( - source_audio, - converted_sr, + + frame_length = 2048 + hop_length = 512 + eps = 1e-8 + + source_rms = librosa.feature.rms( + y=source_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + converted_rms = librosa.feature.rms( + y=converted_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + source_hf = _preemphasis_rms(source_main, frame_length, hop_length) + converted_hf = _preemphasis_rms(converted_main, frame_length, hop_length) + source_flatness = _spectral_flatness(source_main, frame_length, hop_length) + converted_flatness = _spectral_flatness(converted_main, frame_length, hop_length) + + frame_count = min( + source_rms.size, + converted_rms.size, + source_hf.size, + converted_hf.size, + source_flatness.size, + converted_flatness.size, ) - gain = np.clip(gain[:aligned_len], 0.0, 1.0).astype(np.float32) - suppressed = converted_main * gain + if frame_count <= 4: + return - attenuated_samples = int(np.sum(gain < 0.08)) - if attenuated_samples > 0: - log.detail( - f"Source gap suppression: attenuated {attenuated_samples}/{aligned_len} samples in no-vocal regions" + source_rms = source_rms[:frame_count] + converted_rms = converted_rms[:frame_count] + source_hf = source_hf[:frame_count] + converted_hf = converted_hf[:frame_count] + source_flatness = source_flatness[:frame_count] 
+ converted_flatness = converted_flatness[:frame_count] + + source_db = 20.0 * np.log10(source_rms + eps) + ref_db = float(np.percentile(source_db, 95)) + activity = np.square(np.clip((source_db - (ref_db - 24.0)) / 11.0, 0.0, 1.0)) + support_activity = np.clip((source_db - (ref_db - 30.0)) / 14.0, 0.0, 1.0) + low_mid_energy = np.clip((ref_db - 18.0 - source_db) / 22.0, 0.0, 1.0) + not_sustained_gap = 1.0 - np.clip((ref_db - 42.0 - source_db) / 8.0, 0.0, 1.0) + quiet_curve = np.clip((ref_db - 35.0 - source_db) / 12.0, 0.0, 1.0) + + excess_rms = np.clip( + (converted_rms - 1.05 * source_rms) / (converted_rms + eps), + 0.0, + 1.0, + ) + excess_hf = np.clip( + (converted_hf - 1.03 * source_hf) / (converted_hf + eps), + 0.0, + 1.0, + ) + source_delta = np.abs(np.diff(source_rms, prepend=source_rms[:1])) + converted_delta = np.abs(np.diff(converted_rms, prepend=converted_rms[:1])) + spike_excess = np.clip( + (converted_delta - (0.006 + 1.35 * source_delta)) / (converted_delta + eps), + 0.0, + 1.0, + ) + + tonalized_breath = ( + not_sustained_gap + * np.clip((source_flatness - 0.20) / 0.24, 0.0, 1.0) + * np.maximum( + np.clip((source_flatness - converted_flatness) / 0.15, 0.0, 1.0), + np.clip((converted_hf - 0.95 * source_hf) / (converted_hf + eps), 0.0, 1.0), ) - if gated_frames > 0: - log.detail( - f"Source gap suppression: detected {gated_frames}/{total_frames} sustained quiet frames" + ) + quiet_residue = quiet_curve * np.maximum(excess_rms, excess_hf) + active_residue = activity * np.maximum( + spike_excess, + np.clip((converted_hf - 1.15 * source_hf) / (converted_hf + eps), 0.0, 1.0), + ) + converted_db = 20.0 * np.log10(converted_rms + eps) + audible_output = np.clip((converted_db + 58.0) / 12.0, 0.0, 1.0) + hf_ratio = converted_hf / (source_hf + eps) + low_energy_tonal_hf = ( + audible_output + * low_mid_energy + * not_sustained_gap + * np.maximum( + np.clip((hf_ratio - 1.18) / 2.20, 0.0, 1.0), + np.clip((converted_rms - 1.18 * source_rms) / (converted_rms + 
eps), 0.0, 1.0), ) + ) + + breath_curve = 0.96 * tonalized_breath * np.maximum(excess_rms, excess_hf) + quiet_fix_curve = 0.82 * quiet_residue + active_fix_curve = 0.52 * active_residue + low_energy_hf_curve = 0.90 * low_energy_tonal_hf + blend_curve = np.maximum.reduce( + [breath_curve, quiet_fix_curve, active_fix_curve, low_energy_hf_curve] + ) + + smooth_kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) + smooth_kernel /= np.sum(smooth_kernel) + for _ in range(2): + blend_curve = np.convolve(blend_curve, smooth_kernel, mode="same") + + blend_curve = self._hold_activity_curve( + blend_curve, + max(1, int(0.05 * converted_sr / hop_length)), + ) + blend_curve = np.clip(blend_curve, 0.0, 1.0).astype(np.float32) + + rescued_frames = int(np.sum(blend_curve > 0.18)) + if rescued_frames <= 0: + return + + frame_cap = np.maximum.reduce( + [ + 0.40 * activity, + 0.58 * support_activity * np.clip(spike_excess, 0.0, 1.0), + 0.70 * quiet_curve, + 0.84 * tonalized_breath, + 0.90 * low_energy_tonal_hf, + ] + ) + effective_curve = np.minimum( + blend_curve, + np.clip(frame_cap / max(max_source_blend, 1e-6), 0.0, 1.0), + ).astype(np.float32) + + sample_blend = max_source_blend * self._frame_curve_to_sample_gain( + effective_curve, + aligned_len, + hop_length, + )[:aligned_len] + sample_blend = np.clip(sample_blend, 0.0, max_source_blend).astype(np.float32) + + rescued_main = ( + converted_main * (1.0 - sample_blend) + + source_main * sample_blend + ).astype(np.float32) + rescued_rms = librosa.feature.rms( + y=rescued_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0].astype(np.float32) + rescued_rms = self._fit_frame_curve(rescued_rms, frame_count) + risk_trim_curve = np.clip( + low_energy_tonal_hf * (1.0 - 0.65 * activity), + 0.0, + 1.0, + ) + for _ in range(2): + risk_trim_curve = np.convolve(risk_trim_curve, smooth_kernel, mode="same") + risk_trim_curve = self._hold_activity_curve( + risk_trim_curve, + max(1, int(0.04 * converted_sr / 
hop_length)), + ) + risk_trim_curve = np.clip(risk_trim_curve, 0.0, 1.0).astype(np.float32) + residual_floor = (10.0 ** (-58.0 / 20.0)) + target_rms = np.maximum( + 1.20 * source_rms + float(np.percentile(source_rms, 95)) * 0.0025, + residual_floor, + ) + trim_gain = np.clip(target_rms / (rescued_rms + eps), 0.42, 1.0) + trim_gain = 1.0 - risk_trim_curve * (1.0 - trim_gain) + trim_sample_gain = self._frame_curve_to_sample_gain( + trim_gain, + aligned_len, + hop_length, + )[:aligned_len] + rescued_main = (rescued_main * trim_sample_gain).astype(np.float32) + + activity_weights = self._frame_curve_to_sample_gain( + np.clip(activity, 0.0, 1.0), + aligned_len, + hop_length, + )[:aligned_len] + ref_active_rms = self._weighted_rms(converted_main, activity_weights) + out_active_rms = self._weighted_rms(rescued_main, activity_weights) + gain = 1.0 + if ref_active_rms > 1e-6 and out_active_rms < ref_active_rms * 0.92: + gain = float(np.clip((0.95 * ref_active_rms) / (out_active_rms + eps), 1.0, 1.08)) + if gain > 1.001: + rescued_main = self._apply_weighted_gain( + rescued_main, + activity_weights, + gain, + ).astype(np.float32) + + rescued_main = soft_clip(rescued_main, threshold=0.92, ceiling=0.985) if len(converted_audio) > aligned_len: - tail = converted_audio[aligned_len:] * 0.0 - converted_audio = np.concatenate([suppressed, tail.astype(np.float32)]) + converted_audio = np.concatenate([rescued_main, converted_audio[aligned_len:]]) else: - converted_audio = suppressed + converted_audio = rescued_main sf.write(converted_vocals_path, converted_audio.astype(np.float32), converted_sr) + log.detail( + "Artifact segment rescue: " + f"rescued {rescued_frames}/{frame_count} frames " + f"(breath={int(np.sum(breath_curve > 0.18))}, " + f"quiet={int(np.sum(quiet_fix_curve > 0.18))}, " + f"active={int(np.sum(active_fix_curve > 0.18))}, " + f"tonal_hf={int(np.sum(low_energy_hf_curve > 0.18))}, " + f"trim={int(np.sum(risk_trim_curve > 0.18))}, " + f"gain={gain:.3f})" + ) 
@staticmethod def _compute_quiet_gap_sample_gain( @@ -795,11 +2200,11 @@ class CoverPipeline: ) quiet_mask = ( - (frame_db < (ref_db - 36.0)) - & (activity < 0.12) + (frame_db < (ref_db - 42.0)) + & (activity < 0.08) ) - min_frames = max(1, int(0.12 * sr / hop_length)) + min_frames = max(1, int(0.16 * sr / hop_length)) gate = quiet_mask.astype(np.float32) filtered = np.zeros_like(gate) run_start = 0 @@ -817,12 +2222,12 @@ class CoverPipeline: if in_run and (len(gate) - run_start) >= min_frames: filtered[run_start:len(gate)] = 1.0 - transition_frames = max(1, int(0.04 * sr / hop_length)) + transition_frames = max(1, int(0.05 * sr / hop_length)) smooth_kernel = np.ones(transition_frames, dtype=np.float32) / transition_frames filtered = np.convolve(filtered, smooth_kernel, mode="same") filtered = np.clip(filtered, 0.0, 1.0) - gain_curve = 1.0 - filtered * 0.92 + gain_curve = 1.0 - filtered * 0.55 sample_gain = CoverPipeline._frame_curve_to_sample_gain( gain_curve, len(reference_audio), @@ -904,7 +2309,10 @@ class CoverPipeline: if normalized_mode == "on": return vc_preprocessed if normalized_mode == "auto": - return vc_preprocessed and self._last_vc_preprocess_mode in {"uvr_deecho", "legacy"} + return vc_preprocessed and self._last_vc_preprocess_mode in { + "strict_deecho", + "strict_deecho_plus", + } return False def _refine_source_constrained_output( @@ -913,25 +2321,67 @@ class CoverPipeline: converted_vocals_path: str, source_constraint_mode: str, f0_method: str, + original_vocals_path: Optional[str] = None, + session_dir: Optional[Path] = None, ) -> None: - """Apply extra cleanup passes for mature UVR DeEcho routing.""" + """Apply extra cleanup passes for the strict learned DeEcho routing.""" normalized_mode = str(source_constraint_mode or "auto").strip().lower() if normalized_mode != "auto": return - if self._last_vc_preprocess_mode != "uvr_deecho": + if self._last_vc_preprocess_mode not in {"strict_deecho", "strict_deecho_plus"}: return 
self._apply_silence_gate_official( vocals_path=source_vocals_path, converted_path=converted_vocals_path, f0_method=f0_method, - silence_threshold_db=-42.0, + silence_threshold_db=-48.0, silence_smoothing_ms=35.0, - silence_min_duration_ms=80.0, - protect=0.0, + silence_min_duration_ms=120.0, + protect=0.35, ) log.detail("Low-energy unvoiced cleanup: applied after source-guided reconstruction") + self._apply_source_breath_cleanup( + source_vocals_path=source_vocals_path, + converted_vocals_path=converted_vocals_path, + ) + + self._apply_source_gap_suppression( + source_vocals_path=source_vocals_path, + converted_vocals_path=converted_vocals_path, + ) + self._apply_source_transition_cleanup( + source_vocals_path=source_vocals_path, + converted_vocals_path=converted_vocals_path, + ) + if original_vocals_path: + self._restore_active_vocal_loudness( + reference_vocals_path=original_vocals_path, + converted_vocals_path=converted_vocals_path, + ) + if session_dir: + raw_snapshot_path = None + for candidate_name in ( + "debug_converted_raw.wav", + "debug_converted_raw_current.wav", + "debug_converted_raw_upstream.wav", + "debug_converted_raw_repair.wav", + ): + candidate_path = session_dir / candidate_name + if candidate_path.exists(): + raw_snapshot_path = str(candidate_path) + break + if raw_snapshot_path: + self._restore_voiced_body_from_raw( + raw_vocals_path=raw_snapshot_path, + source_vocals_path=source_vocals_path, + converted_vocals_path=converted_vocals_path, + ) + self._apply_artifact_segment_rescue( + source_vocals_path=source_vocals_path, + converted_vocals_path=converted_vocals_path, + ) self._apply_source_gap_suppression( source_vocals_path=source_vocals_path, converted_vocals_path=converted_vocals_path, @@ -1022,14 +2472,27 @@ class CoverPipeline: echo_ratio = echo_ratio[:n_blend] # --- Blending weight --- - # Base: original low-activity weight (for silent gaps) - base_weight = 0.65 * np.square(1.0 - activity[:n_blend]) - # Echo boost: even during active 
singing, apply DeEcho proportional - # to detected echo. Max additional contribution capped at 0.55. - echo_boost = 0.55 * echo_ratio * activity[:n_blend] - deecho_weight = base_weight + echo_boost + # 全局回音水平驱动系数自适应 + global_echo = float(np.mean(echo_ratio)) + # 沉默段基权: 轻回音0.68, 重回音0.86 + base_coef = 0.68 + 0.18 * global_echo + base_weight = base_coef * np.square(1.0 - activity[:n_blend]) + # 活跃唱段在检测到明显回声时更激进,但避免把主唱主体削得过干。 + echo_boost_coef = 0.60 + 0.20 * global_echo + echo_boost = echo_boost_coef * echo_ratio * activity[:n_blend] + active_floor = ( + activity[:n_blend] + * np.clip((echo_ratio - 0.32) / 0.28, 0.0, 1.0) + * (0.58 + 0.18 * global_echo) + ) + tail_support = ( + (1.0 - activity[:n_blend]) + * np.clip((echo_ratio - 0.14) / 0.22, 0.0, 1.0) + * (0.16 + 0.08 * global_echo) + ) + deecho_weight = np.maximum(base_weight + echo_boost + tail_support, active_floor) deecho_weight = np.convolve(deecho_weight, smooth_kernel, mode="same") - deecho_weight = np.clip(deecho_weight, 0.0, 0.80) + deecho_weight = np.clip(deecho_weight, 0.0, 0.95) deecho_weight = CoverPipeline._frame_curve_to_sample_gain( deecho_weight, aligned_len, @@ -1041,6 +2504,117 @@ class CoverPipeline: blended = np.concatenate([blended, direct_mono[aligned_len:]]) return blended.astype(np.float32) + @staticmethod + def _merge_deecho_hotspot_cleanup( + direct_mono: np.ndarray, + deecho_mono: np.ndarray, + aggressive_mono: np.ndarray, + sr: int, + ) -> Tuple[np.ndarray, Dict[str, float]]: + """Apply the aggressive dereverb pass only where active echo is still obvious.""" + import librosa + + direct_mono = np.asarray(direct_mono, dtype=np.float32).reshape(-1) + deecho_mono = np.asarray(deecho_mono, dtype=np.float32).reshape(-1) + aggressive_mono = np.asarray(aggressive_mono, dtype=np.float32).reshape(-1) + aligned_len = min(direct_mono.size, deecho_mono.size, aggressive_mono.size) + if aligned_len <= 0: + return deecho_mono.astype(np.float32), { + "hotspot_frames": 0.0, + "avg_weight": 0.0, + 
"max_weight": 0.0, + "coverage_ratio": 0.0, + } + + direct_main = direct_mono[:aligned_len] + deecho_main = deecho_mono[:aligned_len] + aggressive_main = aggressive_mono[:aligned_len] + + frame_length = 2048 + hop_length = 512 + eps = 1e-8 + smooth_kernel = np.array([1, 2, 3, 2, 1], dtype=np.float32) + smooth_kernel /= np.sum(smooth_kernel) + + direct_rms = librosa.feature.rms( + y=direct_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0] + deecho_rms = librosa.feature.rms( + y=deecho_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0] + aggressive_rms = librosa.feature.rms( + y=aggressive_main, + frame_length=frame_length, + hop_length=hop_length, + center=True, + )[0] + + frame_count = min(direct_rms.size, deecho_rms.size, aggressive_rms.size) + if frame_count <= 0: + return deecho_mono.astype(np.float32), { + "hotspot_frames": 0.0, + "avg_weight": 0.0, + "max_weight": 0.0, + "coverage_ratio": 0.0, + } + + direct_rms = direct_rms[:frame_count].astype(np.float32) + deecho_rms = deecho_rms[:frame_count].astype(np.float32) + aggressive_rms = aggressive_rms[:frame_count].astype(np.float32) + + direct_db = 20.0 * np.log10(direct_rms + eps) + ref_db = float(np.percentile(direct_db, 95)) + activity = np.square(np.clip((direct_db - (ref_db - 18.0)) / 9.0, 0.0, 1.0)) + support_activity = np.clip((direct_db - (ref_db - 24.0)) / 12.0, 0.0, 1.0) + + primary_removed = np.clip(1.0 - (deecho_rms / (direct_rms + eps)), 0.0, 1.0) + secondary_removed = np.clip(1.0 - (aggressive_rms / (deecho_rms + eps)), 0.0, 1.0) + hotspot_curve = ( + activity + * np.clip((secondary_removed - 0.10) / 0.22, 0.0, 1.0) + * np.clip((primary_removed - 0.02) / 0.10, 0.0, 1.0) + ) + support_curve = ( + 0.22 + * support_activity + * np.clip((secondary_removed - 0.16) / 0.20, 0.0, 1.0) + ) + aggressive_weight = np.clip(hotspot_curve + support_curve, 0.0, 0.62) + aggressive_weight = np.convolve(aggressive_weight, smooth_kernel, mode="same") + 
aggressive_weight = CoverPipeline._hold_activity_curve( + aggressive_weight, + max(1, int(0.03 * sr / hop_length)), + ) + aggressive_weight = np.clip(aggressive_weight, 0.0, 0.62).astype(np.float32) + + sample_weight = CoverPipeline._frame_curve_to_sample_gain( + aggressive_weight, + aligned_len, + hop_length, + )[:aligned_len] + cleaned_main = ( + deecho_main * (1.0 - sample_weight) + + aggressive_main * sample_weight + ).astype(np.float32) + + if deecho_mono.size > aligned_len: + cleaned = np.concatenate([cleaned_main, deecho_mono[aligned_len:]]) + else: + cleaned = cleaned_main + + return cleaned.astype(np.float32), { + "hotspot_frames": float(np.sum(aggressive_weight > 0.20)), + "avg_weight": float(np.mean(aggressive_weight)), + "max_weight": float(np.max(aggressive_weight)), + "coverage_ratio": float(np.mean(aggressive_weight > 0.20)), + } + def _prepare_vocals_for_vc( self, vocals_path: str, @@ -1051,103 +2625,120 @@ class CoverPipeline: Prepare vocals for VC using a mature-project-friendly routing strategy. 
Modes: - - auto: prefer learned UVR DeEcho/DeReverb, otherwise advanced dereverb -> RVC - - direct: pass separated lead directly to RVC - - uvr_deecho: require learned UVR DeEcho if available, else fallback to advanced dereverb - - advanced_dereverb: use binary residual masking to separate dry/wet, convert dry only - - legacy: old hand-crafted dereverb + tail gating chain + - auto: require public RoFormer De-Reverb -> RVC; no model downgrade + - uvr_deecho: strict alias of the RoFormer De-Reverb path """ import librosa import soundfile as sf preprocess_mode = str(preprocess_mode or "auto").strip().lower() - if preprocess_mode not in {"auto", "direct", "uvr_deecho", "advanced_dereverb", "legacy"}: + if preprocess_mode not in {"auto", "uvr_deecho"}: + log.warning( + f"严格SOTA模式不再支持 VC 预处理 '{preprocess_mode}',已改用 RoFormer De-Reverb" + ) preprocess_mode = "auto" - # 保存原始混响用于后处理 - self._original_reverb_path = None - - if preprocess_mode == "advanced_dereverb": - # 使用高级去混响:分离干声和混响 - audio, sr = librosa.load(vocals_path, sr=None, mono=False) - audio = self._ensure_2d(audio).astype(np.float32) - mono = self._select_mono_for_vc(audio, sr) - - log.detail("VC preprocess: advanced dereverb (binary residual masking)") - dry_signal, reverb_tail = advanced_dereverb(mono, sr) - - # 保存混响用于后处理 - reverb_path = session_dir / "original_reverb.wav" - sf.write(str(reverb_path), reverb_tail, sr) - self._original_reverb_path = str(reverb_path) - - mono = dry_signal - self._last_vc_preprocess_mode = "advanced_dereverb" - log.detail(f"Dry/Wet separation: dry RMS={np.sqrt(np.mean(dry_signal**2)):.4f}, reverb RMS={np.sqrt(np.mean(reverb_tail**2)):.4f}") - - elif preprocess_mode == "legacy": - audio, sr = librosa.load(vocals_path, sr=None, mono=False) - audio = self._ensure_2d(audio).astype(np.float32) - mono = self._select_mono_for_vc(audio, sr) - mono_dry = mono.copy() - mono = self._dereverb_for_vc(mono, sr) - mono = self._gate_echo_tails(mono_dry, mono, sr) - self._last_vc_preprocess_mode = 
"legacy" - log.detail("VC preprocess: legacy dereverb chain -> mono select") - else: - preprocess_input = vocals_path - if preprocess_mode in {"auto", "uvr_deecho"}: - preprocess_input = self._apply_uvr_deecho_for_vc(vocals_path, session_dir) or vocals_path - - if preprocess_input == vocals_path: - # 如果UVR DeEcho不可用,在auto模式下使用advanced dereverb - if preprocess_mode == "auto": - audio, sr = librosa.load(vocals_path, sr=None, mono=False) - audio = self._ensure_2d(audio).astype(np.float32) - mono = self._select_mono_for_vc(audio, sr) - - log.detail("VC preprocess: UVR DeEcho not available, using advanced dereverb") - dry_signal, reverb_tail = advanced_dereverb(mono, sr) - - # 保存混响用于后处理 - reverb_path = session_dir / "original_reverb.wav" - sf.write(str(reverb_path), reverb_tail, sr) - self._original_reverb_path = str(reverb_path) - - mono = dry_signal - self._last_vc_preprocess_mode = "advanced_dereverb" - log.detail(f"Dry/Wet separation: dry RMS={np.sqrt(np.mean(dry_signal**2)):.4f}, reverb RMS={np.sqrt(np.mean(reverb_tail**2)):.4f}") - else: - self._last_vc_preprocess_mode = "direct" - if preprocess_mode == "uvr_deecho": - log.warning("Official DeEcho model not found, falling back to direct lead input") - log.detail("VC preprocess: direct lead -> mono select") - audio, sr = librosa.load(preprocess_input, sr=None, mono=False) - audio = self._ensure_2d(audio).astype(np.float32) - mono = self._select_mono_for_vc(audio, sr) - else: - self._last_vc_preprocess_mode = "uvr_deecho" - log.detail("VC preprocess: UVR learned DeEcho/DeReverb -> mono select") + self._deecho_metrics = None + + preprocess_input = self._apply_roformer_deecho_for_vc(vocals_path, session_dir) + if preprocess_input == vocals_path: + raise RuntimeError("严格SOTA模式下 RoFormer De-Reverb 未产生可用输出") + + self._last_vc_preprocess_mode = "strict_deecho" + log.detail("VC preprocess: strict RoFormer De-Reverb -> mono select") + + direct_audio, sr = librosa.load(vocals_path, sr=None, mono=False) + deecho_audio, 
deecho_sr = librosa.load(preprocess_input, sr=None, mono=False) + direct_audio = self._ensure_2d(direct_audio).astype(np.float32) + deecho_audio = self._ensure_2d(deecho_audio).astype(np.float32) + direct_mono = self._select_mono_for_vc(direct_audio, sr) + deecho_mono = self._select_mono_for_vc(deecho_audio, deecho_sr) + if deecho_sr != sr: + deecho_mono = librosa.resample( + deecho_mono, + orig_sr=deecho_sr, + target_sr=sr, + ).astype(np.float32) - if preprocess_input == vocals_path: - audio, sr = librosa.load(preprocess_input, sr=None, mono=False) - audio = self._ensure_2d(audio).astype(np.float32) - mono = self._select_mono_for_vc(audio, sr) + deecho_metrics = getattr(self, '_deecho_metrics', None) + skip_blend = False + secondary_dry_cleanup = False + hotspot_cleaned = False + hotspot_stats = { + "hotspot_frames": 0.0, + "avg_weight": 0.0, + "max_weight": 0.0, + "coverage_ratio": 0.0, + } + if deecho_metrics: + sep_db = deecho_metrics.get('separation_db', 0.0) + corr = deecho_metrics.get('corr', 0.0) + active_ratio = deecho_metrics.get('active_ratio', 1.0) + reduction_ratio = deecho_metrics.get('reduction_ratio', 0.0) + log.detail( + "DeEcho quality: " + f"sep={sep_db:.2f}dB, corr={corr:.3f}, " + f"active_ratio={active_ratio:.3f}, reduction={reduction_ratio:.3f}" + ) + # sep/corr only mean the output is coherent. Skip blend only when + # the strict DeEcho branch is also meaningfully drier than source. 
+ if ( + sep_db > 30.0 + and corr > 0.9 + and not (active_ratio > 0.90 and reduction_ratio < 0.25) + ): + skip_blend = True + if active_ratio > 0.93 and reduction_ratio < 0.18: + secondary_dry_cleanup = True + + if secondary_dry_cleanup: + dry_signal, _ = advanced_dereverb(deecho_mono, sr) + deecho_rms = float(np.sqrt(np.mean(np.square(deecho_mono)) + 1e-12)) + delta_rms = float( + np.sqrt(np.mean(np.square(dry_signal.astype(np.float32) - deecho_mono)) + 1e-12) + ) + delta_ratio = float(delta_rms / (deecho_rms + 1e-12)) + if delta_ratio > 0.025: + cleaned_mono, hotspot_stats = CoverPipeline._merge_deecho_hotspot_cleanup( + direct_mono=direct_mono, + deecho_mono=deecho_mono, + aggressive_mono=dry_signal.astype(np.float32), + sr=sr, + ) + if hotspot_stats.get("hotspot_frames", 0.0) > 0: + deecho_mono = cleaned_mono.astype(np.float32) + hotspot_cleaned = True + if hotspot_cleaned: + self._last_vc_preprocess_mode = "strict_deecho_plus" + log.detail( + "VC preprocess: strict DeEcho active-echo hotspot cleanup " + f"(delta_ratio={delta_ratio:.3f}, " + f"hotspot_frames={int(hotspot_stats.get('hotspot_frames', 0.0))}, " + f"coverage={hotspot_stats.get('coverage_ratio', 0.0):.3f}, " + f"avg_weight={hotspot_stats.get('avg_weight', 0.0):.3f}, " + f"max_weight={hotspot_stats.get('max_weight', 0.0):.3f})" + ) else: - direct_audio, sr = librosa.load(vocals_path, sr=None, mono=False) - deecho_audio, deecho_sr = librosa.load(preprocess_input, sr=None, mono=False) - direct_audio = self._ensure_2d(direct_audio).astype(np.float32) - deecho_audio = self._ensure_2d(deecho_audio).astype(np.float32) - direct_mono = self._select_mono_for_vc(direct_audio, sr) - deecho_mono = self._select_mono_for_vc(deecho_audio, deecho_sr) - if deecho_sr != sr: - deecho_mono = librosa.resample( - deecho_mono, - orig_sr=deecho_sr, - target_sr=sr, - ).astype(np.float32) - mono = self._blend_direct_with_deecho(direct_mono, deecho_mono, sr) - log.detail("VC preprocess: blended direct lead with UVR DeEcho") + 
log.detail( + "VC preprocess: advanced dereverb second pass had no strong active-echo hotspots, " + f"keeping strict DeEcho output (delta_ratio={delta_ratio:.3f})" + ) + + if hotspot_cleaned: + skip_blend = False + coverage_ratio = float(hotspot_stats.get("coverage_ratio", 0.0)) + if coverage_ratio > 0.35: + log.detail( + "VC preprocess: hotspot cleanup touched a wide region; " + "forcing direct/deecho blend to preserve vocal body" + ) + + if skip_blend: + mono = deecho_mono + log.detail("VC preprocess: strict DeEcho quality sufficient, using deecho directly (skip blend)") + else: + mono = CoverPipeline._blend_direct_with_deecho(direct_mono, deecho_mono, sr) + log.detail("VC preprocess: blended direct lead with strict DeEcho (enhanced)") mono = soft_clip(mono, threshold=0.9, ceiling=0.99) @@ -1280,12 +2871,14 @@ class CoverPipeline: gate_pipe = VoiceConversionPipeline(device=self.device) root_dir = Path(__file__).parent.parent rmvpe_path = root_dir / "assets" / "rmvpe" / "rmvpe.pt" - if f0_method in ("rmvpe", "hybrid"): + gate_policy = resolve_cover_f0_policy(f0_method) + gate_method = gate_policy.gate_method + if gate_method in ("rmvpe", "hybrid"): if not rmvpe_path.exists(): raise FileNotFoundError(f"RMVPE 模型未找到: {rmvpe_path}") - gate_pipe.load_f0_extractor(f0_method, str(rmvpe_path)) + gate_pipe.load_f0_extractor(gate_method, str(rmvpe_path)) else: - gate_pipe.load_f0_extractor(f0_method, None) + gate_pipe.load_f0_extractor(gate_method, None) f0 = gate_pipe.f0_extractor.extract(audio_in) gate_pipe.unload_f0_extractor() @@ -1473,8 +3066,12 @@ class CoverPipeline: # phase interference / tearing artifacts), we only constrain the # MAGNITUDE toward the source envelope while preserving the converted # signal's phase. This eliminates phase cancellation. 
- source_replace = 0.85 * (1.0 - activity)[np.newaxis, :] * (1.0 - soft_mask) - source_replace = np.clip(source_replace, 0.0, 0.70) + source_replace = compute_active_source_replace( + activity=activity, + soft_mask=soft_mask, + echo_ratio=echo_ratio, + direct_ratio=direct_ratio, + ) # Target magnitude: blend toward source magnitude, keep converted phase target_mag = conv_mag * (1.0 - source_replace) + src_mag * source_replace @@ -1503,8 +3100,8 @@ class CoverPipeline: reference_audio=orig if orig is not None else src, target_audio=constrained, sr=conv_sr, - min_gain=0.95, # 放宽到0.95,只降低5%(从0.80改为0.95) - max_gain=1.30, # 允许更大的提升(从1.25改为1.30) + min_gain=0.85, + max_gain=1.12, ) if abs(gain - 1.0) > 1e-3 and out_rms > 1e-6 and ref_rms > 1e-6: constrained = self._apply_weighted_gain(constrained, gain_weights, gain) @@ -1522,12 +3119,15 @@ class CoverPipeline: base_budget_rms = np.maximum(src_frame_rms, orig_frame_rms) ref_frame_rms = float(np.percentile(base_budget_rms, 95)) energy_guard = np.clip(0.20 * direct_activity + 0.15 * direct_ratio + 0.65 * phrase_activity, 0.0, 1.0) - allowed_boost = 0.50 + 1.50 * energy_guard # 提高基础boost(从0.35改为0.50,从1.20改为1.50) noise_floor = ref_frame_rms * (0.002 + 0.005 * (1.0 - phrase_activity)) # 降低noise_floor + allowed_boost, cleanup_floor = compute_source_cleanup_budget( + energy_guard=energy_guard, + phrase_activity=phrase_activity, + ) frame_budget = base_budget_rms * allowed_boost + noise_floor cleanup_gain = np.clip( frame_budget / (constrained_frame_rms + eps), - 0.75 + 0.20 * phrase_activity, # 提高最小增益(从0.55改为0.75) + cleanup_floor, 1.0, ) cleanup_gain = np.convolve(cleanup_gain, frame_kernel, mode="same") @@ -1594,6 +3194,7 @@ class CoverPipeline: demucs_shifts: int = 2, demucs_overlap: float = 0.25, demucs_split: bool = True, + roformer_model: str = ROFORMER_DEFAULT_MODEL, separator: str = "uvr5", uvr5_model: Optional[str] = None, uvr5_agg: int = 10, @@ -1638,6 +3239,7 @@ class CoverPipeline: demucs_shifts: Demucs shifts 参数 
demucs_overlap: Demucs overlap 参数 demucs_split: Demucs split 参数 + roformer_model: RoFormer / audio-separator 模型或 ensemble preset hubert_layer: HuBERT 输出层 silence_gate: 是否启用静音门限 silence_threshold_db: 静音阈值 (dB, 相对峰值) @@ -1694,6 +3296,7 @@ class CoverPipeline: log.detail(f"文件大小: {_format_size(input_size)}") log.detail(f"音频时长: {_format_duration(input_duration)}") log.detail(f"会话目录: {session_dir}") + log.detail(f"Quality debug report: {session_dir / 'quality_debug.json'}") log.separator() # 记录参数配置 log.config(f"RVC模型: {Path(model_path).name}") @@ -1710,7 +3313,7 @@ class CoverPipeline: log.config(f"UVR5模型: {uvr5_model or '自动选择'}") log.config(f"UVR5激进度: {uvr5_agg}") elif effective_separator == "roformer": - log.config(f"Roformer模型: {ROFORMER_DEFAULT_MODEL}") + log.config(f"Roformer模型: {roformer_model}") else: log.config(f"Demucs模型: {demucs_model}") log.config(f"Demucs shifts: {demucs_shifts}") @@ -1763,13 +3366,16 @@ class CoverPipeline: ) log.success("UVR5分离完成") elif effective_separator == "roformer": - log.model("使用 Mel-Band Roformer 进行人声分离") - self._init_separator("roformer") + log.model("使用公开 SOTA RoFormer ensemble 进行人声分离") + self._init_separator( + "roformer", + roformer_model=roformer_model, + ) vocals_path, accompaniment_path = self.separator.separate( input_audio, str(session_dir) ) - log.success("Mel-Band Roformer 分离完成") + log.success("RoFormer ensemble 分离完成") else: log.model(f"使用Demucs进行人声分离: {demucs_model}") self._init_separator( @@ -1806,34 +3412,47 @@ class CoverPipeline: vocals_path = lead_vocals_path normalized_vc_preprocess_mode = str(vc_preprocess_mode or "auto").strip().lower() + if normalized_vc_preprocess_mode not in {"auto", "uvr_deecho"}: + log.warning( + f"严格SOTA模式不再接受 VC 预处理 '{normalized_vc_preprocess_mode}',已改用 auto" + ) + normalized_vc_preprocess_mode = "auto" normalized_source_constraint_mode = str(source_constraint_mode or "auto").strip().lower() - available_uvr_deecho_model = self._get_available_uvr_deecho_model() + available_deecho_model = 
self._get_preferred_deecho_model_label() log.config(f"VC预处理模式: {normalized_vc_preprocess_mode}") if normalized_vc_preprocess_mode in {"auto", "uvr_deecho"}: - if available_uvr_deecho_model: - log.config(f"Mature DeEcho模型: {available_uvr_deecho_model}") + if available_deecho_model: + log.config(f"Mature DeEcho模型: {available_deecho_model}") else: - log.config("Mature DeEcho模型: 未找到,将回退到主唱直通") + log.config("Mature DeEcho模型: 未找到;严格SOTA模式将停止处理") log.config(f"源约束模式: {normalized_source_constraint_mode}") # 官方模式也必须经过去混响预处理,确保输入RVC的是纯净干声 - # 官方模式下如果用户选了 direct,强制提升为 auto(带混响的人声会破坏F0提取) effective_preprocess_mode = normalized_vc_preprocess_mode - if normalized_vc_pipeline_mode == "official" and effective_preprocess_mode == "direct": - effective_preprocess_mode = "auto" - log.warning("官方模式:direct预处理已提升为auto,确保去混响后再进入RVC推理") vc_input_path = vocals_path vc_preprocessed = False - try: - prepared_path = self._prepare_vocals_for_vc(vocals_path, session_dir, preprocess_mode=effective_preprocess_mode) - vc_input_path = prepared_path - vc_preprocessed = True - log.audio(f"VC预处理输入: {Path(vc_input_path).name}") - except Exception as e: - log.warning(f"VC预处理失败,回退原始输入: {e}") + prepared_path = self._prepare_vocals_for_vc( + vocals_path, + session_dir, + preprocess_mode=effective_preprocess_mode, + ) + vc_input_path = prepared_path + vc_preprocessed = True + log.audio(f"VC预处理输入: {Path(vc_input_path).name}") report_progress("正在转换人声...", step_convert) + self._record_quality_debug( + session_dir=session_dir, + stage="vc_input", + candidate_path=vc_input_path, + reference_path=vocals_path if vc_input_path != vocals_path else None, + extra={ + "vc_preprocessed": vc_preprocessed, + "requested_preprocess_mode": normalized_vc_preprocess_mode, + "effective_preprocess_mode": effective_preprocess_mode, + }, + ) converted_vocals_path = str(session_dir / "converted_vocals.wav") log.model(f"加载RVC模型: {Path(model_path).name}") @@ -1857,6 +3476,14 @@ class CoverPipeline: protect=protect, 
speaker_id=speaker_id, ) + self._record_quality_debug( + session_dir=session_dir, + stage="vc_raw_upstream", + candidate_path=converted_vocals_path, + reference_path=vc_input_path, + snapshot_label="debug_converted_raw_upstream", + extra={"source_constraint_mode": normalized_source_constraint_mode}, + ) log.detail("内置官方模式:去混响干声 -> 官方RVC推理(纯净管道)") log.success("内置官方VC转换完成") elif normalized_vc_pipeline_mode == "official" and singing_repair: @@ -1879,6 +3506,14 @@ class CoverPipeline: speaker_id=speaker_id, repair_profile=True, ) + self._record_quality_debug( + session_dir=session_dir, + stage="vc_raw_repair", + candidate_path=converted_vocals_path, + reference_path=vc_input_path, + snapshot_label="debug_converted_raw_repair", + extra={"source_constraint_mode": normalized_source_constraint_mode}, + ) try: self._apply_silence_gate_official( vocals_path=vc_input_path, @@ -1903,11 +3538,12 @@ class CoverPipeline: log.warning(f"唱歌修复静音区抑制失败,保留当前结果: {e}") log.success("官方兼容唱歌修复转换完成") elif effective_use_official: - log.detail("使用当前项目官方封装VC进行转换") + log.detail("VC backend: upstream_official_raw + current postprocess") + log.detail("VC route detail: vendored upstream official raw -> current cleanup chain") log.config(f"F0方法: {f0_method}, 音调: {pitch_shift}, 索引率: {index_ratio}") log.config(f"滤波半径: {filter_radius}, RMS混合: {rms_mix_rate}, 保护: {protect}") - convert_vocals_official( + convert_vocals_official_upstream( vocals_path=vc_input_path, output_path=converted_vocals_path, model_path=model_path, @@ -1920,6 +3556,14 @@ class CoverPipeline: protect=protect, speaker_id=speaker_id, ) + self._record_quality_debug( + session_dir=session_dir, + stage="vc_raw", + candidate_path=converted_vocals_path, + reference_path=vc_input_path, + snapshot_label="debug_converted_raw", + extra={"source_constraint_mode": normalized_source_constraint_mode}, + ) if silence_gate: log.detail("启用静音门限(当前项目官方封装VC后处理)") self._apply_silence_gate_official( @@ -1944,12 +3588,30 @@ class CoverPipeline: 
converted_vocals_path=converted_vocals_path, original_vocals_path=vocals_path, ) + self._record_quality_debug( + session_dir=session_dir, + stage="vc_source_constrained", + candidate_path=converted_vocals_path, + reference_path=vc_input_path, + snapshot_label="debug_converted_source_constrained", + extra={"source_constraint_mode": normalized_source_constraint_mode}, + ) log.detail("Applied source-guided reconstruction to suppress echo/noise") self._refine_source_constrained_output( source_vocals_path=vc_input_path, converted_vocals_path=converted_vocals_path, source_constraint_mode=normalized_source_constraint_mode, f0_method=f0_method, + original_vocals_path=vocals_path, + session_dir=session_dir, + ) + self._record_quality_debug( + session_dir=session_dir, + stage="vc_refined", + candidate_path=converted_vocals_path, + reference_path=vc_input_path, + snapshot_label="debug_converted_refined", + extra={"source_constraint_mode": normalized_source_constraint_mode}, ) except Exception as e: log.warning(f"Source-guided reconstruction failed, keeping raw conversion: {e}") @@ -1961,6 +3623,14 @@ class CoverPipeline: source_vocals_path=vc_input_path, converted_vocals_path=converted_vocals_path, ) + self._record_quality_debug( + session_dir=session_dir, + stage="vc_gap_suppressed", + candidate_path=converted_vocals_path, + reference_path=vc_input_path, + snapshot_label="debug_converted_gap_suppressed", + extra={"source_constraint_mode": normalized_source_constraint_mode}, + ) log.detail("Source gap suppression: applied for mature/default route") except Exception as e: log.warning(f"Source gap suppression failed, keeping raw conversion: {e}") @@ -1970,32 +3640,7 @@ class CoverPipeline: log.warning("VC preprocess unavailable, skipping source-guided reconstruction") log.success("官方VC转换完成") - # 如果使用了advanced dereverb,重新应用原始混响(仅非官方模式) - if ( - not effective_official_mode - and not effective_use_official - and hasattr(self, '_original_reverb_path') - and 
self._original_reverb_path - and Path(self._original_reverb_path).exists() - ): - log.detail("重新应用原始混响到转换后的干声...") - import librosa - import soundfile as sf - - converted_dry, sr = librosa.load(converted_vocals_path, sr=None, mono=True) - original_reverb, reverb_sr = librosa.load(self._original_reverb_path, sr=None, mono=True) - - if reverb_sr != sr: - original_reverb = librosa.resample(original_reverb, orig_sr=reverb_sr, target_sr=sr).astype(np.float32) - - # 重新应用混响(80%强度) - wet_signal = apply_reverb_to_converted(converted_dry, original_reverb, mix_ratio=0.8) - - # 保存带混响的版本 - sf.write(converted_vocals_path, wet_signal, sr) - log.detail(f"混响重应用完成: mix_ratio=0.8") - - elif not effective_official_mode and not effective_use_official: + if not effective_official_mode and not effective_use_official: # 使用自定义VC管道进行转换 log.detail("使用自定义VC管道进行转换") self._init_rvc_pipeline() @@ -2047,6 +3692,14 @@ class CoverPipeline: silence_smoothing_ms=silence_smoothing_ms, silence_min_duration_ms=silence_min_duration_ms, ) + self._record_quality_debug( + session_dir=session_dir, + stage="vc_raw_current", + candidate_path=converted_vocals_path, + reference_path=vc_input_path, + snapshot_label="debug_converted_raw_current", + extra={"source_constraint_mode": normalized_source_constraint_mode}, + ) normalized_source_constraint_mode = str(source_constraint_mode or "auto").strip().lower() should_apply_source_constraint = self._should_apply_source_constraint( vc_preprocessed=vc_preprocessed, @@ -2060,12 +3713,30 @@ class CoverPipeline: converted_vocals_path=converted_vocals_path, original_vocals_path=vocals_path, ) + self._record_quality_debug( + session_dir=session_dir, + stage="vc_source_constrained_current", + candidate_path=converted_vocals_path, + reference_path=vc_input_path, + snapshot_label="debug_converted_source_constrained_current", + extra={"source_constraint_mode": normalized_source_constraint_mode}, + ) log.detail("Applied source-guided reconstruction to suppress echo/noise") 
self._refine_source_constrained_output( source_vocals_path=vc_input_path, converted_vocals_path=converted_vocals_path, source_constraint_mode=normalized_source_constraint_mode, f0_method=f0_method, + original_vocals_path=vocals_path, + session_dir=session_dir, + ) + self._record_quality_debug( + session_dir=session_dir, + stage="vc_refined_current", + candidate_path=converted_vocals_path, + reference_path=vc_input_path, + snapshot_label="debug_converted_refined_current", + extra={"source_constraint_mode": normalized_source_constraint_mode}, ) except Exception as e: log.warning(f"Source-guided reconstruction failed, keeping raw conversion: {e}") @@ -2077,6 +3748,14 @@ class CoverPipeline: source_vocals_path=vc_input_path, converted_vocals_path=converted_vocals_path, ) + self._record_quality_debug( + session_dir=session_dir, + stage="vc_gap_suppressed_current", + candidate_path=converted_vocals_path, + reference_path=vc_input_path, + snapshot_label="debug_converted_gap_suppressed_current", + extra={"source_constraint_mode": normalized_source_constraint_mode}, + ) log.detail("Source gap suppression: applied for mature/default route") except Exception as e: log.warning(f"Source gap suppression failed, keeping raw conversion: {e}") @@ -2093,6 +3772,18 @@ class CoverPipeline: log.detail("已清理设备缓存") # 记录转换结果 + self._record_quality_debug( + session_dir=session_dir, + stage="vc_final_state", + candidate_path=converted_vocals_path, + reference_path=vc_input_path if Path(vc_input_path).exists() else None, + extra={"source_constraint_mode": normalized_source_constraint_mode}, + ) + self._maybe_log_diagnostic_hint( + session_dir=session_dir, + model_path=model_path, + index_path=index_path, + ) converted_size = Path(converted_vocals_path).stat().st_size if Path(converted_vocals_path).exists() else 0 log.audio(f"转换后人声: {Path(converted_vocals_path).name} ({_format_size(converted_size)})") @@ -2139,6 +3830,17 @@ class CoverPipeline: reverb_amount=reverb_amount ) + 
self._record_quality_debug( + session_dir=session_dir, + stage="cover_mix", + candidate_path=cover_path, + reference_path=None, + extra={ + "vocals_volume": vocals_volume, + "accompaniment_volume": accompaniment_volume, + "reverb_amount": reverb_amount, + }, + ) cover_size = Path(cover_path).stat().st_size if Path(cover_path).exists() else 0 log.success(f"混音完成: {_format_size(cover_size)}") diff --git a/infer/f0_extractor.py b/infer/f0_extractor.py index 7aed32d30aa70ce7ea02c2c7a917cf07dfd818e1..38aa0a3ac4112b8bb827f9effc1ea7101a0cb3fc 100644 --- a/infer/f0_extractor.py +++ b/infer/f0_extractor.py @@ -1,17 +1,23 @@ # -*- coding: utf-8 -*- """ -F0 (基频) 提取模块 - 支持多种提取方法 +F0 extraction helpers used by the local VC pipeline. """ + +from __future__ import annotations + +from typing import Literal + import numpy as np import torch -from typing import Optional, Literal -# F0 提取方法类型 +from infer.quality_policy import build_conservative_crepe_fill_mask + + F0Method = Literal["rmvpe", "pm", "harvest", "crepe", "hybrid"] class F0Extractor: - """F0 提取器基类""" + """Base F0 extractor.""" def __init__(self, sample_rate: int = 16000, hop_length: int = 160): self.sample_rate = sample_rate @@ -20,34 +26,30 @@ class F0Extractor: self.f0_max = 1100 def extract(self, audio: np.ndarray) -> np.ndarray: - """提取 F0,子类需实现此方法""" raise NotImplementedError class PMExtractor(F0Extractor): - """Parselmouth (Praat) F0 提取器 - 速度快""" + """Praat/Parselmouth extractor.""" def extract(self, audio: np.ndarray) -> np.ndarray: import parselmouth time_step = self.hop_length / self.sample_rate sound = parselmouth.Sound(audio, self.sample_rate) - pitch = sound.to_pitch_ac( time_step=time_step, voicing_threshold=0.6, pitch_floor=self.f0_min, - pitch_ceiling=self.f0_max + pitch_ceiling=self.f0_max, ) - f0 = pitch.selected_array["frequency"] f0[f0 == 0] = np.nan - return f0 class HarvestExtractor(F0Extractor): - """PyWorld Harvest F0 提取器 - 质量较好""" + """PyWorld harvest extractor.""" def extract(self, audio: 
np.ndarray) -> np.ndarray: import pyworld @@ -58,26 +60,27 @@ class HarvestExtractor(F0Extractor): self.sample_rate, f0_floor=self.f0_min, f0_ceil=self.f0_max, - frame_period=self.hop_length / self.sample_rate * 1000 + frame_period=self.hop_length / self.sample_rate * 1000, ) - return f0 class CrepeExtractor(F0Extractor): - """TorchCrepe F0 提取器 - 深度学习方法""" - - def __init__(self, sample_rate: int = 16000, hop_length: int = 160, - device: str = "cuda"): + """TorchCrepe extractor.""" + + def __init__( + self, + sample_rate: int = 16000, + hop_length: int = 160, + device: str = "cuda", + ): super().__init__(sample_rate, hop_length) self.device = device def extract(self, audio: np.ndarray) -> np.ndarray: import torchcrepe - audio_tensor = torch.from_numpy(audio).float().unsqueeze(0) - audio_tensor = audio_tensor.to(self.device) - + audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device) f0, _ = torchcrepe.predict( audio_tensor, self.sample_rate, @@ -87,111 +90,120 @@ class CrepeExtractor(F0Extractor): model="full", batch_size=512, device=self.device, - return_periodicity=True + return_periodicity=True, ) - - f0 = f0.squeeze(0).cpu().numpy() - return f0 + return f0.squeeze(0).cpu().numpy() class RMVPEExtractor(F0Extractor): - """RMVPE F0 提取器 - 质量最高 (推荐)""" - - def __init__(self, model_path: str, sample_rate: int = 16000, - hop_length: int = 160, device: str = "cuda"): + """RMVPE extractor.""" + + def __init__( + self, + model_path: str, + sample_rate: int = 16000, + hop_length: int = 160, + device: str = "cuda", + ): super().__init__(sample_rate, hop_length) self.device = device self.model = None self.model_path = model_path - def load_model(self): - """加载 RMVPE 模型""" + def load_model(self) -> None: if self.model is not None: return from models.rmvpe import RMVPE self.model = RMVPE(self.model_path, device=self.device) - print(f"RMVPE 模型已加载: {self.device}") + print(f"RMVPE model loaded: {self.device}") def extract(self, audio: np.ndarray) -> 
np.ndarray: self.load_model() + return self.model.infer_from_audio(audio, thred=0.01) - # RMVPE 需要 16kHz 输入 - f0 = self.model.infer_from_audio(audio, thred=0.01) - - return f0 - - -def get_f0_extractor(method: F0Method, device: str = "cuda", - rmvpe_path: str = None, crepe_threshold: float = 0.05) -> F0Extractor: - """ - 获取 F0 提取器实例 - Args: - method: 提取方法 ("rmvpe", "pm", "harvest", "crepe", "hybrid") - device: 计算设备 - rmvpe_path: RMVPE 模型路径 (rmvpe/hybrid 方法需要) - crepe_threshold: CREPE置信度阈值 (仅hybrid方法使用) +def get_f0_extractor( + method: F0Method, + device: str = "cuda", + rmvpe_path: str | None = None, + crepe_threshold: float = 0.05, +) -> F0Extractor: + """Create an F0 extractor.""" - Returns: - F0Extractor: 提取器实例 - """ if method == "rmvpe": if rmvpe_path is None: - raise ValueError("RMVPE 方法需要指定模型路径") + raise ValueError("RMVPE requires a model path") return RMVPEExtractor(rmvpe_path, device=device) - elif method == "hybrid": + if method == "hybrid": if rmvpe_path is None: - raise ValueError("Hybrid 方法需要指定RMVPE模型路径") - return HybridF0Extractor(rmvpe_path, device=device, crepe_threshold=crepe_threshold) - elif method == "pm": + raise ValueError("Hybrid requires an RMVPE model path") + return HybridF0Extractor( + rmvpe_path, + device=device, + crepe_threshold=crepe_threshold, + ) + if method == "pm": return PMExtractor() - elif method == "harvest": + if method == "harvest": return HarvestExtractor() - elif method == "crepe": + if method == "crepe": return CrepeExtractor(device=device) - else: - raise ValueError(f"未知的 F0 提取方法: {method}") + raise ValueError(f"Unknown F0 method: {method}") class HybridF0Extractor(F0Extractor): - """混合F0提取器 - RMVPE主导 + CREPE高精度补充""" + """ + Conservative hybrid extractor. + + RMVPE remains the primary estimator. CREPE is only allowed to repair short, + high-confidence dropouts that sit inside already-voiced context. 
+ """ - def __init__(self, rmvpe_path: str, sample_rate: int = 16000, - hop_length: int = 160, device: str = "cuda", - crepe_threshold: float = 0.05): + def __init__( + self, + rmvpe_path: str, + sample_rate: int = 16000, + hop_length: int = 160, + device: str = "cuda", + crepe_threshold: float = 0.05, + max_fill_ratio: float = 0.02, + max_fill_frames: int = 320, + context_radius: int = 6, + ): super().__init__(sample_rate, hop_length) self.device = device self.rmvpe = RMVPEExtractor(rmvpe_path, sample_rate, hop_length, device) - self.crepe = None # 延迟加载 - self.crepe_threshold = crepe_threshold - - def _load_crepe(self): - """延迟加载CREPE模型""" - if self.crepe is None: - try: - self.crepe = CrepeExtractor(self.sample_rate, self.hop_length, self.device) - except ImportError: - print("警告: torchcrepe未安装,混合F0将仅使用RMVPE") - self.crepe = False + self.crepe = None + self.crepe_threshold = float(crepe_threshold) + self.max_fill_ratio = float(max_fill_ratio) + self.max_fill_frames = int(max_fill_frames) + self.context_radius = int(context_radius) + + def _load_crepe(self) -> None: + if self.crepe is not None: + return + + try: + self.crepe = CrepeExtractor( + self.sample_rate, + self.hop_length, + self.device, + ) + except ImportError: + print("Warning: torchcrepe is unavailable, hybrid falls back to RMVPE") + self.crepe = False def extract(self, audio: np.ndarray) -> np.ndarray: - """ - 混合提取F0: - 1. 使用RMVPE作为主要方法(快速、稳定) - 2. 
在RMVPE不稳定的区域使用CREPE补充(高精度) - """ - # 提取RMVPE F0 f0_rmvpe = self.rmvpe.extract(audio) - # 如果CREPE不可用,直接返回RMVPE结果 self._load_crepe() if self.crepe is False: return f0_rmvpe - # 提取CREPE F0和置信度 import torchcrepe + audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device) f0_crepe, confidence = torchcrepe.predict( audio_tensor, @@ -202,64 +214,39 @@ class HybridF0Extractor(F0Extractor): model="full", batch_size=512, device=self.device, - return_periodicity=True + return_periodicity=True, ) f0_crepe = f0_crepe.squeeze(0).cpu().numpy() confidence = confidence.squeeze(0).cpu().numpy() - # 对齐长度 min_len = min(len(f0_rmvpe), len(f0_crepe), len(confidence)) - f0_rmvpe = f0_rmvpe[:min_len] - f0_crepe = f0_crepe[:min_len] - confidence = confidence[:min_len] - - # 检测RMVPE不稳定区域 - # 1. F0跳变过大(超过3个半音) - f0_diff = np.abs(np.diff(f0_rmvpe, prepend=f0_rmvpe[0])) - semitone_diff = np.abs(12 * np.log2((f0_rmvpe + 1e-6) / (np.roll(f0_rmvpe, 1) + 1e-6))) - semitone_diff[0] = 0 - unstable_jump = semitone_diff > 3.0 - - # 2. CREPE置信度高但RMVPE给出F0=0 - unstable_unvoiced = (f0_rmvpe < 1e-3) & (confidence > self.crepe_threshold) - - # 3. 
RMVPE和CREPE差异过大(超过2个半音)且CREPE置信度高 - f0_ratio = (f0_crepe + 1e-6) / (f0_rmvpe + 1e-6) - semitone_gap = np.abs(12 * np.log2(f0_ratio)) - unstable_diverge = (semitone_gap > 2.0) & (confidence > self.crepe_threshold * 1.5) + if min_len <= 0: + return f0_rmvpe - # 合并不稳定区域 - unstable_mask = unstable_jump | unstable_unvoiced | unstable_diverge + f0_rmvpe = np.asarray(f0_rmvpe[:min_len], dtype=np.float32) + f0_crepe = np.asarray(f0_crepe[:min_len], dtype=np.float32) + confidence = np.asarray(confidence[:min_len], dtype=np.float32) + + fill_mask = build_conservative_crepe_fill_mask( + f0_rmvpe=f0_rmvpe, + f0_crepe=f0_crepe, + confidence=confidence, + confidence_threshold=self.crepe_threshold, + max_ratio=self.max_fill_ratio, + max_frames=self.max_fill_frames, + context_radius=self.context_radius, + ) - # 扩展不稳定区域(前后各2帧)以平滑过渡 - kernel = np.ones(5, dtype=bool) - unstable_mask = np.convolve(unstable_mask, kernel, mode='same') + if not np.any(fill_mask): + return f0_rmvpe - # 混合F0:不稳定区域使用CREPE,其他区域使用RMVPE f0_hybrid = f0_rmvpe.copy() - f0_hybrid[unstable_mask] = f0_crepe[unstable_mask] - - # 平滑过渡边界 - for i in range(1, len(f0_hybrid) - 1): - if unstable_mask[i] != unstable_mask[i-1]: - # 边界处使用加权平均 - w = 0.5 - f0_hybrid[i] = w * f0_rmvpe[i] + (1-w) * f0_crepe[i] - + f0_hybrid[fill_mask] = f0_crepe[fill_mask] return f0_hybrid def shift_f0(f0: np.ndarray, semitones: float) -> np.ndarray: - """ - 音调偏移 + """Shift F0 by semitones.""" - Args: - f0: 原始 F0 - semitones: 偏移半音数 (正数升调,负数降调) - - Returns: - np.ndarray: 偏移后的 F0 - """ factor = 2 ** (semitones / 12) - f0_shifted = f0 * factor - return f0_shifted + return f0 * factor diff --git a/infer/modules/uvr5/modules.py b/infer/modules/uvr5/modules.py index e2a511888cd0739720a30fa226831755225d0368..03b9b7ed08909855f88a21e71ed47a764f6f518f 100644 --- a/infer/modules/uvr5/modules.py +++ b/infer/modules/uvr5/modules.py @@ -4,7 +4,7 @@ import logging logger = logging.getLogger(__name__) -import ffmpeg +import av import torch from configs.config 
import Config @@ -20,6 +20,24 @@ except ImportError: config = Config() +def _read_audio_metadata(path: str) -> tuple[int, str]: + with av.open(path) as container: + stream = next((candidate for candidate in container.streams if candidate.type == "audio"), None) + if stream is None: + raise RuntimeError(f"No audio stream found in {path}") + + codec_context = stream.codec_context + channels = getattr(codec_context, "channels", None) + if not channels and getattr(stream, "layout", None) is not None: + channels = getattr(stream.layout, "nb_channels", None) + + sample_rate = getattr(codec_context, "sample_rate", None) or getattr(stream, "sample_rate", None) + if channels is None or sample_rate is None: + raise RuntimeError(f"Unable to read audio metadata from {path}") + + return int(channels), str(sample_rate) + + def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): infos = [] try: @@ -74,9 +92,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format need_reformat = 1 done = 0 try: - info = ffmpeg.probe(inp_path, cmd="ffprobe") - channels = info["streams"][0]["channels"] - sample_rate = info["streams"][0]["sample_rate"] + channels, sample_rate = _read_audio_metadata(inp_path) if log: log.audio(f"音频信息: {channels}声道, {sample_rate}Hz") diff --git a/infer/modules/vc/pipeline.py b/infer/modules/vc/pipeline.py index b92d0d760f73646bfc20ea1663d47af97b13d18f..8109f8a34f04456d46104544bbbd3cc4be915f39 100644 --- a/infer/modules/vc/pipeline.py +++ b/infer/modules/vc/pipeline.py @@ -29,6 +29,11 @@ except ImportError: log = None from lib.audio import soft_clip +from infer.quality_policy import ( + build_conservative_harvest_fill_mask, + compute_breath_preserving_energy_gates, + compute_chunk_crossfade_samples, +) bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) @@ -372,6 +377,12 @@ class Pipeline(object): self.crepe_pd_threshold = float(getattr(config, "crepe_pd_threshold", 0.1)) self.crepe_force_ratio = 
float(getattr(config, "crepe_force_ratio", 0.05)) self.crepe_replace_semitones = float(getattr(config, "crepe_replace_semitones", 0.0)) + self.unvoiced_feature_gate_floor = float( + getattr(config, "unvoiced_feature_gate_floor", 0.28) + ) + self.breath_active_margin_db = float( + getattr(config, "breath_active_margin_db", 52.0) + ) self.f0_fallback_context_radius = int(getattr(config, "f0_fallback_context_radius", 24)) self.f0_fallback_repair_gap = int(getattr(config, "f0_fallback_repair_gap", 12)) self.f0_fallback_post_gap = int(getattr(config, "f0_fallback_post_gap", 10)) @@ -394,6 +405,12 @@ class Pipeline(object): self.crepe_pd_threshold = 0.0 if self.crepe_replace_semitones < 0: self.crepe_replace_semitones = 0.0 + if self.unvoiced_feature_gate_floor < 0.05: + self.unvoiced_feature_gate_floor = 0.05 + if self.unvoiced_feature_gate_floor > 1.0: + self.unvoiced_feature_gate_floor = 1.0 + if self.breath_active_margin_db < 1.0: + self.breath_active_margin_db = 1.0 if self.f0_fallback_context_radius < 1: self.f0_fallback_context_radius = 1 if self.f0_fallback_repair_gap < 0: @@ -421,6 +438,10 @@ class Pipeline(object): f"F0混合: {self.f0_hybrid_mode}, CREPE阈值: {self.crepe_pd_threshold}, " f"强制比率: {self.crepe_force_ratio}, 替换阈值(半音): {self.crepe_replace_semitones}" ) + log.detail( + f"气声门控: 特征地板={self.unvoiced_feature_gate_floor:.2f}, " + f"激活边界=ref-{self.breath_active_margin_db:.1f}dB" + ) log.detail( f"F0兜底: 上下文半径={self.f0_fallback_context_radius}, " f"预修补长度={self.f0_fallback_repair_gap}, 后修补长度={self.f0_fallback_post_gap}, " @@ -466,12 +487,12 @@ class Pipeline(object): log.detail(f"时间步长: {time_step:.2f}ms, F0范围: {f0_min}-{f0_max}Hz") log.detail(f"音频长度: {len(x)} 样本, p_len: {p_len}") - # 将hybrid映射到rmvpe+crepe模式 + # Normalize direct hybrid requests to the conservative RMVPE fallback path. 
if f0_method == "hybrid": f0_method = "rmvpe" - # 临时设置hybrid模式 original_hybrid_mode = self.f0_hybrid_mode - self.f0_hybrid_mode = "rmvpe+crepe" + if self.f0_hybrid_mode not in self.rmvpe_fallback_modes: + self.f0_hybrid_mode = "fallback" restore_hybrid_mode = True else: restore_hybrid_mode = False @@ -704,7 +725,19 @@ class Pipeline(object): else: f0_fb = f0_fb[: len(f0)] - fill_mask = need_fill & (f0_fb > 0) + fill_mask = build_conservative_harvest_fill_mask( + reference_f0=f0, + fallback_f0=f0_fb, + dropout_mask=need_fill, + max_run=10, + local_radius=4, + max_semitones=4.0, + ) + rejected_fill_count = int(np.sum(need_fill)) - int(np.sum(fill_mask)) + if rejected_fill_count > 0 and log: + log.detail( + f"Harvest兜底已拒绝 {rejected_fill_count} 帧长间隙/离群音高,避免呼吸与回气误赋音高" + ) f0[fill_mask] = f0_fb[fill_mask] need_fill2 = (f0 <= 0) & energy_mask & voiced_context @@ -923,40 +956,58 @@ class Pipeline(object): _energy_db = 20.0 * np.log10(_frame_rms + 1e-8) _ref = energy_ref_db if energy_ref_db is not None else float(np.percentile(_energy_db, 95)) - # Soft gate: sigmoid curve centered at ref-45dB with 6dB transition width. - # Frames well above threshold → gain≈1; frames well below → gain≈0.05 - # (keep a small floor to avoid zero-feature shock to the network). 
- _silence_center = _ref - 45.0 - _transition_width = 6.0 # dB for the sigmoid ramp - _energy_gate = 1.0 / (1.0 + np.exp(-(_energy_db - _silence_center) / (_transition_width / 4.0))) - # Apply floor: never fully zero features (network handles near-zero better than hard zero) - _energy_gate = np.clip(_energy_gate, 0.05, 1.0) + _unvoiced_mask = None + if pitchf is not None: + _pitchf_np = pitchf[0].detach().float().cpu().numpy() + if len(_pitchf_np) > _p_len_val: + _pitchf_np = _pitchf_np[:_p_len_val] + elif len(_pitchf_np) < _p_len_val: + _pitchf_np = np.pad(_pitchf_np, (0, _p_len_val - len(_pitchf_np)), mode="edge") + _unvoiced_mask = _pitchf_np <= 1e-4 + + _feat_gate_curve, _f0_gate_curve = compute_breath_preserving_energy_gates( + energy_db=_energy_db, + ref_db=_ref, + unvoiced_mask=_unvoiced_mask, + quiet_floor=0.05, + breath_floor=self.unvoiced_feature_gate_floor, + breath_active_margin_db=self.breath_active_margin_db, + transition_width_db=6.0, + ) # Smooth temporally _sm = np.array([1, 2, 3, 2, 1], dtype=np.float32) _sm /= _sm.sum() - _energy_gate = np.convolve(_energy_gate, _sm, mode='same')[:_p_len_val] - _energy_gate = np.clip(_energy_gate, 0.05, 1.0) + _feat_gate_curve = np.convolve(_feat_gate_curve, _sm, mode='same')[:_p_len_val] + _f0_gate_curve = np.convolve(_f0_gate_curve, _sm, mode='same')[:_p_len_val] + _feat_gate_curve = np.clip(_feat_gate_curve, 0.05, 1.0) + _f0_gate_curve = np.clip(_f0_gate_curve, 0.05, 1.0) + if log and _unvoiced_mask is not None: + _breath_frames = int(np.sum(_feat_gate_curve > (_f0_gate_curve + 1e-4))) + if _breath_frames > 0: + log.detail( + f"气声保护门控: { _breath_frames }/{ _p_len_val } 帧保留更高特征底噪" + ) # Apply soft gate to features _feat_len = feats.shape[1] - if len(_energy_gate) > _feat_len: - _feat_gate = _energy_gate[:_feat_len] - elif len(_energy_gate) < _feat_len: - _feat_gate = np.pad(_energy_gate, (0, _feat_len - len(_energy_gate)), mode='constant', constant_values=1.0) + if len(_feat_gate_curve) > _feat_len: + 
_feat_gate = _feat_gate_curve[:_feat_len] + elif len(_feat_gate_curve) < _feat_len: + _feat_gate = np.pad(_feat_gate_curve, (0, _feat_len - len(_feat_gate_curve)), mode='constant', constant_values=1.0) else: - _feat_gate = _energy_gate + _feat_gate = _feat_gate_curve _gate_t = torch.from_numpy(_feat_gate.astype(np.float32)).to(feats.device).unsqueeze(0).unsqueeze(-1) feats = feats * _gate_t # F0 soft gating: consistently soft-attenuate both pitch confidence and pitch value if pitch is not None and pitchf is not None: _pitch_len = pitch.shape[1] - if len(_energy_gate) > _pitch_len: - _f0_gate = _energy_gate[:_pitch_len] - elif len(_energy_gate) < _pitch_len: - _f0_gate = np.pad(_energy_gate, (0, _pitch_len - len(_energy_gate)), mode='constant', constant_values=1.0) + if len(_f0_gate_curve) > _pitch_len: + _f0_gate = _f0_gate_curve[:_pitch_len] + elif len(_f0_gate_curve) < _pitch_len: + _f0_gate = np.pad(_f0_gate_curve, (0, _pitch_len - len(_f0_gate_curve)), mode='constant', constant_values=1.0) else: - _f0_gate = _energy_gate + _f0_gate = _f0_gate_curve _f0_gate_t = torch.from_numpy(_f0_gate.astype(np.float32)).to(pitch.device).unsqueeze(0) pitchf = pitchf * _f0_gate_t # Soft-blend pitch toward silence bin (1) instead of hard switch @@ -1075,6 +1126,9 @@ class Pipeline(object): ) if log: log.detail(f"分段数量: {len(opt_ts) + 1}") + if opt_ts: + boundary_times = [round(float(t) / 16000.0, 3) for t in opt_ts[:12]] + log.detail(f"分段边界(秒): {boundary_times}") else: if log: if self.disable_chunking: @@ -1140,10 +1194,14 @@ class Pipeline(object): segment_count = len(opt_ts) + 1 current_segment = 0 - # Crossfade length at target rate (~12ms). Each boundary segment - # keeps this many extra samples from the normally-trimmed padding - # region. The overlap between adjacent segments is 2 * _xfade_tgt. - _xfade_tgt = min(int(0.012 * tgt_sr), self.t_pad_tgt // 4) if len(opt_ts) > 0 else 0 + # Keep a wider overlap than the legacy 12ms crossfade. 
Adjacent + # chunks are synthesized independently and benefit from a longer, + # equal-power merge to suppress tearing at the boundaries. + _xfade_tgt = compute_chunk_crossfade_samples( + tgt_sr=tgt_sr, + t_pad_tgt=self.t_pad_tgt, + segment_count=segment_count, + ) def _trim_segment(raw, is_first, is_last): """Trim padding from vc() output, keeping crossfade overlap.""" @@ -1241,8 +1299,9 @@ class Pipeline(object): for seg in audio_opt[1:]: xf = min(overlap, len(result), len(seg)) if xf > 1: - fade_out = np.linspace(1.0, 0.0, xf, dtype=np.float32) - fade_in = 1.0 - fade_out + theta = np.linspace(0.0, np.pi / 2.0, xf, dtype=np.float32) + fade_out = np.cos(theta) + fade_in = np.sin(theta) blended = result[-xf:] * fade_out + seg[:xf] * fade_in result = np.concatenate([result[:-xf], blended, seg[xf:]]) else: diff --git a/infer/official_adapter.py b/infer/official_adapter.py index 776b3e0814d160a22ee380b161c86714227b2967..7aa24848ceb7ecc048476ae2d462e60433e6d176 100644 --- a/infer/official_adapter.py +++ b/infer/official_adapter.py @@ -16,9 +16,22 @@ from typing import Optional, Tuple import soundfile as sf from configs.config import Config as OfficialConfig +from infer.quality_policy import resolve_cover_f0_policy from lib.logger import log +class _IsolatedArgv: + """Prevent vendored official modules from parsing this process' CLI args.""" + + def __enter__(self): + self._saved_argv = sys.argv[:] + sys.argv = [sys.argv[0]] + + def __exit__(self, exc_type, exc, tb): + sys.argv = self._saved_argv + return False + + def _load_app_config(root_dir: Path) -> dict: config_path = root_dir / "configs" / "config.json" if config_path.exists(): @@ -41,6 +54,17 @@ def _to_float(value, default): return float(default) +def _get_audio_activity_stats(audio_path: Path) -> Tuple[float, float, int]: + audio, _ = sf.read(str(audio_path), dtype="float32", always_2d=True) + if audio.size == 0: + return 0.0, 0.0, 0 + mono = audio.mean(axis=1) + rms = float((mono.astype("float64") ** 2).mean() 
** 0.5) + peak = float(abs(mono).max()) + nonzero = int((abs(mono) > 1e-8).sum()) + return rms, peak, nonzero + + def _resolve_index_path(model_path: Path, index_path: Optional[str]) -> Optional[Path]: """Best-effort resolve of the matching FAISS index for a model.""" if index_path: @@ -189,13 +213,15 @@ def separate_uvr5( fmt: str = "wav" ) -> Tuple[str, str]: """Run UVR5 separation and return vocals/ins paths.""" + temp_dir = Path(temp_dir).resolve() log.progress("开始UVR5人声分离...") log.detail(f"输入音频: {input_audio}") log.detail(f"临时目录: {temp_dir}") log.config(f"激进度: {agg}, 输出格式: {fmt}") setup_official_env(Path(__file__).parent.parent) - from infer.modules.uvr5.modules import uvr as uvr5_run + with _IsolatedArgv(): + from infer.modules.uvr5.modules import uvr as uvr5_run try: import ffmpeg # noqa: F401 except Exception as e: @@ -246,6 +272,26 @@ def separate_uvr5( if not vocal_files or not ins_files: raise RuntimeError("UVR5 分离失败,未生成输出文件") + vocal_rms, vocal_peak, vocal_nonzero = _get_audio_activity_stats(vocal_files[-1]) + ins_rms, ins_peak, ins_nonzero = _get_audio_activity_stats(ins_files[-1]) + log.detail( + "UVR5输出统计: " + f"vocals(rms={vocal_rms:.8f}, peak={vocal_peak:.8f}, nonzero={vocal_nonzero}), " + f"ins(rms={ins_rms:.8f}, peak={ins_peak:.8f}, nonzero={ins_nonzero})" + ) + if vocal_peak <= 1e-6 or vocal_nonzero == 0: + raise RuntimeError( + "UVR5 分离失败:人声输出为空或静音," + f"模型={model_name}, vocal={vocal_files[-1]}, " + f"rms={vocal_rms:.8f}, peak={vocal_peak:.8f}, nonzero={vocal_nonzero}" + ) + if ins_peak <= 1e-6 or ins_nonzero == 0: + raise RuntimeError( + "UVR5 分离失败:伴奏输出为空或静音," + f"模型={model_name}, instrumental={ins_files[-1]}, " + f"rms={ins_rms:.8f}, peak={ins_peak:.8f}, nonzero={ins_nonzero}" + ) + log.success(f"UVR5分离完成") log.audio(f"人声文件: {vocal_files[-1].name}") log.audio(f"伴奏文件: {ins_files[-1].name}") @@ -271,6 +317,14 @@ def convert_vocals_official( rms_mix_rate = float(max(0.0, min(1.0, rms_mix_rate))) # Official vc pipeline uses the opposite 
convention: 1=off, 0=strongest. official_rms_mix_rate = 1.0 - rms_mix_rate + root_dir = Path(__file__).parent.parent + app_cfg = _load_app_config(root_dir) + f0_policy = resolve_cover_f0_policy( + requested_method=f0_method, + configured_hybrid_mode=str(_get_cfg_value(app_cfg, "f0_hybrid_mode", "off")), + repair_profile=repair_profile, + ) + effective_f0_method = f0_policy.vc_method log.progress("开始官方VC人声转换...") log.detail(f"输入人声: {vocals_path}") @@ -280,6 +334,14 @@ def convert_vocals_official( log.model(f"索引文件: {Path(index_path).name}") log.config(f"F0方法: {f0_method}") + if effective_f0_method != str(f0_method).strip().lower(): + log.detail( + "F0路由解析: " + f"requested={f0_policy.requested_method}, " + f"vc={f0_policy.vc_method}, " + f"hybrid_mode={f0_policy.hybrid_mode}, " + f"gate={f0_policy.gate_method}" + ) log.config(f"音调偏移: {pitch_shift} 半音") log.config(f"索引率: {index_rate}") log.config(f"滤波半径: {filter_radius}") @@ -287,8 +349,6 @@ def convert_vocals_official( log.config(f"保护系数: {protect}") if repair_profile: log.config("唱歌修复: 开启") - - root_dir = Path(__file__).parent.parent env_paths = setup_official_env(root_dir) log.detail("导入官方VC模块...") @@ -306,7 +366,6 @@ def convert_vocals_official( log.detail("初始化官方配置...") config = OfficialConfig() - app_cfg = _load_app_config(root_dir) config.disable_chunking = bool(app_cfg.get("disable_chunking", False)) if "cover" in app_cfg and isinstance(app_cfg["cover"], dict): config.disable_chunking = bool(app_cfg["cover"].get("disable_chunking", config.disable_chunking)) @@ -317,7 +376,7 @@ def convert_vocals_official( # Keep RMVPE extraction aligned with RVC training pitch embedding range. # Allowing much wider ranges (e.g. 1600Hz) often tracks higher harmonics # instead of the fundamental and introduces synthetic buzzing artifacts. 
- if f0_method == "rmvpe": + if effective_f0_method == "rmvpe": if config.f0_min != 50.0 or config.f0_max != 1100.0: log.warning( "检测到RMVPE F0范围偏离RVC训练范围,已强制使用 50-1100Hz 以避免误跟踪高次谐波" @@ -328,7 +387,7 @@ def convert_vocals_official( config.f0_energy_threshold_db = _to_float( _get_cfg_value(app_cfg, "f0_energy_threshold_db", -50), -50 ) - config.f0_hybrid_mode = str(_get_cfg_value(app_cfg, "f0_hybrid_mode", "off")) + config.f0_hybrid_mode = f0_policy.hybrid_mode config.crepe_pd_threshold = _to_float( _get_cfg_value(app_cfg, "crepe_pd_threshold", 0.1), 0.1 ) @@ -338,6 +397,12 @@ def convert_vocals_official( config.crepe_replace_semitones = _to_float( _get_cfg_value(app_cfg, "crepe_replace_semitones", 0.0), 0.0 ) + config.unvoiced_feature_gate_floor = _to_float( + _get_cfg_value(app_cfg, "unvoiced_feature_gate_floor", 0.28), 0.28 + ) + config.breath_active_margin_db = _to_float( + _get_cfg_value(app_cfg, "breath_active_margin_db", 52.0), 52.0 + ) config.f0_stabilize = bool(_get_cfg_value(app_cfg, "f0_stabilize", False)) config.f0_stabilize_window = int(_get_cfg_value(app_cfg, "f0_stabilize_window", 2)) config.f0_stabilize_max_semitones = _to_float( @@ -348,10 +413,32 @@ def convert_vocals_official( config.f0_rate_limit_semitones = _to_float( _get_cfg_value(app_cfg, "f0_rate_limit_semitones", 8.0), 8.0 ) + if f0_policy.requested_method == "hybrid": + config.f0_fallback_context_radius = 12 + config.f0_fallback_repair_gap = 6 + config.f0_fallback_post_gap = 4 + config.f0_fallback_use_crepe = True + config.f0_fallback_crepe_max_ratio = 0.006 + config.f0_fallback_crepe_max_frames = 160 + if not repair_profile: + config.f0_stabilize = True + config.f0_rate_limit = True + config.f0_stabilize_max_semitones = min( + 5.0, + float(config.f0_stabilize_max_semitones), + ) + config.f0_rate_limit_semitones = min( + 7.0, + float(config.f0_rate_limit_semitones), + ) + log.detail("Hybrid唱歌护栏已启用: F0稳定器 + F0限速") if repair_profile: config.is_half = False config.f0_hybrid_mode = "fallback" 
config.f0_energy_threshold_db = -42.0 + config.unvoiced_feature_gate_floor = max( + 0.32, float(config.unvoiced_feature_gate_floor) + ) config.f0_fallback_context_radius = 12 config.f0_fallback_repair_gap = 6 config.f0_fallback_post_gap = 4 @@ -365,6 +452,10 @@ def convert_vocals_official( log.config(f"F0范围: {config.f0_min}-{config.f0_max}Hz") log.config(f"RMVPE阈值: {config.rmvpe_threshold}") log.config(f"F0能量阈值: {config.f0_energy_threshold_db}dB") + log.config( + f"无声气声特征地板: {config.unvoiced_feature_gate_floor:.2f}, " + f"呼吸激活边界: ref-{config.breath_active_margin_db:.1f}dB" + ) log.config( f"F0混合: {config.f0_hybrid_mode}, CREPE阈值: {config.crepe_pd_threshold}, " f"强制比率: {config.crepe_force_ratio}, 替换阈值(半音): {config.crepe_replace_semitones}" @@ -402,7 +493,7 @@ def convert_vocals_official( vocals_path, pitch_shift, None, - f0_method, + effective_f0_method, official_index or "", "", index_rate, @@ -513,6 +604,13 @@ def convert_vocals_official_upstream( ) -> str: """Run vendored upstream official RVC in an isolated subprocess.""" root_dir = Path(__file__).parent.parent + app_cfg = _load_app_config(root_dir) + f0_policy = resolve_cover_f0_policy( + requested_method=f0_method, + configured_hybrid_mode=str(_get_cfg_value(app_cfg, "f0_hybrid_mode", "off")), + repair_profile=False, + ) + effective_f0_method = f0_policy.vc_method env_paths = setup_upstream_official_env(root_dir) sid, official_index = export_model_to_official( @@ -537,7 +635,7 @@ def convert_vocals_official_upstream( "--output-path", str(output_path), "--f0-method", - str(f0_method), + str(effective_f0_method), "--pitch-shift", str(int(pitch_shift)), "--index-path", @@ -558,6 +656,14 @@ def convert_vocals_official_upstream( log.detail(f"官方模型SID: {sid}") if official_index: log.detail(f"官方索引路径: {official_index}") + if effective_f0_method != str(f0_method).strip().lower(): + log.detail( + "官方F0路由解析: " + f"requested={f0_policy.requested_method}, " + f"vc={f0_policy.vc_method}, " + 
f"hybrid_mode={f0_policy.hybrid_mode}, " + f"gate={f0_policy.gate_method}" + ) log.detail(f"官方RMS混合率: {official_rms_mix_rate}") try: diff --git a/infer/quality_policy.py b/infer/quality_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..1a6623d10803d7af6fbf264fb0cd6307d11c28be --- /dev/null +++ b/infer/quality_policy.py @@ -0,0 +1,294 @@ +from __future__ import annotations + +from dataclasses import dataclass + +import numpy as np + + +FALLBACK_HYBRID_MODES = { + "fallback", + "smart", + "rmvpe+fallback", + "rmvpe_fallback", + "rmvpe-fallback", + "hybrid_fallback", + "hybrid-fallback", +} + + +@dataclass(frozen=True) +class F0RoutingPolicy: + requested_method: str + vc_method: str + hybrid_mode: str + gate_method: str + description: str + + +def resolve_cover_f0_policy( + requested_method: str, + configured_hybrid_mode: str = "off", + repair_profile: bool = False, +) -> F0RoutingPolicy: + requested = str(requested_method or "rmvpe").strip().lower() + configured = str(configured_hybrid_mode or "off").strip().lower() + + if requested != "hybrid": + return F0RoutingPolicy( + requested_method=requested, + vc_method=requested, + hybrid_mode=configured or "off", + gate_method=requested, + description=f"{requested} uses the configured routing directly.", + ) + + hybrid_mode = configured if configured in FALLBACK_HYBRID_MODES else "fallback" + if repair_profile: + hybrid_mode = "fallback" + + return F0RoutingPolicy( + requested_method="hybrid", + vc_method="rmvpe", + hybrid_mode=hybrid_mode, + gate_method="rmvpe", + description="hybrid request routed to RMVPE with conservative fallback; post-gate uses RMVPE only.", + ) + + +def build_conservative_crepe_fill_mask( + f0_rmvpe: np.ndarray, + f0_crepe: np.ndarray, + confidence: np.ndarray, + confidence_threshold: float, + max_ratio: float = 0.02, + max_frames: int = 320, + context_radius: int = 6, + energy_mask: np.ndarray | None = None, +) -> np.ndarray: + f0_rmvpe = np.asarray(f0_rmvpe, 
dtype=np.float32).reshape(-1) + f0_crepe = np.asarray(f0_crepe, dtype=np.float32).reshape(-1) + confidence = np.asarray(confidence, dtype=np.float32).reshape(-1) + n = min(len(f0_rmvpe), len(f0_crepe), len(confidence)) + if n == 0: + return np.zeros(0, dtype=bool) + + f0_rmvpe = f0_rmvpe[:n] + f0_crepe = f0_crepe[:n] + confidence = confidence[:n] + threshold = max(0.0, float(confidence_threshold)) + context_radius = max(1, int(context_radius)) + max_ratio = max(0.0, float(max_ratio)) + max_frames = max(0, int(max_frames)) + + if energy_mask is None: + energy_mask = np.ones(n, dtype=bool) + else: + energy_mask = np.asarray(energy_mask, dtype=bool).reshape(-1) + if len(energy_mask) < n: + energy_mask = np.pad(energy_mask, (0, n - len(energy_mask)), mode="edge") + else: + energy_mask = energy_mask[:n] + + voiced_seed = f0_rmvpe > 0 + if not np.any(voiced_seed): + return np.zeros(n, dtype=bool) + + idx = np.arange(n) + left_seen = np.where(voiced_seed, idx, -10**9) + left_seen = np.maximum.accumulate(left_seen) + right_seen = np.where(voiced_seed, idx, 10**9) + right_seen = np.minimum.accumulate(right_seen[::-1])[::-1] + voiced_context = ((idx - left_seen) <= context_radius) & ((right_seen - idx) <= context_radius) + + fill_mask = ( + (f0_rmvpe <= 0) + & (f0_crepe > 0) + & (confidence >= threshold) + & energy_mask + & voiced_context + ) + + fill_count = int(np.sum(fill_mask)) + if fill_count == 0: + return fill_mask + + fill_ratio = float(fill_count) / float(n) + if fill_count > max_frames or fill_ratio > max_ratio: + return np.zeros(n, dtype=bool) + + return fill_mask + + +def build_conservative_harvest_fill_mask( + reference_f0: np.ndarray, + fallback_f0: np.ndarray, + dropout_mask: np.ndarray, + max_run: int = 10, + local_radius: int = 4, + max_semitones: float = 4.0, + min_neighbors: int = 2, +) -> np.ndarray: + reference_f0 = np.asarray(reference_f0, dtype=np.float32).reshape(-1) + fallback_f0 = np.asarray(fallback_f0, dtype=np.float32).reshape(-1) + dropout_mask 
= np.asarray(dropout_mask, dtype=bool).reshape(-1) + + n = min(reference_f0.size, fallback_f0.size, dropout_mask.size) + if n <= 0: + return np.zeros(0, dtype=bool) + + reference_f0 = reference_f0[:n] + fallback_f0 = fallback_f0[:n] + dropout_mask = dropout_mask[:n] + accepted = np.zeros(n, dtype=bool) + + max_run = max(1, int(max_run)) + local_radius = max(1, int(local_radius)) + max_semitones = max(0.0, float(max_semitones)) + min_neighbors = max(1, int(min_neighbors)) + + padded = np.pad(dropout_mask.astype(np.int8), (1, 1), mode="constant") + edges = np.diff(padded) + starts = np.where(edges == 1)[0] + ends = np.where(edges == -1)[0] + eps = 1e-6 + + for start, end in zip(starts, ends): + run_slice = slice(start, end) + run_len = end - start + if run_len <= 0 or run_len > max_run: + continue + + run_fallback = fallback_f0[run_slice] + voiced_run = run_fallback > 0 + if not np.any(voiced_run): + continue + + left = reference_f0[max(0, start - local_radius) : start] + right = reference_f0[end : min(n, end + local_radius)] + neighbors = np.concatenate([left[left > 0], right[right > 0]]) + if neighbors.size < min_neighbors: + continue + + local_median = float(np.median(neighbors)) + if local_median <= 0: + continue + + semitone_diff = np.abs( + 12.0 * np.log2((run_fallback + eps) / (local_median + eps)) + ) + accepted[run_slice] = voiced_run & (semitone_diff <= max_semitones) + + return accepted + + +def compute_chunk_crossfade_samples( + tgt_sr: int, + t_pad_tgt: int, + segment_count: int, +) -> int: + tgt_sr = int(max(tgt_sr, 0)) + t_pad_tgt = int(max(t_pad_tgt, 0)) + segment_count = int(max(segment_count, 0)) + if tgt_sr <= 0 or t_pad_tgt <= 0 or segment_count <= 1: + return 0 + + base = int(round(tgt_sr * 0.018)) + extra = int(round(tgt_sr * 0.002 * max(0, segment_count - 2))) + min_crossfade = int(round(tgt_sr * 0.012)) + max_crossfade = max(min_crossfade, t_pad_tgt // 3) + return int(np.clip(base + extra, min_crossfade, max_crossfade)) + + +def 
compute_active_source_replace( + activity: np.ndarray, + soft_mask: np.ndarray, + echo_ratio: np.ndarray, + direct_ratio: np.ndarray, + max_replace: float = 0.82, +) -> np.ndarray: + activity = np.asarray(activity, dtype=np.float32).reshape(-1) + direct_ratio = np.asarray(direct_ratio, dtype=np.float32).reshape(-1) + soft_mask = np.asarray(soft_mask, dtype=np.float32) + echo_ratio = np.asarray(echo_ratio, dtype=np.float32) + + if soft_mask.ndim == 1: + soft_mask = soft_mask[np.newaxis, :] + if echo_ratio.ndim == 1: + echo_ratio = echo_ratio[np.newaxis, :] + + frame_count = min(soft_mask.shape[-1], echo_ratio.shape[-1], len(activity), len(direct_ratio)) + if frame_count <= 0: + return np.zeros_like(soft_mask, dtype=np.float32) + + soft_mask = soft_mask[..., :frame_count] + echo_ratio = echo_ratio[..., :frame_count] + activity = activity[:frame_count][np.newaxis, :] + direct_ratio = direct_ratio[:frame_count][np.newaxis, :] + + base_replace = 0.85 * (1.0 - activity) * (1.0 - soft_mask) + active_echo_presence = np.clip( + echo_ratio * (0.35 + 0.65 * (1.0 - direct_ratio)), + 0.0, + 1.0, + ) + active_replace = 0.65 * activity * active_echo_presence * (1.0 - soft_mask) + source_replace = np.clip(base_replace + active_replace, 0.0, float(max_replace)) + return source_replace.astype(np.float32) + + +def compute_source_cleanup_budget( + energy_guard: np.ndarray, + phrase_activity: np.ndarray, +) -> tuple[np.ndarray, np.ndarray]: + energy_guard = np.clip(np.asarray(energy_guard, dtype=np.float32), 0.0, 1.0) + phrase_activity = np.clip(np.asarray(phrase_activity, dtype=np.float32), 0.0, 1.0) + allowed_boost = 0.35 + 1.00 * energy_guard + cleanup_floor = 0.62 + 0.16 * phrase_activity + return allowed_boost.astype(np.float32), cleanup_floor.astype(np.float32) + + +def compute_breath_preserving_energy_gates( + energy_db: np.ndarray, + ref_db: float, + unvoiced_mask: np.ndarray | None, + quiet_floor: float = 0.05, + breath_floor: float = 0.28, + breath_active_margin_db: float = 
52.0, + transition_width_db: float = 6.0, +) -> tuple[np.ndarray, np.ndarray]: + energy_db = np.asarray(energy_db, dtype=np.float32).reshape(-1) + if energy_db.size == 0: + empty = np.zeros(0, dtype=np.float32) + return empty, empty + + quiet_floor = float(np.clip(quiet_floor, 0.0, 1.0)) + breath_floor = float(np.clip(max(breath_floor, quiet_floor), quiet_floor, 1.0)) + breath_active_margin_db = float(max(1.0, breath_active_margin_db)) + transition_width_db = float(max(0.5, transition_width_db)) + ref_db = float(ref_db) + + silence_center = ref_db - 45.0 + slope = transition_width_db / 4.0 + base_gate = 1.0 / (1.0 + np.exp(-((energy_db - silence_center) / slope))) + base_gate = np.clip(base_gate, quiet_floor, 1.0).astype(np.float32) + + if unvoiced_mask is None: + return base_gate, base_gate.copy() + + unvoiced_mask = np.asarray(unvoiced_mask, dtype=bool).reshape(-1) + if len(unvoiced_mask) < len(base_gate): + unvoiced_mask = np.pad(unvoiced_mask, (0, len(base_gate) - len(unvoiced_mask)), mode="edge") + else: + unvoiced_mask = unvoiced_mask[: len(base_gate)] + + breath_activity = np.clip( + (energy_db - (ref_db - breath_active_margin_db)) / 10.0, + 0.0, + 1.0, + ).astype(np.float32) + feature_floor = quiet_floor + (breath_floor - quiet_floor) * breath_activity + + feature_gate = base_gate.copy() + feature_gate[unvoiced_mask] = np.maximum(feature_gate[unvoiced_mask], feature_floor[unvoiced_mask]) + feature_gate = np.clip(feature_gate, quiet_floor, 1.0).astype(np.float32) + return feature_gate, base_gate.copy() diff --git a/infer/separator.py b/infer/separator.py index 74623671906aadbff3e758bab85da447483f240c..cdb64f7844787ab4a201ce4da29436bc57f74f12 100644 --- a/infer/separator.py +++ b/infer/separator.py @@ -3,13 +3,13 @@ 人声分离模块 - 支持 Demucs 和 Mel-Band Roformer (audio-separator) """ import os -import gc -import shutil -import torch -import numpy as np -import soundfile as sf -from pathlib import Path -from typing import Tuple, Optional, Callable +import gc +import 
shutil +import torch +import numpy as np +import soundfile as sf +from pathlib import Path +from typing import Tuple, Optional, Callable, Union from lib.logger import log from lib.device import get_device, empty_device_cache @@ -34,12 +34,82 @@ except ImportError: AUDIO_SEPARATOR_AVAILABLE = False -# Mel-Band Roformer 默认模型 -ROFORMER_DEFAULT_MODEL = "vocals_mel_band_roformer.ckpt" -KARAOKE_DEFAULT_MODEL = "mel_band_roformer_karaoke_gabox.ckpt" -KARAOKE_FALLBACK_MODELS = [ +ModelSpec = Union[str, list[str], tuple[str, ...]] + + +# Public scored SOTA defaults from audio-separator 0.44.1's model table. +# Keep the cover pipeline unchanged; only the separator model choices change. +ENSEMBLE_PRESET_PREFIX = "ensemble:" + +ROFORMER_LEGACY_SINGLE_MODEL = "vocals_mel_band_roformer.ckpt" +ROFORMER_SOTA_PRESET = "vocal_rvc" +ROFORMER_DEFAULT_MODEL = f"{ENSEMBLE_PRESET_PREFIX}{ROFORMER_SOTA_PRESET}" +ROFORMER_SOTA_MODEL = ROFORMER_DEFAULT_MODEL +ROFORMER_SOTA_MODELS = [ + "melband_roformer_big_beta6x.ckpt", + "mel_band_roformer_vocals_fv4_gabox.ckpt", +] + +KARAOKE_LEGACY_SINGLE_MODEL = "mel_band_roformer_karaoke_gabox.ckpt" +KARAOKE_SOTA_PRESET = "karaoke" +KARAOKE_DEFAULT_MODEL = f"{ENSEMBLE_PRESET_PREFIX}{KARAOKE_SOTA_PRESET}" +KARAOKE_SOTA_MODEL = KARAOKE_DEFAULT_MODEL +KARAOKE_SOTA_MODELS = [ "mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt", + "mel_band_roformer_karaoke_gabox_v2.ckpt", + "mel_band_roformer_karaoke_becruily.ckpt", ] +KARAOKE_EXPERIMENTAL_MODELS = [ + "mel_band_roformer_karaoke_gabox_v2.ckpt", + "mel_band_roformer_karaoke_becruily.ckpt", +] + +ROFORMER_DEREVERB_DEFAULT_MODEL = "dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt" + + +def _model_spec_key(model_spec: ModelSpec) -> tuple[str, ...]: + if isinstance(model_spec, (list, tuple)): + return tuple(str(item) for item in model_spec) + return (str(model_spec),) + + +def _model_spec_label(model_spec: ModelSpec) -> str: + if isinstance(model_spec, (list, tuple)): + return "ensemble[" + ", 
".join(str(item) for item in model_spec) + "]" + return str(model_spec) + + +def _parse_ensemble_preset(model_spec: ModelSpec) -> Optional[str]: + if not isinstance(model_spec, str): + return None + spec = model_spec.strip() + if not spec.lower().startswith(ENSEMBLE_PRESET_PREFIX): + return None + preset = spec[len(ENSEMBLE_PRESET_PREFIX):].strip() + return preset or None + + +def _load_audio_separator_model( + *, + model_spec: ModelSpec, + output_dir: str, + model_dir: str, +) -> Separator: + preset_name = _parse_ensemble_preset(model_spec) + separator_kwargs = { + "log_level": _logging.WARNING, + "output_dir": output_dir, + "model_file_dir": model_dir, + } + if preset_name: + separator_kwargs["ensemble_preset"] = preset_name + + separator = Separator(**separator_kwargs) + if preset_name: + separator.load_model() + else: + separator.load_model(list(model_spec) if isinstance(model_spec, tuple) else model_spec) + return separator def _resolve_output_files(output_files, output_dir: Path) -> list[str]: @@ -49,31 +119,59 @@ def _resolve_output_files(output_files, output_dir: Path) -> list[str]: file_path = Path(file_name) if not file_path.is_absolute(): file_path = output_dir / file_path + if file_path.exists(): + resolved_files.append(str(file_path)) + continue + + role = _classify_common_stem_role(file_path.name) + if role: + candidates = [ + candidate + for candidate in output_dir.glob("*.wav") + if _classify_common_stem_role(candidate.name) == role + ] + if len(candidates) == 1: + resolved_files.append(str(candidates[0])) + continue + resolved_files.append(str(file_path)) return resolved_files -def _safe_move(src_path: str, dst_path: str) -> None: - """Move file with overwrite.""" - if src_path == dst_path: - return +def _classify_common_stem_role(file_name: str) -> Optional[str]: + lower_name = file_name.lower() + if any(marker in lower_name for marker in ("(noreverb)", "(no_reverb)", "(no reverb)", "(dry)")): + return "dry" + if any(marker in lower_name for 
marker in ("(reverb)", "(echo)", "(wet)")): + return "wet" + if any(marker in lower_name for marker in ("(instrumental)", "(other)", "(backing)")): + return "backing" + if any(marker in lower_name for marker in ("(vocals)", "(lead)", "(main_vocal)", "(main vocals)")): + return "lead" + return None + + +def _safe_move(src_path: str, dst_path: str) -> None: + """Move file with overwrite.""" + if src_path == dst_path: + return dst = Path(dst_path) if dst.exists(): dst.unlink() - shutil.move(src_path, dst_path) - - -def _get_audio_activity_stats(audio_path: str) -> tuple[float, float, int]: - """Return simple activity stats for validating separator outputs.""" - audio, _ = sf.read(audio_path, dtype="float32", always_2d=True) - if audio.size == 0: - return 0.0, 0.0, 0 - - mono = np.mean(audio, axis=1, dtype=np.float32) - rms = float(np.sqrt(np.mean(np.square(mono), dtype=np.float64) + 1e-12)) - peak = float(np.max(np.abs(mono))) - nonzero = int(np.count_nonzero(np.abs(mono) > 1e-6)) - return rms, peak, nonzero + shutil.move(src_path, dst_path) + + +def _get_audio_activity_stats(audio_path: str) -> tuple[float, float, int]: + """Return simple activity stats for validating separator outputs.""" + audio, _ = sf.read(audio_path, dtype="float32", always_2d=True) + if audio.size == 0: + return 0.0, 0.0, 0 + + mono = np.mean(audio, axis=1, dtype=np.float32) + rms = float(np.sqrt(np.mean(np.square(mono), dtype=np.float64) + 1e-12)) + peak = float(np.max(np.abs(mono))) + nonzero = int(np.count_nonzero(np.abs(mono) > 1e-6)) + return rms, peak, nonzero class RoformerSeparator: @@ -81,19 +179,21 @@ class RoformerSeparator: def __init__( self, - model_filename: str = ROFORMER_DEFAULT_MODEL, + model_filename: ModelSpec = ROFORMER_DEFAULT_MODEL, device: str = "cuda", ): if not AUDIO_SEPARATOR_AVAILABLE: raise ImportError( "请安装 audio-separator: pip install audio-separator[gpu]" - ) + ) self.model_filename = model_filename + self.model_candidates = [model_filename] self.device = 
str(get_device(device)) self.separator = None + self.active_model = None def load_model(self, output_dir: str = ""): - """加载 Roformer 模型""" + """加载指定 RoFormer 模型;严格 SOTA 模式下不自动降级。""" model_dir = str( Path(__file__).parent.parent / "assets" / "separator_models" ) @@ -113,16 +213,23 @@ class RoformerSeparator: self.separator = None gc.collect() - log.info(f"正在加载 Mel-Band Roformer 模型: {self.model_filename}") - - self.separator = Separator( - log_level=_logging.WARNING, + model_name = self.model_filename + log.info( + "正在加载公开 SOTA RoFormer 分离模型: " + f"{_model_spec_label(model_name)}" + ) + separator = _load_audio_separator_model( + model_spec=model_name, output_dir=target_dir, - model_file_dir=model_dir, + model_dir=model_dir, ) + self.separator = separator self._init_output_dir = target_dir - self.separator.load_model(self.model_filename) - log.info("Mel-Band Roformer 模型已加载") + self.active_model = model_name + log.info( + "RoFormer 分离模型已加载: " + f"{_model_spec_label(model_name)}" + ) def separate( self, @@ -142,14 +249,12 @@ class RoformerSeparator: if progress_callback: progress_callback("正在加载 Roformer 模型...", 0.1) - self.load_model(output_dir=str(output_path)) + if progress_callback: + progress_callback("正在使用 RoFormer 分离人声...", 0.3) + self.load_model(output_dir=str(output_path)) # audio-separator 需要 output_dir 在实例上设置 self.separator.output_dir = str(output_path) - - if progress_callback: - progress_callback("正在使用 Mel-Band Roformer 分离人声...", 0.3) - output_files = self.separator.separate(audio_path) # audio-separator 返回的可能是纯文件名,需要拼上 output_dir @@ -160,7 +265,7 @@ class RoformerSeparator: p = output_path / p resolved_files.append(str(p)) - # Fallback: if resolved files don't exist, search the output dir + # Recovery: if resolved files don't exist, search the output dir # for freshly created files. This handles cases where audio-separator # writes to a slightly different path (e.g. after output_dir update # on a reused Separator instance). 
@@ -233,6 +338,7 @@ class RoformerSeparator: if self.separator is not None: del self.separator self.separator = None + self.active_model = None gc.collect() empty_device_cache() @@ -242,7 +348,7 @@ class KaraokeSeparator: def __init__( self, - model_filename: str = KARAOKE_DEFAULT_MODEL, + model_filename: ModelSpec = KARAOKE_DEFAULT_MODEL, device: str = "cuda", ): if not AUDIO_SEPARATOR_AVAILABLE: @@ -252,15 +358,11 @@ class KaraokeSeparator: self.device = str(get_device(device)) self.separator = None self.active_model = None - - models = [model_filename] - for fallback in KARAOKE_FALLBACK_MODELS: - if fallback not in models: - models.append(fallback) - self.model_candidates = models + self.model_filename = model_filename + self.model_candidates = [model_filename] def load_model(self, output_dir: str = ""): - """加载 Karaoke 模型(主模型失败时自动回退)""" + """加载指定 Karaoke 模型;严格 SOTA 模式下不自动降级。""" model_dir = str(Path(__file__).parent.parent / "assets" / "separator_models") Path(model_dir).mkdir(parents=True, exist_ok=True) @@ -277,26 +379,23 @@ class KaraokeSeparator: self.active_model = None gc.collect() - last_error = None - for model_name in self.model_candidates: - try: - log.info(f"正在加载 Karaoke 模型: {model_name}") - separator = Separator( - log_level=_logging.WARNING, - output_dir=target_dir, - model_file_dir=model_dir, - ) - separator.load_model(model_name) - self.separator = separator - self._init_output_dir = target_dir - self.active_model = model_name - log.info("Karaoke 模型已加载") - return - except Exception as exc: - last_error = exc - log.warning(f"Karaoke 模型加载失败: {model_name} ({exc})") - - raise RuntimeError(f"无法加载 Karaoke 模型: {last_error}") + model_name = self.model_filename + log.info( + "正在加载公开 SOTA Karaoke 模型: " + f"{_model_spec_label(model_name)}" + ) + separator = _load_audio_separator_model( + model_spec=model_name, + output_dir=target_dir, + model_dir=model_dir, + ) + self.separator = separator + self._init_output_dir = target_dir + self.active_model = model_name 
+ log.info( + "Karaoke 模型已加载: " + f"{_model_spec_label(model_name)}" + ) @staticmethod def _classify_stem(file_name: str) -> Optional[str]: @@ -332,36 +431,37 @@ class KaraokeSeparator: return "backing" return None - def separate(self, audio_path: str, output_dir: str) -> Tuple[str, str]: - """ - 分离主唱和和声 - - Returns: + def separate(self, audio_path: str, output_dir: str) -> Tuple[str, str]: + """ + 分离主唱和和声 + + Returns: Tuple[lead_vocals_path, backing_vocals_path] """ output_path = Path(output_dir) output_path.mkdir(parents=True, exist_ok=True) - self.load_model(output_dir=str(output_path)) - self.separator.output_dir = str(output_path) - output_files = self.separator.separate(audio_path) - resolved_files = _resolve_output_files(output_files, output_path) - log.detail( - f"Karaoke分离器输出文件: {[Path(file_path).name for file_path in resolved_files]}" - ) - - lead_vocals_path = None - backing_vocals_path = None - for file_path in resolved_files: - stem_role = self._classify_stem(Path(file_path).name) - log.detail( - f" {Path(file_path).name} -> 分类为: {stem_role or 'unknown'}" - ) - if stem_role == "lead" and lead_vocals_path is None: - lead_vocals_path = file_path - elif stem_role == "backing" and backing_vocals_path is None: - backing_vocals_path = file_path - + self.load_model(output_dir=str(output_path)) + self.separator.output_dir = str(output_path) + output_files = self.separator.separate(audio_path) + + resolved_files = _resolve_output_files(output_files, output_path) + log.detail( + f"Karaoke分离器输出文件: {[Path(file_path).name for file_path in resolved_files]}" + ) + + lead_vocals_path = None + backing_vocals_path = None + for file_path in resolved_files: + stem_role = self._classify_stem(Path(file_path).name) + log.detail( + f" {Path(file_path).name} -> 分类为: {stem_role or 'unknown'}" + ) + if stem_role == "lead" and lead_vocals_path is None: + lead_vocals_path = file_path + elif stem_role == "backing" and backing_vocals_path is None: + backing_vocals_path = file_path + 
if lead_vocals_path is None and resolved_files: lead_vocals_path = resolved_files[0] if backing_vocals_path is None: @@ -374,30 +474,30 @@ class KaraokeSeparator: raise FileNotFoundError( f"Karaoke主唱轨未找到,输出文件: {[Path(p).name for p in resolved_files]}" ) - if not backing_vocals_path or not Path(backing_vocals_path).exists(): - raise FileNotFoundError( - f"Karaoke和声轨未找到,输出文件: {[Path(p).name for p in resolved_files]}" - ) - - lead_rms, lead_peak, lead_nonzero = _get_audio_activity_stats(lead_vocals_path) - backing_rms, backing_peak, backing_nonzero = _get_audio_activity_stats(backing_vocals_path) - log.detail( - "Karaoke输出能量检测: " - f"lead_rms={lead_rms:.6f}, lead_peak={lead_peak:.6f}, lead_nonzero={lead_nonzero}; " - f"backing_rms={backing_rms:.6f}, backing_peak={backing_peak:.6f}, backing_nonzero={backing_nonzero}" - ) - - lead_is_nearly_silent = lead_nonzero == 0 or (lead_rms < 1e-5 and lead_peak < 1e-4) - backing_has_content = backing_nonzero > 0 and (backing_rms >= 5e-5 or backing_peak >= 5e-4) - if lead_is_nearly_silent and backing_has_content: - log.warning("Karaoke主唱轨几乎静音,检测到输出疑似反转,已自动交换主唱/和声") - lead_vocals_path, backing_vocals_path = backing_vocals_path, lead_vocals_path - - final_lead = str(output_path / "lead_vocals.wav") - final_backing = str(output_path / "backing_vocals.wav") - _safe_move(lead_vocals_path, final_lead) - _safe_move(backing_vocals_path, final_backing) - + if not backing_vocals_path or not Path(backing_vocals_path).exists(): + raise FileNotFoundError( + f"Karaoke和声轨未找到,输出文件: {[Path(p).name for p in resolved_files]}" + ) + + lead_rms, lead_peak, lead_nonzero = _get_audio_activity_stats(lead_vocals_path) + backing_rms, backing_peak, backing_nonzero = _get_audio_activity_stats(backing_vocals_path) + log.detail( + "Karaoke输出能量检测: " + f"lead_rms={lead_rms:.6f}, lead_peak={lead_peak:.6f}, lead_nonzero={lead_nonzero}; " + f"backing_rms={backing_rms:.6f}, backing_peak={backing_peak:.6f}, backing_nonzero={backing_nonzero}" + ) + + 
class RoformerDereverbSeparator:
    """Learned RoFormer de-reverb/de-echo stage that outputs drier vocals for VC."""

    def __init__(
        self,
        model_filename: str = ROFORMER_DEREVERB_DEFAULT_MODEL,
        device: str = "cuda",
    ):
        """Record which model to use; the model is loaded later by ``load_model``.

        Args:
            model_filename: De-reverb model file name.
            device: Requested device, resolved through ``get_device``.

        Raises:
            ImportError: If audio-separator is not available.
        """
        if not AUDIO_SEPARATOR_AVAILABLE:
            raise ImportError(
                "请安装 audio-separator: pip install audio-separator[gpu]"
            )
        self.device = str(get_device(device))
        self.separator = None
        self.active_model = None
        self.model_filename = model_filename
        self.model_candidates = [model_filename]

    def load_model(self, output_dir: str = ""):
        """Load the de-reverb model, reusing the loaded instance when possible.

        Args:
            output_dir: Directory the separator writes to. When empty, the
                ``temp/separator`` directory two levels above this module
                is used.
        """
        model_dir = str(Path(__file__).parent.parent / "assets" / "separator_models")
        Path(model_dir).mkdir(parents=True, exist_ok=True)

        target_dir = output_dir or str(
            Path(__file__).parent.parent / "temp" / "separator"
        )

        if self.separator is not None:
            # Already loaded for this output directory: nothing to do.
            if getattr(self, "_init_output_dir", None) == target_dir:
                return
            # Output directory changed: drop the old instance and rebuild.
            del self.separator
            self.separator = None
            self.active_model = None
            gc.collect()

        model_name = self.model_filename
        log.info(f"正在加载 RoFormer De-Reverb 模型: {model_name}")
        separator = _load_audio_separator_model(
            model_spec=model_name,
            output_dir=target_dir,
            model_dir=model_dir,
        )
        self.separator = separator
        self._init_output_dir = target_dir
        self.active_model = model_name
        log.info(f"RoFormer De-Reverb 模型已加载: {model_name}")

    @staticmethod
    def _classify_stem(file_name: str) -> Optional[str]:
        """Classify an output file name as ``"dry"``, ``"wet"`` or ``None``.

        Parenthesised stem markers are matched first, wet before dry.
        Un-parenthesised substrings are used only as a fallback.
        """
        lower_name = file_name.lower()

        dry_markers = [
            "(dry)",
            "(noreverb)",
            "(no_reverb)",
            "(no reverb)",
            "(dereverb)",
            "(de-reverb)",
            "(vocals)",
            "(primary)",
        ]
        wet_markers = [
            "(no dry)",
            "(no_dry)",
            "(reverb)",
            "(echo)",
            "(wet)",
            "(secondary)",
            "(instrumental)",
            "(other)",
        ]

        for marker in wet_markers:
            if marker in lower_name:
                return "wet"
        for marker in dry_markers:
            if marker in lower_name:
                return "dry"

        # Fallback on bare substrings. "no dry" must be tested before "dry":
        # every name containing "no dry" also contains "dry", so the opposite
        # order would report the wet stem as the dry one.
        if "no dry" in lower_name or "no_dry" in lower_name:
            return "wet"
        if "dry" in lower_name or "noreverb" in lower_name or "vocal" in lower_name:
            return "dry"
        if "reverb" in lower_name or "echo" in lower_name:
            return "wet"
        return None

    def separate_dry(self, audio_path: str, output_dir: str) -> str:
        """Run de-reverb on ``audio_path`` and return the path of the dry stem.

        The dry stem is moved to ``roformer_deecho_vocals.wav`` inside
        ``output_dir``.

        Raises:
            FileNotFoundError: If no output file is classified as dry, or the
                classified file does not exist.
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        self.load_model(output_dir=str(output_path))
        self.separator.output_dir = str(output_path)
        output_files = self.separator.separate(audio_path)

        resolved_files = _resolve_output_files(output_files, output_path)
        log.detail(
            "RoFormer De-Reverb 输出文件: "
            f"{[Path(file_path).name for file_path in resolved_files]}"
        )

        dry_path = None
        for file_path in resolved_files:
            stem_role = self._classify_stem(Path(file_path).name)
            log.detail(
                f" {Path(file_path).name} -> 分类为: {stem_role or 'unknown'}"
            )
            if stem_role == "dry":
                # The first dry stem wins.
                dry_path = file_path
                break

        if not dry_path or not Path(dry_path).exists():
            raise FileNotFoundError(
                f"RoFormer De-Reverb dry轨未找到,输出文件: {[Path(p).name for p in resolved_files]}"
            )

        final_dry = str(output_path / "roformer_deecho_vocals.wav")
        _safe_move(dry_path, final_dry)
        return final_dry

    def unload_model(self):
        """Drop the loaded separator and release cached device memory."""
        if self.separator is not None:
            del self.separator
            self.separator = None
            self.active_model = None
            gc.collect()
            empty_device_cache()
def get_available_models() -> list: if AUDIO_SEPARATOR_AVAILABLE: models.append({ "name": "roformer", - "description": "Mel-Band Roformer (Kimberley Jensen) - 高质量人声分离" + "description": "audio-separator public scored SOTA - 最高质量人声/伴奏分离" }) if DEMUCS_AVAILABLE: models.extend([ diff --git a/install.py b/install.py new file mode 100644 index 0000000000000000000000000000000000000000..8c33451c31b59627a35508a5b0d7453edb8c1bad --- /dev/null +++ b/install.py @@ -0,0 +1,396 @@ +# -*- coding: utf-8 -*- +""" +RVC 安装脚本 +自动创建虚拟环境 → 安装依赖 → 启动应用 + +用法: + python install.py # 完整安装并启动 + python install.py --check # 仅检查依赖 + python install.py --no-run # 安装但不启动 + python install.py --cpu # 安装 CPU 版本 +""" +import subprocess +import sys +import os +from pathlib import Path + +ROOT_DIR = Path(__file__).parent +VENV_DIR = ROOT_DIR / "venv310" + +PYTHON310_CANDIDATES = [ + r"C:\Users\Administrator\AppData\Local\Programs\Python\Python310\python.exe", + r"C:\Python310\python.exe", + r"C:\Program Files\Python310\python.exe", + r"C:\Program Files (x86)\Python310\python.exe", +] + +PACKAGES = { + "torch": {"import": "torch", "name": "PyTorch", "pip": "torch"}, + "torchaudio": {"import": "torchaudio", "name": "torchaudio", "pip": "torchaudio"}, + "gradio": {"import": "gradio", "name": "Gradio", "pip": "gradio==3.50.2"}, + "librosa": {"import": "librosa", "name": "librosa", "pip": "librosa"}, + "soundfile": {"import": "soundfile", "name": "soundfile", "pip": "soundfile"}, + "av": {"import": "av", "name": "PyAV", "pip": "av"}, + "scipy": {"import": "scipy", "name": "scipy", "pip": "scipy"}, + "numpy": { + "import": "numpy", + "name": "numpy", + "pip": "numpy<2,>=1.23.0", + "dist": "numpy", + "min_version": "1.23.0", + "max_exclusive_version": "2.0.0", + }, + "parselmouth": {"import": "parselmouth", "name": "praat-parselmouth", "pip": "praat-parselmouth"}, + "pyworld": {"import": "pyworld", "name": "pyworld", "pip": "pyworld"}, + "torchcrepe": {"import": "torchcrepe", "name": "torchcrepe", "pip": 
"torchcrepe"}, + "faiss": {"import": "faiss", "name": "faiss-cpu", "pip": "faiss-cpu"}, + "tqdm": {"import": "tqdm", "name": "tqdm", "pip": "tqdm"}, + "requests": {"import": "requests", "name": "requests", "pip": "requests"}, + "dotenv": {"import": "dotenv", "name": "python-dotenv", "pip": "python-dotenv"}, + "colorama": {"import": "colorama", "name": "colorama", "pip": "colorama"}, + "mcp": {"import": "mcp", "name": "mcp", "pip": "mcp"}, + "demucs": {"import": "demucs", "name": "demucs", "pip": "demucs"}, + "audio_separator": { + "import": "audio_separator", + "name": "audio-separator", + "pip": "audio-separator", + "dist": "audio-separator", + "min_version": "0.44.1", + }, + "huggingface_hub": {"import": "huggingface_hub", "name": "huggingface_hub", "pip": "huggingface_hub"}, + "pedalboard": {"import": "pedalboard", "name": "pedalboard", "pip": "pedalboard"}, + "ffmpeg": {"import": "ffmpeg", "name": "ffmpeg-python", "pip": "ffmpeg-python"}, + "fairseq": {"import": "fairseq", "name": "fairseq", "pip": "fairseq==0.12.2"}, +} + +# === 虚拟环境 === + +def find_python310(): + """查找系统中的 Python 3.10""" + if sys.version_info[:2] == (3, 10): + return sys.executable + for p in PYTHON310_CANDIDATES: + if os.path.isfile(p): + return p + try: + r = subprocess.run( + ["py", "-3.10", "-c", "import sys; print(sys.executable)"], + capture_output=True, text=True, timeout=10, + ) + if r.returncode == 0: + return r.stdout.strip() + except (FileNotFoundError, subprocess.TimeoutExpired): + pass + return None + + +def get_venv_python(): + """获取虚拟环境的 Python 路径""" + if os.name == "nt": + return str(VENV_DIR / "Scripts" / "python.exe") + return str(VENV_DIR / "bin" / "python") + + +def create_venv(): + """创建 Python 3.10 虚拟环境""" + venv_py = get_venv_python() + if os.path.isfile(venv_py): + r = subprocess.run([venv_py, "--version"], capture_output=True, text=True) + if r.returncode == 0 and "3.10" in r.stdout: + print(f" [OK] 虚拟环境已存在: {VENV_DIR}") + return True + + py310 = find_python310() + if 
not py310: + print(" [错误] 未找到 Python 3.10") + print(" 下载: https://www.python.org/downloads/release/python-31011/") + return False + + print(f" 使用 Python: {py310}") + print(f" 创建虚拟环境: {VENV_DIR}") + r = subprocess.run([py310, "-m", "venv", str(VENV_DIR)], capture_output=True, text=True) + if r.returncode != 0: + print(f" [错误] 创建失败:\n{r.stderr}") + return False + + print(" [OK] 虚拟环境创建成功") + print(" 升级 pip ...") + subprocess.run([venv_py, "-m", "pip", "install", "--upgrade", "pip"], + capture_output=True, text=True) + return True + +# === 依赖检查与安装 === + +def check_package(venv_py, import_name): + """用虚拟环境的 Python 检查包是否已安装""" + r = subprocess.run( + [venv_py, "-c", f"import {import_name}"], + capture_output=True, text=True, + ) + return r.returncode == 0 + + +def get_installed_version(venv_py, distribution_name): + """返回虚拟环境中已安装发行包版本;无法读取时返回 None。""" + code = ( + "from importlib.metadata import PackageNotFoundError, version\n" + f"dist = {distribution_name!r}\n" + "try:\n" + " print(version(dist))\n" + "except PackageNotFoundError:\n" + " raise SystemExit(1)\n" + ) + r = subprocess.run([venv_py, "-c", code], capture_output=True, text=True) + if r.returncode != 0: + return None + return r.stdout.strip() or None + + +def _version_parts(version_text): + parts = [] + for part in str(version_text or "").split("."): + digits = "" + for char in part: + if not char.isdigit(): + break + digits += char + parts.append(int(digits or 0)) + return tuple(parts) + + +def _version_at_least(installed, required): + installed_parts = _version_parts(installed) + required_parts = _version_parts(required) + width = max(len(installed_parts), len(required_parts)) + installed_parts += (0,) * (width - len(installed_parts)) + required_parts += (0,) * (width - len(required_parts)) + return installed_parts >= required_parts + + +def _version_less_than(installed, upper_bound): + installed_parts = _version_parts(installed) + upper_parts = _version_parts(upper_bound) + width = max(len(installed_parts), 
def pip_install(venv_py, package, extra="", index_url=None, no_deps=False, version_spec=""):
    """Install a package with the virtual environment's pip.

    Args:
        venv_py: Path to the virtual environment's Python interpreter.
        package: Distribution name (or a full requirement string).
        extra: Optional extras name, rendered as ``package[extra]``.
        index_url: Optional value for ``--index-url``.
        no_deps: When true, pass ``--no-deps`` to pip.
        version_spec: Optional version specifier appended to the target,
            e.g. ``">=0.44.1"``.

    Returns:
        True when pip exits with status 0, otherwise False.
    """
    target = f"{package}[{extra}]{version_spec}" if extra else f"{package}{version_spec}"
    print(f" 安装 {target} ...")
    cmd = [venv_py, "-m", "pip", "install", target]
    if index_url:
        cmd.extend(["--index-url", index_url])
    if no_deps:
        cmd.append("--no-deps")
    r = subprocess.run(cmd, capture_output=True, text=True)
    if r.returncode != 0:
        # On a dependency conflict (seen with fairseq) retry once with
        # --no-deps. The retry must carry version_spec as well, otherwise
        # the fallback installs an unconstrained version.
        if not no_deps and "ResolutionImpossible" in r.stderr:
            print(f" [依赖冲突] 尝试 --no-deps 安装 {target} ...")
            return pip_install(
                venv_py,
                package,
                extra=extra,
                index_url=index_url,
                no_deps=True,
                version_spec=version_spec,
            )
        print(f" [失败] {target}")
        lines = r.stderr.strip().splitlines()
        if lines:
            # Only the last stderr line is shown.
            print(f" {lines[-1]}")
        return False
    print(f" [完成] {target}")
    return True
def launch_app(venv_py):
    """Run ``run.py`` from the project root with the virtual-env interpreter.

    Blocks until the application exits. A ``KeyboardInterrupt`` raised while
    waiting is caught and reported instead of propagating.
    """
    project_root = str(ROOT_DIR)
    entry_script = str(ROOT_DIR / "run.py")
    banner = "=" * 50
    print(f"\n启动应用: {entry_script}")
    print(banner)
    command = [venv_py, entry_script]
    try:
        subprocess.run(command, cwd=project_root)
    except KeyboardInterrupt:
        print("\n已停止")
def soft_clip(
    audio: np.ndarray,
    threshold: float = 0.9,
    ceiling: float = 0.99,
) -> np.ndarray:
    """
    Tame peaks with a smooth tanh soft clip, leaving quieter samples untouched.

    Samples whose magnitude is at or below ``threshold`` are returned
    unchanged; louder samples are compressed between ``threshold`` and
    ``ceiling`` with their sign preserved.

    Args:
        audio: Input audio (converted to float32).
        threshold: Magnitude at which compression starts; must be > 0.
        ceiling: Upper limit of the soft clip; must be > ``threshold``.

    Returns:
        np.ndarray: A float32 copy of the processed audio.

    Raises:
        ValueError: If ``threshold`` <= 0 or ``ceiling`` <= ``threshold``.
    """
    samples = np.asarray(audio, dtype=np.float32)

    if threshold <= 0:
        raise ValueError("threshold 必须大于 0")
    if ceiling <= threshold:
        raise ValueError("ceiling 必须大于 threshold")

    clipped = samples.copy()
    magnitude = np.abs(clipped)
    over_limit = magnitude > threshold
    if over_limit.any():
        knee_span = ceiling - threshold
        # Normalised overshoot; the small epsilon guards the division.
        excess = (magnitude[over_limit] - threshold) / (knee_span + 1e-8)
        limited = threshold + knee_span * np.tanh(excess)
        clipped[over_limit] = np.sign(clipped[over_limit]) * limited
        clipped = clipped.astype(np.float32, copy=False)
    return clipped
def scale_invariant_signal_distortion_ratio(reference: np.ndarray, estimate: np.ndarray) -> float:
    """Return the scale-invariant SDR (SI-SDR) of ``estimate`` against ``reference``.

    Both signals are aligned with ``_align_pair`` and have their mean removed.
    The estimate is then split into its projection onto the reference (the
    target) and the remainder (the residual); the target and residual powers
    are passed to ``_db_ratio`` to produce the result.

    Raises:
        ValueError: If the power of the zero-mean reference is at or below
            ``EPS``.
    """
    aligned_ref, aligned_est = _align_pair(reference, estimate)
    centered_ref = aligned_ref - float(np.mean(aligned_ref))
    centered_est = aligned_est - float(np.mean(aligned_est))

    reference_energy = _power(centered_ref)
    if reference_energy <= EPS:
        raise ValueError("SI-SDR reference is silent.")

    # Gain that projects the estimate onto the reference direction.
    projection_gain = float(np.dot(centered_est, centered_ref) / (reference_energy + EPS))
    target_component = projection_gain * centered_ref
    return _db_ratio(_power(target_component), _power(centered_est - target_component))
+ """ + stem_metrics: dict[str, dict[str, float]] = {} + for stem_name, reference_audio in references.items(): + if stem_name not in estimates: + raise KeyError(f"Missing estimated stem for reference: {stem_name}") + estimate_audio = estimates[stem_name] + stem_metrics[stem_name] = { + "si_sdr": scale_invariant_signal_distortion_ratio(reference_audio, estimate_audio), + "sdr": signal_distortion_ratio(reference_audio, estimate_audio), + "snr": signal_to_noise_ratio(reference_audio, estimate_audio), + } + + if not stem_metrics: + raise ValueError("No reference stems were provided.") + + return { + "mean_si_sdr": float(np.mean([metrics["si_sdr"] for metrics in stem_metrics.values()])), + "mean_sdr": float(np.mean([metrics["sdr"] for metrics in stem_metrics.values()])), + "mean_snr": float(np.mean([metrics["snr"] for metrics in stem_metrics.values()])), + "stems": stem_metrics, + } diff --git a/lib/ffmpeg_runtime.py b/lib/ffmpeg_runtime.py new file mode 100644 index 0000000000000000000000000000000000000000..b60ec826957a004fd2fc3575290f83988780ed56 --- /dev/null +++ b/lib/ffmpeg_runtime.py @@ -0,0 +1,130 @@ +from __future__ import annotations + +import os +import shutil +import subprocess +import sys +from pathlib import Path +from typing import MutableMapping, Optional + + +ROOT_DIR = Path(__file__).resolve().parent.parent + + +def get_runtime_root(root_dir: Path | str | None = None) -> Path: + if root_dir is not None: + return Path(root_dir).expanduser() + if getattr(sys, "frozen", False): + return Path(sys.executable).resolve().parent + return ROOT_DIR + + +def _normalize_path_key(path: Path | str) -> str: + return os.path.normcase(os.path.normpath(str(path))) + + +def _iter_ffmpeg_bin_candidates(root_dir: Path) -> list[Path]: + candidates = [ + root_dir / "tools" / "ffmpeg" / "bin", + root_dir / "tools" / "ffmpeg", + ] + meipass = getattr(sys, "_MEIPASS", None) + if meipass: + meipass_root = Path(meipass) + candidates.extend( + [ + meipass_root / "tools" / "ffmpeg" 
/ "bin", + meipass_root / "tools" / "ffmpeg", + ] + ) + + try: + import imageio_ffmpeg + + candidates.append(Path(imageio_ffmpeg.get_ffmpeg_exe()).resolve().parent) + except Exception: + pass + + unique: list[Path] = [] + seen: set[str] = set() + for candidate in candidates: + key = _normalize_path_key(candidate) + if key not in seen: + unique.append(candidate) + seen.add(key) + return unique + + +def get_ffmpeg_bin_dir(root_dir: Path | str | None = None) -> Optional[Path]: + runtime_root = get_runtime_root(root_dir) + ffmpeg_name = "ffmpeg.exe" if os.name == "nt" else "ffmpeg" + + for candidate in _iter_ffmpeg_bin_candidates(runtime_root): + if (candidate / ffmpeg_name).exists(): + return candidate + + found = shutil.which("ffmpeg") + if found: + return Path(found).resolve().parent + + return None + + +def _check_executable_runs(executable: Path, label: str) -> None: + try: + result = subprocess.run( + [str(executable), "-version"], + capture_output=True, + text=True, + timeout=10, + ) + except Exception as exc: + raise RuntimeError(f"{label} 无法启动: {executable} ({exc})") from exc + + if result.returncode != 0: + details = "\n".join( + part.strip() + for part in (result.stdout, result.stderr) + if part and part.strip() + ) + if not details: + details = f"退出码: {result.returncode}" + raise RuntimeError(f"{label} 无法正常运行: {executable}\n{details}") + + +def configure_ffmpeg_runtime( + root_dir: Path | str | None = None, + env: MutableMapping[str, str] | None = None, +) -> Optional[Path]: + env = os.environ if env is None else env + bin_dir = get_ffmpeg_bin_dir(root_dir=root_dir) + if bin_dir is None: + return None + + bin_dir_str = str(bin_dir) + current_path = env.get("PATH", "") + path_entries = [entry for entry in current_path.split(os.pathsep) if entry] + normalized_entries = {_normalize_path_key(entry) for entry in path_entries} + if _normalize_path_key(bin_dir_str) not in normalized_entries: + env["PATH"] = bin_dir_str if not current_path else bin_dir_str + 
os.pathsep + current_path + + ffmpeg_name = "ffmpeg.exe" if os.name == "nt" else "ffmpeg" + ffmpeg_path = bin_dir / ffmpeg_name + if ffmpeg_path.exists(): + _check_executable_runs(ffmpeg_path, "ffmpeg") + env.setdefault("FFMPEG_BINARY", str(ffmpeg_path)) + env.setdefault("IMAGEIO_FFMPEG_EXE", str(ffmpeg_path)) + + ffprobe_name = "ffprobe.exe" if os.name == "nt" else "ffprobe" + ffprobe_path = bin_dir / ffprobe_name + if ffprobe_path.exists(): + _check_executable_runs(ffprobe_path, "ffprobe") + env.setdefault("FFPROBE_BINARY", str(ffprobe_path)) + else: + resolved_ffprobe = shutil.which("ffprobe", path=env.get("PATH", "")) + if resolved_ffprobe is None: + raise RuntimeError("未找到 ffprobe。请安装 ffmpeg/ffprobe,或将 ffprobe 放入 tools/ffmpeg/bin。") + _check_executable_runs(Path(resolved_ffprobe), "ffprobe") + env.setdefault("FFPROBE_BINARY", str(Path(resolved_ffprobe))) + + return bin_dir diff --git a/lib/logger.py b/lib/logger.py index 89a4b8fe612b49c14c4253dfcfb2350ae5612339..3bc9931a657dfe4162e0990329cdf46ed75591e5 100644 --- a/lib/logger.py +++ b/lib/logger.py @@ -1,254 +1,254 @@ -# -*- coding: utf-8 -*- -""" -日志工具模块 - 支持时间戳和颜色输出 -""" -import sys -import logging -from datetime import datetime - -try: - from colorama import init, Fore, Style, Back - init(autoreset=True) # 初始化 colorama (Windows 兼容), autoreset确保每行重置 - COLORAMA_AVAILABLE = True -except ImportError: - COLORAMA_AVAILABLE = False - # 定义空的占位符 - class Fore: - LIGHTBLACK_EX = GREEN = YELLOW = RED = CYAN = BLUE = MAGENTA = WHITE = LIGHTGREEN_EX = LIGHTCYAN_EX = LIGHTYELLOW_EX = LIGHTMAGENTA_EX = "" - class Style: - RESET_ALL = BRIGHT = DIM = "" - class Back: - pass - - -class Logger: - """统一日志工具""" - - SAFE_CHAR_MAP = { - "✓": "[OK] ", - "✗": "[X] ", - "→": "->", - "◆": "*", - } - - COLORS = { - "DEBUG": Fore.LIGHTBLACK_EX, - "INFO": Fore.GREEN, - "SUCCESS": Fore.LIGHTGREEN_EX, - "WARNING": Fore.YELLOW, - "ERROR": Fore.RED, - "STEP": Fore.CYAN, - "DETAIL": Fore.LIGHTCYAN_EX, - "PROGRESS": Fore.MAGENTA, - "MODEL": 
Fore.LIGHTMAGENTA_EX, - "AUDIO": Fore.BLUE, - "CONFIG": Fore.LIGHTYELLOW_EX, - } - - RESET = Style.RESET_ALL - BRIGHT = Style.BRIGHT - DIM = Style.DIM - - # 详细日志开关 - verbose = True - - @staticmethod - def _sanitize_console_text(text: str) -> str: - """将不兼容当前终端编码的字符替换为安全文本。""" - sanitized = text - for src, dst in Logger.SAFE_CHAR_MAP.items(): - sanitized = sanitized.replace(src, dst) - return sanitized - - @staticmethod - def _emit(text: str): - """安全输出到终端,避免 Windows/GBK 控制台因 Unicode 崩溃。""" - try: - print(text, flush=True) - return - except UnicodeEncodeError: - pass - - fallback = Logger._sanitize_console_text(text) - encoding = getattr(sys.stdout, "encoding", None) or "utf-8" - try: - print( - fallback.encode(encoding, errors="replace").decode(encoding), - flush=True, - ) - except Exception: - print( - fallback.encode("ascii", errors="replace").decode("ascii"), - flush=True, - ) - - @staticmethod - def _log(level: str, msg: str, force_print: bool = True): - """内部日志方法""" - timestamp = datetime.now().strftime("%H:%M:%S") - color = Logger.COLORS.get(level, "") - reset = Logger.RESET - - # 根据级别决定前缀 - if level in ("INFO", "STEP", "SUCCESS"): - prefix = "" - elif level == "DETAIL": - prefix = " → " - elif level == "PROGRESS": - prefix = " ◆ " - elif level == "MODEL": - prefix = "[模型] " - elif level == "AUDIO": - prefix = "[音频] " - elif level == "CONFIG": - prefix = "[配置] " - else: - prefix = f"[{level}] " - - output = f"{color}[{timestamp}]{prefix}{msg}{reset}" - Logger._emit(output) - - @staticmethod - def debug(msg: str): - """调试日志 (灰色) - 仅在verbose模式下显示""" - if Logger.verbose: - Logger._log("DEBUG", msg) - - @staticmethod - def info(msg: str): - """信息日志 (绿色)""" - Logger._log("INFO", msg) - - @staticmethod - def success(msg: str): - """成功日志 (亮绿色)""" - Logger._log("SUCCESS", f"✓ {msg}") - - @staticmethod - def warning(msg: str): - """警告日志 (黄色)""" - Logger._log("WARNING", msg) - - @staticmethod - def error(msg: str): - """错误日志 (红色)""" - Logger._log("ERROR", msg) - - 
@staticmethod - def step(current: int, total: int, msg: str): - """步骤日志 (青色)""" - timestamp = datetime.now().strftime("%H:%M:%S") - color = Logger.COLORS.get("STEP", "") - reset = Logger.RESET - Logger._emit(f"{color}[{timestamp}][{current}/{total}] {msg}{reset}") - - @staticmethod - def detail(msg: str): - """详细日志 (浅青色) - 用于显示处理细节""" - if Logger.verbose: - Logger._log("DETAIL", msg) - - @staticmethod - def progress(msg: str): - """进度日志 (紫色) - 用于显示处理进度""" - Logger._log("PROGRESS", msg) - - @staticmethod - def model(msg: str): - """模型日志 (浅紫色) - 用于模型加载/卸载信息""" - Logger._log("MODEL", msg) - - @staticmethod - def audio(msg: str): - """音频日志 (蓝色) - 用于音频处理信息""" - Logger._log("AUDIO", msg) - - @staticmethod - def config(msg: str): - """配置日志 (浅黄色) - 用于配置信息""" - if Logger.verbose: - Logger._log("CONFIG", msg) - - @staticmethod - def header(msg: str): - """标题日志 (带分隔线)""" - timestamp = datetime.now().strftime("%H:%M:%S") - color = Logger.COLORS.get("INFO", "") - reset = Logger.RESET - Logger._emit(f"{color}[{timestamp}] {'=' * 50}{reset}") - Logger._emit(f"{color}[{timestamp}] {msg}{reset}") - Logger._emit(f"{color}[{timestamp}] {'=' * 50}{reset}") - - @staticmethod - def separator(char: str = "-", length: int = 40): - """分隔线""" - timestamp = datetime.now().strftime("%H:%M:%S") - color = Logger.COLORS.get("DEBUG", "") - reset = Logger.RESET - Logger._emit(f"{color}[{timestamp}] {char * length}{reset}") - - @staticmethod - def set_verbose(enabled: bool): - """设置详细日志模式""" - Logger.verbose = enabled - - -# 便捷实例 -log = Logger() - - -# ============ 配置标准 logging 模块使用颜色 ============ - -class ColoredFormatter(logging.Formatter): - """为标准logging模块添加颜色支持""" - - LEVEL_COLORS = { - logging.DEBUG: Fore.LIGHTBLACK_EX, - logging.INFO: Fore.GREEN, - logging.WARNING: Fore.YELLOW, - logging.ERROR: Fore.RED, - logging.CRITICAL: Fore.RED + Style.BRIGHT, - } - - def format(self, record): - # 获取颜色 - color = self.LEVEL_COLORS.get(record.levelno, "") - reset = Style.RESET_ALL - - # 格式化时间 - timestamp 
= datetime.now().strftime("%Y-%m-%d %H:%M:%S") - - # 构建消息 - level_name = record.levelname - module_name = record.name - - # 格式化输出 - formatted = f"{color}{timestamp} | {level_name} | {module_name} | {record.getMessage()}{reset}" - return formatted - - -def setup_colored_logging(level=logging.INFO): - """配置全局logging使用颜色输出""" - # 获取根logger - root_logger = logging.getLogger() - root_logger.setLevel(level) - - # 移除现有的handlers - for handler in root_logger.handlers[:]: - root_logger.removeHandler(handler) - - # 添加带颜色的handler - console_handler = logging.StreamHandler(sys.stdout) - console_handler.setLevel(level) - console_handler.setFormatter(ColoredFormatter()) - root_logger.addHandler(console_handler) - - return root_logger - - -# 自动配置logging颜色 -setup_colored_logging(logging.INFO) - -# 抑制第三方库的英文日志 -logging.getLogger("faiss").setLevel(logging.WARNING) -logging.getLogger("audio_separator").setLevel(logging.WARNING) +# -*- coding: utf-8 -*- +""" +日志工具模块 - 支持时间戳和颜色输出 +""" +import sys +import logging +from datetime import datetime + +try: + from colorama import init, Fore, Style, Back + init(autoreset=True) # 初始化 colorama (Windows 兼容), autoreset确保每行重置 + COLORAMA_AVAILABLE = True +except ImportError: + COLORAMA_AVAILABLE = False + # 定义空的占位符 + class Fore: + LIGHTBLACK_EX = GREEN = YELLOW = RED = CYAN = BLUE = MAGENTA = WHITE = LIGHTGREEN_EX = LIGHTCYAN_EX = LIGHTYELLOW_EX = LIGHTMAGENTA_EX = "" + class Style: + RESET_ALL = BRIGHT = DIM = "" + class Back: + pass + + +class Logger: + """统一日志工具""" + + SAFE_CHAR_MAP = { + "✓": "[OK] ", + "✗": "[X] ", + "→": "->", + "◆": "*", + } + + COLORS = { + "DEBUG": Fore.LIGHTBLACK_EX, + "INFO": Fore.GREEN, + "SUCCESS": Fore.LIGHTGREEN_EX, + "WARNING": Fore.YELLOW, + "ERROR": Fore.RED, + "STEP": Fore.CYAN, + "DETAIL": Fore.LIGHTCYAN_EX, + "PROGRESS": Fore.MAGENTA, + "MODEL": Fore.LIGHTMAGENTA_EX, + "AUDIO": Fore.BLUE, + "CONFIG": Fore.LIGHTYELLOW_EX, + } + + RESET = Style.RESET_ALL + BRIGHT = Style.BRIGHT + DIM = Style.DIM + + # 详细日志开关 + 
verbose = True + + @staticmethod + def _sanitize_console_text(text: str) -> str: + """将不兼容当前终端编码的字符替换为安全文本。""" + sanitized = text + for src, dst in Logger.SAFE_CHAR_MAP.items(): + sanitized = sanitized.replace(src, dst) + return sanitized + + @staticmethod + def _emit(text: str): + """安全输出到终端,避免 Windows/GBK 控制台因 Unicode 崩溃。""" + try: + print(text, flush=True) + return + except UnicodeEncodeError: + pass + + fallback = Logger._sanitize_console_text(text) + encoding = getattr(sys.stdout, "encoding", None) or "utf-8" + try: + print( + fallback.encode(encoding, errors="replace").decode(encoding), + flush=True, + ) + except Exception: + print( + fallback.encode("ascii", errors="replace").decode("ascii"), + flush=True, + ) + + @staticmethod + def _log(level: str, msg: str, force_print: bool = True): + """内部日志方法""" + timestamp = datetime.now().strftime("%H:%M:%S") + color = Logger.COLORS.get(level, "") + reset = Logger.RESET + + # 根据级别决定前缀 + if level in ("INFO", "STEP", "SUCCESS"): + prefix = "" + elif level == "DETAIL": + prefix = " → " + elif level == "PROGRESS": + prefix = " ◆ " + elif level == "MODEL": + prefix = "[模型] " + elif level == "AUDIO": + prefix = "[音频] " + elif level == "CONFIG": + prefix = "[配置] " + else: + prefix = f"[{level}] " + + output = f"{color}[{timestamp}]{prefix}{msg}{reset}" + Logger._emit(output) + + @staticmethod + def debug(msg: str): + """调试日志 (灰色) - 仅在verbose模式下显示""" + if Logger.verbose: + Logger._log("DEBUG", msg) + + @staticmethod + def info(msg: str): + """信息日志 (绿色)""" + Logger._log("INFO", msg) + + @staticmethod + def success(msg: str): + """成功日志 (亮绿色)""" + Logger._log("SUCCESS", f"✓ {msg}") + + @staticmethod + def warning(msg: str): + """警告日志 (黄色)""" + Logger._log("WARNING", msg) + + @staticmethod + def error(msg: str): + """错误日志 (红色)""" + Logger._log("ERROR", msg) + + @staticmethod + def step(current: int, total: int, msg: str): + """步骤日志 (青色)""" + timestamp = datetime.now().strftime("%H:%M:%S") + color = Logger.COLORS.get("STEP", "") 
+ reset = Logger.RESET + Logger._emit(f"{color}[{timestamp}][{current}/{total}] {msg}{reset}") + + @staticmethod + def detail(msg: str): + """详细日志 (浅青色) - 用于显示处理细节""" + if Logger.verbose: + Logger._log("DETAIL", msg) + + @staticmethod + def progress(msg: str): + """进度日志 (紫色) - 用于显示处理进度""" + Logger._log("PROGRESS", msg) + + @staticmethod + def model(msg: str): + """模型日志 (浅紫色) - 用于模型加载/卸载信息""" + Logger._log("MODEL", msg) + + @staticmethod + def audio(msg: str): + """音频日志 (蓝色) - 用于音频处理信息""" + Logger._log("AUDIO", msg) + + @staticmethod + def config(msg: str): + """配置日志 (浅黄色) - 用于配置信息""" + if Logger.verbose: + Logger._log("CONFIG", msg) + + @staticmethod + def header(msg: str): + """标题日志 (带分隔线)""" + timestamp = datetime.now().strftime("%H:%M:%S") + color = Logger.COLORS.get("INFO", "") + reset = Logger.RESET + Logger._emit(f"{color}[{timestamp}] {'=' * 50}{reset}") + Logger._emit(f"{color}[{timestamp}] {msg}{reset}") + Logger._emit(f"{color}[{timestamp}] {'=' * 50}{reset}") + + @staticmethod + def separator(char: str = "-", length: int = 40): + """分隔线""" + timestamp = datetime.now().strftime("%H:%M:%S") + color = Logger.COLORS.get("DEBUG", "") + reset = Logger.RESET + Logger._emit(f"{color}[{timestamp}] {char * length}{reset}") + + @staticmethod + def set_verbose(enabled: bool): + """设置详细日志模式""" + Logger.verbose = enabled + + +# 便捷实例 +log = Logger() + + +# ============ 配置标准 logging 模块使用颜色 ============ + +class ColoredFormatter(logging.Formatter): + """为标准logging模块添加颜色支持""" + + LEVEL_COLORS = { + logging.DEBUG: Fore.LIGHTBLACK_EX, + logging.INFO: Fore.GREEN, + logging.WARNING: Fore.YELLOW, + logging.ERROR: Fore.RED, + logging.CRITICAL: Fore.RED + Style.BRIGHT, + } + + def format(self, record): + # 获取颜色 + color = self.LEVEL_COLORS.get(record.levelno, "") + reset = Style.RESET_ALL + + # 格式化时间 + timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + + # 构建消息 + level_name = record.levelname + module_name = record.name + + # 格式化输出 + formatted = f"{color}{timestamp} | 
{level_name} | {module_name} | {record.getMessage()}{reset}" + return formatted + + +def setup_colored_logging(level=logging.INFO): + """配置全局logging使用颜色输出""" + # 获取根logger + root_logger = logging.getLogger() + root_logger.setLevel(level) + + # 移除现有的handlers + for handler in root_logger.handlers[:]: + root_logger.removeHandler(handler) + + # 添加带颜色的handler + console_handler = logging.StreamHandler(sys.stdout) + console_handler.setLevel(level) + console_handler.setFormatter(ColoredFormatter()) + root_logger.addHandler(console_handler) + + return root_logger + + +# 自动配置logging颜色 +setup_colored_logging(logging.INFO) + +# 抑制第三方库的英文日志 +logging.getLogger("faiss").setLevel(logging.WARNING) +logging.getLogger("audio_separator").setLevel(logging.WARNING) diff --git a/lib/mixer.py b/lib/mixer.py index 966a3b11e68e00f111ed35f04b1805435d221bca..af751fa1eb0b6f8558e43ab72babf58b21215479 100644 --- a/lib/mixer.py +++ b/lib/mixer.py @@ -2,13 +2,13 @@ """ 混音模块 - 人声与伴奏混合 """ -import numpy as np -import librosa -import soundfile as sf -from pathlib import Path -from typing import Optional - -from lib.audio import soft_clip_array +import numpy as np +import librosa +import soundfile as sf +from pathlib import Path +from typing import Optional + +from lib.audio import soft_clip_array try: from lib.logger import log @@ -152,12 +152,12 @@ def mix_vocals_and_accompaniment( elif reverb_amount > 0 and log: log.warning("Pedalboard 不可用,跳过混响") - vocals = soft_clip_array(vocals * vocals_volume, threshold=0.85, ceiling=0.95) - accompaniment = soft_clip_array( - accompaniment * accompaniment_volume, - threshold=0.85, - ceiling=0.95, - ) + vocals = soft_clip_array(vocals * vocals_volume, threshold=0.85, ceiling=0.95) + accompaniment = soft_clip_array( + accompaniment * accompaniment_volume, + threshold=0.85, + ceiling=0.95, + ) vocals_len = vocals.shape[-1] accompaniment_len = accompaniment.shape[-1] @@ -174,18 +174,18 @@ def mix_vocals_and_accompaniment( vocals = adjust_audio_length(vocals, 
target_len) accompaniment = adjust_audio_length(accompaniment, target_len) - if log: - log.progress("混合音轨...") - mixed = vocals + accompaniment - - max_val = float(np.max(np.abs(mixed))) + if log: + log.progress("混合音轨...") + mixed = vocals + accompaniment + + max_val = float(np.max(np.abs(mixed))) if log: log.detail(f"混合后峰值: {max_val:.4f}") - mixed = soft_clip_array(mixed, threshold=0.90, ceiling=0.98) - if log: - final_peak = float(np.max(np.abs(mixed))) - log.detail(f"软削波后峰值: {final_peak:.4f}") + mixed = soft_clip_array(mixed, threshold=0.90, ceiling=0.98) + if log: + final_peak = float(np.max(np.abs(mixed))) + log.detail(f"软削波后峰值: {final_peak:.4f}") if mixed.ndim == 2: mixed = mixed.T diff --git a/lib/runtime_build.py b/lib/runtime_build.py new file mode 100644 index 0000000000000000000000000000000000000000..3cb4f791d83c0007e661e8b5fc049ba1e7aa850d --- /dev/null +++ b/lib/runtime_build.py @@ -0,0 +1,59 @@ +from __future__ import annotations + +from datetime import datetime +from pathlib import Path +from typing import Dict + + +ROOT_DIR = Path(__file__).resolve().parent.parent +WATCHED_FILES = ( + "run.py", + "app.py", + "lib/ffmpeg_runtime.py", + "ui/app.py", + "infer/cover_pipeline.py", + "infer/official_adapter.py", + "infer/quality_policy.py", +) + + +def _existing_watched_files() -> list[Path]: + files: list[Path] = [] + for relative_path in WATCHED_FILES: + path = ROOT_DIR / relative_path + if path.exists(): + files.append(path) + return files + + +def _compute_runtime_build_info() -> Dict[str, str]: + files = _existing_watched_files() + if not files: + return { + "timestamp": "unknown", + "source": "unknown", + } + + latest = max(files, key=lambda path: path.stat().st_mtime) + timestamp = datetime.fromtimestamp(latest.stat().st_mtime).strftime("%Y-%m-%d %H:%M:%S") + try: + source = latest.relative_to(ROOT_DIR).as_posix() + except ValueError: + source = str(latest) + return { + "timestamp": timestamp, + "source": source, + } + + +RUNTIME_BUILD_INFO = 
_compute_runtime_build_info() + + +def get_runtime_build_label() -> str: + info = RUNTIME_BUILD_INFO + return f"当前运行代码标记: {info['timestamp']} ({info['source']})" + + +def get_runtime_build_short_label() -> str: + info = RUNTIME_BUILD_INFO + return f"{info['timestamp']} ({info['source']})" diff --git a/mcp/__init__.py b/mcp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..12fafb868b4539e67d3277280f5e1e0530a79667 --- /dev/null +++ b/mcp/__init__.py @@ -0,0 +1,4 @@ +# -*- coding: utf-8 -*- +""" +MCP 模块 +""" diff --git a/mcp/server.py b/mcp/server.py new file mode 100644 index 0000000000000000000000000000000000000000..fc8eef08610abf0b96ab24301c9142865cdfcf13 --- /dev/null +++ b/mcp/server.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- +""" +MCP 服务器 - 为 Claude Code 提供 AI 翻唱工具 +""" +import asyncio +import json +from pathlib import Path +from typing import Any + +from mcp.server import Server +from mcp.server.stdio import stdio_server +from mcp.types import Tool, TextContent + +from mcp.tools import ( + list_models, + convert_voice, + download_model, + get_model_status +) + +# 创建 MCP 服务器 +server = Server("rvc-voice-conversion") + + +@server.list_tools() +async def list_tools() -> list[Tool]: + """列出可用工具""" + return [ + Tool( + name="list_voice_models", + description="列出所有可用的 RVC 语音模型", + inputSchema={ + "type": "object", + "properties": {}, + "required": [] + } + ), + Tool( + name="convert_voice", + description="使用 RVC 模型进行 AI 翻唱", + inputSchema={ + "type": "object", + "properties": { + "input_path": { + "type": "string", + "description": "输入音频文件的绝对路径" + }, + "output_path": { + "type": "string", + "description": "输出音频文件的绝对路径" + }, + "model_name": { + "type": "string", + "description": "要使用的语音模型名称" + }, + "pitch_shift": { + "type": "number", + "description": "音调偏移 (半音),正数升调,负数降调", + "default": 0 + }, + "index_ratio": { + "type": "number", + "description": "索引混合比率 (0-1)", + "default": 0.5 + } + }, + "required": ["input_path", "output_path", 
"model_name"] + } + ), + Tool( + name="download_base_models", + description="下载 RVC 所需的基础模型 (HuBERT, RMVPE)", + inputSchema={ + "type": "object", + "properties": {}, + "required": [] + } + ), + Tool( + name="get_model_status", + description="获取基础模型的下载状态", + inputSchema={ + "type": "object", + "properties": {}, + "required": [] + } + ) + ] + + +@server.call_tool() +async def call_tool(name: str, arguments: dict[str, Any]) -> list[TextContent]: + """执行工具调用""" + + if name == "list_voice_models": + models = list_models() + result = { + "models": models, + "count": len(models) + } + return [TextContent(type="text", text=json.dumps(result, ensure_ascii=False, indent=2))] + + elif name == "convert_voice": + result = convert_voice( + input_path=arguments["input_path"], + output_path=arguments["output_path"], + model_name=arguments["model_name"], + pitch_shift=arguments.get("pitch_shift", 0), + index_ratio=arguments.get("index_ratio", 0.5), + filter_radius=arguments.get("filter_radius", 3), + rms_mix_rate=arguments.get("rms_mix_rate", 0.25), + protect=arguments.get("protect", 0.33) + ) + return [TextContent(type="text", text=json.dumps(result, ensure_ascii=False, indent=2))] + + elif name == "download_base_models": + result = download_model() + return [TextContent(type="text", text=json.dumps(result, ensure_ascii=False, indent=2))] + + elif name == "get_model_status": + status = get_model_status() + return [TextContent(type="text", text=json.dumps(status, ensure_ascii=False, indent=2))] + + else: + return [TextContent(type="text", text=f"未知工具: {name}")] + + +async def main(): + """启动 MCP 服务器""" + async with stdio_server() as (read_stream, write_stream): + await server.run( + read_stream, + write_stream, + server.create_initialization_options() + ) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/mcp/tools.py b/mcp/tools.py new file mode 100644 index 0000000000000000000000000000000000000000..f9160e81e6ae1c1b0b2ef8269ac1fb607362a8d8 --- /dev/null +++ 
b/mcp/tools.py @@ -0,0 +1,233 @@ +# -*- coding: utf-8 -*- +""" +MCP 工具函数 - 提供 AI 翻唱功能的工具接口 +""" +import os +import json +from pathlib import Path +from typing import Optional, List, Dict, Any + +# 项目根目录 +ROOT_DIR = Path(__file__).parent.parent + + +def get_config() -> dict: + """获取配置""" + config_path = ROOT_DIR / "configs" / "config.json" + if config_path.exists(): + with open(config_path, "r", encoding="utf-8") as f: + config = json.load(f) + return _normalize_config(config) + return {} + + +def _normalize_config(config: dict) -> dict: + """Normalize legacy path keys to top-level entries.""" + if not config: + return {} + + paths = config.get("paths", {}) + if "hubert_path" not in config and "hubert" in paths: + config["hubert_path"] = paths["hubert"] + if "rmvpe_path" not in config and "rmvpe" in paths: + config["rmvpe_path"] = paths["rmvpe"] + if "weights_dir" not in config and "weights" in paths: + config["weights_dir"] = paths["weights"] + if "output_dir" not in config and "outputs" in paths: + config["output_dir"] = paths["outputs"] + if "temp_dir" not in config and "temp" in paths: + config["temp_dir"] = paths["temp"] + + return config + + +def list_models() -> List[Dict[str, Any]]: + """ + 列出所有可用的语音模型 + + Returns: + List[Dict]: 模型列表,每个模型包含 name, model_path, index_path + """ + from infer.pipeline import list_voice_models + + config = get_config() + weights_dir = ROOT_DIR / config.get("weights_dir", "assets/weights") + + return list_voice_models(str(weights_dir)) + + +def convert_voice( + input_path: str, + output_path: str, + model_name: str, + pitch_shift: float = 0, + index_ratio: float = 0.5, + filter_radius: int = 3, + rms_mix_rate: float = 0.25, + protect: float = 0.33 +) -> Dict[str, Any]: + """ + 执行 AI 翻唱 + + Args: + input_path: 输入音频文件路径 + output_path: 输出音频文件路径 + model_name: 模型名称 + pitch_shift: 音调偏移 (半音) + index_ratio: 索引混合比率 + filter_radius: 中值滤波半径 + rms_mix_rate: 响度混合比率 + protect: 保护系数 + + Returns: + Dict: 转换结果,包含 success, output_path, error + """ + 
try: + from infer.pipeline import VoiceConversionPipeline, list_voice_models + + config = get_config() + use_official_vc = config.get("use_official_vc", True) + + # 查找模型 + weights_dir = ROOT_DIR / config.get("weights_dir", "assets/weights") + models = list_voice_models(str(weights_dir)) + + model_info = None + for m in models: + if m["name"] == model_name: + model_info = m + break + + if model_info is None: + return { + "success": False, + "output_path": None, + "error": f"模型不存在: {model_name}" + } + + if use_official_vc: + from infer.official_adapter import convert_vocals_official + + result_path = convert_vocals_official( + vocals_path=input_path, + output_path=output_path, + model_path=model_info["model_path"], + index_path=model_info.get("index_path"), + f0_method=config.get("f0_method", "rmvpe"), + pitch_shift=int(pitch_shift), + index_rate=index_ratio, + filter_radius=filter_radius, + rms_mix_rate=rms_mix_rate, + protect=protect, + ) + return { + "success": True, + "output_path": result_path, + "error": None, + } + + # 初始化管道 + device = config.get("device", "cuda") + pipeline = VoiceConversionPipeline(device=device) + pipeline.hubert_layer = config.get("hubert_layer", 12) + + # 加载 HuBERT + hubert_path = ROOT_DIR / config.get("hubert_path", "assets/hubert/hubert_base.pt") + if not hubert_path.exists(): + return { + "success": False, + "output_path": None, + "error": "HuBERT 模型不存在,请先下载基础模型" + } + pipeline.load_hubert(str(hubert_path)) + + # 加载 F0 提取器 + rmvpe_path = ROOT_DIR / config.get("rmvpe_path", "assets/rmvpe/rmvpe.pt") + if not rmvpe_path.exists(): + return { + "success": False, + "output_path": None, + "error": "RMVPE 模型不存在,请先下载基础模型" + } + pipeline.load_f0_extractor("rmvpe", str(rmvpe_path)) + + # 加载语音模型 + pipeline.load_voice_model(model_info["model_path"]) + + # 加载索引 + if model_info.get("index_path"): + pipeline.load_index(model_info["index_path"]) + + # 执行转换 + result_path = pipeline.convert( + audio_path=input_path, + output_path=output_path, + 
pitch_shift=pitch_shift, + index_ratio=index_ratio, + filter_radius=filter_radius, + rms_mix_rate=rms_mix_rate, + protect=protect + ) + + return { + "success": True, + "output_path": result_path, + "error": None + } + + except Exception as e: + return { + "success": False, + "output_path": None, + "error": str(e) + } + + +def download_model(model_name: str = None) -> Dict[str, Any]: + """ + 下载模型 + + Args: + model_name: 模型名称,为 None 时下载所有必需模型 + + Returns: + Dict: 下载结果 + """ + try: + from tools.download_models import ( + download_model as dl_model, + download_required_models, + MODELS + ) + + if model_name: + if model_name not in MODELS: + return { + "success": False, + "error": f"未知模型: {model_name}" + } + success = dl_model(model_name) + else: + success = download_required_models() + + return { + "success": success, + "error": None if success else "下载失败" + } + + except Exception as e: + return { + "success": False, + "error": str(e) + } + + +def get_model_status() -> Dict[str, bool]: + """ + 获取模型下载状态 + + Returns: + Dict: 模型名称 -> 是否已下载 + """ + from tools.download_models import check_all_models + return check_all_models() diff --git a/models/__init__.py b/models/__init__.py index 9c1fe281a1f354b26ebca0962b9df8d037754256..b969dbb51c9cb42771b9e411ed7d3b80c5451d22 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -1,7 +1,7 @@ -# -*- coding: utf-8 -*- -""" -模型定义模块 -""" -from .rmvpe import RMVPE - -__all__ = ["RMVPE"] +# -*- coding: utf-8 -*- +""" +模型定义模块 +""" +from .rmvpe import RMVPE + +__all__ = ["RMVPE"] diff --git a/models/rmvpe.py b/models/rmvpe.py index e2b642fc3b16708012410a1bce50a3300a696d52..6f02087d9bec5086ef08d180abf5654ce9ca7a3f 100644 --- a/models/rmvpe.py +++ b/models/rmvpe.py @@ -1,439 +1,439 @@ -# -*- coding: utf-8 -*- -""" -RMVPE 模型 - 用于高质量 F0 提取 -""" -import torch -import torch.nn as nn -import torch.nn.functional as F -import numpy as np -from typing import Optional - - -class BiGRU(nn.Module): - """双向 GRU 层""" - - def __init__(self, 
input_features: int, hidden_features: int, num_layers: int): - super().__init__() - self.gru = nn.GRU( - input_features, - hidden_features, - num_layers=num_layers, - batch_first=True, - bidirectional=True - ) - - def forward(self, x): - return self.gru(x)[0] - - -class ConvBlockRes(nn.Module): - """残差卷积块""" - - def __init__(self, in_channels: int, out_channels: int, momentum: float = 0.01, - force_shortcut: bool = False): - super().__init__() - self.conv = nn.Sequential( - nn.Conv2d(in_channels, out_channels, 3, 1, 1, bias=False), - nn.BatchNorm2d(out_channels, momentum=momentum), - nn.ReLU(), - nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False), - nn.BatchNorm2d(out_channels, momentum=momentum), - nn.ReLU() - ) - - # 当通道数不同或强制使用时才创建 shortcut - if in_channels != out_channels or force_shortcut: - self.shortcut = nn.Conv2d(in_channels, out_channels, 1) - self.has_shortcut = True - else: - self.has_shortcut = False - - def forward(self, x): - if self.has_shortcut: - return self.conv(x) + self.shortcut(x) - else: - return self.conv(x) + x - - -class EncoderBlock(nn.Module): - """编码器块 - 包含多个 ConvBlockRes 和一个池化层""" - - def __init__(self, in_channels: int, out_channels: int, kernel_size: int, - n_blocks: int, momentum: float = 0.01): - super().__init__() - self.conv = nn.ModuleList() - self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) - for _ in range(n_blocks - 1): - self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) - self.pool = nn.AvgPool2d(kernel_size) - - def forward(self, x): - for block in self.conv: - x = block(x) - # 返回池化前的张量用于 skip connection - return self.pool(x), x - - -class Encoder(nn.Module): - """RMVPE 编码器""" - - def __init__(self, in_channels: int, in_size: int, n_encoders: int, - kernel_size: int, n_blocks: int, out_channels: int = 16, - momentum: float = 0.01): - super().__init__() - - self.n_encoders = n_encoders - self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) - self.layers = nn.ModuleList() - 
self.latent_channels = [] - - for i in range(n_encoders): - self.layers.append( - EncoderBlock( - in_channels if i == 0 else out_channels * (2 ** (i - 1)), - out_channels * (2 ** i), - kernel_size, - n_blocks, - momentum - ) - ) - self.latent_channels.append(out_channels * (2 ** i)) - - def forward(self, x): - x = self.bn(x) - concat_tensors = [] - for layer in self.layers: - x, skip = layer(x) - concat_tensors.append(skip) - return x, concat_tensors - - -class Intermediate(nn.Module): - """中间层""" - - def __init__(self, in_channels: int, out_channels: int, n_inters: int, - n_blocks: int, momentum: float = 0.01): - super().__init__() - - self.layers = nn.ModuleList() - for i in range(n_inters): - if i == 0: - # 第一层: in_channels -> out_channels (256 -> 512) - self.layers.append( - IntermediateBlock(in_channels, out_channels, n_blocks, momentum, first_block_shortcut=True) - ) - else: - # 后续层: out_channels -> out_channels (512 -> 512) - self.layers.append( - IntermediateBlock(out_channels, out_channels, n_blocks, momentum, first_block_shortcut=False) - ) - - def forward(self, x): - for layer in self.layers: - x = layer(x) - return x - - -class IntermediateBlock(nn.Module): - """中间层块""" - - def __init__(self, in_channels: int, out_channels: int, n_blocks: int, - momentum: float = 0.01, first_block_shortcut: bool = False): - super().__init__() - self.conv = nn.ModuleList() - # 第一个块可能需要强制使用 shortcut - self.conv.append(ConvBlockRes(in_channels, out_channels, momentum, force_shortcut=first_block_shortcut)) - for _ in range(n_blocks - 1): - self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) - - def forward(self, x): - for block in self.conv: - x = block(x) - return x - - -class DecoderBlock(nn.Module): - """解码器块""" - - def __init__(self, in_channels: int, out_channels: int, stride: int, - n_blocks: int, momentum: float = 0.01): - super().__init__() - # conv1: 转置卷积 + BatchNorm (kernel_size=3, stride=stride, padding=1, output_padding=1) - self.conv1 = 
nn.Sequential( - nn.ConvTranspose2d(in_channels, out_channels, 3, stride, padding=1, output_padding=1, bias=False), - nn.BatchNorm2d(out_channels, momentum=momentum) - ) - # conv2: ConvBlockRes 列表 - # 第一个块: in_channels = out_channels * 2 (concat 后), out_channels = out_channels - # 后续块: in_channels = out_channels, out_channels = out_channels - self.conv2 = nn.ModuleList() - self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) - for _ in range(n_blocks - 1): - self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) - - def forward(self, x, concat_tensor): - x = self.conv1(x) - # 处理尺寸不匹配:填充较小的张量使其匹配较大的 - diff_h = concat_tensor.size(2) - x.size(2) - diff_w = concat_tensor.size(3) - x.size(3) - if diff_h != 0 or diff_w != 0: - # 填充 x 使其与 concat_tensor 尺寸匹配 - x = F.pad(x, [0, diff_w, 0, diff_h]) - x = torch.cat([x, concat_tensor], dim=1) - for block in self.conv2: - x = block(x) - return x - - -class Decoder(nn.Module): - """RMVPE 解码器""" - - def __init__(self, in_channels: int, n_decoders: int, stride: int, - n_blocks: int, out_channels: int = 16, momentum: float = 0.01): - super().__init__() - - self.layers = nn.ModuleList() - for i in range(n_decoders): - out_ch = out_channels * (2 ** (n_decoders - 1 - i)) - in_ch = in_channels if i == 0 else out_channels * (2 ** (n_decoders - i)) - self.layers.append( - DecoderBlock(in_ch, out_ch, stride, n_blocks, momentum) - ) - - def forward(self, x, concat_tensors): - for i, layer in enumerate(self.layers): - x = layer(x, concat_tensors[-1 - i]) - return x - - -class DeepUnet(nn.Module): - """Deep U-Net 架构""" - - def __init__(self, kernel_size: int, n_blocks: int, en_de_layers: int = 5, - inter_layers: int = 4, in_channels: int = 1, en_out_channels: int = 16): - super().__init__() - - # Encoder 输出通道: en_out_channels * 2^(en_de_layers-1) = 16 * 16 = 256 - encoder_out_channels = en_out_channels * (2 ** (en_de_layers - 1)) - # Intermediate 输出通道: encoder_out_channels * 2 = 512 - 
intermediate_out_channels = encoder_out_channels * 2 - - self.encoder = Encoder( - in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels - ) - self.intermediate = Intermediate( - encoder_out_channels, - intermediate_out_channels, - inter_layers, n_blocks - ) - self.decoder = Decoder( - intermediate_out_channels, - en_de_layers, kernel_size, n_blocks, en_out_channels - ) - - def forward(self, x): - x, concat_tensors = self.encoder(x) - x = self.intermediate(x) - x = self.decoder(x, concat_tensors) - return x - - -class E2E(nn.Module): - """端到端 RMVPE 模型""" - - def __init__(self, n_blocks: int, n_gru: int, kernel_size: int, - en_de_layers: int = 5, inter_layers: int = 4, - in_channels: int = 1, en_out_channels: int = 16): - super().__init__() - - self.unet = DeepUnet( - kernel_size, n_blocks, en_de_layers, inter_layers, - in_channels, en_out_channels - ) - self.cnn = nn.Conv2d(en_out_channels, 3, 3, 1, 1) - - if n_gru: - self.fc = nn.Sequential( - BiGRU(3 * 128, 256, n_gru), - nn.Linear(512, 360), - nn.Dropout(0.25), - nn.Sigmoid() - ) - else: - self.fc = nn.Sequential( - nn.Linear(3 * 128, 360), - nn.Dropout(0.25), - nn.Sigmoid() - ) - - def forward(self, mel): - # 输入 mel: [B, 128, T] 或 [B, 1, 128, T] - # 官方实现期望 [B, 1, T, 128],即 time 在 height,mel bins 在 width - if mel.dim() == 3: - # [B, 128, T] -> [B, T, 128] -> [B, 1, T, 128] - mel = mel.transpose(-1, -2).unsqueeze(1) - elif mel.dim() == 4 and mel.shape[1] == 1: - # [B, 1, 128, T] -> [B, 1, T, 128] - mel = mel.transpose(-1, -2) - - x = self.unet(mel) - x = self.cnn(x) - # x shape: (batch, 3, T, 128) - # 转换为 (batch, T, 384) 其中 384 = 3 * 128 - x = x.transpose(1, 2).flatten(-2) # (batch, T, 384) - x = self.fc(x) - return x - - -class MelSpectrogram(nn.Module): - """Mel 频谱提取""" - - def __init__(self, n_mel: int = 128, n_fft: int = 1024, win_size: int = 1024, - hop_length: int = 160, sample_rate: int = 16000, - fmin: int = 30, fmax: int = 8000): - super().__init__() - - self.n_fft = n_fft - 
self.hop_length = hop_length - self.win_size = win_size - self.sample_rate = sample_rate - self.n_mel = n_mel - - # 创建 Mel 滤波器组 - mel_basis = self._mel_filterbank(sample_rate, n_fft, n_mel, fmin, fmax) - self.register_buffer("mel_basis", mel_basis) - self.register_buffer("window", torch.hann_window(win_size)) - - def _mel_filterbank(self, sr, n_fft, n_mels, fmin, fmax): - """创建 Mel 滤波器组""" - import librosa - # 必须使用 htk=True,与官方 RVC RMVPE 保持一致 - mel = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=True) - return torch.from_numpy(mel).float() - - def forward(self, audio): - # STFT - spec = torch.stft( - audio, - self.n_fft, - hop_length=self.hop_length, - win_length=self.win_size, - window=self.window, - center=True, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=True - ) - # 使用功率谱(幅度的平方),与官方 RMVPE 一致 - spec = torch.abs(spec) ** 2 - - # Mel 变换 - mel = torch.matmul(self.mel_basis, spec) - mel = torch.log(torch.clamp(mel, min=1e-5)) - - return mel - - -class RMVPE: - """RMVPE F0 提取器封装类""" - - def __init__(self, model_path: str, device: str = "cuda"): - self.device = device - - # 加载模型 - self.model = E2E(n_blocks=4, n_gru=1, kernel_size=2) - ckpt = torch.load(model_path, map_location="cpu", weights_only=False) - self.model.load_state_dict(ckpt) - self.model = self.model.to(device).eval() - - # Mel 频谱提取器 - self.mel_extractor = MelSpectrogram().to(device) - - # 频率映射 - cents_mapping = 20 * np.arange(360) + 1997.3794084376191 - self.cents_mapping = np.pad(cents_mapping, (4, 4)) - - @torch.no_grad() - def infer_from_audio(self, audio: np.ndarray, thred: float = 0.03) -> np.ndarray: - """ - 从音频提取 F0 - - Args: - audio: 16kHz 音频数据 - thred: 置信度阈值 - - Returns: - np.ndarray: F0 序列 - """ - # 转换为张量 - audio = torch.from_numpy(audio).float().to(self.device) - if audio.dim() == 1: - audio = audio.unsqueeze(0) - - # 提取 Mel 频谱: [B, 128, T] - mel = self.mel_extractor(audio) - - # 记录原始帧数 - n_frames = mel.shape[-1] - - # 
填充时间维度使其可被 32 整除(5 层池化,每层 /2) - n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames - if n_pad > 0: - mel = F.pad(mel, (0, n_pad), mode='constant', value=0) - - # 模型推理 - E2E.forward 会处理 transpose - hidden = self.model(mel) - - # 移除填充部分,只保留原始帧数 - hidden = hidden[:, :n_frames, :] - hidden = hidden.squeeze(0).cpu().numpy() - - # 解码 F0 - f0 = self._decode(hidden, thred) - - return f0 - - def _decode(self, hidden: np.ndarray, thred: float) -> np.ndarray: - """解码隐藏状态为 F0 - 使用官方 RVC 算法""" - # 使用官方的 to_local_average_cents 算法 - cents = self._to_local_average_cents(hidden, thred) - - # 转换 cents 到 Hz - f0 = 10 * (2 ** (cents / 1200)) - f0[f0 == 10] = 0 # cents=0 时 f0=10,需要置零 - - return f0 - - def _to_local_average_cents(self, salience: np.ndarray, thred: float) -> np.ndarray: - """官方 RVC 的 to_local_average_cents 算法""" - # Step 1: 找到每帧的峰值 bin - center = np.argmax(salience, axis=1) # [T] - - # Step 2: 对 salience 进行 padding - salience = np.pad(salience, ((0, 0), (4, 4))) # [T, 368] - center += 4 # 调整 center 索引 - - # Step 3: 提取峰值附近 9 个 bin 的窗口并计算加权平均 - todo_salience = [] - todo_cents_mapping = [] - starts = center - 4 - ends = center + 5 - - for idx in range(salience.shape[0]): - todo_salience.append(salience[idx, starts[idx]:ends[idx]]) - todo_cents_mapping.append(self.cents_mapping[starts[idx]:ends[idx]]) - - todo_salience = np.array(todo_salience) # [T, 9] - todo_cents_mapping = np.array(todo_cents_mapping) # [T, 9] - - # Step 4: 加权平均 - product_sum = np.sum(todo_salience * todo_cents_mapping, axis=1) - weight_sum = np.sum(todo_salience, axis=1) + 1e-9 - cents = product_sum / weight_sum - - # Step 5: 阈值过滤 - 使用原始 salience 的最大值 - maxx = np.max(salience, axis=1) - cents[maxx <= thred] = 0 - - return cents +# -*- coding: utf-8 -*- +""" +RMVPE 模型 - 用于高质量 F0 提取 +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from typing import Optional + + +class BiGRU(nn.Module): + """双向 GRU 层""" + + def __init__(self, input_features: int, 
hidden_features: int, num_layers: int): + super().__init__() + self.gru = nn.GRU( + input_features, + hidden_features, + num_layers=num_layers, + batch_first=True, + bidirectional=True + ) + + def forward(self, x): + return self.gru(x)[0] + + +class ConvBlockRes(nn.Module): + """残差卷积块""" + + def __init__(self, in_channels: int, out_channels: int, momentum: float = 0.01, + force_shortcut: bool = False): + super().__init__() + self.conv = nn.Sequential( + nn.Conv2d(in_channels, out_channels, 3, 1, 1, bias=False), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU() + ) + + # 当通道数不同或强制使用时才创建 shortcut + if in_channels != out_channels or force_shortcut: + self.shortcut = nn.Conv2d(in_channels, out_channels, 1) + self.has_shortcut = True + else: + self.has_shortcut = False + + def forward(self, x): + if self.has_shortcut: + return self.conv(x) + self.shortcut(x) + else: + return self.conv(x) + x + + +class EncoderBlock(nn.Module): + """编码器块 - 包含多个 ConvBlockRes 和一个池化层""" + + def __init__(self, in_channels: int, out_channels: int, kernel_size: int, + n_blocks: int, momentum: float = 0.01): + super().__init__() + self.conv = nn.ModuleList() + self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) + for _ in range(n_blocks - 1): + self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) + self.pool = nn.AvgPool2d(kernel_size) + + def forward(self, x): + for block in self.conv: + x = block(x) + # 返回池化前的张量用于 skip connection + return self.pool(x), x + + +class Encoder(nn.Module): + """RMVPE 编码器""" + + def __init__(self, in_channels: int, in_size: int, n_encoders: int, + kernel_size: int, n_blocks: int, out_channels: int = 16, + momentum: float = 0.01): + super().__init__() + + self.n_encoders = n_encoders + self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) + self.layers = nn.ModuleList() + self.latent_channels = 
[] + + for i in range(n_encoders): + self.layers.append( + EncoderBlock( + in_channels if i == 0 else out_channels * (2 ** (i - 1)), + out_channels * (2 ** i), + kernel_size, + n_blocks, + momentum + ) + ) + self.latent_channels.append(out_channels * (2 ** i)) + + def forward(self, x): + x = self.bn(x) + concat_tensors = [] + for layer in self.layers: + x, skip = layer(x) + concat_tensors.append(skip) + return x, concat_tensors + + +class Intermediate(nn.Module): + """中间层""" + + def __init__(self, in_channels: int, out_channels: int, n_inters: int, + n_blocks: int, momentum: float = 0.01): + super().__init__() + + self.layers = nn.ModuleList() + for i in range(n_inters): + if i == 0: + # 第一层: in_channels -> out_channels (256 -> 512) + self.layers.append( + IntermediateBlock(in_channels, out_channels, n_blocks, momentum, first_block_shortcut=True) + ) + else: + # 后续层: out_channels -> out_channels (512 -> 512) + self.layers.append( + IntermediateBlock(out_channels, out_channels, n_blocks, momentum, first_block_shortcut=False) + ) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + + +class IntermediateBlock(nn.Module): + """中间层块""" + + def __init__(self, in_channels: int, out_channels: int, n_blocks: int, + momentum: float = 0.01, first_block_shortcut: bool = False): + super().__init__() + self.conv = nn.ModuleList() + # 第一个块可能需要强制使用 shortcut + self.conv.append(ConvBlockRes(in_channels, out_channels, momentum, force_shortcut=first_block_shortcut)) + for _ in range(n_blocks - 1): + self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) + + def forward(self, x): + for block in self.conv: + x = block(x) + return x + + +class DecoderBlock(nn.Module): + """解码器块""" + + def __init__(self, in_channels: int, out_channels: int, stride: int, + n_blocks: int, momentum: float = 0.01): + super().__init__() + # conv1: 转置卷积 + BatchNorm (kernel_size=3, stride=stride, padding=1, output_padding=1) + self.conv1 = nn.Sequential( + 
nn.ConvTranspose2d(in_channels, out_channels, 3, stride, padding=1, output_padding=1, bias=False), + nn.BatchNorm2d(out_channels, momentum=momentum) + ) + # conv2: ConvBlockRes 列表 + # 第一个块: in_channels = out_channels * 2 (concat 后), out_channels = out_channels + # 后续块: in_channels = out_channels, out_channels = out_channels + self.conv2 = nn.ModuleList() + self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) + for _ in range(n_blocks - 1): + self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) + + def forward(self, x, concat_tensor): + x = self.conv1(x) + # 处理尺寸不匹配:填充较小的张量使其匹配较大的 + diff_h = concat_tensor.size(2) - x.size(2) + diff_w = concat_tensor.size(3) - x.size(3) + if diff_h != 0 or diff_w != 0: + # 填充 x 使其与 concat_tensor 尺寸匹配 + x = F.pad(x, [0, diff_w, 0, diff_h]) + x = torch.cat([x, concat_tensor], dim=1) + for block in self.conv2: + x = block(x) + return x + + +class Decoder(nn.Module): + """RMVPE 解码器""" + + def __init__(self, in_channels: int, n_decoders: int, stride: int, + n_blocks: int, out_channels: int = 16, momentum: float = 0.01): + super().__init__() + + self.layers = nn.ModuleList() + for i in range(n_decoders): + out_ch = out_channels * (2 ** (n_decoders - 1 - i)) + in_ch = in_channels if i == 0 else out_channels * (2 ** (n_decoders - i)) + self.layers.append( + DecoderBlock(in_ch, out_ch, stride, n_blocks, momentum) + ) + + def forward(self, x, concat_tensors): + for i, layer in enumerate(self.layers): + x = layer(x, concat_tensors[-1 - i]) + return x + + +class DeepUnet(nn.Module): + """Deep U-Net 架构""" + + def __init__(self, kernel_size: int, n_blocks: int, en_de_layers: int = 5, + inter_layers: int = 4, in_channels: int = 1, en_out_channels: int = 16): + super().__init__() + + # Encoder 输出通道: en_out_channels * 2^(en_de_layers-1) = 16 * 16 = 256 + encoder_out_channels = en_out_channels * (2 ** (en_de_layers - 1)) + # Intermediate 输出通道: encoder_out_channels * 2 = 512 + intermediate_out_channels = 
encoder_out_channels * 2 + + self.encoder = Encoder( + in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels + ) + self.intermediate = Intermediate( + encoder_out_channels, + intermediate_out_channels, + inter_layers, n_blocks + ) + self.decoder = Decoder( + intermediate_out_channels, + en_de_layers, kernel_size, n_blocks, en_out_channels + ) + + def forward(self, x): + x, concat_tensors = self.encoder(x) + x = self.intermediate(x) + x = self.decoder(x, concat_tensors) + return x + + +class E2E(nn.Module): + """端到端 RMVPE 模型""" + + def __init__(self, n_blocks: int, n_gru: int, kernel_size: int, + en_de_layers: int = 5, inter_layers: int = 4, + in_channels: int = 1, en_out_channels: int = 16): + super().__init__() + + self.unet = DeepUnet( + kernel_size, n_blocks, en_de_layers, inter_layers, + in_channels, en_out_channels + ) + self.cnn = nn.Conv2d(en_out_channels, 3, 3, 1, 1) + + if n_gru: + self.fc = nn.Sequential( + BiGRU(3 * 128, 256, n_gru), + nn.Linear(512, 360), + nn.Dropout(0.25), + nn.Sigmoid() + ) + else: + self.fc = nn.Sequential( + nn.Linear(3 * 128, 360), + nn.Dropout(0.25), + nn.Sigmoid() + ) + + def forward(self, mel): + # 输入 mel: [B, 128, T] 或 [B, 1, 128, T] + # 官方实现期望 [B, 1, T, 128],即 time 在 height,mel bins 在 width + if mel.dim() == 3: + # [B, 128, T] -> [B, T, 128] -> [B, 1, T, 128] + mel = mel.transpose(-1, -2).unsqueeze(1) + elif mel.dim() == 4 and mel.shape[1] == 1: + # [B, 1, 128, T] -> [B, 1, T, 128] + mel = mel.transpose(-1, -2) + + x = self.unet(mel) + x = self.cnn(x) + # x shape: (batch, 3, T, 128) + # 转换为 (batch, T, 384) 其中 384 = 3 * 128 + x = x.transpose(1, 2).flatten(-2) # (batch, T, 384) + x = self.fc(x) + return x + + +class MelSpectrogram(nn.Module): + """Mel 频谱提取""" + + def __init__(self, n_mel: int = 128, n_fft: int = 1024, win_size: int = 1024, + hop_length: int = 160, sample_rate: int = 16000, + fmin: int = 30, fmax: int = 8000): + super().__init__() + + self.n_fft = n_fft + self.hop_length = hop_length + 
self.win_size = win_size + self.sample_rate = sample_rate + self.n_mel = n_mel + + # 创建 Mel 滤波器组 + mel_basis = self._mel_filterbank(sample_rate, n_fft, n_mel, fmin, fmax) + self.register_buffer("mel_basis", mel_basis) + self.register_buffer("window", torch.hann_window(win_size)) + + def _mel_filterbank(self, sr, n_fft, n_mels, fmin, fmax): + """创建 Mel 滤波器组""" + import librosa + # 必须使用 htk=True,与官方 RVC RMVPE 保持一致 + mel = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=True) + return torch.from_numpy(mel).float() + + def forward(self, audio): + # STFT + spec = torch.stft( + audio, + self.n_fft, + hop_length=self.hop_length, + win_length=self.win_size, + window=self.window, + center=True, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=True + ) + # 使用功率谱(幅度的平方),与官方 RMVPE 一致 + spec = torch.abs(spec) ** 2 + + # Mel 变换 + mel = torch.matmul(self.mel_basis, spec) + mel = torch.log(torch.clamp(mel, min=1e-5)) + + return mel + + +class RMVPE: + """RMVPE F0 提取器封装类""" + + def __init__(self, model_path: str, device: str = "cuda"): + self.device = device + + # 加载模型 + self.model = E2E(n_blocks=4, n_gru=1, kernel_size=2) + ckpt = torch.load(model_path, map_location="cpu", weights_only=False) + self.model.load_state_dict(ckpt) + self.model = self.model.to(device).eval() + + # Mel 频谱提取器 + self.mel_extractor = MelSpectrogram().to(device) + + # 频率映射 + cents_mapping = 20 * np.arange(360) + 1997.3794084376191 + self.cents_mapping = np.pad(cents_mapping, (4, 4)) + + @torch.no_grad() + def infer_from_audio(self, audio: np.ndarray, thred: float = 0.03) -> np.ndarray: + """ + 从音频提取 F0 + + Args: + audio: 16kHz 音频数据 + thred: 置信度阈值 + + Returns: + np.ndarray: F0 序列 + """ + # 转换为张量 + audio = torch.from_numpy(audio).float().to(self.device) + if audio.dim() == 1: + audio = audio.unsqueeze(0) + + # 提取 Mel 频谱: [B, 128, T] + mel = self.mel_extractor(audio) + + # 记录原始帧数 + n_frames = mel.shape[-1] + + # 填充时间维度使其可被 32 整除(5 层池化,每层 /2) + n_pad = 
32 * ((n_frames - 1) // 32 + 1) - n_frames + if n_pad > 0: + mel = F.pad(mel, (0, n_pad), mode='constant', value=0) + + # 模型推理 - E2E.forward 会处理 transpose + hidden = self.model(mel) + + # 移除填充部分,只保留原始帧数 + hidden = hidden[:, :n_frames, :] + hidden = hidden.squeeze(0).cpu().numpy() + + # 解码 F0 + f0 = self._decode(hidden, thred) + + return f0 + + def _decode(self, hidden: np.ndarray, thred: float) -> np.ndarray: + """解码隐藏状态为 F0 - 使用官方 RVC 算法""" + # 使用官方的 to_local_average_cents 算法 + cents = self._to_local_average_cents(hidden, thred) + + # 转换 cents 到 Hz + f0 = 10 * (2 ** (cents / 1200)) + f0[f0 == 10] = 0 # cents=0 时 f0=10,需要置零 + + return f0 + + def _to_local_average_cents(self, salience: np.ndarray, thred: float) -> np.ndarray: + """官方 RVC 的 to_local_average_cents 算法""" + # Step 1: 找到每帧的峰值 bin + center = np.argmax(salience, axis=1) # [T] + + # Step 2: 对 salience 进行 padding + salience = np.pad(salience, ((0, 0), (4, 4))) # [T, 368] + center += 4 # 调整 center 索引 + + # Step 3: 提取峰值附近 9 个 bin 的窗口并计算加权平均 + todo_salience = [] + todo_cents_mapping = [] + starts = center - 4 + ends = center + 5 + + for idx in range(salience.shape[0]): + todo_salience.append(salience[idx, starts[idx]:ends[idx]]) + todo_cents_mapping.append(self.cents_mapping[starts[idx]:ends[idx]]) + + todo_salience = np.array(todo_salience) # [T, 9] + todo_cents_mapping = np.array(todo_cents_mapping) # [T, 9] + + # Step 4: 加权平均 + product_sum = np.sum(todo_salience * todo_cents_mapping, axis=1) + weight_sum = np.sum(todo_salience, axis=1) + 1e-9 + cents = product_sum / weight_sum + + # Step 5: 阈值过滤 - 使用原始 salience 的最大值 + maxx = np.max(salience, axis=1) + cents[maxx <= thred] = 0 + + return cents diff --git a/models/synthesizer.py b/models/synthesizer.py index 675ab707175b3ecfad061d5213d5ac0730a758a3..753b87a63e609e03efa2f0cb52496cf6987e0c1f 100644 --- a/models/synthesizer.py +++ b/models/synthesizer.py @@ -1,853 +1,853 @@ -# -*- coding: utf-8 -*- -""" -RVC v2 合成器模型定义 -""" -import math -import torch -import 
torch.nn as nn -import torch.nn.functional as F -from typing import Optional, Tuple -import numpy as np - - -class LayerNorm(nn.Module): - """Layer normalization for channels-first tensors""" - - def __init__(self, channels: int, eps: float = 1e-5): - super().__init__() - self.channels = channels - self.eps = eps - self.gamma = nn.Parameter(torch.ones(channels)) - self.beta = nn.Parameter(torch.zeros(channels)) - - def forward(self, x): - # x: [B, C, T] - x = x.transpose(1, -1) # [B, T, C] - x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) - return x.transpose(1, -1) # [B, C, T] - - -class MultiHeadAttention(nn.Module): - """Multi-head attention module""" - - def __init__(self, channels: int, out_channels: int, n_heads: int, - p_dropout: float = 0.0, window_size: Optional[int] = None, - heads_share: bool = True, block_length: Optional[int] = None, - proximal_bias: bool = False, proximal_init: bool = False): - super().__init__() - assert channels % n_heads == 0 - - self.channels = channels - self.out_channels = out_channels - self.n_heads = n_heads - self.p_dropout = p_dropout - self.window_size = window_size - self.heads_share = heads_share - self.block_length = block_length - self.proximal_bias = proximal_bias - self.proximal_init = proximal_init - self.attn = None - - self.k_channels = channels // n_heads - self.conv_q = nn.Conv1d(channels, channels, 1) - self.conv_k = nn.Conv1d(channels, channels, 1) - self.conv_v = nn.Conv1d(channels, channels, 1) - self.conv_o = nn.Conv1d(channels, out_channels, 1) - self.drop = nn.Dropout(p_dropout) - - if window_size is not None: - n_heads_rel = 1 if heads_share else n_heads - rel_stddev = self.k_channels ** -0.5 - self.emb_rel_k = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev - ) - self.emb_rel_v = nn.Parameter( - torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev - ) - - nn.init.xavier_uniform_(self.conv_q.weight) - 
nn.init.xavier_uniform_(self.conv_k.weight) - nn.init.xavier_uniform_(self.conv_v.weight) - if proximal_init: - with torch.no_grad(): - self.conv_k.weight.copy_(self.conv_q.weight) - self.conv_k.bias.copy_(self.conv_q.bias) - - def forward(self, x, c, attn_mask=None): - q = self.conv_q(x) - k = self.conv_k(c) - v = self.conv_v(c) - - x, self.attn = self.attention(q, k, v, mask=attn_mask) - - x = self.conv_o(x) - return x - - def attention(self, query, key, value, mask=None): - # query, key, value: [B, C, T] - b, d, t_s = key.size() - t_t = query.size(2) - - query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) - key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) - - scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) - - if self.window_size is not None: - assert t_s == t_t, "Relative attention only for self-attention" - key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) - rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings) - scores_local = self._relative_position_to_absolute_position(rel_logits) - scores = scores + scores_local - - if self.proximal_bias: - assert t_s == t_t, "Proximal bias only for self-attention" - scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) - - if mask is not None: - scores = scores.masked_fill(mask == 0, -1e4) - if self.block_length is not None: - assert t_s == t_t, "Block length only for self-attention" - block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) - scores = scores.masked_fill(block_mask == 0, -1e4) - - p_attn = F.softmax(scores, dim=-1) - p_attn = self.drop(p_attn) - output = torch.matmul(p_attn, value) - - if self.window_size is not None: - relative_weights = self._absolute_position_to_relative_position(p_attn) - 
value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) - output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) - - output = output.transpose(2, 3).contiguous().view(b, d, t_t) - return output, p_attn - - def _matmul_with_relative_values(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0)) - return ret - - def _matmul_with_relative_keys(self, x, y): - ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) - return ret - - def _get_relative_embeddings(self, relative_embeddings, length): - max_relative_position = 2 * self.window_size + 1 - pad_length = max(length - (self.window_size + 1), 0) - slice_start_position = max((self.window_size + 1) - length, 0) - slice_end_position = slice_start_position + 2 * length - 1 - if pad_length > 0: - padded_relative_embeddings = F.pad( - relative_embeddings, - (0, 0, pad_length, pad_length, 0, 0) - ) - else: - padded_relative_embeddings = relative_embeddings - used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position] - return used_relative_embeddings - - def _relative_position_to_absolute_position(self, x): - batch, heads, length, _ = x.size() - x = F.pad(x, (0, 1, 0, 0, 0, 0, 0, 0)) - x_flat = x.view(batch, heads, length * 2 * length) - x_flat = F.pad(x_flat, (0, length - 1, 0, 0, 0, 0)) - x_final = x_flat.view(batch, heads, length + 1, 2 * length - 1)[:, :, :length, length - 1:] - return x_final - - def _absolute_position_to_relative_position(self, x): - batch, heads, length, _ = x.size() - x = F.pad(x, (0, length - 1, 0, 0, 0, 0, 0, 0)) - x_flat = x.view(batch, heads, length ** 2 + length * (length - 1)) - x_flat = F.pad(x_flat, (length, 0, 0, 0, 0, 0)) - x_final = x_flat.view(batch, heads, length, 2 * length)[:, :, :, 1:] - return x_final - - def _attention_bias_proximal(self, length): - r = torch.arange(length, dtype=torch.float32) - diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) - return 
torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) - - -class FFN(nn.Module): - """Feed-forward network with optional causal convolution""" - - def __init__(self, in_channels: int, out_channels: int, filter_channels: int, - kernel_size: int, p_dropout: float = 0.0, activation: str = None, - causal: bool = False): - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.filter_channels = filter_channels - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.activation = activation - self.causal = causal - - if causal: - self.padding = self._causal_padding - else: - self.padding = self._same_padding - - self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) - self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) - self.drop = nn.Dropout(p_dropout) - - def forward(self, x, x_mask): - x = self.conv_1(self.padding(x)) - if self.activation == "gelu": - x = x * torch.sigmoid(1.702 * x) - else: - x = torch.relu(x) - x = self.drop(x) - x = self.conv_2(self.padding(x)) - return x * x_mask - - def _causal_padding(self, x): - if self.kernel_size == 1: - return x - pad_l = self.kernel_size - 1 - pad_r = 0 - return F.pad(x, (pad_l, pad_r, 0, 0, 0, 0)) - - def _same_padding(self, x): - if self.kernel_size == 1: - return x - pad_l = (self.kernel_size - 1) // 2 - pad_r = self.kernel_size // 2 - return F.pad(x, (pad_l, pad_r, 0, 0, 0, 0)) - - -class Encoder(nn.Module): - """Transformer encoder with multi-head attention""" - - def __init__(self, hidden_channels: int, filter_channels: int, n_heads: int, - n_layers: int, kernel_size: int = 1, p_dropout: float = 0.0, - window_size: int = 10): - super().__init__() - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.window_size = window_size - - self.drop = nn.Dropout(p_dropout) - 
self.attn_layers = nn.ModuleList() - self.norm_layers_1 = nn.ModuleList() - self.ffn_layers = nn.ModuleList() - self.norm_layers_2 = nn.ModuleList() - - for _ in range(n_layers): - self.attn_layers.append( - MultiHeadAttention( - hidden_channels, hidden_channels, n_heads, - p_dropout=p_dropout, window_size=window_size - ) - ) - self.norm_layers_1.append(LayerNorm(hidden_channels)) - self.ffn_layers.append( - FFN(hidden_channels, hidden_channels, filter_channels, - kernel_size, p_dropout=p_dropout) - ) - self.norm_layers_2.append(LayerNorm(hidden_channels)) - - def forward(self, x, x_mask): - attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) - x = x * x_mask - for i in range(self.n_layers): - y = self.attn_layers[i](x, x, attn_mask) - y = self.drop(y) - x = self.norm_layers_1[i](x + y) - - y = self.ffn_layers[i](x, x_mask) - y = self.drop(y) - x = self.norm_layers_2[i](x + y) - x = x * x_mask - return x - - -class TextEncoder(nn.Module): - """Text encoder for RVC - encodes phone and pitch embeddings""" - - def __init__(self, out_channels: int, hidden_channels: int, filter_channels: int, - n_heads: int, n_layers: int, kernel_size: int, p_dropout: float, - f0: bool = True): - super().__init__() - self.out_channels = out_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.f0 = f0 - - # Phone embedding: Linear projection from 768-dim HuBERT features - self.emb_phone = nn.Linear(768, hidden_channels) - - # Pitch embedding (only if f0 is enabled) - if f0: - self.emb_pitch = nn.Embedding(256, hidden_channels) - - # Transformer encoder - self.encoder = Encoder( - hidden_channels, filter_channels, n_heads, n_layers, - kernel_size, p_dropout - ) - - # Output projection to mean and log-variance - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, phone, pitch, lengths): - """ - Args: 
- phone: [B, 768, T] phone features from HuBERT (channels first) - pitch: [B, T] pitch indices (0-255) - lengths: [B] sequence lengths - - Returns: - m: [B, out_channels, T] mean - logs: [B, out_channels, T] log-variance - x_mask: [B, 1, T] mask - """ - import logging - log = logging.getLogger(__name__) - - log.debug(f"[TextEncoder] 输入 phone: shape={phone.shape}") - log.debug(f"[TextEncoder] 输入 pitch: shape={pitch.shape}, max={pitch.max().item()}, min={pitch.min().item()}") - log.debug(f"[TextEncoder] 输入 lengths: {lengths}") - - # Transpose phone from [B, C, T] to [B, T, C] for linear layer - phone = phone.transpose(1, 2) # [B, T, 768] - log.debug(f"[TextEncoder] 转置后 phone: shape={phone.shape}") - - # Create mask - x_mask = torch.unsqueeze( - self._sequence_mask(lengths, phone.size(1)), 1 - ).to(phone.dtype) - log.debug(f"[TextEncoder] x_mask: shape={x_mask.shape}, sum={x_mask.sum().item()}") - - # Phone embedding - x = self.emb_phone(phone) # [B, T, hidden_channels] - log.debug(f"[TextEncoder] emb_phone 输出: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}") - - # Add pitch embedding if enabled - if self.f0 and pitch is not None: - # Clamp pitch to valid range - pitch_clamped = torch.clamp(pitch, 0, 255) - pitch_emb = self.emb_pitch(pitch_clamped) - log.debug(f"[TextEncoder] emb_pitch 输出: shape={pitch_emb.shape}, max={pitch_emb.abs().max().item():.4f}") - x = x + pitch_emb - - # Transpose for conv layers: [B, hidden_channels, T] - x = x.transpose(1, 2) - log.debug(f"[TextEncoder] 转置后 x: shape={x.shape}") - - # Apply mask - x = x * x_mask - - # Transformer encoder - x = self.encoder(x, x_mask) - log.debug(f"[TextEncoder] Transformer 输出: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}") - - # Project to mean and log-variance - stats = self.proj(x) * x_mask - m, logs = torch.split(stats, self.out_channels, dim=1) - log.debug(f"[TextEncoder] 最终输出 m: shape={m.shape}, max={m.abs().max().item():.4f}") - 
log.debug(f"[TextEncoder] 最终输出 logs: shape={logs.shape}, max={logs.max().item():.4f}, min={logs.min().item():.4f}") - - return m, logs, x_mask - - def _sequence_mask(self, length, max_length=None): - if max_length is None: - max_length = length.max() - x = torch.arange(max_length, dtype=length.dtype, device=length.device) - return x.unsqueeze(0) < length.unsqueeze(1) - - -class ResidualCouplingBlock(nn.Module): - """残差耦合块""" - - def __init__(self, channels: int, hidden_channels: int, kernel_size: int, - dilation_rate: int, n_layers: int, n_flows: int = 4, - gin_channels: int = 0): - super().__init__() - self.flows = nn.ModuleList() - - for _ in range(n_flows): - self.flows.append( - ResidualCouplingLayer( - channels, hidden_channels, kernel_size, - dilation_rate, n_layers, gin_channels=gin_channels - ) - ) - self.flows.append(Flip()) - - def forward(self, x, x_mask, g=None, reverse=False): - if not reverse: - for flow in self.flows: - x, _ = flow(x, x_mask, g=g, reverse=reverse) - else: - for flow in reversed(self.flows): - x = flow(x, x_mask, g=g, reverse=reverse) - return x - - -class ResidualCouplingLayer(nn.Module): - """残差耦合层""" - - def __init__(self, channels: int, hidden_channels: int, kernel_size: int, - dilation_rate: int, n_layers: int, mean_only: bool = True, - gin_channels: int = 0): - super().__init__() - self.half_channels = channels // 2 - self.mean_only = mean_only - - self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) - self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels) - self.post = nn.Conv1d(hidden_channels, self.half_channels, 1) - self.post.weight.data.zero_() - self.post.bias.data.zero_() - - def forward(self, x, x_mask, g=None, reverse=False): - x0, x1 = torch.split(x, [self.half_channels] * 2, dim=1) - h = self.pre(x0) * x_mask - h = self.enc(h, x_mask, g=g) - stats = self.post(h) * x_mask - m = stats - - if not reverse: - x1 = m + x1 * x_mask - x = torch.cat([x0, x1], dim=1) - return x, None - else: 
- x1 = (x1 - m) * x_mask - x = torch.cat([x0, x1], dim=1) - return x - - -class Flip(nn.Module): - """翻转层""" - - def forward(self, x, *args, reverse=False, **kwargs): - x = torch.flip(x, [1]) - return x - - -class WN(nn.Module): - """WaveNet 风格网络 (带权重归一化)""" - - def __init__(self, hidden_channels: int, kernel_size: int, - dilation_rate: int, n_layers: int, gin_channels: int = 0, - p_dropout: float = 0): - super().__init__() - self.n_layers = n_layers - self.hidden_channels = hidden_channels - self.gin_channels = gin_channels - - self.in_layers = nn.ModuleList() - self.res_skip_layers = nn.ModuleList() - self.drop = nn.Dropout(p_dropout) - - if gin_channels > 0: - self.cond_layer = nn.utils.weight_norm( - nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1) - ) - - for i in range(n_layers): - dilation = dilation_rate ** i - padding = (kernel_size * dilation - dilation) // 2 - self.in_layers.append( - nn.utils.weight_norm( - nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, - dilation=dilation, padding=padding) - ) - ) - # 前 n-1 层输出 2 * hidden_channels,最后一层输出 hidden_channels - if i < n_layers - 1: - res_skip_channels = 2 * hidden_channels - else: - res_skip_channels = hidden_channels - self.res_skip_layers.append( - nn.utils.weight_norm( - nn.Conv1d(hidden_channels, res_skip_channels, 1) - ) - ) - - def forward(self, x, x_mask, g=None): - output = torch.zeros_like(x) - - if g is not None and self.gin_channels > 0: - g = self.cond_layer(g) - - for i in range(self.n_layers): - x_in = self.in_layers[i](x) - if g is not None: - cond_offset = i * 2 * self.hidden_channels - g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :] - x_in = x_in + g_l - - acts = torch.tanh(x_in[:, :self.hidden_channels]) * torch.sigmoid(x_in[:, self.hidden_channels:]) - acts = self.drop(acts) - res_skip = self.res_skip_layers[i](acts) - - if i < self.n_layers - 1: - # 前 n-1 层:residual + skip - x = (x + res_skip[:, :self.hidden_channels]) * x_mask - output = output + 
res_skip[:, self.hidden_channels:] - else: - # 最后一层:只有 residual,加到 output - x = (x + res_skip) * x_mask - output = output + res_skip - - return output * x_mask - - -class PosteriorEncoder(nn.Module): - """后验编码器""" - - def __init__(self, in_channels: int, out_channels: int, hidden_channels: int, - kernel_size: int, dilation_rate: int, n_layers: int, - gin_channels: int = 0): - super().__init__() - self.out_channels = out_channels - - self.pre = nn.Conv1d(in_channels, hidden_channels, 1) - self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels) - self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) - - def forward(self, x, x_lengths, g=None): - x_mask = torch.unsqueeze( - self._sequence_mask(x_lengths, x.size(2)), 1 - ).to(x.dtype) - - x = self.pre(x) * x_mask - x = self.enc(x, x_mask, g=g) - stats = self.proj(x) * x_mask - m, logs = torch.split(stats, self.out_channels, dim=1) - z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask - return z, m, logs, x_mask - - def _sequence_mask(self, length, max_length=None): - if max_length is None: - max_length = length.max() - x = torch.arange(max_length, dtype=length.dtype, device=length.device) - return x.unsqueeze(0) < length.unsqueeze(1) - - -class Generator(nn.Module): - """NSF-HiFi-GAN 生成器 (带权重归一化)""" - - def __init__(self, initial_channel: int, resblock_kernel_sizes: list, - resblock_dilation_sizes: list, upsample_rates: list, - upsample_initial_channel: int, upsample_kernel_sizes: list, - gin_channels: int = 0, sr: int = 40000, is_half: bool = False): - super().__init__() - self.num_kernels = len(resblock_kernel_sizes) - self.num_upsamples = len(upsample_rates) - self.sr = sr - self.is_half = is_half - - # 计算上采样因子 - self.upp = int(np.prod(upsample_rates)) - - self.conv_pre = nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, 3) - - # NSF 源模块 - self.m_source = SourceModuleHnNSF(sample_rate=sr, harmonic_num=0) - - # 噪声卷积层 - self.noise_convs = nn.ModuleList() - - self.ups = 
nn.ModuleList() - for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): - c_cur = upsample_initial_channel // (2 ** (i + 1)) - self.ups.append( - nn.utils.weight_norm( - nn.ConvTranspose1d( - upsample_initial_channel // (2 ** i), - c_cur, - k, u, (k - u) // 2 - ) - ) - ) - # 噪声卷积 - if i + 1 < len(upsample_rates): - stride_f0 = int(np.prod(upsample_rates[i + 1:])) - self.noise_convs.append( - nn.Conv1d(1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2) - ) - else: - self.noise_convs.append(nn.Conv1d(1, c_cur, kernel_size=1)) - - self.resblocks = nn.ModuleList() - for i in range(len(self.ups)): - ch = upsample_initial_channel // (2 ** (i + 1)) - for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): - self.resblocks.append(ResBlock(ch, k, d)) - - self.conv_post = nn.Conv1d(ch, 1, 7, 1, 3, bias=False) - - if gin_channels > 0: - self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) - - def forward(self, x, f0, g=None): - import logging - log = logging.getLogger(__name__) - - log.debug(f"[Generator] 输入 x: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}") - log.debug(f"[Generator] 输入 f0: shape={f0.shape}, max={f0.max().item():.1f}, min={f0.min().item():.1f}") - if g is not None: - log.debug(f"[Generator] 输入 g: shape={g.shape}, max={g.abs().max().item():.4f}") - - # 生成 NSF 激励信号 - har_source, _, _ = self.m_source(f0, self.upp) - har_source = har_source.transpose(1, 2) # [B, 1, T*upp] - log.debug(f"[Generator] NSF har_source: shape={har_source.shape}, max={har_source.abs().max().item():.4f}") - - x = self.conv_pre(x) - log.debug(f"[Generator] conv_pre 输出: shape={x.shape}, max={x.abs().max().item():.4f}") - - if g is not None: - x = x + self.cond(g) - log.debug(f"[Generator] 加入条件后: max={x.abs().max().item():.4f}") - - for i in range(self.num_upsamples): - x = F.leaky_relu(x, 0.1) - x = self.ups[i](x) - - # 融合噪声 - x_source = self.noise_convs[i](har_source) - x 
= x + x_source - - xs = None - for j in range(self.num_kernels): - if xs is None: - xs = self.resblocks[i * self.num_kernels + j](x) - else: - xs += self.resblocks[i * self.num_kernels + j](x) - x = xs / self.num_kernels - log.debug(f"[Generator] 上采样层 {i}: shape={x.shape}, max={x.abs().max().item():.4f}") - - x = F.leaky_relu(x) - x = self.conv_post(x) - log.debug(f"[Generator] conv_post 输出: shape={x.shape}, max={x.abs().max().item():.4f}") - x = torch.tanh(x) - log.debug(f"[Generator] tanh 输出: shape={x.shape}, max={x.abs().max().item():.4f}") - - return x - - def remove_weight_norm(self): - for l in self.ups: - nn.utils.remove_weight_norm(l) - for l in self.resblocks: - l.remove_weight_norm() - - -class ResBlock(nn.Module): - """残差块 (带权重归一化)""" - - def __init__(self, channels: int, kernel_size: int = 3, dilation: tuple = (1, 3, 5)): - super().__init__() - self.convs1 = nn.ModuleList([ - nn.utils.weight_norm( - nn.Conv1d(channels, channels, kernel_size, 1, - (kernel_size * d - d) // 2, dilation=d) - ) - for d in dilation - ]) - self.convs2 = nn.ModuleList([ - nn.utils.weight_norm( - nn.Conv1d(channels, channels, kernel_size, 1, - (kernel_size - 1) // 2) - ) - for _ in dilation - ]) - - def forward(self, x): - for c1, c2 in zip(self.convs1, self.convs2): - xt = F.leaky_relu(x, 0.1) - xt = c1(xt) - xt = F.leaky_relu(xt, 0.1) - xt = c2(xt) - x = xt + x - return x - - def remove_weight_norm(self): - for l in self.convs1: - nn.utils.remove_weight_norm(l) - for l in self.convs2: - nn.utils.remove_weight_norm(l) - - -class SineGenerator(nn.Module): - """正弦波生成器 - NSF 的核心组件""" - - def __init__(self, sample_rate: int, harmonic_num: int = 0, - sine_amp: float = 0.1, noise_std: float = 0.003, - voiced_threshold: float = 10): - super().__init__() - self.sample_rate = sample_rate - self.harmonic_num = harmonic_num - self.sine_amp = sine_amp - self.noise_std = noise_std - self.voiced_threshold = voiced_threshold - self.dim = harmonic_num + 1 - - def forward(self, f0: 
torch.Tensor, upp: int): - """ - 生成正弦波激励信号 - - Args: - f0: 基频张量 [B, T] - upp: 上采样因子 - - Returns: - 正弦波信号 [B, T*upp, 1] - """ - with torch.no_grad(): - # 上采样 F0 - f0 = f0.unsqueeze(1) # [B, 1, T] - f0_up = F.interpolate(f0, scale_factor=upp, mode='nearest') - f0_up = f0_up.transpose(1, 2) # [B, T*upp, 1] - - # 生成正弦波 - rad = f0_up / self.sample_rate # 归一化频率 - rad_acc = torch.cumsum(rad, dim=1) % 1 # 累积相位 - sine_wave = torch.sin(2 * np.pi * rad_acc) * self.sine_amp - - # 静音区域(F0=0)使用噪声 - voiced_mask = (f0_up > self.voiced_threshold).float() - noise = torch.randn_like(sine_wave) * self.noise_std - sine_wave = sine_wave * voiced_mask + noise * (1 - voiced_mask) - - return sine_wave - - -class SourceModuleHnNSF(nn.Module): - """谐波加噪声源模块""" - - def __init__(self, sample_rate: int, harmonic_num: int = 0, - sine_amp: float = 0.1, noise_std: float = 0.003, - add_noise_std: float = 0.003): - super().__init__() - self.sine_generator = SineGenerator( - sample_rate, harmonic_num, sine_amp, noise_std - ) - self.l_linear = nn.Linear(harmonic_num + 1, 1) - self.l_tanh = nn.Tanh() - - def forward(self, f0: torch.Tensor, upp: int): - sine = self.sine_generator(f0, upp) # [B, T*upp, 1] - sine = self.l_tanh(self.l_linear(sine)) - noise = torch.randn_like(sine) * 0.003 - return sine, noise, None # 返回 3 个值以匹配接口 - - -class SynthesizerTrnMs768NSFsid(nn.Module): - """RVC v2 合成器 (768 维 HuBERT + NSF + SID)""" - - def __init__(self, spec_channels: int, segment_size: int, - inter_channels: int, hidden_channels: int, filter_channels: int, - n_heads: int, n_layers: int, kernel_size: int, p_dropout: float, - resblock: str, resblock_kernel_sizes: list, - resblock_dilation_sizes: list, upsample_rates: list, - upsample_initial_channel: int, upsample_kernel_sizes: list, - spk_embed_dim: int, gin_channels: int, sr: int): - super().__init__() - - self.spec_channels = spec_channels - self.inter_channels = inter_channels - self.hidden_channels = hidden_channels - self.filter_channels = filter_channels - 
self.n_heads = n_heads - self.n_layers = n_layers - self.kernel_size = kernel_size - self.p_dropout = p_dropout - self.resblock = resblock - self.resblock_kernel_sizes = resblock_kernel_sizes - self.resblock_dilation_sizes = resblock_dilation_sizes - self.upsample_rates = upsample_rates - self.upsample_initial_channel = upsample_initial_channel - self.upsample_kernel_sizes = upsample_kernel_sizes - self.segment_size = segment_size - self.gin_channels = gin_channels - self.spk_embed_dim = spk_embed_dim - self.sr = sr - - # 文本编码器 (使用 TextEncoder 替代 PosteriorEncoder) - self.enc_p = TextEncoder( - inter_channels, hidden_channels, filter_channels, - n_heads, n_layers, kernel_size, p_dropout, f0=True - ) - - # 解码器/生成器 (NSF-HiFiGAN,内部包含 m_source) - self.dec = Generator( - inter_channels, resblock_kernel_sizes, resblock_dilation_sizes, - upsample_rates, upsample_initial_channel, upsample_kernel_sizes, - gin_channels, sr=sr - ) - - # 流 - self.flow = ResidualCouplingBlock( - inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels - ) - - # 说话人嵌入 - self.emb_g = nn.Embedding(spk_embed_dim, gin_channels) - - def forward(self, phone, phone_lengths, pitch, nsff0, sid, skip_head=0, return_length=0): - """前向传播""" - g = self.emb_g(sid).unsqueeze(-1) - - # TextEncoder 返回 mean 和 log-variance - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - - # 在编码器外部采样 - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - - # 正向 flow - z = self.flow(z_p, x_mask, g=g) - - # 生成音频 (传入 f0) - o = self.dec(z, nsff0, g=g) - - return o - - def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=1.0): - """推理""" - import logging - log = logging.getLogger(__name__) - - log.debug(f"[infer] 输入 phone: shape={phone.shape}, dtype={phone.dtype}") - log.debug(f"[infer] 输入 phone 统计: max={phone.abs().max().item():.4f}, mean={phone.abs().mean().item():.4f}") - log.debug(f"[infer] 输入 phone_lengths: {phone_lengths}") - log.debug(f"[infer] 输入 pitch: 
shape={pitch.shape}, max={pitch.max().item()}, min={pitch.min().item()}") - log.debug(f"[infer] 输入 nsff0: shape={nsff0.shape}, max={nsff0.max().item():.1f}, min={nsff0.min().item():.1f}") - log.debug(f"[infer] 输入 sid: {sid}") - - g = self.emb_g(sid).unsqueeze(-1) - log.debug(f"[infer] 说话人嵌入 g: shape={g.shape}, max={g.abs().max().item():.4f}") - - # TextEncoder 返回 mean 和 log-variance - m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) - log.debug(f"[infer] TextEncoder 输出:") - log.debug(f"[infer] m_p: shape={m_p.shape}, max={m_p.abs().max().item():.4f}, mean={m_p.abs().mean().item():.4f}") - log.debug(f"[infer] logs_p: shape={logs_p.shape}, max={logs_p.max().item():.4f}, min={logs_p.min().item():.4f}") - log.debug(f"[infer] x_mask: shape={x_mask.shape}, sum={x_mask.sum().item()}") - - # 在编码器外部采样 (使用较小的噪声系数以获得更稳定的输出) - z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask - log.debug(f"[infer] 采样后 z_p: shape={z_p.shape}, max={z_p.abs().max().item():.4f}, mean={z_p.abs().mean().item():.4f}") - - # 反向 flow - z = self.flow(z_p, x_mask, g=g, reverse=True) - log.debug(f"[infer] Flow 输出 z: shape={z.shape}, max={z.abs().max().item():.4f}, mean={z.abs().mean().item():.4f}") - - # 生成音频 (传入 f0,Generator 内部会生成 NSF 激励信号) - o = self.dec(z * x_mask, nsff0, g=g) - log.debug(f"[infer] Generator 输出 o: shape={o.shape}, max={o.abs().max().item():.4f}, mean={o.abs().mean().item():.4f}") - - return o, x_mask +# -*- coding: utf-8 -*- +""" +RVC v2 合成器模型定义 +""" +import math +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Optional, Tuple +import numpy as np + + +class LayerNorm(nn.Module): + """Layer normalization for channels-first tensors""" + + def __init__(self, channels: int, eps: float = 1e-5): + super().__init__() + self.channels = channels + self.eps = eps + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + # x: [B, C, T] + x = 
x.transpose(1, -1) # [B, T, C] + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) # [B, C, T] + + +class MultiHeadAttention(nn.Module): + """Multi-head attention module""" + + def __init__(self, channels: int, out_channels: int, n_heads: int, + p_dropout: float = 0.0, window_size: Optional[int] = None, + heads_share: bool = True, block_length: Optional[int] = None, + proximal_bias: bool = False, proximal_init: bool = False): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels ** -0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev + ) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): 
+ # query, key, value: [B, C, T] + b, d, t_s = key.size() + t_t = query.size(2) + + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + + if self.window_size is not None: + assert t_s == t_t, "Relative attention only for self-attention" + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + + if self.proximal_bias: + assert t_s == t_t, "Proximal bias only for self-attention" + scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype) + + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert t_s == t_t, "Block length only for self-attention" + block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length) + scores = scores.masked_fill(block_mask == 0, -1e4) + + p_attn = F.softmax(scores, dim=-1) + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s) + output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings) + + output = output.transpose(2, 3).contiguous().view(b, d, t_t) + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + 
def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + (0, 0, pad_length, pad_length, 0, 0) + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + batch, heads, length, _ = x.size() + x = F.pad(x, (0, 1, 0, 0, 0, 0, 0, 0)) + x_flat = x.view(batch, heads, length * 2 * length) + x_flat = F.pad(x_flat, (0, length - 1, 0, 0, 0, 0)) + x_final = x_flat.view(batch, heads, length + 1, 2 * length - 1)[:, :, :length, length - 1:] + return x_final + + def _absolute_position_to_relative_position(self, x): + batch, heads, length, _ = x.size() + x = F.pad(x, (0, length - 1, 0, 0, 0, 0, 0, 0)) + x_flat = x.view(batch, heads, length ** 2 + length * (length - 1)) + x_flat = F.pad(x_flat, (length, 0, 0, 0, 0, 0)) + x_final = x_flat.view(batch, heads, length, 2 * length)[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + """Feed-forward network with optional causal convolution""" + + def __init__(self, in_channels: int, out_channels: int, filter_channels: int, + kernel_size: int, p_dropout: float = 0.0, activation: str = None, + causal: bool = False): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = 
p_dropout + self.activation = activation + self.causal = causal + + if causal: + self.padding = self._causal_padding + else: + self.padding = self._same_padding + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(self.padding(x)) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(self.padding(x)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + return F.pad(x, (pad_l, pad_r, 0, 0, 0, 0)) + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + return F.pad(x, (pad_l, pad_r, 0, 0, 0, 0)) + + +class Encoder(nn.Module): + """Transformer encoder with multi-head attention""" + + def __init__(self, hidden_channels: int, filter_channels: int, n_heads: int, + n_layers: int, kernel_size: int = 1, p_dropout: float = 0.0, + window_size: int = 10): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + + for _ in range(n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, hidden_channels, n_heads, + p_dropout=p_dropout, window_size=window_size + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN(hidden_channels, hidden_channels, filter_channels, + kernel_size, p_dropout=p_dropout) + ) + 
self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class TextEncoder(nn.Module): + """Text encoder for RVC - encodes phone and pitch embeddings""" + + def __init__(self, out_channels: int, hidden_channels: int, filter_channels: int, + n_heads: int, n_layers: int, kernel_size: int, p_dropout: float, + f0: bool = True): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.f0 = f0 + + # Phone embedding: Linear projection from 768-dim HuBERT features + self.emb_phone = nn.Linear(768, hidden_channels) + + # Pitch embedding (only if f0 is enabled) + if f0: + self.emb_pitch = nn.Embedding(256, hidden_channels) + + # Transformer encoder + self.encoder = Encoder( + hidden_channels, filter_channels, n_heads, n_layers, + kernel_size, p_dropout + ) + + # Output projection to mean and log-variance + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + """ + Args: + phone: [B, 768, T] phone features from HuBERT (channels first) + pitch: [B, T] pitch indices (0-255) + lengths: [B] sequence lengths + + Returns: + m: [B, out_channels, T] mean + logs: [B, out_channels, T] log-variance + x_mask: [B, 1, T] mask + """ + import logging + log = logging.getLogger(__name__) + + log.debug(f"[TextEncoder] 输入 phone: shape={phone.shape}") + log.debug(f"[TextEncoder] 输入 pitch: shape={pitch.shape}, max={pitch.max().item()}, min={pitch.min().item()}") + log.debug(f"[TextEncoder] 输入 
lengths: {lengths}") + + # Transpose phone from [B, C, T] to [B, T, C] for linear layer + phone = phone.transpose(1, 2) # [B, T, 768] + log.debug(f"[TextEncoder] 转置后 phone: shape={phone.shape}") + + # Create mask + x_mask = torch.unsqueeze( + self._sequence_mask(lengths, phone.size(1)), 1 + ).to(phone.dtype) + log.debug(f"[TextEncoder] x_mask: shape={x_mask.shape}, sum={x_mask.sum().item()}") + + # Phone embedding + x = self.emb_phone(phone) # [B, T, hidden_channels] + log.debug(f"[TextEncoder] emb_phone 输出: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}") + + # Add pitch embedding if enabled + if self.f0 and pitch is not None: + # Clamp pitch to valid range + pitch_clamped = torch.clamp(pitch, 0, 255) + pitch_emb = self.emb_pitch(pitch_clamped) + log.debug(f"[TextEncoder] emb_pitch 输出: shape={pitch_emb.shape}, max={pitch_emb.abs().max().item():.4f}") + x = x + pitch_emb + + # Transpose for conv layers: [B, hidden_channels, T] + x = x.transpose(1, 2) + log.debug(f"[TextEncoder] 转置后 x: shape={x.shape}") + + # Apply mask + x = x * x_mask + + # Transformer encoder + x = self.encoder(x, x_mask) + log.debug(f"[TextEncoder] Transformer 输出: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}") + + # Project to mean and log-variance + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + log.debug(f"[TextEncoder] 最终输出 m: shape={m.shape}, max={m.abs().max().item():.4f}") + log.debug(f"[TextEncoder] 最终输出 logs: shape={logs.shape}, max={logs.max().item():.4f}, min={logs.min().item():.4f}") + + return m, logs, x_mask + + def _sequence_mask(self, length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +class ResidualCouplingBlock(nn.Module): + """残差耦合块""" + + def __init__(self, channels: int, hidden_channels: int, kernel_size: int, + 
dilation_rate: int, n_layers: int, n_flows: int = 4, + gin_channels: int = 0): + super().__init__() + self.flows = nn.ModuleList() + + for _ in range(n_flows): + self.flows.append( + ResidualCouplingLayer( + channels, hidden_channels, kernel_size, + dilation_rate, n_layers, gin_channels=gin_channels + ) + ) + self.flows.append(Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + +class ResidualCouplingLayer(nn.Module): + """残差耦合层""" + + def __init__(self, channels: int, hidden_channels: int, kernel_size: int, + dilation_rate: int, n_layers: int, mean_only: bool = True, + gin_channels: int = 0): + super().__init__() + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels) + self.post = nn.Conv1d(hidden_channels, self.half_channels, 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, dim=1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + m = stats + + if not reverse: + x1 = m + x1 * x_mask + x = torch.cat([x0, x1], dim=1) + return x, None + else: + x1 = (x1 - m) * x_mask + x = torch.cat([x0, x1], dim=1) + return x + + +class Flip(nn.Module): + """翻转层""" + + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + return x + + +class WN(nn.Module): + """WaveNet 风格网络 (带权重归一化)""" + + def __init__(self, hidden_channels: int, kernel_size: int, + dilation_rate: int, n_layers: int, gin_channels: int = 0, + p_dropout: float = 0): + super().__init__() + self.n_layers = n_layers + self.hidden_channels = hidden_channels + 
self.gin_channels = gin_channels + + self.in_layers = nn.ModuleList() + self.res_skip_layers = nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels > 0: + self.cond_layer = nn.utils.weight_norm( + nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1) + ) + + for i in range(n_layers): + dilation = dilation_rate ** i + padding = (kernel_size * dilation - dilation) // 2 + self.in_layers.append( + nn.utils.weight_norm( + nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size, + dilation=dilation, padding=padding) + ) + ) + # 前 n-1 层输出 2 * hidden_channels,最后一层输出 hidden_channels + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + self.res_skip_layers.append( + nn.utils.weight_norm( + nn.Conv1d(hidden_channels, res_skip_channels, 1) + ) + ) + + def forward(self, x, x_mask, g=None): + output = torch.zeros_like(x) + + if g is not None and self.gin_channels > 0: + g = self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :] + x_in = x_in + g_l + + acts = torch.tanh(x_in[:, :self.hidden_channels]) * torch.sigmoid(x_in[:, self.hidden_channels:]) + acts = self.drop(acts) + res_skip = self.res_skip_layers[i](acts) + + if i < self.n_layers - 1: + # 前 n-1 层:residual + skip + x = (x + res_skip[:, :self.hidden_channels]) * x_mask + output = output + res_skip[:, self.hidden_channels:] + else: + # 最后一层:只有 residual,加到 output + x = (x + res_skip) * x_mask + output = output + res_skip + + return output * x_mask + + +class PosteriorEncoder(nn.Module): + """后验编码器""" + + def __init__(self, in_channels: int, out_channels: int, hidden_channels: int, + kernel_size: int, dilation_rate: int, n_layers: int, + gin_channels: int = 0): + super().__init__() + self.out_channels = out_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + 
self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze( + self._sequence_mask(x_lengths, x.size(2)), 1 + ).to(x.dtype) + + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def _sequence_mask(self, length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +class Generator(nn.Module): + """NSF-HiFi-GAN 生成器 (带权重归一化)""" + + def __init__(self, initial_channel: int, resblock_kernel_sizes: list, + resblock_dilation_sizes: list, upsample_rates: list, + upsample_initial_channel: int, upsample_kernel_sizes: list, + gin_channels: int = 0, sr: int = 40000, is_half: bool = False): + super().__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.sr = sr + self.is_half = is_half + + # 计算上采样因子 + self.upp = int(np.prod(upsample_rates)) + + self.conv_pre = nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, 3) + + # NSF 源模块 + self.m_source = SourceModuleHnNSF(sample_rate=sr, harmonic_num=0) + + # 噪声卷积层 + self.noise_convs = nn.ModuleList() + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + nn.utils.weight_norm( + nn.ConvTranspose1d( + upsample_initial_channel // (2 ** i), + c_cur, + k, u, (k - u) // 2 + ) + ) + ) + # 噪声卷积 + if i + 1 < len(upsample_rates): + stride_f0 = int(np.prod(upsample_rates[i + 1:])) + self.noise_convs.append( + nn.Conv1d(1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 
// 2) + ) + else: + self.noise_convs.append(nn.Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)): + self.resblocks.append(ResBlock(ch, k, d)) + + self.conv_post = nn.Conv1d(ch, 1, 7, 1, 3, bias=False) + + if gin_channels > 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, f0, g=None): + import logging + log = logging.getLogger(__name__) + + log.debug(f"[Generator] 输入 x: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}") + log.debug(f"[Generator] 输入 f0: shape={f0.shape}, max={f0.max().item():.1f}, min={f0.min().item():.1f}") + if g is not None: + log.debug(f"[Generator] 输入 g: shape={g.shape}, max={g.abs().max().item():.4f}") + + # 生成 NSF 激励信号 + har_source, _, _ = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) # [B, 1, T*upp] + log.debug(f"[Generator] NSF har_source: shape={har_source.shape}, max={har_source.abs().max().item():.4f}") + + x = self.conv_pre(x) + log.debug(f"[Generator] conv_pre 输出: shape={x.shape}, max={x.abs().max().item():.4f}") + + if g is not None: + x = x + self.cond(g) + log.debug(f"[Generator] 加入条件后: max={x.abs().max().item():.4f}") + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, 0.1) + x = self.ups[i](x) + + # 融合噪声 + x_source = self.noise_convs[i](har_source) + x = x + x_source + + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + log.debug(f"[Generator] 上采样层 {i}: shape={x.shape}, max={x.abs().max().item():.4f}") + + x = F.leaky_relu(x) + x = self.conv_post(x) + log.debug(f"[Generator] conv_post 输出: shape={x.shape}, max={x.abs().max().item():.4f}") + x = torch.tanh(x) + log.debug(f"[Generator] tanh 
输出: shape={x.shape}, max={x.abs().max().item():.4f}") + + return x + + def remove_weight_norm(self): + for l in self.ups: + nn.utils.remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class ResBlock(nn.Module): + """残差块 (带权重归一化)""" + + def __init__(self, channels: int, kernel_size: int = 3, dilation: tuple = (1, 3, 5)): + super().__init__() + self.convs1 = nn.ModuleList([ + nn.utils.weight_norm( + nn.Conv1d(channels, channels, kernel_size, 1, + (kernel_size * d - d) // 2, dilation=d) + ) + for d in dilation + ]) + self.convs2 = nn.ModuleList([ + nn.utils.weight_norm( + nn.Conv1d(channels, channels, kernel_size, 1, + (kernel_size - 1) // 2) + ) + for _ in dilation + ]) + + def forward(self, x): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, 0.1) + xt = c1(xt) + xt = F.leaky_relu(xt, 0.1) + xt = c2(xt) + x = xt + x + return x + + def remove_weight_norm(self): + for l in self.convs1: + nn.utils.remove_weight_norm(l) + for l in self.convs2: + nn.utils.remove_weight_norm(l) + + +class SineGenerator(nn.Module): + """正弦波生成器 - NSF 的核心组件""" + + def __init__(self, sample_rate: int, harmonic_num: int = 0, + sine_amp: float = 0.1, noise_std: float = 0.003, + voiced_threshold: float = 10): + super().__init__() + self.sample_rate = sample_rate + self.harmonic_num = harmonic_num + self.sine_amp = sine_amp + self.noise_std = noise_std + self.voiced_threshold = voiced_threshold + self.dim = harmonic_num + 1 + + def forward(self, f0: torch.Tensor, upp: int): + """ + 生成正弦波激励信号 + + Args: + f0: 基频张量 [B, T] + upp: 上采样因子 + + Returns: + 正弦波信号 [B, T*upp, 1] + """ + with torch.no_grad(): + # 上采样 F0 + f0 = f0.unsqueeze(1) # [B, 1, T] + f0_up = F.interpolate(f0, scale_factor=upp, mode='nearest') + f0_up = f0_up.transpose(1, 2) # [B, T*upp, 1] + + # 生成正弦波 + rad = f0_up / self.sample_rate # 归一化频率 + rad_acc = torch.cumsum(rad, dim=1) % 1 # 累积相位 + sine_wave = torch.sin(2 * np.pi * rad_acc) * self.sine_amp + + # 静音区域(F0=0)使用噪声 + voiced_mask = 
(f0_up > self.voiced_threshold).float() + noise = torch.randn_like(sine_wave) * self.noise_std + sine_wave = sine_wave * voiced_mask + noise * (1 - voiced_mask) + + return sine_wave + + +class SourceModuleHnNSF(nn.Module): + """谐波加噪声源模块""" + + def __init__(self, sample_rate: int, harmonic_num: int = 0, + sine_amp: float = 0.1, noise_std: float = 0.003, + add_noise_std: float = 0.003): + super().__init__() + self.sine_generator = SineGenerator( + sample_rate, harmonic_num, sine_amp, noise_std + ) + self.l_linear = nn.Linear(harmonic_num + 1, 1) + self.l_tanh = nn.Tanh() + + def forward(self, f0: torch.Tensor, upp: int): + sine = self.sine_generator(f0, upp) # [B, T*upp, 1] + sine = self.l_tanh(self.l_linear(sine)) + noise = torch.randn_like(sine) * 0.003 + return sine, noise, None # 返回 3 个值以匹配接口 + + +class SynthesizerTrnMs768NSFsid(nn.Module): + """RVC v2 合成器 (768 维 HuBERT + NSF + SID)""" + + def __init__(self, spec_channels: int, segment_size: int, + inter_channels: int, hidden_channels: int, filter_channels: int, + n_heads: int, n_layers: int, kernel_size: int, p_dropout: float, + resblock: str, resblock_kernel_sizes: list, + resblock_dilation_sizes: list, upsample_rates: list, + upsample_initial_channel: int, upsample_kernel_sizes: list, + spk_embed_dim: int, gin_channels: int, sr: int): + super().__init__() + + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + self.spk_embed_dim = 
spk_embed_dim + self.sr = sr + + # 文本编码器 (使用 TextEncoder 替代 PosteriorEncoder) + self.enc_p = TextEncoder( + inter_channels, hidden_channels, filter_channels, + n_heads, n_layers, kernel_size, p_dropout, f0=True + ) + + # 解码器/生成器 (NSF-HiFiGAN,内部包含 m_source) + self.dec = Generator( + inter_channels, resblock_kernel_sizes, resblock_dilation_sizes, + upsample_rates, upsample_initial_channel, upsample_kernel_sizes, + gin_channels, sr=sr + ) + + # 流 + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + + # 说话人嵌入 + self.emb_g = nn.Embedding(spk_embed_dim, gin_channels) + + def forward(self, phone, phone_lengths, pitch, nsff0, sid, skip_head=0, return_length=0): + """前向传播""" + g = self.emb_g(sid).unsqueeze(-1) + + # TextEncoder 返回 mean 和 log-variance + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + + # 在编码器外部采样 + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + + # 正向 flow + z = self.flow(z_p, x_mask, g=g) + + # 生成音频 (传入 f0) + o = self.dec(z, nsff0, g=g) + + return o + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=1.0): + """推理""" + import logging + log = logging.getLogger(__name__) + + log.debug(f"[infer] 输入 phone: shape={phone.shape}, dtype={phone.dtype}") + log.debug(f"[infer] 输入 phone 统计: max={phone.abs().max().item():.4f}, mean={phone.abs().mean().item():.4f}") + log.debug(f"[infer] 输入 phone_lengths: {phone_lengths}") + log.debug(f"[infer] 输入 pitch: shape={pitch.shape}, max={pitch.max().item()}, min={pitch.min().item()}") + log.debug(f"[infer] 输入 nsff0: shape={nsff0.shape}, max={nsff0.max().item():.1f}, min={nsff0.min().item():.1f}") + log.debug(f"[infer] 输入 sid: {sid}") + + g = self.emb_g(sid).unsqueeze(-1) + log.debug(f"[infer] 说话人嵌入 g: shape={g.shape}, max={g.abs().max().item():.4f}") + + # TextEncoder 返回 mean 和 log-variance + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + log.debug(f"[infer] TextEncoder 输出:") + 
log.debug(f"[infer] m_p: shape={m_p.shape}, max={m_p.abs().max().item():.4f}, mean={m_p.abs().mean().item():.4f}") + log.debug(f"[infer] logs_p: shape={logs_p.shape}, max={logs_p.max().item():.4f}, min={logs_p.min().item():.4f}") + log.debug(f"[infer] x_mask: shape={x_mask.shape}, sum={x_mask.sum().item()}") + + # 在编码器外部采样 (使用较小的噪声系数以获得更稳定的输出) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + log.debug(f"[infer] 采样后 z_p: shape={z_p.shape}, max={z_p.abs().max().item():.4f}, mean={z_p.abs().mean().item():.4f}") + + # 反向 flow + z = self.flow(z_p, x_mask, g=g, reverse=True) + log.debug(f"[infer] Flow 输出 z: shape={z.shape}, max={z.abs().max().item():.4f}, mean={z.abs().mean().item():.4f}") + + # 生成音频 (传入 f0,Generator 内部会生成 NSF 激励信号) + o = self.dec(z * x_mask, nsff0, g=g) + log.debug(f"[infer] Generator 输出 o: shape={o.shape}, max={o.abs().max().item():.4f}, mean={o.abs().mean().item():.4f}") + + return o, x_mask diff --git a/requirements_hf.txt b/requirements_hf.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba008778da89d5d4787d05507b5598b4e12368d8 --- /dev/null +++ b/requirements_hf.txt @@ -0,0 +1,39 @@ +# RVC AI 翻唱依赖 (Hugging Face Space - CPU 精简版) +# 注意:此文件用于 HF Space 部署,同步到 Space 时需重命名为 requirements.txt +# 本地安装请使用 requirements.txt(包含完整 GPU 依赖) + +# PyTorch +torch>=2.0.0 +torchaudio>=2.0.0 + +# Gradio 界面 +gradio==3.50.2 + +# 音频处理 +librosa>=0.9.0 +soundfile>=0.12.0 +scipy>=1.10.0 +numpy>=1.23.0 +praat-parselmouth>=0.4.3 +torchcrepe>=0.0.20 + +# 向量检索 +faiss-cpu>=1.7.4 + +# 工具库 +tqdm>=4.65.0 +requests>=2.28.0 +python-dotenv>=1.0.0 +colorama>=0.4.6 + +# AI 翻唱功能(核心) +audio-separator +huggingface_hub>=0.19.0 +pedalboard>=0.7.0 +ffmpeg-python>=0.2.0 + +# 以下包在 HF Space 构建环境中编译失败,改为运行时按需安装: +# fairseq==0.12.2 (HuBERT 特征提取) +# demucs>=4.0.0 (人声分离备选) +# pyworld>=0.3.4 (F0 提取备选) +# av>=10.0.0 (音频解码备选) diff --git a/run.py b/run.py index c599e56826f8edc2cffd4c3d6753e59325357cc0..8528ec9cf8558c2bc03958d65dd7dc4a32771641 100644 
--- a/run.py +++ b/run.py @@ -1,133 +1,138 @@ -# -*- coding: utf-8 -*- -""" -RVC AI 翻唱 - 主入口 -""" -import os -import sys -import argparse -from pathlib import Path - -# 添加项目根目录到路径 -ROOT_DIR = Path(__file__).parent -sys.path.insert(0, str(ROOT_DIR)) - -from lib.logger import log - - -def check_environment(): - """检查运行环境""" - log.header("RVC AI 翻唱系统") - - # 检查 Python 版本 - py_version = sys.version_info - log.info(f"Python 版本: {py_version.major}.{py_version.minor}.{py_version.micro}") - - if py_version.major < 3 or (py_version.major == 3 and py_version.minor < 8): - log.warning("建议使用 Python 3.8 或更高版本") - - # 检查 PyTorch - try: - import torch - log.info(f"PyTorch 版本: {torch.__version__}") - - from lib.device import get_device_info, _is_rocm, _has_xpu, _has_directml, _has_mps - info = get_device_info() - log.info(f"可用加速后端: {', '.join(info['backends'])}") - - if torch.cuda.is_available(): - backend = "ROCm" if _is_rocm() else "CUDA" - log.info(f"{backend} 版本: {torch.version.hip if _is_rocm() else torch.version.cuda}") - log.info(f"GPU: {torch.cuda.get_device_name(0)}") - elif _has_xpu(): - log.info(f"Intel GPU: {torch.xpu.get_device_name(0)}") - elif _has_directml(): - import torch_directml - log.info(f"DirectML 设备: {torch_directml.device_name(0)}") - elif _has_mps(): - log.info("Apple MPS 加速可用") - else: - log.warning("未检测到 GPU 加速,将使用 CPU") - except ImportError: - log.error("未安装 PyTorch") - return False - - return True - - -def check_models(): - """检查必需模型""" - from tools.download_models import check_model, REQUIRED_MODELS - - missing = [] - for name in REQUIRED_MODELS: - if not check_model(name): - missing.append(name) - - if missing: - log.warning(f"缺少必需模型: {', '.join(missing)}") - log.info("正在下载...") - from tools.download_models import download_required_models - if not download_required_models(): - log.error("模型下载失败,请检查网络连接") - return False - - return True - - -def main(): - """主函数""" - parser = argparse.ArgumentParser(description="RVC AI 翻唱系统") - parser.add_argument( - 
"--host", - type=str, - default="127.0.0.1", - help="服务器地址 (默认: 127.0.0.1)" - ) - parser.add_argument( - "--port", - type=int, - default=7860, - help="服务器端口 (默认: 7860)" - ) - parser.add_argument( - "--share", - action="store_true", - help="创建公共链接" - ) - parser.add_argument( - "--skip-check", - action="store_true", - help="跳过环境检查" - ) - parser.add_argument( - "--download-models", - action="store_true", - help="仅下载模型" - ) - - args = parser.parse_args() - - # 仅下载模型 - if args.download_models: - from tools.download_models import download_all_models - download_all_models() - return - - # 环境检查 - if not args.skip_check: - if not check_environment(): - sys.exit(1) - - # 模型检查 - if not check_models(): - log.info("提示: 可以使用 --skip-check 跳过检查") - sys.exit(1) - - # 启动界面 - log.info(f"启动 Gradio 界面: http://{args.host}:{args.port}") - from ui.app import launch - launch(host=args.host, port=args.port, share=args.share) - - -if __name__ == "__main__": - main() +# -*- coding: utf-8 -*- +""" +RVC AI 翻唱 - 主入口 +""" +import os +import sys +import argparse +from pathlib import Path + +# 添加项目根目录到路径 +ROOT_DIR = Path(__file__).parent +sys.path.insert(0, str(ROOT_DIR)) + +from lib.ffmpeg_runtime import configure_ffmpeg_runtime +from lib.logger import log +from lib.runtime_build import get_runtime_build_label + +configure_ffmpeg_runtime() + + +def check_environment(): + """检查运行环境""" + log.header("RVC AI 翻唱系统") + log.info(get_runtime_build_label()) + + # 检查 Python 版本 + py_version = sys.version_info + log.info(f"Python 版本: {py_version.major}.{py_version.minor}.{py_version.micro}") + + if py_version.major < 3 or (py_version.major == 3 and py_version.minor < 8): + log.warning("建议使用 Python 3.8 或更高版本") + + # 检查 PyTorch + try: + import torch + log.info(f"PyTorch 版本: {torch.__version__}") + + from lib.device import get_device_info, _is_rocm, _has_xpu, _has_directml, _has_mps + info = get_device_info() + log.info(f"可用加速后端: {', '.join(info['backends'])}") + + if torch.cuda.is_available(): + backend = 
"ROCm" if _is_rocm() else "CUDA" + log.info(f"{backend} 版本: {torch.version.hip if _is_rocm() else torch.version.cuda}") + log.info(f"GPU: {torch.cuda.get_device_name(0)}") + elif _has_xpu(): + log.info(f"Intel GPU: {torch.xpu.get_device_name(0)}") + elif _has_directml(): + import torch_directml + log.info(f"DirectML 设备: {torch_directml.device_name(0)}") + elif _has_mps(): + log.info("Apple MPS 加速可用") + else: + log.warning("未检测到 GPU 加速,将使用 CPU") + except ImportError: + log.error("未安装 PyTorch") + return False + + return True + + +def check_models(): + """检查必需模型""" + from tools.download_models import check_model, REQUIRED_MODELS + + missing = [] + for name in REQUIRED_MODELS: + if not check_model(name): + missing.append(name) + + if missing: + log.warning(f"缺少必需模型: {', '.join(missing)}") + log.info("正在下载...") + from tools.download_models import download_required_models + if not download_required_models(): + log.error("模型下载失败,请检查网络连接") + return False + + return True + + +def main(): + """主函数""" + parser = argparse.ArgumentParser(description="RVC AI 翻唱系统") + parser.add_argument( + "--host", + type=str, + default="127.0.0.1", + help="服务器地址 (默认: 127.0.0.1)" + ) + parser.add_argument( + "--port", + type=int, + default=7860, + help="服务器端口 (默认: 7860)" + ) + parser.add_argument( + "--share", + action="store_true", + help="创建公共链接" + ) + parser.add_argument( + "--skip-check", + action="store_true", + help="跳过环境检查" + ) + parser.add_argument( + "--download-models", + action="store_true", + help="仅下载模型" + ) + + args = parser.parse_args() + + # 仅下载模型 + if args.download_models: + from tools.download_models import download_all_models + download_all_models() + return + + # 环境检查 + if not args.skip_check: + if not check_environment(): + sys.exit(1) + + # 模型检查 + if not check_models(): + log.info("提示: 可以使用 --skip-check 跳过检查") + sys.exit(1) + + # 启动界面 + log.info(f"启动 Gradio 界面: http://{args.host}:{args.port}") + from ui.app import launch + launch(host=args.host, port=args.port, 
share=args.share) + + +if __name__ == "__main__": + main() diff --git a/tests/run_mode_matrix.py b/tests/run_mode_matrix.py new file mode 100644 index 0000000000000000000000000000000000000000..44ce8cc9254105a04cb2448c850a2f0e9a452809 --- /dev/null +++ b/tests/run_mode_matrix.py @@ -0,0 +1,192 @@ +# -*- coding: utf-8 -*- +"""Run real AI cover processing modes and write a JSON report.""" + +from __future__ import annotations + +import argparse +import json +import platform +import sys +import time +import traceback +from pathlib import Path +from typing import Any, Dict, Iterable, Optional + + +REPO_ROOT = Path(__file__).resolve().parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + + +CASES: tuple[dict[str, Any], ...] = ( + dict(name="current_roformer_auto_source_auto", separator="roformer", vc_preprocess_mode="auto", source_constraint_mode="auto", vc_pipeline_mode="current", karaoke_separation=False, singing_repair=False), + dict(name="current_roformer_uvr_deecho_source_on", separator="roformer", vc_preprocess_mode="uvr_deecho", source_constraint_mode="on", vc_pipeline_mode="current", karaoke_separation=False, singing_repair=False), + dict(name="current_roformer_karaoke_source_off", separator="roformer", vc_preprocess_mode="auto", source_constraint_mode="off", vc_pipeline_mode="current", karaoke_separation=True, singing_repair=False), + dict(name="current_demucs_auto_source_auto", separator="demucs", vc_preprocess_mode="auto", source_constraint_mode="auto", vc_pipeline_mode="current", karaoke_separation=False, singing_repair=False), + dict(name="current_uvr5_auto_source_auto", separator="uvr5", vc_preprocess_mode="auto", source_constraint_mode="auto", vc_pipeline_mode="current", karaoke_separation=False, singing_repair=False), + dict(name="official_uvr5_one_to_one", separator="uvr5", vc_preprocess_mode="auto", source_constraint_mode="auto", vc_pipeline_mode="official", karaoke_separation=False, singing_repair=False), + 
dict(name="official_uvr5_singing_repair", separator="uvr5", vc_preprocess_mode="auto", source_constraint_mode="auto", vc_pipeline_mode="official", karaoke_separation=False, singing_repair=True), +) + + +def _require_file(path: Path, label: str) -> Path: + resolved = path.expanduser().resolve() + if not resolved.exists(): + raise FileNotFoundError(f"{label} does not exist: {resolved}") + if resolved.stat().st_size <= 0: + raise RuntimeError(f"{label} is empty: {resolved}") + return resolved + + +def _default_index(model_path: Path) -> Optional[Path]: + direct = model_path.with_suffix(".index") + if direct.exists(): + return direct + indexes = sorted(model_path.parent.glob("*.index")) + return indexes[0] if len(indexes) == 1 else None + + +def _device(require_cuda: bool) -> str: + import torch + + if torch.cuda.is_available(): + return "cuda" + if require_cuda: + raise RuntimeError("CUDA is required, but torch.cuda.is_available() is False.") + return "cpu" + + +def _validate_outputs(result: Dict[str, str], keys: Iterable[str]) -> Dict[str, Dict[str, Any]]: + files: Dict[str, Dict[str, Any]] = {} + for key in keys: + value = result.get(key) + if not value: + raise RuntimeError(f"Pipeline result missing key: {key}") + path = Path(value) + if not path.exists(): + raise FileNotFoundError(f"Pipeline output for {key} does not exist: {path}") + size = path.stat().st_size + if size <= 0: + raise RuntimeError(f"Pipeline output for {key} is empty: {path}") + files[key] = {"path": str(path), "bytes": size} + return files + + +def _run_case(case: Dict[str, Any], input_audio: Path, model_path: Path, index_path: Optional[Path], output_root: Path, device: str) -> Dict[str, Any]: + from infer.cover_pipeline import CoverPipeline + + started = time.time() + case_output = output_root / case["name"] + case_output.mkdir(parents=True, exist_ok=True) + pipeline = CoverPipeline(device=device) + try: + result = pipeline.process( + input_audio=str(input_audio), + model_path=str(model_path), + 
index_path=str(index_path) if index_path else None, + pitch_shift=0, + index_ratio=0.5, + filter_radius=3, + rms_mix_rate=0.0, + protect=0.33, + speaker_id=0, + f0_method="rmvpe", + demucs_model="htdemucs_ft", + demucs_shifts=1, + demucs_overlap=0.25, + demucs_split=True, + roformer_model="ensemble:vocal_rvc", + separator=case["separator"], + uvr5_model="HP2_all_vocals", + uvr5_agg=10, + uvr5_format="wav", + use_official=True, + hubert_layer=12, + vocals_volume=1.0, + accompaniment_volume=1.0, + reverb_amount=0.0, + backing_mix=0.0, + karaoke_separation=bool(case["karaoke_separation"]), + karaoke_model="ensemble:karaoke", + karaoke_merge_backing_into_accompaniment=True, + vc_preprocess_mode=case["vc_preprocess_mode"], + source_constraint_mode=case["source_constraint_mode"], + vc_pipeline_mode=case["vc_pipeline_mode"], + singing_repair=bool(case["singing_repair"]), + output_dir=str(case_output), + model_display_name="matrix", + ) + keys = ["cover", "vocals", "converted_vocals", "accompaniment", "all_files_dir"] + if case["karaoke_separation"]: + keys.extend(["lead_vocals", "backing_vocals"]) + return { + "name": case["name"], + "status": "pass", + "seconds": round(time.time() - started, 3), + "parameters": case, + "result": result, + "files": _validate_outputs(result, keys), + } + except Exception as exc: + return { + "name": case["name"], + "status": "fail", + "seconds": round(time.time() - started, 3), + "parameters": case, + "error": str(exc), + "traceback": traceback.format_exc(), + } + finally: + if pipeline.separator is not None: + pipeline.separator.unload_model() + if pipeline.karaoke_separator is not None: + pipeline.karaoke_separator.unload_model() + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--input", required=True) + parser.add_argument("--model-path", default=str(REPO_ROOT / "assets" / "weights" / "characters" / "rin" / "Rin.pth")) + parser.add_argument("--index-path") + parser.add_argument("--output-dir", 
default=str(REPO_ROOT / "outputs" / "mode_matrix")) + parser.add_argument("--require-cuda", action="store_true") + parser.add_argument("--case", action="append", choices=[case["name"] for case in CASES]) + args = parser.parse_args() + + input_audio = _require_file(Path(args.input), "Input audio") + model_path = _require_file(Path(args.model_path), "RVC model") + index_path = _require_file(Path(args.index_path), "RVC index") if args.index_path else _default_index(model_path) + device = _device(args.require_cuda) + stamp = time.strftime("%Y%m%d_%H%M%S") + output_root = Path(args.output_dir).expanduser().resolve() / f"{platform.system().lower()}_{stamp}" + output_root.mkdir(parents=True, exist_ok=True) + selected = set(args.case or []) + cases = [case for case in CASES if not selected or case["name"] in selected] + report = { + "platform": {"system": platform.system(), "release": platform.release(), "python": platform.python_version()}, + "device": device, + "input_audio": str(input_audio), + "model_path": str(model_path), + "index_path": str(index_path) if index_path else None, + "output_root": str(output_root), + "cases": [], + } + report_path = output_root / "mode_matrix_results.json" + for case in cases: + print(f"\n=== {case['name']} ===", flush=True) + record = _run_case(case, input_audio, model_path, index_path, output_root, device) + print(f"{case['name']}: {record['status']} ({record['seconds']}s)", flush=True) + if record["status"] == "fail": + print(record["error"], flush=True) + report["cases"].append(record) + report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8") + failed = [case["name"] for case in report["cases"] if case["status"] != "pass"] + report["summary"] = {"total": len(cases), "passed": len(cases) - len(failed), "failed": len(failed), "failed_cases": failed} + report_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"\nReport: {report_path}", flush=True) + 
print(json.dumps(report["summary"], ensure_ascii=False, indent=2), flush=True) + return 1 if failed else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_audio_cleanup.py b/tests/test_audio_cleanup.py new file mode 100644 index 0000000000000000000000000000000000000000..6fea9120a0ade738900cb3911743d4d849a68a48 --- /dev/null +++ b/tests/test_audio_cleanup.py @@ -0,0 +1,109 @@ +import tempfile +import unittest +from pathlib import Path + +import numpy as np +import soundfile as sf + +from infer.cover_pipeline import CoverPipeline + + +def _rms(audio: np.ndarray) -> float: + return float(np.sqrt(np.mean(np.square(audio)) + 1e-12)) + + +def _preemphasis_rms(audio: np.ndarray) -> float: + audio = np.asarray(audio, dtype=np.float32).reshape(-1) + if audio.size == 0: + return 0.0 + residual = np.empty_like(audio) + residual[0] = audio[0] + residual[1:] = audio[1:] - 0.97 * audio[:-1] + return _rms(residual) + + +def _fade(length: int) -> np.ndarray: + env = np.ones(length, dtype=np.float32) + ramp = max(1, min(length // 4, 512)) + env[:ramp] = np.linspace(0.0, 1.0, ramp, dtype=np.float32) + env[-ramp:] = np.linspace(1.0, 0.0, ramp, dtype=np.float32) + return env + + +class EchoTailGateTests(unittest.TestCase): + def test_loud_echo_tail_is_suppressed_when_dereverb_removes_it(self): + sr = 48000 + t = np.arange(int(2.4 * sr), dtype=np.float32) / sr + original = np.zeros_like(t) + dereverbed = np.zeros_like(t) + + direct_mask = (t >= 0.20) & (t < 0.72) + echo_mask = (t >= 1.16) & (t < 1.68) + direct = 0.24 * np.sin(2.0 * np.pi * 330.0 * t[direct_mask]) + echo = 0.12 * np.sin(2.0 * np.pi * 330.0 * (t[echo_mask] - 0.96)) + direct *= _fade(direct.size) + echo *= _fade(echo.size) + original[direct_mask] = direct + original[echo_mask] = echo + dereverbed[direct_mask] = direct + + gain, gated_frames, total_frames = CoverPipeline._compute_echo_tail_sample_gain( + original=original, + dereverbed=dereverbed, + sr=sr, + ) + + direct_gain = 
gain[int(0.34 * sr): int(0.58 * sr)] + echo_gain = gain[int(1.28 * sr): int(1.54 * sr)] + self.assertGreater(total_frames, 0) + self.assertGreater(gated_frames, 0) + self.assertGreater(float(np.percentile(direct_gain, 5)), 0.85) + self.assertLess( + float(np.mean(echo_gain)), + 0.55, + "loud echo tails should be reduced even when they are not quiet", + ) + + +class BreathCleanupTests(unittest.TestCase): + def test_low_energy_recovery_breath_loses_tonal_hf_without_body_loss(self): + sr = 48000 + t = np.arange(int(2.0 * sr), dtype=np.float32) / sr + source = np.zeros_like(t) + converted = np.zeros_like(t) + + body = (t >= 0.25) & (t < 0.90) + breath = (t >= 1.15) & (t < 1.45) + source[body] = 0.15 * np.sin(2.0 * np.pi * 260.0 * t[body]) + converted[body] = source[body] + + breath_body = 0.012 * np.sin(2.0 * np.pi * 620.0 * t[breath]) + breath_body *= _fade(breath_body.size) + source[breath] = breath_body + converted[breath] = breath_body + 0.009 * np.sin(2.0 * np.pi * 5200.0 * t[breath]) + + with tempfile.TemporaryDirectory() as tmp_dir: + tmp = Path(tmp_dir) + source_path = tmp / "source.wav" + converted_path = tmp / "converted.wav" + sf.write(source_path, source, sr) + sf.write(converted_path, converted, sr) + + pipeline = CoverPipeline(device="cpu") + before, _ = sf.read(converted_path) + pipeline._apply_source_breath_cleanup(str(source_path), str(converted_path)) + pipeline._apply_source_transition_cleanup(str(source_path), str(converted_path)) + after, out_sr = sf.read(converted_path) + + self.assertEqual(out_sr, sr) + before_breath = before[int(1.20 * sr): int(1.40 * sr)] + after_breath = after[int(1.20 * sr): int(1.40 * sr)] + before_body = before[int(0.36 * sr): int(0.76 * sr)] + after_body = after[int(0.36 * sr): int(0.76 * sr)] + + self.assertLess(_preemphasis_rms(after_breath), _preemphasis_rms(before_breath) * 0.80) + self.assertGreater(_rms(after_body), _rms(before_body) * 0.96) + + +if __name__ == "__main__": + unittest.main() diff --git 
a/tests/test_audio_metrics.py b/tests/test_audio_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..7fee6449bc47ec6cc1c8d3014b95f18c8637bf18 --- /dev/null +++ b/tests/test_audio_metrics.py @@ -0,0 +1,49 @@ +import sys +import unittest +from pathlib import Path + +import numpy as np + + +REPO_ROOT = Path(__file__).resolve().parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + + +class ReferenceAudioMetricTests(unittest.TestCase): + def test_si_sdr_is_scale_invariant_but_sdr_is_not(self): + from lib.audio_metrics import signal_distortion_ratio, scale_invariant_signal_distortion_ratio + + sr = 16000 + t = np.arange(sr, dtype=np.float32) / sr + reference = np.sin(2 * np.pi * 220 * t).astype(np.float32) + scaled = 2.0 * reference + + self.assertGreater(scale_invariant_signal_distortion_ratio(reference, scaled), 100.0) + self.assertLess(signal_distortion_ratio(reference, scaled), 0.1) + + def test_reference_stem_metrics_rank_clean_estimate_above_noisy_estimate(self): + from lib.audio_metrics import evaluate_reference_stems + + sr = 16000 + t = np.arange(sr, dtype=np.float32) / sr + lead = 0.18 * np.sin(2 * np.pi * 220 * t).astype(np.float32) + backing = 0.04 * np.sin(2 * np.pi * 330 * t + 0.4).astype(np.float32) + noise = 0.03 * np.sin(2 * np.pi * 910 * t).astype(np.float32) + + clean = evaluate_reference_stems( + references={"lead": lead, "backing": backing}, + estimates={"lead": lead, "backing": backing}, + ) + noisy = evaluate_reference_stems( + references={"lead": lead, "backing": backing}, + estimates={"lead": lead + noise, "backing": backing - noise}, + ) + + self.assertGreater(clean["mean_si_sdr"], noisy["mean_si_sdr"] + 20.0) + self.assertGreater(clean["stems"]["lead"]["si_sdr"], noisy["stems"]["lead"]["si_sdr"]) + self.assertGreater(clean["stems"]["backing"]["sdr"], noisy["stems"]["backing"]["sdr"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_colab_notebook.py 
b/tests/test_colab_notebook.py new file mode 100644 index 0000000000000000000000000000000000000000..e9f2e9e4210135918b10fd8a9c8f71b1f223d82e --- /dev/null +++ b/tests/test_colab_notebook.py @@ -0,0 +1,59 @@ +import json +import unittest +from pathlib import Path + + +class ColabNotebookTests(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.path = Path("AI_RVC_Colab.ipynb") + cls.notebook = json.loads(cls.path.read_text(encoding="utf-8")) + cls.source = "\n".join( + "".join(cell.get("source", [])) + for cell in cls.notebook.get("cells", []) + ) + + def test_notebook_json_is_valid_and_has_gpu_metadata(self): + self.assertEqual(self.notebook["nbformat"], 4) + self.assertEqual(self.notebook["metadata"].get("accelerator"), "GPU") + self.assertEqual(self.notebook["metadata"].get("colab", {}).get("gpuType"), "T4") + + def test_colab_uses_python_310_environment_for_fairseq(self): + self.assertIn("uv python install 3.10", self.source) + self.assertIn("uv venv --seed --python 3.10 venv310", self.source) + self.assertIn("$PY -m pip --version", self.source) + self.assertIn("uv run --python 3.10 python install.py --no-run", self.source) + self.assertIn("assert sys.version_info[:2] == (3, 10)", self.source) + + def test_colab_preinstalls_gpu_torch_before_project_dependencies(self): + self.assertIn("--index-url https://download.pytorch.org/whl/cu126", self.source) + self.assertIn("pip install torch torchaudio", self.source) + self.assertIn("raise RuntimeError('GPU PyTorch install completed but CUDA is not available.')", self.source) + self.assertNotIn("https://download.pytorch.org/whl/cpu", self.source) + + def test_colab_checks_current_processing_model_defaults(self): + self.assertIn("ROFORMER_DEFAULT_MODEL == 'ensemble:vocal_rvc'", self.source) + self.assertIn("KARAOKE_DEFAULT_MODEL == 'ensemble:karaoke'", self.source) + self.assertIn( + "ROFORMER_DEREVERB_DEFAULT_MODEL == 'dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt'", + self.source, + ) + + def 
test_colab_no_longer_documents_legacy_single_roformer_as_default(self): + self.assertNotIn("MVSEP Vocals SDR 11.01", self.source) + self.assertNotIn("vocals_mel_band_roformer", self.source) + + def test_colab_launch_does_not_skip_checks(self): + self.assertIn("$PY run.py --host 0.0.0.0 --port 7860 --share", self.source) + self.assertNotIn("--skip-check", self.source) + + def test_colab_includes_full_processing_mode_matrix(self): + self.assertIn("$PY tools/run_mode_matrix.py", self.source) + self.assertIn("--output-dir /content/AI-RVC/outputs/mode_matrix_colab", self.source) + self.assertIn("--require-cuda", self.source) + self.assertIn("download_character_model('rin')", self.source) + self.assertIn("Missing test audio", self.source) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_ffmpeg_runtime.py b/tests/test_ffmpeg_runtime.py new file mode 100644 index 0000000000000000000000000000000000000000..458b2c3969ac5d30ddff456731d425023b287745 --- /dev/null +++ b/tests/test_ffmpeg_runtime.py @@ -0,0 +1,123 @@ +import importlib.util +import os +import subprocess +import sys +import tempfile +import unittest +from pathlib import Path +from unittest import mock + + +REPO_ROOT = Path(__file__).resolve().parents[1] + + +def _load_ffmpeg_runtime_module(): + module_path = REPO_ROOT / "lib" / "ffmpeg_runtime.py" + spec = importlib.util.spec_from_file_location("ffmpeg_runtime", module_path) + if spec is None or spec.loader is None: + raise ImportError(f"Unable to load module from {module_path}") + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +class FfmpegRuntimeTests(unittest.TestCase): + def test_bundled_ffmpeg_dir_is_preferred(self): + module = _load_ffmpeg_runtime_module() + + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + bin_dir = root / "tools" / "ffmpeg" / "bin" + bin_dir.mkdir(parents=True, exist_ok=True) + (bin_dir / 
"ffmpeg.exe").write_bytes(b"exe") + + resolved = module.get_ffmpeg_bin_dir(root_dir=root) + + self.assertEqual(resolved, bin_dir) + + def test_runtime_setup_prepends_bundled_ffmpeg_to_path(self): + module = _load_ffmpeg_runtime_module() + + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + bin_dir = root / "tools" / "ffmpeg" / "bin" + bin_dir.mkdir(parents=True, exist_ok=True) + ffmpeg = bin_dir / "ffmpeg.exe" + ffmpeg.write_bytes(b"exe") + ffprobe = bin_dir / "ffprobe.exe" + ffprobe.write_bytes(b"exe") + + env = {"PATH": r"C:\Windows\System32"} + with mock.patch.object( + module, + "_check_executable_runs", + return_value=None, + ): + module.configure_ffmpeg_runtime(root_dir=root, env=env) + + self.assertTrue(env["PATH"].startswith(str(bin_dir))) + self.assertEqual(env["FFMPEG_BINARY"], str(ffmpeg)) + self.assertEqual(env["FFPROBE_BINARY"], str(ffprobe)) + + def test_runtime_rejects_broken_bundled_ffprobe(self): + module = _load_ffmpeg_runtime_module() + + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + bin_dir = root / "tools" / "ffmpeg" / "bin" + bin_dir.mkdir(parents=True, exist_ok=True) + (bin_dir / "ffmpeg.exe").write_bytes(b"exe") + ffprobe = bin_dir / "ffprobe.exe" + ffprobe.write_bytes(b"broken") + + def fake_run(command, **kwargs): + executable = Path(command[0]).name.lower() + if executable == "ffmpeg.exe": + return subprocess.CompletedProcess(command, 0, stdout="ffmpeg version", stderr="") + return subprocess.CompletedProcess( + command, + 1, + stdout="Cannot find file at '..\\lib\\ffmpeg\\tools\\ffmpeg\\bin\\ffprobe.exe'", + stderr="", + ) + + with mock.patch.object(module.subprocess, "run", side_effect=fake_run): + with self.assertRaises(RuntimeError) as context: + module.configure_ffmpeg_runtime(root_dir=root, env={"PATH": ""}) + + message = str(context.exception) + self.assertIn(str(ffprobe), message) + self.assertIn("Cannot find file", message) + + def test_uvr5_module_does_not_shell_out_to_ffprobe(self): + source = 
(REPO_ROOT / "infer" / "modules" / "uvr5" / "modules.py").read_text(encoding="utf-8") + + self.assertNotIn('cmd="ffprobe"', source) + + def test_run_configures_bundled_ffmpeg_runtime(self): + source = (REPO_ROOT / "run.py").read_text(encoding="utf-8") + + self.assertIn("configure_ffmpeg_runtime()", source) + + def test_workflow_packages_bundled_ffmpeg_directory(self): + workflow = (REPO_ROOT / ".github" / "workflows" / "build-executables.yml").read_text(encoding="utf-8") + + self.assertIn("tools/ffmpeg", workflow) + + def test_workflow_resolves_real_chocolatey_ffprobe_and_validates_bundle(self): + workflow = (REPO_ROOT / ".github" / "workflows" / "build-executables.yml").read_text(encoding="utf-8") + + self.assertIn("ChocolateyInstall", workflow) + self.assertIn("ffprobe_dest", workflow) + self.assertIn("subprocess.run([str(ffprobe_dest), \"-version\"]", workflow) + + def test_release_upload_uses_timeout_and_retry(self): + workflow = (REPO_ROOT / ".github" / "workflows" / "build-executables.yml").read_text(encoding="utf-8") + + self.assertIn("upload_asset_with_retry", workflow) + self.assertIn("timeout 30m gh release upload", workflow) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_install_requirements.py b/tests/test_install_requirements.py new file mode 100644 index 0000000000000000000000000000000000000000..40b8dc69f0b0599d7d6f38f474f7f4b888ad96b2 --- /dev/null +++ b/tests/test_install_requirements.py @@ -0,0 +1,109 @@ +import contextlib +import io +import unittest +from unittest import mock + +import install + + +class InstallRequirementTests(unittest.TestCase): + def test_audio_separator_below_required_version_is_marked_for_install(self): + with mock.patch("install.check_package", return_value=True), mock.patch( + "install.get_installed_version", + return_value="0.41.1", + create=True, + ), contextlib.redirect_stdout(io.StringIO()): + missing = install.check_all("python") + + self.assertIn( + "audio-separator", + {info["pip"] for info 
in missing}, + ) + + def test_audio_separator_at_required_version_is_accepted(self): + with mock.patch("install.check_package", return_value=True), mock.patch( + "install.get_installed_version", + return_value="0.44.1", + create=True, + ), contextlib.redirect_stdout(io.StringIO()): + missing = install.check_all("python") + + self.assertNotIn( + "audio-separator", + {info["pip"] for info in missing}, + ) + + def test_numpy_2_is_marked_for_downgrade(self): + def version_for_package(_venv_py, distribution_name): + if distribution_name == "numpy": + return "2.2.6" + if distribution_name == "audio-separator": + return "0.44.1" + return None + + with mock.patch("install.check_package", return_value=True), mock.patch( + "install.get_installed_version", + side_effect=version_for_package, + create=True, + ), contextlib.redirect_stdout(io.StringIO()): + missing = install.check_all("python") + + self.assertIn("numpy<2,>=1.23.0", {info["pip"] for info in missing}) + + def test_audio_separator_install_restores_numpy_1_x(self): + audio_separator_info = install.PACKAGES["audio_separator"] + calls = [] + + def fake_pip_install(_venv_py, package, **kwargs): + calls.append((package, kwargs)) + return True + + with mock.patch("install.check_all", return_value=[audio_separator_info]), mock.patch( + "install.detect_cuda_version", + return_value=None, + ), mock.patch("install.pip_install", side_effect=fake_pip_install), contextlib.redirect_stdout( + io.StringIO() + ): + ok = install.install_all("python", gpu=False) + + self.assertTrue(ok) + self.assertEqual(calls[0][0], "audio-separator") + self.assertEqual(calls[0][1]["version_spec"], ">=0.44.1") + self.assertIn(("numpy<2,>=1.23.0", {}), calls) + + def test_cuda_13_uses_latest_supported_pytorch_wheel(self): + def fake_run(cmd, **_kwargs): + if cmd == ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"]: + return mock.Mock(returncode=0, stdout="575.51.03\n") + if cmd == ["nvidia-smi"]: + return mock.Mock(returncode=0, 
stdout="CUDA Version: 13.0\n") + raise AssertionError(f"Unexpected command: {cmd}") + + with mock.patch("install.subprocess.run", side_effect=fake_run): + self.assertEqual( + install.detect_cuda_version(), + "https://download.pytorch.org/whl/cu126", + ) + + def test_gpu_install_does_not_fall_back_to_cpu_torch(self): + calls = [] + + def fake_pip_install(_venv_py, package, **kwargs): + calls.append((package, kwargs)) + return True + + with mock.patch( + "install.check_all", + return_value=[install.PACKAGES["torch"], install.PACKAGES["torchaudio"]], + ), mock.patch("install.detect_cuda_version", return_value=None), mock.patch( + "install.pip_install", + side_effect=fake_pip_install, + ), contextlib.redirect_stdout(io.StringIO()): + ok = install.install_all("python", gpu=True) + + self.assertFalse(ok) + self.assertEqual(calls, []) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_mixer.py b/tests/test_mixer.py new file mode 100644 index 0000000000000000000000000000000000000000..f19707989c57205397adbe9e8614f18bf8328273 --- /dev/null +++ b/tests/test_mixer.py @@ -0,0 +1,58 @@ +import tempfile +import unittest +from pathlib import Path + +import numpy as np +import soundfile as sf + +from lib.mixer import mix_vocals_and_accompaniment + + +def _rms(audio: np.ndarray) -> float: + return float(np.sqrt(np.mean(np.square(audio)) + 1e-12)) + + +class MixStabilityTests(unittest.TestCase): + def test_default_mix_does_not_duck_accompaniment_when_vocal_enters(self): + sr = 48000 + duration = 3.0 + t = np.arange(int(sr * duration), dtype=np.float32) / sr + accompaniment = 0.07 * np.sin(2.0 * np.pi * 220.0 * t) + vocals = np.zeros_like(accompaniment) + active = (t >= 1.10) & (t < 2.00) + vocals[active] = 0.045 * np.sin(2.0 * np.pi * 440.0 * t[active]) + + with tempfile.TemporaryDirectory() as tmp_dir: + tmp = Path(tmp_dir) + vocals_path = tmp / "vocals.wav" + accompaniment_path = tmp / "accompaniment.wav" + output_path = tmp / "mix.wav" + 
sf.write(vocals_path, vocals, sr) + sf.write(accompaniment_path, accompaniment, sr) + + mix_vocals_and_accompaniment( + str(vocals_path), + str(accompaniment_path), + str(output_path), + target_sr=sr, + ) + + mixed, out_sr = sf.read(output_path) + + self.assertEqual(out_sr, sr) + if mixed.ndim > 1: + mixed = mixed.mean(axis=1) + residual = mixed.astype(np.float32) - vocals.astype(np.float32) + before = residual[int(0.40 * sr): int(0.90 * sr)] + during = residual[int(1.30 * sr): int(1.80 * sr)] + drop_db = 20.0 * np.log10((_rms(during) + 1e-12) / (_rms(before) + 1e-12)) + + self.assertGreater( + drop_db, + -0.25, + f"default mix should keep accompaniment steady, got {drop_db:.2f} dB drop", + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_model_defaults.py b/tests/test_model_defaults.py new file mode 100644 index 0000000000000000000000000000000000000000..4397ad2cabeab44de0d088421bfacaf178bf3045 --- /dev/null +++ b/tests/test_model_defaults.py @@ -0,0 +1,171 @@ +import sys +import tempfile +import unittest +from pathlib import Path + +import numpy as np +import soundfile as sf + + +REPO_ROOT = Path(__file__).resolve().parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + + +class ModelDefaultTests(unittest.TestCase): + def test_roformer_default_uses_public_audio_separator_sota_model(self): + from infer import separator + + self.assertEqual( + separator.ROFORMER_DEFAULT_MODEL, + "ensemble:vocal_rvc", + ) + self.assertEqual( + separator.ROFORMER_SOTA_MODELS, + [ + "melband_roformer_big_beta6x.ckpt", + "mel_band_roformer_vocals_fv4_gabox.ckpt", + ], + ) + self.assertIn( + "vocals_mel_band_roformer.ckpt", + separator.ROFORMER_LEGACY_SINGLE_MODEL, + ) + + def test_karaoke_default_uses_public_sota_ensemble(self): + from infer import separator + + self.assertEqual( + separator.KARAOKE_DEFAULT_MODEL, + "ensemble:karaoke", + ) + self.assertEqual( + separator.KARAOKE_SOTA_MODEL, + "ensemble:karaoke", + ) + 
self.assertEqual( + separator.KARAOKE_SOTA_MODELS, + [ + "mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt", + "mel_band_roformer_karaoke_gabox_v2.ckpt", + "mel_band_roformer_karaoke_becruily.ckpt", + ], + ) + self.assertEqual( + separator.KARAOKE_LEGACY_SINGLE_MODEL, + "mel_band_roformer_karaoke_gabox.ckpt", + ) + + def test_deecho_default_uses_public_roformer_dereverb_model(self): + from infer import separator + + self.assertEqual( + separator.ROFORMER_DEREVERB_DEFAULT_MODEL, + "dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt", + ) + + def test_strict_sota_defaults_do_not_expose_model_fallback_lists(self): + from infer import separator + + self.assertFalse(hasattr(separator, "ROFORMER_FALLBACK_MODELS")) + self.assertFalse(hasattr(separator, "KARAOKE_FALLBACK_MODELS")) + self.assertFalse(hasattr(separator, "ROFORMER_DEREVERB_FALLBACK_MODELS")) + + +class KaraokeCandidateScoringTests(unittest.TestCase): + def test_karaoke_candidate_score_rewards_reconstruction_and_low_correlation(self): + from tools.evaluate_karaoke_models import score_karaoke_stems + + sr = 16000 + t = np.arange(sr, dtype=np.float32) / sr + lead_good = 0.18 * np.sin(2 * np.pi * 220 * t) + backing_good = 0.05 * np.sin(2 * np.pi * 330 * t + 0.4) + input_vocals = lead_good + backing_good + + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_path = Path(tmp_dir) + input_path = tmp_path / "input.wav" + lead_good_path = tmp_path / "lead_good.wav" + backing_good_path = tmp_path / "backing_good.wav" + lead_bad_path = tmp_path / "lead_bad.wav" + backing_bad_path = tmp_path / "backing_bad.wav" + + sf.write(input_path, input_vocals, sr) + sf.write(lead_good_path, lead_good, sr) + sf.write(backing_good_path, backing_good, sr) + sf.write(lead_bad_path, input_vocals, sr) + sf.write(backing_bad_path, 0.7 * input_vocals, sr) + + good = score_karaoke_stems(input_path, lead_good_path, backing_good_path) + bad = score_karaoke_stems(input_path, lead_bad_path, backing_bad_path) + + 
self.assertGreater(good["score"], bad["score"]) + self.assertLess(good["reconstruction_error"], bad["reconstruction_error"]) + self.assertLess(good["lead_backing_abs_corr"], bad["lead_backing_abs_corr"]) + + def test_karaoke_candidate_score_penalizes_truncated_stems(self): + from tools.evaluate_karaoke_models import score_karaoke_stems + + sr = 16000 + t = np.arange(sr, dtype=np.float32) / sr + lead_good = 0.18 * np.sin(2 * np.pi * 220 * t) + backing_good = 0.04 * np.sin(2 * np.pi * 330 * t + 0.4) + input_vocals = lead_good + backing_good + short_len = sr // 4 + + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_path = Path(tmp_dir) + input_path = tmp_path / "input.wav" + lead_short_path = tmp_path / "lead_short.wav" + backing_short_path = tmp_path / "backing_short.wav" + lead_full_path = tmp_path / "lead_full.wav" + backing_full_path = tmp_path / "backing_full.wav" + + sf.write(input_path, input_vocals, sr) + sf.write(lead_short_path, lead_good[:short_len], sr) + sf.write(backing_short_path, backing_good[:short_len], sr) + sf.write(lead_full_path, 0.97 * lead_good, sr) + sf.write(backing_full_path, 0.97 * backing_good, sr) + + short = score_karaoke_stems(input_path, lead_short_path, backing_short_path) + full = score_karaoke_stems(input_path, lead_full_path, backing_full_path) + + self.assertIn("length_coverage", short) + self.assertLess(short["length_coverage"], 0.999) + self.assertGreaterEqual(full["length_coverage"], 0.999) + self.assertGreater(full["score"], short["score"]) + + def test_reference_karaoke_score_uses_true_si_sdr_when_refs_exist(self): + from tools.evaluate_karaoke_models import score_reference_stems + + sr = 16000 + t = np.arange(sr, dtype=np.float32) / sr + lead = 0.18 * np.sin(2 * np.pi * 220 * t) + backing = 0.04 * np.sin(2 * np.pi * 330 * t + 0.4) + + with tempfile.TemporaryDirectory() as tmp_dir: + tmp_path = Path(tmp_dir) + reference_lead_path = tmp_path / "reference_lead.wav" + reference_backing_path = tmp_path / 
"reference_backing.wav" + lead_path = tmp_path / "lead.wav" + backing_path = tmp_path / "backing.wav" + + sf.write(reference_lead_path, lead, sr) + sf.write(reference_backing_path, backing, sr) + sf.write(lead_path, lead, sr) + sf.write(backing_path, backing, sr) + + metrics = score_reference_stems( + reference_lead_path, + reference_backing_path, + lead_path, + backing_path, + ) + + self.assertGreater(metrics["mean_si_sdr"], 100.0) + self.assertIn("lead", metrics["stems"]) + self.assertIn("backing", metrics["stems"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_official_adapter.py b/tests/test_official_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..07dd1787d0c91be546e05c27fedab2f2959c95d6 --- /dev/null +++ b/tests/test_official_adapter.py @@ -0,0 +1,34 @@ +import sys +import tempfile +import unittest +from pathlib import Path +from unittest import mock + +import numpy as np +import soundfile as sf + +from infer import official_adapter + + +class OfficialAdapterTests(unittest.TestCase): + def test_isolated_argv_restores_cli_args(self): + original = ["runner.py", "--input", "song.wav"] + with mock.patch.object(sys, "argv", original[:]): + with official_adapter._IsolatedArgv(): + self.assertEqual(sys.argv, ["runner.py"]) + self.assertEqual(sys.argv, original) + + def test_audio_activity_stats_detects_silent_file(self): + with tempfile.TemporaryDirectory() as tmp_dir: + path = Path(tmp_dir) / "silent.wav" + sf.write(path, np.zeros((3200, 2), dtype=np.float32), 16000) + + rms, peak, nonzero = official_adapter._get_audio_activity_stats(path) + + self.assertEqual(rms, 0.0) + self.assertEqual(peak, 0.0) + self.assertEqual(nonzero, 0) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_quality_policy.py b/tests/test_quality_policy.py new file mode 100644 index 0000000000000000000000000000000000000000..7caee54d5ee4b9dc64b49aefff9528dba57930a7 --- /dev/null +++ b/tests/test_quality_policy.py 
@@ -0,0 +1,247 @@ +import importlib.util +import re +import sys +import unittest +from pathlib import Path + +import numpy as np + +REPO_ROOT = Path(__file__).resolve().parents[1] + + +def _load_quality_policy_module(): + module_path = REPO_ROOT / "infer" / "quality_policy.py" + spec = importlib.util.spec_from_file_location("quality_policy", module_path) + if spec is None or spec.loader is None: + raise ImportError(f"Unable to load module from {module_path}") + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +quality_policy = _load_quality_policy_module() +build_conservative_crepe_fill_mask = quality_policy.build_conservative_crepe_fill_mask +build_conservative_harvest_fill_mask = quality_policy.build_conservative_harvest_fill_mask +compute_chunk_crossfade_samples = quality_policy.compute_chunk_crossfade_samples +compute_active_source_replace = quality_policy.compute_active_source_replace +compute_breath_preserving_energy_gates = quality_policy.compute_breath_preserving_energy_gates +compute_source_cleanup_budget = quality_policy.compute_source_cleanup_budget +resolve_cover_f0_policy = quality_policy.resolve_cover_f0_policy + + +class F0RoutingPolicyTests(unittest.TestCase): + def test_hybrid_routes_to_conservative_rmvpe_fallback(self): + policy = resolve_cover_f0_policy("hybrid", "off") + + self.assertEqual(policy.requested_method, "hybrid") + self.assertEqual(policy.vc_method, "rmvpe") + self.assertEqual(policy.hybrid_mode, "fallback") + self.assertEqual(policy.gate_method, "rmvpe") + + +class ConservativeCrepeFillTests(unittest.TestCase): + def test_only_short_internal_gaps_are_filled(self): + f0_rmvpe = np.array([0.0, 0.0, 120.0, 120.0, 0.0, 0.0, 120.0, 120.0, 0.0, 0.0], dtype=np.float32) + f0_crepe = np.full_like(f0_rmvpe, 121.0) + confidence = np.full_like(f0_rmvpe, 0.95) + + fill_mask = build_conservative_crepe_fill_mask( + f0_rmvpe, + f0_crepe, + confidence, + 
confidence_threshold=0.6, + max_ratio=0.5, + max_frames=4, + context_radius=2, + ) + + expected = np.array([False, False, False, False, True, True, False, False, False, False]) + np.testing.assert_array_equal(fill_mask, expected) + + def test_harvest_fill_rejects_long_gap_and_accepts_short_consistent_gap(self): + reference_f0 = np.array( + [120.0, 121.0, 122.0, 0.0, 0.0, 121.5, 122.0, 121.0, 0.0, 0.0, 0.0, 0.0, 122.0, 121.0], + dtype=np.float32, + ) + fallback_f0 = np.array( + [0.0, 0.0, 0.0, 121.0, 121.5, 0.0, 0.0, 0.0, 122.0, 122.0, 122.0, 122.0, 0.0, 0.0], + dtype=np.float32, + ) + dropout_mask = reference_f0 <= 0 + + fill_mask = build_conservative_harvest_fill_mask( + reference_f0=reference_f0, + fallback_f0=fallback_f0, + dropout_mask=dropout_mask, + max_run=3, + local_radius=2, + max_semitones=2.0, + ) + + expected = np.array( + [False, False, False, True, True, False, False, False, False, False, False, False, False, False], + dtype=bool, + ) + np.testing.assert_array_equal(fill_mask, expected) + + def test_chunk_crossfade_scales_above_legacy_floor_for_multi_segment_audio(self): + self.assertEqual( + compute_chunk_crossfade_samples(tgt_sr=48000, t_pad_tgt=144000, segment_count=1), + 0, + ) + self.assertEqual( + compute_chunk_crossfade_samples(tgt_sr=48000, t_pad_tgt=144000, segment_count=2), + 864, + ) + self.assertEqual( + compute_chunk_crossfade_samples(tgt_sr=48000, t_pad_tgt=144000, segment_count=5), + 1152, + ) + + +class SourceConstraintPolicyTests(unittest.TestCase): + def test_active_echo_frames_keep_nonzero_replace_pressure(self): + activity = np.array([1.0, 1.0, 0.0], dtype=np.float32) + soft_mask = np.array([[0.10, 0.55, 0.10]], dtype=np.float32) + echo_ratio = np.array([[0.90, 0.85, 0.40]], dtype=np.float32) + direct_ratio = np.array([0.10, 0.25, 0.15], dtype=np.float32) + + replace = compute_active_source_replace(activity, soft_mask, echo_ratio, direct_ratio) + + self.assertGreater(replace[0, 0], 0.40) + self.assertGreater(replace[0, 1], 0.10) + 
self.assertLess(replace[0, 1], replace[0, 0]) + self.assertLessEqual(float(np.max(replace)), 0.82) + + def test_cleanup_budget_caps_active_boost_below_two_x(self): + energy_guard = np.array([0.0, 0.5, 1.0], dtype=np.float32) + phrase_activity = np.array([0.0, 0.5, 1.0], dtype=np.float32) + + allowed_boost, cleanup_floor = compute_source_cleanup_budget( + energy_guard, + phrase_activity, + ) + + np.testing.assert_allclose( + allowed_boost, + np.array([0.35, 0.85, 1.35], dtype=np.float32), + atol=1e-6, + ) + np.testing.assert_allclose( + cleanup_floor, + np.array([0.62, 0.70, 0.78], dtype=np.float32), + atol=1e-6, + ) + self.assertLess(float(np.max(allowed_boost)), 1.5) + + +class BreathEnergyGateTests(unittest.TestCase): + def test_marginal_quiet_unvoiced_frames_keep_more_feature_than_pitch(self): + energy_db = np.array([-70.0, -60.0, -47.0, -38.0], dtype=np.float32) + unvoiced_mask = np.array([True, True, True, False], dtype=bool) + + feature_gate, pitch_gate = compute_breath_preserving_energy_gates( + energy_db=energy_db, + ref_db=-12.0, + unvoiced_mask=unvoiced_mask, + quiet_floor=0.05, + breath_floor=0.28, + breath_active_margin_db=52.0, + transition_width_db=6.0, + ) + + self.assertAlmostEqual(float(feature_gate[0]), 0.05, places=5) + self.assertAlmostEqual(float(pitch_gate[0]), 0.05, places=5) + self.assertGreater(float(feature_gate[1]), float(pitch_gate[1])) + self.assertGreater(float(feature_gate[1]), 0.13) + self.assertLess(float(pitch_gate[1]), 0.13) + self.assertGreater(float(feature_gate[1]), float(pitch_gate[1])) + self.assertAlmostEqual(float(feature_gate[2]), float(pitch_gate[2]), places=5) + self.assertAlmostEqual(float(feature_gate[3]), float(pitch_gate[3]), places=5) + + +class SourceRegressionTests(unittest.TestCase): + def test_cover_pipeline_uses_single_conservative_cleanup_budget(self): + source = (REPO_ROOT / "infer" / "cover_pipeline.py").read_text(encoding="utf-8") + + self.assertNotIn("allowed_boost = 0.50 + 1.50 * energy_guard", source) + 
self.assertNotRegex( + source, + r"cleanup_gain\s*=\s*np\.clip\(\s*frame_budget\s*/\s*\(constrained_frame_rms \+ eps\),\s*0\.75 \+ 0\.20 \* phrase_activity", + ) + + def test_cover_pipeline_gain_parameters_match_runtime_clip(self): + source = (REPO_ROOT / "infer" / "cover_pipeline.py").read_text(encoding="utf-8") + + self.assertIn("min_gain=0.85", source) + self.assertIn("max_gain=1.12", source) + self.assertNotIn("min_gain=0.95", source) + self.assertNotIn("max_gain=1.30", source) + self.assertIn("reduction_ratio", source) + self.assertIn("strict_deecho_plus", source) + self.assertIn("_apply_source_breath_cleanup", source) + self.assertIn("_apply_source_transition_cleanup", source) + self.assertIn("Source breath cleanup:", source) + self.assertIn("Source transition cleanup:", source) + self.assertIn("_record_quality_debug", source) + self.assertIn("quality_debug.json", source) + self.assertIn("debug_clips", source) + self.assertIn("VC backend: upstream_official_raw + current postprocess", source) + self.assertIn("convert_vocals_official_upstream(", source) + self.assertNotIn('log.detail("使用当前项目官方封装VC进行转换")\n convert_vocals_official(', source) + + def test_official_vc_pipeline_no_longer_forces_hybrid_to_aggressive_crepe(self): + source = (REPO_ROOT / "infer" / "modules" / "vc" / "pipeline.py").read_text(encoding="utf-8") + + self.assertNotIn('self.f0_hybrid_mode = "rmvpe+crepe"', source) + + def test_config_defaults_match_conservative_hybrid_policy(self): + source = (REPO_ROOT / "configs" / "config.json").read_text(encoding="utf-8") + + self.assertRegex(source, r'"f0_hybrid_mode"\s*:\s*"fallback"') + self.assertNotRegex(source, r'"crepe_force_ratio"\s*:\s*0\.0') + + def test_official_vc_pipeline_uses_breath_preserving_energy_gates(self): + source = (REPO_ROOT / "infer" / "modules" / "vc" / "pipeline.py").read_text(encoding="utf-8") + + self.assertIn("compute_breath_preserving_energy_gates", source) + self.assertIn("self.unvoiced_feature_gate_floor", source) + 
self.assertIn("compute_chunk_crossfade_samples", source) + self.assertIn("build_conservative_harvest_fill_mask", source) + self.assertIn("分段边界(秒)", source) + + def test_config_includes_breath_gate_defaults(self): + source = (REPO_ROOT / "configs" / "config.json").read_text(encoding="utf-8") + + self.assertRegex(source, r'"unvoiced_feature_gate_floor"\s*:\s*0\.28') + self.assertRegex(source, r'"breath_active_margin_db"\s*:\s*52\.0') + + def test_ui_only_exposes_strict_sota_vc_preprocess_modes(self): + ui_source = (REPO_ROOT / "ui" / "app.py").read_text(encoding="utf-8") + i18n_source = (REPO_ROOT / "i18n" / "zh_CN.json").read_text(encoding="utf-8") + cover_source = (REPO_ROOT / "infer" / "cover_pipeline.py").read_text(encoding="utf-8") + dereverb_source = (REPO_ROOT / "infer" / "advanced_dereverb.py").read_text(encoding="utf-8") + + self.assertIn('t("vc_preprocess_auto", "cover"): "auto"', ui_source) + self.assertIn('t("vc_preprocess_uvr_deecho", "cover"): "uvr_deecho"', ui_source) + self.assertNotIn('t("vc_preprocess_direct", "cover"): "direct"', ui_source) + self.assertNotIn('t("vc_preprocess_legacy", "cover"): "legacy"', ui_source) + self.assertIn('vc_preprocess_mode not in {"auto", "uvr_deecho"}', ui_source) + self.assertIn("不可用时停止,不降级", i18n_source) + self.assertNotIn('"vc_preprocess_direct"', i18n_source) + self.assertNotIn('"vc_preprocess_legacy"', i18n_source) + self.assertNotIn("主唱直通", i18n_source) + self.assertNotIn("回退到算法", i18n_source) + self.assertNotIn('self._last_vc_preprocess_mode == "direct"', cover_source) + self.assertNotIn("apply_reverb_to_converted", dereverb_source) + + def test_upstream_official_adapter_routes_hybrid_to_conservative_vc_method(self): + source = (REPO_ROOT / "infer" / "official_adapter.py").read_text(encoding="utf-8") + + self.assertIn("effective_f0_method = f0_policy.vc_method", source) + self.assertIn("官方F0路由解析", source) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_separator_outputs.py 
b/tests/test_separator_outputs.py new file mode 100644 index 0000000000000000000000000000000000000000..1c12c9ed269119f6e91018d37e1cf7228d49cd58 --- /dev/null +++ b/tests/test_separator_outputs.py @@ -0,0 +1,60 @@ +import tempfile +import unittest +from pathlib import Path + +from infer.separator import RoformerDereverbSeparator, _resolve_output_files + + +class SeparatorOutputResolutionTests(unittest.TestCase): + def test_resolves_existing_file_when_returned_name_has_stale_prefix(self): + with tempfile.TemporaryDirectory() as tmp: + output_dir = Path(tmp) + actual = output_dir / "song_(noreverb)_dereverb.wav" + actual.write_bytes(b"audio") + + resolved = _resolve_output_files( + ["vocal_song.wav_10_(noreverb)_dereverb.wav"], + output_dir, + ) + + self.assertEqual([Path(path).name for path in resolved], [actual.name]) + + def test_classifies_noreverb_as_dry_before_reverb(self): + self.assertEqual( + RoformerDereverbSeparator._classify_stem( + "vocal_song.wav_10_(noreverb)_dereverb_mel_band_roformer.wav" + ), + "dry", + ) + + def test_classifies_reverb_as_wet(self): + self.assertEqual( + RoformerDereverbSeparator._classify_stem( + "vocal_song.wav_10_(reverb)_dereverb_mel_band_roformer.wav" + ), + "wet", + ) + + def test_dereverb_does_not_accept_wet_output_as_dry(self): + with tempfile.TemporaryDirectory() as tmp: + output_dir = Path(tmp) + wet = output_dir / "song_(reverb)_dereverb.wav" + wet.write_bytes(b"audio") + + separator = RoformerDereverbSeparator.__new__(RoformerDereverbSeparator) + separator.load_model = lambda output_dir="": None + separator.separator = type( + "FakeSeparator", + (), + { + "output_dir": str(output_dir), + "separate": lambda self, audio_path: [str(wet)], + }, + )() + + with self.assertRaisesRegex(FileNotFoundError, "dry轨未找到"): + separator.separate_dry("input.wav", str(output_dir)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_ui_language.py b/tests/test_ui_language.py new file mode 100644 index 
0000000000000000000000000000000000000000..64227759da8be9605ee40234682b98ff1a81b904 --- /dev/null +++ b/tests/test_ui_language.py @@ -0,0 +1,93 @@ +import json +import tempfile +import unittest +from pathlib import Path + +from ui import app as ui_app + + +class UiLanguageTests(unittest.TestCase): + def test_default_language_is_chinese_when_config_has_no_language(self): + self.assertEqual(ui_app.get_configured_language({}), "zh_CN") + + def test_configured_language_accepts_english(self): + self.assertEqual(ui_app.get_configured_language({"language": "en_US"}), "en_US") + + def test_configured_language_rejects_unsupported_locale(self): + with self.assertRaises(ValueError): + ui_app.get_configured_language({"language": "ja_JP"}) + + def test_language_choice_maps_display_labels_to_locale_codes(self): + self.assertEqual(ui_app.resolve_language_choice("中文"), "zh_CN") + self.assertEqual(ui_app.resolve_language_choice("English"), "en_US") + self.assertEqual(ui_app.resolve_language_choice("en_US"), "en_US") + + def test_save_language_setting_persists_locale_code(self): + with tempfile.TemporaryDirectory() as tmp_dir: + config_path = Path(tmp_dir) / "config.json" + config_path.write_text(json.dumps({"device": "cpu"}), encoding="utf-8") + + status = ui_app.save_language_setting("English", config_path=config_path) + saved = json.loads(config_path.read_text(encoding="utf-8")) + + self.assertEqual(saved["language"], "en_US") + self.assertIn("English", status) + self.assertIn("restart", status.lower()) + + def test_language_packs_include_selector_keys(self): + for lang in ("zh_CN", "en_US"): + data = ui_app.load_i18n(lang) + self.assertIn("settings", data) + self.assertIn("language", data["settings"]) + self.assertIn("save_language", data["settings"]) + self.assertIn("language_saved_restart", data["settings"]) + + def test_language_packs_include_primary_ui_keys(self): + required_ui_keys = { + "cover_usage", + "series_filter", + "keyword_search", + "keyword_placeholder", + 
"character_choice_info", + "download_character_info", + "refresh_models", + "download_selected_character", + "download_series_all", + "download_all_characters", + "download_status", + "model_name", + "model_path", + "index_path", + "no_models", + "positive_pitch_info", + "normal_volume_info", + "reverb_info", + } + required_settings_keys = { + "runtime_settings", + "compute_device", + "save_settings", + "settings_saved_restart", + "status", + "cpu_slow", + "about_body", + "model_sources", + } + + for lang in ("zh_CN", "en_US"): + data = ui_app.load_i18n(lang) + self.assertTrue(required_ui_keys.issubset(data.get("ui", {}))) + self.assertTrue(required_settings_keys.issubset(data.get("settings", {}))) + + def test_ui_exposes_language_selector_and_save_handler(self): + source = Path("ui/app.py").read_text(encoding="utf-8") + + self.assertIn('label=t("language", "settings")', source) + self.assertIn("choices=list(LANGUAGE_LABEL_TO_CODE.keys())", source) + self.assertIn("fn=save_language_setting", source) + self.assertIn('t("cover_usage", "ui")', source) + self.assertIn('t("runtime_settings", "settings")', source) + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/character_models.py b/tools/character_models.py index b312b0ae4d4c16282bc5b7584ce4695d5794fdd3..a2c4f32b75057d0372327c570e4b48c0b3c18184 100644 --- a/tools/character_models.py +++ b/tools/character_models.py @@ -9,6 +9,7 @@ import zipfile import shutil from pathlib import Path from typing import Optional, List, Dict, Callable +from urllib.parse import quote try: from huggingface_hub import hf_hub_download, list_repo_files @@ -21,6 +22,17 @@ except ImportError: HF_REPO_ID = "trioskosmos/rvc_models" _VERSION_NOTE_CACHE: Dict[str, Optional[str]] = {} _VERSION_NOTE_CACHE_LOADED = False +LOCAL_MODEL_INFO_FILENAME = "ai_rvc_model.json" +INTEGRATED_REPO_IDS = { + "trioskosmos/rvc_models", + "Icchan/LoveLive", + "0xMifune/LoveLive", + "Zurakichi/RVC", + "Swordsmagus/Love-Live-RVC", + 
"makiligon/RVC-Models", + "kohaku12/RVC-MODELS", + "megaaziib/my-rvc-models-collection", +} def _get_hf_token() -> Optional[str]: @@ -59,8 +71,133 @@ def normalize_series(source: str) -> str: return source -def _get_display_name(info: Dict, fallback: str) -> str: - """拼接中文名 / 英文名 / 日文名用于展示""" +def _dedupe_parts(parts: List[str]) -> List[str]: + result = [] + seen = set() + for part in parts: + text = str(part or "").strip() + if not text: + continue + if text in seen: + continue + seen.add(text) + result.append(text) + return result + + +def _get_registry_repo_id(info: Dict) -> Optional[str]: + repo_id = str(info.get("repo") or "").strip() + if repo_id: + return repo_id + if info.get("gdrive_id") or info.get("url"): + return None + if info.get("file") or info.get("files") or info.get("pattern"): + return HF_REPO_ID + return None + + +def _build_repo_page_url(repo_id: Optional[str]) -> Optional[str]: + repo_id = str(repo_id or "").strip() + if not repo_id: + return None + return f"https://huggingface.co/{repo_id}" + + +def _build_repo_file_url(repo_id: Optional[str], file_name: Optional[str]) -> Optional[str]: + repo_id = str(repo_id or "").strip() + file_name = str(file_name or "").strip() + if not repo_id or not file_name: + return None + return f"https://huggingface.co/{repo_id}/resolve/main/{quote(file_name, safe='/')}" + + +def _build_gdrive_view_url(file_id: Optional[str]) -> Optional[str]: + file_id = str(file_id or "").strip() + if not file_id: + return None + return f"https://drive.google.com/file/d/{file_id}/view?usp=sharing" + + +def _infer_download_method(info: Dict) -> str: + if info.get("gdrive_id"): + return "google_drive" + if info.get("url"): + return "direct_url" + if info.get("files"): + return "huggingface_files" + if info.get("pattern"): + return "huggingface_pattern" + if info.get("file"): + return "huggingface_zip" + return "unknown" + + +def _infer_distribution(info: Dict) -> str: + distribution = str(info.get("distribution") or "").strip() + 
if distribution: + return distribution + if info.get("gdrive_id"): + return "社区直链" + if info.get("url"): + if "huggingface.co" in str(info.get("url", "")): + return "独立仓库" + return "外部直链" + repo_id = _get_registry_repo_id(info) + if repo_id in INTEGRATED_REPO_IDS: + return "整合仓库" + if repo_id: + return "独立仓库" + return "未知来源" + + +def _infer_continuity(info: Dict) -> Optional[str]: + continuity = str(info.get("continuity") or "").strip() + if continuity: + return continuity + + source = str(info.get("source") or "").strip() + joined_name = " ".join( + str(info.get(key) or "") + for key in ("zh_name", "en_name", "jp_name", "variant") + ) + + if "幻日夜羽" in source or "夜羽" in joined_name or "ヨハネ" in joined_name: + return "幻日夜羽" + if source.startswith("Love Live! Sunshine!!"): + return "Sunshine 正篇" + if source.startswith("Love Live! 虹咲"): + return "虹咲学园" + if source.startswith("Love Live! Superstar!!"): + return "Superstar!!" + if source.startswith("Love Live! 莲之空"): + return "莲之空" + if source == "Love Live!": + return "μ's" + return None + + +def _humanize_variant(info: Dict) -> Optional[str]: + variant = str(info.get("variant") or "").strip() + if not variant: + return None + if variant.lower() == "trios": + return "trios 版" + return variant + + +def _build_variant_label(name: str, info: Dict) -> Optional[str]: + parts = [] + variant = _humanize_variant(info) + if variant: + parts.append(variant) + variant_note = _get_version_note(name, info) + if variant_note and variant_note != "未提供版本说明": + parts.append(variant_note) + deduped = _dedupe_parts(parts) + return " · ".join(deduped) if deduped else None + + +def _get_base_display_name(info: Dict, fallback: str) -> str: zh_name = info.get("zh_name") or info.get("description") or fallback en_name = info.get("en_name") jp_name = info.get("jp_name") @@ -69,13 +206,15 @@ def _get_display_name(info: Dict, fallback: str) -> str: parts.append(en_name) if jp_name and jp_name != zh_name and jp_name != en_name: parts.append(jp_name) 
- display = " / ".join(parts) - variant = info.get("variant") - if variant: - display = f"{display} - {variant}" - variant_note = _get_version_note(fallback, info) - if variant_note: - display = f"{display} ({variant_note})" + return " / ".join(parts) + + +def _get_display_name(info: Dict, fallback: str) -> str: + """拼接中文名 / 英文名 / 日文名用于展示""" + display = _get_base_display_name(info, fallback) + variant_label = _build_variant_label(fallback, info) + if variant_label: + display = f"{display} - {variant_label}" return display @@ -185,6 +324,11 @@ def _note_from_metadata(path: Path) -> Optional[str]: except Exception: return None + for key in ("version_label", "version_note", "variant"): + value = _normalize_note(str(data.get(key) or "")) + if value: + return value + parts = [] title = str(data.get("title") or "") desc = str(data.get("description") or "") @@ -281,7 +425,7 @@ def _get_version_note(name: str, info: Dict) -> Optional[str]: # 1) 读取本地模型目录中的 metadata / info char_dir = get_character_models_dir() / name if char_dir.exists(): - for candidate in ("metadata.json", "model_info.json", "info.json"): + for candidate in (LOCAL_MODEL_INFO_FILENAME, "metadata.json", "model_info.json", "info.json"): path = char_dir / candidate if path.exists(): note = _note_from_metadata(path) @@ -322,6 +466,108 @@ def _get_version_note(name: str, info: Dict) -> Optional[str]: return note +def _load_local_model_info(char_dir: Path) -> Dict: + info_path = char_dir / LOCAL_MODEL_INFO_FILENAME + if not info_path.exists(): + return {} + try: + with open(info_path, "r", encoding="utf-8") as f: + data = json.load(f) + if isinstance(data, dict): + return data + except Exception: + pass + return {} + + +def _build_source_page_url(info: Dict) -> Optional[str]: + explicit = str(info.get("source_page_url") or "").strip() + if explicit: + return explicit + repo_id = _get_registry_repo_id(info) + if repo_id: + return _build_repo_page_url(repo_id) + if info.get("url"): + return 
str(info.get("url")).strip() + return _build_gdrive_view_url(info.get("gdrive_id")) + + +def _build_download_url(info: Dict) -> Optional[str]: + explicit = str(info.get("download_url") or "").strip() + if explicit: + return explicit + if info.get("url"): + return str(info.get("url")).strip() + if info.get("gdrive_id"): + return _build_gdrive_view_url(info.get("gdrive_id")) + repo_id = _get_registry_repo_id(info) + file_name = info.get("file") + if file_name: + return _build_repo_file_url(repo_id, file_name) + files = info.get("files") or [] + if files: + return _build_repo_file_url(repo_id, files[0]) + return _build_repo_page_url(repo_id) + + +def _build_character_record(name: str, info: Dict) -> Dict: + source = info.get("source", "未知") + base_display = _get_base_display_name(info, name) + display = _get_display_name(info, name) + repo_id = _get_registry_repo_id(info) + version_note = _get_version_note(name, info) + version_label = ( + str(info.get("version_label") or "").strip() + or _build_variant_label(name, info) + or version_note + or "" + ) + return { + "name": name, + "description": info.get("description", display), + "base_display": base_display, + "display": display, + "source": source, + "series": normalize_series(source), + "variant": str(info.get("variant") or "").strip(), + "version_note": version_note, + "version_label": version_label, + "role": str(info.get("role") or "角色模型").strip() or "角色模型", + "continuity": _infer_continuity(info) or "", + "distribution": _infer_distribution(info), + "repo": repo_id, + "source_page_url": _build_source_page_url(info), + "download_url": _build_download_url(info), + "download_method": _infer_download_method(info), + "file": info.get("file"), + "files": info.get("files"), + "url": info.get("url"), + "gdrive_id": info.get("gdrive_id"), + "zh_name": info.get("zh_name"), + "en_name": info.get("en_name"), + "jp_name": info.get("jp_name"), + } + + +def _write_local_model_info(name: str, char_dir: Path, info: Dict): + try: 
+ payload = _build_character_record(name, info) + payload["registry_key"] = name + payload["local_files"] = { + "pth": sorted(p.name for p in char_dir.glob("*.pth")), + "index": sorted(p.name for p in char_dir.glob("*.index")), + "metadata": sorted( + p.name for p in char_dir.glob("*.json") + if p.name != LOCAL_MODEL_INFO_FILENAME + ), + } + info_path = char_dir / LOCAL_MODEL_INFO_FILENAME + with open(info_path, "w", encoding="utf-8") as f: + json.dump(payload, f, ensure_ascii=False, indent=2) + except Exception: + pass + + def refresh_version_notes(force: bool = False) -> Dict[str, Optional[str]]: """批量读取本地模型的版本说明,写入缓存文件""" if force: @@ -514,7 +760,10 @@ CHARACTER_MODELS = { "zh_name": "松浦果南", "en_name": "Kanan Matsuura", "jp_name": "松浦果南", - "source": "Love Live! Sunshine!!" + "source": "Love Live! Sunshine!!", + "distribution": "社区直链", + "source_page_url": "https://rentry.co/llrvc", + "version_label": "RVC V2 · 211 epochs" }, "yu_takasaki": { "gdrive_id": "1xjIG_bsBzOTOwghGaLSMnO_vL2GgMPNw", @@ -1382,19 +1631,16 @@ def list_available_characters() -> List[Dict]: """ result = [] for name, info in CHARACTER_MODELS.items(): - source = info.get("source", "未知") - display = _get_display_name(info, name) - result.append({ - "name": name, - "description": info.get("description", display), - "display": display, - "source": source, - "series": normalize_series(source), - "file": info.get("file"), - "files": info.get("files"), - "url": info.get("url"), - "repo": info.get("repo", HF_REPO_ID) - }) + result.append(_build_character_record(name, info)) + result.sort( + key=lambda item: ( + str(item.get("series", "")), + str(item.get("base_display", item.get("display", ""))), + str(item.get("continuity", "")), + str(item.get("version_label", "")), + str(item.get("name", "")), + ) + ) return result @@ -1425,23 +1671,51 @@ def list_downloaded_characters() -> List[Dict]: continue seen.add(char_name) - info = CHARACTER_MODELS.get(char_name, {}) - source = info.get("source", "未知") - 
display = _get_display_name(info, char_name) + info = dict(CHARACTER_MODELS.get(char_name, {})) + local_info = _load_local_model_info(models_dir / char_name) + if local_info: + info.update({ + key: value for key, value in local_info.items() + if value not in (None, "", [], {}) + }) + record = _build_character_record(char_name, info) index_file = _find_index_file(pth_file) - downloaded.append({ - "name": char_name, - "description": info.get("description", display), - "display": display, - "source": source, - "series": normalize_series(source), + record.update({ "model_path": str(pth_file), - "index_path": str(index_file) if index_file else None + "index_path": str(index_file) if index_file else None, }) - + downloaded.append(record) + + downloaded.sort( + key=lambda item: ( + str(item.get("series", "")), + str(item.get("base_display", item.get("display", ""))), + str(item.get("continuity", "")), + str(item.get("version_label", "")), + str(item.get("name", "")), + ) + ) return downloaded +def get_character_info(name: str, downloaded_only: bool = False) -> Optional[Dict]: + if not name: + return None + if downloaded_only: + pool = list_downloaded_characters() + else: + pool = list_downloaded_characters() + list_available_characters() + seen = set() + for item in pool: + item_name = item.get("name") + if item_name in seen: + continue + seen.add(item_name) + if item_name == name: + return item + return None + + def get_character_model_path(name: str) -> Optional[Dict]: """ 获取角色模型路径 @@ -1515,6 +1789,7 @@ def download_character_model( # 检查是否已下载 char_dir = get_character_models_dir() / name if char_dir.exists() and list(char_dir.glob("*.pth")): + _write_local_model_info(name, char_dir, char_info) _safe_print(f"角色模型已存在: {name}") return True @@ -1681,6 +1956,7 @@ def download_character_model( if progress_callback: progress_callback(f"{name} 模型下载完成", 1.0) + _write_local_model_info(name, char_dir, char_info) # 下载完成后更新版本说明缓存 _get_version_note(name, char_info) 
_safe_print(f"角色模型已下载: {name}") diff --git a/tools/diagnose_vc_session.py b/tools/diagnose_vc_session.py new file mode 100644 index 0000000000000000000000000000000000000000..ecf1fbdad157054ea596938c23e69e276c4986a6 --- /dev/null +++ b/tools/diagnose_vc_session.py @@ -0,0 +1,680 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""Generate a short A/B diagnostic matrix for an existing cover session.""" +from __future__ import annotations + +import argparse +import json +import re +import sys +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Sequence, Tuple + +import numpy as np +import soundfile as sf + +REPO_ROOT = Path(__file__).resolve().parent.parent +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from infer.cover_pipeline import CoverPipeline +from infer.official_adapter import convert_vocals_official, convert_vocals_official_upstream + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Run several short VC variants on an existing session and export " + "per-clip WAVs plus summary.json for side-by-side diagnosis." + ) + ) + parser.add_argument("--session-dir", required=True, help="Session directory under temp/cover/") + parser.add_argument( + "--model-path", + help=( + "RVC .pth path. If omitted, try to infer from " + "outputs/__all_files_." + ), + ) + parser.add_argument("--index-path", help="Optional .index path. If omitted, try best-effort matching.") + parser.add_argument("--output-dir", help="Where to write diagnostic_matrix outputs.") + parser.add_argument("--stage", default="vc_final_state", help="Preferred stage in quality_debug.json for suspect times.") + parser.add_argument("--times", help="Comma-separated times in seconds. 
If set, skip automatic time selection.") + parser.add_argument("--max-clips", type=int, default=4, help="Maximum suspect clips to export.") + parser.add_argument("--window-sec", type=float, default=1.8, help="Window size around each suspect time.") + parser.add_argument("--gap-sec", type=float, default=0.35, help="Silence gap inserted between concatenated clips.") + parser.add_argument( + "--variants", + default="existing_raw,existing_final,upstream_raw,upstream_post,upstream_index0,official_repair", + help=( + "Comma-separated variants. Supported: " + "existing_raw, existing_final, upstream_raw, upstream_post, " + "upstream_index0, official_repair" + ), + ) + parser.add_argument("--f0-method", default="hybrid") + parser.add_argument("--pitch-shift", type=int, default=0) + parser.add_argument("--index-rate", type=float, default=0.5) + parser.add_argument("--filter-radius", type=int, default=3) + parser.add_argument("--rms-mix-rate", type=float, default=0.0) + parser.add_argument("--protect", type=float, default=0.33) + parser.add_argument("--speaker-id", type=int, default=0) + parser.add_argument("--source-constraint-mode", default="auto") + return parser.parse_args() + + +def _normalize_name(text: str) -> str: + return re.sub(r"[^a-z0-9]+", "", str(text or "").lower()) + + +def _tokenize_name(text: str) -> List[str]: + return [token for token in re.split(r"[^a-z0-9]+", str(text or "").lower()) if len(token) >= 2] + + +def _load_quality_debug(session_dir: Path) -> Dict[str, object]: + report_path = session_dir / "quality_debug.json" + if not report_path.exists(): + return {} + with open(report_path, "r", encoding="utf-8") as f: + return json.load(f) + + +def _get_stage_entry(entries: Sequence[Dict[str, object]], stage: str) -> Optional[Dict[str, object]]: + for entry in reversed(entries): + if entry.get("stage") == stage: + return entry + return None + + +def _dedupe_times(times: Iterable[float], min_delta_sec: float = 0.18) -> List[float]: + ordered = 
sorted(float(t) for t in times if float(t) >= 0.0) + deduped: List[float] = [] + for time_sec in ordered: + if not deduped or abs(time_sec - deduped[-1]) >= min_delta_sec: + deduped.append(round(time_sec, 3)) + return deduped + + +def _collect_suspect_times( + session_dir: Path, + preferred_stage: str, + max_clips: int, +) -> Tuple[List[float], Dict[str, object]]: + payload = _load_quality_debug(session_dir) + entries = payload.get("stages", []) + if not isinstance(entries, list): + entries = [] + + stage_order = [ + preferred_stage, + "vc_final_state", + "vc_refined", + "vc_source_constrained", + "vc_raw", + ] + collected: List[float] = [] + for stage in stage_order: + entry = _get_stage_entry(entries, stage) + if not entry: + continue + analysis = entry.get("analysis", {}) + if not isinstance(analysis, dict): + continue + times = analysis.get("transition_spike_times_sec", []) + if isinstance(times, list): + collected.extend(float(t) for t in times[:12]) + collected = _dedupe_times(collected) + if len(collected) >= max_clips: + break + + return collected[: max(1, int(max_clips))], payload if isinstance(payload, dict) else {} + + +def _choose_model_file(folder: Path) -> Path: + candidates = sorted(folder.glob("*.pth")) + if not candidates: + raise FileNotFoundError(f"No .pth model found in {folder}") + if len(candidates) == 1: + return candidates[0] + + folder_norm = _normalize_name(folder.name) + best_path = candidates[0] + best_score = -1 + for path in candidates: + stem_norm = _normalize_name(path.stem) + score = 0 + if stem_norm == folder_norm: + score += 1000 + if folder_norm and (folder_norm in stem_norm or stem_norm in folder_norm): + score += 300 + score += len(set(_tokenize_name(folder.name)) & set(_tokenize_name(path.stem))) * 40 + if score > best_score: + best_score = score + best_path = path + return best_path + + +def _resolve_index_for_model(model_path: Path, explicit_index: Optional[str] = None) -> Optional[Path]: + if explicit_index: + explicit = 
Path(explicit_index) + if explicit.exists(): + return explicit + + candidates = list(model_path.parent.glob("*.index")) + official_indexes = REPO_ROOT / "assets" / "weights" / "official_indexes" + if official_indexes.exists(): + candidates.extend(list(official_indexes.glob("*.index"))) + if not candidates: + return None + if len(candidates) == 1: + return candidates[0] + + model_norm = _normalize_name(model_path.stem) + model_tokens = set(_tokenize_name(model_path.stem)) + best_match = None + best_score = -1 + for candidate in candidates: + idx_norm = _normalize_name(candidate.stem) + idx_tokens = set(_tokenize_name(candidate.stem)) + score = 0 + if idx_norm == model_norm: + score += 1000 + if model_norm and (model_norm in idx_norm or idx_norm in model_norm): + score += 300 + score += len(model_tokens & idx_tokens) * 40 + if "added" in candidate.stem.lower(): + score += 10 + if score > best_score: + best_score = score + best_match = candidate + return best_match if best_score > 0 else None + + +def _infer_model_from_session(session_dir: Path) -> Tuple[Path, Optional[Path], Optional[str]]: + session_id = session_dir.name + outputs_root = REPO_ROOT / "outputs" + characters_root = REPO_ROOT / "assets" / "weights" / "characters" + if not outputs_root.exists() or not characters_root.exists(): + raise FileNotFoundError("Cannot infer model because outputs/ or assets/weights/characters/ is missing.") + + output_matches = sorted(outputs_root.glob(f"*_all_files_{session_id}")) + if not output_matches: + raise FileNotFoundError("Could not infer model from outputs/. 
Pass --model-path explicitly.") + + character_dirs = [path for path in characters_root.iterdir() if path.is_dir()] + best_dir = None + best_score = -1 + for output_dir in output_matches: + prefix = output_dir.name[: -len(f"_all_files_{session_id}")] + for character_dir in character_dirs: + char_name = character_dir.name + if prefix == char_name or prefix.endswith(f"_{char_name}"): + score = len(char_name) + if score > best_score: + best_score = score + best_dir = character_dir + + if best_dir is None: + raise FileNotFoundError( + "Could not match the session output folder to a character model folder. " + "Pass --model-path explicitly." + ) + + model_path = _choose_model_file(best_dir) + index_path = _resolve_index_for_model(model_path) + return model_path, index_path, best_dir.name + + +def _extract_bundle( + source_path: Path, + output_path: Path, + times_sec: Sequence[float], + window_sec: float, + gap_sec: float, +) -> List[Dict[str, float]]: + audio, sr = sf.read(str(source_path), always_2d=True) + audio = np.asarray(audio, dtype=np.float32) + if audio.shape[0] <= 0: + raise ValueError(f"Empty audio: {source_path}") + + duration_sec = float(audio.shape[0] / max(sr, 1)) + bundle_parts: List[np.ndarray] = [] + manifest: List[Dict[str, float]] = [] + cursor_sec = 0.0 + gap_samples = int(round(max(0.0, gap_sec) * sr)) + target_window = max(0.2, float(window_sec)) + + for index, time_sec in enumerate(times_sec): + clip_start_sec = 0.0 + if duration_sec > target_window: + clip_start_sec = min(max(0.0, float(time_sec) - target_window / 2.0), duration_sec - target_window) + clip_end_sec = min(duration_sec, clip_start_sec + target_window) + start_sample = int(round(clip_start_sec * sr)) + end_sample = max(start_sample + 1, int(round(clip_end_sec * sr))) + clip = audio[start_sample:end_sample] + if clip.size == 0: + continue + + actual_duration = float(clip.shape[0] / sr) + bundle_parts.append(clip.astype(np.float32)) + manifest.append( + { + "clip_id": index, + 
"source_time_sec": round(float(time_sec), 3), + "clip_start_sec": round(float(clip_start_sec), 6), + "clip_end_sec": round(float(clip_end_sec), 6), + "bundle_start_sec": round(float(cursor_sec), 6), + "bundle_end_sec": round(float(cursor_sec + actual_duration), 6), + "duration_sec": round(actual_duration, 6), + } + ) + cursor_sec += actual_duration + + if gap_samples > 0 and index < len(times_sec) - 1: + bundle_parts.append(np.zeros((gap_samples, audio.shape[1]), dtype=np.float32)) + cursor_sec += float(gap_samples / sr) + + if not bundle_parts: + raise ValueError(f"No clips were extracted from {source_path}") + + output_path.parent.mkdir(parents=True, exist_ok=True) + bundle_audio = np.concatenate(bundle_parts, axis=0).astype(np.float32) + sf.write(str(output_path), bundle_audio, sr) + return manifest + + +def _split_bundle( + bundle_path: Path, + manifest: Sequence[Dict[str, float]], + output_dir: Path, + prefix: str, +) -> Dict[str, str]: + audio, sr = sf.read(str(bundle_path), always_2d=True) + audio = np.asarray(audio, dtype=np.float32) + output_dir.mkdir(parents=True, exist_ok=True) + exported: Dict[str, str] = {} + + for segment in manifest: + bundle_start_sec = float(segment["bundle_start_sec"]) + bundle_end_sec = float(segment["bundle_end_sec"]) + start_sample = max(0, int(round(bundle_start_sec * sr))) + end_sample = min(audio.shape[0], max(start_sample + 1, int(round(bundle_end_sec * sr)))) + if end_sample <= start_sample: + continue + clip = audio[start_sample:end_sample].astype(np.float32) + clip_key = f"{float(segment['source_time_sec']):07.3f}s" + output_path = output_dir / f"{prefix}_{clip_key}.wav" + sf.write(str(output_path), clip, sr) + exported[clip_key] = str(output_path) + + return exported + + +def _estimate_artifact_score(analysis: Dict[str, object]) -> float: + quiet_rms = max(0.0, float(analysis.get("quiet_rms_ratio", 1.0) or 1.0) - 1.0) + quiet_hf = max(0.0, float(analysis.get("quiet_hf_ratio", 1.0) or 1.0) - 1.0) + midquiet_hf = max(0.0, 
float(analysis.get("midquiet_hf_ratio", 1.0) or 1.0) - 1.0) + spike = max(0.0, float(analysis.get("transition_spike_ratio", 1.0) or 1.0) - 1.0) + active = max(0.0, float(analysis.get("active_rms_ratio", 1.0) or 1.0) - 1.5) + breaths = max(0.0, float(analysis.get("synthetic_breath_frames", 0) or 0) - 80.0) / 220.0 + corr_penalty = max(0.0, 0.85 - float(analysis.get("corr_active", 0.0) or 0.0)) + return float( + 0.22 * quiet_rms + + 0.28 * quiet_hf + + 0.08 * midquiet_hf + + 0.22 * spike + + 0.05 * active + + 0.10 * breaths + + 0.05 * corr_penalty + ) + + +def _aggregate_variant_metrics(analyses: Sequence[Dict[str, object]]) -> Dict[str, object]: + if not analyses: + return {"clips": 0} + + keys = [ + "active_rms_ratio", + "quiet_rms_ratio", + "quiet_hf_ratio", + "midquiet_hf_ratio", + "transition_spike_ratio", + "corr_active", + "synthetic_breath_frames", + ] + aggregate: Dict[str, object] = {"clips": len(analyses)} + for key in keys: + values = [float(item.get(key, 0.0) or 0.0) for item in analyses] + aggregate[f"avg_{key}"] = float(sum(values) / max(len(values), 1)) + aggregate["avg_artifact_score_estimate"] = float( + sum(_estimate_artifact_score(item) for item in analyses) / max(len(analyses), 1) + ) + return aggregate + + +def _preprocess_mode_from_payload(payload: Dict[str, object]) -> str: + entries = payload.get("stages", []) + if not isinstance(entries, list): + return "direct" + vc_input = _get_stage_entry(entries, "vc_input") + if not vc_input: + return "direct" + mode = vc_input.get("preprocess_mode") + if isinstance(mode, str) and mode.strip(): + return mode.strip() + extra = vc_input.get("extra", {}) + if isinstance(extra, dict): + mode = extra.get("effective_preprocess_mode") + if isinstance(mode, str) and mode.strip(): + return mode.strip() + return "direct" + + +def _run_upstream_bundle( + source_bundle: Path, + output_bundle: Path, + model_path: Path, + index_path: Optional[Path], + args: argparse.Namespace, + index_rate: Optional[float] = None, +) 
-> None: + convert_vocals_official_upstream( + vocals_path=str(source_bundle), + output_path=str(output_bundle), + model_path=str(model_path), + index_path=str(index_path) if index_path else None, + f0_method=args.f0_method, + pitch_shift=args.pitch_shift, + index_rate=float(args.index_rate if index_rate is None else index_rate), + filter_radius=args.filter_radius, + rms_mix_rate=args.rms_mix_rate, + protect=args.protect, + speaker_id=args.speaker_id, + ) + + +def _run_variant( + variant: str, + output_dir: Path, + manifest: Sequence[Dict[str, float]], + source_bundle: Path, + lead_bundle: Optional[Path], + session_dir: Path, + model_path: Path, + index_path: Optional[Path], + args: argparse.Namespace, + pipeline: CoverPipeline, +) -> Dict[str, object]: + variant_dir = output_dir / variant + variant_dir.mkdir(parents=True, exist_ok=True) + bundle_path = variant_dir / "bundle.wav" + result: Dict[str, object] = {"name": variant, "bundle_path": str(bundle_path)} + clip_times = [float(item["source_time_sec"]) for item in manifest] + + if variant == "existing_raw": + existing_path = session_dir / "debug_converted_raw.wav" + if not existing_path.exists(): + return {"name": variant, "skipped": "missing debug_converted_raw.wav"} + _extract_bundle(existing_path, bundle_path, clip_times, args.window_sec, args.gap_sec) + elif variant == "existing_final": + existing_path = session_dir / "converted_vocals.wav" + if not existing_path.exists(): + return {"name": variant, "skipped": "missing converted_vocals.wav"} + _extract_bundle(existing_path, bundle_path, clip_times, args.window_sec, args.gap_sec) + elif variant == "upstream_raw": + _run_upstream_bundle(source_bundle, bundle_path, model_path, index_path, args) + elif variant == "upstream_index0": + _run_upstream_bundle(source_bundle, bundle_path, model_path, index_path, args, index_rate=0.0) + elif variant == "upstream_post": + raw_bundle_path = variant_dir / "bundle_raw.wav" + _run_upstream_bundle(source_bundle, 
raw_bundle_path, model_path, index_path, args) + should_apply = pipeline._should_apply_source_constraint( + vc_preprocessed=True, + source_constraint_mode=args.source_constraint_mode, + ) + if should_apply: + pipeline._constrain_converted_to_source( + source_vocals_path=str(source_bundle), + converted_vocals_path=str(raw_bundle_path), + original_vocals_path=str(lead_bundle) if lead_bundle else None, + output_path=str(bundle_path), + ) + pipeline._refine_source_constrained_output( + source_vocals_path=str(source_bundle), + converted_vocals_path=str(bundle_path), + source_constraint_mode=args.source_constraint_mode, + f0_method=args.f0_method, + ) + else: + bundle_path.write_bytes(raw_bundle_path.read_bytes()) + result["note"] = ( + "postprocess skipped because current preprocess/source-constraint " + "combination would not apply it" + ) + elif variant == "official_repair": + saved_argv = sys.argv[:] + try: + sys.argv = [sys.argv[0]] + convert_vocals_official( + vocals_path=str(source_bundle), + output_path=str(bundle_path), + model_path=str(model_path), + index_path=str(index_path) if index_path else None, + f0_method=args.f0_method, + pitch_shift=args.pitch_shift, + index_rate=float(args.index_rate), + filter_radius=args.filter_radius, + rms_mix_rate=args.rms_mix_rate, + protect=args.protect, + speaker_id=args.speaker_id, + repair_profile=True, + ) + finally: + sys.argv = saved_argv + else: + return {"name": variant, "skipped": f"unknown variant: {variant}"} + + clips_dir = variant_dir / "clips" + result["clips"] = _split_bundle(bundle_path, manifest, clips_dir, "candidate") + return result + + +def main() -> int: + args = parse_args() + session_dir = Path(args.session_dir) + if not session_dir.is_absolute(): + session_dir = (REPO_ROOT / session_dir).resolve() + if not session_dir.exists(): + raise FileNotFoundError(f"Session dir not found: {session_dir}") + + output_dir = Path(args.output_dir) if args.output_dir else session_dir / "diagnostic_matrix" + if not 
output_dir.is_absolute(): + output_dir = (REPO_ROOT / output_dir).resolve() + output_dir.mkdir(parents=True, exist_ok=True) + + if args.times: + times_sec = _dedupe_times(float(item.strip()) for item in args.times.split(",") if item.strip()) + payload = _load_quality_debug(session_dir) + else: + times_sec, payload = _collect_suspect_times(session_dir, args.stage, args.max_clips) + + if not times_sec: + raise ValueError( + "No suspect times found. Pass --times explicitly, for example " + '--times "22.48,23.04,24.17".' + ) + + inferred_model_key = None + if args.model_path: + model_path = Path(args.model_path) + if not model_path.is_absolute(): + model_path = (REPO_ROOT / model_path).resolve() + else: + model_path, inferred_index_path, inferred_model_key = _infer_model_from_session(session_dir) + if not args.index_path and inferred_index_path is not None: + args.index_path = str(inferred_index_path) + + if not model_path.exists(): + raise FileNotFoundError(f"Model path not found: {model_path}") + + index_path = _resolve_index_for_model(model_path, args.index_path) + preprocess_mode = _preprocess_mode_from_payload(payload) + + source_full_path = session_dir / "vocals_for_vc.wav" + if not source_full_path.exists(): + raise FileNotFoundError(f"Missing source vocals for VC: {source_full_path}") + + lead_full_path = session_dir / "karaoke" / "lead_vocals.wav" + if not lead_full_path.exists(): + fallback_lead = session_dir / "lead_vocals.wav" + lead_full_path = fallback_lead if fallback_lead.exists() else source_full_path + + reference_dir = output_dir / "reference" + source_bundle = reference_dir / "vocals_for_vc_bundle.wav" + lead_bundle = reference_dir / "lead_vocals_bundle.wav" + manifest = _extract_bundle(source_full_path, source_bundle, times_sec, args.window_sec, args.gap_sec) + _extract_bundle(lead_full_path, lead_bundle, times_sec, args.window_sec, args.gap_sec) + reference_clips = _split_bundle(source_bundle, manifest, reference_dir / "clips", "reference") + + 
pipeline = CoverPipeline(device="cuda") + pipeline._last_vc_preprocess_mode = preprocess_mode + + requested_variants = [item.strip() for item in args.variants.split(",") if item.strip()] + variant_results: Dict[str, Dict[str, object]] = {} + per_clip_results: List[Dict[str, object]] = [] + aggregate_results: Dict[str, Dict[str, object]] = {} + + print(f"Session: {session_dir}") + print(f"Output: {output_dir}") + print(f"Model: {model_path}") + print(f"Index: {index_path if index_path else '(none)'}") + print(f"Preprocess mode from session: {preprocess_mode}") + print("Suspect times:", ", ".join(f"{time_sec:.3f}s" for time_sec in times_sec)) + + for variant in requested_variants: + print(f"[diagnose] running {variant} ...") + variant_result = _run_variant( + variant=variant, + output_dir=output_dir, + manifest=manifest, + source_bundle=source_bundle, + lead_bundle=lead_bundle, + session_dir=session_dir, + model_path=model_path, + index_path=index_path, + args=args, + pipeline=pipeline, + ) + variant_results[variant] = variant_result + + for segment in manifest: + clip_key = f"{float(segment['source_time_sec']):07.3f}s" + clip_summary: Dict[str, object] = { + "time_sec": float(segment["source_time_sec"]), + "reference_clip": reference_clips.get(clip_key), + "variants": {}, + } + for variant in requested_variants: + variant_result = variant_results.get(variant, {}) + variant_clips = variant_result.get("clips", {}) if isinstance(variant_result, dict) else {} + if not isinstance(variant_clips, dict): + continue + candidate_path = variant_clips.get(clip_key) + reference_path = reference_clips.get(clip_key) + if not candidate_path or not reference_path: + continue + analysis = CoverPipeline._analyze_quality_stage(candidate_path=candidate_path, reference_path=reference_path) + analysis["artifact_score_estimate"] = _estimate_artifact_score(analysis) + clip_summary["variants"][variant] = { + "path": candidate_path, + "analysis": analysis, + } + ranking = [] + variants_dict = 
clip_summary.get("variants", {}) + if isinstance(variants_dict, dict): + for variant_name, data in variants_dict.items(): + if not isinstance(data, dict): + continue + analysis = data.get("analysis", {}) + if not isinstance(analysis, dict): + continue + ranking.append( + { + "variant": variant_name, + "artifact_score_estimate": float(analysis.get("artifact_score_estimate", 0.0) or 0.0), + } + ) + ranking.sort(key=lambda item: item["artifact_score_estimate"]) + clip_summary["ranking"] = ranking + per_clip_results.append(clip_summary) + + for variant in requested_variants: + analyses = [] + for clip_summary in per_clip_results: + variants_dict = clip_summary.get("variants", {}) + if not isinstance(variants_dict, dict): + continue + data = variants_dict.get(variant) + if not isinstance(data, dict): + continue + analysis = data.get("analysis") + if isinstance(analysis, dict): + analyses.append(analysis) + aggregate = _aggregate_variant_metrics(analyses) + variant_result = variant_results.get(variant, {}) + if isinstance(variant_result, dict) and "skipped" in variant_result: + aggregate["skipped"] = variant_result["skipped"] + aggregate_results[variant] = aggregate + + ranking = [ + { + "variant": variant, + "avg_artifact_score_estimate": float(data.get("avg_artifact_score_estimate", 9999.0) or 9999.0), + "clips": int(data.get("clips", 0) or 0), + } + for variant, data in aggregate_results.items() + if not data.get("skipped") + ] + ranking.sort(key=lambda item: item["avg_artifact_score_estimate"]) + + summary = { + "session_dir": str(session_dir), + "output_dir": str(output_dir), + "model_path": str(model_path), + "index_path": str(index_path) if index_path else None, + "model_inferred_from_session": inferred_model_key, + "preprocess_mode": preprocess_mode, + "source_constraint_mode": args.source_constraint_mode, + "times_sec": [float(time_sec) for time_sec in times_sec], + "window_sec": float(args.window_sec), + "gap_sec": float(args.gap_sec), + "manifest": manifest, + 
"variants": variant_results, + "per_clip": per_clip_results, + "aggregate": aggregate_results, + "ranking": ranking, + "score_note": ( + "artifact_score_estimate is a heuristic based on quiet-region energy, " + "high-frequency excess, transition spikes, and synthetic breath count. Lower is better." + ), + } + + summary_path = output_dir / "summary.json" + with open(summary_path, "w", encoding="utf-8") as f: + json.dump(summary, f, ensure_ascii=False, indent=2) + + print("\nAggregate ranking (lower artifact score is better):") + if ranking: + for item in ranking: + print( + f" {item['variant']}: score={item['avg_artifact_score_estimate']:.4f}, " + f"clips={item['clips']}" + ) + else: + print(" No completed variants.") + print(f"\nSummary written to: {summary_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/evaluate_karaoke_models.py b/tools/evaluate_karaoke_models.py new file mode 100644 index 0000000000000000000000000000000000000000..3f3dd02402eba7d218d178f193e26afbbe4c2734 --- /dev/null +++ b/tools/evaluate_karaoke_models.py @@ -0,0 +1,325 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""Evaluate local karaoke lead/backing split candidates with reproducible metrics.""" +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Dict, Iterable, List + +import numpy as np +import soundfile as sf + +REPO_ROOT = Path(__file__).resolve().parent.parent +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from infer.separator import KARAOKE_DEFAULT_MODEL, KaraokeSeparator +from lib.audio_metrics import evaluate_reference_stems + + +KARAOKE_MIN_LENGTH_COVERAGE = 0.999 + + +def _load_mono(path: Path) -> tuple[np.ndarray, int]: + audio, sr = sf.read(str(path), always_2d=True) + mono = np.asarray(audio, dtype=np.float32).mean(axis=1) + return mono.astype(np.float32), int(sr) + + +def _rms(audio: np.ndarray) -> float: + audio = 
def _abs_corr(a: np.ndarray, b: np.ndarray) -> float:
    """Return |Pearson r| between two signals, or 0.0 when it is undefined.

    Both inputs are flattened to float32 and truncated to the shorter length.
    The result is 0.0 when fewer than 8 samples overlap, when either
    truncated signal has a standard deviation of at most 1e-8, or when the
    computed coefficient is not finite.
    """
    first = np.asarray(a, dtype=np.float32).reshape(-1)
    second = np.asarray(b, dtype=np.float32).reshape(-1)
    overlap = min(first.size, second.size)
    if overlap < 8:
        return 0.0

    first, second = first[:overlap], second[:overlap]
    # A (near-)constant signal has no meaningful correlation.
    if any(float(np.std(signal)) <= 1e-8 for signal in (first, second)):
        return 0.0

    coefficient = np.corrcoef(first, second)[0, 1]
    return float(abs(coefficient)) if np.isfinite(coefficient) else 0.0
def _unique(items: Iterable[str]) -> List[str]:
    """Return the truthy entries of *items*, de-duplicated in first-seen order.

    Empty strings (and any other falsy entries) are dropped. Insertion-ordered
    ``dict`` keys perform the de-duplication, so the work is linear in the
    number of items instead of re-scanning the result list for every element.
    Items must be hashable, which holds for the ``str`` values this is typed for.
    """
    return list(dict.fromkeys(item for item in items if item))
has_references and not (reference_lead_path and reference_backing_path): + raise ValueError("--reference-lead and --reference-backing must be provided together.") + + output_dir = Path(args.output_dir) + if not output_dir.is_absolute(): + output_dir = (REPO_ROOT / output_dir).resolve() + output_dir.mkdir(parents=True, exist_ok=True) + + models = _unique(args.models or [KARAOKE_DEFAULT_MODEL]) + results = [] + for model_name in models: + candidate_dir = output_dir / Path(model_name).stem + candidate_dir.mkdir(parents=True, exist_ok=True) + separator = KaraokeSeparator(model_filename=model_name, device=args.device) + try: + lead_path, backing_path = separator.separate(str(vocals_path), str(candidate_dir)) + metrics = score_karaoke_stems(vocals_path, Path(lead_path), Path(backing_path)) + reference_metrics = None + if reference_lead_path and reference_backing_path: + reference_metrics = score_reference_stems( + reference_lead_path=reference_lead_path, + reference_backing_path=reference_backing_path, + lead_path=Path(lead_path), + backing_path=Path(backing_path), + ) + results.append( + { + "model": model_name, + "lead_path": str(lead_path), + "backing_path": str(backing_path), + "metrics": metrics, + "reference_metrics": reference_metrics, + } + ) + if reference_metrics: + print( + f"{model_name}: mean_si_sdr={reference_metrics['mean_si_sdr']:.3f}, " + f"diagnostic_score={metrics['score']:.3f}" + ) + else: + print(f"{model_name}: score={metrics['score']:.3f}, backing_ratio={metrics['backing_ratio']:.3f}") + finally: + separator.unload_model() + + results.sort(key=_result_sort_key, reverse=True) + summary = { + "vocals_path": str(vocals_path), + "output_dir": str(output_dir), + "reference_lead_path": str(reference_lead_path) if reference_lead_path else None, + "reference_backing_path": str(reference_backing_path) if reference_backing_path else None, + "ranking": [ + { + "rank": index + 1, + "model": item["model"], + **item["metrics"], + **( + { + 
"reference_mean_si_sdr": item["reference_metrics"]["mean_si_sdr"], + "reference_mean_sdr": item["reference_metrics"]["mean_sdr"], + } + if item.get("reference_metrics") + else {} + ), + } + for index, item in enumerate(results) + ], + "results": results, + "score_note": ( + "SI-SDR/SDR are only computed when reference stems are provided. " + "Without references, score is a diagnostic proxy for reconstruction, " + "decorrelation, plausible backing level, lead/input coherence, and full-length coverage." + ), + } + summary_path = output_dir / "karaoke_model_report.json" + summary_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") + + markdown_lines = [ + "# Karaoke Model Report", + "", + f"- vocals: `{vocals_path}`", + f"- output: `{output_dir}`", + "", + ] + if has_references: + markdown_lines.extend( + [ + f"- reference lead: `{reference_lead_path}`", + f"- reference backing: `{reference_backing_path}`", + "", + "| rank | model | mean_si_sdr | mean_sdr | diag_score | len | recon_err | corr |", + "|---:|---|---:|---:|---:|---:|---:|---:|", + ] + ) + else: + markdown_lines.extend( + [ + "| rank | model | score | len | recon_err | corr | lead_in_corr | lead_ratio | backing_ratio |", + "|---:|---|---:|---:|---:|---:|---:|---:|---:|", + ] + ) + for index, item in enumerate(results, start=1): + metrics = item["metrics"] + reference_metrics = item.get("reference_metrics") + if reference_metrics: + markdown_lines.append( + f"| {index} | `{item['model']}` | {reference_metrics['mean_si_sdr']:.3f} | " + f"{reference_metrics['mean_sdr']:.3f} | {metrics['score']:.3f} | " + f"{metrics['length_coverage']:.4f} | {metrics['reconstruction_error']:.4f} | " + f"{metrics['lead_backing_abs_corr']:.4f} |" + ) + else: + markdown_lines.append( + f"| {index} | `{item['model']}` | {metrics['score']:.3f} | " + f"{metrics['length_coverage']:.4f} | " + f"{metrics['reconstruction_error']:.4f} | {metrics['lead_backing_abs_corr']:.4f} | " + 
f"{metrics['lead_input_abs_corr']:.4f} | " + f"{metrics['lead_ratio']:.3f} | {metrics['backing_ratio']:.3f} |" + ) + markdown_path = output_dir / "karaoke_model_report.md" + markdown_path.write_text("\n".join(markdown_lines) + "\n", encoding="utf-8") + + print(f"Summary written to: {summary_path}") + print(f"Report written to: {markdown_path}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tools/run_mode_matrix.py b/tools/run_mode_matrix.py new file mode 100644 index 0000000000000000000000000000000000000000..cccacde4453693a3ca69d82a48c597c178b8de4c --- /dev/null +++ b/tools/run_mode_matrix.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +"""CLI wrapper for the real processing-mode matrix runner.""" + +from __future__ import annotations + +import sys +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[1] +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) + +from tests.run_mode_matrix import main + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/ui/__init__.py b/ui/__init__.py index 85909bb7144fd37c1332d2d50eb497cfcddb1b7f..fca8e3bd4f9cb9024f330f84bfa6cf05842bcd82 100644 --- a/ui/__init__.py +++ b/ui/__init__.py @@ -1,7 +1,7 @@ -# -*- coding: utf-8 -*- -""" -UI 模块 -""" -from .app import create_ui, launch - -__all__ = ["create_ui", "launch"] +# -*- coding: utf-8 -*- +""" +UI 模块 +""" +from .app import create_ui, launch + +__all__ = ["create_ui", "launch"] diff --git a/ui/app.py b/ui/app.py index f9936955640dbbbcaf1fc29dbedd9fb34d3c9130..920e09ba72248e0cc4862388bb9bf9d66545f613 100644 --- a/ui/app.py +++ b/ui/app.py @@ -8,28 +8,38 @@ import re import tempfile import gradio as gr from pathlib import Path -from typing import Optional, Tuple, Dict +from typing import Optional, Tuple, Dict, List from lib.logger import log +from lib.runtime_build import get_runtime_build_label +from infer.separator import ROFORMER_DEFAULT_MODEL, KARAOKE_DEFAULT_MODEL # 项目根目录 ROOT_DIR = 
Path(__file__).parent.parent +CONFIG_PATH = ROOT_DIR / "configs" / "config.json" +DEFAULT_LANGUAGE = "zh_CN" +SUPPORTED_LANGUAGES = { + "zh_CN": "中文", + "en_US": "English", +} +LANGUAGE_LABEL_TO_CODE = {label: code for code, label in SUPPORTED_LANGUAGES.items()} # 加载语言包 def load_i18n(lang: str = "zh_CN") -> dict: """加载语言包""" + if lang not in SUPPORTED_LANGUAGES: + raise ValueError(f"Unsupported language: {lang}") i18n_path = ROOT_DIR / "i18n" / f"{lang}.json" - if i18n_path.exists(): - with open(i18n_path, "r", encoding="utf-8") as f: - return json.load(f) - return {} + if not i18n_path.exists(): + raise FileNotFoundError(f"Language pack not found: {i18n_path}") + with open(i18n_path, "r", encoding="utf-8") as f: + return json.load(f) # 加载配置 def load_config() -> dict: """加载配置""" - config_path = ROOT_DIR / "configs" / "config.json" - if config_path.exists(): - with open(config_path, "r", encoding="utf-8") as f: + if CONFIG_PATH.exists(): + with open(CONFIG_PATH, "r", encoding="utf-8") as f: return json.load(f) return {} @@ -55,9 +65,8 @@ def normalize_config(config: dict) -> dict: return config -# 全局变量 -i18n = load_i18n() config = normalize_config(load_config()) +i18n = load_i18n(str(config.get("language", DEFAULT_LANGUAGE)).strip() or DEFAULT_LANGUAGE) pipeline = None @@ -68,6 +77,56 @@ def t(key: str, section: str = None) -> str: return i18n.get(key, key) +def get_configured_language(config_data: Optional[dict] = None) -> str: + """Return the configured UI language, failing on unsupported values.""" + selected_config = config if config_data is None else config_data + language = str(selected_config.get("language", DEFAULT_LANGUAGE)).strip() or DEFAULT_LANGUAGE + if language not in SUPPORTED_LANGUAGES: + raise ValueError(f"Unsupported language: {language}") + return language + + +def resolve_language_choice(language_choice: str) -> str: + """Normalize a dropdown label or language code to a supported locale code.""" + choice = str(language_choice or "").strip() + if 
def save_language_setting(language_choice: str, config_path: Optional[Path] = None) -> str:
    """Persist the UI language selection and return a confirmation message.

    Static Gradio labels only pick up the new language after a restart; the
    returned message is taken from the newly selected language pack.

    Args:
        language_choice: A supported locale code or its dropdown label.
        config_path: Config file to update. Defaults to ``CONFIG_PATH``; the
            in-memory ``config`` / ``i18n`` globals are refreshed only when
            the default file is the one being written.

    Returns:
        The ``settings.language_saved_restart`` message of the selected
        language pack, formatted with the language's display label.

    Raises:
        ValueError: If ``language_choice`` is not a supported language.
        FileNotFoundError: If the language pack for the selection is missing.
        KeyError: If the language pack lacks the confirmation message.
    """
    global config, i18n

    language = resolve_language_choice(language_choice)
    target_path = Path(config_path) if config_path is not None else CONFIG_PATH

    # Load the pack and build the confirmation message BEFORE touching the
    # config file: if the pack is missing or incomplete, we must not persist a
    # language that the module-level load_i18n() call would fail on next start.
    language_pack = load_i18n(language)
    message = language_pack["settings"]["language_saved_restart"].format(
        language=SUPPORTED_LANGUAGES[language]
    )

    if target_path.exists():
        with open(target_path, "r", encoding="utf-8") as f:
            next_config = normalize_config(json.load(f))
    else:
        next_config = {}

    next_config["language"] = language
    # The parent directory may not exist yet when the config file is absent.
    target_path.parent.mkdir(parents=True, exist_ok=True)
    with open(target_path, "w", encoding="utf-8") as f:
        json.dump(next_config, f, indent=4, ensure_ascii=False)

    if target_path == CONFIG_PATH:
        config = next_config
        # Reuse the pack loaded above instead of reading the file a second time.
        i18n = language_pack

    return message
label_to_value = { - t("vc_pipeline_mode_current", "cover"): "current", - t("vc_pipeline_mode_official", "cover"): "official", +def get_vc_pipeline_mode_option_maps() -> Tuple[Dict[str, str], Dict[str, str]]: + """Build VC pipeline mode dropdown option maps.""" + label_to_value = { + t("vc_pipeline_mode_current", "cover"): "current", + t("vc_pipeline_mode_official", "cover"): "official", } - value_to_label = {value: label for label, value in label_to_value.items()} - return label_to_value, value_to_label - - -def update_singing_repair_visibility(vc_pipeline_mode: str): - """Only show singing repair option for official mode.""" - pipeline_label_to_value, _ = get_vc_pipeline_mode_option_maps() - normalized = pipeline_label_to_value.get( - str(vc_pipeline_mode), - str(vc_pipeline_mode or "").strip().lower(), - ) - return gr.update(visible=(normalized == "official")) - - -def init_pipeline(): - """初始化推理管道""" - global pipeline + value_to_label = {value: label for label, value in label_to_value.items()} + return label_to_value, value_to_label + + +def update_singing_repair_visibility(vc_pipeline_mode: str): + """Only show singing repair option for official mode.""" + pipeline_label_to_value, _ = get_vc_pipeline_mode_option_maps() + normalized = pipeline_label_to_value.get( + str(vc_pipeline_mode), + str(vc_pipeline_mode or "").strip().lower(), + ) + return gr.update(visible=(normalized == "official")) + + +def init_pipeline(): + """初始化推理管道""" + global pipeline if pipeline is not None: return pipeline @@ -238,12 +295,31 @@ def get_available_character_series() -> list: def format_character_label(char_info: dict) -> str: - """格式化角色展示名称:【语言】角色名(中/英/日) · 出处 · 内部名""" - display = char_info.get("display") or char_info.get("description") or char_info.get("name", "") + """格式化角色展示名称,明确显示版本、归属和来源。""" + display = char_info.get("base_display") or char_info.get("display") or char_info.get("description") or char_info.get("name", "") source = char_info.get("source", "未知") name = 
def format_character_details(char_info: Optional[dict], downloaded: bool = False) -> str:
    """Render a character model's metadata as a Markdown bullet list.

    Without ``char_info`` a placeholder hint is returned (its wording depends
    on ``downloaded``). Otherwise the output starts with a bold title taken
    from ``base_display``, ``display`` or ``name`` (first truthy one), followed
    by one bullet per non-empty metadata field and finally the internal key.
    Local weight/index paths are listed only when ``downloaded`` is true.
    """
    if not char_info:
        return (
            "选择已下载角色后,这里会显示该模型的版本归属、来源仓库和本地文件路径。"
            if downloaded
            else "选择待下载角色后,这里会显示该模型的版本归属、来源仓库和下载来源。"
        )

    def text_of(key: str) -> str:
        # Missing keys, None and other falsy values all collapse to "".
        return str(char_info.get(key) or "").strip()

    heading = char_info.get("base_display") or char_info.get("display") or char_info.get("name", "")
    rows = [f"**{heading}**"]

    # (metadata key, bullet label) in display order; values are back-quoted.
    quoted_fields = (
        ("version_label", "版本标识"),
        ("continuity", "角色归属"),
        ("role", "模型类型"),
        ("source", "作品来源"),
        ("distribution", "分发方式"),
        ("repo", "来源仓库"),
    )
    for key, label in quoted_fields:
        value = text_of(key)
        if value:
            rows.append(f"- {label}:`{value}`")

    page_url = text_of("source_page_url")
    file_url = text_of("download_url")
    if page_url:
        rows.append(f"- 来源页面:{page_url}")
    # Skip the download link when it merely repeats the source page.
    if file_url and file_url != page_url:
        rows.append(f"- 下载链接:{file_url}")

    if downloaded:
        for key, label in (("model_path", "本地权重"), ("index_path", "本地索引")):
            if char_info.get(key):
                rows.append(f"- {label}:`{char_info[key]}`")

    rows.append(f"- 内部键:`{char_info.get('name', '')}`")
    return "\n".join(rows)
get_available_character_choices(series: str = "全部", keyword: str = "") - if kw in c.get("name", "").lower() or kw in c.get("display", "").lower() or kw in c.get("source", "").lower() + or kw in str(c.get("repo", "")).lower() + or kw in str(c.get("continuity", "")).lower() + or kw in str(c.get("version_label", "")).lower() + or kw in str(c.get("distribution", "")).lower() ] return [(format_character_label(c), c["name"]) for c in chars] @@ -400,19 +547,19 @@ def refresh_downloaded_controls(series: str, keyword: str) -> Tuple[Dict, Dict]: return _refresh_downloaded_updates(series, keyword) -def process_cover( - audio_path: str, - character_name: str, +def process_cover( + audio_path: str, + character_name: str, pitch_shift: int, index_ratio: float, speaker_id: float, karaoke_separation: bool, karaoke_merge_backing_into_accompaniment: bool, - vc_preprocess_mode: str, - source_constraint_mode: str, - vc_pipeline_mode: str, - singing_repair: bool, - vocals_volume: float, + vc_preprocess_mode: str, + source_constraint_mode: str, + vc_pipeline_mode: str, + singing_repair: bool, + vocals_volume: float, accompaniment_volume: float, reverb_amount: float, rms_mix_rate: float, @@ -433,11 +580,12 @@ def process_cover( return *_none6, "请选择角色" try: - from tools.character_models import get_character_model_path + from tools.character_models import get_character_model_path, get_character_info from infer.cover_pipeline import get_cover_pipeline # 获取角色模型路径 resolved_name = resolve_character_name(character_name) + char_meta = get_character_info(resolved_name, downloaded_only=True) or {} model_info = get_character_model_path(resolved_name) if model_info is None: return *_none6, f"角色模型不存在: {resolved_name}" @@ -457,6 +605,7 @@ def process_cover( demucs_overlap = float(cover_cfg.get("demucs_overlap", 0.25)) demucs_split = bool(cover_cfg.get("demucs_split", True)) separator = cover_cfg.get("separator", "roformer") + roformer_model = cover_cfg.get("roformer_model", ROFORMER_DEFAULT_MODEL) 
uvr5_model = cover_cfg.get("uvr5_model") uvr5_agg = int(cover_cfg.get("uvr5_agg", 10)) uvr5_format = cover_cfg.get("uvr5_format", "wav") @@ -469,25 +618,28 @@ def process_cover( silence_smoothing_ms = cover_cfg.get("silence_smoothing_ms", 50.0) silence_min_duration_ms = cover_cfg.get("silence_min_duration_ms", 200.0) hubert_layer = cover_cfg.get("hubert_layer", config.get("hubert_layer", 12)) - karaoke_model = cover_cfg.get("karaoke_model", "mel_band_roformer_karaoke_gabox.ckpt") - default_vc_preprocess_mode = str(cover_cfg.get("vc_preprocess_mode", "auto")) - default_source_constraint_mode = str(cover_cfg.get("source_constraint_mode", "auto")) - default_vc_pipeline_mode = str(cover_cfg.get("vc_pipeline_mode", "current")) - default_singing_repair = bool(cover_cfg.get("singing_repair", False)) - vc_label_to_value, vc_value_to_label = get_vc_preprocess_option_maps() - source_label_to_value, source_value_to_label = get_source_constraint_option_maps() - pipeline_label_to_value, pipeline_value_to_label = get_vc_pipeline_mode_option_maps() + karaoke_model = cover_cfg.get( + "karaoke_model", + KARAOKE_DEFAULT_MODEL, + ) + default_vc_preprocess_mode = str(cover_cfg.get("vc_preprocess_mode", "auto")) + default_source_constraint_mode = str(cover_cfg.get("source_constraint_mode", "auto")) + default_vc_pipeline_mode = str(cover_cfg.get("vc_pipeline_mode", "current")) + default_singing_repair = bool(cover_cfg.get("singing_repair", False)) + vc_label_to_value, vc_value_to_label = get_vc_preprocess_option_maps() + source_label_to_value, source_value_to_label = get_source_constraint_option_maps() + pipeline_label_to_value, pipeline_value_to_label = get_vc_pipeline_mode_option_maps() vc_preprocess_mode = vc_label_to_value.get(str(vc_preprocess_mode), str(vc_preprocess_mode or default_vc_preprocess_mode).strip().lower()) - if vc_preprocess_mode not in {"auto", "direct", "uvr_deecho", "legacy"}: - vc_preprocess_mode = default_vc_preprocess_mode + if vc_preprocess_mode not in {"auto", 
"uvr_deecho"}: + vc_preprocess_mode = "auto" source_constraint_mode = source_label_to_value.get(str(source_constraint_mode), str(source_constraint_mode or default_source_constraint_mode).strip().lower()) if source_constraint_mode not in {"auto", "off", "on"}: source_constraint_mode = default_source_constraint_mode - vc_pipeline_mode = pipeline_label_to_value.get(str(vc_pipeline_mode), str(vc_pipeline_mode or default_vc_pipeline_mode).strip().lower()) - if vc_pipeline_mode not in {"current", "official"}: - vc_pipeline_mode = default_vc_pipeline_mode - singing_repair = bool(singing_repair if singing_repair is not None else default_singing_repair) + vc_pipeline_mode = pipeline_label_to_value.get(str(vc_pipeline_mode), str(vc_pipeline_mode or default_vc_pipeline_mode).strip().lower()) + if vc_pipeline_mode not in {"current", "official"}: + vc_pipeline_mode = default_vc_pipeline_mode + singing_repair = bool(singing_repair if singing_repair is not None else default_singing_repair) index_ratio = max(0.0, min(1.0, float(index_ratio) / 100.0)) speaker_id = int(max(0, round(float(speaker_id)))) @@ -516,6 +668,7 @@ def process_cover( demucs_shifts=demucs_shifts, demucs_overlap=demucs_overlap, demucs_split=demucs_split, + roformer_model=roformer_model, separator=separator, uvr5_model=uvr5_model, uvr5_agg=uvr5_agg, @@ -533,20 +686,27 @@ def process_cover( karaoke_separation=bool(karaoke_separation), karaoke_model=karaoke_model, karaoke_merge_backing_into_accompaniment=bool(karaoke_merge_backing_into_accompaniment), - vc_preprocess_mode=vc_preprocess_mode, - source_constraint_mode=source_constraint_mode, - vc_pipeline_mode=vc_pipeline_mode, - singing_repair=singing_repair, - output_dir=str(output_dir), - model_display_name=resolved_name, - progress_callback=progress_callback + vc_preprocess_mode=vc_preprocess_mode, + source_constraint_mode=source_constraint_mode, + vc_pipeline_mode=vc_pipeline_mode, + singing_repair=singing_repair, + output_dir=str(output_dir), + 
model_display_name=resolved_name, + progress_callback=progress_callback ) - status_msg = "\u2705 \u7ffb\u5531\u5b8c\u6210!" - status_msg += f"\n{get_cover_vc_route_status(vc_preprocess_mode, vc_pipeline_mode).splitlines()[0]}" - status_msg += f"\nVC\u7ba1\u7ebf\u6a21\u5f0f: {pipeline_value_to_label.get(vc_pipeline_mode, vc_pipeline_mode)}" - status_msg += f"\n唱歌修复: {'开启' if singing_repair else '关闭'}" - status_msg += f"\n\u6e90\u7ea6\u675f\u7b56\u7565: {source_value_to_label.get(source_constraint_mode, source_constraint_mode)}" + status_msg = "\u2705 \u7ffb\u5531\u5b8c\u6210!" + status_msg += f"\n{get_cover_vc_route_status(vc_preprocess_mode, vc_pipeline_mode).splitlines()[0]}" + status_msg += f"\nVC\u7ba1\u7ebf\u6a21\u5f0f: {pipeline_value_to_label.get(vc_pipeline_mode, vc_pipeline_mode)}" + status_msg += f"\n唱歌修复: {'开启' if singing_repair else '关闭'}" + status_msg += f"\n\u6e90\u7ea6\u675f\u7b56\u7565: {source_value_to_label.get(source_constraint_mode, source_constraint_mode)}" + if char_meta.get("version_label"): + status_msg += f"\n模型版本: {char_meta['version_label']}" + if char_meta.get("continuity"): + status_msg += f"\n角色归属: {char_meta['continuity']}" + if char_meta.get("repo"): + status_msg += f"\n模型来源: {char_meta['repo']}" + status_msg += f"\n{get_runtime_build_label()}" if result.get("all_files_dir"): status_msg += f"\n\u5168\u90e8\u6587\u4ef6\u76ee\u5f55: {result['all_files_dir']}" @@ -569,22 +729,30 @@ def process_cover( def check_mature_deecho_status() -> str: """Check mature DeEcho model availability.""" - from tools.download_models import MATURE_DEECHO_MODELS, check_model, get_preferred_mature_deecho_model + from tools.download_models import MATURE_DEECHO_MODELS, check_model + from infer.separator import ROFORMER_DEREVERB_DEFAULT_MODEL, check_roformer_available status_lines = [] - preferred = get_preferred_mature_deecho_model() + roformer_ready = check_roformer_available() + icon = "✅" if roformer_ready else "❌" + status_lines.append( + f"{icon} 
{ROFORMER_DEREVERB_DEFAULT_MODEL} ← 当前自动模式优先使用" + ) + if roformer_ready: + status_lines.append(" RoFormer 模型由 audio-separator 首次运行时自动下载到 assets/separator_models") + for name in MATURE_DEECHO_MODELS: exists = check_model(name) icon = "✅" if exists else "❌" - suffix = " ← 当前自动模式优先使用" if preferred == name else "" + suffix = " ← 仅保留状态显示,严格SOTA自动模式不使用" status_lines.append(f"{icon} {name}{suffix}") - if preferred: + if roformer_ready: status_lines.append("") - status_lines.append(f"当前可用学习型 DeEcho: {preferred}") + status_lines.append(f"当前优先学习型 DeEcho: RoFormer {ROFORMER_DEREVERB_DEFAULT_MODEL}") else: status_lines.append("") - status_lines.append("当前未检测到学习型 DeEcho 模型;翻唱自动模式将回退为主唱直通 RVC") + status_lines.append("当前缺少严格SOTA DeEcho运行环境;翻唱自动模式将停止处理而不是降级") return "\n".join(status_lines) @@ -607,7 +775,7 @@ def get_cover_vc_route_status( vc_pipeline_mode: Optional[str] = None, ) -> str: """Return the active VC route shown in the cover UI.""" - from tools.download_models import get_preferred_mature_deecho_model + from infer.separator import ROFORMER_DEREVERB_DEFAULT_MODEL, check_roformer_available mode = str(vc_preprocess_mode or config.get("cover", {}).get("vc_preprocess_mode", "auto")).strip().lower() pipeline_mode = str(vc_pipeline_mode or config.get("cover", {}).get("vc_pipeline_mode", "current")).strip().lower() @@ -615,51 +783,46 @@ def get_cover_vc_route_status( pipeline_label_to_value, _ = get_vc_pipeline_mode_option_maps() mode = vc_label_to_value.get(mode, mode) pipeline_mode = pipeline_label_to_value.get(pipeline_mode, pipeline_mode) - preferred = get_preferred_mature_deecho_model() + roformer_ready = check_roformer_available() + preferred = f"RoFormer {ROFORMER_DEREVERB_DEFAULT_MODEL}" if roformer_ready else None newline = chr(10) + build_label = get_runtime_build_label() if pipeline_mode == "official": return newline.join([ "当前使用内置官方 RVC 实现", "流程:主唱分离 → 官方音频加载 / 官方 VC → 混音", "说明:跳过本项目自定义 VC 预处理、源约束与静音门限后处理", + build_label, ]) - if mode == "direct": - return 
newline.join([ - "ℹ️ 当前固定为主唱直通 RVC", - "流程: 主唱分离 → 直接进入 RVC → 混音", - "说明: 不使用学习型 DeEcho,也不走旧版手工链", - ]) - if mode == "legacy": - return newline.join([ - "⚠️ 当前固定为旧版手工链", - "流程: 主唱分离 → 手工去回声链 → RVC → 混音", - "说明: 仅用于对比,不是默认推荐路径", - ]) if mode == "uvr_deecho": if preferred: return newline.join([ - "✅ 当前固定优先使用学习型 DeEcho / DeReverb", + "✅ 当前固定使用严格 SOTA RoFormer De-Reverb", f"当前命中模型: {preferred}", - "流程: 主唱分离 → UVR DeEcho/DeReverb → RVC → 混音", + "流程: 主唱分离 → RoFormer De-Reverb → RVC → 混音", + build_label, ]) return newline.join([ - "⚠️ 当前设为官方 DeEcho 优先,但本地缺少模型", - "当前将回退流程: 主唱分离 → 直接进入 RVC → 混音", - "建议: 先在模型管理页下载成熟 DeEcho 模型", + "⚠️ 当前设为严格 SOTA RoFormer De-Reverb,但运行环境不可用", + "流程会停止处理,不会降级到 UVR 或算法去混响", + "建议: 修复 audio-separator / RoFormer De-Reverb 运行环境", + build_label, ]) if preferred: return newline.join([ - "✅ 自动模式当前会优先使用学习型 DeEcho / DeReverb", + "✅ 自动模式当前使用严格 SOTA RoFormer De-Reverb", f"当前命中模型: {preferred}", - "流程: 主唱分离 → UVR DeEcho/DeReverb → RVC → 混音", + "流程: 主唱分离 → RoFormer De-Reverb → RVC → 混音", + build_label, ]) return newline.join([ - "ℹ️ 自动模式当前会回退为主唱直通 RVC", - "原因: 本地未检测到成熟 DeEcho / DeReverb 模型", - "流程: 主唱分离 → 直接进入 RVC → 混音", + "⚠️ 自动模式缺少严格SOTA DeEcho运行环境", + "原因: 未检测到 RoFormer De-Reverb 可运行条件", + "流程会停止处理,不会降级到 UVR 或算法去混响", + build_label, ]) @@ -716,6 +879,14 @@ CUSTOM_CSS = """ color: #e0e0e0 !important; } +.runtime-stamp { + text-align: center; + margin: -0.3rem 0 1rem 0; + color: #bdbdbd !important; + font-family: 'Consolas', 'Monaco', monospace; + font-size: 0.95rem; +} + /* 状态框样式 */ .status-box { font-family: 'Consolas', 'Monaco', monospace; @@ -1289,6 +1460,28 @@ def create_ui() -> gr.Blocks: gr.Markdown( f"
{i18n.get('app_description', '基于 RVC v2 的 AI 翻唱系统')}
" ) + gr.Markdown( + f"
{get_runtime_build_label()}
" + ) + + with gr.Row(elem_classes=["language-switch-row"]): + language_dropdown = gr.Dropdown( + label=t("language", "settings"), + choices=list(LANGUAGE_LABEL_TO_CODE.keys()), + value=get_current_language_label(), + interactive=True, + ) + save_language_btn = gr.Button( + f"💾 {t('save_language', 'settings')}", + variant="secondary", + ) + + language_status = gr.Markdown(t("language_restart_info", "settings")) + save_language_btn.click( + fn=save_language_setting, + inputs=[language_dropdown], + outputs=[language_status], + ) with gr.Tabs(): # ===== 模型管理标签页 ===== @@ -1355,17 +1548,21 @@ def create_ui() -> gr.Blocks: weights_dir = ROOT_DIR / config.get("weights_dir", "assets/weights") models = list_voice_models(str(weights_dir)) if not models: - return [["(无模型)", "", ""]] - return [[m["name"], m["model_path"], m.get("index_path", "无")] for m in models] + return [[t("no_models", "ui"), "", ""]] + return [[m["name"], m["model_path"], m.get("index_path", "")] for m in models] model_table = gr.Dataframe( - headers=["模型名称", "模型路径", "索引路径"], + headers=[ + t("model_name", "ui"), + t("model_path", "ui"), + t("index_path", "ui"), + ], value=get_model_table(), interactive=False ) refresh_table_btn = gr.Button( - f"🔄 刷新模型列表", + f"🔄 {t('refresh_models', 'ui')}", variant="secondary" ) @@ -1377,19 +1574,7 @@ def create_ui() -> gr.Blocks: # ===== 歌曲翻唱标签页 ===== with gr.Tab(t("cover", "tabs")): gr.Markdown(f"### 🎵 {t('song_cover', 'cover')}") - gr.Markdown( - """ - **一键 AI 翻唱**:上传歌曲 → 自动分离人声 → 转换音色 → 混合伴奏 → 输出翻唱 - - **使用步骤:** - 1. 先下载角色模型(展开下方「下载角色模型」) - 2. 上传歌曲文件(支持 MP3/WAV/FLAC) - 3. 选择已下载的角色 - 4. 
调整参数后点击「开始翻唱」 - - > ⚠️ 首次运行会自动下载 Mel-Band Roformer 人声分离模型(约 200MB),请耐心等待 - """ - ) + gr.Markdown(t("cover_usage", "ui")) with gr.Row(): # 左侧:输入和角色选择 @@ -1403,71 +1588,80 @@ def create_ui() -> gr.Blocks: gr.Markdown(f"#### 🎭 {t('select_character', 'cover')}") downloaded_series = gr.Dropdown( - label="作品/分类", + label=t("series_filter", "ui"), choices=get_downloaded_character_series(), value="全部", interactive=True ) downloaded_keyword = gr.Textbox( - label="关键词搜索", - placeholder="输入角色名/作品名", + label=t("keyword_search", "ui"), + placeholder=t("keyword_placeholder", "ui"), interactive=True ) character_dropdown = gr.Dropdown( - label="选择角色", + label=t("character", "cover"), choices=get_downloaded_character_choices("全部", ""), interactive=True, - info="括号中的信息为模型训练参数:epochs=训练轮数(越大通常越成熟),数字+k=训练采样率(如40k=40000Hz)" + info=t("character_choice_info", "ui") + ) + + character_details = gr.Markdown( + value=get_downloaded_character_details(None) ) with gr.Row(): refresh_char_btn = gr.Button( - "🔄 刷新", + f"🔄 {t('refresh', 'conversion')}", size="sm", variant="secondary" ) # 角色下载区域 - with gr.Accordion("下载角色模型", open=False): + with gr.Accordion(t("download_character", "cover"), open=False): series_choices = ["全部"] + get_available_character_series() download_series = gr.Dropdown( - label="作品/分类", + label=t("series_filter", "ui"), choices=series_choices, value="全部", interactive=True ) download_keyword = gr.Textbox( - label="关键词搜索", - placeholder="输入角色名/作品名", + label=t("keyword_search", "ui"), + placeholder=t("keyword_placeholder", "ui"), interactive=True ) download_char_dropdown = gr.Dropdown( - label="选择角色", + label=t("select_to_download", "cover"), choices=get_available_character_choices("全部", ""), - interactive=True + interactive=True, + info=t("download_character_info", "ui") + ) + + download_char_details = gr.Markdown( + value=get_available_character_details(None) ) download_char_btn = gr.Button( - "⬇️ 下载选中角色", + f"⬇️ {t('download_selected_character', 'ui')}", variant="primary" ) 
download_all_series_btn = gr.Button( - "⬇️ 下载该分类全部", + f"⬇️ {t('download_series_all', 'ui')}", variant="secondary" ) download_all_btn = gr.Button( - "⬇️ 下载全部角色模型", + f"⬇️ {t('download_all_characters', 'ui')}", variant="secondary" ) download_char_status = gr.Textbox( - label="下载状态", + label=t("download_status", "ui"), interactive=False ) @@ -1482,7 +1676,7 @@ def create_ui() -> gr.Blocks: maximum=12, value=0, step=1, - info="正数升调,负数降调" + info=t("positive_pitch_info", "ui") ) cover_index_rate = gr.Slider( @@ -1545,20 +1739,20 @@ def create_ui() -> gr.Blocks: value=source_value_to_label.get(str(cover_cfg.get("source_constraint_mode", "auto")), list(source_label_to_value.keys())[0]), info=t("source_constraint_mode_info", "cover"), ) - cover_vc_pipeline_mode = gr.Dropdown( - label=t("vc_pipeline_mode", "cover"), - choices=list(pipeline_label_to_value.keys()), - value=pipeline_value_to_label.get(str(cover_cfg.get("vc_pipeline_mode", "current")), list(pipeline_label_to_value.keys())[0]), - info=t("vc_pipeline_mode_info", "cover"), - ) - cover_singing_repair = gr.Checkbox( - label=t("singing_repair", "cover"), - value=bool(cover_cfg.get("singing_repair", False)), - info=t("singing_repair_info", "cover"), - visible=str(cover_cfg.get("vc_pipeline_mode", "current")).strip().lower() == "official", - ) - - cover_vc_route_status = gr.Textbox( + cover_vc_pipeline_mode = gr.Dropdown( + label=t("vc_pipeline_mode", "cover"), + choices=list(pipeline_label_to_value.keys()), + value=pipeline_value_to_label.get(str(cover_cfg.get("vc_pipeline_mode", "current")), list(pipeline_label_to_value.keys())[0]), + info=t("vc_pipeline_mode_info", "cover"), + ) + cover_singing_repair = gr.Checkbox( + label=t("singing_repair", "cover"), + value=bool(cover_cfg.get("singing_repair", False)), + info=t("singing_repair_info", "cover"), + visible=str(cover_cfg.get("vc_pipeline_mode", "current")).strip().lower() == "official", + ) + + cover_vc_route_status = gr.Textbox( label=t("vc_preprocess_status", 
"cover"), value=get_cover_vc_route_status( cover_cfg.get("vc_preprocess_mode", "auto"), @@ -1587,7 +1781,7 @@ def create_ui() -> gr.Blocks: maximum=200, value=default_mix["vocals_volume"], step=5, - info="100% 为原始音量" + info=t("normal_volume_info", "ui") ) cover_accompaniment_volume = gr.Slider( @@ -1596,7 +1790,7 @@ def create_ui() -> gr.Blocks: maximum=200, value=default_mix["accompaniment_volume"], step=5, - info="100% 为原始音量" + info=t("normal_volume_info", "ui") ) cover_reverb = gr.Slider( @@ -1605,7 +1799,7 @@ def create_ui() -> gr.Blocks: maximum=100, value=default_mix["reverb"], step=5, - info="为人声添加混响效果" + info=t("reverb_info", "ui") ) cover_rms_mix_rate = gr.Slider( @@ -1701,48 +1895,93 @@ def create_ui() -> gr.Blocks: inputs=[downloaded_series, downloaded_keyword], outputs=[downloaded_series, character_dropdown] ) + refresh_char_btn.click( + fn=lambda: get_downloaded_character_details(None), + outputs=[character_details] + ) downloaded_series.change( fn=update_downloaded_choices, inputs=[downloaded_series, downloaded_keyword], outputs=[character_dropdown] ) + downloaded_series.change( + fn=lambda: get_downloaded_character_details(None), + outputs=[character_details] + ) downloaded_keyword.change( fn=update_downloaded_choices, inputs=[downloaded_series, downloaded_keyword], outputs=[character_dropdown] ) + downloaded_keyword.change( + fn=lambda: get_downloaded_character_details(None), + outputs=[character_details] + ) + + character_dropdown.change( + fn=get_downloaded_character_details, + inputs=[character_dropdown], + outputs=[character_details] + ) download_series.change( fn=update_download_choices, inputs=[download_series, download_keyword], outputs=[download_char_dropdown] ) + download_series.change( + fn=lambda: get_available_character_details(None), + outputs=[download_char_details] + ) download_keyword.change( fn=update_download_choices, inputs=[download_series, download_keyword], outputs=[download_char_dropdown] ) + download_keyword.change( + 
fn=lambda: get_available_character_details(None), + outputs=[download_char_details] + ) + + download_char_dropdown.change( + fn=get_available_character_details, + inputs=[download_char_dropdown], + outputs=[download_char_details] + ) download_char_btn.click( fn=download_character, inputs=[download_char_dropdown, downloaded_series, downloaded_keyword], outputs=[download_char_status, character_dropdown, downloaded_series] ) + download_char_btn.click( + fn=get_available_character_details, + inputs=[download_char_dropdown], + outputs=[download_char_details] + ) download_all_series_btn.click( fn=download_all_characters, inputs=[download_series, downloaded_series, downloaded_keyword], outputs=[download_char_status, character_dropdown, downloaded_series] ) + download_all_series_btn.click( + fn=lambda: get_downloaded_character_details(None), + outputs=[character_details] + ) download_all_btn.click( fn=lambda series, keyword: download_all_characters("全部", series, keyword), inputs=[downloaded_series, downloaded_keyword], outputs=[download_char_status, character_dropdown, downloaded_series] ) + download_all_btn.click( + fn=lambda: get_downloaded_character_details(None), + outputs=[character_details] + ) cover_mix_preset.change( fn=apply_cover_mix_preset, @@ -1780,16 +2019,16 @@ def create_ui() -> gr.Blocks: outputs=[cover_vc_route_status] ) - cover_vc_pipeline_mode.change( - fn=get_cover_vc_route_status, - inputs=[cover_vc_preprocess_mode, cover_vc_pipeline_mode], - outputs=[cover_vc_route_status] - ) - cover_vc_pipeline_mode.change( - fn=update_singing_repair_visibility, - inputs=[cover_vc_pipeline_mode], - outputs=[cover_singing_repair] - ) + cover_vc_pipeline_mode.change( + fn=get_cover_vc_route_status, + inputs=[cover_vc_preprocess_mode, cover_vc_pipeline_mode], + outputs=[cover_vc_route_status] + ) + cover_vc_pipeline_mode.change( + fn=update_singing_repair_visibility, + inputs=[cover_vc_pipeline_mode], + outputs=[cover_singing_repair] + ) cover_btn.click( 
fn=process_cover, @@ -1801,13 +2040,13 @@ def create_ui() -> gr.Blocks: cover_speaker_id, cover_karaoke, cover_karaoke_merge_backing, - cover_vc_preprocess_mode, - cover_source_constraint_mode, - cover_vc_pipeline_mode, - cover_singing_repair, - cover_vocals_volume, - cover_accompaniment_volume, - cover_reverb, + cover_vc_preprocess_mode, + cover_source_constraint_mode, + cover_vc_pipeline_mode, + cover_singing_repair, + cover_vocals_volume, + cover_accompaniment_volume, + cover_reverb, cover_rms_mix_rate, cover_backing_mix, ], @@ -1846,7 +2085,7 @@ def create_ui() -> gr.Blocks: gr.Markdown("---") - gr.Markdown(f"### ⚙️ 运行设置") + gr.Markdown(f'### ⚙️ {t("runtime_settings", "settings")}') def _build_device_choices(): from lib.device import _has_xpu, _has_directml, _has_mps, _is_rocm @@ -1861,22 +2100,22 @@ def create_ui() -> gr.Blocks: choices.append(("DirectML (AMD/Intel GPU)", "directml")) if _has_mps(): choices.append(("MPS (Apple GPU)", "mps")) - choices.append(("CPU (较慢)", "cpu")) + choices.append((t("cpu_slow", "settings"), "cpu")) return choices device_radio = gr.Radio( - label="计算设备", + label=t("compute_device", "settings"), choices=_build_device_choices(), value=config.get("device", "cuda") ) save_settings_btn = gr.Button( - "💾 保存设置", + f"💾 {t('save_settings', 'settings')}", variant="primary" ) settings_status = gr.Textbox( - label="状态", + label=t("status", "settings"), interactive=False ) @@ -1888,7 +2127,7 @@ def create_ui() -> gr.Blocks: with open(config_path, "w", encoding="utf-8") as f: json.dump(config, f, indent=4, ensure_ascii=False) - return "✅ 设置已保存,重启后生效" + return t("settings_saved_restart", "settings") save_settings_btn.click( fn=save_settings, @@ -1899,60 +2138,11 @@ def create_ui() -> gr.Blocks: gr.Markdown("---") gr.Markdown(f"### ℹ️ {t('about', 'settings')}") - gr.Markdown( - """ - **RVC AI 翻唱系统** - - - 基于 RVC v2 + Mel-Band Roformer - - 使用 RMVPE 进行高质量 F0 提取 - - 支持 CUDA GPU 加速 - - 
[GitHub](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI) - """ - ) + gr.Markdown(t("about_body", "settings")) gr.Markdown("---") - gr.Markdown( - """ - ### 📥 角色模型来源 - - 以下是本项目角色模型的 HuggingFace 仓库来源,你也可以手动下载模型后放入 `assets/weights/characters/<角色名>/` 目录使用: - - **Love Live! 系列** - - [trioskosmos/rvc_models](https://huggingface.co/trioskosmos/rvc_models) — μ's / Aqours / 虹咲 / Liella! 多角色 - - [Icchan/LoveLive](https://huggingface.co/Icchan/LoveLive) — 千歌、梨子、绘里、曜 - - [0xMifune/LoveLive](https://huggingface.co/0xMifune/LoveLive) — 虹咲 / Liella! / 莲之空 - - [Swordsmagus/Love-Live-RVC](https://huggingface.co/Swordsmagus/Love-Live-RVC) — 花丸、雪菜、小鸟、A-RISE 等 - - [Zurakichi/RVC](https://huggingface.co/Zurakichi/RVC) — 妮可、彼方、雪菜、花丸 - - [Phos252/RVCmodels](https://huggingface.co/Phos252/RVCmodels) — 涩谷香音 - - [ChocoKat/Mari_Ohara](https://huggingface.co/ChocoKat/Mari_Ohara) — 小原鞠莉 - - [HarunaKasuga/YoshikoTsushima](https://huggingface.co/HarunaKasuga/YoshikoTsushima) — 津岛善子 - - [thebuddyadrian/RVC_Models](https://huggingface.co/thebuddyadrian/RVC_Models) — 鹿角姐妹 - - **原神 / 崩坏 / 绝区零 (米哈游)** - - [makiligon/RVC-Models](https://huggingface.co/makiligon/RVC-Models) — 芙宁娜、绫华、芙卡洛斯 - - [kohaku12/RVC-MODELS](https://huggingface.co/kohaku12/RVC-MODELS) — 纳西妲、黑塔、流萤、停云、星见雅 等 - - [jarari/RVC-v2](https://huggingface.co/jarari/RVC-v2) — 芙宁娜(韩语)、银狼(韩语) - - [mrmocciai/genshin-impact](https://huggingface.co/mrmocciai/genshin-impact) — 原神 50+ 角色(需手动下载) - - **VOCALOID** - - [javinfamous/infamous_miku_v2](https://huggingface.co/javinfamous/infamous_miku_v2) — 初音未来 (1000 epochs) - - **Hololive / VTuber** - - [megaaziib/my-rvc-models-collection](https://huggingface.co/megaaziib/my-rvc-models-collection) — 佩克拉、樱巫女、大空昴、Kobo、Kaela 等 - - [Kit-Lemonfoot/kitlemonfoot_rvc_models](https://huggingface.co/Kit-Lemonfoot/kitlemonfoot_rvc_models) — Hololive JP/EN 多角色 - - **偶像大师 / 赛马娘** - - [trioskosmos/rvc_models](https://huggingface.co/trioskosmos/rvc_models) — 神崎兰子、梦见莉亚梦 - - 
[makiligon/RVC-Models](https://huggingface.co/makiligon/RVC-Models) — 四条贵音、米浴 - - **Project SEKAI** - - [kohaku12/RVC-MODELS](https://huggingface.co/kohaku12/RVC-MODELS) — 草薙宁宁 - - > 💡 手动下载后,将 `.pth` 和 `.index` 文件放入 `assets/weights/characters/<角色名>/` 目录,刷新即可使用。 - """ - ) + gr.Markdown(t("model_sources", "settings")) return app