Spaces:

mason369
/

AI-RVC

Running

App Files Files Community

mason369 commited on May 17

Commit

a9536c4

verified ·

1 Parent(s): 4204217

Release v1.2.1

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.env.example +15 -0
.gitattributes +2 -0
.github/workflows/build-executables.yml +373 -0
.gitignore +80 -0
AGENTS.md +126 -0
AI-RVC.spec +71 -0
AI_RVC_Colab.ipynb +380 -0
CLAUDE.md +126 -0
LICENSE +21 -0
README_HF.md +6 -6
app.py +16 -9
assets/weights/characters/_version_notes.json +120 -0
assets/weights/characters/mari/metadata.json +177 -0
assets/weights/characters/tokai_teio/metadata.json +147 -0
check_deecho_config.py +130 -0
configs/config.json +15 -9
configs/presets/balanced.json +2 -2
configs/presets/clarity_priority.json +2 -2
configs/presets/timbre_priority.json +2 -2
docs/Colab演示.png +3 -0
docs/Windows界面.png +3 -0
docs/plans/2026-03-30-portable-ffmpeg-hardening.md +74 -0
docs/reports/2026-03-24-cover-quality-final-root-cause-report.md +127 -0
docs/superpowers/plans/2026-05-17-v1.2.1-language-release.md +90 -0
docs/superpowers/specs/2026-05-17-v1.2.1-language-release-design.md +21 -0
i18n/en_US.json +165 -0
i18n/zh_CN.json +41 -10
infer/__init__.py +17 -17
infer/advanced_dereverb.py +0 -32
infer/cover_pipeline.py +0 -0
infer/f0_extractor.py +114 -127
infer/modules/uvr5/modules.py +20 -4
infer/modules/vc/pipeline.py +89 -30
infer/official_adapter.py +114 -8
infer/quality_policy.py +294 -0
infer/separator.py +349 -123
install.py +396 -0
lib/audio.py +51 -51
lib/audio_metrics.py +93 -0
lib/ffmpeg_runtime.py +130 -0
lib/logger.py +254 -254
lib/mixer.py +22 -22
lib/runtime_build.py +59 -0
mcp/__init__.py +4 -0
mcp/server.py +139 -0
mcp/tools.py +233 -0
models/__init__.py +7 -7
models/rmvpe.py +439 -439
models/synthesizer.py +853 -853
requirements_hf.txt +39 -0

.env.example ADDED Viewed

	@@ -0,0 +1,15 @@

+# RVC 语音转换配置
+# 设备设置 (cuda / cpu)
+DEVICE=cuda
+# Gradio 服务器设置
+SERVER_HOST=127.0.0.1
+SERVER_PORT=7860
+# 模型路径
+HUBERT_PATH=assets/hubert/hubert_base.pt
+RMVPE_PATH=assets/rmvpe/rmvpe.pt
+# 日志级别 (DEBUG / INFO / WARNING / ERROR)
+LOG_LEVEL=INFO

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+docs/Colab演示.png filter=lfs diff=lfs merge=lfs -text
+docs/Windows界面.png filter=lfs diff=lfs merge=lfs -text

.github/workflows/build-executables.yml ADDED Viewed

	@@ -0,0 +1,373 @@

+name: Build Executables
+on:
+  push:
+    tags:
+      - 'v*'
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: '要上传资源的 Release 标签（如 v1.2.0），留空则上传到最新 Release'
+        required: false
+        default: ''
+permissions:
+  contents: write
+jobs:
+  build:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - os: windows-latest
+            variant: CPU
+            platform: Windows
+            pytorch_url: https://download.pytorch.org/whl/cpu
+            sep: ";"
+            shell: pwsh
+          - os: windows-latest
+            variant: GPU
+            platform: Windows
+            pytorch_url: https://download.pytorch.org/whl/cu121
+            sep: ";"
+            shell: pwsh
+          - os: ubuntu-latest
+            variant: CPU
+            platform: Linux
+            pytorch_url: https://download.pytorch.org/whl/cpu
+            sep: ":"
+            shell: bash
+          - os: ubuntu-latest
+            variant: GPU
+            platform: Linux
+            pytorch_url: https://download.pytorch.org/whl/cu121
+            sep: ":"
+            shell: bash
+    runs-on: ${{ matrix.os }}
+    name: Build ${{ matrix.platform }}-${{ matrix.variant }}
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.10'
+    - name: Install system dependencies (Linux)
+      if: runner.os == 'Linux'
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y build-essential libsndfile1 ffmpeg portaudio19-dev
+    - name: Install FFmpeg (Windows)
+      if: runner.os == 'Windows'
+      run: choco install ffmpeg -y
+    - name: Install Python dependencies
+      run: |
+        python -m pip install --upgrade "pip<24.1"
+        pip install pyinstaller
+        pip install torch torchaudio --index-url ${{ matrix.pytorch_url }}
+        pip install omegaconf==2.0.6 --no-deps
+        pip install PyYAML antlr4-python3-runtime hydra-core
+        pip install fairseq==0.12.2 --no-deps
+        pip install "jinja2<3.1.5"
+        pip install gradio==3.50.2 librosa soundfile scipy numpy praat-parselmouth pyworld faiss-cpu tqdm requests python-dotenv colorama huggingface_hub pedalboard ffmpeg-python av imageio-ffmpeg
+        pip install demucs torchcrepe --no-deps
+        pip install julius dora-search lameenc openunmix treetable
+        python -c "import torch; print(f'PyTorch: {torch.__version__}, CUDA: {torch.cuda.is_available()}')"
+    - name: Install audio-separator
+      run: pip install audio-separator onnxruntime
+    - name: Bundle FFmpeg runtime
+      shell: bash
+      run: |
+        python - <<'PY'
+        import os
+        import shutil
+        import stat
+        import subprocess
+        from pathlib import Path
+        import imageio_ffmpeg
+        def check_executable(executable: Path) -> None:
+            result = subprocess.run([str(executable), "-version"], text=True, capture_output=True)
+            if result.returncode != 0:
+                details = "\n".join(
+                    part.strip()
+                    for part in (result.stdout, result.stderr)
+                    if part and part.strip()
+                )
+                raise RuntimeError(f"{executable} failed -version:\n{details}")
+        def resolve_ffprobe() -> Path:
+            ffprobe_src = shutil.which("ffprobe")
+            if not ffprobe_src:
+                raise RuntimeError("ffprobe not found after installing FFmpeg")
+            ffprobe_path = Path(ffprobe_src)
+            if os.name == "nt":
+                chocolatey_root = Path(os.environ.get("ChocolateyInstall", r"C:\ProgramData\chocolatey"))
+                chocolatey_shim_dir = chocolatey_root / "bin"
+                if ffprobe_path.parent.resolve() == chocolatey_shim_dir.resolve():
+                    real_ffprobe = chocolatey_root / "lib" / "ffmpeg" / "tools" / "ffmpeg" / "bin" / "ffprobe.exe"
+                    if not real_ffprobe.exists():
+                        raise RuntimeError(f"Chocolatey ffprobe shim found, but real binary is missing: {real_ffprobe}")
+                    return real_ffprobe
+            return ffprobe_path
+        bundle_dir = Path("tools/ffmpeg/bin")
+        bundle_dir.mkdir(parents=True, exist_ok=True)
+        ffmpeg_src = Path(imageio_ffmpeg.get_ffmpeg_exe())
+        ffmpeg_name = "ffmpeg.exe" if os.name == "nt" else "ffmpeg"
+        ffmpeg_dest = bundle_dir / ffmpeg_name
+        shutil.copy2(ffmpeg_src, ffmpeg_dest)
+        ffmpeg_dest.chmod(ffmpeg_dest.stat().st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
+        check_executable(ffmpeg_dest)
+        print(f"Bundled ffmpeg: {ffmpeg_dest}")
+        ffprobe_src = resolve_ffprobe()
+        ffprobe_name = "ffprobe.exe" if os.name == "nt" else "ffprobe"
+        ffprobe_dest = bundle_dir / ffprobe_name
+        shutil.copy2(ffprobe_src, ffprobe_dest)
+        ffprobe_dest.chmod(ffprobe_dest.stat().st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
+        subprocess.run([str(ffprobe_dest), "-version"], text=True, capture_output=True, check=True)
+        print(f"Bundled ffprobe: {ffprobe_dest}")
+        PY
+    - name: Download all AI models
+      shell: bash
+      env:
+        PYTHONIOENCODING: utf-8
+      run: |
+        echo "=== Download all base models (HuBERT, RMVPE, UVR5, DeEcho, pretrained) ==="
+        python tools/download_models.py --all
+        echo "=== Download Roformer separator models ==="
+        python -c "
+        from audio_separator.separator import Separator
+        import os
+        model_dir = os.path.join('assets', 'separator_models')
+        os.makedirs(model_dir, exist_ok=True)
+        # Vocal separation model
+        sep = Separator(output_dir='.', model_file_dir=model_dir)
+        sep.load_model('model_bs_roformer_ep_317_sdr_12.9755.ckpt')
+        del sep
+        # Karaoke SOTA ensemble models
+        for karaoke_model in [
+            'mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt',
+            'mel_band_roformer_karaoke_gabox_v2.ckpt',
+            'mel_band_roformer_karaoke_becruily.ckpt',
+        ]:
+            sep2 = Separator(output_dir='.', model_file_dir=model_dir)
+            sep2.load_model(karaoke_model)
+            del sep2
+        print('Roformer models downloaded.')
+        "
+        echo "=== Verify downloaded models ==="
+        find assets/ -name "*.pt" -o -name "*.pth" -o -name "*.ckpt" -o -name "*.onnx" | while read f; do
+          SIZE=$(stat -c%s "$f" 2>/dev/null || stat -f%z "$f")
+          echo "  $f ($(( SIZE / 1048576 )) MB)"
+        done
+    - name: Build executable
+      shell: bash
+      run: |
+        SEP="${{ matrix.sep }}"
+        NAME="AI-RVC-${{ matrix.platform }}-${{ matrix.variant }}"
+        pyinstaller --name "${NAME}" \
+          --onedir \
+          --add-data "ui${SEP}ui" \
+          --add-data "infer${SEP}infer" \
+          --add-data "lib${SEP}lib" \
+          --add-data "models${SEP}models" \
+          --add-data "tools${SEP}tools" \
+          --add-data "i18n${SEP}i18n" \
+          --add-data "configs${SEP}configs" \
+          --add-data "assets/hubert${SEP}assets/hubert" \
+          --add-data "assets/rmvpe${SEP}assets/rmvpe" \
+          --add-data "assets/uvr5_weights${SEP}assets/uvr5_weights" \
+          --add-data "assets/pretrained_v2${SEP}assets/pretrained_v2" \
+          --add-data "assets/separator_models${SEP}assets/separator_models" \
+          --hidden-import=torch \
+          --hidden-import=torchaudio \
+          --hidden-import=gradio \
+          --hidden-import=librosa \
+          --hidden-import=soundfile \
+          --hidden-import=fairseq \
+          --hidden-import=audio_separator \
+          --hidden-import=demucs \
+          --hidden-import=pedalboard \
+          --collect-all torch \
+          --collect-all torchaudio \
+          --collect-all gradio \
+          --collect-all gradio_client \
+          run.py
+    - name: Create portable package
+      shell: bash
+      run: |
+        NAME="AI-RVC-${{ matrix.platform }}-${{ matrix.variant }}"
+        PKG="${NAME}-Portable"
+        mkdir -p "${PKG}"
+        # onedir 输出在 dist/NAME/ 目录下，复制全部内容
+        cp -r dist/${NAME}/* "${PKG}/"
+        cp README.md "${PKG}/"
+        [ -f LICENSE ] && cp LICENSE "${PKG}/"
+        if [ "${{ matrix.variant }}" = "GPU" ]; then
+          VARIANT_NOTE="GPU 版（CUDA 12.1），支持 NVIDIA 显卡加速
+        如果没有 NVIDIA 显卡，程序会自动回退到 CPU 推理"
+        else
+          VARIANT_NOTE="CPU 版，无需显卡即可运行
+        如需 GPU 加速，请下载 GPU 版本或使用本地安装方式：python install.py"
+        fi
+        if [ "${{ matrix.platform }}" = "Windows" ]; then
+          EXE_NAME="${NAME}.exe"
+          cat > "${PKG}/使用说明.txt" << HEREDOC
+        AI-RVC ${{ matrix.platform }} 便携版（${{ matrix.variant }}）
+        使用方法：
+          双击 ${EXE_NAME} 启动
+          浏览器访问 http://127.0.0.1:7860
+        ${VARIANT_NOTE}
+        AI 模型已内置，无需额外下载
+        无需安装 Python，解压即用
+        HEREDOC
+        else
+          EXE_NAME="${NAME}"
+          chmod +x "${PKG}/${EXE_NAME}"
+          cat > "${PKG}/使用说明.txt" << HEREDOC
+        AI-RVC ${{ matrix.platform }} 便携版（${{ matrix.variant }}）
+        使用方法：
+          chmod +x ${EXE_NAME}
+          ./${EXE_NAME}
+          浏览器访问 http://127.0.0.1:7860
+        ${VARIANT_NOTE}
+        AI 模型已内置，无需额外下载
+        无需安装 Python，解压即用
+        HEREDOC
+        fi
+    - name: Compress package
+      shell: bash
+      run: |
+        NAME="AI-RVC-${{ matrix.platform }}-${{ matrix.variant }}"
+        PKG="${NAME}-Portable"
+        if [ "${{ matrix.platform }}" = "Windows" ]; then
+          # 先压缩成 zip
+          7z a -tzip "${PKG}.zip" "${PKG}"
+          # 超过 1.9GB 时删除 zip 改用 7z（压缩率更高）
+          FILE_SIZE=$(stat -c%s "${PKG}.zip" 2>/dev/null || wc -c < "${PKG}.zip" | tr -d ' ')
+          if [ "$FILE_SIZE" -gt 1900000000 ]; then
+            rm "${PKG}.zip"
+            # 先尝试不分卷的 7z
+            7z a "${PKG}.7z" "${PKG}"
+            SEVENZ_SIZE=$(stat -c%s "${PKG}.7z" 2>/dev/null || wc -c < "${PKG}.7z" | tr -d ' ')
+            if [ "$SEVENZ_SIZE" -gt 1900000000 ]; then
+              # 7z 也超限，改用分卷
+              rm "${PKG}.7z"
+              7z a -v1900m "${PKG}.7z" "${PKG}"
+            fi
+          fi
+        else
+          tar -czf "${PKG}.tar.gz" "${PKG}"
+          # 超过 1.9GB 时拆分
+          FILE_SIZE=$(stat -c%s "${PKG}.tar.gz" 2>/dev/null || stat -f%z "${PKG}.tar.gz")
+          if [ "$FILE_SIZE" -gt 1900000000 ]; then
+            split -b 1900M "${PKG}.tar.gz" "${PKG}.tar.gz.part"
+            rm "${PKG}.tar.gz"
+          fi
+        fi
+    - name: Upload artifact
+      uses: actions/upload-artifact@v4
+      with:
+        name: AI-RVC-${{ matrix.platform }}-${{ matrix.variant }}
+        path: AI-RVC-${{ matrix.platform }}-${{ matrix.variant }}-Portable*
+  upload-release:
+    needs: [build]
+    runs-on: ubuntu-latest
+    steps:
+    - name: Download all artifacts
+      uses: actions/download-artifact@v4
+      with:
+        merge-multiple: true
+    - name: List artifacts
+      run: ls -lh AI-RVC-*
+    - name: Determine target tag
+      id: tag
+      run: |
+        # The manually supplied tag is read from the INPUT_TAG environment variable
+        # rather than being expanded into the script text, so a crafted value
+        # cannot be executed as shell code.
+        if [ "${{ github.event_name }}" = "push" ]; then
+          # Tag push: strip the refs/tags/ prefix from the pushed ref.
+          echo "tag=${GITHUB_REF#refs/tags/}" >> "$GITHUB_OUTPUT"
+        elif [ -n "${INPUT_TAG}" ]; then
+          echo "tag=${INPUT_TAG}" >> "$GITHUB_OUTPUT"
+        else
+          # No tag given: fall back to the first release returned by gh.
+          LATEST=$(gh release list --repo ${{ github.repository }} --limit 1 --json tagName -q '.[0].tagName')
+          echo "tag=${LATEST}" >> "$GITHUB_OUTPUT"
+        fi
+      env:
+        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        INPUT_TAG: ${{ github.event.inputs.tag }}
+    - name: Upload assets to release
+      run: |
+        # The tag comes in through the environment so its value is never expanded
+        # into the script text itself.
+        TAG="${TARGET_TAG}"
+        echo "上传资源到 Release: ${TAG}"
+        # Collect every build artifact (including split-volume parts).
+        mapfile -t FILES < <(find . -maxdepth 1 -name "AI-RVC-*-Portable*" -type f | sort)
+        # upload_asset_with_retry TAG FILE
+        # Upload FILE to the release TAG, trying up to 3 times with a growing
+        # pause (20s, 40s) between attempts. Returns 0 on success, 1 otherwise.
+        upload_asset_with_retry() {
+          local tag="$1"
+          local file="$2"
+          local attempt
+          local exit_code
+          for attempt in 1 2 3; do
+            echo "上传(${attempt}/3): ${file}"
+            if timeout 30m gh release upload "${tag}" "${file}" \
+              --repo ${{ github.repository }} \
+              --clobber; then
+              return 0
+            else
+              # Capture the status here: after "fi", $? would be 0 because an
+              # if statement with no branch taken exits with status zero.
+              exit_code=$?
+            fi
+            echo "上传失败(退出码=${exit_code}): ${file}"
+            if [ "${attempt}" -lt 3 ]; then
+              sleep $(( attempt * 20 ))
+            fi
+          done
+          echo "上传最终失败: ${file}"
+          return 1
+        }
+        echo "找到以下文件："
+        for f in "${FILES[@]}"; do
+          SIZE=$(stat -c%s "$f" 2>/dev/null || stat -f%z "$f")
+          echo "  $f ($(( SIZE / 1048576 )) MB)"
+        done
+        # Upload one file at a time.
+        for f in "${FILES[@]}"; do
+          upload_asset_with_retry "${TAG}" "$f"
+        done
+        echo "全部上传完成"
+      env:
+        GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        TARGET_TAG: ${{ steps.tag.outputs.tag }}

.gitignore ADDED Viewed

	@@ -0,0 +1,80 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# 虚拟环境
+venv/
+ENV/
+env/
+.venv/
+# IDE
+.idea/
+.vscode/
+.claude/
+*.swp
+*.swo
+# 环境变量
+.env
+# 模型文件 (太大，不提交)
+assets/hubert/*.pt
+assets/rmvpe/*.pt
+assets/pretrained_v2/*.pth
+assets/separator_models/
+assets/uvr5_weights/
+assets/weights/*.pth
+assets/weights/*.index
+assets/weights/characters/**/*.pth
+assets/weights/characters/**/*.index
+assets/weights/characters/**/ai_rvc_model.json
+assets/weights/official_models/
+assets/weights/official_indexes/
+# 临时文件
+temp/
+*.tmp
+*.temp
+*.log
+logs/
+# 音频文件
+*.wav
+*.mp3
+*.flac
+*.ogg
+!assets/test/
+# Gradio
+flagged/
+/venv310
+# 上游参考代码 (vendored, 独立 git repo)
+_official_rvc/
+# 输出文件
+output/
+outputs/
+/.playwright-cli
+/AI-RVC-Windows-GPU-Portable
+/AI-RVC-Windows-GPU-Portable.7z.*
+/tools/ffmpeg/
+/scratch/

AGENTS.md ADDED Viewed

	@@ -0,0 +1,126 @@

+# AGENTS.md
+This file provides guidance to Codex when working with code in this repository.
+## Project Overview
+RVC v2 voice conversion & AI cover system. Users upload a song, the pipeline separates vocals from accompaniment (Mel-Band Roformer), converts the vocal timbre via RVC v2 (HuBERT + RMVPE + FAISS), then mixes the result back with the accompaniment.
+**Platform Support**: Windows / Linux / WSL2 / Google Colab
+**Key Features**:
+- AI song covers with automatic vocal separation and mixing
+- 117 downloadable character models
+- 4 mixing presets (universal, vocal-focused, accompaniment-focused, live)
+- Karaoke mode (lead/backing vocal separation)
+- 4 VC preprocessing modes (auto, direct, uvr_deecho, legacy)
+- Dual VC pipeline (current implementation vs official RVC)
+- Multi-backend GPU support (CUDA, ROCm, XPU, DirectML, MPS)
+## Commands
+```powershell
+# Activate venv (Windows)
+.\venv310\Scripts\Activate.ps1
+# Activate venv (Linux/WSL2)
+source venv310/bin/activate
+# Install dependencies
+python install.py              # full install + launch
+python install.py --check      # check only
+python install.py --cpu        # CPU variant
+# Run
+python run.py                          # default: http://127.0.0.1:7860
+python run.py --skip-check             # skip env/model validation
+python run.py --host 0.0.0.0 --port 8080 --share
+# Download base models (HuBERT, RMVPE)
+python tools/download_models.py
+# Download character models
+python -c "from tools.character_models import download_character_model; download_character_model('rin')"
+# Quick CUDA check
+python -c "import torch; print(torch.cuda.is_available())"
+# Colab
+# Open AI_RVC_Colab.ipynb in Google Colab, set runtime to GPU (T4), run cells sequentially
+```
+## Architecture
+**Entry:** `run.py` → env check → model check → `ui/app.py:launch()`
+**Pipeline flow** (`infer/cover_pipeline.py:CoverPipeline.process`):
+1. Vocal separation (`infer/separator.py`) — Roformer (default), Demucs, or UVR5
+2. RVC voice conversion (`infer/pipeline.py`) — HuBERT features → RMVPE F0 → RVC v2 inference with FAISS retrieval
+3. Mixing (`lib/mixer.py`) — volume adjust + reverb via pedalboard
+**Character model system** (`tools/character_models.py`):
+- 117 downloadable character models from HuggingFace (`trioskosmos/rvc_models`)
+- Stored in `assets/weights/characters/`
+- Version notes (epochs, sample rate) extracted from .pth metadata and cached in `_version_notes.json`
+- Display name assembly: `_get_display_name()` appends `(500 epochs·40k)` style training info
+**UI** (`ui/app.py`):
+- Gradio 3.50.2, single-file ~2000 lines
+- i18n via `i18n/zh_CN.json`, accessed through `t(key, section)` helper
+- Three main tabs: song cover (full pipeline), model management, settings
+- Cover tab features:
+  - Character model download/management with series filtering and keyword search
+  - 4 mixing presets (universal, vocal-focused, accompaniment-focused, live)
+  - Karaoke separation (lead/backing vocals)
+  - 4 VC preprocessing modes (auto, direct, uvr_deecho, legacy)
+  - Source constraint control (auto/off/on)
+  - Dual VC pipeline mode (current/official)
+  - Singing repair (official mode only)
+  - Real-time VC route status display
+- Model management tab:
+  - Base model download (HuBERT, RMVPE)
+  - Mature DeEcho model download
+  - Model list table with refresh
+- Settings tab:
+  - Device info display
+  - Backend selection (CUDA/ROCm/XPU/DirectML/MPS/CPU)
+  - Config save
+**Config:** `configs/config.json` — device, F0 method, index rate, cover separator settings, path mappings
+## Key Conventions
+- Python 3.10, UTF-8, 4-space indent
+- `snake_case` functions/variables, `PascalCase` classes, `UPPER_SNAKE_CASE` constants
+- User-facing text is bilingual Chinese/English
+- Commit messages: short imperative subjects, Chinese/English mixed (e.g. `infer: fix CUDA OOM`)
+- No automated test suite; verify changes by running one voice conversion + one cover through the UI
+- `_official_rvc/` is vendored upstream reference — don't modify unless syncing
+## Important Paths
+- `configs/config.json` — all runtime settings
+- `infer/cover_pipeline.py` — orchestrates the full cover workflow
+- `infer/pipeline.py` — RVC v2 inference core
+- `infer/separator.py` — Roformer/Demucs vocal separation wrappers
+- `tools/character_models.py` — character model registry (117 entries) + download logic
+- `tools/download_models.py` — base model (HuBERT/RMVPE) + mature DeEcho downloader
+- `lib/mixer.py` — audio mixing with volume/reverb
+- `ui/app.py` — entire Gradio UI (~2000 lines)
+- `mcp/server.py` + `mcp/tools.py` — MCP server integration for Codex
+- `AI_RVC_Colab.ipynb` — Google Colab notebook with full feature parity
+- `install.py` — cross-platform installation script (Windows/Linux)
+## Things to Watch
+- `fairseq` is pinned to `0.12.2` — HuBERT loading breaks on other versions
+- `audio-separator` must be installed with `[gpu]` extra for CUDA support
+- Roformer model auto-downloads on first use to `assets/separator_models/`
+- Gradio is pinned to `3.50.2`; the UI code uses v3 API patterns (not v4)
+- Model weights (.pt, .pth) and audio files are gitignored — never commit them
+- Path handling uses `pathlib.Path` for cross-platform compatibility (Windows/Linux)
+- Virtual environment activation differs by platform: `Scripts/Activate.ps1` (Windows) vs `bin/activate` (Linux)
+- `install.py` has hardcoded Windows Python paths in `PYTHON310_CANDIDATES` but falls back to `py -3.10` launcher
+- Platform detection uses `os.name == "nt"` for Windows-specific logic (venv paths, etc.)
+- All core functionality is platform-agnostic; audio libraries work better on Linux
+- Colab notebook (`AI_RVC_Colab.ipynb`) provides full feature parity with Web UI

AI-RVC.spec ADDED Viewed

	@@ -0,0 +1,71 @@

+# -*- mode: python ; coding: utf-8 -*-
+block_cipher = None
+a = Analysis(
+    ['run.py'],
+    pathex=[],
+    binaries=[],
+    datas=[
+        ('ui', 'ui'),
+        ('infer', 'infer'),
+        ('lib', 'lib'),
+        ('models', 'models'),
+        ('tools', 'tools'),
+        ('i18n', 'i18n'),
+        ('configs', 'configs'),
+        ('assets/hubert', 'assets/hubert'),
+        ('assets/rmvpe', 'assets/rmvpe'),
+    ],
+    hiddenimports=[
+        'torch',
+        'torchaudio',
+        'gradio',
+        'librosa',
+        'soundfile',
+        'scipy',
+        'numpy',
+        'fairseq',
+        'audio_separator',
+        'demucs',
+        'pedalboard',
+        'praat-parselmouth',
+        'pyworld',
+        'torchcrepe',
+        'faiss',
+        'huggingface_hub',
+        'ffmpeg',
+    ],
+    hookspath=[],
+    hooksconfig={},
+    runtime_hooks=[],
+    excludes=[],
+    win_no_prefer_redirects=False,
+    win_private_assemblies=False,
+    cipher=block_cipher,
+    noarchive=False,
+)
+pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
+exe = EXE(
+    pyz,
+    a.scripts,
+    a.binaries,
+    a.zipfiles,
+    a.datas,
+    [],
+    name='AI-RVC',
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,
+    upx=True,
+    upx_exclude=[],
+    runtime_tmpdir=None,
+    console=True,
+    disable_windowed_traceback=False,
+    argv_emulation=False,
+    target_arch=None,
+    codesign_identity=None,
+    entitlements_file=None,
+)

AI_RVC_Colab.ipynb ADDED Viewed

	@@ -0,0 +1,380 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# AI-RVC 一键 AI 翻唱 (Colab)\n",
+    "\n",
+    "[Open in Colab](https://colab.research.google.com/github/mason369/AI-RVC/blob/master/AI_RVC_Colab.ipynb)\n",
+    "\n",
+    "这是 AI-RVC 的 Google Colab 运行入口。Notebook 会创建独立 Python 3.10 环境，再运行项目安装脚本，避免 Colab 默认 Python 版本变化影响 `fairseq==0.12.2`。\n",
+    "\n",
+    "当前处理链路模型：\n",
+    "\n",
+    "| 环节 | 默认模型 |\n",
+    "|------|----------|\n",
+    "| 人声分离 | `ensemble:vocal_rvc` |\n",
+    "| 卡拉OK分离 | `ensemble:karaoke` |\n",
+    "| 去混响 / DeEcho | `dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt` |\n",
+    "| 内容特征 | `hubert_base.pt` |\n",
+    "| F0 音高 | `rmvpe.pt` |\n",
+    "| VC 推理 | RVC v2 `.pth` + 可选 FAISS `.index` |\n",
+    "\n",
+    "使用前请在 Colab 菜单选择：`代码执行程序 -> 更改运行时类型 -> T4 GPU`，然后按顺序运行每个单元格。"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. 检查 Colab 运行时\n",
+    "\n",
+    "确认当前机器有 GPU，并显示 Colab 默认 Python。后续实际安装会使用独立 Python 3.10 环境。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import subprocess\n",
+    "\n",
+    "print('Colab 默认 Python:', sys.version)\n",
+    "print('\\nGPU 信息:')\n",
+    "subprocess.run(['nvidia-smi'], check=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. 克隆或刷新仓库\n",
+    "\n",
+    "如果 `/content/AI-RVC` 已存在，会重置到 GitHub `master` 分支，确保 Colab 环境和仓库最新代码一致。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "set -euo pipefail\n",
+    "cd /content\n",
+    "if [ -d AI-RVC/.git ]; then\n",
+    "  git -C AI-RVC fetch origin master\n",
+    "  git -C AI-RVC reset --hard origin/master\n",
+    "else\n",
+    "  git clone https://github.com/mason369/AI-RVC.git AI-RVC\n",
+    "fi\n",
+    "cd /content/AI-RVC\n",
+    "git rev-parse --short HEAD\n",
+    "ls -la"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. 创建 Python 3.10 环境并安装依赖\n",
+    "\n",
+    "这个步骤会安装 `uv`，用它准备 Python 3.10，然后调用项目自带 `install.py --no-run`。如果任何依赖安装失败，单元格会直接停止并显示错误。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "set -euo pipefail\n",
+    "cd /content/AI-RVC\n",
+    "\n",
+    "sudo apt-get update -y\n",
+    "sudo apt-get install -y ffmpeg build-essential python3-dev curl git\n",
+    "\n",
+    "if ! command -v uv >/dev/null 2>&1; then\n",
+    "  curl -LsSf https://astral.sh/uv/install.sh | sh\n",
+    "fi\n",
+    "export PATH=\"$HOME/.local/bin:$PATH\"\n",
+    "uv --version\n",
+    "uv python install 3.10\n",
+    "uv venv --seed --python 3.10 venv310\n",
+    "PY=/content/AI-RVC/venv310/bin/python\n",
+    "$PY --version\n",
+    "$PY -m pip --version\n",
+    "$PY -m pip install --upgrade pip\n",
+    "$PY -m pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu126\n",
+    "$PY - <<'PY'\n",
+    "import torch\n",
+    "print('Torch:', torch.__version__)\n",
+    "print('CUDA available:', torch.cuda.is_available())\n",
+    "if not torch.cuda.is_available():\n",
+    "    raise RuntimeError('GPU PyTorch install completed but CUDA is not available.')\n",
+    "print('CUDA:', torch.version.cuda)\n",
+    "print('GPU:', torch.cuda.get_device_name(0))\n",
+    "PY\n",
+    "uv run --python 3.10 python install.py --no-run"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. 严格验证依赖与默认模型配置\n",
+    "\n",
+    "这里会确认关键包、Python 版本、CUDA、Colab 默认分离模型、卡拉OK模型、DeEcho模型都符合当前项目配置。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "set -euo pipefail\n",
+    "cd /content/AI-RVC\n",
+    "PY=/content/AI-RVC/venv310/bin/python\n",
+    "$PY - <<'PY'\n",
+    "import importlib.metadata as md\n",
+    "import sys\n",
+    "import torch\n",
+    "from infer.separator import (\n",
+    "    ROFORMER_DEFAULT_MODEL,\n",
+    "    KARAOKE_DEFAULT_MODEL,\n",
+    "    ROFORMER_DEREVERB_DEFAULT_MODEL,\n",
+    ")\n",
+    "\n",
+    "assert sys.version_info[:2] == (3, 10), sys.version\n",
+    "print('Python:', sys.version)\n",
+    "print('Torch:', torch.__version__)\n",
+    "print('CUDA available:', torch.cuda.is_available())\n",
+    "if not torch.cuda.is_available():\n",
+    "    raise RuntimeError('Colab runtime did not provide CUDA. Please switch runtime type to T4 GPU and rerun.')\n",
+    "\n",
+    "expected = {\n",
+    "    'audio-separator': '0.44.1',\n",
+    "    'fairseq': '0.12.2',\n",
+    "    'gradio': '3.50.2',\n",
+    "}\n",
+    "for dist, minimum in expected.items():\n",
+    "    version = md.version(dist)\n",
+    "    print(f'{dist}: {version}')\n",
+    "\n",
+    "assert ROFORMER_DEFAULT_MODEL == 'ensemble:vocal_rvc', ROFORMER_DEFAULT_MODEL\n",
+    "assert KARAOKE_DEFAULT_MODEL == 'ensemble:karaoke', KARAOKE_DEFAULT_MODEL\n",
+    "assert ROFORMER_DEREVERB_DEFAULT_MODEL == 'dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt', ROFORMER_DEREVERB_DEFAULT_MODEL\n",
+    "\n",
+    "import faiss  # noqa: F401\n",
+    "from audio_separator.separator import Separator  # noqa: F401\n",
+    "import fairseq  # noqa: F401\n",
+    "print('Dependency and model default checks passed.')\n",
+    "PY"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. 下载基础模型\n",
+    "\n",
+    "下载 HuBERT、RMVPE、UVR5 HP2 等必需模型。模型会保存到仓库的 `assets/` 目录。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "set -euo pipefail\n",
+    "cd /content/AI-RVC\n",
+    "PY=/content/AI-RVC/venv310/bin/python\n",
+    "$PY tools/download_models.py\n",
+    "$PY - <<'PY'\n",
+    "from tools.download_models import REQUIRED_MODELS, check_model\n",
+    "missing = [name for name in REQUIRED_MODELS if not check_model(name)]\n",
+    "if missing:\n",
+    "    raise RuntimeError('Missing required models: ' + ', '.join(missing))\n",
+    "print('Required models are present:', ', '.join(REQUIRED_MODELS))\n",
+    "PY"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. 可选：查看或下载声线模型\n",
+    "\n",
+    "处理链路模型已经在前面准备好。这里的 `.pth` 声线模型是 RVC 目标音色，不属于分离 / F0 / HuBERT 等处理模型。可以先下载一个示例模型，也可以跳过后在 Web UI 中下载。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "set -euo pipefail\n",
+    "cd /content/AI-RVC\n",
+    "PY=/content/AI-RVC/venv310/bin/python\n",
+    "$PY - <<'PY'\n",
+    "from tools.character_models import list_available_characters, list_available_series\n",
+    "print('Available series:')\n",
+    "for series in list_available_series():\n",
+    "    print(' -', series)\n",
+    "chars = list_available_characters()\n",
+    "print('\\nFirst 20 voice models:')\n",
+    "for char in chars[:20]:\n",
+    "    print(f\" - {char['name']}: {char.get('display', char.get('description', ''))}\")\n",
+    "print('\\nTotal:', len(chars))\n",
+    "PY"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional: download one example voice model.\n",
+    "# Change character_name as needed, then run this cell.\n",
+    "\n",
+    "# %%bash\n",
+    "# set -euo pipefail\n",
+    "# cd /content/AI-RVC\n",
+    "# PY=/content/AI-RVC/venv310/bin/python\n",
+    "# $PY - <<'PY'\n",
+    "# from tools.character_models import download_character_model\n",
+    "# character_name = 'rin'\n",
+    "# if not download_character_model(character_name):\n",
+    "#     raise RuntimeError(f'Failed to download voice model: {character_name}')\n",
+    "# print(f'Downloaded voice model: {character_name}')\n",
+    "# PY"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 7. 全模式处理矩阵验证\n",
+    "\n",
+    "上传一段含有人声的短音乐文件到 `/content/input_audio.wav`；如果使用其他路径，请把它填到 `INPUT_AUDIO`，并同步修改下方矩阵单元格中的 `--input` 参数（该单元格固定读取 `/content/input_audio.wav`）。此步骤会下载示例声线模型 `rin`，然后依次验证 RoFormer、Karaoke、Demucs、UVR5、官方 1:1、官方唱歌修复等处理模式。任何模式失败都会停止并在 JSON 里保留 traceback。\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Upload a short vocal music file, then set INPUT_AUDIO to its path.\n",
+    "# Example after upload: INPUT_AUDIO = '/content/input_audio.wav'\n",
+    "INPUT_AUDIO = '/content/input_audio.wav'\n",
+    "\n",
+    "from pathlib import Path\n",
+    "if not Path(INPUT_AUDIO).exists():\n",
+    "    raise FileNotFoundError(\n",
+    "        f'Missing test audio: {INPUT_AUDIO}. Upload a short vocal music file to this path before running the matrix.'\n",
+    "    )\n",
+    "print('Matrix input:', INPUT_AUDIO, Path(INPUT_AUDIO).stat().st_size, 'bytes')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "set -euo pipefail\n",
+    "cd /content/AI-RVC\n",
+    "PY=/content/AI-RVC/venv310/bin/python\n",
+    "$PY - <<'PY'\n",
+    "from tools.character_models import download_character_model\n",
+    "if not download_character_model('rin'):\n",
+    "    raise RuntimeError('Failed to download voice model: rin')\n",
+    "print('Voice model ready: rin')\n",
+    "PY\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "set -euo pipefail\n",
+    "cd /content/AI-RVC\n",
+    "PY=/content/AI-RVC/venv310/bin/python\n",
+    "$PY tools/run_mode_matrix.py \\\n",
+    "  --input /content/input_audio.wav \\\n",
+    "  --output-dir /content/AI-RVC/outputs/mode_matrix_colab \\\n",
+    "  --require-cuda\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 8. 启动 Web UI\n",
+    "\n",
+    "运行后 Colab 会输出 Gradio 公共链接。打开链接即可使用 AI-RVC Web UI。此单元格会再次检查环境和必需模型。"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%bash\n",
+    "set -euo pipefail\n",
+    "cd /content/AI-RVC\n",
+    "PY=/content/AI-RVC/venv310/bin/python\n",
+    "$PY run.py --host 0.0.0.0 --port 7860 --share"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 常见问题\n",
+    "\n",
+    "| 问题 | 处理方式 |\n",
+    "|------|----------|\n",
+    "| `nvidia-smi` 失败 | 在 Colab 菜单选择 GPU 运行时后重新运行 |\n",
+    "| `fairseq` 安装失败 | 确认本 notebook 的第 3 步使用了 Python 3.10 环境 |\n",
+    "| `CUDA out of memory` | 换短音频，或在 UI 中降低处理负载 / 切换分离器 |\n",
+    "| 模型下载失败 | 重新运行第 5 步；如果 Hugging Face 限流，可稍后再试或配置 token |\n",
+    "\n",
+    "本项目仅供学习研究和个人娱乐用途，请勿用于商业用途、欺诈、冒充他人或侵犯权益的场景。"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "T4",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}

CLAUDE.md ADDED Viewed

	@@ -0,0 +1,126 @@

+# CLAUDE.md
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+## Project Overview
+RVC v2 voice conversion & AI cover system. Users upload a song, the pipeline separates vocals from accompaniment (Mel-Band Roformer), converts the vocal timbre via RVC v2 (HuBERT + RMVPE + FAISS), then mixes the result back with the accompaniment.
+**Platform Support**: Windows / Linux / WSL2 / Google Colab
+**Key Features**:
+- AI song covers with automatic vocal separation and mixing
+- 117 downloadable character models
+- 4 mixing presets (universal, vocal-focused, accompaniment-focused, live)
+- Karaoke mode (lead/backing vocal separation)
+- 4 VC preprocessing modes (auto, direct, uvr_deecho, legacy)
+- Dual VC pipeline (current implementation vs official RVC)
+- Multi-backend GPU support (CUDA, ROCm, XPU, DirectML, MPS)
+## Commands
+```powershell
+# Activate venv (Windows)
+.\venv310\Scripts\Activate.ps1
+# Activate venv (Linux/WSL2)
+source venv310/bin/activate
+# Install dependencies
+python install.py              # full install + launch
+python install.py --check      # check only
+python install.py --cpu        # CPU variant
+# Run
+python run.py                          # default: http://127.0.0.1:7860
+python run.py --skip-check             # skip env/model validation
+python run.py --host 0.0.0.0 --port 8080 --share
+# Download base models (HuBERT, RMVPE)
+python tools/download_models.py
+# Download character models
+python -c "from tools.character_models import download_character_model; download_character_model('rin')"
+# Quick CUDA check
+python -c "import torch; print(torch.cuda.is_available())"
+# Colab
+# Open AI_RVC_Colab.ipynb in Google Colab, set runtime to GPU (T4), run cells sequentially
+```
+## Architecture
+**Entry:** `run.py` → env check → model check → `ui/app.py:launch()`
+**Pipeline flow** (`infer/cover_pipeline.py:CoverPipeline.process`):
+1. Vocal separation (`infer/separator.py`) — Roformer (default), Demucs, or UVR5
+2. RVC voice conversion (`infer/pipeline.py`) — HuBERT features → RMVPE F0 → RVC v2 inference with FAISS retrieval
+3. Mixing (`lib/mixer.py`) — volume adjust + reverb via pedalboard
+**Character model system** (`tools/character_models.py`):
+- 117 downloadable character models from HuggingFace (`trioskosmos/rvc_models`)
+- Stored in `assets/weights/characters/`
+- Version notes (epochs, sample rate) extracted from .pth metadata and cached in `_version_notes.json`
+- Display name assembly: `_get_display_name()` appends `(500 epochs·40k)` style training info
+**UI** (`ui/app.py`):
+- Gradio 3.50.2, single-file ~2000 lines
+- i18n via `i18n/zh_CN.json` and `i18n/en_US.json`, accessed through `t(key, section)` helper
+- Three main tabs: song cover (full pipeline), model management, settings
+- Cover tab features:
+  - Character model download/management with series filtering and keyword search
+  - 4 mixing presets (universal, vocal-focused, accompaniment-focused, live)
+  - Karaoke separation (lead/backing vocals)
+  - 4 VC preprocessing modes (auto, direct, uvr_deecho, legacy)
+  - Source constraint control (auto/off/on)
+  - Dual VC pipeline mode (current/official)
+  - Singing repair (official mode only)
+  - Real-time VC route status display
+- Model management tab:
+  - Base model download (HuBERT, RMVPE)
+  - Mature DeEcho model download
+  - Model list table with refresh
+- Settings tab:
+  - Device info display
+  - Backend selection (CUDA/ROCm/XPU/DirectML/MPS/CPU)
+  - Config save
+**Config:** `configs/config.json` — device, F0 method, index rate, cover separator settings, path mappings
+## Key Conventions
+- Python 3.10, UTF-8, 4-space indent
+- `snake_case` functions/variables, `PascalCase` classes, `UPPER_SNAKE_CASE` constants
+- User-facing text is bilingual Chinese/English
+- Commit messages: short imperative subjects, Chinese/English mixed (e.g. `infer: fix CUDA OOM`)
+- No automated test suite; verify changes by running one voice conversion + one cover through the UI
+- `_official_rvc/` is vendored upstream reference — don't modify unless syncing
+## Important Paths
+- `configs/config.json` — all runtime settings
+- `infer/cover_pipeline.py` — orchestrates the full cover workflow
+- `infer/pipeline.py` — RVC v2 inference core
+- `infer/separator.py` — Roformer/Demucs vocal separation wrappers
+- `tools/character_models.py` — character model registry (117 entries) + download logic
+- `tools/download_models.py` — base model (HuBERT/RMVPE) + mature DeEcho downloader
+- `lib/mixer.py` — audio mixing with volume/reverb
+- `ui/app.py` — entire Gradio UI (~2000 lines)
+- `mcp/server.py` + `mcp/tools.py` — MCP server integration for Claude Code
+- `AI_RVC_Colab.ipynb` — Google Colab notebook with full feature parity
+- `install.py` — cross-platform installation script (Windows/Linux)
+## Things to Watch
+- `fairseq` is pinned to `0.12.2` — HuBERT loading breaks on other versions
+- `audio-separator` must be installed with `[gpu]` extra for CUDA support
+- Roformer model auto-downloads on first use to `assets/separator_models/`
+- Gradio is pinned to `3.50.2`; the UI code uses v3 API patterns (not v4)
+- Model weights (.pt, .pth) and audio files are gitignored — never commit them
+- Path handling uses `pathlib.Path` for cross-platform compatibility (Windows/Linux)
+- Virtual environment activation differs by platform: `Scripts/Activate.ps1` (Windows) vs `bin/activate` (Linux)
+- `install.py` has hardcoded Windows Python paths in `PYTHON310_CANDIDATES` but falls back to `py -3.10` launcher
+- Platform detection uses `os.name == "nt"` for Windows-specific logic (venv paths, etc.)
+- All core functionality is platform-agnostic; audio libraries work better on Linux
+- Colab notebook (`AI_RVC_Colab.ipynb`) provides full feature parity with Web UI

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README_HF.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: AI-RVC 语音转换 & AI 翻唱
 emoji: 🎤
 colorFrom: blue
 colorTo: purple
@@ -10,15 +10,15 @@ pinned: false
 license: mit
 ---
-# 🎤 AI-RVC 语音转换 & AI 翻唱
-基于 RVC v2 + RMVPE 的高质量语音转换系统，支持一键 AI 翻唱功能。
 ## 功能特点
 - **AI 歌曲翻唱**：上传歌曲自动分离人声、转换音色、混合伴奏，一键生成翻唱
 - **人声分离**：默认 Mel-Band Roformer (KimberleyJensen)，在 MVSEP 公开 Multisong 指标中为 Vocals SDR 11.01 / Instrum SDR 17.32
-- **语音转换**：RVC v2 架构 + FAISS 检索增强流程
 - **RMVPE 音高提取**：高精度 F0 提取，噪声鲁棒性强
 - **角色模型**：内置 117 个可下载角色模型
 - **混音效果**：支持人声混响、音量调节、4 种混音预设
@@ -88,7 +88,7 @@ license: mit
               ↓
           人声分离 (Mel-Band Roformer)
               ↓
-          RVC 语音转换 (HuBERT + RMVPE + FAISS)
               ↓
           混音 (音量调节 + 混响)
               ↓
@@ -148,4 +148,4 @@ A: 建议选择与原唱性别、音色相近的角色，效果更自然。
 **License**: MIT
 **Version**: 2.0
-**Last Updated**: 2026-03-10

 ---
+title: AI-RVC 一键 AI 翻唱
 emoji: 🎤
 colorFrom: blue
 colorTo: purple
 license: mit
 ---
+# 🎤 AI-RVC 一键 AI 翻唱
+基于 RVC v2 的一键 AI 翻唱系统，自动完成人声分离、音色转换、混音合成全流程。
 ## 功能特点
 - **AI 歌曲翻唱**：上传歌曲自动分离人声、转换音色、混合伴奏，一键生成翻唱
 - **人声分离**：默认 Mel-Band Roformer (KimberleyJensen)，在 MVSEP 公开 Multisong 指标中为 Vocals SDR 11.01 / Instrum SDR 17.32
+- **音色转换**：RVC v2 架构 + FAISS 检索增强流程
 - **RMVPE 音高提取**：高精度 F0 提取，噪声鲁棒性强
 - **角色模型**：内置 117 个可下载角色模型
 - **混音效果**：支持人声混响、音量调节、4 种混音预设
               ↓
           人声分离 (Mel-Band Roformer)
               ↓
+          RVC 音色转换 (HuBERT + RMVPE + FAISS)
               ↓
           混音 (音量调节 + 混响)
               ↓
 **License**: MIT
 **Version**: 2.0
+**Last Updated**: 2026-03-15

app.py CHANGED Viewed

@@ -7,19 +7,26 @@ import os
 import sys
 from pathlib import Path
 ROOT_DIR = Path(__file__).parent
 sys.path.insert(0, str(ROOT_DIR))
 os.environ["GRADIO_SERVER_NAME"] = "0.0.0.0"
 os.environ["GRADIO_SERVER_PORT"] = "7860"
-from ui.app import create_ui
-app = create_ui()
-app.queue()
-app.launch(
-    server_name="0.0.0.0",
-    server_port=7860,
-    share=False,
-    inbrowser=False,
-)

 import sys
 from pathlib import Path
+# 添加项目根目录到路径
 ROOT_DIR = Path(__file__).parent
 sys.path.insert(0, str(ROOT_DIR))
+from lib.ffmpeg_runtime import configure_ffmpeg_runtime
+configure_ffmpeg_runtime()
+# 设置环境变量
 os.environ["GRADIO_SERVER_NAME"] = "0.0.0.0"
 os.environ["GRADIO_SERVER_PORT"] = "7860"
+# 导入并启动应用
+if __name__ == "__main__":
+    from ui.app import launch
+    # 启动 Gradio 界面
+    launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        inbrowser=False
+    )

assets/weights/characters/_version_notes.json ADDED Viewed

	@@ -0,0 +1,120 @@

+{
+  "Aimi": "200 epochs · 48k",
+  "hanayo": "250 epochs · 40k",
+  "rin": "250 epochs · 40k",
+  "umi": "250 epochs · 40k",
+  "nozomi": "250 epochs · 48k",
+  "dia": "250 epochs · 40k",
+  "ayumu": "250 epochs · 40k",
+  "chika": "500 epochs · 40k",
+  "riko": "500 epochs · 40k",
+  "mari": "400 epochs · RVC v2",
+  "yohane": "500 epochs · 48k",
+  "nico": "300 epochs · 40k",
+  "kanata": "250 epochs · 40k",
+  "setsuna_yuki_nana": "300 epochs · 48k",
+  "hanamaru_zurakichi": "195 epochs · 40k",
+  "kanan": null,
+  "yu_takasaki": null,
+  "shizuku_osaka": null,
+  "sarah_kazuno": "500 epochs · 40k",
+  "leah_kazuno": "500 epochs · 40k",
+  "eli": "350 epochs · 40k",
+  "you": "400 epochs · 40k",
+  "honoka": "1000 epochs · 48k",
+  "kotori": "309 epochs · 48k",
+  "maki": "135 epochs · 40k",
+  "ruby": "207 epochs · 48k",
+  "kasumi": "414 epochs · 48k",
+  "karin": "1000 epochs · 48k",
+  "rina": "248 epochs · 48k",
+  "lanzhu": "248 epochs · 48k",
+  "keke": "151 epochs · 48k",
+  "ai_miyashita": "400 epochs · 32k",
+  "emma_verde": "450 epochs · 32k",
+  "shioriko_mifune": "500 epochs · 32k",
+  "chisato_arashi": "450 epochs · 32k",
+  "ren_hazuki": "400 epochs · 32k",
+  "sumire_heanna": "400 epochs · 32k",
+  "kinako_sakurakoji": "400 epochs · 32k",
+  "mei_yoneme": "400 epochs · 32k",
+  "shiki_wakana": "400 epochs · 32k",
+  "natsumi_onitsuka": "450 epochs · 32k",
+  "sayaka_murano": "400 epochs · 32k",
+  "tsuzuri_yugiri": "400 epochs · 32k",
+  "hanamaru": "410 epochs · 40k",
+  "setsuna_yuki_og": "910 epochs · 40k",
+  "setsuna_yuki_coco": "555 epochs · 48k",
+  "hanayo_pre_anime_v0": "222 epochs · 40k",
+  "hanayo_pre_anime_v1": "286 epochs · 48k",
+  "kotori_pre_anime": "216 epochs · 48k",
+  "kotori_pre_anime_v2": "401 epochs · 40k",
+  "wien_margarete": "453 epochs · 40k",
+  "mia_taylor": "285 epochs · 40k",
+  "anju_yuki_arise": "446 epochs · 40k",
+  "erena_todo_arise": "480 epochs · 40k",
+  "tsubasa_kira_arise": "729 epochs · 40k",
+  "kanon": "150 epochs · 40k",
+  "setsuna": "504 epochs · 48k",
+  "setsuna_v2": "302 epochs · 48k",
+  "chika_v2": "453 epochs · 48k",
+  "dia_v2": "250 epochs · 40k",
+  "hanamaru_v2": "229 epochs · 48k",
+  "honoka_v2": "214 epochs · 48k",
+  "ayumu_v2": "214 epochs · 48k",
+  "ayumu_v3": "250 epochs · 40k",
+  "ayumu_v4": "250 epochs · 40k",
+  "karin_v2": "301 epochs · 48k",
+  "keke_v2": "102 epochs · 48k",
+  "riko_v2": "346 epochs · 48k",
+  "umi_v2": "250 epochs · 40k",
+  "wien_v2": "257 epochs · 48k",
+  "yohane_trios": "155 epochs · 48k",
+  "yoshiko_trios": "20 epochs · 48k",
+  "yoshiko_v2": "211 epochs · 48k",
+  "tokai_teio": "570 epochs · RVC v2",
+  "focalors": "940 epochs · 48k",
+  "essex": "925 epochs · 40k",
+  "ellie": "40k",
+  "furina": "170 epochs · 40k",
+  "ayaka": "300 epochs · 40k",
+  "takane_shijou": "565 epochs · 40k",
+  "kobo": "500 epochs · 48k",
+  "kaela": "400 epochs · 48k",
+  "pekora": "600 epochs · 48k",
+  "kizuna_ai": "300 epochs · 40k",
+  "fuwawa": "250 epochs · 48k",
+  "mococo": "250 epochs · 48k",
+  "vo_furina_kr": "300 epochs · 48k",
+  "silverwolf_kr": "300 epochs · 48k",
+  "sparkle_e40": "40 epochs · 40k",
+  "ranko": "250 epochs · 40k",
+  "yumemiriamu": "250 epochs · 40k",
+  "hatsune_miku": null,
+  "nahida": null,
+  "nilou": null,
+  "herta_hsr": null,
+  "the_herta": null,
+  "firefly_hsr": null,
+  "tingyun_hsr": null,
+  "yunli_hsr": null,
+  "tribbie_hsr": null,
+  "huohuo_hsr": null,
+  "castorice_hsr": null,
+  "seele_hi3": null,
+  "herrscher_ego_hi3": null,
+  "miyabi_zzz": null,
+  "nene_kusanagi": null,
+  "miko_sakura": null,
+  "subaru_oozora": null,
+  "moona_hoshinova": null,
+  "risu_ayunda": null,
+  "reine_pavolia": null,
+  "zeta_vestia": null,
+  "anya_melfissa": null,
+  "luna_himemori": null,
+  "boothill": null,
+  "shiroko_rosmontis": null,
+  "rice_shower": null,
+  "suwawa": "211 epochs · 40k"
+}

assets/weights/characters/mari/metadata.json ADDED Viewed

	@@ -0,0 +1,177 @@

+{
+  "title": "Mari Ohara (Love Live! Sunshine!!)  - RVC V2 (400 Epoch)",
+  "author": {
+    "name": "natehitsvtec",
+    "discordUserId": "641787156863385610"
+  },
+  "md5": "ff4bd8e126ab82a8145793f02db0cfee",
+  "uploadedAt": "2023-06-11T01:22:26.820Z",
+  "weightsLink": "https://www.weights.gg/models/clm72zu941aj1cctco2oj26p1",
+  "id": "clm72zu941aj1cctco2oj26p1",
+  "type": "v2",
+  "tags": [
+    "Anime Character",
+    "Fictional Character",
+    "Artist",
+    "RVC v2"
+  ],
+  "description": "400 Epoch\nhttps://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view?usp=sharing\n\nTested languages: Japanese, Korean, English\n\nNew version trained up to 400 epoch, revised the dataset to 25 songs down from 33, but made it much clearer and removed excessive silence and other parts with echo or reverb. I did not use SIFAS audio for my dataset because it ended up being too high pitched, I believe she speaks with a lot of exaggeration in the game. Unsure if I will train it more but I probably will eventually.\n\nPrevious 200 Epoch (old dataset)\nhttps://drive.google.com/file/d/1uBmoKluknMOWKivjfHcbu7YkOMtjoclH/view?usp=sharing",
+  "samples": [],
+  "files": [
+    {
+      "name": "model.index",
+      "size": 812116059,
+      "md5": "be135619c9fc2e04d0d9bf384f9eef11"
+    },
+    {
+      "name": "model.pth",
+      "size": 55227869,
+      "md5": "ff4bd8e126ab82a8145793f02db0cfee"
+    }
+  ],
+  "torchMetadata": {
+    "config": {
+      "spec_channels": 1025,
+      "segment_size": 32,
+      "inter_channels": 192,
+      "hidden_channels": 192,
+      "filter_channels": 768,
+      "n_heads": 2,
+      "n_layers": 6,
+      "kernel_size": 3,
+      "p_dropout": 0,
+      "resblock": "1",
+      "resblock_kernel_sizes": [
+        3,
+        7,
+        11
+      ],
+      "resblock_dilation_sizes": [
+        [
+          1,
+          3,
+          5
+        ],
+        [
+          1,
+          3,
+          5
+        ],
+        [
+          1,
+          3,
+          5
+        ]
+      ],
+      "upsample_rates": [
+        10,
+        10,
+        2,
+        2
+      ],
+      "upsample_initial_channel": 512,
+      "upsample_kernel_sizes": [
+        16,
+        16,
+        4,
+        4
+      ],
+      "emb_channels": null,
+      "spk_embed_dim": 109,
+      "gin_channels": 256,
+      "sr": 40000
+    },
+    "f0": 1,
+    "version": "v2",
+    "extra_info": {
+      "config": [
+        1025,
+        32,
+        192,
+        192,
+        768,
+        2,
+        6,
+        3,
+        0,
+        "1",
+        [
+          3,
+          7,
+          11
+        ],
+        [
+          [
+            1,
+            3,
+            5
+          ],
+          [
+            1,
+            3,
+            5
+          ],
+          [
+            1,
+            3,
+            5
+          ]
+        ],
+        [
+          10,
+          10,
+          2,
+          2
+        ],
+        512,
+        [
+          16,
+          16,
+          4,
+          4
+        ],
+        109,
+        256,
+        40000
+      ],
+      "info": "400epoch",
+      "sr": "40k",
+      "f0": 1,
+      "version": "v2"
+    }
+  },
+  "url": "https://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view",
+  "urls": [
+    "https://huggingface.co/realmongx2/m0delz/resolve/main/mariohara_e400.zip?download=true",
+    "https://drive.google.com/file/d/1x5aZjlYea_HfaHNHnZCpqaKmU4N5J-1X/view?usp=sharing",
+    "https://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view?usp=sharing",
+    "https://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view?usp=drive_link",
+    "https://drive.google.com/file/d/1rJ3a4NJ-3MQ8Vv240-hhaP2bO1irbA9h/view?usp=sharing",
+    "https://drive.google.com/file/d/1rJ3a4NJ-3MQ8Vv240-hhaP2bO1irbA9h/view?usp=drive_link",
+    "https://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view?usp=drivesdk",
+    "https://drive.google.com/file/d/1Pklo3gYLRM8_wG0XZvwf_lJJnsDgZhry/view?usp=drive_link",
+    "https://drive.google.com/file/d/1tKgz422yovSPvuKHDuSETNYRQNq2wIoT/view?usp=sharing",
+    "https://drive.google.com/file/d/1AVYkFKf3SLkUhQqiJYWiqluc7KAnR2cC/view?usp=sharing",
+    "https://drive.google.com/file/d/1AVYkFKf3SLkUhQqiJYWiqluc7KAnR2cC/view?usp=drive_link",
+    "https://drive.google.com/file/d/1YLJ2GXYvbGQ2mCyE5KjF2z_6EAI8gE2E/view?usp=sharing",
+    "https://drive.google.com/file/d/1uBmoKluknMOWKivjfHcbu7YkOMtjoclH/view?usp=sharing",
+    "https://drive.google.com/file/d/1YLJ2GXYvbGQ2mCyE5KjF2z_6EAI8gE2E/view?usp=drive_link",
+    "https://drive.google.com/file/d/1hRcWKrDz1gEbEbmSjyVzthsJG0dIbJi8/view?usp=sharing",
+    "https://drive.google.com/file/d/1hRcWKrDz1gEbEbmSjyVzthsJG0dIbJi8/view?usp=drive_link",
+    "https://drive.google.com/file/d/1ou8SVFKN0ifqljKgufOAqZdjq7qvSk2e/view?usp=drive_link",
+    "https://drive.google.com/file/d/1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk/view",
+    "https://drive.google.com/file/d/1uBmoKluknMOWKivjfHcbu7YkOMtjoclH/view",
+    "https://drive.google.com/file/d/1YLJ2GXYvbGQ2mCyE5KjF2z_6EAI8gE2E/view",
+    "https://drive.google.com/file/d/1hRcWKrDz1gEbEbmSjyVzthsJG0dIbJi8/view",
+    "https://drive.google.com/file/d/1ou8SVFKN0ifqljKgufOAqZdjq7qvSk2e/view",
+    "https://drive.google.com/uc?id=1DUEN3draGpyKlM3Z0moUQwi6p-2Oq3Vk",
+    "https://drive.google.com/file/d/1rJ3a4NJ-3MQ8Vv240-hhaP2bO1irbA9h/view",
+    "https://drive.google.com/file/d/1Pklo3gYLRM8_wG0XZvwf_lJJnsDgZhry/view",
+    "https://drive.google.com/file/d/1tKgz422yovSPvuKHDuSETNYRQNq2wIoT/view",
+    "https://drive.google.com/file/d/1AVYkFKf3SLkUhQqiJYWiqluc7KAnR2cC/view"
+  ],
+  "originalFileList": [
+    "mariohara_e400/added_IVF6591_Flat_nprobe_1_v2.index",
+    "mariohara_e400/mariohara_v3_e400.pth"
+  ]
+}

assets/weights/characters/tokai_teio/metadata.json ADDED Viewed

	@@ -0,0 +1,147 @@

+{
+  "title": "Tokai Teio (Uma Musume) (RVC V2) (570 epochs)",
+  "author": {
+    "name": "realhijack",
+    "discordUserId": "903192512234127360"
+  },
+  "md5": "27ac7aee0860b447ed98553b84de43a7",
+  "uploadedAt": "2023-07-21T11:01:53.209Z",
+  "weightsLink": "https://www.weights.gg/models/clm72thvr0rf8cctcy1jn4no4",
+  "id": "clm72thvr0rf8cctcy1jn4no4",
+  "type": "v2",
+  "tags": [
+    "Anime Character",
+    "Fictional Character",
+    "RVC v2"
+  ],
+  "description": "https://huggingface.co/SANSSWEEP/rvc/resolve/main/TOKAITEIO.zip\nMADE BY <@299907871640911872> !!!\ncomm me",
+  "samples": [],
+  "files": [
+    {
+      "name": "model.index",
+      "size": 557899019,
+      "md5": "6bc29a4dc19d5f44cec8ee4cfa59f6b7"
+    },
+    {
+      "name": "model.pth",
+      "size": 57581999,
+      "md5": "27ac7aee0860b447ed98553b84de43a7"
+    }
+  ],
+  "torchMetadata": {
+    "config": {
+      "spec_channels": 1025,
+      "segment_size": 32,
+      "inter_channels": 192,
+      "hidden_channels": 192,
+      "filter_channels": 768,
+      "n_heads": 2,
+      "n_layers": 6,
+      "kernel_size": 3,
+      "p_dropout": 0,
+      "resblock": "1",
+      "resblock_kernel_sizes": [
+        3,
+        7,
+        11
+      ],
+      "resblock_dilation_sizes": [
+        [
+          1,
+          3,
+          5
+        ],
+        [
+          1,
+          3,
+          5
+        ],
+        [
+          1,
+          3,
+          5
+        ]
+      ],
+      "upsample_rates": [
+        12,
+        10,
+        2,
+        2
+      ],
+      "upsample_initial_channel": 512,
+      "upsample_kernel_sizes": [
+        24,
+        20,
+        4,
+        4
+      ],
+      "emb_channels": null,
+      "spk_embed_dim": 109,
+      "gin_channels": 256,
+      "sr": 48000
+    },
+    "f0": 1,
+    "version": "v2",
+    "extra_info": {
+      "config": [
+        1025,
+        32,
+        192,
+        192,
+        768,
+        2,
+        6,
+        3,
+        0,
+        "1",
+        [
+          3,
+          7,
+          11
+        ],
+        [
+          [
+            1,
+            3,
+            5
+          ],
+          [
+            1,
+            3,
+            5
+          ],
+          [
+            1,
+            3,
+            5
+          ]
+        ],
+        [
+          12,
+          10,
+          2,
+          2
+        ],
+        512,
+        [
+          24,
+          20,
+          4,
+          4
+        ],
+        109,
+        256,
+        48000
+      ],
+      "info": "570epoch",
+      "sr": "48k",
+      "f0": 1,
+      "version": "v2"
+    }
+  },
+  "url": "https://huggingface.co/SANSSWEEP/rvc/resolve/main/TOKAITEIO.zip",
+  "originalFileList": [
+    "New folder (3)/TokaiTeio_e570_s37620.pth",
+    "New folder (3)/added_IVF4528_Flat_nprobe_1_TokaiTeio_v2.index"
+  ]
+}

check_deecho_config.py ADDED Viewed

	@@ -0,0 +1,130 @@

+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""
+回声处理配置验证脚本
+检查当前配置是否正确启用了激进的去回声处理
+"""
+import json
+import os
+from pathlib import Path
+def check_deecho_models():  # Print a per-model presence report for assets/uvr5_weights; returns True only if every entry exists.
+    """检查 DeEcho 模型是否存在"""  # English: "Check whether the DeEcho models exist."
+    print("=" * 60)
+    print("检查 DeEcho 模型")
+    print("=" * 60)
+    models_dir = Path("assets/uvr5_weights")  # relative path: resolved against the current working directory
+    required_models = [
+        "VR-DeEchoDeReverb.pth",
+        "VR-DeEchoAggressive.pth",
+        "VR-DeEchoNormal.pth",
+        "onnx_dereverb_By_FoxJoy"  # no extension; reported with the "(目录)" (directory) label below when it is not a regular file
+    ]
+    all_found = True
+    for model in required_models:
+        model_path = models_dir / model
+        exists = model_path.exists()
+        status = "[OK]" if exists else "[NO]"
+        size = ""  # stays empty for missing entries
+        if exists:
+            if model_path.is_file():
+                size_mb = model_path.stat().st_size / (1024 * 1024)  # bytes -> MiB
+                size = f" ({size_mb:.1f} MB)"
+            else:
+                size = " (目录)"  # existing non-file entry; no size is computed for it
+        print(f"{status} {model}{size}")
+        if not exists:
+            all_found = False  # keep looping so every missing entry is still listed
+    print()
+    if all_found:
+        print("[OK] All DeEcho models ready")
+    else:
+        print("[NO] Missing models, run: python tools/download_models.py")
+    return all_found  # bool: True when all four entries exist
+def check_config():  # Compare selected "cover" settings in configs/config.json with expected values; returns True only if all match.
+    """检查配置文件"""  # English: "Check the configuration file."
+    print("\n" + "=" * 60)
+    print("检查配置文件")
+    print("=" * 60)
+    config_path = Path("configs/config.json")  # relative path: resolved against the current working directory
+    with open(config_path, 'r', encoding='utf-8') as f:  # NOTE(review): not guarded, so a missing or unreadable file raises here instead of printing a [NO] line
+        config = json.load(f)
+    cover_config = config.get("cover", {})  # an absent "cover" section makes every lookup below return None
+    # Check the key settings
+    checks = [  # each entry: (display label, config key, expected value, actual value)
+        ("VC 预处理模式", "vc_preprocess_mode", "auto", cover_config.get("vc_preprocess_mode")),
+        ("源约束模式", "source_constraint_mode", "auto", cover_config.get("source_constraint_mode")),
+        ("Karaoke 分离", "karaoke_separation", True, cover_config.get("karaoke_separation")),
+        ("索引率", "index_rate", 0.50, cover_config.get("index_rate")),
+        ("保护系数", "protect", 0.33, cover_config.get("protect")),
+    ]
+    all_correct = True
+    for name, key, expected, actual in checks:
+        match = actual == expected  # exact equality, including for the float-valued settings
+        status = "[OK]" if match else "[NO]"
+        print(f"{status} {name} ({key}): {actual} {'==' if match else '!='} {expected}")
+        if not match:
+            all_correct = False  # keep looping so every mismatch is reported
+    print()
+    if all_correct:
+        print("[OK] Config optimized for aggressive deecho")
+    else:
+        print("[WARN] Some configs not optimized")
+    return all_correct  # bool: True when all five settings equal their expected values
+def print_recommendations():  # Print a fixed block of usage advice; takes no input and returns None.
+    """打印使用建议"""  # English: "Print usage recommendations."
+    print("\n" + "=" * 60)
+    print("使用建议")
+    print("=" * 60)
+    print("""
+1. 当前配置使用自动模式：
+   - 优先使用 UVR DeEcho 模型，缺模型时回退到算法去混响
+   - DeEcho 质量好时跳过 blend 直接使用，避免混回原始回音
+   - 源约束仅在去过回音的预处理下自动启用
+2. 如果回声仍然明显，可以尝试：
+   - 在 UI 中调整"索引率"（降低到 0.2-0.3）
+   - 在 UI 中调整"保护系数"（降低到 0.2-0.25）
+   - 使用更高质量的输入音频
+3. 处理流程：
+   原始音频 → Karaoke 分离 → UVR DeEcho(或算法去混响) → RVC 转换 → 输出
+4. 测试建议：
+   - 选择一首有明显回声的歌曲
+   - 查看日志中 DeEcho quality 指标
+   - 对比处理前后的回声强度
+""")  # static text: printed unchanged regardless of what the model/config checks found
+def main():  # Script entry point: run the model check and the config check, print the advice, then a one-line verdict.
+    print("\n回声处理配置验证\n")
+    models_ok = check_deecho_models()
+    config_ok = check_config()  # runs even when models are missing, so both reports are always printed
+    print_recommendations()
+    print("\n" + "=" * 60)
+    if models_ok and config_ok:
+        print("[OK] System ready for testing")
+    else:
+        print("[WARN] Please fix issues above")
+    print("=" * 60 + "\n")  # the verdict is only printed; main() returns None and sets no exit status
+if __name__ == "__main__":  # run the checks only when executed as a script, not on import
+    main()

configs/config.json CHANGED Viewed

@@ -1,4 +1,5 @@
 {
     "device": "cuda",
     "sample_rate": 48000,
     "hop_length": 160,
@@ -29,8 +30,9 @@
     },
     "cover": {
         "separator": "roformer",
         "karaoke_separation": true,
-        "karaoke_model": "mel_band_roformer_karaoke_gabox.ckpt",
         "karaoke_merge_backing_into_accompaniment": true,
         "uvr5_model": "HP2_all_vocals",
         "uvr5_agg": 10,
@@ -42,10 +44,10 @@
         "demucs_split": true,
         "f0_method": "hybrid",
         "disable_chunking": false,
-        "index_rate": 0.70,
         "filter_radius": 3,
         "rms_mix_rate": 0.0,
-        "protect": 0.50,
         "speaker_id": 0,
         "hubert_layer": 12,
         "silence_gate": false,
@@ -60,18 +62,20 @@
         "f0_min": 50,
         "f0_max": 1100,
         "f0_energy_threshold_db": -50,
-        "f0_hybrid_mode": "off",
         "crepe_pd_threshold": 0.05,
-        "crepe_force_ratio": 0.0,
         "crepe_replace_semitones": 3.0,
         "f0_stabilize": false,
         "f0_stabilize_window": 2,
         "f0_stabilize_max_semitones": 6.0,
         "f0_stabilize_octave": true,
         "f0_rate_limit": false,
         "f0_rate_limit_semitones": 12.0,
-        "vc_preprocess_mode": "uvr_deecho",
-        "source_constraint_mode": "on",
         "vc_pipeline_mode": "current",
         "singing_repair": false,
         "reverb_reapply": true,
@@ -81,10 +85,12 @@
     "f0_min": 50,
     "f0_max": 1100,
     "f0_energy_threshold_db": -60,
-    "f0_hybrid_mode": "off",
     "crepe_pd_threshold": 0.1,
-    "crepe_force_ratio": 0.05,
     "crepe_replace_semitones": 0.0,
     "f0_stabilize": false,
     "f0_stabilize_window": 2,
     "f0_stabilize_max_semitones": 4.0,

 {
+    "language": "zh_CN",
     "device": "cuda",
     "sample_rate": 48000,
     "hop_length": 160,
     },
     "cover": {
         "separator": "roformer",
+        "roformer_model": "ensemble:vocal_rvc",
         "karaoke_separation": true,
+        "karaoke_model": "ensemble:karaoke",
         "karaoke_merge_backing_into_accompaniment": true,
         "uvr5_model": "HP2_all_vocals",
         "uvr5_agg": 10,
         "demucs_split": true,
         "f0_method": "hybrid",
         "disable_chunking": false,
+        "index_rate": 0.50,
         "filter_radius": 3,
         "rms_mix_rate": 0.0,
+        "protect": 0.33,
         "speaker_id": 0,
         "hubert_layer": 12,
         "silence_gate": false,
         "f0_min": 50,
         "f0_max": 1100,
         "f0_energy_threshold_db": -50,
+        "f0_hybrid_mode": "fallback",
         "crepe_pd_threshold": 0.05,
+        "crepe_force_ratio": 1.0,
         "crepe_replace_semitones": 3.0,
+        "unvoiced_feature_gate_floor": 0.28,
+        "breath_active_margin_db": 52.0,
         "f0_stabilize": false,
         "f0_stabilize_window": 2,
         "f0_stabilize_max_semitones": 6.0,
         "f0_stabilize_octave": true,
         "f0_rate_limit": false,
         "f0_rate_limit_semitones": 12.0,
+        "vc_preprocess_mode": "auto",
+        "source_constraint_mode": "auto",
         "vc_pipeline_mode": "current",
         "singing_repair": false,
         "reverb_reapply": true,
     "f0_min": 50,
     "f0_max": 1100,
     "f0_energy_threshold_db": -60,
+    "f0_hybrid_mode": "fallback",
     "crepe_pd_threshold": 0.1,
+    "crepe_force_ratio": 1.0,
     "crepe_replace_semitones": 0.0,
+    "unvoiced_feature_gate_floor": 0.28,
+    "breath_active_margin_db": 52.0,
     "f0_stabilize": false,
     "f0_stabilize_window": 2,
     "f0_stabilize_max_semitones": 4.0,

configs/presets/balanced.json CHANGED Viewed

@@ -13,8 +13,8 @@
     "f0_stabilize": true,
     "f0_stabilize_window": 3,
     "f0_stabilize_max_semitones": 3.0,
-    "vc_preprocess_mode": "uvr_deecho",
-    "source_constraint_mode": "on",
     "uvr5_agg": 10
   }
 }

     "f0_stabilize": true,
     "f0_stabilize_window": 3,
     "f0_stabilize_max_semitones": 3.0,
+    "vc_preprocess_mode": "auto",
+    "source_constraint_mode": "auto",
     "uvr5_agg": 10
   }
 }

configs/presets/clarity_priority.json CHANGED Viewed

@@ -13,8 +13,8 @@
     "f0_stabilize": false,
     "f0_stabilize_window": 2,
     "f0_stabilize_max_semitones": 2.0,
-    "vc_preprocess_mode": "uvr_deecho",
-    "source_constraint_mode": "on",
     "uvr5_agg": 8
   }
 }

     "f0_stabilize": false,
     "f0_stabilize_window": 2,
     "f0_stabilize_max_semitones": 2.0,
+    "vc_preprocess_mode": "auto",
+    "source_constraint_mode": "auto",
     "uvr5_agg": 8
   }
 }

configs/presets/timbre_priority.json CHANGED Viewed

@@ -13,8 +13,8 @@
     "f0_stabilize": true,
     "f0_stabilize_window": 5,
     "f0_stabilize_max_semitones": 4.0,
-    "vc_preprocess_mode": "uvr_deecho",
-    "source_constraint_mode": "on",
     "uvr5_agg": 12
   }
 }

     "f0_stabilize": true,
     "f0_stabilize_window": 5,
     "f0_stabilize_max_semitones": 4.0,
+    "vc_preprocess_mode": "auto",
+    "source_constraint_mode": "auto",
     "uvr5_agg": 12
   }
 }

docs/Colab演示.png ADDED Viewed

Git LFS Details

SHA256: afed65f6707b2b83f4e0a30a159f3c1f3e74ed9d3b9809819ec633dfc5181338
Pointer size: 131 Bytes
Size of remote file: 236 kB

docs/Windows界面.png ADDED Viewed

Git LFS Details

SHA256: f3f221fd5ffe4b646f8fa4e19108b09f104918db96849cf5b4a5981e8b89d352
Pointer size: 131 Bytes
Size of remote file: 225 kB

docs/plans/2026-03-30-portable-ffmpeg-hardening.md ADDED Viewed

	@@ -0,0 +1,74 @@

+# Portable FFmpeg Hardening Implementation Plan
+> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
+**Goal:** Make portable builds self-contained with bundled ffmpeg and ensure runtime code prefers the bundled binary instead of requiring a system installation.
+**Architecture:** Add a dedicated runtime helper that resolves bundled ffmpeg paths, prepends them to `PATH`, and sets explicit environment variables. Update GitHub build workflows to copy platform ffmpeg binaries into the packaged tree, and add regression tests that lock both the runtime helper and packaging contract.
+**Tech Stack:** Python, GitHub Actions, PyInstaller, pytest
+---
+### Task 1: Add failing tests for portable ffmpeg runtime
+**Files:**
+- Create: `tests/test_ffmpeg_runtime.py`
+**Step 1: Write the failing test**
+- Assert a bundled ffmpeg directory is preferred over system lookup.
+- Assert runtime setup prepends the bundled directory to `PATH`.
+**Step 2: Run test to verify it fails**
+- Run: `pytest -q tests/test_ffmpeg_runtime.py`
+**Step 3: Write minimal implementation**
+- Add the helper module and call it from `run.py`.
+**Step 4: Run test to verify it passes**
+- Run: `pytest -q tests/test_ffmpeg_runtime.py`
+**Step 5: Commit**
+- Commit helper + test once green.
+### Task 2: Remove remaining hard dependency on `ffprobe`
+**Files:**
+- Modify: `infer/modules/uvr5/modules.py`
+**Step 1: Write the failing test**
+- Assert the module no longer calls `ffmpeg.probe(..., cmd="ffprobe")`.
+**Step 2: Run test to verify it fails**
+- Run: `pytest -q tests/test_ffmpeg_runtime.py`
+**Step 3: Write minimal implementation**
+- Replace the probe path with a library-based metadata read that does not shell out to `ffprobe`.
+**Step 4: Run tests to verify they pass**
+- Run: `pytest -q tests/test_ffmpeg_runtime.py`
+**Step 5: Commit**
+- Commit metadata-path cleanup.
+### Task 3: Bundle ffmpeg in GitHub release builds
+**Files:**
+- Modify: `.github/workflows/build-executables.yml`
+- Modify: `AI-RVC.spec`
+**Step 1: Write the failing test**
+- Assert that the workflow's build steps copy a bundled ffmpeg directory and package it.
+**Step 2: Run test to verify it fails**
+- Run: `pytest -q tests/test_ffmpeg_runtime.py`
+**Step 3: Write minimal implementation**
+- Add a workflow step to copy `ffmpeg` into `tools/ffmpeg/bin`.
+- Add the ffmpeg directory to packaged data inputs for both workflow and spec.
+**Step 4: Run tests to verify they pass**
+- Run: `pytest -q tests/test_ffmpeg_runtime.py`
+**Step 5: Commit**
+- Commit workflow/spec hardening.

docs/reports/2026-03-24-cover-quality-final-root-cause-report.md ADDED Viewed

	@@ -0,0 +1,127 @@

+# 2026-03-24 Cover Quality Final Root Cause Report
+## 结论
+这次问题不是单一模块失效，而是三层策略在同一条链路上叠加失衡：
+1. `hybrid` 在官方 VC 路径里实际被解释成激进的 `RMVPE + CREPE` 混合。
+2. 源约束在活跃唱段对高能量回声压制不足。
+3. 后处理又把低能量无声段、呼吸段、换气段压得过重。
+最终结果就是：
+- `SILENT PAIN` 这类回声更重、主唱和和声交叠更强的输入，会在活跃唱段残留明显回声。
+- 两首歌都会在呼吸、回气、擦音附近出现电音感、机械感，因为这些段落先被错误赋音高，再被后处理继续削薄。
+## 严格根因
+### 1. `hybrid` 路由过于激进
+已确认官方 VC 路径里的 `hybrid` 请求会进入 `rmvpe+crepe` 方向，而不是“仅在短掉线时保守兜底”。
+影响：
+- RMVPE 本来判为无声或无调性的段落，CREPE 仍可能给出“看起来有音高”的结果。
+- 呼吸、换气、擦音、气声尾音会被错误写入 F0，随后进入 RVC 合成器，听起来就是电音化、机械化。
+这条根因比“RMVPE 本身坏了”更准确。RMVPE 不是主问题，激进混合策略才是主问题。
+### 2. 活跃唱段的源约束压制不够
+之前的源约束主公式本质上更偏向“低活动段清理”，对活跃唱段内的高能量回声缺少持续压力。
+影响：
+- 低能量尾部回声能被抑制。
+- 但高音量、贴着主唱本体的回声伪影更容易穿过去。
+这也解释了为什么 `SILENT PAIN` 更差：它不是单纯“DeEcho 没工作”，而是输入本身更容易在活跃唱段把残留回声带进 VC，再被较宽松的后续预算放过去。
+### 3. 后处理对呼吸段过度清理
+后处理链路里存在两次对低能量段的再压制：
+- silence gate / unvoiced cleanup
+- source gap suppression
+原策略过于激进时，会把真正的呼吸、回气、气声辅音也看成“应被抑制的无声残留”。
+影响：
+- 如果前面 F0 已经有误判，这里会把真实呼吸进一步压没，只剩下 VC 合成残留。
+- 听感上就会更像电音噪点，而不是自然的呼吸。
+## 两份旧诊断中需要收紧的地方
+以下判断方向是对的，但需要更严格表述：
+- “高音量回声去不掉”并不等于 UVR DeEcho 完全失效。
+  实际上，DeEcho 对低能量尾部仍有效，主要问题在于后续源约束对活跃唱段压力不足。
+- “呼吸电音”也不应归结为单一的 silence gate。
+  更准确的说法是：激进 F0 混合先污染了呼吸段，后处理再把真实气声继续削弱，二者叠加后才变成明显机械感。
+## 已实施修复
+### A. 将 `hybrid` 改为保守策略
+已新增统一策略层 `infer/quality_policy.py`，并接入：
+- `infer/official_adapter.py`
+- `infer/f0_extractor.py`
+- `infer/cover_pipeline.py`
+修复后：
+- 官方 VC 请求 `hybrid` 时，不再直接走激进混合。
+- 主转换统一路由到 `RMVPE`。
+- 只有在短、局部、可信、且处于有声上下文的掉线帧上，才允许 CREPE 参与修补。
+- 后处理门控对 `hybrid` 也统一按 `RMVPE` 语义执行，避免前后语义不一致。
+### B. 强化活跃唱段的源约束
+已把源约束从“只对低活动段强约束”改成“在活跃且疑似回声重叠的帧上，也保留有效替换压力”。
+同时收紧了后续预算：
+- 活跃段不再允许旧逻辑那样过宽的能量放行。
+- 全局活动 RMS 也做了更对称、更保守的重新裁剪。
+目标不是把人声压瘪，而是减少活跃唱段里本不该存在的额外回声包络。
+### C. 放松对呼吸段的误杀
+已降低后处理对低能量段的攻击性：
+- `source gap suppression` 触发更谨慎。
+- 衰减深度减轻。
+- `uvr_deecho` 路由下的二次 silence gate 更保守，且加入保护系数。
+目标是保留自然呼吸、回气和轻辅音，不再把它们当成必须清掉的伪影。
+## 预期效果
+修复后预期是：
+- `SILENT PAIN` 这类高回声输入，在主唱活跃区的残留回声会明显减轻，但不会靠极端静音门去“硬切”。
+- `Far far away` 这类本来底子更好的输入，呼吸和回气会更自然，机械感应下降。
+- 如果模型本身训练音色或索引质量有限，仍可能保留一部分合成质感；这不是本次根因链的核心部分。
+## 已完成验证
+已完成本地轻量验证：
+- `python -m unittest tests.test_quality_policy`
+- `python -m py_compile infer\quality_policy.py infer\f0_extractor.py infer\official_adapter.py infer\cover_pipeline.py tests\test_quality_policy.py`
+结果：
+- 单测通过。
+- 语法编译通过。
+## 尚未完成的验证
+尚未在当前 shell 环境执行重型端到端音频回归，因为当前会话未发现可直接复用的项目虚拟环境解释器。
+因此，本报告对“代码修复已落地”是确定的；
+对“具体两首歌的最终听感改善幅度”仍建议在用户实际运行环境里再做一次 A/B 回归确认。

docs/superpowers/plans/2026-05-17-v1.2.1-language-release.md ADDED Viewed

	@@ -0,0 +1,90 @@

+# v1.2.1 Language Release Implementation Plan
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+**Goal:** Add a Chinese/English UI language selector and publish v1.2.1 from current local `master`.
+**Architecture:** Reuse the existing `i18n/*.json` pattern and `ui.app:t()` lookup. Locale is read from `configs/config.json` during startup and saved through a Gradio dropdown with an explicit restart-required status.
+**Tech Stack:** Python 3.10, Gradio 3.50.2, unittest/pytest-compatible tests, GitHub CLI, Hugging Face CLI.
+---
+### Task 1: Locale Configuration Tests
+**Files:**
+- Create: `tests/test_ui_language.py`
+- Modify: `ui/app.py`
+- Create: `i18n/en_US.json`
+- [ ] **Step 1: Write the failing tests**
+Create tests that import `ui.app`, assert default locale selection, reject unsupported locales, save language to a temp config file, and require English translation keys.
+- [ ] **Step 2: Run tests to verify they fail**
+Run: `python -m pytest tests/test_ui_language.py -q`
+Expected: fail because helper functions and English pack are not implemented.
+- [ ] **Step 3: Implement minimal locale support**
+Add `SUPPORTED_LANGUAGES`, config-driven `get_configured_language`, `save_language_setting`, and load `i18n` from the configured language. Add `i18n/en_US.json`.
+- [ ] **Step 4: Run tests to verify they pass**
+Run: `python -m pytest tests/test_ui_language.py -q`
+Expected: pass.
+### Task 2: Gradio UI Selector
+**Files:**
+- Modify: `ui/app.py`
+- Modify: `i18n/zh_CN.json`
+- Modify: `i18n/en_US.json`
+- [ ] **Step 1: Add selector labels to tests**
+Extend `tests/test_ui_language.py` to assert both language packs include `language`, `save_language`, and `language_saved_restart` settings keys.
+- [ ] **Step 2: Run tests to verify they fail**
+Run: `python -m pytest tests/test_ui_language.py -q`
+Expected: fail until translation keys exist.
+- [ ] **Step 3: Add selector UI**
+Place a compact language dropdown near the top and a matching Settings control. Saving updates config and returns restart-required text.
+- [ ] **Step 4: Run tests and import check**
+Run: `python -m pytest tests/test_ui_language.py tests/test_quality_policy.py::SourceRegressionTests::test_ui_only_exposes_strict_sota_vc_preprocess_modes -q`
+Run: `python -c "import ui.app; print(ui.app.get_configured_language())"`
+Expected: both commands exit 0.
+### Task 3: Release
+**Files:**
+- Git metadata and remote release state.
+- [ ] **Step 1: Inspect diff and status**
+Run: `git status -sb` and `git diff --stat`.
+- [ ] **Step 2: Commit language changes**
+Run: `git add docs/superpowers/specs/2026-05-17-v1.2.1-language-release-design.md docs/superpowers/plans/2026-05-17-v1.2.1-language-release.md tests/test_ui_language.py i18n/en_US.json i18n/zh_CN.json ui/app.py configs/config.json`
+Run: `git commit -m "ui: add language switcher"`
+- [ ] **Step 3: Push and release**
+Run: `git push origin master`
+Run: `gh release create v1.2.1 --target master --title "AI-RVC v1.2.1" --notes-file <release-notes-file>`
+- [ ] **Step 4: Sync Hugging Face Space**
+Run: `hf upload mason369/AI-RVC . --repo-type space --commit-message "Release v1.2.1" --exclude ...`
+- [ ] **Step 5: Check package workflow**
+Run: `gh run list --workflow "Build Executables" --limit 5`
+Wait for the v1.2.1 workflow and report release assets status.

docs/superpowers/specs/2026-05-17-v1.2.1-language-release-design.md ADDED Viewed

	@@ -0,0 +1,21 @@

+# v1.2.1 Language Release Design
+## Goal
+Release AI-RVC v1.2.1 from the current local `master` branch, adding a visible Chinese/English language selector and publishing matching GitHub and Hugging Face Space versions.
+## Approach
+The UI already centralizes most labels through `i18n/zh_CN.json` and `ui/app.py:t()`. Add `i18n/en_US.json`, store the selected locale in `configs/config.json`, and load that locale at startup. Because Gradio 3 builds component labels statically, changing the selector saves the preference and shows an explicit restart-required message instead of pretending to hot-swap all labels.
+## UI Behavior
+Add a language selector near the top of the app and in Settings. Choices are `中文` and `English`. Saving writes `language` to `configs/config.json`; unsupported locale values fail explicitly and do not silently fall back.
+## Release Behavior
+Publish from the current local `master`: run targeted tests, commit language changes, push `master`, create GitHub Release/tag `v1.2.1`, let the existing `Build Executables` workflow build portable packages, and sync the same source to Hugging Face Space `mason369/AI-RVC`.
+## Verification
+Use test-first coverage for locale config loading, language option validation, English pack existence, and save behavior. After implementation, run the targeted tests plus a `ui.app` import check before committing and publishing.

i18n/en_US.json ADDED Viewed

	@@ -0,0 +1,165 @@

+{
+  "app_title": "RVC AI Cover",
+  "app_description": "AI song cover system based on RVC v2",
+  "tabs": {
+    "models": "Model Management",
+    "cover": "Song Cover",
+    "settings": "Settings"
+  },
+  "ui": {
+    "cover_usage": "**One-click AI Cover**: upload a song → separate vocals → convert the voice → mix the accompaniment → export the cover\n\n**Steps:**\n1. Download a character model first (open \"Download Character Model\" below)\n2. Upload a song file (MP3/WAV/FLAC)\n3. Choose a downloaded character\n4. Adjust parameters and click \"Start Cover\"\n\n> ⚠️ First launch downloads the Mel-Band Roformer vocal separation model automatically (about 200 MB). Please wait patiently.",
+    "series_filter": "Series / Category",
+    "keyword_search": "Keyword Search",
+    "keyword_placeholder": "Character or series name",
+    "character_choice_info": "The list shows version tags, character ownership, and model source. In versions, epochs means training epochs, while 40k/48k means sample rate.",
+    "download_character_info": "The list shows version tags, character ownership, and source repositories so you can distinguish different models for the same character.",
+    "refresh_models": "Refresh Model List",
+    "download_selected_character": "Download Selected Character",
+    "download_series_all": "Download This Category",
+    "download_all_characters": "Download All Character Models",
+    "download_status": "Download Status",
+    "model_name": "Model Name",
+    "model_path": "Model Path",
+    "index_path": "Index Path",
+    "no_models": "(No models)",
+    "positive_pitch_info": "Positive raises pitch, negative lowers it",
+    "normal_volume_info": "100% keeps original volume",
+    "reverb_info": "Adds reverb to the vocals"
+  },
+  "conversion": {
+    "model_settings": "Model Settings",
+    "select_model": "Select Model",
+    "refresh": "Refresh",
+    "load_model": "Load Model",
+    "status": "Status",
+    "audio_input": "Audio Input",
+    "upload_audio": "Upload Audio",
+    "conversion_params": "Conversion Parameters",
+    "pitch_shift": "Pitch Shift (semitones)",
+    "pitch_shift_info": "Positive raises pitch, negative lowers it. Male to female often uses +12; female to male often uses -12",
+    "f0_method": "F0 Extraction Method",
+    "f0_method_info": "rmvpe gives the highest quality (recommended)",
+    "index_rate": "Index Rate",
+    "index_rate_info": "Higher values sound closer to the training voice, but may reduce clarity",
+    "filter_radius": "Median Filter Radius",
+    "filter_radius_info": "Helps reduce breath noise",
+    "rms_mix_rate": "Loudness Envelope Mix",
+    "rms_mix_rate_info": "Closer to 1 uses more of the output loudness envelope",
+    "protect": "Unvoiced Protection",
+    "protect_info": "Protects consonants and breath sounds; 0.5 means no protection",
+    "start_conversion": "Start Conversion",
+    "conversion_result": "Conversion Result",
+    "conversion_status": "Conversion Status"
+  },
+  "cover": {
+    "song_cover": "AI Song Cover",
+    "cover_description": "Upload a song, choose a character, and generate an AI cover. The pipeline separates vocals, converts the voice, and mixes the accompaniment automatically.",
+    "upload_song": "Upload Song",
+    "input_song": "Input Song (MP3/WAV/FLAC)",
+    "select_character": "Select Character",
+    "character": "Character",
+    "conversion_settings": "Conversion Settings",
+    "pitch_shift": "Pitch Shift (semitones)",
+    "index_rate": "Index Rate (%)",
+    "index_rate_info": "Recommended: 30-45%; too low sounds more synthetic, too high may blur articulation",
+    "speaker_id": "Speaker ID",
+    "speaker_id_info": "Single-character models usually use 0; try other IDs for multi-speaker models",
+    "mix_settings": "Mix Settings",
+    "mix_preset": "Mix Preset",
+    "mix_preset_info": "Apply a mix profile suitable for most songs",
+    "mix_preset_universal": "Universal (recommended)",
+    "mix_preset_vocal": "Vocal Focus",
+    "mix_preset_accompaniment": "Accompaniment Focus",
+    "mix_preset_live": "Live Feel (more reverb)",
+    "vocals_volume": "Vocal Volume (%)",
+    "accompaniment_volume": "Accompaniment Volume (%)",
+    "vocals_reverb": "Vocal Reverb (%)",
+    "rms_mix_rate": "Loudness Envelope Match (%)",
+    "rms_mix_rate_info": "Moderately follows the original vocal loudness contour; recommended: 10-30%",
+    "backing_mix": "Original Lead Vocal Blend (%)",
+    "backing_mix_info": "Blend a small amount of original lead vocal to restore thickness and articulation; too high leaks the source timbre",
+    "karaoke_separation": "Harmony Separation",
+    "karaoke_separation_info": "Separate lead and backing vocals, convert only the lead, and preserve original harmonies. Useful for songs with backing vocals",
+    "karaoke_merge_backing": "Blend Backing Vocals Into Accompaniment",
+    "karaoke_merge_backing_info": "Mix separated original backing vocals back into the accompaniment. Enabled by default to preserve harmony layers",
+    "start_cover": "Start Cover",
+    "progress": "Progress",
+    "results": "Cover Results",
+    "final_cover": "Final Cover",
+    "converted_vocals": "Converted Vocals",
+    "original_vocals": "Original Vocals (Separated)",
+    "lead_vocals": "Lead Vocals",
+    "backing_vocals": "Backing Vocals",
+    "accompaniment": "Accompaniment",
+    "download_character": "Download Character Model",
+    "select_to_download": "Select Character to Download",
+    "download": "Download",
+    "vc_preprocess_mode": "VC Preprocess Strategy",
+    "vc_preprocess_mode_info": "Strictly uses RoFormer De-Reverb SOTA; if unavailable, processing stops instead of degrading",
+    "vc_preprocess_auto": "Auto (recommended)",
+    "vc_preprocess_uvr_deecho": "Strict RoFormer De-Reverb",
+    "source_constraint_mode": "Source Constraint Strategy",
+    "source_constraint_mode_info": "Auto (recommended) enables this only for strict RoFormer De-Reverb preprocessing; if unavailable, processing stops instead of degrading",
+    "source_constraint_auto": "Auto (recommended)",
+    "source_constraint_off": "Off",
+    "source_constraint_on": "Always On",
+    "vc_preprocess_status": "Current VC Preprocess Route",
+    "vc_preprocess_status_info": "Shows the actual route this machine will use in auto mode",
+    "vc_pipeline_mode": "VC Pipeline Mode",
+    "vc_pipeline_mode_info": "Current implementation keeps this project's existing path; official mode calls the bundled official RVC directly and skips custom VC pre/post-processing",
+    "vc_pipeline_mode_current": "Current Implementation (recommended)",
+    "vc_pipeline_mode_official": "Official Implementation (1:1)",
+    "singing_repair": "Singing Repair",
+    "singing_repair_info": "For high-note dropouts, tearing, metallic artifacts, and sibilance. When enabled, this exits strict 1:1 mode and uses the official-compatible repair path (FP32 + hybrid F0 + F0 stabilization/rate limiting)"
+  },
+  "models": {
+    "base_models": "Base Models",
+    "base_models_desc": "These models are required to run RVC and must be downloaded before first use.",
+    "check_status": "Check Model Status",
+    "download_required": "Download Required Models",
+    "model_status": "Model Status",
+    "voice_models": "Voice Models",
+    "voice_models_desc": "Put .pth model files into assets/weights/, then refresh the model list.",
+    "mature_deecho_models": "Mature DeEcho Models",
+    "mature_deecho_models_desc": "Current covers strictly use RoFormer De-Reverb SOTA; if unavailable, processing stops instead of degrading to older models or algorithm chains.",
+    "download_mature_deecho": "Download Mature DeEcho Model",
+    "mature_deecho_status": "Mature DeEcho Status",
+    "mature_deecho_check": "Check Mature DeEcho Status"
+  },
+  "settings": {
+    "language_settings": "Language Settings",
+    "language": "Interface Language",
+    "save_language": "Save Language",
+    "language_saved_restart": "✅ Interface language saved: {language}. Please restart the app for it to take effect.",
+    "language_restart_info": "The language choice is saved to the config file. Because Gradio 3 builds UI labels at startup, restart the app to apply it.",
+    "runtime_settings": "Runtime Settings",
+    "compute_device": "Compute Device",
+    "save_settings": "Save Settings",
+    "settings_saved_restart": "✅ Settings saved. Restart the app for changes to take effect.",
+    "status": "Status",
+    "cpu_slow": "CPU (slow)",
+    "about_body": "**RVC AI Cover System**\n\n- Built on RVC v2 + Mel-Band Roformer\n- Uses RMVPE for high-quality F0 extraction\n- Supports CUDA GPU acceleration\n\n[GitHub](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)",
+    "model_sources": "### 📥 Character Model Sources\n\nThese are the Hugging Face repositories used by the built-in character model list. You can also download models manually and place them under `assets/weights/characters/<character_name>/`:\n\n**Love Live! Series**\n- [trioskosmos/rvc_models](https://huggingface.co/trioskosmos/rvc_models) — μ's / Aqours / Nijigasaki / Liella! characters\n- [Icchan/LoveLive](https://huggingface.co/Icchan/LoveLive) — Chika, Riko, Eli, You\n- [0xMifune/LoveLive](https://huggingface.co/0xMifune/LoveLive) — Nijigasaki / Liella! / Hasunosora\n- [Swordsmagus/Love-Live-RVC](https://huggingface.co/Swordsmagus/Love-Live-RVC) — Hanamaru, Setsuna, Kotori, A-RISE, and more\n- [Zurakichi/RVC](https://huggingface.co/Zurakichi/RVC) — Nico, Kanata, Setsuna, Hanamaru\n- [Phos252/RVCmodels](https://huggingface.co/Phos252/RVCmodels) — Shibuya Kanon\n- [ChocoKat/Mari_Ohara](https://huggingface.co/ChocoKat/Mari_Ohara) — Mari Ohara\n- [HarunaKasuga/YoshikoTsushima](https://huggingface.co/HarunaKasuga/YoshikoTsushima) — Yoshiko Tsushima\n- [thebuddyadrian/RVC_Models](https://huggingface.co/thebuddyadrian/RVC_Models) — Kazuno sisters\n\n**Genshin / Honkai / Zenless Zone Zero (HoYoverse)**\n- [makiligon/RVC-Models](https://huggingface.co/makiligon/RVC-Models) — Furina, Ayaka, Focalors\n- [kohaku12/RVC-MODELS](https://huggingface.co/kohaku12/RVC-MODELS) — Nahida, Herta, Firefly, Tingyun, Hoshimi Miyabi, and more\n- [jarari/RVC-v2](https://huggingface.co/jarari/RVC-v2) — Furina (Korean), Silver Wolf (Korean)\n- [mrmocciai/genshin-impact](https://huggingface.co/mrmocciai/genshin-impact) — 50+ Genshin characters (manual download)\n\n**VOCALOID**\n- [javinfamous/infamous_miku_v2](https://huggingface.co/javinfamous/infamous_miku_v2) — Hatsune Miku (1000 epochs)\n\n**Hololive / VTuber**\n- [megaaziib/my-rvc-models-collection](https://huggingface.co/megaaziib/my-rvc-models-collection) — Pekora, Sakura Miko, Subaru, Kobo, Kaela, and more\n- 
[Kit-Lemonfoot/kitlemonfoot_rvc_models](https://huggingface.co/Kit-Lemonfoot/kitlemonfoot_rvc_models) — Hololive JP/EN characters\n\n**THE IDOLM@STER / Uma Musume**\n- [trioskosmos/rvc_models](https://huggingface.co/trioskosmos/rvc_models) — Ranko Kanzaki, Riamu Yumemi\n- [makiligon/RVC-Models](https://huggingface.co/makiligon/RVC-Models) — Takane Shijou, Rice Shower\n\n**Project SEKAI**\n- [kohaku12/RVC-MODELS](https://huggingface.co/kohaku12/RVC-MODELS) — Nene Kusanagi\n\n> 💡 After manual download, put `.pth` and `.index` files under `assets/weights/characters/<character_name>/`, then refresh the list.",
+    "device_info": "Device Information",
+    "current_device": "Current Device",
+    "refresh_device": "Refresh Device Information",
+    "about": "About"
+  },
+  "messages": {
+    "model_loaded": "Model loaded",
+    "model_not_found": "Model not found",
+    "load_failed": "Load failed",
+    "please_select_model": "Please select a model",
+    "please_load_model": "Please load a model first",
+    "please_upload_audio": "Please upload an audio file",
+    "conversion_complete": "Conversion complete",
+    "conversion_failed": "Conversion failed",
+    "download_complete": "Download complete",
+    "download_failed": "Download failed",
+    "cover_complete": "Cover complete",
+    "cover_failed": "Cover failed",
+    "separating_vocals": "Separating vocals...",
+    "converting_vocals": "Converting vocals...",
+    "mixing_audio": "Mixing audio..."
+  }
+}

i18n/zh_CN.json CHANGED Viewed

@@ -6,6 +6,26 @@
     "cover": "歌曲翻唱",
     "settings": "设置"
   },
   "conversion": {
     "model_settings": "模型设置",
     "select_model": "选择模型",
@@ -75,13 +95,11 @@
     "select_to_download": "选择要下载的角色",
     "download": "下载",
     "vc_preprocess_mode": "VC预处理策略",
-    "vc_preprocess_mode_info": "参考成熟项目：优先学习型 DeEcho/DeReverb，否则主唱直通 RVC",
     "vc_preprocess_auto": "自动(推荐)",
-    "vc_preprocess_direct": "主唱直通",
-    "vc_preprocess_uvr_deecho": "官方DeEcho优先",
-    "vc_preprocess_legacy": "旧版手工链",
     "source_constraint_mode": "源约束策略",
-    "source_constraint_mode_info": "自动(推荐)下仅旧版手工链启用；学习型 DeEcho 或主唱直通默认不再追加自定义源约束",
     "source_constraint_auto": "自动(推荐)",
     "source_constraint_off": "关闭",
     "source_constraint_on": "始终开启",
@@ -89,10 +107,10 @@
     "vc_preprocess_status_info": "显示本机当前自动模式会走的实际路径",
     "vc_pipeline_mode": "VC管线模式",
     "vc_pipeline_mode_info": "当前实现保留本项目现有链路；官方实现会直接调用内置官方 RVC，并跳过自定义 VC 前后处理",
-    "vc_pipeline_mode_current": "当前实现（推荐）",
-    "vc_pipeline_mode_official": "官方实现（一比一）",
-    "singing_repair": "唱歌修复",
-    "singing_repair_info": "用于高音断音、撕裂、电音、齿音；开启后会退出严格一比一，改用官方兼容修复链（FP32 + 混合F0 + F0稳定/限速）"
   },
   "models": {
     "base_models": "基础模型",
@@ -103,12 +121,25 @@
     "voice_models": "语音模型",
     "voice_models_desc": "将 .pth 模型文件放入 assets/weights/ 目录，然后刷新模型列表。",
     "mature_deecho_models": "成熟 DeEcho 模型",
-    "mature_deecho_models_desc": "用于成熟项目常见的学习型去回声/去混响流程。未下载时，翻唱页的自动模式会回退为主唱直通 RVC。",
     "download_mature_deecho": "下载成熟 DeEcho 模型",
     "mature_deecho_status": "成熟 DeEcho 状态",
     "mature_deecho_check": "检查成熟 DeEcho 状态"
   },
   "settings": {
     "device_info": "设备信息",
     "current_device": "当前设备",
     "refresh_device": "刷新设备信息",

     "cover": "歌曲翻唱",
     "settings": "设置"
   },
+  "ui": {
+    "cover_usage": "**一键 AI 翻唱**：上传歌曲 → 自动分离人声 → 转换音色 → 混合伴奏 → 输出翻唱\n\n**使用步骤：**\n1. 先下载角色模型（展开下方「下载角色模型」）\n2. 上传歌曲文件（支持 MP3/WAV/FLAC）\n3. 选择已下载的角色\n4. 调整参数后点击「开始翻唱」\n\n> ⚠️ 首次运行会自动下载 Mel-Band Roformer 人声分离模型（约 200MB），请耐心等待",
+    "series_filter": "作品/分类",
+    "keyword_search": "关键词搜索",
+    "keyword_placeholder": "输入角色名/作品名",
+    "character_choice_info": "列表会显示版本标识、角色归属和模型来源；版本里常见的 epochs 表示训练轮数，40k/48k 表示采样率。",
+    "download_character_info": "列表会显示版本标识、角色归属和来源仓库，便于区分同角色的不同模型。",
+    "refresh_models": "刷新模型列表",
+    "download_selected_character": "下载选中角色",
+    "download_series_all": "下载该分类全部",
+    "download_all_characters": "下载全部角色模型",
+    "download_status": "下载状态",
+    "model_name": "模型名称",
+    "model_path": "模型路径",
+    "index_path": "索引路径",
+    "no_models": "(无模型)",
+    "positive_pitch_info": "正数升调，负数降调",
+    "normal_volume_info": "100% 为原始音量",
+    "reverb_info": "为人声添加混响效果"
+  },
   "conversion": {
     "model_settings": "模型设置",
     "select_model": "选择模型",
     "select_to_download": "选择要下载的角色",
     "download": "下载",
     "vc_preprocess_mode": "VC预处理策略",
+    "vc_preprocess_mode_info": "严格使用 RoFormer De-Reverb SOTA；不可用时停止，不降级",
     "vc_preprocess_auto": "自动(推荐)",
+    "vc_preprocess_uvr_deecho": "严格RoFormer De-Reverb",
     "source_constraint_mode": "源约束策略",
+    "source_constraint_mode_info": "自动(推荐)下仅严格 RoFormer De-Reverb 预处理启用；不可用时停止，不降级",
     "source_constraint_auto": "自动(推荐)",
     "source_constraint_off": "关闭",
     "source_constraint_on": "始终开启",
     "vc_preprocess_status_info": "显示本机当前自动模式会走的实际路径",
     "vc_pipeline_mode": "VC管线模式",
     "vc_pipeline_mode_info": "当前实现保留本项目现有链路；官方实现会直接调用内置官方 RVC，并跳过自定义 VC 前后处理",
+    "vc_pipeline_mode_current": "当前实现（推荐）",
+    "vc_pipeline_mode_official": "官方实现（一比一）",
+    "singing_repair": "唱歌修复",
+    "singing_repair_info": "用于高音断音、撕裂、电音、齿音；开启后会退出严格一比一，改用官方兼容修复链（FP32 + 混合F0 + F0稳定/限速）"
   },
   "models": {
     "base_models": "基础模型",
     "voice_models": "语音模型",
     "voice_models_desc": "将 .pth 模型文件放入 assets/weights/ 目录，然后刷新模型列表。",
     "mature_deecho_models": "成熟 DeEcho 模型",
+    "mature_deecho_models_desc": "当前翻唱严格使用 RoFormer De-Reverb SOTA；不可用时停止处理，不降级到旧模型或算法链。",
     "download_mature_deecho": "下载成熟 DeEcho 模型",
     "mature_deecho_status": "成熟 DeEcho 状态",
     "mature_deecho_check": "检查成熟 DeEcho 状态"
   },
   "settings": {
+    "language_settings": "语言设置",
+    "language": "界面语言",
+    "save_language": "保存语言",
+    "language_saved_restart": "✅ 已保存界面语言：{language}。请重启应用后生效。",
+    "language_restart_info": "切换语言会保存到配置文件；由于 Gradio 3 的界面文案在启动时生成，重启后生效。",
+    "runtime_settings": "运行设置",
+    "compute_device": "计算设备",
+    "save_settings": "保存设置",
+    "settings_saved_restart": "✅ 设置已保存，重启后生效",
+    "status": "状态",
+    "cpu_slow": "CPU (较慢)",
+    "about_body": "**RVC AI 翻唱系统**\n\n- 基于 RVC v2 + Mel-Band Roformer\n- 使用 RMVPE 进行高质量 F0 提取\n- 支持 CUDA GPU 加速\n\n[GitHub](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)",
+    "model_sources": "### 📥 角色模型来源\n\n以下是本项目角色模型的 HuggingFace 仓库来源，你也可以手动下载模型后放入 `assets/weights/characters/<角色名>/` 目录使用：\n\n**Love Live! 系列**\n- [trioskosmos/rvc_models](https://huggingface.co/trioskosmos/rvc_models) — μ's / Aqours / 虹咲 / Liella! 多角色\n- [Icchan/LoveLive](https://huggingface.co/Icchan/LoveLive) — 千歌、梨子、绘里、曜\n- [0xMifune/LoveLive](https://huggingface.co/0xMifune/LoveLive) — 虹咲 / Liella! / 莲之空\n- [Swordsmagus/Love-Live-RVC](https://huggingface.co/Swordsmagus/Love-Live-RVC) — 花丸、雪菜、小鸟、A-RISE 等\n- [Zurakichi/RVC](https://huggingface.co/Zurakichi/RVC) — 妮可、彼方、雪菜、花丸\n- [Phos252/RVCmodels](https://huggingface.co/Phos252/RVCmodels) — 涩谷香音\n- [ChocoKat/Mari_Ohara](https://huggingface.co/ChocoKat/Mari_Ohara) — 小原鞠莉\n- [HarunaKasuga/YoshikoTsushima](https://huggingface.co/HarunaKasuga/YoshikoTsushima) — 津岛善子\n- [thebuddyadrian/RVC_Models](https://huggingface.co/thebuddyadrian/RVC_Models) — 鹿角姐妹\n\n**原神 / 崩坏 / 绝区零 (米哈游)**\n- [makiligon/RVC-Models](https://huggingface.co/makiligon/RVC-Models) — 芙宁娜、绫华、芙卡洛斯\n- [kohaku12/RVC-MODELS](https://huggingface.co/kohaku12/RVC-MODELS) — 纳西妲、黑塔、流萤、停云、星见雅 等\n- [jarari/RVC-v2](https://huggingface.co/jarari/RVC-v2) — 芙宁娜(韩语)、银狼(韩语)\n- [mrmocciai/genshin-impact](https://huggingface.co/mrmocciai/genshin-impact) — 原神 50+ 角色（需手动下载）\n\n**VOCALOID**\n- [javinfamous/infamous_miku_v2](https://huggingface.co/javinfamous/infamous_miku_v2) — 初音未来 (1000 epochs)\n\n**Hololive / VTuber**\n- [megaaziib/my-rvc-models-collection](https://huggingface.co/megaaziib/my-rvc-models-collection) — 佩克拉、樱巫女、大空昴、Kobo、Kaela 等\n- [Kit-Lemonfoot/kitlemonfoot_rvc_models](https://huggingface.co/Kit-Lemonfoot/kitlemonfoot_rvc_models) — Hololive JP/EN 多角色\n\n**偶像大师 / 赛马娘**\n- [trioskosmos/rvc_models](https://huggingface.co/trioskosmos/rvc_models) — 神崎兰子、梦见莉亚梦\n- [makiligon/RVC-Models](https://huggingface.co/makiligon/RVC-Models) — 四条贵音、米浴\n\n**Project SEKAI**\n- [kohaku12/RVC-MODELS](https://huggingface.co/kohaku12/RVC-MODELS) — 草薙宁宁\n\n> 💡 手动下载后，将 `.pth` 和 
`.index` 文件放入 `assets/weights/characters/<角色名>/` 目录，刷新即可使用。",
     "device_info": "设备信息",
     "current_device": "当前设备",
     "refresh_device": "刷新设备信息",

infer/__init__.py CHANGED Viewed

@@ -1,17 +1,17 @@
-# -*- coding: utf-8 -*-
-"""
-推理模块
-"""
-from .f0_extractor import (
-    F0Extractor,
-    get_f0_extractor,
-    shift_f0,
-    F0Method
-)
-__all__ = [
-    "F0Extractor",
-    "get_f0_extractor",
-    "shift_f0",
-    "F0Method"
-]

+# -*- coding: utf-8 -*-
+"""
+推理模块
+"""
+from .f0_extractor import (
+    F0Extractor,
+    get_f0_extractor,
+    shift_f0,
+    F0Method
+)
+__all__ = [
+    "F0Extractor",
+    "get_f0_extractor",
+    "shift_f0",
+    "F0Method"
+]

infer/advanced_dereverb.py CHANGED Viewed

@@ -213,38 +213,6 @@ def advanced_dereverb(
     return dry_signal.astype(np.float32), reverb_tail.astype(np.float32)
-def apply_reverb_to_converted(
-    converted_dry: np.ndarray,
-    original_reverb: np.ndarray,
-    mix_ratio: float = 0.8
-) -> np.ndarray:
-    """
-    将原始混响重新应用到转换后的干声上
-    Args:
-        converted_dry: 转换后的干声
-        original_reverb: 原始混响尾巴
-        mix_ratio: 混响混合比例 (0-1)
-    Returns:
-        wet_signal: 带混响的转换结果
-    """
-    # 对齐长度
-    min_len = min(len(converted_dry), len(original_reverb))
-    converted_dry = converted_dry[:min_len]
-    original_reverb = original_reverb[:min_len]
-    # 混合
-    wet_signal = converted_dry + mix_ratio * original_reverb
-    # 软限幅
-    from lib.audio import soft_clip
-    wet_signal = soft_clip(wet_signal, threshold=0.9, ceiling=0.99)
-    return wet_signal.astype(np.float32)
 if __name__ == "__main__":
     # 测试
     print("Testing advanced dereverberation...")

     return dry_signal.astype(np.float32), reverb_tail.astype(np.float32)
 if __name__ == "__main__":
     # 测试
     print("Testing advanced dereverberation...")

infer/cover_pipeline.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

infer/f0_extractor.py CHANGED Viewed

@@ -1,17 +1,23 @@
 # -*- coding: utf-8 -*-
 """
-F0 (基频) 提取模块 - 支持多种提取方法
 """
 import numpy as np
 import torch
-from typing import Optional, Literal
-# F0 提取方法类型
 F0Method = Literal["rmvpe", "pm", "harvest", "crepe", "hybrid"]
 class F0Extractor:
-    """F0 提取器基类"""
     def __init__(self, sample_rate: int = 16000, hop_length: int = 160):
         self.sample_rate = sample_rate
@@ -20,34 +26,30 @@ class F0Extractor:
         self.f0_max = 1100
     def extract(self, audio: np.ndarray) -> np.ndarray:
-        """提取 F0，子类需实现此方法"""
         raise NotImplementedError
 class PMExtractor(F0Extractor):
-    """Parselmouth (Praat) F0 提取器 - 速度快"""
     def extract(self, audio: np.ndarray) -> np.ndarray:
         import parselmouth
         time_step = self.hop_length / self.sample_rate
         sound = parselmouth.Sound(audio, self.sample_rate)
         pitch = sound.to_pitch_ac(
             time_step=time_step,
             voicing_threshold=0.6,
             pitch_floor=self.f0_min,
-            pitch_ceiling=self.f0_max
         )
         f0 = pitch.selected_array["frequency"]
         f0[f0 == 0] = np.nan
         return f0
 class HarvestExtractor(F0Extractor):
-    """PyWorld Harvest F0 提取器 - 质量较好"""
     def extract(self, audio: np.ndarray) -> np.ndarray:
         import pyworld
@@ -58,26 +60,27 @@ class HarvestExtractor(F0Extractor):
             self.sample_rate,
             f0_floor=self.f0_min,
             f0_ceil=self.f0_max,
-            frame_period=self.hop_length / self.sample_rate * 1000
         )
         return f0
 class CrepeExtractor(F0Extractor):
-    """TorchCrepe F0 提取器 - 深度学习方法"""
-    def __init__(self, sample_rate: int = 16000, hop_length: int = 160,
-                 device: str = "cuda"):
         super().__init__(sample_rate, hop_length)
         self.device = device
     def extract(self, audio: np.ndarray) -> np.ndarray:
         import torchcrepe
-        audio_tensor = torch.from_numpy(audio).float().unsqueeze(0)
-        audio_tensor = audio_tensor.to(self.device)
         f0, _ = torchcrepe.predict(
             audio_tensor,
             self.sample_rate,
@@ -87,111 +90,120 @@ class CrepeExtractor(F0Extractor):
             model="full",
             batch_size=512,
             device=self.device,
-            return_periodicity=True
         )
-        f0 = f0.squeeze(0).cpu().numpy()
-        return f0
 class RMVPEExtractor(F0Extractor):
-    """RMVPE F0 提取器 - 质量最高 (推荐)"""
-    def __init__(self, model_path: str, sample_rate: int = 16000,
-                 hop_length: int = 160, device: str = "cuda"):
         super().__init__(sample_rate, hop_length)
         self.device = device
         self.model = None
         self.model_path = model_path
-    def load_model(self):
-        """加载 RMVPE 模型"""
         if self.model is not None:
             return
         from models.rmvpe import RMVPE
         self.model = RMVPE(self.model_path, device=self.device)
-        print(f"RMVPE 模型已加载: {self.device}")
     def extract(self, audio: np.ndarray) -> np.ndarray:
         self.load_model()
-        # RMVPE 需要 16kHz 输入
-        f0 = self.model.infer_from_audio(audio, thred=0.01)
-        return f0
-def get_f0_extractor(method: F0Method, device: str = "cuda",
-                     rmvpe_path: str = None, crepe_threshold: float = 0.05) -> F0Extractor:
-    """
-    获取 F0 提取器实例
-    Args:
-        method: 提取方法 ("rmvpe", "pm", "harvest", "crepe", "hybrid")
-        device: 计算设备
-        rmvpe_path: RMVPE 模型路径 (rmvpe/hybrid 方法需要)
-        crepe_threshold: CREPE置信度阈值 (仅hybrid方法使用)
-    Returns:
-        F0Extractor: 提取器实例
-    """
     if method == "rmvpe":
         if rmvpe_path is None:
-            raise ValueError("RMVPE 方法需要指定模型路径")
         return RMVPEExtractor(rmvpe_path, device=device)
-    elif method == "hybrid":
         if rmvpe_path is None:
-            raise ValueError("Hybrid 方法需要指定RMVPE模型路径")
-        return HybridF0Extractor(rmvpe_path, device=device, crepe_threshold=crepe_threshold)
-    elif method == "pm":
         return PMExtractor()
-    elif method == "harvest":
         return HarvestExtractor()
-    elif method == "crepe":
         return CrepeExtractor(device=device)
-    else:
-        raise ValueError(f"未知的 F0 提取方法: {method}")
 class HybridF0Extractor(F0Extractor):
-    """混合F0提取器 - RMVPE主导 + CREPE高精度补充"""
-    def __init__(self, rmvpe_path: str, sample_rate: int = 16000,
-                 hop_length: int = 160, device: str = "cuda",
-                 crepe_threshold: float = 0.05):
         super().__init__(sample_rate, hop_length)
         self.device = device
         self.rmvpe = RMVPEExtractor(rmvpe_path, sample_rate, hop_length, device)
-        self.crepe = None  # 延迟加载
-        self.crepe_threshold = crepe_threshold
-    def _load_crepe(self):
-        """延迟加载CREPE模型"""
-        if self.crepe is None:
-            try:
-                self.crepe = CrepeExtractor(self.sample_rate, self.hop_length, self.device)
-            except ImportError:
-                print("警告: torchcrepe未安装，混合F0将仅使用RMVPE")
-                self.crepe = False
     def extract(self, audio: np.ndarray) -> np.ndarray:
-        """
-        混合提取F0：
-        1. 使用RMVPE作为主要方法（快速、稳定）
-        2. 在RMVPE不稳定的区域使用CREPE补充（高精度）
-        """
-        # 提取RMVPE F0
         f0_rmvpe = self.rmvpe.extract(audio)
-        # 如果CREPE不可用，直接返回RMVPE结果
         self._load_crepe()
         if self.crepe is False:
             return f0_rmvpe
-        # 提取CREPE F0和置信度
         import torchcrepe
         audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
         f0_crepe, confidence = torchcrepe.predict(
             audio_tensor,
@@ -202,64 +214,39 @@ class HybridF0Extractor(F0Extractor):
             model="full",
             batch_size=512,
             device=self.device,
-            return_periodicity=True
         )
         f0_crepe = f0_crepe.squeeze(0).cpu().numpy()
         confidence = confidence.squeeze(0).cpu().numpy()
-        # 对齐长度
         min_len = min(len(f0_rmvpe), len(f0_crepe), len(confidence))
-        f0_rmvpe = f0_rmvpe[:min_len]
-        f0_crepe = f0_crepe[:min_len]
-        confidence = confidence[:min_len]
-        # 检测RMVPE不稳定区域
-        # 1. F0跳变过大（超过3个半音）
-        f0_diff = np.abs(np.diff(f0_rmvpe, prepend=f0_rmvpe[0]))
-        semitone_diff = np.abs(12 * np.log2((f0_rmvpe + 1e-6) / (np.roll(f0_rmvpe, 1) + 1e-6)))
-        semitone_diff[0] = 0
-        unstable_jump = semitone_diff > 3.0
-        # 2. CREPE置信度高但RMVPE给出F0=0
-        unstable_unvoiced = (f0_rmvpe < 1e-3) & (confidence > self.crepe_threshold)
-        # 3. RMVPE和CREPE差异过大（超过2个半音）且CREPE置信度高
-        f0_ratio = (f0_crepe + 1e-6) / (f0_rmvpe + 1e-6)
-        semitone_gap = np.abs(12 * np.log2(f0_ratio))
-        unstable_diverge = (semitone_gap > 2.0) & (confidence > self.crepe_threshold * 1.5)
-        # 合并不稳定区域
-        unstable_mask = unstable_jump | unstable_unvoiced | unstable_diverge
-        # 扩展不稳定区域（前后各2帧）以平滑过渡
-        kernel = np.ones(5, dtype=bool)
-        unstable_mask = np.convolve(unstable_mask, kernel, mode='same')
-        # 混合F0：不稳定区域使用CREPE，其他区域使用RMVPE
         f0_hybrid = f0_rmvpe.copy()
-        f0_hybrid[unstable_mask] = f0_crepe[unstable_mask]
-        # 平滑过渡边界
-        for i in range(1, len(f0_hybrid) - 1):
-            if unstable_mask[i] != unstable_mask[i-1]:
-                # 边界处使用加权平均
-                w = 0.5
-                f0_hybrid[i] = w * f0_rmvpe[i] + (1-w) * f0_crepe[i]
         return f0_hybrid
 def shift_f0(f0: np.ndarray, semitones: float) -> np.ndarray:
-    """
-    音调偏移
-    Args:
-        f0: 原始 F0
-        semitones: 偏移半音数 (正数升调，负数降调)
-    Returns:
-        np.ndarray: 偏移后的 F0
-    """
     factor = 2 ** (semitones / 12)
-    f0_shifted = f0 * factor
-    return f0_shifted

 # -*- coding: utf-8 -*-
 """
+F0 extraction helpers used by the local VC pipeline.
 """
+from __future__ import annotations
+from typing import Literal
 import numpy as np
 import torch
+from infer.quality_policy import build_conservative_crepe_fill_mask
 F0Method = Literal["rmvpe", "pm", "harvest", "crepe", "hybrid"]
 class F0Extractor:
+    """Base F0 extractor."""
     def __init__(self, sample_rate: int = 16000, hop_length: int = 160):
         self.sample_rate = sample_rate
         self.f0_max = 1100
     def extract(self, audio: np.ndarray) -> np.ndarray:
         raise NotImplementedError
 class PMExtractor(F0Extractor):
+    """Praat/Parselmouth extractor."""
     def extract(self, audio: np.ndarray) -> np.ndarray:
         import parselmouth
         time_step = self.hop_length / self.sample_rate
         sound = parselmouth.Sound(audio, self.sample_rate)
         pitch = sound.to_pitch_ac(
             time_step=time_step,
             voicing_threshold=0.6,
             pitch_floor=self.f0_min,
+            pitch_ceiling=self.f0_max,
         )
         f0 = pitch.selected_array["frequency"]
         f0[f0 == 0] = np.nan
         return f0
 class HarvestExtractor(F0Extractor):
+    """PyWorld harvest extractor."""
     def extract(self, audio: np.ndarray) -> np.ndarray:
         import pyworld
             self.sample_rate,
             f0_floor=self.f0_min,
             f0_ceil=self.f0_max,
+            frame_period=self.hop_length / self.sample_rate * 1000,
         )
         return f0
 class CrepeExtractor(F0Extractor):
+    """TorchCrepe extractor."""
+    def __init__(
+        self,
+        sample_rate: int = 16000,
+        hop_length: int = 160,
+        device: str = "cuda",
+    ):
         super().__init__(sample_rate, hop_length)
         self.device = device
     def extract(self, audio: np.ndarray) -> np.ndarray:
         import torchcrepe
+        audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
         f0, _ = torchcrepe.predict(
             audio_tensor,
             self.sample_rate,
             model="full",
             batch_size=512,
             device=self.device,
+            return_periodicity=True,
         )
+        return f0.squeeze(0).cpu().numpy()
 class RMVPEExtractor(F0Extractor):
+    """RMVPE extractor."""
+    def __init__(
+        self,
+        model_path: str,
+        sample_rate: int = 16000,
+        hop_length: int = 160,
+        device: str = "cuda",
+    ):
         super().__init__(sample_rate, hop_length)
         self.device = device
         self.model = None
         self.model_path = model_path
+    def load_model(self) -> None:
         if self.model is not None:
             return
         from models.rmvpe import RMVPE
         self.model = RMVPE(self.model_path, device=self.device)
+        print(f"RMVPE model loaded: {self.device}")
     def extract(self, audio: np.ndarray) -> np.ndarray:
         self.load_model()
+        return self.model.infer_from_audio(audio, thred=0.01)
+def get_f0_extractor(
+    method: F0Method,
+    device: str = "cuda",
+    rmvpe_path: str | None = None,
+    crepe_threshold: float = 0.05,
+) -> F0Extractor:
+    """Create an F0 extractor."""
     if method == "rmvpe":
         if rmvpe_path is None:
+            raise ValueError("RMVPE requires a model path")
         return RMVPEExtractor(rmvpe_path, device=device)
+    if method == "hybrid":
         if rmvpe_path is None:
+            raise ValueError("Hybrid requires an RMVPE model path")
+        return HybridF0Extractor(
+            rmvpe_path,
+            device=device,
+            crepe_threshold=crepe_threshold,
+        )
+    if method == "pm":
         return PMExtractor()
+    if method == "harvest":
         return HarvestExtractor()
+    if method == "crepe":
         return CrepeExtractor(device=device)
+    raise ValueError(f"Unknown F0 method: {method}")
 class HybridF0Extractor(F0Extractor):
+    """
+    Conservative hybrid extractor.
+    RMVPE remains the primary estimator. CREPE is only allowed to repair short,
+    high-confidence dropouts that sit inside already-voiced context.
+    """
+    def __init__(
+        self,
+        rmvpe_path: str,
+        sample_rate: int = 16000,
+        hop_length: int = 160,
+        device: str = "cuda",
+        crepe_threshold: float = 0.05,
+        max_fill_ratio: float = 0.02,
+        max_fill_frames: int = 320,
+        context_radius: int = 6,
+    ):
         super().__init__(sample_rate, hop_length)
         self.device = device
         self.rmvpe = RMVPEExtractor(rmvpe_path, sample_rate, hop_length, device)
+        self.crepe = None
+        self.crepe_threshold = float(crepe_threshold)
+        self.max_fill_ratio = float(max_fill_ratio)
+        self.max_fill_frames = int(max_fill_frames)
+        self.context_radius = int(context_radius)
+    def _load_crepe(self) -> None:
+        if self.crepe is not None:
+            return
+        try:
+            self.crepe = CrepeExtractor(
+                self.sample_rate,
+                self.hop_length,
+                self.device,
+            )
+        except ImportError:
+            print("Warning: torchcrepe is unavailable, hybrid falls back to RMVPE")
+            self.crepe = False
     def extract(self, audio: np.ndarray) -> np.ndarray:
         f0_rmvpe = self.rmvpe.extract(audio)
         self._load_crepe()
         if self.crepe is False:
             return f0_rmvpe
         import torchcrepe
         audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
         f0_crepe, confidence = torchcrepe.predict(
             audio_tensor,
             model="full",
             batch_size=512,
             device=self.device,
+            return_periodicity=True,
         )
         f0_crepe = f0_crepe.squeeze(0).cpu().numpy()
         confidence = confidence.squeeze(0).cpu().numpy()
         min_len = min(len(f0_rmvpe), len(f0_crepe), len(confidence))
+        if min_len <= 0:
+            return f0_rmvpe
+        f0_rmvpe = np.asarray(f0_rmvpe[:min_len], dtype=np.float32)
+        f0_crepe = np.asarray(f0_crepe[:min_len], dtype=np.float32)
+        confidence = np.asarray(confidence[:min_len], dtype=np.float32)
+        fill_mask = build_conservative_crepe_fill_mask(
+            f0_rmvpe=f0_rmvpe,
+            f0_crepe=f0_crepe,
+            confidence=confidence,
+            confidence_threshold=self.crepe_threshold,
+            max_ratio=self.max_fill_ratio,
+            max_frames=self.max_fill_frames,
+            context_radius=self.context_radius,
+        )
+        if not np.any(fill_mask):
+            return f0_rmvpe
         f0_hybrid = f0_rmvpe.copy()
+        f0_hybrid[fill_mask] = f0_crepe[fill_mask]
         return f0_hybrid
 def shift_f0(f0: np.ndarray, semitones: float) -> np.ndarray:
+    """Shift F0 by semitones."""
     factor = 2 ** (semitones / 12)
+    return f0 * factor

infer/modules/uvr5/modules.py CHANGED Viewed

@@ -4,7 +4,7 @@ import logging
 logger = logging.getLogger(__name__)
-import ffmpeg
 import torch
 from configs.config import Config
@@ -20,6 +20,24 @@ except ImportError:
 config = Config()
 def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
     infos = []
     try:
@@ -74,9 +92,7 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format
             need_reformat = 1
             done = 0
             try:
-                info = ffmpeg.probe(inp_path, cmd="ffprobe")
-                channels = info["streams"][0]["channels"]
-                sample_rate = info["streams"][0]["sample_rate"]
                 if log:
                     log.audio(f"音频信息: {channels}声道, {sample_rate}Hz")

 logger = logging.getLogger(__name__)
+import av
 import torch
 from configs.config import Config
 config = Config()
+def _read_audio_metadata(path: str) -> tuple[int, str]:
+    with av.open(path) as container:
+        stream = next((candidate for candidate in container.streams if candidate.type == "audio"), None)
+        if stream is None:
+            raise RuntimeError(f"No audio stream found in {path}")
+        codec_context = stream.codec_context
+        channels = getattr(codec_context, "channels", None)
+        if not channels and getattr(stream, "layout", None) is not None:
+            channels = getattr(stream.layout, "nb_channels", None)
+        sample_rate = getattr(codec_context, "sample_rate", None) or getattr(stream, "sample_rate", None)
+        if channels is None or sample_rate is None:
+            raise RuntimeError(f"Unable to read audio metadata from {path}")
+        return int(channels), str(sample_rate)
 def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0):
     infos = []
     try:
             need_reformat = 1
             done = 0
             try:
+                channels, sample_rate = _read_audio_metadata(inp_path)
                 if log:
                     log.audio(f"音频信息: {channels}声道, {sample_rate}Hz")

infer/modules/vc/pipeline.py CHANGED Viewed

@@ -29,6 +29,11 @@ except ImportError:
     log = None
 from lib.audio import soft_clip
 bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
@@ -372,6 +377,12 @@ class Pipeline(object):
         self.crepe_pd_threshold = float(getattr(config, "crepe_pd_threshold", 0.1))
         self.crepe_force_ratio = float(getattr(config, "crepe_force_ratio", 0.05))
         self.crepe_replace_semitones = float(getattr(config, "crepe_replace_semitones", 0.0))
         self.f0_fallback_context_radius = int(getattr(config, "f0_fallback_context_radius", 24))
         self.f0_fallback_repair_gap = int(getattr(config, "f0_fallback_repair_gap", 12))
         self.f0_fallback_post_gap = int(getattr(config, "f0_fallback_post_gap", 10))
@@ -394,6 +405,12 @@ class Pipeline(object):
             self.crepe_pd_threshold = 0.0
         if self.crepe_replace_semitones < 0:
             self.crepe_replace_semitones = 0.0
         if self.f0_fallback_context_radius < 1:
             self.f0_fallback_context_radius = 1
         if self.f0_fallback_repair_gap < 0:
@@ -421,6 +438,10 @@ class Pipeline(object):
                 f"F0混合: {self.f0_hybrid_mode}, CREPE阈值: {self.crepe_pd_threshold}, "
                 f"强制比率: {self.crepe_force_ratio}, 替换阈值(半音): {self.crepe_replace_semitones}"
             )
             log.detail(
                 f"F0兜底: 上下文半径={self.f0_fallback_context_radius}, "
                 f"预修补长度={self.f0_fallback_repair_gap}, 后修补长度={self.f0_fallback_post_gap}, "
@@ -466,12 +487,12 @@ class Pipeline(object):
             log.detail(f"时间步长: {time_step:.2f}ms, F0范围: {f0_min}-{f0_max}Hz")
             log.detail(f"音频长度: {len(x)} 样本, p_len: {p_len}")
-        # 将hybrid映射到rmvpe+crepe模式
         if f0_method == "hybrid":
             f0_method = "rmvpe"
-            # 临时设置hybrid模式
             original_hybrid_mode = self.f0_hybrid_mode
-            self.f0_hybrid_mode = "rmvpe+crepe"
             restore_hybrid_mode = True
         else:
             restore_hybrid_mode = False
@@ -704,7 +725,19 @@ class Pipeline(object):
                         else:
                             f0_fb = f0_fb[: len(f0)]
-                        fill_mask = need_fill & (f0_fb > 0)
                         f0[fill_mask] = f0_fb[fill_mask]
                         need_fill2 = (f0 <= 0) & energy_mask & voiced_context
@@ -923,40 +956,58 @@ class Pipeline(object):
         _energy_db = 20.0 * np.log10(_frame_rms + 1e-8)
         _ref = energy_ref_db if energy_ref_db is not None else float(np.percentile(_energy_db, 95))
-        # Soft gate: sigmoid curve centered at ref-45dB with 6dB transition width.
-        # Frames well above threshold → gain≈1; frames well below → gain≈0.05
-        # (keep a small floor to avoid zero-feature shock to the network).
-        _silence_center = _ref - 45.0
-        _transition_width = 6.0  # dB for the sigmoid ramp
-        _energy_gate = 1.0 / (1.0 + np.exp(-(_energy_db - _silence_center) / (_transition_width / 4.0)))
-        # Apply floor: never fully zero features (network handles near-zero better than hard zero)
-        _energy_gate = np.clip(_energy_gate, 0.05, 1.0)
         # Smooth temporally
         _sm = np.array([1, 2, 3, 2, 1], dtype=np.float32)
         _sm /= _sm.sum()
-        _energy_gate = np.convolve(_energy_gate, _sm, mode='same')[:_p_len_val]
-        _energy_gate = np.clip(_energy_gate, 0.05, 1.0)
         # Apply soft gate to features
         _feat_len = feats.shape[1]
-        if len(_energy_gate) > _feat_len:
-            _feat_gate = _energy_gate[:_feat_len]
-        elif len(_energy_gate) < _feat_len:
-            _feat_gate = np.pad(_energy_gate, (0, _feat_len - len(_energy_gate)), mode='constant', constant_values=1.0)
         else:
-            _feat_gate = _energy_gate
         _gate_t = torch.from_numpy(_feat_gate.astype(np.float32)).to(feats.device).unsqueeze(0).unsqueeze(-1)
         feats = feats * _gate_t
         # F0 soft gating: consistently soft-attenuate both pitch confidence and pitch value
         if pitch is not None and pitchf is not None:
             _pitch_len = pitch.shape[1]
-            if len(_energy_gate) > _pitch_len:
-                _f0_gate = _energy_gate[:_pitch_len]
-            elif len(_energy_gate) < _pitch_len:
-                _f0_gate = np.pad(_energy_gate, (0, _pitch_len - len(_energy_gate)), mode='constant', constant_values=1.0)
             else:
-                _f0_gate = _energy_gate
             _f0_gate_t = torch.from_numpy(_f0_gate.astype(np.float32)).to(pitch.device).unsqueeze(0)
             pitchf = pitchf * _f0_gate_t
             # Soft-blend pitch toward silence bin (1) instead of hard switch
@@ -1075,6 +1126,9 @@ class Pipeline(object):
                 )
             if log:
                 log.detail(f"分段数量: {len(opt_ts) + 1}")
         else:
             if log:
                 if self.disable_chunking:
@@ -1140,10 +1194,14 @@ class Pipeline(object):
         segment_count = len(opt_ts) + 1
         current_segment = 0
-        # Crossfade length at target rate (~12ms). Each boundary segment
-        # keeps this many extra samples from the normally-trimmed padding
-        # region. The overlap between adjacent segments is 2 * _xfade_tgt.
-        _xfade_tgt = min(int(0.012 * tgt_sr), self.t_pad_tgt // 4) if len(opt_ts) > 0 else 0
         def _trim_segment(raw, is_first, is_last):
             """Trim padding from vc() output, keeping crossfade overlap."""
@@ -1241,8 +1299,9 @@ class Pipeline(object):
             for seg in audio_opt[1:]:
                 xf = min(overlap, len(result), len(seg))
                 if xf > 1:
-                    fade_out = np.linspace(1.0, 0.0, xf, dtype=np.float32)
-                    fade_in = 1.0 - fade_out
                     blended = result[-xf:] * fade_out + seg[:xf] * fade_in
                     result = np.concatenate([result[:-xf], blended, seg[xf:]])
                 else:

     log = None
 from lib.audio import soft_clip
+from infer.quality_policy import (
+    build_conservative_harvest_fill_mask,
+    compute_breath_preserving_energy_gates,
+    compute_chunk_crossfade_samples,
+)
 bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
         self.crepe_pd_threshold = float(getattr(config, "crepe_pd_threshold", 0.1))
         self.crepe_force_ratio = float(getattr(config, "crepe_force_ratio", 0.05))
         self.crepe_replace_semitones = float(getattr(config, "crepe_replace_semitones", 0.0))
+        self.unvoiced_feature_gate_floor = float(
+            getattr(config, "unvoiced_feature_gate_floor", 0.28)
+        )
+        self.breath_active_margin_db = float(
+            getattr(config, "breath_active_margin_db", 52.0)
+        )
         self.f0_fallback_context_radius = int(getattr(config, "f0_fallback_context_radius", 24))
         self.f0_fallback_repair_gap = int(getattr(config, "f0_fallback_repair_gap", 12))
         self.f0_fallback_post_gap = int(getattr(config, "f0_fallback_post_gap", 10))
             self.crepe_pd_threshold = 0.0
         if self.crepe_replace_semitones < 0:
             self.crepe_replace_semitones = 0.0
+        if self.unvoiced_feature_gate_floor < 0.05:
+            self.unvoiced_feature_gate_floor = 0.05
+        if self.unvoiced_feature_gate_floor > 1.0:
+            self.unvoiced_feature_gate_floor = 1.0
+        if self.breath_active_margin_db < 1.0:
+            self.breath_active_margin_db = 1.0
         if self.f0_fallback_context_radius < 1:
             self.f0_fallback_context_radius = 1
         if self.f0_fallback_repair_gap < 0:
                 f"F0混合: {self.f0_hybrid_mode}, CREPE阈值: {self.crepe_pd_threshold}, "
                 f"强制比率: {self.crepe_force_ratio}, 替换阈值(半音): {self.crepe_replace_semitones}"
             )
+            log.detail(
+                f"气声门控: 特征地板={self.unvoiced_feature_gate_floor:.2f}, "
+                f"激活边界=ref-{self.breath_active_margin_db:.1f}dB"
+            )
             log.detail(
                 f"F0兜底: 上下文半径={self.f0_fallback_context_radius}, "
                 f"预修补长度={self.f0_fallback_repair_gap}, 后修补长度={self.f0_fallback_post_gap}, "
             log.detail(f"时间步长: {time_step:.2f}ms, F0范围: {f0_min}-{f0_max}Hz")
             log.detail(f"音频长度: {len(x)} 样本, p_len: {p_len}")
+        # Normalize direct hybrid requests to the conservative RMVPE fallback path.
         if f0_method == "hybrid":
             f0_method = "rmvpe"
             original_hybrid_mode = self.f0_hybrid_mode
+            if self.f0_hybrid_mode not in self.rmvpe_fallback_modes:
+                self.f0_hybrid_mode = "fallback"
             restore_hybrid_mode = True
         else:
             restore_hybrid_mode = False
                         else:
                             f0_fb = f0_fb[: len(f0)]
+                        fill_mask = build_conservative_harvest_fill_mask(
+                            reference_f0=f0,
+                            fallback_f0=f0_fb,
+                            dropout_mask=need_fill,
+                            max_run=10,
+                            local_radius=4,
+                            max_semitones=4.0,
+                        )
+                        rejected_fill_count = int(np.sum(need_fill)) - int(np.sum(fill_mask))
+                        if rejected_fill_count > 0 and log:
+                            log.detail(
+                                f"Harvest兜底已拒绝 {rejected_fill_count} 帧长间隙/离群音高，避免呼吸与回气误赋音高"
+                            )
                         f0[fill_mask] = f0_fb[fill_mask]
                         need_fill2 = (f0 <= 0) & energy_mask & voiced_context
         _energy_db = 20.0 * np.log10(_frame_rms + 1e-8)
         _ref = energy_ref_db if energy_ref_db is not None else float(np.percentile(_energy_db, 95))
+        _unvoiced_mask = None
+        if pitchf is not None:
+            _pitchf_np = pitchf[0].detach().float().cpu().numpy()
+            if len(_pitchf_np) > _p_len_val:
+                _pitchf_np = _pitchf_np[:_p_len_val]
+            elif len(_pitchf_np) < _p_len_val:
+                _pitchf_np = np.pad(_pitchf_np, (0, _p_len_val - len(_pitchf_np)), mode="edge")
+            _unvoiced_mask = _pitchf_np <= 1e-4
+        _feat_gate_curve, _f0_gate_curve = compute_breath_preserving_energy_gates(
+            energy_db=_energy_db,
+            ref_db=_ref,
+            unvoiced_mask=_unvoiced_mask,
+            quiet_floor=0.05,
+            breath_floor=self.unvoiced_feature_gate_floor,
+            breath_active_margin_db=self.breath_active_margin_db,
+            transition_width_db=6.0,
+        )
         # Smooth temporally
         _sm = np.array([1, 2, 3, 2, 1], dtype=np.float32)
         _sm /= _sm.sum()
+        _feat_gate_curve = np.convolve(_feat_gate_curve, _sm, mode='same')[:_p_len_val]
+        _f0_gate_curve = np.convolve(_f0_gate_curve, _sm, mode='same')[:_p_len_val]
+        _feat_gate_curve = np.clip(_feat_gate_curve, 0.05, 1.0)
+        _f0_gate_curve = np.clip(_f0_gate_curve, 0.05, 1.0)
+        if log and _unvoiced_mask is not None:
+            _breath_frames = int(np.sum(_feat_gate_curve > (_f0_gate_curve + 1e-4)))
+            if _breath_frames > 0:
+                log.detail(
+                    f"气声保护门控: { _breath_frames }/{ _p_len_val } 帧保留更高特征底噪"
+                )
         # Apply soft gate to features
         _feat_len = feats.shape[1]
+        if len(_feat_gate_curve) > _feat_len:
+            _feat_gate = _feat_gate_curve[:_feat_len]
+        elif len(_feat_gate_curve) < _feat_len:
+            _feat_gate = np.pad(_feat_gate_curve, (0, _feat_len - len(_feat_gate_curve)), mode='constant', constant_values=1.0)
         else:
+            _feat_gate = _feat_gate_curve
         _gate_t = torch.from_numpy(_feat_gate.astype(np.float32)).to(feats.device).unsqueeze(0).unsqueeze(-1)
         feats = feats * _gate_t
         # F0 soft gating: consistently soft-attenuate both pitch confidence and pitch value
         if pitch is not None and pitchf is not None:
             _pitch_len = pitch.shape[1]
+            if len(_f0_gate_curve) > _pitch_len:
+                _f0_gate = _f0_gate_curve[:_pitch_len]
+            elif len(_f0_gate_curve) < _pitch_len:
+                _f0_gate = np.pad(_f0_gate_curve, (0, _pitch_len - len(_f0_gate_curve)), mode='constant', constant_values=1.0)
             else:
+                _f0_gate = _f0_gate_curve
             _f0_gate_t = torch.from_numpy(_f0_gate.astype(np.float32)).to(pitch.device).unsqueeze(0)
             pitchf = pitchf * _f0_gate_t
             # Soft-blend pitch toward silence bin (1) instead of hard switch
                 )
             if log:
                 log.detail(f"分段数量: {len(opt_ts) + 1}")
+                if opt_ts:
+                    boundary_times = [round(float(t) / 16000.0, 3) for t in opt_ts[:12]]
+                    log.detail(f"分段边界(秒): {boundary_times}")
         else:
             if log:
                 if self.disable_chunking:
         segment_count = len(opt_ts) + 1
         current_segment = 0
+        # Keep a wider overlap than the legacy 12ms crossfade. Adjacent
+        # chunks are synthesized independently and benefit from a longer,
+        # equal-power merge to suppress tearing at the boundaries.
+        _xfade_tgt = compute_chunk_crossfade_samples(
+            tgt_sr=tgt_sr,
+            t_pad_tgt=self.t_pad_tgt,
+            segment_count=segment_count,
+        )
         def _trim_segment(raw, is_first, is_last):
             """Trim padding from vc() output, keeping crossfade overlap."""
             for seg in audio_opt[1:]:
                 xf = min(overlap, len(result), len(seg))
                 if xf > 1:
+                    theta = np.linspace(0.0, np.pi / 2.0, xf, dtype=np.float32)
+                    fade_out = np.cos(theta)
+                    fade_in = np.sin(theta)
                     blended = result[-xf:] * fade_out + seg[:xf] * fade_in
                     result = np.concatenate([result[:-xf], blended, seg[xf:]])
                 else:

infer/official_adapter.py CHANGED Viewed

@@ -16,9 +16,22 @@ from typing import Optional, Tuple
 import soundfile as sf
 from configs.config import Config as OfficialConfig
 from lib.logger import log
 def _load_app_config(root_dir: Path) -> dict:
     config_path = root_dir / "configs" / "config.json"
     if config_path.exists():
@@ -41,6 +54,17 @@ def _to_float(value, default):
         return float(default)
 def _resolve_index_path(model_path: Path, index_path: Optional[str]) -> Optional[Path]:
     """Best-effort resolve of the matching FAISS index for a model."""
     if index_path:
@@ -189,13 +213,15 @@ def separate_uvr5(
     fmt: str = "wav"
 ) -> Tuple[str, str]:
     """Run UVR5 separation and return vocals/ins paths."""
     log.progress("开始UVR5人声分离...")
     log.detail(f"输入音频: {input_audio}")
     log.detail(f"临时目录: {temp_dir}")
     log.config(f"激进度: {agg}, 输出格式: {fmt}")
     setup_official_env(Path(__file__).parent.parent)
-    from infer.modules.uvr5.modules import uvr as uvr5_run
     try:
         import ffmpeg  # noqa: F401
     except Exception as e:
@@ -246,6 +272,26 @@ def separate_uvr5(
     if not vocal_files or not ins_files:
         raise RuntimeError("UVR5 分离失败，未生成输出文件")
     log.success(f"UVR5分离完成")
     log.audio(f"人声文件: {vocal_files[-1].name}")
     log.audio(f"伴奏文件: {ins_files[-1].name}")
@@ -271,6 +317,14 @@ def convert_vocals_official(
     rms_mix_rate = float(max(0.0, min(1.0, rms_mix_rate)))
     # Official vc pipeline uses the opposite convention: 1=off, 0=strongest.
     official_rms_mix_rate = 1.0 - rms_mix_rate
     log.progress("开始官方VC人声转换...")
     log.detail(f"输入人声: {vocals_path}")
@@ -280,6 +334,14 @@ def convert_vocals_official(
         log.model(f"索引文件: {Path(index_path).name}")
     log.config(f"F0方法: {f0_method}")
     log.config(f"音调偏移: {pitch_shift} 半音")
     log.config(f"索引率: {index_rate}")
     log.config(f"滤波半径: {filter_radius}")
@@ -287,8 +349,6 @@ def convert_vocals_official(
     log.config(f"保护系数: {protect}")
     if repair_profile:
         log.config("唱歌修复: 开启")
-    root_dir = Path(__file__).parent.parent
     env_paths = setup_official_env(root_dir)
     log.detail("导入官方VC模块...")
@@ -306,7 +366,6 @@ def convert_vocals_official(
     log.detail("初始化官方配置...")
     config = OfficialConfig()
-    app_cfg = _load_app_config(root_dir)
     config.disable_chunking = bool(app_cfg.get("disable_chunking", False))
     if "cover" in app_cfg and isinstance(app_cfg["cover"], dict):
         config.disable_chunking = bool(app_cfg["cover"].get("disable_chunking", config.disable_chunking))
@@ -317,7 +376,7 @@ def convert_vocals_official(
     # Keep RMVPE extraction aligned with RVC training pitch embedding range.
     # Allowing much wider ranges (e.g. 1600Hz) often tracks higher harmonics
     # instead of the fundamental and introduces synthetic buzzing artifacts.
-    if f0_method == "rmvpe":
         if config.f0_min != 50.0 or config.f0_max != 1100.0:
             log.warning(
                 "检测到RMVPE F0范围偏离RVC训练范围，已强制使用 50-1100Hz 以避免误跟踪高次谐波"
@@ -328,7 +387,7 @@ def convert_vocals_official(
     config.f0_energy_threshold_db = _to_float(
         _get_cfg_value(app_cfg, "f0_energy_threshold_db", -50), -50
     )
-    config.f0_hybrid_mode = str(_get_cfg_value(app_cfg, "f0_hybrid_mode", "off"))
     config.crepe_pd_threshold = _to_float(
         _get_cfg_value(app_cfg, "crepe_pd_threshold", 0.1), 0.1
     )
@@ -338,6 +397,12 @@ def convert_vocals_official(
     config.crepe_replace_semitones = _to_float(
         _get_cfg_value(app_cfg, "crepe_replace_semitones", 0.0), 0.0
     )
     config.f0_stabilize = bool(_get_cfg_value(app_cfg, "f0_stabilize", False))
     config.f0_stabilize_window = int(_get_cfg_value(app_cfg, "f0_stabilize_window", 2))
     config.f0_stabilize_max_semitones = _to_float(
@@ -348,10 +413,32 @@ def convert_vocals_official(
     config.f0_rate_limit_semitones = _to_float(
         _get_cfg_value(app_cfg, "f0_rate_limit_semitones", 8.0), 8.0
     )
     if repair_profile:
         config.is_half = False
         config.f0_hybrid_mode = "fallback"
         config.f0_energy_threshold_db = -42.0
         config.f0_fallback_context_radius = 12
         config.f0_fallback_repair_gap = 6
         config.f0_fallback_post_gap = 4
@@ -365,6 +452,10 @@ def convert_vocals_official(
     log.config(f"F0范围: {config.f0_min}-{config.f0_max}Hz")
     log.config(f"RMVPE阈值: {config.rmvpe_threshold}")
     log.config(f"F0能量阈值: {config.f0_energy_threshold_db}dB")
     log.config(
         f"F0混合: {config.f0_hybrid_mode}, CREPE阈值: {config.crepe_pd_threshold}, "
         f"强制比率: {config.crepe_force_ratio}, 替换阈值(半音): {config.crepe_replace_semitones}"
@@ -402,7 +493,7 @@ def convert_vocals_official(
         vocals_path,
         pitch_shift,
         None,
-        f0_method,
         official_index or "",
         "",
         index_rate,
@@ -513,6 +604,13 @@ def convert_vocals_official_upstream(
 ) -> str:
     """Run vendored upstream official RVC in an isolated subprocess."""
     root_dir = Path(__file__).parent.parent
     env_paths = setup_upstream_official_env(root_dir)
     sid, official_index = export_model_to_official(
@@ -537,7 +635,7 @@ def convert_vocals_official_upstream(
         "--output-path",
         str(output_path),
         "--f0-method",
-        str(f0_method),
         "--pitch-shift",
         str(int(pitch_shift)),
         "--index-path",
@@ -558,6 +656,14 @@ def convert_vocals_official_upstream(
     log.detail(f"官方模型SID: {sid}")
     if official_index:
         log.detail(f"官方索引路径: {official_index}")
     log.detail(f"官方RMS混合率: {official_rms_mix_rate}")
     try:

 import soundfile as sf
 from configs.config import Config as OfficialConfig
+from infer.quality_policy import resolve_cover_f0_policy
 from lib.logger import log
+class _IsolatedArgv:
+    """Context manager that hides this process' CLI arguments while active.
+    On entry ``sys.argv`` is replaced by a list holding only its first element
+    (or an empty list when ``sys.argv`` is itself empty); on exit the saved
+    copy is put back. It wraps imports of vendored official modules so that
+    they do not parse this process' own command line.
+    """
+    def __enter__(self):
+        """Snapshot ``sys.argv``, expose only the program name, return self."""
+        self._saved_argv = sys.argv[:]
+        # Slice instead of indexing: ``sys.argv[0]`` raises IndexError when the
+        # interpreter was started without any argv entries.
+        sys.argv = sys.argv[:1]
+        return self
+    def __exit__(self, exc_type, exc, tb):
+        """Restore the snapshot; returns False so exceptions propagate."""
+        sys.argv = self._saved_argv
+        return False
 def _load_app_config(root_dir: Path) -> dict:
     config_path = root_dir / "configs" / "config.json"
     if config_path.exists():
         return float(default)
+def _get_audio_activity_stats(audio_path: Path) -> Tuple[float, float, int]:
+    """Return ``(rms, peak, nonzero)`` for the mono downmix of an audio file.
+    The file is read as float32 with one column per channel and the channels
+    are averaged. ``rms`` is computed in float64, ``peak`` is the largest
+    absolute sample, and ``nonzero`` counts samples whose magnitude exceeds
+    1e-8. A file with no samples yields ``(0.0, 0.0, 0)``.
+    """
+    samples, _sample_rate = sf.read(str(audio_path), dtype="float32", always_2d=True)
+    if samples.size == 0:
+        return 0.0, 0.0, 0
+    downmix = samples.mean(axis=1)
+    magnitude = abs(downmix)
+    mean_square = (downmix.astype("float64") ** 2).mean()
+    return (
+        float(mean_square ** 0.5),
+        float(magnitude.max()),
+        int((magnitude > 1e-8).sum()),
+    )
 def _resolve_index_path(model_path: Path, index_path: Optional[str]) -> Optional[Path]:
     """Best-effort resolve of the matching FAISS index for a model."""
     if index_path:
     fmt: str = "wav"
 ) -> Tuple[str, str]:
     """Run UVR5 separation and return vocals/ins paths."""
+    temp_dir = Path(temp_dir).resolve()
     log.progress("开始UVR5人声分离...")
     log.detail(f"输入音频: {input_audio}")
     log.detail(f"临时目录: {temp_dir}")
     log.config(f"激进度: {agg}, 输出格式: {fmt}")
     setup_official_env(Path(__file__).parent.parent)
+    with _IsolatedArgv():
+        from infer.modules.uvr5.modules import uvr as uvr5_run
     try:
         import ffmpeg  # noqa: F401
     except Exception as e:
     if not vocal_files or not ins_files:
         raise RuntimeError("UVR5 分离失败，未生成输出文件")
+    vocal_rms, vocal_peak, vocal_nonzero = _get_audio_activity_stats(vocal_files[-1])
+    ins_rms, ins_peak, ins_nonzero = _get_audio_activity_stats(ins_files[-1])
+    log.detail(
+        "UVR5输出统计: "
+        f"vocals(rms={vocal_rms:.8f}, peak={vocal_peak:.8f}, nonzero={vocal_nonzero}), "
+        f"ins(rms={ins_rms:.8f}, peak={ins_peak:.8f}, nonzero={ins_nonzero})"
+    )
+    if vocal_peak <= 1e-6 or vocal_nonzero == 0:
+        raise RuntimeError(
+            "UVR5 分离失败：人声输出为空或静音，"
+            f"模型={model_name}, vocal={vocal_files[-1]}, "
+            f"rms={vocal_rms:.8f}, peak={vocal_peak:.8f}, nonzero={vocal_nonzero}"
+        )
+    if ins_peak <= 1e-6 or ins_nonzero == 0:
+        raise RuntimeError(
+            "UVR5 分离失败：伴奏输出为空或静音，"
+            f"模型={model_name}, instrumental={ins_files[-1]}, "
+            f"rms={ins_rms:.8f}, peak={ins_peak:.8f}, nonzero={ins_nonzero}"
+        )
     log.success(f"UVR5分离完成")
     log.audio(f"人声文件: {vocal_files[-1].name}")
     log.audio(f"伴奏文件: {ins_files[-1].name}")
     rms_mix_rate = float(max(0.0, min(1.0, rms_mix_rate)))
     # Official vc pipeline uses the opposite convention: 1=off, 0=strongest.
     official_rms_mix_rate = 1.0 - rms_mix_rate
+    root_dir = Path(__file__).parent.parent
+    app_cfg = _load_app_config(root_dir)
+    f0_policy = resolve_cover_f0_policy(
+        requested_method=f0_method,
+        configured_hybrid_mode=str(_get_cfg_value(app_cfg, "f0_hybrid_mode", "off")),
+        repair_profile=repair_profile,
+    )
+    effective_f0_method = f0_policy.vc_method
     log.progress("开始官方VC人声转换...")
     log.detail(f"输入人声: {vocals_path}")
         log.model(f"索引文件: {Path(index_path).name}")
     log.config(f"F0方法: {f0_method}")
+    if effective_f0_method != str(f0_method).strip().lower():
+        log.detail(
+            "F0路由解析: "
+            f"requested={f0_policy.requested_method}, "
+            f"vc={f0_policy.vc_method}, "
+            f"hybrid_mode={f0_policy.hybrid_mode}, "
+            f"gate={f0_policy.gate_method}"
+        )
     log.config(f"音调偏移: {pitch_shift} 半音")
     log.config(f"索引率: {index_rate}")
     log.config(f"滤波半径: {filter_radius}")
     log.config(f"保护系数: {protect}")
     if repair_profile:
         log.config("唱歌修复: 开启")
     env_paths = setup_official_env(root_dir)
     log.detail("导入官方VC模块...")
     log.detail("初始化官方配置...")
     config = OfficialConfig()
     config.disable_chunking = bool(app_cfg.get("disable_chunking", False))
     if "cover" in app_cfg and isinstance(app_cfg["cover"], dict):
         config.disable_chunking = bool(app_cfg["cover"].get("disable_chunking", config.disable_chunking))
     # Keep RMVPE extraction aligned with RVC training pitch embedding range.
     # Allowing much wider ranges (e.g. 1600Hz) often tracks higher harmonics
     # instead of the fundamental and introduces synthetic buzzing artifacts.
+    if effective_f0_method == "rmvpe":
         if config.f0_min != 50.0 or config.f0_max != 1100.0:
             log.warning(
                 "检测到RMVPE F0范围偏离RVC训练范围，已强制使用 50-1100Hz 以避免误跟踪高次谐波"
     config.f0_energy_threshold_db = _to_float(
         _get_cfg_value(app_cfg, "f0_energy_threshold_db", -50), -50
     )
+    config.f0_hybrid_mode = f0_policy.hybrid_mode
     config.crepe_pd_threshold = _to_float(
         _get_cfg_value(app_cfg, "crepe_pd_threshold", 0.1), 0.1
     )
     config.crepe_replace_semitones = _to_float(
         _get_cfg_value(app_cfg, "crepe_replace_semitones", 0.0), 0.0
     )
+    config.unvoiced_feature_gate_floor = _to_float(
+        _get_cfg_value(app_cfg, "unvoiced_feature_gate_floor", 0.28), 0.28
+    )
+    config.breath_active_margin_db = _to_float(
+        _get_cfg_value(app_cfg, "breath_active_margin_db", 52.0), 52.0
+    )
     config.f0_stabilize = bool(_get_cfg_value(app_cfg, "f0_stabilize", False))
     config.f0_stabilize_window = int(_get_cfg_value(app_cfg, "f0_stabilize_window", 2))
     config.f0_stabilize_max_semitones = _to_float(
     config.f0_rate_limit_semitones = _to_float(
         _get_cfg_value(app_cfg, "f0_rate_limit_semitones", 8.0), 8.0
     )
+    if f0_policy.requested_method == "hybrid":
+        config.f0_fallback_context_radius = 12
+        config.f0_fallback_repair_gap = 6
+        config.f0_fallback_post_gap = 4
+        config.f0_fallback_use_crepe = True
+        config.f0_fallback_crepe_max_ratio = 0.006
+        config.f0_fallback_crepe_max_frames = 160
+        if not repair_profile:
+            config.f0_stabilize = True
+            config.f0_rate_limit = True
+            config.f0_stabilize_max_semitones = min(
+                5.0,
+                float(config.f0_stabilize_max_semitones),
+            )
+            config.f0_rate_limit_semitones = min(
+                7.0,
+                float(config.f0_rate_limit_semitones),
+            )
+            log.detail("Hybrid唱歌护栏已启用: F0稳定器 + F0限速")
     if repair_profile:
         config.is_half = False
         config.f0_hybrid_mode = "fallback"
         config.f0_energy_threshold_db = -42.0
+        config.unvoiced_feature_gate_floor = max(
+            0.32, float(config.unvoiced_feature_gate_floor)
+        )
         config.f0_fallback_context_radius = 12
         config.f0_fallback_repair_gap = 6
         config.f0_fallback_post_gap = 4
     log.config(f"F0范围: {config.f0_min}-{config.f0_max}Hz")
     log.config(f"RMVPE阈值: {config.rmvpe_threshold}")
     log.config(f"F0能量阈值: {config.f0_energy_threshold_db}dB")
+    log.config(
+        f"无声气声特征地板: {config.unvoiced_feature_gate_floor:.2f}, "
+        f"呼吸激活边界: ref-{config.breath_active_margin_db:.1f}dB"
+    )
     log.config(
         f"F0混合: {config.f0_hybrid_mode}, CREPE阈值: {config.crepe_pd_threshold}, "
         f"强制比率: {config.crepe_force_ratio}, 替换阈值(半音): {config.crepe_replace_semitones}"
         vocals_path,
         pitch_shift,
         None,
+        effective_f0_method,
         official_index or "",
         "",
         index_rate,
 ) -> str:
     """Run vendored upstream official RVC in an isolated subprocess."""
     root_dir = Path(__file__).parent.parent
+    app_cfg = _load_app_config(root_dir)
+    f0_policy = resolve_cover_f0_policy(
+        requested_method=f0_method,
+        configured_hybrid_mode=str(_get_cfg_value(app_cfg, "f0_hybrid_mode", "off")),
+        repair_profile=False,
+    )
+    effective_f0_method = f0_policy.vc_method
     env_paths = setup_upstream_official_env(root_dir)
     sid, official_index = export_model_to_official(
         "--output-path",
         str(output_path),
         "--f0-method",
+        str(effective_f0_method),
         "--pitch-shift",
         str(int(pitch_shift)),
         "--index-path",
     log.detail(f"官方模型SID: {sid}")
     if official_index:
         log.detail(f"官方索引路径: {official_index}")
+    if effective_f0_method != str(f0_method).strip().lower():
+        log.detail(
+            "官方F0路由解析: "
+            f"requested={f0_policy.requested_method}, "
+            f"vc={f0_policy.vc_method}, "
+            f"hybrid_mode={f0_policy.hybrid_mode}, "
+            f"gate={f0_policy.gate_method}"
+        )
     log.detail(f"官方RMS混合率: {official_rms_mix_rate}")
     try:

infer/quality_policy.py ADDED Viewed

	@@ -0,0 +1,294 @@

+from __future__ import annotations
+from dataclasses import dataclass
+import numpy as np
+# Configured hybrid-mode spellings that resolve_cover_f0_policy keeps as-is
+# for a "hybrid" request; any other configured value is replaced there by
+# "fallback". Values are compared after strip() and lower().
+FALLBACK_HYBRID_MODES = {
+    "fallback",
+    "smart",
+    "rmvpe+fallback",
+    "rmvpe_fallback",
+    "rmvpe-fallback",
+    "hybrid_fallback",
+    "hybrid-fallback",
+}
+@dataclass(frozen=True)
+class F0RoutingPolicy:
+    """Immutable record of how a requested F0 method was routed."""
+    # Method name the caller asked for, after strip() and lower().
+    requested_method: str
+    # Method name to hand to the voice-conversion call.
+    vc_method: str
+    # Hybrid mode to apply alongside vc_method.
+    hybrid_mode: str
+    # Method named for gating.
+    # NOTE(review): only logged in the visible code - confirm where it is consumed.
+    gate_method: str
+    # Human-readable explanation of the routing decision.
+    description: str
+def resolve_cover_f0_policy(
+    requested_method: str,
+    configured_hybrid_mode: str = "off",
+    repair_profile: bool = False,
+) -> F0RoutingPolicy:
+    """Translate a requested F0 method into a concrete routing policy.
+    Both string inputs are stripped and lower-cased; a falsy method defaults
+    to "rmvpe" and a falsy hybrid mode to "off".
+    Args:
+        requested_method: F0 method name chosen by the caller.
+        configured_hybrid_mode: Hybrid mode read from configuration.
+        repair_profile: When true, a "hybrid" request always uses "fallback".
+    Returns:
+        For any method other than "hybrid", a policy that passes the method
+        and configured hybrid mode straight through. For "hybrid", a policy
+        that runs RMVPE and keeps the configured hybrid mode only if it is
+        listed in FALLBACK_HYBRID_MODES and repair_profile is false.
+    """
+    method = str(requested_method or "rmvpe").strip().lower()
+    mode = str(configured_hybrid_mode or "off").strip().lower()
+    if method == "hybrid":
+        keep_configured = (not repair_profile) and mode in FALLBACK_HYBRID_MODES
+        return F0RoutingPolicy(
+            requested_method="hybrid",
+            vc_method="rmvpe",
+            hybrid_mode=mode if keep_configured else "fallback",
+            gate_method="rmvpe",
+            description="hybrid request routed to RMVPE with conservative fallback; post-gate uses RMVPE only.",
+        )
+    # Non-hybrid requests are passed through untouched.
+    return F0RoutingPolicy(
+        requested_method=method,
+        vc_method=method,
+        hybrid_mode=mode if mode else "off",
+        gate_method=method,
+        description=f"{method} uses the configured routing directly.",
+    )
+def build_conservative_crepe_fill_mask(
+    f0_rmvpe: np.ndarray,
+    f0_crepe: np.ndarray,
+    confidence: np.ndarray,
+    confidence_threshold: float,
+    max_ratio: float = 0.02,
+    max_frames: int = 320,
+    context_radius: int = 6,
+    energy_mask: np.ndarray | None = None,
+) -> np.ndarray:
+    """Select frames where a CREPE pitch may fill an RMVPE dropout.
+    A frame is selected when RMVPE reports no pitch (<= 0), CREPE reports a
+    pitch (> 0) with confidence at or above the threshold, the energy mask is
+    true, and an RMVPE-voiced frame lies within ``context_radius`` frames on
+    both the left and the right. If the selection is larger than
+    ``max_frames`` or covers more than ``max_ratio`` of all frames, nothing
+    is selected at all.
+    Args:
+        f0_rmvpe: RMVPE pitch track; flattened and read as float32.
+        f0_crepe: CREPE pitch track; flattened and read as float32.
+        confidence: Per-frame CREPE confidence.
+        confidence_threshold: Minimum confidence; negative values act as 0.
+        max_ratio: Largest allowed selected/total frame ratio.
+        max_frames: Largest allowed number of selected frames.
+        context_radius: Search distance for voiced neighbours (at least 1).
+        energy_mask: Optional per-frame boolean mask. ``None`` or an empty
+            array means every frame passes; a short mask is extended with its
+            last value and a long one is truncated.
+    Returns:
+        Boolean array whose length is the shortest of the three input tracks
+        (length 0 when any of them is empty).
+    """
+    f0_rmvpe = np.asarray(f0_rmvpe, dtype=np.float32).reshape(-1)
+    f0_crepe = np.asarray(f0_crepe, dtype=np.float32).reshape(-1)
+    confidence = np.asarray(confidence, dtype=np.float32).reshape(-1)
+    n = min(len(f0_rmvpe), len(f0_crepe), len(confidence))
+    if n == 0:
+        return np.zeros(0, dtype=bool)
+    f0_rmvpe = f0_rmvpe[:n]
+    f0_crepe = f0_crepe[:n]
+    confidence = confidence[:n]
+    threshold = max(0.0, float(confidence_threshold))
+    context_radius = max(1, int(context_radius))
+    max_ratio = max(0.0, float(max_ratio))
+    max_frames = max(0, int(max_frames))
+    if energy_mask is None:
+        energy_mask = np.ones(n, dtype=bool)
+    else:
+        energy_mask = np.asarray(energy_mask, dtype=bool).reshape(-1)
+        if energy_mask.size == 0:
+            # np.pad(mode="edge") cannot extend an empty axis and would raise
+            # ValueError; an empty mask carries no information, so treat it
+            # like a missing one.
+            energy_mask = np.ones(n, dtype=bool)
+        elif len(energy_mask) < n:
+            energy_mask = np.pad(energy_mask, (0, n - len(energy_mask)), mode="edge")
+        else:
+            energy_mask = energy_mask[:n]
+    voiced_seed = f0_rmvpe > 0
+    if not np.any(voiced_seed):
+        return np.zeros(n, dtype=bool)
+    idx = np.arange(n)
+    # Index of the nearest voiced frame at or before each position; the large
+    # negative sentinel makes the distance test fail when there is none.
+    left_seen = np.where(voiced_seed, idx, -10**9)
+    left_seen = np.maximum.accumulate(left_seen)
+    # Same for the nearest voiced frame at or after each position.
+    right_seen = np.where(voiced_seed, idx, 10**9)
+    right_seen = np.minimum.accumulate(right_seen[::-1])[::-1]
+    voiced_context = ((idx - left_seen) <= context_radius) & ((right_seen - idx) <= context_radius)
+    fill_mask = (
+        (f0_rmvpe <= 0)
+        & (f0_crepe > 0)
+        & (confidence >= threshold)
+        & energy_mask
+        & voiced_context
+    )
+    fill_count = int(np.sum(fill_mask))
+    if fill_count == 0:
+        return fill_mask
+    fill_ratio = float(fill_count) / float(n)
+    # Too many candidates: reject the whole fill rather than apply part of it.
+    if fill_count > max_frames or fill_ratio > max_ratio:
+        return np.zeros(n, dtype=bool)
+    return fill_mask
+def build_conservative_harvest_fill_mask(
+    reference_f0: np.ndarray,
+    fallback_f0: np.ndarray,
+    dropout_mask: np.ndarray,
+    max_run: int = 10,
+    local_radius: int = 4,
+    max_semitones: float = 4.0,
+    min_neighbors: int = 2,
+) -> np.ndarray:
+    """Mark fallback pitch frames that may patch short dropout runs.
+    Each contiguous run of true values in ``dropout_mask`` is examined. Runs
+    longer than ``max_run`` are ignored. For the rest, the voiced reference
+    frames within ``local_radius`` frames on either side of the run are
+    collected; when at least ``min_neighbors`` exist, a fallback frame in the
+    run is accepted if it is voiced (> 0) and lies within ``max_semitones``
+    of the median of those neighbours.
+    Returns:
+        Boolean array as long as the shortest input (length 0 if any input
+        is empty).
+    """
+    ref = np.asarray(reference_f0, dtype=np.float32).reshape(-1)
+    alt = np.asarray(fallback_f0, dtype=np.float32).reshape(-1)
+    gaps = np.asarray(dropout_mask, dtype=bool).reshape(-1)
+    n = min(ref.size, alt.size, gaps.size)
+    if n <= 0:
+        return np.zeros(0, dtype=bool)
+    ref, alt, gaps = ref[:n], alt[:n], gaps[:n]
+    accepted = np.zeros(n, dtype=bool)
+    run_cap = max(1, int(max_run))
+    radius = max(1, int(local_radius))
+    tolerance = max(0.0, float(max_semitones))
+    needed = max(1, int(min_neighbors))
+    eps = 1e-6
+    # Zero-pad both ends so runs touching the array edges still produce a
+    # rising (+1) and a falling (-1) transition.
+    transitions = np.diff(np.pad(gaps.astype(np.int8), (1, 1), mode="constant"))
+    run_starts = np.flatnonzero(transitions == 1)
+    run_ends = np.flatnonzero(transitions == -1)
+    for begin, stop in zip(run_starts, run_ends):
+        length = stop - begin
+        if not (0 < length <= run_cap):
+            continue
+        candidate = alt[begin:stop]
+        voiced = candidate > 0
+        if not np.any(voiced):
+            continue
+        before = ref[max(0, begin - radius) : begin]
+        after = ref[stop : min(n, stop + radius)]
+        anchors = np.concatenate([before[before > 0], after[after > 0]])
+        if anchors.size < needed:
+            continue
+        center = float(np.median(anchors))
+        if center <= 0:
+            continue
+        # Distance from the local median, in semitones.
+        deviation = np.abs(12.0 * np.log2((candidate + eps) / (center + eps)))
+        accepted[begin:stop] = voiced & (deviation <= tolerance)
+    return accepted
+def compute_chunk_crossfade_samples(
+    tgt_sr: int,
+    t_pad_tgt: int,
+    segment_count: int,
+) -> int:
+    """Return the crossfade length, in samples, used when merging chunks.
+    The length starts at 18 ms of audio and grows by 2 ms for every segment
+    beyond the second. It is never shorter than 12 ms and never longer than
+    the larger of 12 ms and one third of ``t_pad_tgt``. Returns 0 when the
+    sample rate or padding is not positive or there is at most one segment.
+    """
+    sr = int(max(tgt_sr, 0))
+    pad = int(max(t_pad_tgt, 0))
+    segments = int(max(segment_count, 0))
+    if sr <= 0 or pad <= 0 or segments <= 1:
+        return 0
+    shortest = int(round(sr * 0.012))
+    longest = max(shortest, pad // 3)
+    wanted = int(round(sr * 0.018)) + int(round(sr * 0.002 * max(0, segments - 2)))
+    return int(np.clip(wanted, shortest, longest))
+def compute_active_source_replace(
+    activity: np.ndarray,
+    soft_mask: np.ndarray,
+    echo_ratio: np.ndarray,
+    direct_ratio: np.ndarray,
+    max_replace: float = 0.82,
+) -> np.ndarray:
+    """Compute a per-frame replacement weight clipped to ``[0, max_replace]``.
+    ``activity`` and ``direct_ratio`` are flattened to one value per frame;
+    ``soft_mask`` and ``echo_ratio`` keep their shape (1-D inputs gain a
+    leading axis) and are indexed by frame on the last axis. All inputs are
+    truncated to the shortest frame count. The weight is the sum of an idle
+    term, ``0.85 * (1 - activity) * (1 - soft_mask)``, and an active term,
+    ``0.65 * activity * echo_presence * (1 - soft_mask)``, where
+    ``echo_presence`` is ``echo_ratio * (0.35 + 0.65 * (1 - direct_ratio))``
+    clipped to ``[0, 1]``.
+    Returns:
+        float32 array. When no frames are shared, zeros shaped like the
+        (possibly expanded) ``soft_mask``.
+    """
+    activity_1d = np.asarray(activity, dtype=np.float32).reshape(-1)
+    direct_1d = np.asarray(direct_ratio, dtype=np.float32).reshape(-1)
+    mask = np.asarray(soft_mask, dtype=np.float32)
+    echo = np.asarray(echo_ratio, dtype=np.float32)
+    if mask.ndim == 1:
+        mask = mask[np.newaxis, :]
+    if echo.ndim == 1:
+        echo = echo[np.newaxis, :]
+    frames = min(mask.shape[-1], echo.shape[-1], len(activity_1d), len(direct_1d))
+    if frames <= 0:
+        return np.zeros_like(mask, dtype=np.float32)
+    mask = mask[..., :frames]
+    echo = echo[..., :frames]
+    # Row vectors broadcast against the leading axes of mask/echo.
+    activity_row = activity_1d[:frames][np.newaxis, :]
+    direct_row = direct_1d[:frames][np.newaxis, :]
+    idle_term = 0.85 * (1.0 - activity_row) * (1.0 - mask)
+    echo_presence = np.clip(
+        echo * (0.35 + 0.65 * (1.0 - direct_row)),
+        0.0,
+        1.0,
+    )
+    active_term = 0.65 * activity_row * echo_presence * (1.0 - mask)
+    combined = np.clip(idle_term + active_term, 0.0, float(max_replace))
+    return combined.astype(np.float32)
+def compute_source_cleanup_budget(
+    energy_guard: np.ndarray,
+    phrase_activity: np.ndarray,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Map two ``[0, 1]`` control signals to cleanup limits.
+    Both inputs are clipped to ``[0, 1]`` first.
+    Returns:
+        ``(allowed_boost, cleanup_floor)`` as float32 arrays, where
+        ``allowed_boost = 0.35 + 1.00 * energy_guard`` and
+        ``cleanup_floor = 0.62 + 0.16 * phrase_activity``.
+    """
+    guard = np.asarray(energy_guard, dtype=np.float32)
+    guard = np.clip(guard, 0.0, 1.0)
+    phrase = np.asarray(phrase_activity, dtype=np.float32)
+    phrase = np.clip(phrase, 0.0, 1.0)
+    boost = (0.35 + 1.00 * guard).astype(np.float32)
+    floor = (0.62 + 0.16 * phrase).astype(np.float32)
+    return boost, floor
+def compute_breath_preserving_energy_gates(
+    energy_db: np.ndarray,
+    ref_db: float,
+    unvoiced_mask: np.ndarray | None,
+    quiet_floor: float = 0.05,
+    breath_floor: float = 0.28,
+    breath_active_margin_db: float = 52.0,
+    transition_width_db: float = 6.0,
+) -> tuple[np.ndarray, np.ndarray]:
+    """Build energy gates that keep a raised floor on unvoiced frames.
+    The base gate is a logistic curve of ``energy_db`` centred 45 dB below
+    ``ref_db`` with slope ``transition_width_db / 4``, clipped to
+    ``[quiet_floor, 1]``. The feature gate equals the base gate except on
+    frames flagged in ``unvoiced_mask``, where it is raised to at least a
+    floor that ramps linearly from ``quiet_floor`` to ``breath_floor`` over
+    the 10 dB above ``ref_db - breath_active_margin_db``.
+    Args:
+        energy_db: Per-frame energy in dB; flattened and read as float32.
+        ref_db: Reference level in dB.
+        unvoiced_mask: Per-frame boolean mask, or ``None``. ``None`` and an
+            empty mask both mean no frame is raised; a short mask is extended
+            with its last value and a long one is truncated.
+        quiet_floor: Lower bound of both gates, clipped to ``[0, 1]``.
+        breath_floor: Upper end of the unvoiced floor; never below
+            ``quiet_floor`` and never above 1.
+        breath_active_margin_db: Distance below ``ref_db`` where the unvoiced
+            floor starts to rise (at least 1).
+        transition_width_db: Width control of the logistic (at least 0.5).
+    Returns:
+        ``(feature_gate, base_gate)`` as independent float32 arrays with one
+        value per frame.
+    """
+    energy_db = np.asarray(energy_db, dtype=np.float32).reshape(-1)
+    if energy_db.size == 0:
+        # Two separate arrays so callers cannot alias one through the other.
+        return np.zeros(0, dtype=np.float32), np.zeros(0, dtype=np.float32)
+    quiet_floor = float(np.clip(quiet_floor, 0.0, 1.0))
+    breath_floor = float(np.clip(max(breath_floor, quiet_floor), quiet_floor, 1.0))
+    breath_active_margin_db = float(max(1.0, breath_active_margin_db))
+    transition_width_db = float(max(0.5, transition_width_db))
+    ref_db = float(ref_db)
+    silence_center = ref_db - 45.0
+    slope = transition_width_db / 4.0
+    base_gate = 1.0 / (1.0 + np.exp(-((energy_db - silence_center) / slope)))
+    base_gate = np.clip(base_gate, quiet_floor, 1.0).astype(np.float32)
+    if unvoiced_mask is None:
+        return base_gate, base_gate.copy()
+    unvoiced_mask = np.asarray(unvoiced_mask, dtype=bool).reshape(-1)
+    if unvoiced_mask.size == 0:
+        # np.pad(mode="edge") cannot extend an empty axis and would raise
+        # ValueError; an empty mask flags no frame, so nothing is raised.
+        return base_gate, base_gate.copy()
+    if len(unvoiced_mask) < len(base_gate):
+        unvoiced_mask = np.pad(unvoiced_mask, (0, len(base_gate) - len(unvoiced_mask)), mode="edge")
+    else:
+        unvoiced_mask = unvoiced_mask[: len(base_gate)]
+    # 0 at or below (ref - margin), 1 from 10 dB above that level.
+    breath_activity = np.clip(
+        (energy_db - (ref_db - breath_active_margin_db)) / 10.0,
+        0.0,
+        1.0,
+    ).astype(np.float32)
+    feature_floor = quiet_floor + (breath_floor - quiet_floor) * breath_activity
+    feature_gate = base_gate.copy()
+    feature_gate[unvoiced_mask] = np.maximum(feature_gate[unvoiced_mask], feature_floor[unvoiced_mask])
+    feature_gate = np.clip(feature_gate, quiet_floor, 1.0).astype(np.float32)
+    return feature_gate, base_gate.copy()

infer/separator.py CHANGED Viewed

@@ -3,13 +3,13 @@
 人声分离模块 - 支持 Demucs 和 Mel-Band Roformer (audio-separator)
 """
 import os
-import gc
-import shutil
-import torch
-import numpy as np
-import soundfile as sf
-from pathlib import Path
-from typing import Tuple, Optional, Callable
 from lib.logger import log
 from lib.device import get_device, empty_device_cache
@@ -34,12 +34,82 @@ except ImportError:
     AUDIO_SEPARATOR_AVAILABLE = False
-# Mel-Band Roformer 默认模型
-ROFORMER_DEFAULT_MODEL = "vocals_mel_band_roformer.ckpt"
-KARAOKE_DEFAULT_MODEL = "mel_band_roformer_karaoke_gabox.ckpt"
-KARAOKE_FALLBACK_MODELS = [
     "mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt",
 ]
 def _resolve_output_files(output_files, output_dir: Path) -> list[str]:
@@ -49,31 +119,59 @@ def _resolve_output_files(output_files, output_dir: Path) -> list[str]:
         file_path = Path(file_name)
         if not file_path.is_absolute():
             file_path = output_dir / file_path
         resolved_files.append(str(file_path))
     return resolved_files
-def _safe_move(src_path: str, dst_path: str) -> None:
-    """Move file with overwrite."""
-    if src_path == dst_path:
-        return
     dst = Path(dst_path)
     if dst.exists():
         dst.unlink()
-    shutil.move(src_path, dst_path)
-def _get_audio_activity_stats(audio_path: str) -> tuple[float, float, int]:
-    """Return simple activity stats for validating separator outputs."""
-    audio, _ = sf.read(audio_path, dtype="float32", always_2d=True)
-    if audio.size == 0:
-        return 0.0, 0.0, 0
-    mono = np.mean(audio, axis=1, dtype=np.float32)
-    rms = float(np.sqrt(np.mean(np.square(mono), dtype=np.float64) + 1e-12))
-    peak = float(np.max(np.abs(mono)))
-    nonzero = int(np.count_nonzero(np.abs(mono) > 1e-6))
-    return rms, peak, nonzero
 class RoformerSeparator:
@@ -81,19 +179,21 @@ class RoformerSeparator:
     def __init__(
         self,
-        model_filename: str = ROFORMER_DEFAULT_MODEL,
         device: str = "cuda",
     ):
         if not AUDIO_SEPARATOR_AVAILABLE:
             raise ImportError(
                 "请安装 audio-separator: pip install audio-separator[gpu]"
-            )
         self.model_filename = model_filename
         self.device = str(get_device(device))
         self.separator = None
     def load_model(self, output_dir: str = ""):
-        """加载 Roformer 模型"""
         model_dir = str(
             Path(__file__).parent.parent / "assets" / "separator_models"
         )
@@ -113,16 +213,23 @@ class RoformerSeparator:
             self.separator = None
             gc.collect()
-        log.info(f"正在加载 Mel-Band Roformer 模型: {self.model_filename}")
-        self.separator = Separator(
-            log_level=_logging.WARNING,
             output_dir=target_dir,
-            model_file_dir=model_dir,
         )
         self._init_output_dir = target_dir
-        self.separator.load_model(self.model_filename)
-        log.info("Mel-Band Roformer 模型已加载")
     def separate(
         self,
@@ -142,14 +249,12 @@ class RoformerSeparator:
         if progress_callback:
             progress_callback("正在加载 Roformer 模型...", 0.1)
-        self.load_model(output_dir=str(output_path))
         # audio-separator 需要 output_dir 在实例上设置
         self.separator.output_dir = str(output_path)
-        if progress_callback:
-            progress_callback("正在使用 Mel-Band Roformer 分离人声...", 0.3)
         output_files = self.separator.separate(audio_path)
         # audio-separator 返回的可能是纯文件名，需要拼上 output_dir
@@ -160,7 +265,7 @@ class RoformerSeparator:
                 p = output_path / p
             resolved_files.append(str(p))
-        # Fallback: if resolved files don't exist, search the output dir
         # for freshly created files. This handles cases where audio-separator
         # writes to a slightly different path (e.g. after output_dir update
         # on a reused Separator instance).
@@ -233,6 +338,7 @@ class RoformerSeparator:
         if self.separator is not None:
             del self.separator
             self.separator = None
         gc.collect()
         empty_device_cache()
@@ -242,7 +348,7 @@ class KaraokeSeparator:
     def __init__(
         self,
-        model_filename: str = KARAOKE_DEFAULT_MODEL,
         device: str = "cuda",
     ):
         if not AUDIO_SEPARATOR_AVAILABLE:
@@ -252,15 +358,11 @@ class KaraokeSeparator:
         self.device = str(get_device(device))
         self.separator = None
         self.active_model = None
-        models = [model_filename]
-        for fallback in KARAOKE_FALLBACK_MODELS:
-            if fallback not in models:
-                models.append(fallback)
-        self.model_candidates = models
     def load_model(self, output_dir: str = ""):
-        """加载 Karaoke 模型（主模型失败时自动回退）"""
         model_dir = str(Path(__file__).parent.parent / "assets" / "separator_models")
         Path(model_dir).mkdir(parents=True, exist_ok=True)
@@ -277,26 +379,23 @@ class KaraokeSeparator:
             self.active_model = None
             gc.collect()
-        last_error = None
-        for model_name in self.model_candidates:
-            try:
-                log.info(f"正在加载 Karaoke 模型: {model_name}")
-                separator = Separator(
-                    log_level=_logging.WARNING,
-                    output_dir=target_dir,
-                    model_file_dir=model_dir,
-                )
-                separator.load_model(model_name)
-                self.separator = separator
-                self._init_output_dir = target_dir
-                self.active_model = model_name
-                log.info("Karaoke 模型已加载")
-                return
-            except Exception as exc:
-                last_error = exc
-                log.warning(f"Karaoke 模型加载失败: {model_name} ({exc})")
-        raise RuntimeError(f"无法加载 Karaoke 模型: {last_error}")
     @staticmethod
     def _classify_stem(file_name: str) -> Optional[str]:
@@ -332,36 +431,37 @@ class KaraokeSeparator:
             return "backing"
         return None
-    def separate(self, audio_path: str, output_dir: str) -> Tuple[str, str]:
-        """
-        分离主唱和和声
-        Returns:
             Tuple[lead_vocals_path, backing_vocals_path]
         """
         output_path = Path(output_dir)
         output_path.mkdir(parents=True, exist_ok=True)
-        self.load_model(output_dir=str(output_path))
-        self.separator.output_dir = str(output_path)
-        output_files = self.separator.separate(audio_path)
-        resolved_files = _resolve_output_files(output_files, output_path)
-        log.detail(
-            f"Karaoke分离器输出文件: {[Path(file_path).name for file_path in resolved_files]}"
-        )
-        lead_vocals_path = None
-        backing_vocals_path = None
-        for file_path in resolved_files:
-            stem_role = self._classify_stem(Path(file_path).name)
-            log.detail(
-                f"  {Path(file_path).name} -> 分类为: {stem_role or 'unknown'}"
-            )
-            if stem_role == "lead" and lead_vocals_path is None:
-                lead_vocals_path = file_path
-            elif stem_role == "backing" and backing_vocals_path is None:
-                backing_vocals_path = file_path
         if lead_vocals_path is None and resolved_files:
             lead_vocals_path = resolved_files[0]
         if backing_vocals_path is None:
@@ -374,30 +474,30 @@ class KaraokeSeparator:
             raise FileNotFoundError(
                 f"Karaoke主唱轨未找到，输出文件: {[Path(p).name for p in resolved_files]}"
             )
-        if not backing_vocals_path or not Path(backing_vocals_path).exists():
-            raise FileNotFoundError(
-                f"Karaoke和声轨未找到，输出文件: {[Path(p).name for p in resolved_files]}"
-            )
-        lead_rms, lead_peak, lead_nonzero = _get_audio_activity_stats(lead_vocals_path)
-        backing_rms, backing_peak, backing_nonzero = _get_audio_activity_stats(backing_vocals_path)
-        log.detail(
-            "Karaoke输出能量检测: "
-            f"lead_rms={lead_rms:.6f}, lead_peak={lead_peak:.6f}, lead_nonzero={lead_nonzero}; "
-            f"backing_rms={backing_rms:.6f}, backing_peak={backing_peak:.6f}, backing_nonzero={backing_nonzero}"
-        )
-        lead_is_nearly_silent = lead_nonzero == 0 or (lead_rms < 1e-5 and lead_peak < 1e-4)
-        backing_has_content = backing_nonzero > 0 and (backing_rms >= 5e-5 or backing_peak >= 5e-4)
-        if lead_is_nearly_silent and backing_has_content:
-            log.warning("Karaoke主唱轨几乎静音，检测到输出疑似反转，已自动交换主唱/和声")
-            lead_vocals_path, backing_vocals_path = backing_vocals_path, lead_vocals_path
-        final_lead = str(output_path / "lead_vocals.wav")
-        final_backing = str(output_path / "backing_vocals.wav")
-        _safe_move(lead_vocals_path, final_lead)
-        _safe_move(backing_vocals_path, final_backing)
         return final_lead, final_backing
     def unload_model(self):
@@ -410,6 +510,132 @@ class KaraokeSeparator:
         empty_device_cache()
 class VocalSeparator:
     """人声分离器 - 基于 Demucs"""
@@ -575,7 +801,7 @@ def get_available_models() -> list:
     if AUDIO_SEPARATOR_AVAILABLE:
         models.append({
             "name": "roformer",
-            "description": "Mel-Band Roformer (Kimberley Jensen) - 高质量人声分离"
         })
     if DEMUCS_AVAILABLE:
         models.extend([

 人声分离模块 - 支持 Demucs 和 Mel-Band Roformer (audio-separator)
 """
 import os
+import gc
+import shutil
+import torch
+import numpy as np
+import soundfile as sf
+from pathlib import Path
+from typing import Tuple, Optional, Callable, Union
 from lib.logger import log
 from lib.device import get_device, empty_device_cache
     AUDIO_SEPARATOR_AVAILABLE = False
+ModelSpec = Union[str, list[str], tuple[str, ...]]
+# Public scored SOTA defaults from audio-separator 0.44.1's model table.
+# Keep the cover pipeline unchanged; only the separator model choices change.
+ENSEMBLE_PRESET_PREFIX = "ensemble:"
+ROFORMER_LEGACY_SINGLE_MODEL = "vocals_mel_band_roformer.ckpt"
+ROFORMER_SOTA_PRESET = "vocal_rvc"
+ROFORMER_DEFAULT_MODEL = f"{ENSEMBLE_PRESET_PREFIX}{ROFORMER_SOTA_PRESET}"
+ROFORMER_SOTA_MODEL = ROFORMER_DEFAULT_MODEL
+ROFORMER_SOTA_MODELS = [
+    "melband_roformer_big_beta6x.ckpt",
+    "mel_band_roformer_vocals_fv4_gabox.ckpt",
+]
+KARAOKE_LEGACY_SINGLE_MODEL = "mel_band_roformer_karaoke_gabox.ckpt"
+KARAOKE_SOTA_PRESET = "karaoke"
+KARAOKE_DEFAULT_MODEL = f"{ENSEMBLE_PRESET_PREFIX}{KARAOKE_SOTA_PRESET}"
+KARAOKE_SOTA_MODEL = KARAOKE_DEFAULT_MODEL
+KARAOKE_SOTA_MODELS = [
     "mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt",
+    "mel_band_roformer_karaoke_gabox_v2.ckpt",
+    "mel_band_roformer_karaoke_becruily.ckpt",
 ]
+KARAOKE_EXPERIMENTAL_MODELS = [
+    "mel_band_roformer_karaoke_gabox_v2.ckpt",
+    "mel_band_roformer_karaoke_becruily.ckpt",
+]
+ROFORMER_DEREVERB_DEFAULT_MODEL = "dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt"
+def _model_spec_key(model_spec: ModelSpec) -> tuple[str, ...]:
+    """Return a tuple-of-strings key for a model spec.
+    A list or tuple spec yields one string per member; any other value is
+    stringified and wrapped in a one-element tuple.
+    """
+    members = model_spec if isinstance(model_spec, (list, tuple)) else (model_spec,)
+    return tuple(map(str, members))
+def _model_spec_label(model_spec: ModelSpec) -> str:
+    """Return a human-readable label for a model spec (used in log messages).
+    A list or tuple spec is rendered as ``ensemble[a, b, ...]``; any other
+    value is simply stringified.
+    """
+    if not isinstance(model_spec, (list, tuple)):
+        return str(model_spec)
+    joined_names = ", ".join(map(str, model_spec))
+    return "ensemble[" + joined_names + "]"
+def _parse_ensemble_preset(model_spec: ModelSpec) -> Optional[str]:
+    """Extract the preset name from an ``ensemble:<preset>`` string spec.
+    The prefix is matched case-insensitively after stripping surrounding
+    whitespace. Returns None for non-string specs, for strings without the
+    prefix, and for a prefix followed by nothing but whitespace.
+    """
+    if isinstance(model_spec, str):
+        candidate = model_spec.strip()
+        if candidate.lower().startswith(ENSEMBLE_PRESET_PREFIX):
+            preset_name = candidate[len(ENSEMBLE_PRESET_PREFIX):].strip()
+            if preset_name:
+                return preset_name
+    return None
+def _load_audio_separator_model(
+    *,
+    model_spec: ModelSpec,
+    output_dir: str,
+    model_dir: str,
+) -> Separator:
+    """Build a Separator for ``model_spec`` and load its model(s).
+    When the spec is an ``ensemble:<preset>`` string, the preset name is
+    passed to the Separator constructor as ``ensemble_preset`` and
+    ``load_model()`` is called without arguments. Otherwise the spec itself
+    is passed to ``load_model`` (a tuple is converted to a list first).
+    """
+    preset_name = _parse_ensemble_preset(model_spec)
+    init_options = dict(
+        log_level=_logging.WARNING,
+        output_dir=output_dir,
+        model_file_dir=model_dir,
+    )
+    if preset_name:
+        init_options["ensemble_preset"] = preset_name
+    instance = Separator(**init_options)
+    if preset_name:
+        instance.load_model()
+        return instance
+    explicit_models = list(model_spec) if isinstance(model_spec, tuple) else model_spec
+    instance.load_model(explicit_models)
+    return instance
 def _resolve_output_files(output_files, output_dir: Path) -> list[str]:
         file_path = Path(file_name)
         if not file_path.is_absolute():
             file_path = output_dir / file_path
+        if file_path.exists():
+            resolved_files.append(str(file_path))
+            continue
+        role = _classify_common_stem_role(file_path.name)
+        if role:
+            candidates = [
+                candidate
+                for candidate in output_dir.glob("*.wav")
+                if _classify_common_stem_role(candidate.name) == role
+            ]
+            if len(candidates) == 1:
+                resolved_files.append(str(candidates[0]))
+                continue
         resolved_files.append(str(file_path))
     return resolved_files
+def _classify_common_stem_role(file_name: str) -> Optional[str]:
+    """Map a separator output file name to a stem role by its parenthesised tag.
+    Matching is case-insensitive and the roles are tried in the order
+    dry, wet, backing, lead; the first role with a matching tag wins.
+    Returns None when no known tag is present.
+    """
+    lowered = file_name.lower()
+    role_markers = (
+        ("dry", ("(noreverb)", "(no_reverb)", "(no reverb)", "(dry)")),
+        ("wet", ("(reverb)", "(echo)", "(wet)")),
+        ("backing", ("(instrumental)", "(other)", "(backing)")),
+        ("lead", ("(vocals)", "(lead)", "(main_vocal)", "(main vocals)")),
+    )
+    for role, markers in role_markers:
+        for marker in markers:
+            if marker in lowered:
+                return role
+    return None
+def _safe_move(src_path: str, dst_path: str) -> None:
+    """Move ``src_path`` to ``dst_path``, replacing an existing destination file.
+    Does nothing when the two path strings are equal.
+    """
+    if src_path == dst_path:
+        return
     dst = Path(dst_path)
     if dst.exists():
+        # Remove the existing destination first so the move below replaces it.
         dst.unlink()
+    shutil.move(src_path, dst_path)
+def _get_audio_activity_stats(audio_path: str) -> tuple[float, float, int]:
+    """Compute simple activity statistics used to sanity-check separator outputs.
+    The file is read as float32 and averaged across channels before measuring.
+    Returns:
+        ``(rms, peak, nonzero)`` where ``nonzero`` counts samples whose
+        absolute value exceeds 1e-6; ``(0.0, 0.0, 0)`` for an empty file.
+    """
+    samples, _sample_rate = sf.read(audio_path, dtype="float32", always_2d=True)
+    if samples.size == 0:
+        return 0.0, 0.0, 0
+    mixdown = np.mean(samples, axis=1, dtype=np.float32)
+    magnitude = np.abs(mixdown)
+    mean_square = np.mean(np.square(mixdown), dtype=np.float64)
+    rms = float(np.sqrt(mean_square + 1e-12))
+    peak = float(np.max(magnitude))
+    nonzero = int(np.count_nonzero(magnitude > 1e-6))
+    return rms, peak, nonzero
 class RoformerSeparator:
     def __init__(
         self,
+        model_filename: ModelSpec = ROFORMER_DEFAULT_MODEL,
         device: str = "cuda",
     ):
         if not AUDIO_SEPARATOR_AVAILABLE:
             raise ImportError(
                 "请安装 audio-separator: pip install audio-separator[gpu]"
+        )
         self.model_filename = model_filename
+        self.model_candidates = [model_filename]
         self.device = str(get_device(device))
         self.separator = None
+        self.active_model = None
     def load_model(self, output_dir: str = ""):
+        """加载指定 RoFormer 模型；严格 SOTA 模式下不自动降级。"""
         model_dir = str(
             Path(__file__).parent.parent / "assets" / "separator_models"
         )
             self.separator = None
             gc.collect()
+        model_name = self.model_filename
+        log.info(
+            "正在加载公开 SOTA RoFormer 分离模型: "
+            f"{_model_spec_label(model_name)}"
+        )
+        separator = _load_audio_separator_model(
+            model_spec=model_name,
             output_dir=target_dir,
+            model_dir=model_dir,
         )
+        self.separator = separator
         self._init_output_dir = target_dir
+        self.active_model = model_name
+        log.info(
+            "RoFormer 分离模型已加载: "
+            f"{_model_spec_label(model_name)}"
+        )
     def separate(
         self,
         if progress_callback:
             progress_callback("正在加载 Roformer 模型...", 0.1)
+        if progress_callback:
+            progress_callback("正在使用 RoFormer 分离人声...", 0.3)
+        self.load_model(output_dir=str(output_path))
         # audio-separator 需要 output_dir 在实例上设置
         self.separator.output_dir = str(output_path)
         output_files = self.separator.separate(audio_path)
         # audio-separator 返回的可能是纯文件名，需要拼上 output_dir
                 p = output_path / p
             resolved_files.append(str(p))
+        # Recovery: if resolved files don't exist, search the output dir
         # for freshly created files. This handles cases where audio-separator
         # writes to a slightly different path (e.g. after output_dir update
         # on a reused Separator instance).
         if self.separator is not None:
             del self.separator
             self.separator = None
+        self.active_model = None
         gc.collect()
         empty_device_cache()
     def __init__(
         self,
+        model_filename: ModelSpec = KARAOKE_DEFAULT_MODEL,
         device: str = "cuda",
     ):
         if not AUDIO_SEPARATOR_AVAILABLE:
         self.device = str(get_device(device))
         self.separator = None
         self.active_model = None
+        self.model_filename = model_filename
+        self.model_candidates = [model_filename]
     def load_model(self, output_dir: str = ""):
+        """加载指定 Karaoke 模型；严格 SOTA 模式下不自动降级。"""
         model_dir = str(Path(__file__).parent.parent / "assets" / "separator_models")
         Path(model_dir).mkdir(parents=True, exist_ok=True)
             self.active_model = None
             gc.collect()
+        model_name = self.model_filename
+        log.info(
+            "正在加载公开 SOTA Karaoke 模型: "
+            f"{_model_spec_label(model_name)}"
+        )
+        separator = _load_audio_separator_model(
+            model_spec=model_name,
+            output_dir=target_dir,
+            model_dir=model_dir,
+        )
+        self.separator = separator
+        self._init_output_dir = target_dir
+        self.active_model = model_name
+        log.info(
+            "Karaoke 模型已加载: "
+            f"{_model_spec_label(model_name)}"
+        )
     @staticmethod
     def _classify_stem(file_name: str) -> Optional[str]:
             return "backing"
         return None
+    def separate(self, audio_path: str, output_dir: str) -> Tuple[str, str]:
+        """
+        分离主唱和和声
+        Returns:
             Tuple[lead_vocals_path, backing_vocals_path]
         """
         output_path = Path(output_dir)
         output_path.mkdir(parents=True, exist_ok=True)
+        self.load_model(output_dir=str(output_path))
+        self.separator.output_dir = str(output_path)
+        output_files = self.separator.separate(audio_path)
+        resolved_files = _resolve_output_files(output_files, output_path)
+        log.detail(
+            f"Karaoke分离器输出文件: {[Path(file_path).name for file_path in resolved_files]}"
+        )
+        lead_vocals_path = None
+        backing_vocals_path = None
+        for file_path in resolved_files:
+            stem_role = self._classify_stem(Path(file_path).name)
+            log.detail(
+                f"  {Path(file_path).name} -> 分类为: {stem_role or 'unknown'}"
+            )
+            if stem_role == "lead" and lead_vocals_path is None:
+                lead_vocals_path = file_path
+            elif stem_role == "backing" and backing_vocals_path is None:
+                backing_vocals_path = file_path
         if lead_vocals_path is None and resolved_files:
             lead_vocals_path = resolved_files[0]
         if backing_vocals_path is None:
             raise FileNotFoundError(
                 f"Karaoke主唱轨未找到，输出文件: {[Path(p).name for p in resolved_files]}"
             )
+        if not backing_vocals_path or not Path(backing_vocals_path).exists():
+            raise FileNotFoundError(
+                f"Karaoke和声轨未找到，输出文件: {[Path(p).name for p in resolved_files]}"
+            )
+        lead_rms, lead_peak, lead_nonzero = _get_audio_activity_stats(lead_vocals_path)
+        backing_rms, backing_peak, backing_nonzero = _get_audio_activity_stats(backing_vocals_path)
+        log.detail(
+            "Karaoke输出能量检测: "
+            f"lead_rms={lead_rms:.6f}, lead_peak={lead_peak:.6f}, lead_nonzero={lead_nonzero}; "
+            f"backing_rms={backing_rms:.6f}, backing_peak={backing_peak:.6f}, backing_nonzero={backing_nonzero}"
+        )
+        lead_is_nearly_silent = lead_nonzero == 0 or (lead_rms < 1e-5 and lead_peak < 1e-4)
+        backing_has_content = backing_nonzero > 0 and (backing_rms >= 5e-5 or backing_peak >= 5e-4)
+        if lead_is_nearly_silent and backing_has_content:
+            log.warning("Karaoke主唱轨几乎静音，检测到输出疑似反转，已自动交换主唱/和声")
+            lead_vocals_path, backing_vocals_path = backing_vocals_path, lead_vocals_path
+        final_lead = str(output_path / "lead_vocals.wav")
+        final_backing = str(output_path / "backing_vocals.wav")
+        _safe_move(lead_vocals_path, final_lead)
+        _safe_move(backing_vocals_path, final_backing)
         return final_lead, final_backing
     def unload_model(self):
         empty_device_cache()
+class RoformerDereverbSeparator:
+    """Learned RoFormer de-reverb / de-echo separator that outputs drier vocals for VC use."""
+    def __init__(
+        self,
+        model_filename: str = ROFORMER_DEREVERB_DEFAULT_MODEL,
+        device: str = "cuda",
+    ):
+        """Record the model choice and resolved device; no model is loaded yet.
+        Raises:
+            ImportError: if AUDIO_SEPARATOR_AVAILABLE is false.
+        """
+        if not AUDIO_SEPARATOR_AVAILABLE:
+            raise ImportError(
+                "请安装 audio-separator: pip install audio-separator[gpu]"
+            )
+        self.device = str(get_device(device))
+        self.separator = None
+        self.active_model = None
+        self.model_filename = model_filename
+        self.model_candidates = [model_filename]
+    def load_model(self, output_dir: str = ""):
+        """Load the configured de-reverb model.
+        Args:
+            output_dir: directory given to the separator; when empty, the
+                ``temp/separator`` directory two levels above this file is used.
+        An already-loaded separator is reused if it was created for the same
+        output directory; otherwise it is dropped and a new one is built.
+        """
+        model_dir = str(Path(__file__).parent.parent / "assets" / "separator_models")
+        Path(model_dir).mkdir(parents=True, exist_ok=True)
+        target_dir = output_dir or str(
+            Path(__file__).parent.parent / "temp" / "separator"
+        )
+        if self.separator is not None:
+            if getattr(self, "_init_output_dir", None) == target_dir:
+                return
+            del self.separator
+            self.separator = None
+            self.active_model = None
+            gc.collect()
+        model_name = self.model_filename
+        log.info(f"正在加载 RoFormer De-Reverb 模型: {model_name}")
+        separator = _load_audio_separator_model(
+            model_spec=model_name,
+            output_dir=target_dir,
+            model_dir=model_dir,
+        )
+        self.separator = separator
+        self._init_output_dir = target_dir
+        self.active_model = model_name
+        log.info(f"RoFormer De-Reverb 模型已加载: {model_name}")
+    @staticmethod
+    def _classify_stem(file_name: str) -> Optional[str]:
+        """Classify an output file name as ``"dry"``, ``"wet"`` or None.
+        Parenthesised tags are checked first (wet before dry, so ``(no dry)``
+        is not mistaken for a dry stem); bare substrings are the fallback.
+        """
+        lower_name = file_name.lower()
+        dry_markers = [
+            "(dry)",
+            "(noreverb)",
+            "(no_reverb)",
+            "(no reverb)",
+            "(dereverb)",
+            "(de-reverb)",
+            "(vocals)",
+            "(primary)",
+        ]
+        wet_markers = [
+            "(no dry)",
+            "(no_dry)",
+            "(reverb)",
+            "(echo)",
+            "(wet)",
+            "(secondary)",
+            "(instrumental)",
+            "(other)",
+        ]
+        for marker in wet_markers:
+            if marker in lower_name:
+                return "wet"
+        for marker in dry_markers:
+            if marker in lower_name:
+                return "dry"
+        # "no dry" contains "dry", so it must be tested before the generic
+        # dry substrings; previously this wet check could never be reached.
+        if "no dry" in lower_name or "no_dry" in lower_name:
+            return "wet"
+        if "dry" in lower_name or "noreverb" in lower_name or "vocal" in lower_name:
+            return "dry"
+        if "reverb" in lower_name or "echo" in lower_name:
+            return "wet"
+        return None
+    def separate_dry(self, audio_path: str, output_dir: str) -> str:
+        """Run the de-reverb model and return the path of the dry stem.
+        The first output classified as dry is moved to
+        ``roformer_deecho_vocals.wav`` inside ``output_dir``.
+        Raises:
+            FileNotFoundError: if no dry stem is found among the outputs.
+        """
+        output_path = Path(output_dir)
+        output_path.mkdir(parents=True, exist_ok=True)
+        self.load_model(output_dir=str(output_path))
+        self.separator.output_dir = str(output_path)
+        output_files = self.separator.separate(audio_path)
+        resolved_files = _resolve_output_files(output_files, output_path)
+        log.detail(
+            "RoFormer De-Reverb 输出文件: "
+            f"{[Path(file_path).name for file_path in resolved_files]}"
+        )
+        dry_path = None
+        for file_path in resolved_files:
+            stem_role = self._classify_stem(Path(file_path).name)
+            log.detail(
+                f"  {Path(file_path).name} -> 分类为: {stem_role or 'unknown'}"
+            )
+            if stem_role == "dry":
+                dry_path = file_path
+                break
+        if not dry_path or not Path(dry_path).exists():
+            raise FileNotFoundError(
+                f"RoFormer De-Reverb dry轨未找到，输出文件: {[Path(p).name for p in resolved_files]}"
+            )
+        final_dry = str(output_path / "roformer_deecho_vocals.wav")
+        _safe_move(dry_path, final_dry)
+        return final_dry
+    def unload_model(self):
+        """Drop the loaded separator, reset ``active_model`` and free cached memory."""
+        if self.separator is not None:
+            del self.separator
+            self.separator = None
+        self.active_model = None
+        gc.collect()
+        empty_device_cache()
 class VocalSeparator:
     """人声分离器 - 基于 Demucs"""
     if AUDIO_SEPARATOR_AVAILABLE:
         models.append({
             "name": "roformer",
+            "description": "audio-separator public scored SOTA - 最高质量人声/伴奏分离"
         })
     if DEMUCS_AVAILABLE:
         models.extend([

install.py ADDED Viewed

	@@ -0,0 +1,396 @@

+# -*- coding: utf-8 -*-
+"""
+RVC 安装脚本
+自动创建虚拟环境 → 安装依赖 → 启动应用
+用法:
+  python install.py            # 完整安装并启动
+  python install.py --check    # 仅检查依赖
+  python install.py --no-run   # 安装但不启动
+  python install.py --cpu      # 安装 CPU 版本
+"""
+import subprocess
+import sys
+import os
+from pathlib import Path
+ROOT_DIR = Path(__file__).parent
+VENV_DIR = ROOT_DIR / "venv310"
+PYTHON310_CANDIDATES = [
+    r"C:\Users\Administrator\AppData\Local\Programs\Python\Python310\python.exe",
+    r"C:\Python310\python.exe",
+    r"C:\Program Files\Python310\python.exe",
+    r"C:\Program Files (x86)\Python310\python.exe",
+]
+PACKAGES = {
+    "torch": {"import": "torch", "name": "PyTorch", "pip": "torch"},
+    "torchaudio": {"import": "torchaudio", "name": "torchaudio", "pip": "torchaudio"},
+    "gradio": {"import": "gradio", "name": "Gradio", "pip": "gradio==3.50.2"},
+    "librosa": {"import": "librosa", "name": "librosa", "pip": "librosa"},
+    "soundfile": {"import": "soundfile", "name": "soundfile", "pip": "soundfile"},
+    "av": {"import": "av", "name": "PyAV", "pip": "av"},
+    "scipy": {"import": "scipy", "name": "scipy", "pip": "scipy"},
+    "numpy": {
+        "import": "numpy",
+        "name": "numpy",
+        "pip": "numpy<2,>=1.23.0",
+        "dist": "numpy",
+        "min_version": "1.23.0",
+        "max_exclusive_version": "2.0.0",
+    },
+    "parselmouth": {"import": "parselmouth", "name": "praat-parselmouth", "pip": "praat-parselmouth"},
+    "pyworld": {"import": "pyworld", "name": "pyworld", "pip": "pyworld"},
+    "torchcrepe": {"import": "torchcrepe", "name": "torchcrepe", "pip": "torchcrepe"},
+    "faiss": {"import": "faiss", "name": "faiss-cpu", "pip": "faiss-cpu"},
+    "tqdm": {"import": "tqdm", "name": "tqdm", "pip": "tqdm"},
+    "requests": {"import": "requests", "name": "requests", "pip": "requests"},
+    "dotenv": {"import": "dotenv", "name": "python-dotenv", "pip": "python-dotenv"},
+    "colorama": {"import": "colorama", "name": "colorama", "pip": "colorama"},
+    "mcp": {"import": "mcp", "name": "mcp", "pip": "mcp"},
+    "demucs": {"import": "demucs", "name": "demucs", "pip": "demucs"},
+    "audio_separator": {
+        "import": "audio_separator",
+        "name": "audio-separator",
+        "pip": "audio-separator",
+        "dist": "audio-separator",
+        "min_version": "0.44.1",
+    },
+    "huggingface_hub": {"import": "huggingface_hub", "name": "huggingface_hub", "pip": "huggingface_hub"},
+    "pedalboard": {"import": "pedalboard", "name": "pedalboard", "pip": "pedalboard"},
+    "ffmpeg": {"import": "ffmpeg", "name": "ffmpeg-python", "pip": "ffmpeg-python"},
+    "fairseq": {"import": "fairseq", "name": "fairseq", "pip": "fairseq==0.12.2"},
+}
+# === 虚拟环境 ===
+def find_python310():
+    """Locate a Python 3.10 interpreter.
+    Tries, in order: the running interpreter, the paths listed in
+    PYTHON310_CANDIDATES, and finally the ``py -3.10`` launcher.
+    Returns the interpreter path, or None when nothing was found.
+    """
+    running_version = sys.version_info[:2]
+    if running_version == (3, 10):
+        return sys.executable
+    known_path = next(
+        (path for path in PYTHON310_CANDIDATES if os.path.isfile(path)),
+        None,
+    )
+    if known_path is not None:
+        return known_path
+    launcher_cmd = ["py", "-3.10", "-c", "import sys; print(sys.executable)"]
+    try:
+        probe = subprocess.run(
+            launcher_cmd, capture_output=True, text=True, timeout=10,
+        )
+    except (FileNotFoundError, subprocess.TimeoutExpired):
+        return None
+    if probe.returncode == 0:
+        return probe.stdout.strip()
+    return None
+def get_venv_python():
+    """Return the path of the virtual environment's Python executable as a string."""
+    relative_parts = ("Scripts", "python.exe") if os.name == "nt" else ("bin", "python")
+    return str(VENV_DIR.joinpath(*relative_parts))
+def create_venv():
+    """Create the Python 3.10 virtual environment under VENV_DIR when it is missing.
+    An existing environment is reused only if its interpreter reports a
+    3.10.x version. After a new environment is created, pip is upgraded on a
+    best-effort basis (the result of that step is not checked).
+    Returns:
+        True if a usable environment exists or was created; False if no
+        Python 3.10 interpreter was found or ``python -m venv`` failed.
+    """
+    venv_py = get_venv_python()
+    if os.path.isfile(venv_py):
+        r = subprocess.run([venv_py, "--version"], capture_output=True, text=True)
+        # Compare the parsed major.minor pair instead of searching for the
+        # substring "3.10", which also occurs in e.g. "Python 3.13.10".
+        reported = r.stdout.split()
+        version_fields = reported[-1].split(".") if reported else []
+        if r.returncode == 0 and version_fields[:2] == ["3", "10"]:
+            print(f"  [OK] 虚拟环境已存在: {VENV_DIR}")
+            return True
+    py310 = find_python310()
+    if not py310:
+        print("  [错误] 未找到 Python 3.10")
+        print("  下载: https://www.python.org/downloads/release/python-31011/")
+        return False
+    print(f"  使用 Python: {py310}")
+    print(f"  创建虚拟环境: {VENV_DIR}")
+    r = subprocess.run([py310, "-m", "venv", str(VENV_DIR)], capture_output=True, text=True)
+    if r.returncode != 0:
+        print(f"  [错误] 创建失败:\n{r.stderr}")
+        return False
+    print("  [OK] 虚拟环境创建成功")
+    print("  升级 pip ...")
+    # Best effort: the outcome of the pip upgrade is intentionally ignored.
+    subprocess.run([venv_py, "-m", "pip", "install", "--upgrade", "pip"],
+                   capture_output=True, text=True)
+    return True
+# === 依赖检查与安装 ===
+def check_package(venv_py, import_name):
+    """Return True if ``import <import_name>`` succeeds under the given interpreter."""
+    probe_cmd = [venv_py, "-c", f"import {import_name}"]
+    completed = subprocess.run(probe_cmd, capture_output=True, text=True)
+    return completed.returncode == 0
+def get_installed_version(venv_py, distribution_name):
+    """Return the installed version of a distribution in the virtual environment.
+    The lookup runs ``importlib.metadata.version`` in a subprocess of
+    ``venv_py``. Returns None when the version cannot be read.
+    """
+    lookup_script = (
+        "from importlib.metadata import PackageNotFoundError, version\n"
+        f"dist = {distribution_name!r}\n"
+        "try:\n"
+        "    print(version(dist))\n"
+        "except PackageNotFoundError:\n"
+        "    raise SystemExit(1)\n"
+    )
+    completed = subprocess.run(
+        [venv_py, "-c", lookup_script], capture_output=True, text=True,
+    )
+    if completed.returncode == 0:
+        return completed.stdout.strip() or None
+    return None
+def _version_parts(version_text):
+    """Split a dotted version string into a tuple of ints.
+    Only the leading digits of each dot-separated segment are used; a
+    segment with no leading digits counts as 0. A falsy input yields ``(0,)``.
+    """
+    numbers = []
+    for segment in str(version_text or "").split("."):
+        end = 0
+        while end < len(segment) and segment[end].isdigit():
+            end += 1
+        numbers.append(int(segment[:end] or 0))
+    return tuple(numbers)
+def _version_at_least(installed, required):
+    """Return True if ``installed`` >= ``required``, comparing numeric parts.
+    The shorter version is padded with zeros so that "1.2" equals "1.2.0".
+    """
+    have = list(_version_parts(installed))
+    need = list(_version_parts(required))
+    while len(have) < len(need):
+        have.append(0)
+    while len(need) < len(have):
+        need.append(0)
+    return have >= need
+def _version_less_than(installed, upper_bound):
+    """Return True if ``installed`` < ``upper_bound``, comparing numeric parts.
+    The shorter version is padded with zeros before the comparison.
+    """
+    current = _version_parts(installed)
+    limit = _version_parts(upper_bound)
+    length_gap = len(current) - len(limit)
+    if length_gap < 0:
+        current += (0,) * -length_gap
+    elif length_gap > 0:
+        limit += (0,) * length_gap
+    return current < limit
+def detect_cuda_version():
+    """Detect the CUDA version reported by nvidia-smi and pick a PyTorch index URL.
+    Returns the wheel index URL for the highest tier (cu126, cu124, cu121,
+    cu118) not exceeding the detected version, or None when nvidia-smi is
+    missing, times out, fails, reports no CUDA version, or reports one
+    below 11.8.
+    """
+    import re
+    wheel_tiers = (
+        ((12, 6), "https://download.pytorch.org/whl/cu126"),
+        ((12, 4), "https://download.pytorch.org/whl/cu124"),
+        ((12, 1), "https://download.pytorch.org/whl/cu121"),
+        ((11, 8), "https://download.pytorch.org/whl/cu118"),
+    )
+    try:
+        driver_query = subprocess.run(
+            ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"],
+            capture_output=True, text=True, timeout=10,
+        )
+        if driver_query.returncode != 0:
+            return None
+        # nvidia-smi is available; its plain output carries the CUDA version.
+        banner = subprocess.run(
+            ["nvidia-smi"],
+            capture_output=True, text=True, timeout=10,
+        ).stdout
+    except (FileNotFoundError, subprocess.TimeoutExpired):
+        return None
+    found = re.search(r"CUDA Version:\s*(\d+)\.(\d+)", banner)
+    if found is None:
+        return None
+    detected = (int(found.group(1)), int(found.group(2)))
+    for minimum, index_url in wheel_tiers:
+        if detected >= minimum:
+            return index_url
+    return None
+def pip_install(venv_py, package, extra="", index_url=None, no_deps=False, version_spec=""):
+    """Install one package with the virtual environment's pip.
+    Args:
+        venv_py: interpreter used to run ``-m pip install``.
+        package: distribution name (or a full requirement string).
+        extra: optional extras name, rendered as ``package[extra]``.
+        index_url: optional value for ``--index-url``.
+        no_deps: pass ``--no-deps`` to pip.
+        version_spec: optional version constraint appended to the target.
+    Returns:
+        True on success, False on failure. When pip reports
+        ``ResolutionImpossible`` the install is retried once with
+        ``--no-deps``, keeping the same extras and version constraint.
+    """
+    target = f"{package}[{extra}]{version_spec}" if extra else f"{package}{version_spec}"
+    print(f"  安装 {target} ...")
+    cmd = [venv_py, "-m", "pip", "install", target]
+    if index_url:
+        cmd.extend(["--index-url", index_url])
+    if no_deps:
+        cmd.append("--no-deps")
+    r = subprocess.run(cmd, capture_output=True, text=True)
+    if r.returncode != 0:
+        # On a dependency conflict (e.g. fairseq), fall back to --no-deps.
+        if not no_deps and "ResolutionImpossible" in r.stderr:
+            print(f"  [依赖冲突] 尝试 --no-deps 安装 {target} ...")
+            # Forward version_spec too; without it the retry would install
+            # an unconstrained version of the package.
+            return pip_install(
+                venv_py,
+                package,
+                extra=extra,
+                index_url=index_url,
+                no_deps=True,
+                version_spec=version_spec,
+            )
+        print(f"  [失败] {target}")
+        lines = r.stderr.strip().splitlines()
+        if lines:
+            print(f"  {lines[-1]}")
+        return False
+    print(f"  [完成] {target}")
+    return True
+def check_all(venv_py):
+    """Check every entry of PACKAGES in the virtual environment and print a report.
+    A package counts as missing when it cannot be imported, when its
+    version cannot be read although a bound is configured, or when the
+    version violates ``min_version`` / ``max_exclusive_version``.
+    Returns the list of PACKAGES entries that are missing or need updating.
+    """
+    print("=" * 50)
+    print("RVC 依赖检查")
+    print("=" * 50)
+    missing = []
+    for info in PACKAGES.values():
+        ok = check_package(venv_py, info["import"])
+        status = "OK" if ok else "未安装"
+        min_version = info.get("min_version")
+        max_exclusive_version = info.get("max_exclusive_version")
+        if ok and (min_version or max_exclusive_version):
+            # Describe the configured bounds once; both failure messages use it.
+            bounds = []
+            if min_version:
+                bounds.append(f">= {min_version}")
+            if max_exclusive_version:
+                bounds.append(f"< {max_exclusive_version}")
+            bounds_text = " 且 ".join(bounds)
+            installed_version = get_installed_version(
+                venv_py,
+                info.get("dist", info["pip"]),
+            )
+            if not installed_version:
+                ok = False
+                status = f"需更新 (无法读取版本，要求 {bounds_text})"
+            else:
+                within_bounds = (
+                    not min_version
+                    or _version_at_least(installed_version, min_version)
+                ) and (
+                    not max_exclusive_version
+                    or _version_less_than(installed_version, max_exclusive_version)
+                )
+                if within_bounds:
+                    status = f"OK ({installed_version})"
+                else:
+                    ok = False
+                    status = f"需更新 ({installed_version} 不满足 {bounds_text})"
+        mark = "[v]" if ok else "[x]"
+        print(f"  {mark} {info['name']:30s} {status}")
+        if not ok:
+            missing.append(info)
+    print("-" * 50)
+    if missing:
+        print(f"缺少 {len(missing)} 个依赖包")
+    else:
+        print("所有依赖已安装")
+    return missing
+def install_all(venv_py, gpu=True):
+    """Install every dependency that check_all reports as missing.
+    Args:
+        venv_py: interpreter of the virtual environment.
+        gpu: when True, a supported CUDA version must be detected, otherwise
+            the installation stops; when False, CPU wheels are used for
+            torch/torchaudio.
+    Returns:
+        True if nothing was missing or everything installed, else False.
+    """
+    missing = check_all(venv_py)
+    if not missing:
+        print("\n无需安装，所有依赖已就绪。")
+        return True
+    # Detect the CUDA version (GPU installs only).
+    cuda_index_url = detect_cuda_version() if gpu else None
+    if gpu:
+        if not cuda_index_url:
+            print("\n  [错误] 未检测到支持的 CUDA，GPU 安装停止。")
+            print("  如需 CPU 版 PyTorch，请显式使用 --cpu。")
+            return False
+        print(f"\n  检测到 CUDA，使用 PyTorch 源: {cuda_index_url}")
+    print(f"\n开始安装 {len(missing)} 个缺失的依赖...\n")
+    if gpu and cuda_index_url:
+        torch_index_url = cuda_index_url
+    else:
+        torch_index_url = "https://download.pytorch.org/whl/cpu"
+    failed = []
+    for info in missing:
+        pip_name = info["pip"]
+        if pip_name in ("torch", "torchaudio"):
+            ok = pip_install(venv_py, pip_name, index_url=torch_index_url)
+        elif pip_name == "audio-separator":
+            minimum = info.get("min_version")
+            ok = pip_install(
+                venv_py,
+                pip_name,
+                extra="gpu" if gpu else "cpu",
+                version_spec=f">={minimum}" if minimum else "",
+            )
+            if ok:
+                # audio-separator 0.31+ declares numpy>=2, while the current
+                # Gradio 3.x UI stack is pinned to numpy 1.x. The separator
+                # model table and runtime imports work with numpy 1.26, so
+                # restore the app-compatible numpy line after separator install.
+                ok = pip_install(venv_py, "numpy<2,>=1.23.0")
+        else:
+            ok = pip_install(venv_py, pip_name)
+        if not ok:
+            failed.append(info["name"])
+    print("\n" + "=" * 50)
+    if failed:
+        print(f"安装完成，{len(failed)} 个包失败: {', '.join(failed)}")
+        return False
+    print("所有依赖安装成功!")
+    return True
+def launch_app(venv_py):
+    """Run ``run.py`` from ROOT_DIR with the virtual environment's interpreter.
+    A KeyboardInterrupt during the run is caught and reported as a stop.
+    """
+    project_root = str(ROOT_DIR)
+    entry_script = str(ROOT_DIR / "run.py")
+    print(f"\n启动应用: {entry_script}")
+    print("=" * 50)
+    try:
+        subprocess.run([venv_py, entry_script], cwd=project_root)
+    except KeyboardInterrupt:
+        print("\n已停止")
+# === 主入口 ===
+def main():
+    """Command-line entry point: create the venv, install dependencies, launch the app.
+    ``--check`` only reports dependency status, ``--cpu`` selects CPU
+    packages, and ``--no-run`` skips launching after installation. The
+    process exits with status 1 when the venv or the installation fails.
+    """
+    import argparse
+    parser = argparse.ArgumentParser(description="RVC 安装脚本")
+    flag_help = (
+        ("--cpu", "安装 CPU 版本"),
+        ("--check", "仅检查依赖"),
+        ("--no-run", "安装后不启动"),
+    )
+    for flag, help_text in flag_help:
+        parser.add_argument(flag, action="store_true", help=help_text)
+    args = parser.parse_args()
+    banner = "=" * 50
+    print(banner)
+    print("RVC 安装程序")
+    print(banner)
+    # Step 1: make sure the virtual environment exists.
+    print("\n[1/3] 检查虚拟环境")
+    if not create_venv():
+        sys.exit(1)
+    venv_py = get_venv_python()
+    # Step 2: check and, unless --check was given, install dependencies.
+    print("\n[2/3] 检查依赖")
+    if args.check:
+        check_all(venv_py)
+        return
+    if not install_all(venv_py, gpu=not args.cpu):
+        print("\n部分依赖安装失败，可尝试手动安装。")
+        sys.exit(1)
+    # Step 3: launch the application unless --no-run was given.
+    if args.no_run:
+        print("\n安装完成。运行方式:")
+        print(f"  {venv_py} run.py")
+        return
+    print("\n[3/3] 启动应用")
+    launch_app(venv_py)
+if __name__ == "__main__":
+    main()

lib/audio.py CHANGED Viewed

@@ -2,10 +2,10 @@
 """
 音频处理模块 - 加载、保存和处理音频文件
 """
-import numpy as np
-import librosa
-import soundfile as sf
-from typing import Tuple, Optional
 def load_audio(path: str, sr: int = 16000) -> np.ndarray:
@@ -27,7 +27,7 @@ def load_audio(path: str, sr: int = 16000) -> np.ndarray:
     return audio.astype(np.float32)
-def save_audio(path: str, audio: np.ndarray, sr: int = 48000):
     """
     保存音频到文件
@@ -37,52 +37,52 @@ def save_audio(path: str, audio: np.ndarray, sr: int = 48000):
         sr: 采样率 (默认 48000)
     """
     # 确保音频在 [-1, 1] 范围内
-    audio = np.clip(audio, -1.0, 1.0)
-    sf.write(path, audio, sr)
-def soft_clip(
-    audio: np.ndarray,
-    threshold: float = 0.9,
-    ceiling: float = 0.99,
-) -> np.ndarray:
-    """
-    使用平滑软削波抑制峰值，尽量保留主体响度。
-    Args:
-        audio: 输入音频
-        threshold: 开始压缩的阈值
-        ceiling: 软削波上限
-    Returns:
-        np.ndarray: 处理后的音频
-    """
-    audio = np.asarray(audio, dtype=np.float32)
-    if threshold <= 0:
-        raise ValueError("threshold 必须大于 0")
-    if ceiling <= threshold:
-        raise ValueError("ceiling 必须大于 threshold")
-    result = audio.copy()
-    abs_audio = np.abs(result)
-    mask = abs_audio > threshold
-    if not np.any(mask):
-        return result
-    overshoot = (abs_audio[mask] - threshold) / (ceiling - threshold + 1e-8)
-    compressed = threshold + (ceiling - threshold) * np.tanh(overshoot)
-    result[mask] = np.sign(result[mask]) * compressed
-    return result.astype(np.float32, copy=False)
-def soft_clip_array(
-    audio: np.ndarray,
-    threshold: float = 0.9,
-    ceiling: float = 0.99,
-) -> np.ndarray:
-    """软削波数组版本，支持单声道/多声道。"""
-    return soft_clip(audio, threshold=threshold, ceiling=ceiling)
 def get_audio_info(path: str) -> dict:

 """
 音频处理模块 - 加载、保存和处理音频文件
 """
+import numpy as np
+import librosa
+import soundfile as sf
+from typing import Tuple, Optional
 def load_audio(path: str, sr: int = 16000) -> np.ndarray:
     return audio.astype(np.float32)
+def save_audio(path: str, audio: np.ndarray, sr: int = 48000):
     """
     保存音频到文件
         sr: 采样率 (默认 48000)
     """
     # 确保音频在 [-1, 1] 范围内
+    audio = np.clip(audio, -1.0, 1.0)
+    sf.write(path, audio, sr)
+def soft_clip(
+    audio: np.ndarray,
+    threshold: float = 0.9,
+    ceiling: float = 0.99,
+) -> np.ndarray:
+    """
+    使用平滑软削波抑制峰值，尽量保留主体响度。
+    Args:
+        audio: 输入音频
+        threshold: 开始压缩的阈值
+        ceiling: 软削波上限
+    Returns:
+        np.ndarray: 处理后的音频
+    """
+    audio = np.asarray(audio, dtype=np.float32)
+    if threshold <= 0:
+        raise ValueError("threshold 必须大于 0")
+    if ceiling <= threshold:
+        raise ValueError("ceiling 必须大于 threshold")
+    result = audio.copy()
+    abs_audio = np.abs(result)
+    mask = abs_audio > threshold
+    if not np.any(mask):
+        return result
+    overshoot = (abs_audio[mask] - threshold) / (ceiling - threshold + 1e-8)
+    compressed = threshold + (ceiling - threshold) * np.tanh(overshoot)
+    result[mask] = np.sign(result[mask]) * compressed
+    return result.astype(np.float32, copy=False)
+def soft_clip_array(
+    audio: np.ndarray,
+    threshold: float = 0.9,
+    ceiling: float = 0.99,
+) -> np.ndarray:
+    """Array variant of soft clipping for mono or multi-channel audio.
+    Delegates to ``soft_clip`` with the same ``threshold`` and ``ceiling``.
+    """
+    return soft_clip(audio, threshold=threshold, ceiling=ceiling)
 def get_audio_info(path: str) -> dict:

lib/audio_metrics.py ADDED Viewed

	@@ -0,0 +1,93 @@

+# -*- coding: utf-8 -*-
+"""Reference-based audio metrics for separation/cover evaluation."""
+from __future__ import annotations
+from typing import Mapping
+import numpy as np
+EPS = 1e-10
+def _as_mono_float(audio: np.ndarray) -> np.ndarray:
+    arr = np.asarray(audio, dtype=np.float64)
+    if arr.ndim == 2:
+        arr = np.mean(arr, axis=1)
+    return arr.reshape(-1)
+def _align_pair(reference: np.ndarray, estimate: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
+    ref = _as_mono_float(reference)
+    est = _as_mono_float(estimate)
+    n = min(ref.size, est.size)
+    if n <= 0:
+        raise ValueError("Audio metric received empty reference or estimate.")
+    return ref[:n], est[:n]
+def _power(audio: np.ndarray) -> float:
+    arr = np.asarray(audio, dtype=np.float64).reshape(-1)
+    return float(np.sum(arr * arr))
+def _db_ratio(signal_power: float, noise_power: float) -> float:
+    signal_power = max(float(signal_power), EPS)
+    noise_power = max(float(noise_power), EPS)
+    return float(10.0 * np.log10(signal_power / noise_power))
+def signal_distortion_ratio(reference: np.ndarray, estimate: np.ndarray) -> float:
+    """Scale-dependent SDR: 10 log10(||s||^2 / ||s - shat||^2)."""
+    ref, est = _align_pair(reference, estimate)
+    return _db_ratio(_power(ref), _power(ref - est))
+def scale_invariant_signal_distortion_ratio(reference: np.ndarray, estimate: np.ndarray) -> float:
+    """SI-SDR as used by modern source-separation literature."""
+    ref, est = _align_pair(reference, estimate)
+    ref = ref - float(np.mean(ref))
+    est = est - float(np.mean(est))
+    ref_power = _power(ref)
+    if ref_power <= EPS:
+        raise ValueError("SI-SDR reference is silent.")
+    scale = float(np.dot(est, ref) / (ref_power + EPS))
+    target = scale * ref
+    residual = est - target
+    return _db_ratio(_power(target), _power(residual))
+def signal_to_noise_ratio(reference: np.ndarray, estimate: np.ndarray) -> float:
+    """Alias for scale-dependent reconstruction SNR.
+    Returns exactly what ``signal_distortion_ratio`` returns for the same inputs.
+    """
+    return signal_distortion_ratio(reference, estimate)
+def evaluate_reference_stems(
+    references: Mapping[str, np.ndarray],
+    estimates: Mapping[str, np.ndarray],
+) -> dict:
+    """Compute true reference-based metrics for matching stems.
+    The caller must provide time-aligned reference stems. Without references,
+    SI-SDR/SDR cannot be interpreted as source-separation quality.
+    """
+    stem_metrics: dict[str, dict[str, float]] = {}
+    for stem_name, reference_audio in references.items():
+        if stem_name not in estimates:
+            raise KeyError(f"Missing estimated stem for reference: {stem_name}")
+        estimate_audio = estimates[stem_name]
+        stem_metrics[stem_name] = {
+            "si_sdr": scale_invariant_signal_distortion_ratio(reference_audio, estimate_audio),
+            "sdr": signal_distortion_ratio(reference_audio, estimate_audio),
+            "snr": signal_to_noise_ratio(reference_audio, estimate_audio),
+        }
+    if not stem_metrics:
+        raise ValueError("No reference stems were provided.")
+    return {
+        "mean_si_sdr": float(np.mean([metrics["si_sdr"] for metrics in stem_metrics.values()])),
+        "mean_sdr": float(np.mean([metrics["sdr"] for metrics in stem_metrics.values()])),
+        "mean_snr": float(np.mean([metrics["snr"] for metrics in stem_metrics.values()])),
+        "stems": stem_metrics,
+    }

lib/ffmpeg_runtime.py ADDED Viewed

	@@ -0,0 +1,130 @@

+from __future__ import annotations
+import os
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+from typing import MutableMapping, Optional
+ROOT_DIR = Path(__file__).resolve().parent.parent
+def get_runtime_root(root_dir: Path | str | None = None) -> Path:
+    if root_dir is not None:
+        return Path(root_dir).expanduser()
+    if getattr(sys, "frozen", False):
+        return Path(sys.executable).resolve().parent
+    return ROOT_DIR
+def _normalize_path_key(path: Path | str) -> str:
+    return os.path.normcase(os.path.normpath(str(path)))
+def _iter_ffmpeg_bin_candidates(root_dir: Path) -> list[Path]:
+    candidates = [
+        root_dir / "tools" / "ffmpeg" / "bin",
+        root_dir / "tools" / "ffmpeg",
+    ]
+    meipass = getattr(sys, "_MEIPASS", None)
+    if meipass:
+        meipass_root = Path(meipass)
+        candidates.extend(
+            [
+                meipass_root / "tools" / "ffmpeg" / "bin",
+                meipass_root / "tools" / "ffmpeg",
+            ]
+        )
+    try:
+        import imageio_ffmpeg
+        candidates.append(Path(imageio_ffmpeg.get_ffmpeg_exe()).resolve().parent)
+    except Exception:
+        pass
+    unique: list[Path] = []
+    seen: set[str] = set()
+    for candidate in candidates:
+        key = _normalize_path_key(candidate)
+        if key not in seen:
+            unique.append(candidate)
+            seen.add(key)
+    return unique
+def get_ffmpeg_bin_dir(root_dir: Path | str | None = None) -> Optional[Path]:
+    runtime_root = get_runtime_root(root_dir)
+    ffmpeg_name = "ffmpeg.exe" if os.name == "nt" else "ffmpeg"
+    for candidate in _iter_ffmpeg_bin_candidates(runtime_root):
+        if (candidate / ffmpeg_name).exists():
+            return candidate
+    found = shutil.which("ffmpeg")
+    if found:
+        return Path(found).resolve().parent
+    return None
+def _check_executable_runs(executable: Path, label: str) -> None:
+    try:
+        result = subprocess.run(
+            [str(executable), "-version"],
+            capture_output=True,
+            text=True,
+            timeout=10,
+        )
+    except Exception as exc:
+        raise RuntimeError(f"{label} 无法启动: {executable} ({exc})") from exc
+    if result.returncode != 0:
+        details = "\n".join(
+            part.strip()
+            for part in (result.stdout, result.stderr)
+            if part and part.strip()
+        )
+        if not details:
+            details = f"退出码: {result.returncode}"
+        raise RuntimeError(f"{label} 无法正常运行: {executable}\n{details}")
+def configure_ffmpeg_runtime(
+    root_dir: Path | str | None = None,
+    env: MutableMapping[str, str] | None = None,
+) -> Optional[Path]:
+    env = os.environ if env is None else env
+    bin_dir = get_ffmpeg_bin_dir(root_dir=root_dir)
+    if bin_dir is None:
+        return None
+    bin_dir_str = str(bin_dir)
+    current_path = env.get("PATH", "")
+    path_entries = [entry for entry in current_path.split(os.pathsep) if entry]
+    normalized_entries = {_normalize_path_key(entry) for entry in path_entries}
+    if _normalize_path_key(bin_dir_str) not in normalized_entries:
+        env["PATH"] = bin_dir_str if not current_path else bin_dir_str + os.pathsep + current_path
+    ffmpeg_name = "ffmpeg.exe" if os.name == "nt" else "ffmpeg"
+    ffmpeg_path = bin_dir / ffmpeg_name
+    if ffmpeg_path.exists():
+        _check_executable_runs(ffmpeg_path, "ffmpeg")
+        env.setdefault("FFMPEG_BINARY", str(ffmpeg_path))
+        env.setdefault("IMAGEIO_FFMPEG_EXE", str(ffmpeg_path))
+    ffprobe_name = "ffprobe.exe" if os.name == "nt" else "ffprobe"
+    ffprobe_path = bin_dir / ffprobe_name
+    if ffprobe_path.exists():
+        _check_executable_runs(ffprobe_path, "ffprobe")
+        env.setdefault("FFPROBE_BINARY", str(ffprobe_path))
+    else:
+        resolved_ffprobe = shutil.which("ffprobe", path=env.get("PATH", ""))
+        if resolved_ffprobe is None:
+            raise RuntimeError("未找到 ffprobe。请安装 ffmpeg/ffprobe，或将 ffprobe 放入 tools/ffmpeg/bin。")
+        _check_executable_runs(Path(resolved_ffprobe), "ffprobe")
+        env.setdefault("FFPROBE_BINARY", str(Path(resolved_ffprobe)))
+    return bin_dir

lib/logger.py CHANGED Viewed

@@ -1,254 +1,254 @@
-# -*- coding: utf-8 -*-
-"""
-日志工具模块 - 支持时间戳和颜色输出
-"""
-import sys
-import logging
-from datetime import datetime
-try:
-    from colorama import init, Fore, Style, Back
-    init(autoreset=True)  # 初始化 colorama (Windows 兼容), autoreset确保每行重置
-    COLORAMA_AVAILABLE = True
-except ImportError:
-    COLORAMA_AVAILABLE = False
-    # 定义空的占位符
-    class Fore:
-        LIGHTBLACK_EX = GREEN = YELLOW = RED = CYAN = BLUE = MAGENTA = WHITE = LIGHTGREEN_EX = LIGHTCYAN_EX = LIGHTYELLOW_EX = LIGHTMAGENTA_EX = ""
-    class Style:
-        RESET_ALL = BRIGHT = DIM = ""
-    class Back:
-        pass
-class Logger:
-    """统一日志工具"""
-    SAFE_CHAR_MAP = {
-        "✓": "[OK] ",
-        "✗": "[X] ",
-        "→": "->",
-        "◆": "*",
-    }
-    COLORS = {
-        "DEBUG": Fore.LIGHTBLACK_EX,
-        "INFO": Fore.GREEN,
-        "SUCCESS": Fore.LIGHTGREEN_EX,
-        "WARNING": Fore.YELLOW,
-        "ERROR": Fore.RED,
-        "STEP": Fore.CYAN,
-        "DETAIL": Fore.LIGHTCYAN_EX,
-        "PROGRESS": Fore.MAGENTA,
-        "MODEL": Fore.LIGHTMAGENTA_EX,
-        "AUDIO": Fore.BLUE,
-        "CONFIG": Fore.LIGHTYELLOW_EX,
-    }
-    RESET = Style.RESET_ALL
-    BRIGHT = Style.BRIGHT
-    DIM = Style.DIM
-    # 详细日志开关
-    verbose = True
-    @staticmethod
-    def _sanitize_console_text(text: str) -> str:
-        """将不兼容当前终端编码的字符替换为安全文本。"""
-        sanitized = text
-        for src, dst in Logger.SAFE_CHAR_MAP.items():
-            sanitized = sanitized.replace(src, dst)
-        return sanitized
-    @staticmethod
-    def _emit(text: str):
-        """安全输出到终端，避免 Windows/GBK 控制台因 Unicode 崩溃。"""
-        try:
-            print(text, flush=True)
-            return
-        except UnicodeEncodeError:
-            pass
-        fallback = Logger._sanitize_console_text(text)
-        encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
-        try:
-            print(
-                fallback.encode(encoding, errors="replace").decode(encoding),
-                flush=True,
-            )
-        except Exception:
-            print(
-                fallback.encode("ascii", errors="replace").decode("ascii"),
-                flush=True,
-            )
-    @staticmethod
-    def _log(level: str, msg: str, force_print: bool = True):
-        """内部日志方法"""
-        timestamp = datetime.now().strftime("%H:%M:%S")
-        color = Logger.COLORS.get(level, "")
-        reset = Logger.RESET
-        # 根据级别决定前缀
-        if level in ("INFO", "STEP", "SUCCESS"):
-            prefix = ""
-        elif level == "DETAIL":
-            prefix = "  → "
-        elif level == "PROGRESS":
-            prefix = "  ◆ "
-        elif level == "MODEL":
-            prefix = "[模型] "
-        elif level == "AUDIO":
-            prefix = "[音频] "
-        elif level == "CONFIG":
-            prefix = "[配置] "
-        else:
-            prefix = f"[{level}] "
-        output = f"{color}[{timestamp}]{prefix}{msg}{reset}"
-        Logger._emit(output)
-    @staticmethod
-    def debug(msg: str):
-        """调试日志 (灰色) - 仅在verbose模式下显示"""
-        if Logger.verbose:
-            Logger._log("DEBUG", msg)
-    @staticmethod
-    def info(msg: str):
-        """信息日志 (绿色)"""
-        Logger._log("INFO", msg)
-    @staticmethod
-    def success(msg: str):
-        """成功日志 (亮绿色)"""
-        Logger._log("SUCCESS", f"✓ {msg}")
-    @staticmethod
-    def warning(msg: str):
-        """警告日志 (黄色)"""
-        Logger._log("WARNING", msg)
-    @staticmethod
-    def error(msg: str):
-        """错误日志 (红色)"""
-        Logger._log("ERROR", msg)
-    @staticmethod
-    def step(current: int, total: int, msg: str):
-        """步骤日志 (青色)"""
-        timestamp = datetime.now().strftime("%H:%M:%S")
-        color = Logger.COLORS.get("STEP", "")
-        reset = Logger.RESET
-        Logger._emit(f"{color}[{timestamp}][{current}/{total}] {msg}{reset}")
-    @staticmethod
-    def detail(msg: str):
-        """详细日志 (浅青色) - 用于显示处理细节"""
-        if Logger.verbose:
-            Logger._log("DETAIL", msg)
-    @staticmethod
-    def progress(msg: str):
-        """进度日志 (紫色) - 用于显示处理进度"""
-        Logger._log("PROGRESS", msg)
-    @staticmethod
-    def model(msg: str):
-        """模型日志 (浅紫色) - 用于模型加载/卸载信息"""
-        Logger._log("MODEL", msg)
-    @staticmethod
-    def audio(msg: str):
-        """音频日志 (蓝色) - 用于音频处理信息"""
-        Logger._log("AUDIO", msg)
-    @staticmethod
-    def config(msg: str):
-        """配置日志 (浅黄色) - 用于配置信息"""
-        if Logger.verbose:
-            Logger._log("CONFIG", msg)
-    @staticmethod
-    def header(msg: str):
-        """标题日志 (带分隔线)"""
-        timestamp = datetime.now().strftime("%H:%M:%S")
-        color = Logger.COLORS.get("INFO", "")
-        reset = Logger.RESET
-        Logger._emit(f"{color}[{timestamp}] {'=' * 50}{reset}")
-        Logger._emit(f"{color}[{timestamp}] {msg}{reset}")
-        Logger._emit(f"{color}[{timestamp}] {'=' * 50}{reset}")
-    @staticmethod
-    def separator(char: str = "-", length: int = 40):
-        """分隔线"""
-        timestamp = datetime.now().strftime("%H:%M:%S")
-        color = Logger.COLORS.get("DEBUG", "")
-        reset = Logger.RESET
-        Logger._emit(f"{color}[{timestamp}] {char * length}{reset}")
-    @staticmethod
-    def set_verbose(enabled: bool):
-        """设置详细日志模式"""
-        Logger.verbose = enabled
-# 便捷实例
-log = Logger()
-# ============ 配置标准 logging 模块使用颜色 ============
-class ColoredFormatter(logging.Formatter):
-    """为标准logging模块添加颜色支持"""
-    LEVEL_COLORS = {
-        logging.DEBUG: Fore.LIGHTBLACK_EX,
-        logging.INFO: Fore.GREEN,
-        logging.WARNING: Fore.YELLOW,
-        logging.ERROR: Fore.RED,
-        logging.CRITICAL: Fore.RED + Style.BRIGHT,
-    }
-    def format(self, record):
-        # 获取颜色
-        color = self.LEVEL_COLORS.get(record.levelno, "")
-        reset = Style.RESET_ALL
-        # 格式化时间
-        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        # 构建消息
-        level_name = record.levelname
-        module_name = record.name
-        # 格式化输出
-        formatted = f"{color}{timestamp} | {level_name} | {module_name} | {record.getMessage()}{reset}"
-        return formatted
-def setup_colored_logging(level=logging.INFO):
-    """配置全局logging使用颜色输出"""
-    # 获取根logger
-    root_logger = logging.getLogger()
-    root_logger.setLevel(level)
-    # 移除现有的handlers
-    for handler in root_logger.handlers[:]:
-        root_logger.removeHandler(handler)
-    # 添加带颜色的handler
-    console_handler = logging.StreamHandler(sys.stdout)
-    console_handler.setLevel(level)
-    console_handler.setFormatter(ColoredFormatter())
-    root_logger.addHandler(console_handler)
-    return root_logger
-# 自动配置logging颜色
-setup_colored_logging(logging.INFO)
-# 抑制第三方库的英文日志
-logging.getLogger("faiss").setLevel(logging.WARNING)
-logging.getLogger("audio_separator").setLevel(logging.WARNING)

+# -*- coding: utf-8 -*-
+"""
+日志工具模块 - 支持时间戳和颜色输出
+"""
+import sys
+import logging
+from datetime import datetime
+# Prefer colorama for coloured output; fall back to empty-string placeholders without it.
+try:
+    from colorama import init, Fore, Style, Back
+    init(autoreset=True)  # initialise colorama (Windows compatibility); autoreset ensures a reset after each line
+    COLORAMA_AVAILABLE = True
+except ImportError:
+    COLORAMA_AVAILABLE = False
+    # Define empty placeholders so colour lookups still resolve
+    class Fore:
+        LIGHTBLACK_EX = GREEN = YELLOW = RED = CYAN = BLUE = MAGENTA = WHITE = LIGHTGREEN_EX = LIGHTCYAN_EX = LIGHTYELLOW_EX = LIGHTMAGENTA_EX = ""
+    class Style:
+        RESET_ALL = BRIGHT = DIM = ""
+    class Back:
+        pass
+class Logger:
+    """统一日志工具"""
+    SAFE_CHAR_MAP = {
+        "✓": "[OK] ",
+        "✗": "[X] ",
+        "→": "->",
+        "◆": "*",
+    }
+    COLORS = {
+        "DEBUG": Fore.LIGHTBLACK_EX,
+        "INFO": Fore.GREEN,
+        "SUCCESS": Fore.LIGHTGREEN_EX,
+        "WARNING": Fore.YELLOW,
+        "ERROR": Fore.RED,
+        "STEP": Fore.CYAN,
+        "DETAIL": Fore.LIGHTCYAN_EX,
+        "PROGRESS": Fore.MAGENTA,
+        "MODEL": Fore.LIGHTMAGENTA_EX,
+        "AUDIO": Fore.BLUE,
+        "CONFIG": Fore.LIGHTYELLOW_EX,
+    }
+    RESET = Style.RESET_ALL
+    BRIGHT = Style.BRIGHT
+    DIM = Style.DIM
+    # 详细日志开关
+    verbose = True
+    @staticmethod
+    def _sanitize_console_text(text: str) -> str:
+        """将不兼容当前终端编码的字符替换为安全文本。"""
+        sanitized = text
+        for src, dst in Logger.SAFE_CHAR_MAP.items():
+            sanitized = sanitized.replace(src, dst)
+        return sanitized
+    @staticmethod
+    def _emit(text: str):
+        """安全输出到终端，避免 Windows/GBK 控制台因 Unicode 崩溃。"""
+        try:
+            print(text, flush=True)
+            return
+        except UnicodeEncodeError:
+            pass
+        fallback = Logger._sanitize_console_text(text)
+        encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
+        try:
+            print(
+                fallback.encode(encoding, errors="replace").decode(encoding),
+                flush=True,
+            )
+        except Exception:
+            print(
+                fallback.encode("ascii", errors="replace").decode("ascii"),
+                flush=True,
+            )
+    @staticmethod
+    def _log(level: str, msg: str, force_print: bool = True):
+        """��部日志方法"""
+        timestamp = datetime.now().strftime("%H:%M:%S")
+        color = Logger.COLORS.get(level, "")
+        reset = Logger.RESET
+        # 根据级别决定前缀
+        if level in ("INFO", "STEP", "SUCCESS"):
+            prefix = ""
+        elif level == "DETAIL":
+            prefix = "  → "
+        elif level == "PROGRESS":
+            prefix = "  ◆ "
+        elif level == "MODEL":
+            prefix = "[模型] "
+        elif level == "AUDIO":
+            prefix = "[音频] "
+        elif level == "CONFIG":
+            prefix = "[配置] "
+        else:
+            prefix = f"[{level}] "
+        output = f"{color}[{timestamp}]{prefix}{msg}{reset}"
+        Logger._emit(output)
+    @staticmethod
+    def debug(msg: str):
+        """调试日志 (灰色) - 仅在verbose模式下显示"""
+        if Logger.verbose:
+            Logger._log("DEBUG", msg)
+    @staticmethod
+    def info(msg: str):
+        """信息日志 (绿色)"""
+        Logger._log("INFO", msg)
+    @staticmethod
+    def success(msg: str):
+        """成功日志 (亮绿色)"""
+        Logger._log("SUCCESS", f"✓ {msg}")
+    @staticmethod
+    def warning(msg: str):
+        """警告日志 (黄色)"""
+        Logger._log("WARNING", msg)
+    @staticmethod
+    def error(msg: str):
+        """错误日志 (红色)"""
+        Logger._log("ERROR", msg)
+    @staticmethod
+    def step(current: int, total: int, msg: str):
+        """步骤日志 (青色)"""
+        timestamp = datetime.now().strftime("%H:%M:%S")
+        color = Logger.COLORS.get("STEP", "")
+        reset = Logger.RESET
+        Logger._emit(f"{color}[{timestamp}][{current}/{total}] {msg}{reset}")
+    @staticmethod
+    def detail(msg: str):
+        """详细日志 (浅青色) - 用于显示处理细节"""
+        if Logger.verbose:
+            Logger._log("DETAIL", msg)
+    @staticmethod
+    def progress(msg: str):
+        """进度日志 (紫色) - 用于显示处理进度"""
+        Logger._log("PROGRESS", msg)
+    @staticmethod
+    def model(msg: str):
+        """模型日志 (浅紫色) - 用于模型加载/卸载信息"""
+        Logger._log("MODEL", msg)
+    @staticmethod
+    def audio(msg: str):
+        """音频日志 (蓝色) - 用于音频处理信息"""
+        Logger._log("AUDIO", msg)
+    @staticmethod
+    def config(msg: str):
+        """配置日志 (浅黄色) - 用于配置信息"""
+        if Logger.verbose:
+            Logger._log("CONFIG", msg)
+    @staticmethod
+    def header(msg: str):
+        """标题日志 (带分隔线)"""
+        timestamp = datetime.now().strftime("%H:%M:%S")
+        color = Logger.COLORS.get("INFO", "")
+        reset = Logger.RESET
+        Logger._emit(f"{color}[{timestamp}] {'=' * 50}{reset}")
+        Logger._emit(f"{color}[{timestamp}] {msg}{reset}")
+        Logger._emit(f"{color}[{timestamp}] {'=' * 50}{reset}")
+    @staticmethod
+    def separator(char: str = "-", length: int = 40):
+        """分隔线"""
+        timestamp = datetime.now().strftime("%H:%M:%S")
+        color = Logger.COLORS.get("DEBUG", "")
+        reset = Logger.RESET
+        Logger._emit(f"{color}[{timestamp}] {char * length}{reset}")
+    @staticmethod
+    def set_verbose(enabled: bool):
+        """设置详细日志模式"""
+        Logger.verbose = enabled
+# 便捷实例
+log = Logger()
+# ============ 配置标准 logging 模块使用颜色 ============
+class ColoredFormatter(logging.Formatter):
+    """为标准logging模块添加颜色支持"""
+    LEVEL_COLORS = {
+        logging.DEBUG: Fore.LIGHTBLACK_EX,
+        logging.INFO: Fore.GREEN,
+        logging.WARNING: Fore.YELLOW,
+        logging.ERROR: Fore.RED,
+        logging.CRITICAL: Fore.RED + Style.BRIGHT,
+    }
+    def format(self, record):
+        # 获取颜色
+        color = self.LEVEL_COLORS.get(record.levelno, "")
+        reset = Style.RESET_ALL
+        # 格式化时间
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        # 构建消息
+        level_name = record.levelname
+        module_name = record.name
+        # 格式化输出
+        formatted = f"{color}{timestamp} | {level_name} | {module_name} | {record.getMessage()}{reset}"
+        return formatted
+def setup_colored_logging(level=logging.INFO):
+    """配置全局logging使用颜色输出"""
+    # 获取根logger
+    root_logger = logging.getLogger()
+    root_logger.setLevel(level)
+    # 移除现有的handlers
+    for handler in root_logger.handlers[:]:
+        root_logger.removeHandler(handler)
+    # 添加带颜色的handler
+    console_handler = logging.StreamHandler(sys.stdout)
+    console_handler.setLevel(level)
+    console_handler.setFormatter(ColoredFormatter())
+    root_logger.addHandler(console_handler)
+    return root_logger
+# 自动配置logging颜色
+setup_colored_logging(logging.INFO)
+# 抑制第三方库的英文日志
+logging.getLogger("faiss").setLevel(logging.WARNING)
+logging.getLogger("audio_separator").setLevel(logging.WARNING)

lib/mixer.py CHANGED Viewed

@@ -2,13 +2,13 @@
 """
 混音模块 - 人声与伴奏混合
 """
-import numpy as np
-import librosa
-import soundfile as sf
-from pathlib import Path
-from typing import Optional
-from lib.audio import soft_clip_array
 try:
     from lib.logger import log
@@ -152,12 +152,12 @@ def mix_vocals_and_accompaniment(
     elif reverb_amount > 0 and log:
         log.warning("Pedalboard 不可用，跳过混响")
-    vocals = soft_clip_array(vocals * vocals_volume, threshold=0.85, ceiling=0.95)
-    accompaniment = soft_clip_array(
-        accompaniment * accompaniment_volume,
-        threshold=0.85,
-        ceiling=0.95,
-    )
     vocals_len = vocals.shape[-1]
     accompaniment_len = accompaniment.shape[-1]
@@ -174,18 +174,18 @@ def mix_vocals_and_accompaniment(
     vocals = adjust_audio_length(vocals, target_len)
     accompaniment = adjust_audio_length(accompaniment, target_len)
-    if log:
-        log.progress("混合音轨...")
-    mixed = vocals + accompaniment
-    max_val = float(np.max(np.abs(mixed)))
     if log:
         log.detail(f"混合后峰值: {max_val:.4f}")
-    mixed = soft_clip_array(mixed, threshold=0.90, ceiling=0.98)
-    if log:
-        final_peak = float(np.max(np.abs(mixed)))
-        log.detail(f"软削波后峰值: {final_peak:.4f}")
     if mixed.ndim == 2:
         mixed = mixed.T

 """
 混音模块 - 人声与伴奏混合
 """
+import numpy as np
+import librosa
+import soundfile as sf
+from pathlib import Path
+from typing import Optional
+from lib.audio import soft_clip_array
 try:
     from lib.logger import log
     elif reverb_amount > 0 and log:
         log.warning("Pedalboard 不可用，跳过混响")
+    vocals = soft_clip_array(vocals * vocals_volume, threshold=0.85, ceiling=0.95)
+    accompaniment = soft_clip_array(
+        accompaniment * accompaniment_volume,
+        threshold=0.85,
+        ceiling=0.95,
+    )
     vocals_len = vocals.shape[-1]
     accompaniment_len = accompaniment.shape[-1]
     vocals = adjust_audio_length(vocals, target_len)
     accompaniment = adjust_audio_length(accompaniment, target_len)
+    if log:
+        log.progress("混合音轨...")
+    mixed = vocals + accompaniment
+    max_val = float(np.max(np.abs(mixed)))
     if log:
         log.detail(f"混合后峰值: {max_val:.4f}")
+    mixed = soft_clip_array(mixed, threshold=0.90, ceiling=0.98)
+    if log:
+        final_peak = float(np.max(np.abs(mixed)))
+        log.detail(f"软削波后峰值: {final_peak:.4f}")
     if mixed.ndim == 2:
         mixed = mixed.T

lib/runtime_build.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from __future__ import annotations
+from datetime import datetime
+from pathlib import Path
+from typing import Dict
+ROOT_DIR = Path(__file__).resolve().parent.parent
+WATCHED_FILES = (
+    "run.py",
+    "app.py",
+    "lib/ffmpeg_runtime.py",
+    "ui/app.py",
+    "infer/cover_pipeline.py",
+    "infer/official_adapter.py",
+    "infer/quality_policy.py",
+)
+def _existing_watched_files() -> list[Path]:
+    files: list[Path] = []
+    for relative_path in WATCHED_FILES:
+        path = ROOT_DIR / relative_path
+        if path.exists():
+            files.append(path)
+    return files
+def _compute_runtime_build_info() -> Dict[str, str]:
+    files = _existing_watched_files()
+    if not files:
+        return {
+            "timestamp": "unknown",
+            "source": "unknown",
+        }
+    latest = max(files, key=lambda path: path.stat().st_mtime)
+    timestamp = datetime.fromtimestamp(latest.stat().st_mtime).strftime("%Y-%m-%d %H:%M:%S")
+    try:
+        source = latest.relative_to(ROOT_DIR).as_posix()
+    except ValueError:
+        source = str(latest)
+    return {
+        "timestamp": timestamp,
+        "source": source,
+    }
+RUNTIME_BUILD_INFO = _compute_runtime_build_info()
+def get_runtime_build_label() -> str:
+    """Return the build marker as a full sentence.
+    Formats the ``timestamp`` and ``source`` entries of the module-level
+    ``RUNTIME_BUILD_INFO`` behind a fixed leading label.
+    """
+    info = RUNTIME_BUILD_INFO
+    return f"当前运行代码标记: {info['timestamp']} ({info['source']})"
+def get_runtime_build_short_label() -> str:
+    """Return the build marker as ``<timestamp> (<source>)`` without a leading label.
+    Values come from the module-level ``RUNTIME_BUILD_INFO``.
+    """
+    info = RUNTIME_BUILD_INFO
+    return f"{info['timestamp']} ({info['source']})"

mcp/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+# -*- coding: utf-8 -*-
+"""
+MCP 模块
+"""

mcp/server.py ADDED Viewed

	@@ -0,0 +1,139 @@

+# -*- coding: utf-8 -*-
+"""
+MCP 服务器 - 为 Claude Code 提供 AI 翻唱工具
+"""
+import asyncio
+import json
+from pathlib import Path
+from typing import Any
+from mcp.server import Server
+from mcp.server.stdio import stdio_server
+from mcp.types import Tool, TextContent
+from mcp.tools import (
+    list_models,
+    convert_voice,
+    download_model,
+    get_model_status
+)
+# 创建 MCP 服务器
+server = Server("rvc-voice-conversion")
+@server.list_tools()
+async def list_tools() -> list[Tool]:
+    """列出可用工具"""
+    return [
+        Tool(
+            name="list_voice_models",
+            description="列出所有可用的 RVC 语音模型",
+            inputSchema={
+                "type": "object",
+                "properties": {},
+                "required": []
+            }
+        ),
+        Tool(
+            name="convert_voice",
+            description="使用 RVC 模型进行 AI 翻唱",
+            inputSchema={
+                "type": "object",
+                "properties": {
+                    "input_path": {
+                        "type": "string",
+                        "description": "输入音频文件的绝对路径"
+                    },
+                    "output_path": {
+                        "type": "string",
+                        "description": "输出音频文件的绝对路径"
+                    },
+                    "model_name": {
+                        "type": "string",
+                        "description": "要使用的语音模型名称"
+                    },
+                    "pitch_shift": {
+                        "type": "number",
+                        "description": "音调偏移 (半音)，正数升调，负数降调",
+                        "default": 0
+                    },
+                    "index_ratio": {
+                        "type": "number",
+                        "description": "索引混合比率 (0-1)",
+                        "default": 0.5
+                    }
+                },
+                "required": ["input_path", "output_path", "model_name"]
+            }
+        ),
+        Tool(
+            name="download_base_models",
+            description="下载 RVC 所需的基础模型 (HuBERT, RMVPE)",
+            inputSchema={
+                "type": "object",
+                "properties": {},
+                "required": []
+            }
+        ),
+        Tool(
+            name="get_model_status",
+            description="获取基础模型的下载状态",
+            inputSchema={
+                "type": "object",
+                "properties": {},
+                "required": []
+            }
+        )
+    ]
+@server.call_tool()
+async def call_tool(name: str, arguments: dict[str, Any]) -> list[TextContent]:
+    """执行工具调用"""
+    if name == "list_voice_models":
+        models = list_models()
+        result = {
+            "models": models,
+            "count": len(models)
+        }
+        return [TextContent(type="text", text=json.dumps(result, ensure_ascii=False, indent=2))]
+    elif name == "convert_voice":
+        result = convert_voice(
+            input_path=arguments["input_path"],
+            output_path=arguments["output_path"],
+            model_name=arguments["model_name"],
+            pitch_shift=arguments.get("pitch_shift", 0),
+            index_ratio=arguments.get("index_ratio", 0.5),
+            filter_radius=arguments.get("filter_radius", 3),
+            rms_mix_rate=arguments.get("rms_mix_rate", 0.25),
+            protect=arguments.get("protect", 0.33)
+        )
+        return [TextContent(type="text", text=json.dumps(result, ensure_ascii=False, indent=2))]
+    elif name == "download_base_models":
+        result = download_model()
+        return [TextContent(type="text", text=json.dumps(result, ensure_ascii=False, indent=2))]
+    elif name == "get_model_status":
+        status = get_model_status()
+        return [TextContent(type="text", text=json.dumps(status, ensure_ascii=False, indent=2))]
+    else:
+        return [TextContent(type="text", text=f"未知工具: {name}")]
+async def main():
+    """启动 MCP 服务器"""
+    async with stdio_server() as (read_stream, write_stream):
+        await server.run(
+            read_stream,
+            write_stream,
+            server.create_initialization_options()
+        )
+# Script entry point: run the async server main() to completion.
+if __name__ == "__main__":
+    asyncio.run(main())

mcp/tools.py ADDED Viewed

	@@ -0,0 +1,233 @@

+# -*- coding: utf-8 -*-
+"""
+MCP 工具函数 - 提供 AI 翻唱功能的工具接口
+"""
+import os
+import json
+from pathlib import Path
+from typing import Optional, List, Dict, Any
+# 项目根目录
+ROOT_DIR = Path(__file__).parent.parent
+def get_config() -> dict:
+    """获取配置"""
+    config_path = ROOT_DIR / "configs" / "config.json"
+    if config_path.exists():
+        with open(config_path, "r", encoding="utf-8") as f:
+            config = json.load(f)
+            return _normalize_config(config)
+    return {}
+def _normalize_config(config: dict) -> dict:
+    """Normalize legacy path keys to top-level entries."""
+    if not config:
+        return {}
+    paths = config.get("paths", {})
+    if "hubert_path" not in config and "hubert" in paths:
+        config["hubert_path"] = paths["hubert"]
+    if "rmvpe_path" not in config and "rmvpe" in paths:
+        config["rmvpe_path"] = paths["rmvpe"]
+    if "weights_dir" not in config and "weights" in paths:
+        config["weights_dir"] = paths["weights"]
+    if "output_dir" not in config and "outputs" in paths:
+        config["output_dir"] = paths["outputs"]
+    if "temp_dir" not in config and "temp" in paths:
+        config["temp_dir"] = paths["temp"]
+    return config
+def list_models() -> List[Dict[str, Any]]:
+    """
+    列出所有可用的语音模型
+    Returns:
+        List[Dict]: 模型列表，每个模型包含 name, model_path, index_path
+    """
+    from infer.pipeline import list_voice_models
+    config = get_config()
+    weights_dir = ROOT_DIR / config.get("weights_dir", "assets/weights")
+    return list_voice_models(str(weights_dir))
+def convert_voice(
+    input_path: str,
+    output_path: str,
+    model_name: str,
+    pitch_shift: float = 0,
+    index_ratio: float = 0.5,
+    filter_radius: int = 3,
+    rms_mix_rate: float = 0.25,
+    protect: float = 0.33
+) -> Dict[str, Any]:
+    """
+    执行 AI 翻唱
+    Args:
+        input_path: 输入音频文件路径
+        output_path: 输出音频文件路径
+        model_name: 模型名称
+        pitch_shift: 音调偏移 (半音)
+        index_ratio: 索引混合比率
+        filter_radius: 中值滤波半径
+        rms_mix_rate: 响度混合比率
+        protect: 保护系数
+    Returns:
+        Dict: 转换结果，包含 success, output_path, error
+    """
+    try:
+        from infer.pipeline import VoiceConversionPipeline, list_voice_models
+        config = get_config()
+        use_official_vc = config.get("use_official_vc", True)
+        # 查找模型
+        weights_dir = ROOT_DIR / config.get("weights_dir", "assets/weights")
+        models = list_voice_models(str(weights_dir))
+        model_info = None
+        for m in models:
+            if m["name"] == model_name:
+                model_info = m
+                break
+        if model_info is None:
+            return {
+                "success": False,
+                "output_path": None,
+                "error": f"模型不存在: {model_name}"
+            }
+        if use_official_vc:
+            from infer.official_adapter import convert_vocals_official
+            result_path = convert_vocals_official(
+                vocals_path=input_path,
+                output_path=output_path,
+                model_path=model_info["model_path"],
+                index_path=model_info.get("index_path"),
+                f0_method=config.get("f0_method", "rmvpe"),
+                pitch_shift=int(pitch_shift),
+                index_rate=index_ratio,
+                filter_radius=filter_radius,
+                rms_mix_rate=rms_mix_rate,
+                protect=protect,
+            )
+            return {
+                "success": True,
+                "output_path": result_path,
+                "error": None,
+            }
+        # 初始化管道
+        device = config.get("device", "cuda")
+        pipeline = VoiceConversionPipeline(device=device)
+        pipeline.hubert_layer = config.get("hubert_layer", 12)
+        # 加载 HuBERT
+        hubert_path = ROOT_DIR / config.get("hubert_path", "assets/hubert/hubert_base.pt")
+        if not hubert_path.exists():
+            return {
+                "success": False,
+                "output_path": None,
+                "error": "HuBERT 模型不存在，请先下载基础模型"
+            }
+        pipeline.load_hubert(str(hubert_path))
+        # 加载 F0 提取器
+        rmvpe_path = ROOT_DIR / config.get("rmvpe_path", "assets/rmvpe/rmvpe.pt")
+        if not rmvpe_path.exists():
+            return {
+                "success": False,
+                "output_path": None,
+                "error": "RMVPE 模型不存在，请先下载基础模型"
+            }
+        pipeline.load_f0_extractor("rmvpe", str(rmvpe_path))
+        # 加载语音模型
+        pipeline.load_voice_model(model_info["model_path"])
+        # 加载索引
+        if model_info.get("index_path"):
+            pipeline.load_index(model_info["index_path"])
+        # 执行转换
+        result_path = pipeline.convert(
+            audio_path=input_path,
+            output_path=output_path,
+            pitch_shift=pitch_shift,
+            index_ratio=index_ratio,
+            filter_radius=filter_radius,
+            rms_mix_rate=rms_mix_rate,
+            protect=protect
+        )
+        return {
+            "success": True,
+            "output_path": result_path,
+            "error": None
+        }
+    except Exception as e:
+        return {
+            "success": False,
+            "output_path": None,
+            "error": str(e)
+        }
+def download_model(model_name: str = None) -> Dict[str, Any]:
+    """
+    下载模型
+    Args:
+        model_name: 模型名称，为 None 时下载所有必需模型
+    Returns:
+        Dict: 下载结果
+    """
+    try:
+        from tools.download_models import (
+            download_model as dl_model,
+            download_required_models,
+            MODELS
+        )
+        if model_name:
+            if model_name not in MODELS:
+                return {
+                    "success": False,
+                    "error": f"未知模型: {model_name}"
+                }
+            success = dl_model(model_name)
+        else:
+            success = download_required_models()
+        return {
+            "success": success,
+            "error": None if success else "下载失败"
+        }
+    except Exception as e:
+        return {
+            "success": False,
+            "error": str(e)
+        }
+def get_model_status() -> Dict[str, bool]:
+    """
+    获取模型下载状态
+    Returns:
+        Dict: 模型名称 -> 是否已下载
+    """
+    from tools.download_models import check_all_models
+    return check_all_models()

models/__init__.py CHANGED Viewed

@@ -1,7 +1,7 @@
-# -*- coding: utf-8 -*-
-"""
-模型定义模块
-"""
-from .rmvpe import RMVPE
-__all__ = ["RMVPE"]

+# -*- coding: utf-8 -*-
+"""
+模型定义模块
+"""
+from .rmvpe import RMVPE
+__all__ = ["RMVPE"]

models/rmvpe.py CHANGED Viewed

@@ -1,439 +1,439 @@
-# -*- coding: utf-8 -*-
-"""
-RMVPE 模型 - 用于高质量 F0 提取
-"""
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import numpy as np
-from typing import Optional
-class BiGRU(nn.Module):
-    """双向 GRU 层"""
-    def __init__(self, input_features: int, hidden_features: int, num_layers: int):
-        super().__init__()
-        self.gru = nn.GRU(
-            input_features,
-            hidden_features,
-            num_layers=num_layers,
-            batch_first=True,
-            bidirectional=True
-        )
-    def forward(self, x):
-        return self.gru(x)[0]
-class ConvBlockRes(nn.Module):
-    """残差卷积块"""
-    def __init__(self, in_channels: int, out_channels: int, momentum: float = 0.01,
-                 force_shortcut: bool = False):
-        super().__init__()
-        self.conv = nn.Sequential(
-            nn.Conv2d(in_channels, out_channels, 3, 1, 1, bias=False),
-            nn.BatchNorm2d(out_channels, momentum=momentum),
-            nn.ReLU(),
-            nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False),
-            nn.BatchNorm2d(out_channels, momentum=momentum),
-            nn.ReLU()
-        )
-        # 当通道数不同或强制使用时才创建 shortcut
-        if in_channels != out_channels or force_shortcut:
-            self.shortcut = nn.Conv2d(in_channels, out_channels, 1)
-            self.has_shortcut = True
-        else:
-            self.has_shortcut = False
-    def forward(self, x):
-        if self.has_shortcut:
-            return self.conv(x) + self.shortcut(x)
-        else:
-            return self.conv(x) + x
-class EncoderBlock(nn.Module):
-    """编码器块 - 包含多个 ConvBlockRes 和一个池化层"""
-    def __init__(self, in_channels: int, out_channels: int, kernel_size: int,
-                 n_blocks: int, momentum: float = 0.01):
-        super().__init__()
-        self.conv = nn.ModuleList()
-        self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
-        for _ in range(n_blocks - 1):
-            self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
-        self.pool = nn.AvgPool2d(kernel_size)
-    def forward(self, x):
-        for block in self.conv:
-            x = block(x)
-        # 返回池化前的张量用于 skip connection
-        return self.pool(x), x
-class Encoder(nn.Module):
-    """RMVPE 编码器"""
-    def __init__(self, in_channels: int, in_size: int, n_encoders: int,
-                 kernel_size: int, n_blocks: int, out_channels: int = 16,
-                 momentum: float = 0.01):
-        super().__init__()
-        self.n_encoders = n_encoders
-        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
-        self.layers = nn.ModuleList()
-        self.latent_channels = []
-        for i in range(n_encoders):
-            self.layers.append(
-                EncoderBlock(
-                    in_channels if i == 0 else out_channels * (2 ** (i - 1)),
-                    out_channels * (2 ** i),
-                    kernel_size,
-                    n_blocks,
-                    momentum
-                )
-            )
-            self.latent_channels.append(out_channels * (2 ** i))
-    def forward(self, x):
-        x = self.bn(x)
-        concat_tensors = []
-        for layer in self.layers:
-            x, skip = layer(x)
-            concat_tensors.append(skip)
-        return x, concat_tensors
-class Intermediate(nn.Module):
-    """中间层"""
-    def __init__(self, in_channels: int, out_channels: int, n_inters: int,
-                 n_blocks: int, momentum: float = 0.01):
-        super().__init__()
-        self.layers = nn.ModuleList()
-        for i in range(n_inters):
-            if i == 0:
-                # 第一层: in_channels -> out_channels (256 -> 512)
-                self.layers.append(
-                    IntermediateBlock(in_channels, out_channels, n_blocks, momentum, first_block_shortcut=True)
-                )
-            else:
-                # 后续层: out_channels -> out_channels (512 -> 512)
-                self.layers.append(
-                    IntermediateBlock(out_channels, out_channels, n_blocks, momentum, first_block_shortcut=False)
-                )
-    def forward(self, x):
-        for layer in self.layers:
-            x = layer(x)
-        return x
-class IntermediateBlock(nn.Module):
-    """中间层块"""
-    def __init__(self, in_channels: int, out_channels: int, n_blocks: int,
-                 momentum: float = 0.01, first_block_shortcut: bool = False):
-        super().__init__()
-        self.conv = nn.ModuleList()
-        # 第一个块可能需要强制使用 shortcut
-        self.conv.append(ConvBlockRes(in_channels, out_channels, momentum, force_shortcut=first_block_shortcut))
-        for _ in range(n_blocks - 1):
-            self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
-    def forward(self, x):
-        for block in self.conv:
-            x = block(x)
-        return x
-class DecoderBlock(nn.Module):
-    """解码器块"""
-    def __init__(self, in_channels: int, out_channels: int, stride: int,
-                 n_blocks: int, momentum: float = 0.01):
-        super().__init__()
-        # conv1: 转置卷积 + BatchNorm (kernel_size=3, stride=stride, padding=1, output_padding=1)
-        self.conv1 = nn.Sequential(
-            nn.ConvTranspose2d(in_channels, out_channels, 3, stride, padding=1, output_padding=1, bias=False),
-            nn.BatchNorm2d(out_channels, momentum=momentum)
-        )
-        # conv2: ConvBlockRes 列表
-        # 第一个块: in_channels = out_channels * 2 (concat 后), out_channels = out_channels
-        # 后续块: in_channels = out_channels, out_channels = out_channels
-        self.conv2 = nn.ModuleList()
-        self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
-        for _ in range(n_blocks - 1):
-            self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
-    def forward(self, x, concat_tensor):
-        x = self.conv1(x)
-        # 处理尺寸不匹配：填充较小的张量使其匹配较大的
-        diff_h = concat_tensor.size(2) - x.size(2)
-        diff_w = concat_tensor.size(3) - x.size(3)
-        if diff_h != 0 or diff_w != 0:
-            # 填充 x 使其与 concat_tensor 尺寸匹配
-            x = F.pad(x, [0, diff_w, 0, diff_h])
-        x = torch.cat([x, concat_tensor], dim=1)
-        for block in self.conv2:
-            x = block(x)
-        return x
-class Decoder(nn.Module):
-    """RMVPE 解码器"""
-    def __init__(self, in_channels: int, n_decoders: int, stride: int,
-                 n_blocks: int, out_channels: int = 16, momentum: float = 0.01):
-        super().__init__()
-        self.layers = nn.ModuleList()
-        for i in range(n_decoders):
-            out_ch = out_channels * (2 ** (n_decoders - 1 - i))
-            in_ch = in_channels if i == 0 else out_channels * (2 ** (n_decoders - i))
-            self.layers.append(
-                DecoderBlock(in_ch, out_ch, stride, n_blocks, momentum)
-            )
-    def forward(self, x, concat_tensors):
-        for i, layer in enumerate(self.layers):
-            x = layer(x, concat_tensors[-1 - i])
-        return x
-class DeepUnet(nn.Module):
-    """Deep U-Net 架构"""
-    def __init__(self, kernel_size: int, n_blocks: int, en_de_layers: int = 5,
-                 inter_layers: int = 4, in_channels: int = 1, en_out_channels: int = 16):
-        super().__init__()
-        # Encoder 输出通道: en_out_channels * 2^(en_de_layers-1) = 16 * 16 = 256
-        encoder_out_channels = en_out_channels * (2 ** (en_de_layers - 1))
-        # Intermediate 输出通道: encoder_out_channels * 2 = 512
-        intermediate_out_channels = encoder_out_channels * 2
-        self.encoder = Encoder(
-            in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
-        )
-        self.intermediate = Intermediate(
-            encoder_out_channels,
-            intermediate_out_channels,
-            inter_layers, n_blocks
-        )
-        self.decoder = Decoder(
-            intermediate_out_channels,
-            en_de_layers, kernel_size, n_blocks, en_out_channels
-        )
-    def forward(self, x):
-        x, concat_tensors = self.encoder(x)
-        x = self.intermediate(x)
-        x = self.decoder(x, concat_tensors)
-        return x
-class E2E(nn.Module):
-    """端到端 RMVPE 模型"""
-    def __init__(self, n_blocks: int, n_gru: int, kernel_size: int,
-                 en_de_layers: int = 5, inter_layers: int = 4,
-                 in_channels: int = 1, en_out_channels: int = 16):
-        super().__init__()
-        self.unet = DeepUnet(
-            kernel_size, n_blocks, en_de_layers, inter_layers,
-            in_channels, en_out_channels
-        )
-        self.cnn = nn.Conv2d(en_out_channels, 3, 3, 1, 1)
-        if n_gru:
-            self.fc = nn.Sequential(
-                BiGRU(3 * 128, 256, n_gru),
-                nn.Linear(512, 360),
-                nn.Dropout(0.25),
-                nn.Sigmoid()
-            )
-        else:
-            self.fc = nn.Sequential(
-                nn.Linear(3 * 128, 360),
-                nn.Dropout(0.25),
-                nn.Sigmoid()
-            )
-    def forward(self, mel):
-        # 输入 mel: [B, 128, T] 或 [B, 1, 128, T]
-        # 官方实现期望 [B, 1, T, 128]，即 time 在 height，mel bins 在 width
-        if mel.dim() == 3:
-            # [B, 128, T] -> [B, T, 128] -> [B, 1, T, 128]
-            mel = mel.transpose(-1, -2).unsqueeze(1)
-        elif mel.dim() == 4 and mel.shape[1] == 1:
-            # [B, 1, 128, T] -> [B, 1, T, 128]
-            mel = mel.transpose(-1, -2)
-        x = self.unet(mel)
-        x = self.cnn(x)
-        # x shape: (batch, 3, T, 128)
-        # 转换为 (batch, T, 384) 其中 384 = 3 * 128
-        x = x.transpose(1, 2).flatten(-2)  # (batch, T, 384)
-        x = self.fc(x)
-        return x
-class MelSpectrogram(nn.Module):
-    """Mel 频谱提取"""
-    def __init__(self, n_mel: int = 128, n_fft: int = 1024, win_size: int = 1024,
-                 hop_length: int = 160, sample_rate: int = 16000,
-                 fmin: int = 30, fmax: int = 8000):
-        super().__init__()
-        self.n_fft = n_fft
-        self.hop_length = hop_length
-        self.win_size = win_size
-        self.sample_rate = sample_rate
-        self.n_mel = n_mel
-        # 创建 Mel 滤波器组
-        mel_basis = self._mel_filterbank(sample_rate, n_fft, n_mel, fmin, fmax)
-        self.register_buffer("mel_basis", mel_basis)
-        self.register_buffer("window", torch.hann_window(win_size))
-    def _mel_filterbank(self, sr, n_fft, n_mels, fmin, fmax):
-        """创建 Mel 滤波器组"""
-        import librosa
-        # 必须使用 htk=True，与官方 RVC RMVPE 保持一致
-        mel = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=True)
-        return torch.from_numpy(mel).float()
-    def forward(self, audio):
-        # STFT
-        spec = torch.stft(
-            audio,
-            self.n_fft,
-            hop_length=self.hop_length,
-            win_length=self.win_size,
-            window=self.window,
-            center=True,
-            pad_mode="reflect",
-            normalized=False,
-            onesided=True,
-            return_complex=True
-        )
-        # 使用功率谱（幅度的平方），与官方 RMVPE 一致
-        spec = torch.abs(spec) ** 2
-        # Mel 变换
-        mel = torch.matmul(self.mel_basis, spec)
-        mel = torch.log(torch.clamp(mel, min=1e-5))
-        return mel
-class RMVPE:
-    """RMVPE F0 提取器封装类"""
-    def __init__(self, model_path: str, device: str = "cuda"):
-        self.device = device
-        # 加载模型
-        self.model = E2E(n_blocks=4, n_gru=1, kernel_size=2)
-        ckpt = torch.load(model_path, map_location="cpu", weights_only=False)
-        self.model.load_state_dict(ckpt)
-        self.model = self.model.to(device).eval()
-        # Mel 频谱提取器
-        self.mel_extractor = MelSpectrogram().to(device)
-        # 频率映射
-        cents_mapping = 20 * np.arange(360) + 1997.3794084376191
-        self.cents_mapping = np.pad(cents_mapping, (4, 4))
-    @torch.no_grad()
-    def infer_from_audio(self, audio: np.ndarray, thred: float = 0.03) -> np.ndarray:
-        """
-        从音频提取 F0
-        Args:
-            audio: 16kHz 音频数据
-            thred: 置信度阈值
-        Returns:
-            np.ndarray: F0 序列
-        """
-        # 转换为张量
-        audio = torch.from_numpy(audio).float().to(self.device)
-        if audio.dim() == 1:
-            audio = audio.unsqueeze(0)
-        # 提取 Mel 频谱: [B, 128, T]
-        mel = self.mel_extractor(audio)
-        # 记录原始帧数
-        n_frames = mel.shape[-1]
-        # 填充时间维度使其可被 32 整除（5 层池化，每层 /2）
-        n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
-        if n_pad > 0:
-            mel = F.pad(mel, (0, n_pad), mode='constant', value=0)
-        # 模型推理 - E2E.forward 会处理 transpose
-        hidden = self.model(mel)
-        # 移除填充部分，只保留原始帧数
-        hidden = hidden[:, :n_frames, :]
-        hidden = hidden.squeeze(0).cpu().numpy()
-        # 解码 F0
-        f0 = self._decode(hidden, thred)
-        return f0
-    def _decode(self, hidden: np.ndarray, thred: float) -> np.ndarray:
-        """解码隐藏状态为 F0 - 使用官方 RVC 算法"""
-        # 使用官方的 to_local_average_cents 算法
-        cents = self._to_local_average_cents(hidden, thred)
-        # 转换 cents 到 Hz
-        f0 = 10 * (2 ** (cents / 1200))
-        f0[f0 == 10] = 0  # cents=0 时 f0=10，需要置零
-        return f0
-    def _to_local_average_cents(self, salience: np.ndarray, thred: float) -> np.ndarray:
-        """官方 RVC 的 to_local_average_cents 算法"""
-        # Step 1: 找到每帧的峰值 bin
-        center = np.argmax(salience, axis=1)  # [T]
-        # Step 2: 对 salience 进行 padding
-        salience = np.pad(salience, ((0, 0), (4, 4)))  # [T, 368]
-        center += 4  # 调整 center 索引
-        # Step 3: 提取峰值附近 9 个 bin 的窗口并计算加权平均
-        todo_salience = []
-        todo_cents_mapping = []
-        starts = center - 4
-        ends = center + 5
-        for idx in range(salience.shape[0]):
-            todo_salience.append(salience[idx, starts[idx]:ends[idx]])
-            todo_cents_mapping.append(self.cents_mapping[starts[idx]:ends[idx]])
-        todo_salience = np.array(todo_salience)  # [T, 9]
-        todo_cents_mapping = np.array(todo_cents_mapping)  # [T, 9]
-        # Step 4: 加权平均
-        product_sum = np.sum(todo_salience * todo_cents_mapping, axis=1)
-        weight_sum = np.sum(todo_salience, axis=1) + 1e-9
-        cents = product_sum / weight_sum
-        # Step 5: 阈值过滤 - 使用原始 salience 的最大值
-        maxx = np.max(salience, axis=1)
-        cents[maxx <= thred] = 0
-        return cents

+# -*- coding: utf-8 -*-
+"""
+RMVPE 模型 - 用于高质量 F0 提取
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from typing import Optional
+class BiGRU(nn.Module):
+    """双向 GRU 层"""
+    def __init__(self, input_features: int, hidden_features: int, num_layers: int):
+        super().__init__()
+        self.gru = nn.GRU(
+            input_features,
+            hidden_features,
+            num_layers=num_layers,
+            batch_first=True,
+            bidirectional=True
+        )
+    def forward(self, x):
+        return self.gru(x)[0]
+class ConvBlockRes(nn.Module):
+    """残差卷积块"""
+    def __init__(self, in_channels: int, out_channels: int, momentum: float = 0.01,
+                 force_shortcut: bool = False):
+        super().__init__()
+        self.conv = nn.Sequential(
+            nn.Conv2d(in_channels, out_channels, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(out_channels, momentum=momentum),
+            nn.ReLU(),
+            nn.Conv2d(out_channels, out_channels, 3, 1, 1, bias=False),
+            nn.BatchNorm2d(out_channels, momentum=momentum),
+            nn.ReLU()
+        )
+        # 当通道数不同或强制使用时才创建 shortcut
+        if in_channels != out_channels or force_shortcut:
+            self.shortcut = nn.Conv2d(in_channels, out_channels, 1)
+            self.has_shortcut = True
+        else:
+            self.has_shortcut = False
+    def forward(self, x):
+        if self.has_shortcut:
+            return self.conv(x) + self.shortcut(x)
+        else:
+            return self.conv(x) + x
+class EncoderBlock(nn.Module):
+    """编码器块 - 包含多个 ConvBlockRes 和一个池化层"""
+    def __init__(self, in_channels: int, out_channels: int, kernel_size: int,
+                 n_blocks: int, momentum: float = 0.01):
+        super().__init__()
+        self.conv = nn.ModuleList()
+        self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
+        for _ in range(n_blocks - 1):
+            self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
+        self.pool = nn.AvgPool2d(kernel_size)
+    def forward(self, x):
+        for block in self.conv:
+            x = block(x)
+        # 返回池化前的张量用于 skip connection
+        return self.pool(x), x
+class Encoder(nn.Module):
+    """RMVPE 编码器"""
+    def __init__(self, in_channels: int, in_size: int, n_encoders: int,
+                 kernel_size: int, n_blocks: int, out_channels: int = 16,
+                 momentum: float = 0.01):
+        super().__init__()
+        self.n_encoders = n_encoders
+        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
+        self.layers = nn.ModuleList()
+        self.latent_channels = []
+        for i in range(n_encoders):
+            self.layers.append(
+                EncoderBlock(
+                    in_channels if i == 0 else out_channels * (2 ** (i - 1)),
+                    out_channels * (2 ** i),
+                    kernel_size,
+                    n_blocks,
+                    momentum
+                )
+            )
+            self.latent_channels.append(out_channels * (2 ** i))
+    def forward(self, x):
+        x = self.bn(x)
+        concat_tensors = []
+        for layer in self.layers:
+            x, skip = layer(x)
+            concat_tensors.append(skip)
+        return x, concat_tensors
+class Intermediate(nn.Module):
+    """中间层"""
+    def __init__(self, in_channels: int, out_channels: int, n_inters: int,
+                 n_blocks: int, momentum: float = 0.01):
+        super().__init__()
+        self.layers = nn.ModuleList()
+        for i in range(n_inters):
+            if i == 0:
+                # 第一层: in_channels -> out_channels (256 -> 512)
+                self.layers.append(
+                    IntermediateBlock(in_channels, out_channels, n_blocks, momentum, first_block_shortcut=True)
+                )
+            else:
+                # 后续层: out_channels -> out_channels (512 -> 512)
+                self.layers.append(
+                    IntermediateBlock(out_channels, out_channels, n_blocks, momentum, first_block_shortcut=False)
+                )
+    def forward(self, x):
+        for layer in self.layers:
+            x = layer(x)
+        return x
+class IntermediateBlock(nn.Module):
+    """中间层块"""
+    def __init__(self, in_channels: int, out_channels: int, n_blocks: int,
+                 momentum: float = 0.01, first_block_shortcut: bool = False):
+        super().__init__()
+        self.conv = nn.ModuleList()
+        # 第一个块可能需要强制使用 shortcut
+        self.conv.append(ConvBlockRes(in_channels, out_channels, momentum, force_shortcut=first_block_shortcut))
+        for _ in range(n_blocks - 1):
+            self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
+    def forward(self, x):
+        for block in self.conv:
+            x = block(x)
+        return x
+class DecoderBlock(nn.Module):
+    """解码器块"""
+    def __init__(self, in_channels: int, out_channels: int, stride: int,
+                 n_blocks: int, momentum: float = 0.01):
+        super().__init__()
+        # conv1: 转置卷积 + BatchNorm (kernel_size=3, stride=stride, padding=1, output_padding=1)
+        self.conv1 = nn.Sequential(
+            nn.ConvTranspose2d(in_channels, out_channels, 3, stride, padding=1, output_padding=1, bias=False),
+            nn.BatchNorm2d(out_channels, momentum=momentum)
+        )
+        # conv2: ConvBlockRes 列表
+        # 第一个块: in_channels = out_channels * 2 (concat 后), out_channels = out_channels
+        # 后续块: in_channels = out_channels, out_channels = out_channels
+        self.conv2 = nn.ModuleList()
+        self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
+        for _ in range(n_blocks - 1):
+            self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
+    def forward(self, x, concat_tensor):
+        x = self.conv1(x)
+        # 处理尺寸不匹配：填充较小的张量使其匹配较大的
+        diff_h = concat_tensor.size(2) - x.size(2)
+        diff_w = concat_tensor.size(3) - x.size(3)
+        if diff_h != 0 or diff_w != 0:
+            # 填充 x 使其与 concat_tensor 尺寸匹配
+            x = F.pad(x, [0, diff_w, 0, diff_h])
+        x = torch.cat([x, concat_tensor], dim=1)
+        for block in self.conv2:
+            x = block(x)
+        return x
+class Decoder(nn.Module):
+    """RMVPE 解码器"""
+    def __init__(self, in_channels: int, n_decoders: int, stride: int,
+                 n_blocks: int, out_channels: int = 16, momentum: float = 0.01):
+        super().__init__()
+        self.layers = nn.ModuleList()
+        for i in range(n_decoders):
+            out_ch = out_channels * (2 ** (n_decoders - 1 - i))
+            in_ch = in_channels if i == 0 else out_channels * (2 ** (n_decoders - i))
+            self.layers.append(
+                DecoderBlock(in_ch, out_ch, stride, n_blocks, momentum)
+            )
+    def forward(self, x, concat_tensors):
+        for i, layer in enumerate(self.layers):
+            x = layer(x, concat_tensors[-1 - i])
+        return x
+class DeepUnet(nn.Module):
+    """Deep U-Net 架构"""
+    def __init__(self, kernel_size: int, n_blocks: int, en_de_layers: int = 5,
+                 inter_layers: int = 4, in_channels: int = 1, en_out_channels: int = 16):
+        super().__init__()
+        # Encoder 输出通道: en_out_channels * 2^(en_de_layers-1) = 16 * 16 = 256
+        encoder_out_channels = en_out_channels * (2 ** (en_de_layers - 1))
+        # Intermediate 输出通道: encoder_out_channels * 2 = 512
+        intermediate_out_channels = encoder_out_channels * 2
+        self.encoder = Encoder(
+            in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
+        )
+        self.intermediate = Intermediate(
+            encoder_out_channels,
+            intermediate_out_channels,
+            inter_layers, n_blocks
+        )
+        self.decoder = Decoder(
+            intermediate_out_channels,
+            en_de_layers, kernel_size, n_blocks, en_out_channels
+        )
+    def forward(self, x):
+        x, concat_tensors = self.encoder(x)
+        x = self.intermediate(x)
+        x = self.decoder(x, concat_tensors)
+        return x
+class E2E(nn.Module):
+    """End-to-end RMVPE model: DeepUnet, a 3-channel conv, then a 360-way sigmoid head."""
+    def __init__(self, n_blocks: int, n_gru: int, kernel_size: int,
+                 en_de_layers: int = 5, inter_layers: int = 4,
+                 in_channels: int = 1, en_out_channels: int = 16):
+        super().__init__()
+        self.unet = DeepUnet(
+            kernel_size, n_blocks, en_de_layers, inter_layers,
+            in_channels, en_out_channels
+        )
+        # Collapse the U-Net output channels to 3 feature maps.
+        self.cnn = nn.Conv2d(en_out_channels, 3, 3, 1, 1)
+        # A truthy n_gru puts a BiGRU in front of the linear head; otherwise the
+        # head is a single linear layer on the flattened 3 * 128 features.
+        if n_gru:
+            self.fc = nn.Sequential(
+                BiGRU(3 * 128, 256, n_gru),
+                nn.Linear(512, 360),
+                nn.Dropout(0.25),
+                nn.Sigmoid()
+            )
+        else:
+            self.fc = nn.Sequential(
+                nn.Linear(3 * 128, 360),
+                nn.Dropout(0.25),
+                nn.Sigmoid()
+            )
+    def forward(self, mel):
+        """Map a mel spectrogram to per-frame outputs of the 360-unit sigmoid head.
+
+        A 3-D input, or a 4-D input whose second dimension is 1, has its last two
+        axes swapped first; any other input is passed to the U-Net unchanged.
+        """
+        # Input mel: [B, 128, T] or [B, 1, 128, T]
+        # The official implementation expects [B, 1, T, 128], i.e. time on the height axis and mel bins on the width axis
+        if mel.dim() == 3:
+            # [B, 128, T] -> [B, T, 128] -> [B, 1, T, 128]
+            mel = mel.transpose(-1, -2).unsqueeze(1)
+        elif mel.dim() == 4 and mel.shape[1] == 1:
+            # [B, 1, 128, T] -> [B, 1, T, 128]
+            mel = mel.transpose(-1, -2)
+        x = self.unet(mel)
+        x = self.cnn(x)
+        # x shape: (batch, 3, T, 128)
+        # Reshape to (batch, T, 384), where 384 = 3 * 128
+        x = x.transpose(1, 2).flatten(-2)  # (batch, T, 384)
+        x = self.fc(x)
+        return x
+class MelSpectrogram(nn.Module):
+    """Log-mel spectrogram extractor built on torch.stft and a librosa mel filterbank."""
+    def __init__(self, n_mel: int = 128, n_fft: int = 1024, win_size: int = 1024,
+                 hop_length: int = 160, sample_rate: int = 16000,
+                 fmin: int = 30, fmax: int = 8000):
+        super().__init__()
+        self.n_fft = n_fft
+        self.hop_length = hop_length
+        self.win_size = win_size
+        self.sample_rate = sample_rate
+        self.n_mel = n_mel
+        # Build the mel filterbank
+        mel_basis = self._mel_filterbank(sample_rate, n_fft, n_mel, fmin, fmax)
+        # Registered as buffers so they move with the module on .to(device).
+        self.register_buffer("mel_basis", mel_basis)
+        self.register_buffer("window", torch.hann_window(win_size))
+    def _mel_filterbank(self, sr, n_fft, n_mels, fmin, fmax):
+        """Build the mel filterbank as a float32 tensor."""
+        import librosa
+        # htk=True is required, to stay consistent with the official RVC RMVPE
+        mel = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels, fmin=fmin, fmax=fmax, htk=True)
+        return torch.from_numpy(mel).float()
+    def forward(self, audio):
+        """Return the log of the mel-projected squared STFT magnitude, clamped at 1e-5."""
+        # STFT
+        spec = torch.stft(
+            audio,
+            self.n_fft,
+            hop_length=self.hop_length,
+            win_length=self.win_size,
+            window=self.window,
+            center=True,
+            pad_mode="reflect",
+            normalized=False,
+            onesided=True,
+            return_complex=True
+        )
+        # Use the power spectrum (squared magnitude), stated to match the official RMVPE
+        # NOTE(review): the upstream RVC rmvpe.py may feed the plain (unsquared)
+        # magnitude into the mel filterbank; confirm before relying on this claim.
+        spec = torch.abs(spec) ** 2
+        # Mel projection
+        mel = torch.matmul(self.mel_basis, spec)
+        # Clamp before the log so zero-energy bins do not produce -inf.
+        mel = torch.log(torch.clamp(mel, min=1e-5))
+        return mel
+class RMVPE:
+    """Wrapper around the RMVPE model that extracts F0 from audio."""
+    def __init__(self, model_path: str, device: str = "cuda"):
+        """Load the checkpoint at model_path and move model and mel extractor to device."""
+        self.device = device
+        # Load the model
+        self.model = E2E(n_blocks=4, n_gru=1, kernel_size=2)
+        # NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects;
+        # only load checkpoints from a trusted source. If the file is a plain
+        # state dict, weights_only=True is presumably sufficient - TODO confirm.
+        ckpt = torch.load(model_path, map_location="cpu", weights_only=False)
+        self.model.load_state_dict(ckpt)
+        self.model = self.model.to(device).eval()
+        # Mel spectrogram extractor
+        self.mel_extractor = MelSpectrogram().to(device)
+        # Frequency mapping: cents value of each of the 360 bins (20 cents apart),
+        # padded by 4 on each side to line up with the padded salience in decoding.
+        cents_mapping = 20 * np.arange(360) + 1997.3794084376191
+        self.cents_mapping = np.pad(cents_mapping, (4, 4))
+    @torch.no_grad()
+    def infer_from_audio(self, audio: np.ndarray, thred: float = 0.03) -> np.ndarray:
+        """
+        Extract F0 from audio.
+        Args:
+            audio: 16 kHz audio data
+            thred: confidence threshold
+        Returns:
+            np.ndarray: F0 sequence
+        """
+        # Convert to a tensor
+        audio = torch.from_numpy(audio).float().to(self.device)
+        if audio.dim() == 1:
+            audio = audio.unsqueeze(0)
+        # Extract the mel spectrogram: [B, 128, T]
+        mel = self.mel_extractor(audio)
+        # Remember the original number of frames
+        n_frames = mel.shape[-1]
+        # Pad the time axis to a multiple of 32 (5 pooling layers, each halving)
+        n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
+        if n_pad > 0:
+            mel = F.pad(mel, (0, n_pad), mode='constant', value=0)
+        # Model inference - E2E.forward takes care of the transpose
+        hidden = self.model(mel)
+        # Drop the padded part and keep only the original frames
+        hidden = hidden[:, :n_frames, :]
+        hidden = hidden.squeeze(0).cpu().numpy()
+        # Decode F0
+        f0 = self._decode(hidden, thred)
+        return f0
+    def _decode(self, hidden: np.ndarray, thred: float) -> np.ndarray:
+        """Decode the model output into F0 in Hz, following the official RVC algorithm."""
+        # Use the official to_local_average_cents algorithm
+        cents = self._to_local_average_cents(hidden, thred)
+        # Convert cents to Hz
+        f0 = 10 * (2 ** (cents / 1200))
+        f0[f0 == 10] = 0  # cents=0 gives f0=10, which must be reset to zero
+        return f0
+    def _to_local_average_cents(self, salience: np.ndarray, thred: float) -> np.ndarray:
+        """The official RVC to_local_average_cents algorithm."""
+        # Step 1: find the peak bin of every frame
+        center = np.argmax(salience, axis=1)  # [T]
+        # Step 2: pad the salience
+        salience = np.pad(salience, ((0, 0), (4, 4)))  # [T, 368]
+        center += 4  # shift the center index to account for the padding
+        # Step 3: take the 9-bin window around each peak for the weighted average
+        todo_salience = []
+        todo_cents_mapping = []
+        starts = center - 4
+        ends = center + 5
+        for idx in range(salience.shape[0]):
+            todo_salience.append(salience[idx, starts[idx]:ends[idx]])
+            todo_cents_mapping.append(self.cents_mapping[starts[idx]:ends[idx]])
+        todo_salience = np.array(todo_salience)  # [T, 9]
+        todo_cents_mapping = np.array(todo_cents_mapping)  # [T, 9]
+        # Step 4: weighted average
+        product_sum = np.sum(todo_salience * todo_cents_mapping, axis=1)
+        weight_sum = np.sum(todo_salience, axis=1) + 1e-9
+        cents = product_sum / weight_sum
+        # Step 5: threshold filter - uses the per-frame maximum of the padded salience
+        maxx = np.max(salience, axis=1)
+        cents[maxx <= thred] = 0
+        return cents

models/synthesizer.py CHANGED Viewed

@@ -1,853 +1,853 @@
-# -*- coding: utf-8 -*-
-"""
-RVC v2 合成器模型定义
-"""
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from typing import Optional, Tuple
-import numpy as np
-class LayerNorm(nn.Module):
-    """Layer normalization for channels-first tensors"""
-    def __init__(self, channels: int, eps: float = 1e-5):
-        super().__init__()
-        self.channels = channels
-        self.eps = eps
-        # Learnable per-channel scale and shift.
-        self.gamma = nn.Parameter(torch.ones(channels))
-        self.beta = nn.Parameter(torch.zeros(channels))
-    def forward(self, x):
-        """Normalize over the channel axis and return a tensor laid out like the input."""
-        # x: [B, C, T]
-        # F.layer_norm normalizes the trailing axis, so move channels last first.
-        x = x.transpose(1, -1)  # [B, T, C]
-        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
-        return x.transpose(1, -1)  # [B, C, T]
-class MultiHeadAttention(nn.Module):
-    """Multi-head attention module"""
-    def __init__(self, channels: int, out_channels: int, n_heads: int,
-                 p_dropout: float = 0.0, window_size: Optional[int] = None,
-                 heads_share: bool = True, block_length: Optional[int] = None,
-                 proximal_bias: bool = False, proximal_init: bool = False):
-        super().__init__()
-        assert channels % n_heads == 0
-        self.channels = channels
-        self.out_channels = out_channels
-        self.n_heads = n_heads
-        self.p_dropout = p_dropout
-        self.window_size = window_size
-        self.heads_share = heads_share
-        self.block_length = block_length
-        self.proximal_bias = proximal_bias
-        self.proximal_init = proximal_init
-        # Holds the attention weights of the most recent forward call.
-        self.attn = None
-        self.k_channels = channels // n_heads
-        self.conv_q = nn.Conv1d(channels, channels, 1)
-        self.conv_k = nn.Conv1d(channels, channels, 1)
-        self.conv_v = nn.Conv1d(channels, channels, 1)
-        self.conv_o = nn.Conv1d(channels, out_channels, 1)
-        self.drop = nn.Dropout(p_dropout)
-        # Relative-position embeddings exist only when a window size is given;
-        # with heads_share a single embedding table is used for all heads.
-        if window_size is not None:
-            n_heads_rel = 1 if heads_share else n_heads
-            rel_stddev = self.k_channels ** -0.5
-            self.emb_rel_k = nn.Parameter(
-                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev
-            )
-            self.emb_rel_v = nn.Parameter(
-                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev
-            )
-        nn.init.xavier_uniform_(self.conv_q.weight)
-        nn.init.xavier_uniform_(self.conv_k.weight)
-        nn.init.xavier_uniform_(self.conv_v.weight)
-        # proximal_init starts the key projection as a copy of the query projection.
-        if proximal_init:
-            with torch.no_grad():
-                self.conv_k.weight.copy_(self.conv_q.weight)
-                self.conv_k.bias.copy_(self.conv_q.bias)
-    def forward(self, x, c, attn_mask=None):
-        """Attend from x (queries) to c (keys and values) and project the result."""
-        q = self.conv_q(x)
-        k = self.conv_k(c)
-        v = self.conv_v(c)
-        x, self.attn = self.attention(q, k, v, mask=attn_mask)
-        x = self.conv_o(x)
-        return x
-    def attention(self, query, key, value, mask=None):
-        """Scaled dot-product attention; returns (output, attention weights)."""
-        # query, key, value: [B, C, T]
-        b, d, t_s = key.size()
-        t_t = query.size(2)
-        # Split channels into heads: [B, n_heads, T, k_channels].
-        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
-        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
-        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
-        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
-        if self.window_size is not None:
-            assert t_s == t_t, "Relative attention only for self-attention"
-            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
-            rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
-            scores_local = self._relative_position_to_absolute_position(rel_logits)
-            scores = scores + scores_local
-        if self.proximal_bias:
-            assert t_s == t_t, "Proximal bias only for self-attention"
-            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
-        if mask is not None:
-            # Masked positions get a large negative score so softmax drives them towards zero.
-            scores = scores.masked_fill(mask == 0, -1e4)
-            # Block-length masking is nested here, so it only runs when a mask is supplied.
-            if self.block_length is not None:
-                assert t_s == t_t, "Block length only for self-attention"
-                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
-                scores = scores.masked_fill(block_mask == 0, -1e4)
-        p_attn = F.softmax(scores, dim=-1)
-        p_attn = self.drop(p_attn)
-        output = torch.matmul(p_attn, value)
-        if self.window_size is not None:
-            relative_weights = self._absolute_position_to_relative_position(p_attn)
-            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
-            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
-        # Merge the heads back into the channel axis: [B, C, T].
-        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
-        return output, p_attn
-    def _matmul_with_relative_values(self, x, y):
-        """Multiply x by y after adding a leading axis to y."""
-        ret = torch.matmul(x, y.unsqueeze(0))
-        return ret
-    def _matmul_with_relative_keys(self, x, y):
-        """Multiply x by y with a leading axis added and its last two axes swapped."""
-        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
-        return ret
-    def _get_relative_embeddings(self, relative_embeddings, length):
-        """Pad and/or slice the embedding table to 2 * length - 1 relative positions."""
-        max_relative_position = 2 * self.window_size + 1  # computed but not used below
-        pad_length = max(length - (self.window_size + 1), 0)
-        slice_start_position = max((self.window_size + 1) - length, 0)
-        slice_end_position = slice_start_position + 2 * length - 1
-        if pad_length > 0:
-            padded_relative_embeddings = F.pad(
-                relative_embeddings,
-                (0, 0, pad_length, pad_length, 0, 0)
-            )
-        else:
-            padded_relative_embeddings = relative_embeddings
-        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
-        return used_relative_embeddings
-    def _relative_position_to_absolute_position(self, x):
-        """Re-index the last axis from 2 * length - 1 relative offsets to length absolute positions."""
-        batch, heads, length, _ = x.size()
-        # Pad, flatten and re-view so each row is shifted by one relative to the previous row.
-        x = F.pad(x, (0, 1, 0, 0, 0, 0, 0, 0))
-        x_flat = x.view(batch, heads, length * 2 * length)
-        x_flat = F.pad(x_flat, (0, length - 1, 0, 0, 0, 0))
-        x_final = x_flat.view(batch, heads, length + 1, 2 * length - 1)[:, :, :length, length - 1:]
-        return x_final
-    def _absolute_position_to_relative_position(self, x):
-        """Re-index the last axis from length absolute positions to 2 * length - 1 relative offsets."""
-        batch, heads, length, _ = x.size()
-        x = F.pad(x, (0, length - 1, 0, 0, 0, 0, 0, 0))
-        x_flat = x.view(batch, heads, length ** 2 + length * (length - 1))
-        x_flat = F.pad(x_flat, (length, 0, 0, 0, 0, 0))
-        x_final = x_flat.view(batch, heads, length, 2 * length)[:, :, :, 1:]
-        return x_final
-    def _attention_bias_proximal(self, length):
-        """Return a [1, 1, length, length] bias of -log(1 + |i - j|)."""
-        r = torch.arange(length, dtype=torch.float32)
-        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
-        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
-class FFN(nn.Module):
-    """Feed-forward network with optional causal convolution"""
-    def __init__(self, in_channels: int, out_channels: int, filter_channels: int,
-                 kernel_size: int, p_dropout: float = 0.0, activation: str = None,
-                 causal: bool = False):
-        super().__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.filter_channels = filter_channels
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.activation = activation
-        self.causal = causal
-        # Pick the padding function once; both convolutions are built without padding.
-        if causal:
-            self.padding = self._causal_padding
-        else:
-            self.padding = self._same_padding
-        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
-        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
-        self.drop = nn.Dropout(p_dropout)
-    def forward(self, x, x_mask):
-        """Apply conv, activation, dropout, conv; the result is multiplied by x_mask."""
-        x = self.conv_1(self.padding(x))
-        if self.activation == "gelu":
-            # Sigmoid-based GELU approximation.
-            x = x * torch.sigmoid(1.702 * x)
-        else:
-            x = torch.relu(x)
-        x = self.drop(x)
-        x = self.conv_2(self.padding(x))
-        return x * x_mask
-    def _causal_padding(self, x):
-        """Left-pad the last axis by kernel_size - 1 (no padding for kernel size 1)."""
-        if self.kernel_size == 1:
-            return x
-        pad_l = self.kernel_size - 1
-        pad_r = 0
-        return F.pad(x, (pad_l, pad_r, 0, 0, 0, 0))
-    def _same_padding(self, x):
-        """Pad both sides of the last axis so the convolution keeps the length."""
-        if self.kernel_size == 1:
-            return x
-        pad_l = (self.kernel_size - 1) // 2
-        pad_r = self.kernel_size // 2
-        return F.pad(x, (pad_l, pad_r, 0, 0, 0, 0))
-class Encoder(nn.Module):
-    """Transformer encoder with multi-head attention"""
-    def __init__(self, hidden_channels: int, filter_channels: int, n_heads: int,
-                 n_layers: int, kernel_size: int = 1, p_dropout: float = 0.0,
-                 window_size: int = 10):
-        super().__init__()
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.window_size = window_size
-        self.drop = nn.Dropout(p_dropout)
-        # One attention block, one FFN block and two layer norms per layer.
-        self.attn_layers = nn.ModuleList()
-        self.norm_layers_1 = nn.ModuleList()
-        self.ffn_layers = nn.ModuleList()
-        self.norm_layers_2 = nn.ModuleList()
-        for _ in range(n_layers):
-            self.attn_layers.append(
-                MultiHeadAttention(
-                    hidden_channels, hidden_channels, n_heads,
-                    p_dropout=p_dropout, window_size=window_size
-                )
-            )
-            self.norm_layers_1.append(LayerNorm(hidden_channels))
-            self.ffn_layers.append(
-                FFN(hidden_channels, hidden_channels, filter_channels,
-                    kernel_size, p_dropout=p_dropout)
-            )
-            self.norm_layers_2.append(LayerNorm(hidden_channels))
-    def forward(self, x, x_mask):
-        """Run all layers; each sub-layer is residual and normalized after the add."""
-        # Pairwise mask: a position pair is valid only if both positions are valid.
-        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
-        x = x * x_mask
-        for i in range(self.n_layers):
-            # Self-attention: queries, keys and values all come from x.
-            y = self.attn_layers[i](x, x, attn_mask)
-            y = self.drop(y)
-            x = self.norm_layers_1[i](x + y)
-            y = self.ffn_layers[i](x, x_mask)
-            y = self.drop(y)
-            x = self.norm_layers_2[i](x + y)
-        x = x * x_mask
-        return x
-class TextEncoder(nn.Module):
-    """Text encoder for RVC - encodes phone and pitch embeddings"""
-    def __init__(self, out_channels: int, hidden_channels: int, filter_channels: int,
-                 n_heads: int, n_layers: int, kernel_size: int, p_dropout: float,
-                 f0: bool = True):
-        super().__init__()
-        self.out_channels = out_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.f0 = f0
-        # Phone embedding: Linear projection from 768-dim HuBERT features
-        self.emb_phone = nn.Linear(768, hidden_channels)
-        # Pitch embedding (only if f0 is enabled)
-        if f0:
-            self.emb_pitch = nn.Embedding(256, hidden_channels)
-        # Transformer encoder
-        self.encoder = Encoder(
-            hidden_channels, filter_channels, n_heads, n_layers,
-            kernel_size, p_dropout
-        )
-        # Output projection to mean and log-variance
-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-    def forward(self, phone, pitch, lengths):
-        """
-        Args:
-            phone: [B, 768, T] phone features from HuBERT (channels first)
-            pitch: [B, T] pitch indices (0-255)
-            lengths: [B] sequence lengths
-        Returns:
-            m: [B, out_channels, T] mean
-            logs: [B, out_channels, T] log-variance
-            x_mask: [B, 1, T] mask
-        """
-        # The debug f-strings below are formatted (including .item() calls) on every
-        # call, whether or not debug logging is enabled.
-        import logging
-        log = logging.getLogger(__name__)
-        log.debug(f"[TextEncoder] 输入 phone: shape={phone.shape}")
-        log.debug(f"[TextEncoder] 输入 pitch: shape={pitch.shape}, max={pitch.max().item()}, min={pitch.min().item()}")
-        log.debug(f"[TextEncoder] 输入 lengths: {lengths}")
-        # Transpose phone from [B, C, T] to [B, T, C] for linear layer
-        phone = phone.transpose(1, 2)  # [B, T, 768]
-        log.debug(f"[TextEncoder] 转置后 phone: shape={phone.shape}")
-        # Create mask
-        x_mask = torch.unsqueeze(
-            self._sequence_mask(lengths, phone.size(1)), 1
-        ).to(phone.dtype)
-        log.debug(f"[TextEncoder] x_mask: shape={x_mask.shape}, sum={x_mask.sum().item()}")
-        # Phone embedding
-        x = self.emb_phone(phone)  # [B, T, hidden_channels]
-        log.debug(f"[TextEncoder] emb_phone 输出: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}")
-        # Add pitch embedding if enabled
-        if self.f0 and pitch is not None:
-            # Clamp pitch to valid range
-            pitch_clamped = torch.clamp(pitch, 0, 255)
-            pitch_emb = self.emb_pitch(pitch_clamped)
-            log.debug(f"[TextEncoder] emb_pitch 输出: shape={pitch_emb.shape}, max={pitch_emb.abs().max().item():.4f}")
-            x = x + pitch_emb
-        # Transpose for conv layers: [B, hidden_channels, T]
-        x = x.transpose(1, 2)
-        log.debug(f"[TextEncoder] 转置后 x: shape={x.shape}")
-        # Apply mask
-        x = x * x_mask
-        # Transformer encoder
-        x = self.encoder(x, x_mask)
-        log.debug(f"[TextEncoder] Transformer 输出: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}")
-        # Project to mean and log-variance
-        stats = self.proj(x) * x_mask
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        log.debug(f"[TextEncoder] 最终输出 m: shape={m.shape}, max={m.abs().max().item():.4f}")
-        log.debug(f"[TextEncoder] 最终输出 logs: shape={logs.shape}, max={logs.max().item():.4f}, min={logs.min().item():.4f}")
-        return m, logs, x_mask
-    def _sequence_mask(self, length, max_length=None):
-        """Return a boolean [B, max_length] mask that is True where index < length."""
-        if max_length is None:
-            max_length = length.max()
-        x = torch.arange(max_length, dtype=length.dtype, device=length.device)
-        return x.unsqueeze(0) < length.unsqueeze(1)
-class ResidualCouplingBlock(nn.Module):
-    """Residual coupling block: n_flows coupling layers, each followed by a Flip."""
-    def __init__(self, channels: int, hidden_channels: int, kernel_size: int,
-                 dilation_rate: int, n_layers: int, n_flows: int = 4,
-                 gin_channels: int = 0):
-        super().__init__()
-        self.flows = nn.ModuleList()
-        for _ in range(n_flows):
-            self.flows.append(
-                ResidualCouplingLayer(
-                    channels, hidden_channels, kernel_size,
-                    dilation_rate, n_layers, gin_channels=gin_channels
-                )
-            )
-            self.flows.append(Flip())
-    def forward(self, x, x_mask, g=None, reverse=False):
-        """Apply the flows in order, or in reversed order when reverse is True."""
-        if not reverse:
-            # NOTE(review): every flow is unpacked into two values here, but
-            # Flip.forward returns a single tensor, so this path looks broken
-            # for the Flip entries - confirm before using reverse=False.
-            for flow in self.flows:
-                x, _ = flow(x, x_mask, g=g, reverse=reverse)
-        else:
-            for flow in reversed(self.flows):
-                x = flow(x, x_mask, g=g, reverse=reverse)
-        return x
-class ResidualCouplingLayer(nn.Module):
-    """Residual coupling layer: shifts one half of the channels by a function of the other half."""
-    def __init__(self, channels: int, hidden_channels: int, kernel_size: int,
-                 dilation_rate: int, n_layers: int, mean_only: bool = True,
-                 gin_channels: int = 0):
-        super().__init__()
-        self.half_channels = channels // 2
-        # Stored but not read by forward, which always treats the output as a mean.
-        self.mean_only = mean_only
-        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
-        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels)
-        self.post = nn.Conv1d(hidden_channels, self.half_channels, 1)
-        # Zero-initialized output projection: the layer starts with a zero shift.
-        self.post.weight.data.zero_()
-        self.post.bias.data.zero_()
-    def forward(self, x, x_mask, g=None, reverse=False):
-        """Return (x, None) in the forward direction and x alone when reverse is True."""
-        x0, x1 = torch.split(x, [self.half_channels] * 2, dim=1)
-        h = self.pre(x0) * x_mask
-        h = self.enc(h, x_mask, g=g)
-        stats = self.post(h) * x_mask
-        m = stats
-        if not reverse:
-            x1 = m + x1 * x_mask
-            x = torch.cat([x0, x1], dim=1)
-            return x, None
-        else:
-            # Inverse of the forward shift.
-            x1 = (x1 - m) * x_mask
-            x = torch.cat([x0, x1], dim=1)
-            return x
-class Flip(nn.Module):
-    """Flip layer: reverses the order of the channel axis (dim 1)."""
-    def forward(self, x, *args, reverse=False, **kwargs):
-        # NOTE(review): a single tensor is returned regardless of reverse, while
-        # ResidualCouplingBlock.forward unpacks two values when reverse is False.
-        x = torch.flip(x, [1])
-        return x
-class WN(nn.Module):
-    """WaveNet-style network (with weight normalization)"""
-    def __init__(self, hidden_channels: int, kernel_size: int,
-                 dilation_rate: int, n_layers: int, gin_channels: int = 0,
-                 p_dropout: float = 0):
-        super().__init__()
-        self.n_layers = n_layers
-        self.hidden_channels = hidden_channels
-        self.gin_channels = gin_channels
-        self.in_layers = nn.ModuleList()
-        self.res_skip_layers = nn.ModuleList()
-        self.drop = nn.Dropout(p_dropout)
-        # One conditioning conv produces the 2 * hidden_channels slice for every layer.
-        if gin_channels > 0:
-            self.cond_layer = nn.utils.weight_norm(
-                nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
-            )
-        for i in range(n_layers):
-            dilation = dilation_rate ** i
-            # Padding that keeps the sequence length for this dilation.
-            padding = (kernel_size * dilation - dilation) // 2
-            self.in_layers.append(
-                nn.utils.weight_norm(
-                    nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
-                              dilation=dilation, padding=padding)
-                )
-            )
-            # The first n-1 layers output 2 * hidden_channels, the last layer outputs hidden_channels
-            if i < n_layers - 1:
-                res_skip_channels = 2 * hidden_channels
-            else:
-                res_skip_channels = hidden_channels
-            self.res_skip_layers.append(
-                nn.utils.weight_norm(
-                    nn.Conv1d(hidden_channels, res_skip_channels, 1)
-                )
-            )
-    def forward(self, x, x_mask, g=None):
-        """Return the accumulated skip output of all layers, multiplied by x_mask."""
-        output = torch.zeros_like(x)
-        if g is not None and self.gin_channels > 0:
-            g = self.cond_layer(g)
-        for i in range(self.n_layers):
-            x_in = self.in_layers[i](x)
-            if g is not None:
-                # Take this layer's slice of the conditioning tensor.
-                cond_offset = i * 2 * self.hidden_channels
-                g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
-                x_in = x_in + g_l
-            # Gated activation: tanh on the first half, sigmoid on the second half.
-            acts = torch.tanh(x_in[:, :self.hidden_channels]) * torch.sigmoid(x_in[:, self.hidden_channels:])
-            acts = self.drop(acts)
-            res_skip = self.res_skip_layers[i](acts)
-            if i < self.n_layers - 1:
-                # First n-1 layers: residual + skip
-                x = (x + res_skip[:, :self.hidden_channels]) * x_mask
-                output = output + res_skip[:, self.hidden_channels:]
-            else:
-                # Last layer: residual only, also added to output
-                x = (x + res_skip) * x_mask
-                output = output + res_skip
-        return output * x_mask
-class PosteriorEncoder(nn.Module):
-    """Posterior encoder: projects the input to a mean and a log-scale and samples from them."""
-    def __init__(self, in_channels: int, out_channels: int, hidden_channels: int,
-                 kernel_size: int, dilation_rate: int, n_layers: int,
-                 gin_channels: int = 0):
-        super().__init__()
-        self.out_channels = out_channels
-        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels)
-        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-    def forward(self, x, x_lengths, g=None):
-        """Return (z, m, logs, x_mask), where z is a random sample built from m and logs."""
-        x_mask = torch.unsqueeze(
-            self._sequence_mask(x_lengths, x.size(2)), 1
-        ).to(x.dtype)
-        x = self.pre(x) * x_mask
-        x = self.enc(x, x_mask, g=g)
-        stats = self.proj(x) * x_mask
-        m, logs = torch.split(stats, self.out_channels, dim=1)
-        # Sample around the mean with standard normal noise scaled by exp(logs).
-        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-        return z, m, logs, x_mask
-    def _sequence_mask(self, length, max_length=None):
-        """Return a boolean [B, max_length] mask that is True where index < length."""
-        if max_length is None:
-            max_length = length.max()
-        x = torch.arange(max_length, dtype=length.dtype, device=length.device)
-        return x.unsqueeze(0) < length.unsqueeze(1)
-class Generator(nn.Module):
-    """NSF-HiFi-GAN generator (with weight normalization)"""
-    def __init__(self, initial_channel: int, resblock_kernel_sizes: list,
-                 resblock_dilation_sizes: list, upsample_rates: list,
-                 upsample_initial_channel: int, upsample_kernel_sizes: list,
-                 gin_channels: int = 0, sr: int = 40000, is_half: bool = False):
-        super().__init__()
-        self.num_kernels = len(resblock_kernel_sizes)
-        self.num_upsamples = len(upsample_rates)
-        self.sr = sr
-        self.is_half = is_half
-        # Total upsampling factor
-        self.upp = int(np.prod(upsample_rates))
-        self.conv_pre = nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, 3)
-        # NSF source module
-        self.m_source = SourceModuleHnNSF(sample_rate=sr, harmonic_num=0)
-        # Noise convolution layers
-        self.noise_convs = nn.ModuleList()
-        self.ups = nn.ModuleList()
-        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-            # Each upsampling stage halves the channel count.
-            c_cur = upsample_initial_channel // (2 ** (i + 1))
-            self.ups.append(
-                nn.utils.weight_norm(
-                    nn.ConvTranspose1d(
-                        upsample_initial_channel // (2 ** i),
-                        c_cur,
-                        k, u, (k - u) // 2
-                    )
-                )
-            )
-            # Noise convolution: strided by the product of the remaining upsample
-            # rates; the last stage uses a kernel-size-1 conv instead.
-            if i + 1 < len(upsample_rates):
-                stride_f0 = int(np.prod(upsample_rates[i + 1:]))
-                self.noise_convs.append(
-                    nn.Conv1d(1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)
-                )
-            else:
-                self.noise_convs.append(nn.Conv1d(1, c_cur, kernel_size=1))
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.ups)):
-            ch = upsample_initial_channel // (2 ** (i + 1))
-            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
-                self.resblocks.append(ResBlock(ch, k, d))
-        # ch still holds the channel count of the last upsampling stage here.
-        self.conv_post = nn.Conv1d(ch, 1, 7, 1, 3, bias=False)
-        if gin_channels > 0:
-            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
-    def forward(self, x, f0, g=None):
-        """Generate a tanh-bounded signal from features x, pitch f0 and optional conditioning g."""
-        # The debug f-strings below are formatted (including .item() calls) on every
-        # call, whether or not debug logging is enabled.
-        import logging
-        log = logging.getLogger(__name__)
-        log.debug(f"[Generator] 输入 x: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}")
-        log.debug(f"[Generator] 输入 f0: shape={f0.shape}, max={f0.max().item():.1f}, min={f0.min().item():.1f}")
-        if g is not None:
-            log.debug(f"[Generator] 输入 g: shape={g.shape}, max={g.abs().max().item():.4f}")
-        # Generate the NSF excitation signal
-        har_source, _, _ = self.m_source(f0, self.upp)
-        har_source = har_source.transpose(1, 2)  # [B, 1, T*upp]
-        log.debug(f"[Generator] NSF har_source: shape={har_source.shape}, max={har_source.abs().max().item():.4f}")
-        x = self.conv_pre(x)
-        log.debug(f"[Generator] conv_pre 输出: shape={x.shape}, max={x.abs().max().item():.4f}")
-        if g is not None:
-            x = x + self.cond(g)
-            log.debug(f"[Generator] 加入条件后: max={x.abs().max().item():.4f}")
-        for i in range(self.num_upsamples):
-            x = F.leaky_relu(x, 0.1)
-            x = self.ups[i](x)
-            # Mix in the source signal
-            x_source = self.noise_convs[i](har_source)
-            x = x + x_source
-            # Average the outputs of this stage's residual blocks.
-            xs = None
-            for j in range(self.num_kernels):
-                if xs is None:
-                    xs = self.resblocks[i * self.num_kernels + j](x)
-                else:
-                    xs += self.resblocks[i * self.num_kernels + j](x)
-            x = xs / self.num_kernels
-            log.debug(f"[Generator] 上采样层 {i}: shape={x.shape}, max={x.abs().max().item():.4f}")
-        # This call uses leaky_relu's default slope, unlike the 0.1 used inside the loop.
-        x = F.leaky_relu(x)
-        x = self.conv_post(x)
-        log.debug(f"[Generator] conv_post 输出: shape={x.shape}, max={x.abs().max().item():.4f}")
-        x = torch.tanh(x)
-        log.debug(f"[Generator] tanh 输出: shape={x.shape}, max={x.abs().max().item():.4f}")
-        return x
-    def remove_weight_norm(self):
-        """Strip weight normalization from the upsampling layers and the residual blocks."""
-        for l in self.ups:
-            nn.utils.remove_weight_norm(l)
-        for l in self.resblocks:
-            l.remove_weight_norm()
-class ResBlock(nn.Module):
-    """Residual block (with weight normalization)"""
-    def __init__(self, channels: int, kernel_size: int = 3, dilation: tuple = (1, 3, 5)):
-        super().__init__()
-        # One dilated conv per dilation value, padded to keep the sequence length.
-        self.convs1 = nn.ModuleList([
-            nn.utils.weight_norm(
-                nn.Conv1d(channels, channels, kernel_size, 1,
-                          (kernel_size * d - d) // 2, dilation=d)
-            )
-            for d in dilation
-        ])
-        # A matching undilated conv for each dilated one.
-        self.convs2 = nn.ModuleList([
-            nn.utils.weight_norm(
-                nn.Conv1d(channels, channels, kernel_size, 1,
-                          (kernel_size - 1) // 2)
-            )
-            for _ in dilation
-        ])
-    def forward(self, x):
-        """Apply each (dilated conv, plain conv) pair as a residual update of x."""
-        for c1, c2 in zip(self.convs1, self.convs2):
-            xt = F.leaky_relu(x, 0.1)
-            xt = c1(xt)
-            xt = F.leaky_relu(xt, 0.1)
-            xt = c2(xt)
-            x = xt + x
-        return x
-    def remove_weight_norm(self):
-        """Strip weight normalization from every convolution in the block."""
-        for l in self.convs1:
-            nn.utils.remove_weight_norm(l)
-        for l in self.convs2:
-            nn.utils.remove_weight_norm(l)
-class SineGenerator(nn.Module):
-    """Sine wave generator - the core component of NSF"""
-    def __init__(self, sample_rate: int, harmonic_num: int = 0,
-                 sine_amp: float = 0.1, noise_std: float = 0.003,
-                 voiced_threshold: float = 10):
-        super().__init__()
-        self.sample_rate = sample_rate
-        # harmonic_num and dim are stored but not read by forward.
-        self.harmonic_num = harmonic_num
-        self.sine_amp = sine_amp
-        self.noise_std = noise_std
-        self.voiced_threshold = voiced_threshold
-        self.dim = harmonic_num + 1
-    def forward(self, f0: torch.Tensor, upp: int):
-        """
-        Generate the sine excitation signal.
-        Args:
-            f0: fundamental frequency tensor [B, T]
-            upp: upsampling factor
-        Returns:
-            sine signal [B, T*upp, 1]
-        """
-        with torch.no_grad():
-            # Upsample F0
-            f0 = f0.unsqueeze(1)  # [B, 1, T]
-            f0_up = F.interpolate(f0, scale_factor=upp, mode='nearest')
-            f0_up = f0_up.transpose(1, 2)  # [B, T*upp, 1]
-            # Generate the sine wave
-            rad = f0_up / self.sample_rate  # normalized frequency
-            rad_acc = torch.cumsum(rad, dim=1) % 1  # accumulated phase
-            sine_wave = torch.sin(2 * np.pi * rad_acc) * self.sine_amp
-            # Samples whose F0 is not above voiced_threshold are replaced by noise
-            voiced_mask = (f0_up > self.voiced_threshold).float()
-            noise = torch.randn_like(sine_wave) * self.noise_std
-            sine_wave = sine_wave * voiced_mask + noise * (1 - voiced_mask)
-            return sine_wave
-class SourceModuleHnNSF(nn.Module):
-    """谐波加噪声源模块"""
-    def __init__(self, sample_rate: int, harmonic_num: int = 0,
-                 sine_amp: float = 0.1, noise_std: float = 0.003,
-                 add_noise_std: float = 0.003):
-        super().__init__()
-        self.sine_generator = SineGenerator(
-            sample_rate, harmonic_num, sine_amp, noise_std
-        )
-        self.l_linear = nn.Linear(harmonic_num + 1, 1)
-        self.l_tanh = nn.Tanh()
-    def forward(self, f0: torch.Tensor, upp: int):
-        sine = self.sine_generator(f0, upp)  # [B, T*upp, 1]
-        sine = self.l_tanh(self.l_linear(sine))
-        noise = torch.randn_like(sine) * 0.003
-        return sine, noise, None  # 返回 3 个值以匹配接口
-class SynthesizerTrnMs768NSFsid(nn.Module):
-    """RVC v2 synthesizer (768-dim HuBERT + NSF + SID).
-    Wires together a TextEncoder (prior), a ResidualCouplingBlock (flow), a Generator
-    (decoder) and a speaker-embedding table.
-    """
-    def __init__(self, spec_channels: int, segment_size: int,
-                 inter_channels: int, hidden_channels: int, filter_channels: int,
-                 n_heads: int, n_layers: int, kernel_size: int, p_dropout: float,
-                 resblock: str, resblock_kernel_sizes: list,
-                 resblock_dilation_sizes: list, upsample_rates: list,
-                 upsample_initial_channel: int, upsample_kernel_sizes: list,
-                 spk_embed_dim: int, gin_channels: int, sr: int):
-        super().__init__()
-        # Keep every hyper-parameter on the instance.
-        self.spec_channels = spec_channels
-        self.inter_channels = inter_channels
-        self.hidden_channels = hidden_channels
-        self.filter_channels = filter_channels
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.kernel_size = kernel_size
-        self.p_dropout = p_dropout
-        self.resblock = resblock
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        self.upsample_rates = upsample_rates
-        self.upsample_initial_channel = upsample_initial_channel
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.segment_size = segment_size
-        self.gin_channels = gin_channels
-        self.spk_embed_dim = spk_embed_dim
-        self.sr = sr
-        # Text encoder (a TextEncoder is used instead of a PosteriorEncoder)
-        self.enc_p = TextEncoder(
-            inter_channels, hidden_channels, filter_channels,
-            n_heads, n_layers, kernel_size, p_dropout, f0=True
-        )
-        # Decoder/generator (NSF-HiFiGAN; it owns m_source internally)
-        self.dec = Generator(
-            inter_channels, resblock_kernel_sizes, resblock_dilation_sizes,
-            upsample_rates, upsample_initial_channel, upsample_kernel_sizes,
-            gin_channels, sr=sr
-        )
-        # Flow
-        self.flow = ResidualCouplingBlock(
-            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
-        )
-        # Speaker embedding
-        self.emb_g = nn.Embedding(spk_embed_dim, gin_channels)
-    def forward(self, phone, phone_lengths, pitch, nsff0, sid, skip_head=0, return_length=0):
-        """Forward pass: encode, sample, run the flow in its non-reverse direction, decode.
-        ``skip_head`` and ``return_length`` are accepted but not read by this body.
-        """
-        g = self.emb_g(sid).unsqueeze(-1)
-        # TextEncoder returns the mean and log statistics
-        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
-        # Sample outside the encoder; exp(logs_p) scales the noise, damped by 0.66666
-        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
-        # Forward flow
-        z = self.flow(z_p, x_mask, g=g)
-        # Generate audio (f0 is passed in)
-        o = self.dec(z, nsff0, g=g)
-        return o
-    def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=1.0):
-        """Inference: same pipeline as forward() but with the flow reversed.
-        Returns ``(o, x_mask)``. ``rate`` is accepted but not read by this body.
-        Every intermediate tensor is summarized through debug logging.
-        """
-        import logging
-        log = logging.getLogger(__name__)
-        log.debug(f"[infer] 输入 phone: shape={phone.shape}, dtype={phone.dtype}")
-        log.debug(f"[infer] 输入 phone 统计: max={phone.abs().max().item():.4f}, mean={phone.abs().mean().item():.4f}")
-        log.debug(f"[infer] 输入 phone_lengths: {phone_lengths}")
-        log.debug(f"[infer] 输入 pitch: shape={pitch.shape}, max={pitch.max().item()}, min={pitch.min().item()}")
-        log.debug(f"[infer] 输入 nsff0: shape={nsff0.shape}, max={nsff0.max().item():.1f}, min={nsff0.min().item():.1f}")
-        log.debug(f"[infer] 输入 sid: {sid}")
-        g = self.emb_g(sid).unsqueeze(-1)
-        log.debug(f"[infer] 说话人嵌入 g: shape={g.shape}, max={g.abs().max().item():.4f}")
-        # TextEncoder returns the mean and log statistics
-        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
-        log.debug(f"[infer] TextEncoder 输出:")
-        log.debug(f"[infer]   m_p: shape={m_p.shape}, max={m_p.abs().max().item():.4f}, mean={m_p.abs().mean().item():.4f}")
-        log.debug(f"[infer]   logs_p: shape={logs_p.shape}, max={logs_p.max().item():.4f}, min={logs_p.min().item():.4f}")
-        log.debug(f"[infer]   x_mask: shape={x_mask.shape}, sum={x_mask.sum().item()}")
-        # Sample outside the encoder (a smaller noise factor is used for more stable output)
-        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
-        log.debug(f"[infer] 采样后 z_p: shape={z_p.shape}, max={z_p.abs().max().item():.4f}, mean={z_p.abs().mean().item():.4f}")
-        # Reverse flow
-        z = self.flow(z_p, x_mask, g=g, reverse=True)
-        log.debug(f"[infer] Flow 输出 z: shape={z.shape}, max={z.abs().max().item():.4f}, mean={z.abs().mean().item():.4f}")
-        # Generate audio (f0 is passed in; the Generator builds the NSF excitation internally)
-        o = self.dec(z * x_mask, nsff0, g=g)
-        log.debug(f"[infer] Generator 输出 o: shape={o.shape}, max={o.abs().max().item():.4f}, mean={o.abs().mean().item():.4f}")
-        return o, x_mask

+# -*- coding: utf-8 -*-
+"""
+RVC v2 合成器模型定义
+"""
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Optional, Tuple
+import numpy as np
+class LayerNorm(nn.Module):
+    """Layer normalization for channels-first tensors"""
+    def __init__(self, channels: int, eps: float = 1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+    def forward(self, x):
+        # x: [B, C, T]
+        x = x.transpose(1, -1)  # [B, T, C]
+        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+        return x.transpose(1, -1)  # [B, C, T]
+class MultiHeadAttention(nn.Module):
+    """Multi-head attention over channels-first tensors, with optional windowed
+    relative-position embeddings, proximal bias and block-local masking.
+    """
+    def __init__(self, channels: int, out_channels: int, n_heads: int,
+                 p_dropout: float = 0.0, window_size: Optional[int] = None,
+                 heads_share: bool = True, block_length: Optional[int] = None,
+                 proximal_bias: bool = False, proximal_init: bool = False):
+        super().__init__()
+        assert channels % n_heads == 0
+        self.channels = channels
+        self.out_channels = out_channels
+        self.n_heads = n_heads
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+        self.heads_share = heads_share
+        self.block_length = block_length
+        self.proximal_bias = proximal_bias
+        self.proximal_init = proximal_init
+        self.attn = None  # overwritten with the attention weights on every forward()
+        self.k_channels = channels // n_heads  # channels per head
+        # 1x1 convolutions acting as the query/key/value/output projections.
+        self.conv_q = nn.Conv1d(channels, channels, 1)
+        self.conv_k = nn.Conv1d(channels, channels, 1)
+        self.conv_v = nn.Conv1d(channels, channels, 1)
+        self.conv_o = nn.Conv1d(channels, out_channels, 1)
+        self.drop = nn.Dropout(p_dropout)
+        if window_size is not None:
+            # Relative-position tables covering offsets -window_size..+window_size;
+            # a single table is shared by all heads when heads_share is True.
+            n_heads_rel = 1 if heads_share else n_heads
+            rel_stddev = self.k_channels ** -0.5
+            self.emb_rel_k = nn.Parameter(
+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev
+            )
+            self.emb_rel_v = nn.Parameter(
+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev
+            )
+        nn.init.xavier_uniform_(self.conv_q.weight)
+        nn.init.xavier_uniform_(self.conv_k.weight)
+        nn.init.xavier_uniform_(self.conv_v.weight)
+        if proximal_init:
+            # Start the key projection as an exact copy of the query projection.
+            with torch.no_grad():
+                self.conv_k.weight.copy_(self.conv_q.weight)
+                self.conv_k.bias.copy_(self.conv_q.bias)
+    def forward(self, x, c, attn_mask=None):
+        """Attend from ``x`` (queries) to ``c`` (keys/values); the weights are kept in ``self.attn``."""
+        q = self.conv_q(x)
+        k = self.conv_k(c)
+        v = self.conv_v(c)
+        x, self.attn = self.attention(q, k, v, mask=attn_mask)
+        x = self.conv_o(x)
+        return x
+    def attention(self, query, key, value, mask=None):
+        """Scaled dot-product attention; returns ``(output [B, C, T_t], weights [B, H, T_t, T_s])``."""
+        # query, key, value: [B, C, T]
+        b, d, t_s = key.size()
+        t_t = query.size(2)
+        # Split channels into heads: [B, C, T] -> [B, H, T, C // H].
+        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
+        if self.window_size is not None:
+            assert t_s == t_t, "Relative attention only for self-attention"
+            # Add the relative-position key term to the content-based logits.
+            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+            rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
+            scores_local = self._relative_position_to_absolute_position(rel_logits)
+            scores = scores + scores_local
+        if self.proximal_bias:
+            assert t_s == t_t, "Proximal bias only for self-attention"
+            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
+        if mask is not None:
+            # Masked positions get a large negative logit rather than -inf.
+            scores = scores.masked_fill(mask == 0, -1e4)
+            # Note: the block-length band is applied only when a mask was supplied.
+            if self.block_length is not None:
+                assert t_s == t_t, "Block length only for self-attention"
+                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
+                scores = scores.masked_fill(block_mask == 0, -1e4)
+        p_attn = F.softmax(scores, dim=-1)
+        p_attn = self.drop(p_attn)
+        output = torch.matmul(p_attn, value)
+        if self.window_size is not None:
+            # Add the relative-position value term.
+            relative_weights = self._absolute_position_to_relative_position(p_attn)
+            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
+            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
+        # Merge heads back: [B, H, T_t, C // H] -> [B, C, T_t].
+        output = output.transpose(2, 3).contiguous().view(b, d, t_t)
+        return output, p_attn
+    def _matmul_with_relative_values(self, x, y):
+        """Multiply ``x`` by ``y`` after giving ``y`` a leading batch axis."""
+        ret = torch.matmul(x, y.unsqueeze(0))
+        return ret
+    def _matmul_with_relative_keys(self, x, y):
+        """Multiply ``x`` by the transpose of ``y`` after giving ``y`` a leading batch axis."""
+        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+        return ret
+    def _get_relative_embeddings(self, relative_embeddings, length):
+        """Return the ``2 * length - 1`` rows of the table needed for a sequence of ``length``.
+        The table is zero-padded on both ends when ``length`` exceeds ``window_size + 1``
+        and sliced from the middle otherwise.
+        """
+        max_relative_position = 2 * self.window_size + 1  # not used below
+        pad_length = max(length - (self.window_size + 1), 0)
+        slice_start_position = max((self.window_size + 1) - length, 0)
+        slice_end_position = slice_start_position + 2 * length - 1
+        if pad_length > 0:
+            padded_relative_embeddings = F.pad(
+                relative_embeddings,
+                (0, 0, pad_length, pad_length, 0, 0)
+            )
+        else:
+            padded_relative_embeddings = relative_embeddings
+        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
+        return used_relative_embeddings
+    def _relative_position_to_absolute_position(self, x):
+        """Re-index ``[B, H, L, 2L-1]`` relative logits as ``[B, H, L, L]`` absolute ones
+        using a pad / flatten / pad / reshape skew.
+        """
+        batch, heads, length, _ = x.size()
+        # One extra column makes each row 2L wide so the flatten below lines up.
+        x = F.pad(x, (0, 1, 0, 0, 0, 0, 0, 0))
+        x_flat = x.view(batch, heads, length * 2 * length)
+        # Tail padding lets the flat buffer reshape to (L + 1, 2L - 1), shifting each row by one.
+        x_flat = F.pad(x_flat, (0, length - 1, 0, 0, 0, 0))
+        x_final = x_flat.view(batch, heads, length + 1, 2 * length - 1)[:, :, :length, length - 1:]
+        return x_final
+    def _absolute_position_to_relative_position(self, x):
+        """Re-index ``[B, H, L, L]`` absolute weights as ``[B, H, L, 2L-1]`` relative ones
+        (the inverse skew of the method above).
+        """
+        batch, heads, length, _ = x.size()
+        x = F.pad(x, (0, length - 1, 0, 0, 0, 0, 0, 0))
+        x_flat = x.view(batch, heads, length ** 2 + length * (length - 1))
+        # Front padding of L elements shifts every row before reshaping to (L, 2L).
+        x_flat = F.pad(x_flat, (length, 0, 0, 0, 0, 0))
+        x_final = x_flat.view(batch, heads, length, 2 * length)[:, :, :, 1:]
+        return x_final
+    def _attention_bias_proximal(self, length):
+        """Return a ``[1, 1, length, length]`` bias equal to ``-log(1 + |i - j|)``."""
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+class FFN(nn.Module):
+    """Feed-forward network with optional causal convolution"""
+    def __init__(self, in_channels: int, out_channels: int, filter_channels: int,
+                 kernel_size: int, p_dropout: float = 0.0, activation: str = None,
+                 causal: bool = False):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.activation = activation
+        self.causal = causal
+        if causal:
+            self.padding = self._causal_padding
+        else:
+            self.padding = self._same_padding
+        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
+        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
+        self.drop = nn.Dropout(p_dropout)
+    def forward(self, x, x_mask):
+        x = self.conv_1(self.padding(x))
+        if self.activation == "gelu":
+            x = x * torch.sigmoid(1.702 * x)
+        else:
+            x = torch.relu(x)
+        x = self.drop(x)
+        x = self.conv_2(self.padding(x))
+        return x * x_mask
+    def _causal_padding(self, x):
+        if self.kernel_size == 1:
+            return x
+        pad_l = self.kernel_size - 1
+        pad_r = 0
+        return F.pad(x, (pad_l, pad_r, 0, 0, 0, 0))
+    def _same_padding(self, x):
+        if self.kernel_size == 1:
+            return x
+        pad_l = (self.kernel_size - 1) // 2
+        pad_r = self.kernel_size // 2
+        return F.pad(x, (pad_l, pad_r, 0, 0, 0, 0))
+class Encoder(nn.Module):
+    """Transformer encoder with multi-head attention"""
+    def __init__(self, hidden_channels: int, filter_channels: int, n_heads: int,
+                 n_layers: int, kernel_size: int = 1, p_dropout: float = 0.0,
+                 window_size: int = 10):
+        super().__init__()
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+        self.drop = nn.Dropout(p_dropout)
+        self.attn_layers = nn.ModuleList()
+        self.norm_layers_1 = nn.ModuleList()
+        self.ffn_layers = nn.ModuleList()
+        self.norm_layers_2 = nn.ModuleList()
+        for _ in range(n_layers):
+            self.attn_layers.append(
+                MultiHeadAttention(
+                    hidden_channels, hidden_channels, n_heads,
+                    p_dropout=p_dropout, window_size=window_size
+                )
+            )
+            self.norm_layers_1.append(LayerNorm(hidden_channels))
+            self.ffn_layers.append(
+                FFN(hidden_channels, hidden_channels, filter_channels,
+                    kernel_size, p_dropout=p_dropout)
+            )
+            self.norm_layers_2.append(LayerNorm(hidden_channels))
+    def forward(self, x, x_mask):
+        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+        x = x * x_mask
+        for i in range(self.n_layers):
+            y = self.attn_layers[i](x, x, attn_mask)
+            y = self.drop(y)
+            x = self.norm_layers_1[i](x + y)
+            y = self.ffn_layers[i](x, x_mask)
+            y = self.drop(y)
+            x = self.norm_layers_2[i](x + y)
+        x = x * x_mask
+        return x
+class TextEncoder(nn.Module):
+    """Text encoder for RVC - encodes phone and pitch embeddings"""
+    def __init__(self, out_channels: int, hidden_channels: int, filter_channels: int,
+                 n_heads: int, n_layers: int, kernel_size: int, p_dropout: float,
+                 f0: bool = True):
+        super().__init__()
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.f0 = f0
+        # Phone embedding: Linear projection from 768-dim HuBERT features
+        self.emb_phone = nn.Linear(768, hidden_channels)
+        # Pitch embedding (only if f0 is enabled)
+        if f0:
+            self.emb_pitch = nn.Embedding(256, hidden_channels)
+        # Transformer encoder
+        self.encoder = Encoder(
+            hidden_channels, filter_channels, n_heads, n_layers,
+            kernel_size, p_dropout
+        )
+        # Output projection to mean and log-variance
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+    def forward(self, phone, pitch, lengths):
+        """
+        Args:
+            phone: [B, 768, T] phone features from HuBERT (channels first)
+            pitch: [B, T] pitch indices (0-255)
+            lengths: [B] sequence lengths
+        Returns:
+            m: [B, out_channels, T] mean
+            logs: [B, out_channels, T] log-variance
+            x_mask: [B, 1, T] mask
+        """
+        import logging
+        log = logging.getLogger(__name__)
+        log.debug(f"[TextEncoder] 输入 phone: shape={phone.shape}")
+        log.debug(f"[TextEncoder] 输入 pitch: shape={pitch.shape}, max={pitch.max().item()}, min={pitch.min().item()}")
+        log.debug(f"[TextEncoder] 输入 lengths: {lengths}")
+        # Transpose phone from [B, C, T] to [B, T, C] for linear layer
+        phone = phone.transpose(1, 2)  # [B, T, 768]
+        log.debug(f"[TextEncoder] 转置后 phone: shape={phone.shape}")
+        # Create mask
+        x_mask = torch.unsqueeze(
+            self._sequence_mask(lengths, phone.size(1)), 1
+        ).to(phone.dtype)
+        log.debug(f"[TextEncoder] x_mask: shape={x_mask.shape}, sum={x_mask.sum().item()}")
+        # Phone embedding
+        x = self.emb_phone(phone)  # [B, T, hidden_channels]
+        log.debug(f"[TextEncoder] emb_phone 输出: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}")
+        # Add pitch embedding if enabled
+        if self.f0 and pitch is not None:
+            # Clamp pitch to valid range
+            pitch_clamped = torch.clamp(pitch, 0, 255)
+            pitch_emb = self.emb_pitch(pitch_clamped)
+            log.debug(f"[TextEncoder] emb_pitch 输出: shape={pitch_emb.shape}, max={pitch_emb.abs().max().item():.4f}")
+            x = x + pitch_emb
+        # Transpose for conv layers: [B, hidden_channels, T]
+        x = x.transpose(1, 2)
+        log.debug(f"[TextEncoder] 转置后 x: shape={x.shape}")
+        # Apply mask
+        x = x * x_mask
+        # Transformer encoder
+        x = self.encoder(x, x_mask)
+        log.debug(f"[TextEncoder] Transformer 输出: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}")
+        # Project to mean and log-variance
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        log.debug(f"[TextEncoder] 最终输出 m: shape={m.shape}, max={m.abs().max().item():.4f}")
+        log.debug(f"[TextEncoder] 最终输出 logs: shape={logs.shape}, max={logs.max().item():.4f}, min={logs.min().item():.4f}")
+        return m, logs, x_mask
+    def _sequence_mask(self, length, max_length=None):
+        if max_length is None:
+            max_length = length.max()
+        x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+        return x.unsqueeze(0) < length.unsqueeze(1)
+class ResidualCouplingBlock(nn.Module):
+    """残差耦合块"""
+    def __init__(self, channels: int, hidden_channels: int, kernel_size: int,
+                 dilation_rate: int, n_layers: int, n_flows: int = 4,
+                 gin_channels: int = 0):
+        super().__init__()
+        self.flows = nn.ModuleList()
+        for _ in range(n_flows):
+            self.flows.append(
+                ResidualCouplingLayer(
+                    channels, hidden_channels, kernel_size,
+                    dilation_rate, n_layers, gin_channels=gin_channels
+                )
+            )
+            self.flows.append(Flip())
+    def forward(self, x, x_mask, g=None, reverse=False):
+        if not reverse:
+            for flow in self.flows:
+                x, _ = flow(x, x_mask, g=g, reverse=reverse)
+        else:
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x
+class ResidualCouplingLayer(nn.Module):
+    """残差耦合层"""
+    def __init__(self, channels: int, hidden_channels: int, kernel_size: int,
+                 dilation_rate: int, n_layers: int, mean_only: bool = True,
+                 gin_channels: int = 0):
+        super().__init__()
+        self.half_channels = channels // 2
+        self.mean_only = mean_only
+        self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
+        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels)
+        self.post = nn.Conv1d(hidden_channels, self.half_channels, 1)
+        self.post.weight.data.zero_()
+        self.post.bias.data.zero_()
+    def forward(self, x, x_mask, g=None, reverse=False):
+        x0, x1 = torch.split(x, [self.half_channels] * 2, dim=1)
+        h = self.pre(x0) * x_mask
+        h = self.enc(h, x_mask, g=g)
+        stats = self.post(h) * x_mask
+        m = stats
+        if not reverse:
+            x1 = m + x1 * x_mask
+            x = torch.cat([x0, x1], dim=1)
+            return x, None
+        else:
+            x1 = (x1 - m) * x_mask
+            x = torch.cat([x0, x1], dim=1)
+            return x
+class Flip(nn.Module):
+    """翻转层"""
+    def forward(self, x, *args, reverse=False, **kwargs):
+        x = torch.flip(x, [1])
+        return x
+class WN(nn.Module):
+    """WaveNet 风格网络 (带权重归一化)"""
+    def __init__(self, hidden_channels: int, kernel_size: int,
+                 dilation_rate: int, n_layers: int, gin_channels: int = 0,
+                 p_dropout: float = 0):
+        super().__init__()
+        self.n_layers = n_layers
+        self.hidden_channels = hidden_channels
+        self.gin_channels = gin_channels
+        self.in_layers = nn.ModuleList()
+        self.res_skip_layers = nn.ModuleList()
+        self.drop = nn.Dropout(p_dropout)
+        if gin_channels > 0:
+            self.cond_layer = nn.utils.weight_norm(
+                nn.Conv1d(gin_channels, 2 * hidden_channels * n_layers, 1)
+            )
+        for i in range(n_layers):
+            dilation = dilation_rate ** i
+            padding = (kernel_size * dilation - dilation) // 2
+            self.in_layers.append(
+                nn.utils.weight_norm(
+                    nn.Conv1d(hidden_channels, 2 * hidden_channels, kernel_size,
+                              dilation=dilation, padding=padding)
+                )
+            )
+            # 前 n-1 层输出 2 * hidden_channels，最后一层输出 hidden_channels
+            if i < n_layers - 1:
+                res_skip_channels = 2 * hidden_channels
+            else:
+                res_skip_channels = hidden_channels
+            self.res_skip_layers.append(
+                nn.utils.weight_norm(
+                    nn.Conv1d(hidden_channels, res_skip_channels, 1)
+                )
+            )
+    def forward(self, x, x_mask, g=None):
+        output = torch.zeros_like(x)
+        if g is not None and self.gin_channels > 0:
+            g = self.cond_layer(g)
+        for i in range(self.n_layers):
+            x_in = self.in_layers[i](x)
+            if g is not None:
+                cond_offset = i * 2 * self.hidden_channels
+                g_l = g[:, cond_offset:cond_offset + 2 * self.hidden_channels, :]
+                x_in = x_in + g_l
+            acts = torch.tanh(x_in[:, :self.hidden_channels]) * torch.sigmoid(x_in[:, self.hidden_channels:])
+            acts = self.drop(acts)
+            res_skip = self.res_skip_layers[i](acts)
+            if i < self.n_layers - 1:
+                # 前 n-1 层：residual + skip
+                x = (x + res_skip[:, :self.hidden_channels]) * x_mask
+                output = output + res_skip[:, self.hidden_channels:]
+            else:
+                # 最后一层：只有 residual，加到 output
+                x = (x + res_skip) * x_mask
+                output = output + res_skip
+        return output * x_mask
+class PosteriorEncoder(nn.Module):
+    """后验编码器"""
+    def __init__(self, in_channels: int, out_channels: int, hidden_channels: int,
+                 kernel_size: int, dilation_rate: int, n_layers: int,
+                 gin_channels: int = 0):
+        super().__init__()
+        self.out_channels = out_channels
+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels)
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+    def forward(self, x, x_lengths, g=None):
+        x_mask = torch.unsqueeze(
+            self._sequence_mask(x_lengths, x.size(2)), 1
+        ).to(x.dtype)
+        x = self.pre(x) * x_mask
+        x = self.enc(x, x_mask, g=g)
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+        return z, m, logs, x_mask
+    def _sequence_mask(self, length, max_length=None):
+        if max_length is None:
+            max_length = length.max()
+        x = torch.arange(max_length, dtype=length.dtype, device=length.device)
+        return x.unsqueeze(0) < length.unsqueeze(1)
+class Generator(nn.Module):
+    """NSF-HiFi-GAN 生成器 (带权重归一化)"""
+    def __init__(self, initial_channel: int, resblock_kernel_sizes: list,
+                 resblock_dilation_sizes: list, upsample_rates: list,
+                 upsample_initial_channel: int, upsample_kernel_sizes: list,
+                 gin_channels: int = 0, sr: int = 40000, is_half: bool = False):
+        super().__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.sr = sr
+        self.is_half = is_half
+        # 计算上采样因子
+        self.upp = int(np.prod(upsample_rates))
+        self.conv_pre = nn.Conv1d(initial_channel, upsample_initial_channel, 7, 1, 3)
+        # NSF 源模块
+        self.m_source = SourceModuleHnNSF(sample_rate=sr, harmonic_num=0)
+        # 噪声卷积层
+        self.noise_convs = nn.ModuleList()
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            c_cur = upsample_initial_channel // (2 ** (i + 1))
+            self.ups.append(
+                nn.utils.weight_norm(
+                    nn.ConvTranspose1d(
+                        upsample_initial_channel // (2 ** i),
+                        c_cur,
+                        k, u, (k - u) // 2
+                    )
+                )
+            )
+            # 噪声卷积
+            if i + 1 < len(upsample_rates):
+                stride_f0 = int(np.prod(upsample_rates[i + 1:]))
+                self.noise_convs.append(
+                    nn.Conv1d(1, c_cur, kernel_size=stride_f0 * 2, stride=stride_f0, padding=stride_f0 // 2)
+                )
+            else:
+                self.noise_convs.append(nn.Conv1d(1, c_cur, kernel_size=1))
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
+                self.resblocks.append(ResBlock(ch, k, d))
+        self.conv_post = nn.Conv1d(ch, 1, 7, 1, 3, bias=False)
+        if gin_channels > 0:
+            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+    def forward(self, x, f0, g=None):
+        import logging
+        log = logging.getLogger(__name__)
+        log.debug(f"[Generator] 输入 x: shape={x.shape}, max={x.abs().max().item():.4f}, mean={x.abs().mean().item():.4f}")
+        log.debug(f"[Generator] 输入 f0: shape={f0.shape}, max={f0.max().item():.1f}, min={f0.min().item():.1f}")
+        if g is not None:
+            log.debug(f"[Generator] 输入 g: shape={g.shape}, max={g.abs().max().item():.4f}")
+        # 生成 NSF 激励信号
+        har_source, _, _ = self.m_source(f0, self.upp)
+        har_source = har_source.transpose(1, 2)  # [B, 1, T*upp]
+        log.debug(f"[Generator] NSF har_source: shape={har_source.shape}, max={har_source.abs().max().item():.4f}")
+        x = self.conv_pre(x)
+        log.debug(f"[Generator] conv_pre 输出: shape={x.shape}, max={x.abs().max().item():.4f}")
+        if g is not None:
+            x = x + self.cond(g)
+            log.debug(f"[Generator] 加入条件后: max={x.abs().max().item():.4f}")
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, 0.1)
+            x = self.ups[i](x)
+            # 融合噪声
+            x_source = self.noise_convs[i](har_source)
+            x = x + x_source
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+            log.debug(f"[Generator] 上采样层 {i}: shape={x.shape}, max={x.abs().max().item():.4f}")
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        log.debug(f"[Generator] conv_post 输出: shape={x.shape}, max={x.abs().max().item():.4f}")
+        x = torch.tanh(x)
+        log.debug(f"[Generator] tanh 输出: shape={x.shape}, max={x.abs().max().item():.4f}")
+        return x
+    def remove_weight_norm(self):
+        for l in self.ups:
+            nn.utils.remove_weight_norm(l)
+        for l in self.resblocks:
+            l.remove_weight_norm()
+class ResBlock(nn.Module):
+    """残差块 (带权重归一化)"""
+    def __init__(self, channels: int, kernel_size: int = 3, dilation: tuple = (1, 3, 5)):
+        super().__init__()
+        self.convs1 = nn.ModuleList([
+            nn.utils.weight_norm(
+                nn.Conv1d(channels, channels, kernel_size, 1,
+                          (kernel_size * d - d) // 2, dilation=d)
+            )
+            for d in dilation
+        ])
+        self.convs2 = nn.ModuleList([
+            nn.utils.weight_norm(
+                nn.Conv1d(channels, channels, kernel_size, 1,
+                          (kernel_size - 1) // 2)
+            )
+            for _ in dilation
+        ])
+    def forward(self, x):
+        for c1, c2 in zip(self.convs1, self.convs2):
+            xt = F.leaky_relu(x, 0.1)
+            xt = c1(xt)
+            xt = F.leaky_relu(xt, 0.1)
+            xt = c2(xt)
+            x = xt + x
+        return x
+    def remove_weight_norm(self):
+        for l in self.convs1:
+            nn.utils.remove_weight_norm(l)
+        for l in self.convs2:
+            nn.utils.remove_weight_norm(l)
+class SineGenerator(nn.Module):
+    """正弦波生成器 - NSF 的核心组件"""
+    def __init__(self, sample_rate: int, harmonic_num: int = 0,
+                 sine_amp: float = 0.1, noise_std: float = 0.003,
+                 voiced_threshold: float = 10):
+        super().__init__()
+        self.sample_rate = sample_rate
+        self.harmonic_num = harmonic_num
+        self.sine_amp = sine_amp
+        self.noise_std = noise_std
+        self.voiced_threshold = voiced_threshold
+        self.dim = harmonic_num + 1
+    def forward(self, f0: torch.Tensor, upp: int):
+        """
+        生成正弦波激励信号
+        Args:
+            f0: 基频张量 [B, T]
+            upp: 上采样因子
+        Returns:
+            正弦波信号 [B, T*upp, 1]
+        """
+        with torch.no_grad():
+            # 上采样 F0
+            f0 = f0.unsqueeze(1)  # [B, 1, T]
+            f0_up = F.interpolate(f0, scale_factor=upp, mode='nearest')
+            f0_up = f0_up.transpose(1, 2)  # [B, T*upp, 1]
+            # 生成正弦波
+            rad = f0_up / self.sample_rate  # 归一化频率
+            rad_acc = torch.cumsum(rad, dim=1) % 1  # 累积相位
+            sine_wave = torch.sin(2 * np.pi * rad_acc) * self.sine_amp
+            # 静音区域（F0=0）使用噪声
+            voiced_mask = (f0_up > self.voiced_threshold).float()
+            noise = torch.randn_like(sine_wave) * self.noise_std
+            sine_wave = sine_wave * voiced_mask + noise * (1 - voiced_mask)
+            return sine_wave
+class SourceModuleHnNSF(nn.Module):
+    """谐波加噪声源模块"""
+    def __init__(self, sample_rate: int, harmonic_num: int = 0,
+                 sine_amp: float = 0.1, noise_std: float = 0.003,
+                 add_noise_std: float = 0.003):
+        super().__init__()
+        self.sine_generator = SineGenerator(
+            sample_rate, harmonic_num, sine_amp, noise_std
+        )
+        self.l_linear = nn.Linear(harmonic_num + 1, 1)
+        self.l_tanh = nn.Tanh()
+    def forward(self, f0: torch.Tensor, upp: int):
+        sine = self.sine_generator(f0, upp)  # [B, T*upp, 1]
+        sine = self.l_tanh(self.l_linear(sine))
+        noise = torch.randn_like(sine) * 0.003
+        return sine, noise, None  # 返回 3 个值以匹配接口
+class SynthesizerTrnMs768NSFsid(nn.Module):
+    """RVC v2 合成器 (768 维 HuBERT + NSF + SID)"""
+    def __init__(self, spec_channels: int, segment_size: int,
+                 inter_channels: int, hidden_channels: int, filter_channels: int,
+                 n_heads: int, n_layers: int, kernel_size: int, p_dropout: float,
+                 resblock: str, resblock_kernel_sizes: list,
+                 resblock_dilation_sizes: list, upsample_rates: list,
+                 upsample_initial_channel: int, upsample_kernel_sizes: list,
+                 spk_embed_dim: int, gin_channels: int, sr: int):
+        super().__init__()
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.gin_channels = gin_channels
+        self.spk_embed_dim = spk_embed_dim
+        self.sr = sr
+        # 文本编码器 (使用 TextEncoder 替代 PosteriorEncoder)
+        self.enc_p = TextEncoder(
+            inter_channels, hidden_channels, filter_channels,
+            n_heads, n_layers, kernel_size, p_dropout, f0=True
+        )
+        # 解码器/生成器 (NSF-HiFiGAN，内部包含 m_source)
+        self.dec = Generator(
+            inter_channels, resblock_kernel_sizes, resblock_dilation_sizes,
+            upsample_rates, upsample_initial_channel, upsample_kernel_sizes,
+            gin_channels, sr=sr
+        )
+        # 流
+        self.flow = ResidualCouplingBlock(
+            inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
+        )
+        # 说话人嵌入
+        self.emb_g = nn.Embedding(spk_embed_dim, gin_channels)
+    def forward(self, phone, phone_lengths, pitch, nsff0, sid, skip_head=0, return_length=0):
+        """前向传播"""
+        g = self.emb_g(sid).unsqueeze(-1)
+        # TextEncoder 返回 mean 和 log-variance
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        # 在编码器外部采样
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        # 正向 flow
+        z = self.flow(z_p, x_mask, g=g)
+        # 生成音频 (传入 f0)
+        o = self.dec(z, nsff0, g=g)
+        return o
+    def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=1.0):
+        """推理"""
+        import logging
+        log = logging.getLogger(__name__)
+        log.debug(f"[infer] 输入 phone: shape={phone.shape}, dtype={phone.dtype}")
+        log.debug(f"[infer] 输入 phone 统计: max={phone.abs().max().item():.4f}, mean={phone.abs().mean().item():.4f}")
+        log.debug(f"[infer] 输入 phone_lengths: {phone_lengths}")
+        log.debug(f"[infer] 输入 pitch: shape={pitch.shape}, max={pitch.max().item()}, min={pitch.min().item()}")
+        log.debug(f"[infer] 输入 nsff0: shape={nsff0.shape}, max={nsff0.max().item():.1f}, min={nsff0.min().item():.1f}")
+        log.debug(f"[infer] 输入 sid: {sid}")
+        g = self.emb_g(sid).unsqueeze(-1)
+        log.debug(f"[infer] 说话人嵌入 g: shape={g.shape}, max={g.abs().max().item():.4f}")
+        # TextEncoder 返回 mean 和 log-variance
+        m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
+        log.debug(f"[infer] TextEncoder 输出:")
+        log.debug(f"[infer]   m_p: shape={m_p.shape}, max={m_p.abs().max().item():.4f}, mean={m_p.abs().mean().item():.4f}")
+        log.debug(f"[infer]   logs_p: shape={logs_p.shape}, max={logs_p.max().item():.4f}, min={logs_p.min().item():.4f}")
+        log.debug(f"[infer]   x_mask: shape={x_mask.shape}, sum={x_mask.sum().item()}")
+        # 在编码器外部采样 (使用较小的噪声系数以获得更稳定的输出)
+        z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
+        log.debug(f"[infer] 采样后 z_p: shape={z_p.shape}, max={z_p.abs().max().item():.4f}, mean={z_p.abs().mean().item():.4f}")
+        # 反向 flow
+        z = self.flow(z_p, x_mask, g=g, reverse=True)
+        log.debug(f"[infer] Flow 输出 z: shape={z.shape}, max={z.abs().max().item():.4f}, mean={z.abs().mean().item():.4f}")
+        # 生成音频 (传入 f0，Generator 内部会生成 NSF 激励信号)
+        o = self.dec(z * x_mask, nsff0, g=g)
+        log.debug(f"[infer] Generator 输出 o: shape={o.shape}, max={o.abs().max().item():.4f}, mean={o.abs().mean().item():.4f}")
+        return o, x_mask

requirements_hf.txt ADDED Viewed

	@@ -0,0 +1,39 @@

+# RVC AI 翻唱依赖 (Hugging Face Space - CPU 精简版)
+# 注意：此文件用于 HF Space 部署，同步到 Space 时需重命名为 requirements.txt
+# 本地安装请使用 requirements.txt（包含完整 GPU 依赖）
+# PyTorch
+torch>=2.0.0
+torchaudio>=2.0.0
+# Gradio 界面
+gradio==3.50.2
+# 音频处理
+librosa>=0.9.0
+soundfile>=0.12.0
+scipy>=1.10.0
+numpy>=1.23.0
+praat-parselmouth>=0.4.3
+torchcrepe>=0.0.20
+# 向量检索
+faiss-cpu>=1.7.4
+# 工具库
+tqdm>=4.65.0
+requests>=2.28.0
+python-dotenv>=1.0.0
+colorama>=0.4.6
+# AI 翻唱功能（核心）
+audio-separator
+huggingface_hub>=0.19.0
+pedalboard>=0.7.0
+ffmpeg-python>=0.2.0
+# 以下包在 HF Space 构建环境中编译失败，改为运行时按需安装：
+# fairseq==0.12.2  (HuBERT 特征提取)
+# demucs>=4.0.0    (人声分离备选)
+# pyworld>=0.3.4   (F0 提取备选)
+# av>=10.0.0       (音频解码备选)