Upload folder using huggingface_hub
Browse files- lib/__init__.py +8 -0
- lib/audio.py +139 -0
- lib/device.py +186 -0
- lib/logger.py +254 -0
- lib/mixer.py +214 -0
- lib/vocal_cleanup.py +253 -0
- lib/vocoder_fix.py +385 -0
lib/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
核心库模块
|
| 4 |
+
"""
|
| 5 |
+
from .audio import load_audio, save_audio
|
| 6 |
+
from .device import get_device, get_device_info, empty_device_cache, supports_fp16
|
| 7 |
+
|
| 8 |
+
__all__ = ["load_audio", "save_audio", "get_device", "get_device_info", "empty_device_cache", "supports_fp16"]
|
lib/audio.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
音频处理模块 - 加载、保存和处理音频文件
|
| 4 |
+
"""
|
| 5 |
+
import numpy as np
|
| 6 |
+
import librosa
|
| 7 |
+
import soundfile as sf
|
| 8 |
+
from typing import Tuple, Optional
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def load_audio(path: str, sr: int = 16000) -> np.ndarray:
    """Load an audio file as mono float32, resampled to ``sr``.

    Args:
        path: Path to the audio file.
        sr: Target sample rate (default 16000).

    Returns:
        np.ndarray: Mono float32 samples at the target sample rate.
    """
    # librosa.load resamples internally when `sr` is given, which is
    # equivalent to loading at the native rate and calling
    # librosa.resample afterwards (same default resampler).
    audio, _ = librosa.load(path, sr=sr, mono=True)
    return audio.astype(np.float32, copy=False)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def save_audio(path: str, audio: np.ndarray, sr: int = 48000):
    """Write audio samples to a file.

    Args:
        path: Output file path.
        audio: Audio samples to write.
        sr: Sample rate (default 48000).
    """
    # Hard-limit samples to the valid [-1, 1] range before writing.
    limited = np.clip(audio, -1.0, 1.0)
    sf.write(path, limited, sr)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def soft_clip(
|
| 45 |
+
audio: np.ndarray,
|
| 46 |
+
threshold: float = 0.9,
|
| 47 |
+
ceiling: float = 0.99,
|
| 48 |
+
) -> np.ndarray:
|
| 49 |
+
"""
|
| 50 |
+
使用平滑软削波抑制峰值,尽量保留主体响度。
|
| 51 |
+
|
| 52 |
+
Args:
|
| 53 |
+
audio: 输入音频
|
| 54 |
+
threshold: 开始压缩的阈值
|
| 55 |
+
ceiling: 软削波上限
|
| 56 |
+
|
| 57 |
+
Returns:
|
| 58 |
+
np.ndarray: 处理后的音频
|
| 59 |
+
"""
|
| 60 |
+
audio = np.asarray(audio, dtype=np.float32)
|
| 61 |
+
|
| 62 |
+
if threshold <= 0:
|
| 63 |
+
raise ValueError("threshold 必须大于 0")
|
| 64 |
+
if ceiling <= threshold:
|
| 65 |
+
raise ValueError("ceiling 必须大于 threshold")
|
| 66 |
+
|
| 67 |
+
result = audio.copy()
|
| 68 |
+
abs_audio = np.abs(result)
|
| 69 |
+
mask = abs_audio > threshold
|
| 70 |
+
if not np.any(mask):
|
| 71 |
+
return result
|
| 72 |
+
|
| 73 |
+
overshoot = (abs_audio[mask] - threshold) / (ceiling - threshold + 1e-8)
|
| 74 |
+
compressed = threshold + (ceiling - threshold) * np.tanh(overshoot)
|
| 75 |
+
result[mask] = np.sign(result[mask]) * compressed
|
| 76 |
+
return result.astype(np.float32, copy=False)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def soft_clip_array(
    audio: np.ndarray,
    threshold: float = 0.9,
    ceiling: float = 0.99,
) -> np.ndarray:
    """Soft-clip an array (mono or multi-channel); thin alias for :func:`soft_clip`."""
    return soft_clip(audio, threshold=threshold, ceiling=ceiling)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def get_audio_info(path: str) -> dict:
    """Return basic metadata about an audio file.

    Args:
        path: Path to the audio file.

    Returns:
        dict: duration (seconds), sample_rate, channels and format.
    """
    meta = sf.info(path)
    return {
        "duration": meta.duration,
        "sample_rate": meta.samplerate,
        "channels": meta.channels,
        "format": meta.format,
    }
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def normalize_audio(audio: np.ndarray, target_db: float = -20.0) -> np.ndarray:
    """Scale audio so its RMS matches `target_db`, then hard-limit to [-1, 1].

    Args:
        audio: Input samples.
        target_db: Desired RMS loudness in dB.

    Returns:
        np.ndarray: Loudness-normalised audio.
    """
    current_rms = float(np.sqrt(np.mean(np.square(audio))))
    if current_rms > 0:
        desired_rms = 10.0 ** (target_db / 20.0)
        audio = audio * (desired_rms / current_rms)
    # Guard against overshoot introduced by the gain change.
    return np.clip(audio, -1.0, 1.0)
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def trim_silence(audio: np.ndarray, sr: int = 16000,
                 top_db: int = 30) -> np.ndarray:
    """Strip leading and trailing silence from a clip.

    Args:
        audio: Input samples.
        sr: Sample rate (kept for API symmetry; librosa.effects.trim does
            not need it here).
        top_db: Level below the peak, in dB, regarded as silence.

    Returns:
        np.ndarray: Audio with silent head/tail removed.
    """
    out, _interval = librosa.effects.trim(audio, top_db=top_db)
    return out
|
lib/device.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
设备检测模块 - 自动检测并选择最佳计算设备
|
| 4 |
+
支持: CUDA (NVIDIA / AMD ROCm), XPU (Intel Arc via IPEX), DirectML, MPS (Apple), CPU
|
| 5 |
+
"""
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def _has_xpu() -> bool:
|
| 10 |
+
"""检测 Intel XPU (需要 intel_extension_for_pytorch)"""
|
| 11 |
+
try:
|
| 12 |
+
import intel_extension_for_pytorch # noqa: F401
|
| 13 |
+
return hasattr(torch, "xpu") and torch.xpu.is_available()
|
| 14 |
+
except ImportError:
|
| 15 |
+
return False
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _has_directml() -> bool:
|
| 19 |
+
"""检测 DirectML (AMD/Intel on Windows)"""
|
| 20 |
+
try:
|
| 21 |
+
import torch_directml # noqa: F401
|
| 22 |
+
return True
|
| 23 |
+
except ImportError:
|
| 24 |
+
return False
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _has_mps() -> bool:
|
| 28 |
+
"""检测 Apple MPS"""
|
| 29 |
+
if not hasattr(torch.backends, "mps") or not torch.backends.mps.is_available():
|
| 30 |
+
return False
|
| 31 |
+
try:
|
| 32 |
+
torch.zeros(1).to(torch.device("mps"))
|
| 33 |
+
return True
|
| 34 |
+
except Exception:
|
| 35 |
+
return False
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _is_rocm() -> bool:
|
| 39 |
+
"""检测当前 PyTorch 是否为 ROCm 构建 (AMD GPU)"""
|
| 40 |
+
return hasattr(torch.version, "hip") and torch.version.hip is not None
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def get_device(preferred: str = "cuda") -> torch.device:
    """Pick a compute device, honouring a preference with graceful fallback.

    Args:
        preferred: Desired device ("cuda", "xpu", "directml", "mps", "cpu").

    Returns:
        torch.device: The best available device.
    """
    choice = preferred.lower().strip()

    # Honour an explicit request only when that backend is actually present.
    if choice in ("cuda", "cuda:0") and torch.cuda.is_available():
        return torch.device("cuda")
    if choice in ("xpu", "xpu:0") and _has_xpu():
        return torch.device("xpu")
    if (choice == "directml" or choice.startswith("privateuseone")) and _has_directml():
        import torch_directml
        return torch_directml.device(torch_directml.default_device())
    if choice == "mps" and _has_mps():
        return torch.device("mps")
    if choice == "cpu":
        return torch.device("cpu")

    # Fallback priority: CUDA (incl. ROCm) > XPU > DirectML > MPS > CPU.
    if torch.cuda.is_available():
        return torch.device("cuda")
    if _has_xpu():
        return torch.device("xpu")
    if _has_directml():
        import torch_directml
        return torch_directml.device(torch_directml.default_device())
    if _has_mps():
        return torch.device("mps")
    return torch.device("cpu")
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def supports_fp16(device: torch.device) -> bool:
    """Return True when FP16 inference is considered reliable on `device`."""
    kind = str(device.type) if hasattr(device, "type") else str(device)
    # CUDA (incl. ROCm) and Intel XPU handle half precision reliably;
    # DirectML / MPS / CPU are unstable, so they stay disabled by default.
    return kind in ("cuda", "xpu")
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def empty_device_cache(device: torch.device = None):
    """Release cached allocator memory, device-agnostically.

    Args:
        device: Device whose cache should be cleared; when None, every
            available backend is flushed.
            # NOTE(review): annotation should be Optional[torch.device] —
            # None is an accepted value.
    """
    if device is None:
        kind = None
    else:
        kind = str(device.type) if hasattr(device, "type") else str(device)

    if (kind is None or kind == "cuda") and torch.cuda.is_available():
        torch.cuda.empty_cache()
    if (kind is None or kind == "xpu") and _has_xpu():
        torch.xpu.empty_cache()
    if (kind is None or kind == "mps") and _has_mps():
        # Older PyTorch builds lack torch.mps.empty_cache.
        if hasattr(torch.mps, "empty_cache"):
            torch.mps.empty_cache()
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def get_device_info() -> dict:
    """Collect a summary of available acceleration backends and devices.

    Returns:
        dict: "backends" (list of backend names), "current_device" (str),
        and "devices" (list of per-device dicts with index/backend/name/
        total_memory_gb).
    """
    summary = {
        "backends": [],
        "current_device": "cpu",
        "devices": []
    }

    # CUDA covers both NVIDIA and AMD ROCm builds.
    if torch.cuda.is_available():
        backend = "ROCm (AMD)" if _is_rocm() else "CUDA (NVIDIA)"
        summary["backends"].append(backend)
        summary["current_device"] = "cuda"
        for idx in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(idx)
            summary["devices"].append({
                "index": idx,
                "backend": backend,
                "name": props.name,
                "total_memory_gb": round(props.total_memory / (1024**3), 2),
            })

    # Intel XPU via intel_extension_for_pytorch.
    if _has_xpu():
        summary["backends"].append("XPU (Intel)")
        if not summary["devices"]:
            summary["current_device"] = "xpu"
        for idx in range(torch.xpu.device_count()):
            props = torch.xpu.get_device_properties(idx)
            summary["devices"].append({
                "index": idx,
                "backend": "XPU (Intel)",
                "name": props.name,
                "total_memory_gb": round(props.total_memory / (1024**3), 2),
            })

    # DirectML exposes a single default adapter; memory size is not reported.
    if _has_directml():
        import torch_directml
        summary["backends"].append("DirectML")
        if not summary["devices"]:
            summary["current_device"] = "directml"
        summary["devices"].append({
            "index": 0,
            "backend": "DirectML",
            "name": torch_directml.device_name(0),
            "total_memory_gb": None,
        })

    # Apple MPS offers no per-device enumeration.
    if _has_mps():
        summary["backends"].append("MPS (Apple)")
        if not summary["devices"]:
            summary["current_device"] = "mps"

    if not summary["backends"]:
        summary["backends"].append("CPU")

    return summary
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def print_device_info():
    """Pretty-print the device summary from get_device_info() to stdout."""
    info = get_device_info()

    rule = "=" * 50
    print(rule)
    print("设备信息")
    print(rule)
    print(f"可用后端: {', '.join(info['backends'])}")
    print(f"当前设备: {info['current_device']}")

    for dev in info["devices"]:
        mem = f"{dev['total_memory_gb']} GB" if dev.get("total_memory_gb") else "N/A"
        print(f" [{dev['index']}] {dev['name']} ({dev['backend']}) - 显存: {mem}")

    if not info["devices"]:
        print(" 无 GPU 设备,将使用 CPU 进行推理")

    print(rule)
|
lib/logger.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
日志工具模块 - 支持时间戳和颜色输出
|
| 4 |
+
"""
|
| 5 |
+
import sys
|
| 6 |
+
import logging
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
|
| 9 |
+
try:
|
| 10 |
+
from colorama import init, Fore, Style, Back
|
| 11 |
+
init(autoreset=True) # 初始化 colorama (Windows 兼容), autoreset确保每行重置
|
| 12 |
+
COLORAMA_AVAILABLE = True
|
| 13 |
+
except ImportError:
|
| 14 |
+
COLORAMA_AVAILABLE = False
|
| 15 |
+
# 定义空的占位符
|
| 16 |
+
class Fore:
|
| 17 |
+
LIGHTBLACK_EX = GREEN = YELLOW = RED = CYAN = BLUE = MAGENTA = WHITE = LIGHTGREEN_EX = LIGHTCYAN_EX = LIGHTYELLOW_EX = LIGHTMAGENTA_EX = ""
|
| 18 |
+
class Style:
|
| 19 |
+
RESET_ALL = BRIGHT = DIM = ""
|
| 20 |
+
class Back:
|
| 21 |
+
pass
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class Logger:
    """Project-wide console logging helper; every method is static."""

    # ASCII replacements for glyphs that legacy (e.g. GBK) consoles reject.
    SAFE_CHAR_MAP = {
        "✓": "[OK] ",
        "✗": "[X] ",
        "→": "->",
        "◆": "*",
    }

    # Per-level foreground colours (empty strings when colorama is absent).
    COLORS = {
        "DEBUG": Fore.LIGHTBLACK_EX,
        "INFO": Fore.GREEN,
        "SUCCESS": Fore.LIGHTGREEN_EX,
        "WARNING": Fore.YELLOW,
        "ERROR": Fore.RED,
        "STEP": Fore.CYAN,
        "DETAIL": Fore.LIGHTCYAN_EX,
        "PROGRESS": Fore.MAGENTA,
        "MODEL": Fore.LIGHTMAGENTA_EX,
        "AUDIO": Fore.BLUE,
        "CONFIG": Fore.LIGHTYELLOW_EX,
    }

    RESET = Style.RESET_ALL
    BRIGHT = Style.BRIGHT
    DIM = Style.DIM

    # When False, DEBUG / DETAIL / CONFIG messages are suppressed.
    verbose = True

    @staticmethod
    def _sanitize_console_text(text: str) -> str:
        """Swap characters the active terminal may not encode for safe ASCII."""
        safe = text
        for glyph, replacement in Logger.SAFE_CHAR_MAP.items():
            safe = safe.replace(glyph, replacement)
        return safe

    @staticmethod
    def _emit(text: str):
        """Print `text`, degrading gracefully on Windows/GBK encode errors."""
        try:
            print(text, flush=True)
        except UnicodeEncodeError:
            pass
        else:
            return

        fallback = Logger._sanitize_console_text(text)
        encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
        try:
            print(
                fallback.encode(encoding, errors="replace").decode(encoding),
                flush=True,
            )
        except Exception:
            print(
                fallback.encode("ascii", errors="replace").decode("ascii"),
                flush=True,
            )

    @staticmethod
    def _log(level: str, msg: str, force_print: bool = True):
        """Format and emit one log line (force_print kept for API compatibility)."""
        timestamp = datetime.now().strftime("%H:%M:%S")
        colour = Logger.COLORS.get(level, "")

        # Levels with fixed prefixes; anything else falls back to "[LEVEL] ".
        prefixes = {
            "INFO": "",
            "STEP": "",
            "SUCCESS": "",
            "DETAIL": " → ",
            "PROGRESS": " ◆ ",
            "MODEL": "[模型] ",
            "AUDIO": "[音频] ",
            "CONFIG": "[配置] ",
        }
        prefix = prefixes.get(level, f"[{level}] ")

        Logger._emit(f"{colour}[{timestamp}]{prefix}{msg}{Logger.RESET}")

    @staticmethod
    def debug(msg: str):
        """Grey debug line; shown only when verbose."""
        if Logger.verbose:
            Logger._log("DEBUG", msg)

    @staticmethod
    def info(msg: str):
        """Green informational line."""
        Logger._log("INFO", msg)

    @staticmethod
    def success(msg: str):
        """Bright-green success line prefixed with a check mark."""
        Logger._log("SUCCESS", f"✓ {msg}")

    @staticmethod
    def warning(msg: str):
        """Yellow warning line."""
        Logger._log("WARNING", msg)

    @staticmethod
    def error(msg: str):
        """Red error line."""
        Logger._log("ERROR", msg)

    @staticmethod
    def step(current: int, total: int, msg: str):
        """Cyan '[current/total]' progress-step line."""
        timestamp = datetime.now().strftime("%H:%M:%S")
        colour = Logger.COLORS.get("STEP", "")
        Logger._emit(f"{colour}[{timestamp}][{current}/{total}] {msg}{Logger.RESET}")

    @staticmethod
    def detail(msg: str):
        """Light-cyan processing detail; shown only when verbose."""
        if Logger.verbose:
            Logger._log("DETAIL", msg)

    @staticmethod
    def progress(msg: str):
        """Magenta progress line."""
        Logger._log("PROGRESS", msg)

    @staticmethod
    def model(msg: str):
        """Light-magenta model load/unload line."""
        Logger._log("MODEL", msg)

    @staticmethod
    def audio(msg: str):
        """Blue audio-processing line."""
        Logger._log("AUDIO", msg)

    @staticmethod
    def config(msg: str):
        """Light-yellow configuration line; shown only when verbose."""
        if Logger.verbose:
            Logger._log("CONFIG", msg)

    @staticmethod
    def header(msg: str):
        """Banner: the message framed between two '=' rules."""
        timestamp = datetime.now().strftime("%H:%M:%S")
        colour = Logger.COLORS.get("INFO", "")
        rule = "=" * 50
        Logger._emit(f"{colour}[{timestamp}] {rule}{Logger.RESET}")
        Logger._emit(f"{colour}[{timestamp}] {msg}{Logger.RESET}")
        Logger._emit(f"{colour}[{timestamp}] {rule}{Logger.RESET}")

    @staticmethod
    def separator(char: str = "-", length: int = 40):
        """Dim horizontal rule of `char` repeated `length` times."""
        timestamp = datetime.now().strftime("%H:%M:%S")
        colour = Logger.COLORS.get("DEBUG", "")
        Logger._emit(f"{colour}[{timestamp}] {char * length}{Logger.RESET}")

    @staticmethod
    def set_verbose(enabled: bool):
        """Enable or disable verbose (DEBUG/DETAIL/CONFIG) output."""
        Logger.verbose = enabled
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
# 便捷实例
|
| 197 |
+
log = Logger()
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
# ============ 配置标准 logging 模块使用颜色 ============
|
| 201 |
+
|
| 202 |
+
class ColoredFormatter(logging.Formatter):
    """logging.Formatter that colour-codes records by severity."""

    # Colour per logging level; unknown levels render uncoloured.
    LEVEL_COLORS = {
        logging.DEBUG: Fore.LIGHTBLACK_EX,
        logging.INFO: Fore.GREEN,
        logging.WARNING: Fore.YELLOW,
        logging.ERROR: Fore.RED,
        logging.CRITICAL: Fore.RED + Style.BRIGHT,
    }

    def format(self, record):
        """Render `record` as 'time | LEVEL | logger | message', coloured."""
        colour = self.LEVEL_COLORS.get(record.levelno, "")
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        return (
            f"{colour}{stamp} | {record.levelname} | {record.name}"
            f" | {record.getMessage()}{Style.RESET_ALL}"
        )
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
def setup_colored_logging(level=logging.INFO):
    """Install a single coloured stdout handler on the root logger.

    Args:
        level: Minimum logging level for both the root logger and handler.

    Returns:
        logging.Logger: The configured root logger.
    """
    root = logging.getLogger()
    root.setLevel(level)

    # Drop any pre-existing handlers so output is not duplicated.
    for existing in list(root.handlers):
        root.removeHandler(existing)

    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(level)
    handler.setFormatter(ColoredFormatter())
    root.addHandler(handler)

    return root
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
# 自动配置logging颜色
|
| 250 |
+
setup_colored_logging(logging.INFO)
|
| 251 |
+
|
| 252 |
+
# 抑制第三方库的英文日志
|
| 253 |
+
logging.getLogger("faiss").setLevel(logging.WARNING)
|
| 254 |
+
logging.getLogger("audio_separator").setLevel(logging.WARNING)
|
lib/mixer.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
混音模块 - 人声与伴奏混合
|
| 4 |
+
"""
|
| 5 |
+
import numpy as np
|
| 6 |
+
import librosa
|
| 7 |
+
import soundfile as sf
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
from lib.audio import soft_clip_array
|
| 12 |
+
|
| 13 |
+
try:
|
| 14 |
+
from lib.logger import log
|
| 15 |
+
except ImportError:
|
| 16 |
+
log = None
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
from pedalboard import Pedalboard, Reverb, Compressor, Gain
|
| 20 |
+
PEDALBOARD_AVAILABLE = True
|
| 21 |
+
except ImportError:
|
| 22 |
+
PEDALBOARD_AVAILABLE = False
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def _probe_sample_rate(path: str, fallback: int = 44100) -> int:
    """Read the sample rate from file metadata, returning `fallback` on any failure."""
    try:
        rate = sf.info(path).samplerate
    except Exception:
        return int(fallback)
    return int(rate)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def load_audio_for_mix(path: str, target_sr: Optional[int] = None) -> tuple:
    """Load an audio file for mixing, always producing a stereo array.

    Args:
        path: Audio file path.
        target_sr: Resample target; None keeps the native rate.

    Returns:
        tuple: (audio_data, sample_rate) where audio_data is (2, n_samples).
    """
    if log:
        log.detail(f"加载音频: {Path(path).name}")

    data, rate = librosa.load(path, sr=target_sr, mono=False)

    # Duplicate a mono signal into two identical channels.
    if data.ndim == 1:
        data = np.stack([data, data])
        if log:
            log.detail("单声道已扩展为双声道")

    if log:
        log.detail(f"音频形状: {data.shape}, 采样率: {rate}Hz")

    return data, rate
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def apply_reverb(
    audio: np.ndarray,
    sr: int,
    room_size: float = 0.3,
    wet_level: float = 0.2,
) -> np.ndarray:
    """Run the vocal track through a Pedalboard reverb (no-op without pedalboard).

    Args:
        audio: Input samples, mono or (channels, samples).
        sr: Sample rate of `audio`.
        room_size: Reverb room size.
        wet_level: Wet mix amount; dry level is set to its complement.

    Returns:
        np.ndarray: Reverberated audio, or the input unchanged when the
        pedalboard package is unavailable.
    """
    if not PEDALBOARD_AVAILABLE:
        if log:
            log.warning("Pedalboard 不可用,跳过混响处理")
        return audio

    if log:
        log.detail(f"应用混响: room_size={room_size}, wet_level={wet_level}")

    # Pedalboard expects a (channels, samples) layout.
    if audio.ndim == 1:
        audio = audio.reshape(1, -1)

    chain = Pedalboard([
        Reverb(room_size=room_size, wet_level=wet_level, dry_level=1.0 - wet_level)
    ])
    wet = chain(audio, sr)

    if log:
        log.detail("混响处理完成")

    return wet
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def adjust_audio_length(audio: np.ndarray, target_length: int) -> np.ndarray:
    """Trim or zero-pad `audio` along its last axis to exactly `target_length`."""
    length = audio.shape[-1]

    if length > target_length:
        return audio[..., :target_length]
    if length < target_length:
        shortfall = target_length - length
        if audio.ndim == 1:
            return np.pad(audio, (0, shortfall))
        return np.pad(audio, ((0, 0), (0, shortfall)))
    return audio
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def mix_vocals_and_accompaniment(
    vocals_path: str,
    accompaniment_path: str,
    output_path: str,
    vocals_volume: float = 1.0,
    accompaniment_volume: float = 1.0,
    reverb_amount: float = 0.0,
    target_sr: Optional[int] = None,
) -> str:
    """Blend a vocal track with its accompaniment and write the result.

    Args:
        vocals_path: Vocal track path.
        accompaniment_path: Accompaniment track path.
        output_path: Destination file path.
        vocals_volume: Vocal gain (0-2).
        accompaniment_volume: Accompaniment gain (0-2).
        reverb_amount: Vocal reverb wet level (0-1).
        target_sr: Output sample rate; None picks the higher of the two
            source rates.

    Returns:
        str: The output file path.

    Raises:
        ValueError: When both tracks are empty.
    """
    # Without an explicit rate, follow the higher of the two source rates.
    if target_sr is None or target_sr <= 0:
        target_sr = max(
            _probe_sample_rate(vocals_path),
            _probe_sample_rate(accompaniment_path),
        )

    if log:
        log.progress("开始混音处理...")
        log.audio(f"人声文件: {Path(vocals_path).name}")
        log.audio(f"伴奏文件: {Path(accompaniment_path).name}")
        log.config(f"人声音量: {vocals_volume}, 伴奏音量: {accompaniment_volume}")
        log.config(f"混响量: {reverb_amount}, 目标采样率: {target_sr}Hz")

    if log:
        log.detail("加载人声音频...")
    vocals, sr = load_audio_for_mix(vocals_path, target_sr)

    if log:
        log.detail("加载伴奏音频...")
    backing, _ = load_audio_for_mix(accompaniment_path, target_sr)

    if reverb_amount > 0 and PEDALBOARD_AVAILABLE:
        if log:
            log.progress("应用人声混响...")
        vocals = apply_reverb(vocals, sr, room_size=0.4, wet_level=reverb_amount)
    elif reverb_amount > 0 and log:
        log.warning("Pedalboard 不可用,跳过混响")

    # Per-track gain, soft-limited so neither stem clips on its own.
    vocals = soft_clip_array(vocals * vocals_volume, threshold=0.85, ceiling=0.95)
    backing = soft_clip_array(
        backing * accompaniment_volume,
        threshold=0.85,
        ceiling=0.95,
    )

    len_vocals = vocals.shape[-1]
    len_backing = backing.shape[-1]
    target_len = max(len_vocals, len_backing)

    if target_len <= 0:
        raise ValueError("混音失败:音频长度为 0")

    if log:
        log.detail(f"人声长度: {len_vocals}, 伴奏长度: {len_backing}")
        if len_vocals != len_backing:
            log.detail(f"长度不一致,已补齐到最长长度: {target_len}")

    vocals = adjust_audio_length(vocals, target_len)
    backing = adjust_audio_length(backing, target_len)

    if log:
        log.progress("混合音轨...")
    mixed = vocals + backing

    peak = float(np.max(np.abs(mixed)))
    if log:
        log.detail(f"混合后峰值: {peak:.4f}")

    # Final bus limiter before export.
    mixed = soft_clip_array(mixed, threshold=0.90, ceiling=0.98)
    if log:
        log.detail(f"软削波后峰值: {float(np.max(np.abs(mixed))):.4f}")

    # soundfile expects a (samples, channels) layout.
    if mixed.ndim == 2:
        mixed = mixed.T

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    if log:
        log.progress(f"保存混音文件: {output_path}")

    sf.write(output_path, mixed, sr)

    if log:
        log.success("混音完成")
        log.audio(f"输出时长: {target_len / sr:.2f}秒")
        log.audio(f"输出大小: {Path(output_path).stat().st_size / 1024 / 1024:.2f} MB")

    return output_path
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def check_pedalboard_available() -> bool:
    """Report whether the optional pedalboard dependency imported successfully."""
    return PEDALBOARD_AVAILABLE
|
lib/vocal_cleanup.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
音频后处理模块 - 齿音和呼吸音处理
|
| 4 |
+
基于研究文献的最佳实践
|
| 5 |
+
"""
|
| 6 |
+
import numpy as np
|
| 7 |
+
from scipy import signal
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def detect_sibilance_frames(audio: np.ndarray, sr: int, threshold_db: float = -20.0) -> np.ndarray:
    """
    Detect sibilant frames (high-frequency consonants: s, sh, ch, z, ...).

    Reference: "Managing Sibilance" - Sound on Sound.
    Sibilance energy concentrates roughly in the 4-10 kHz band.

    Args:
        audio: mono audio samples
        sr: sample rate in Hz
        threshold_db: high-band frame-energy threshold (dB); louder frames
            are sibilance candidates

    Returns:
        Boolean array, one entry per 10 ms hop; True marks a sibilant frame.
        For audio shorter than one 20 ms analysis frame an empty array is
        returned (robustness fix: the original frame-count formula went
        negative and crashed np.zeros()).
    """
    frame_length = int(0.02 * sr)  # 20 ms analysis frame
    hop_length = int(0.01 * sr)    # 10 ms hop

    # Guard: input shorter than a single frame -> no frames, no sibilance.
    if len(audio) < frame_length:
        return np.zeros(0, dtype=bool)

    # Band-pass 4-10 kHz to isolate the sibilance band
    # (upper edge clamped just below Nyquist for low sample rates).
    nyquist = sr / 2
    low_freq = 4000 / nyquist
    high_freq = min(10000 / nyquist, 0.99)

    sos = signal.butter(4, [low_freq, high_freq], btype='band', output='sos')
    high_freq_audio = signal.sosfilt(sos, audio)

    n_frames = 1 + (len(audio) - frame_length) // hop_length
    high_energy = np.zeros(n_frames)
    total_energy = np.zeros(n_frames)

    for i in range(n_frames):
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            break

        # Per-frame energy in the sibilance band vs. full-band energy.
        high_energy[i] = np.sum(high_freq_audio[start:end] ** 2)
        total_energy[i] = np.sum(audio[start:end] ** 2)

    # Fraction of frame energy living in the high band (guarded against /0).
    high_ratio = np.zeros_like(high_energy)
    mask = total_energy > 1e-10
    high_ratio[mask] = high_energy[mask] / total_energy[mask]

    high_energy_db = 10 * np.log10(high_energy + 1e-10)

    # Sibilant frame: loud in the high band AND dominated by it.
    is_sibilance = (high_energy_db > threshold_db) & (high_ratio > 0.3)

    return is_sibilance
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def reduce_sibilance(audio: np.ndarray, sr: int, reduction_db: float = 6.0) -> np.ndarray:
    """
    De-esser: attenuate detected sibilant frames.

    Reference: "Advanced Sibilance Control" - Mike's Mix Master.
    A time-domain gain curve is applied instead of band splitting, which
    avoids crossover phase artifacts.

    Args:
        audio: mono audio samples
        sr: sample rate in Hz
        reduction_db: attenuation applied to sibilant frames (dB)

    Returns:
        Audio with sibilant frames attenuated (input returned unchanged
        when no sibilance is detected).
    """
    sibilant = detect_sibilance_frames(audio, sr)
    if not np.any(sibilant):
        return audio

    frame_length = int(0.02 * sr)
    hop_length = int(0.01 * sr)

    floor = 10 ** (-reduction_db / 20)
    gain_curve = np.ones(len(audio))

    # Trapezoidal attenuation envelope: ramp down, hold, ramp back up.
    quarter = frame_length // 4
    envelope = np.concatenate([
        np.linspace(1.0, floor, quarter),
        np.full(frame_length // 2, floor),
        np.linspace(floor, 1.0, quarter),
    ])

    for idx in np.flatnonzero(sibilant):
        start = idx * hop_length
        if start + frame_length > len(audio):
            break
        seg = slice(start, start + len(envelope))
        # Where attenuated regions overlap, keep the deeper cut.
        gain_curve[seg] = np.minimum(gain_curve[seg], envelope)

    # Apply the gain directly in the time domain (no band separation).
    return audio * gain_curve
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def detect_breath_frames(audio: np.ndarray, sr: int, threshold_db: float = -40.0) -> np.ndarray:
    """
    Detect breath frames.

    Breath characteristics:
      - low energy
      - broadband, noise-like spectrum
      - typically located between phrases

    Args:
        audio: mono audio samples
        sr: sample rate in Hz
        threshold_db: frame-energy threshold (dB); quieter frames are
            breath candidates

    Returns:
        Boolean array, one entry per 10 ms hop; True marks a breath frame.
        For audio shorter than one 20 ms analysis frame an empty array is
        returned (robustness fix: the original frame-count formula went
        negative and crashed np.zeros()).
    """
    frame_length = int(0.02 * sr)  # 20 ms
    hop_length = int(0.01 * sr)    # 10 ms

    # Guard: input shorter than a single frame -> no frames, no breaths.
    if len(audio) < frame_length:
        return np.zeros(0, dtype=bool)

    n_frames = 1 + (len(audio) - frame_length) // hop_length
    is_breath = np.zeros(n_frames, dtype=bool)

    for i in range(n_frames):
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            break

        frame = audio[start:end]

        # Frame energy in dB (epsilon avoids log(0) on silence).
        energy = np.sum(frame ** 2)
        energy_db = 10 * np.log10(energy + 1e-10)

        # Spectral flatness (geometric / arithmetic mean of the magnitude
        # spectrum): near 1 for noise, near 0 for tonal content.
        fft = np.abs(np.fft.rfft(frame))
        geometric_mean = np.exp(np.mean(np.log(fft + 1e-10)))
        arithmetic_mean = np.mean(fft)
        spectral_flatness = geometric_mean / (arithmetic_mean + 1e-10)

        # Breath = quiet AND noise-like.
        is_breath[i] = (energy_db < threshold_db) and (spectral_flatness > 0.5)

    return is_breath
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def reduce_breath_noise(audio: np.ndarray, sr: int, reduction_db: float = 12.0) -> np.ndarray:
    """
    Attenuate breath noise between phrases.

    Reference: "How to REALLY Clean Vocals" - Waves.

    Args:
        audio: mono audio samples
        sr: sample rate in Hz
        reduction_db: attenuation applied to breath frames (dB)

    Returns:
        Audio with breath frames attenuated (input returned unchanged when
        no breaths are detected).
    """
    breaths = detect_breath_frames(audio, sr)
    if not np.any(breaths):
        return audio

    frame_length = int(0.02 * sr)
    hop_length = int(0.01 * sr)

    floor = 10 ** (-reduction_db / 20)
    gain_curve = np.ones(len(audio))

    # Trapezoidal envelope with short fades on both sides to avoid clicks.
    fade_length = frame_length // 4
    envelope = np.concatenate([
        np.linspace(1.0, floor, fade_length),
        np.full(frame_length - 2 * fade_length, floor),
        np.linspace(floor, 1.0, fade_length),
    ])

    for idx in np.flatnonzero(breaths):
        start = idx * hop_length
        if start + frame_length > len(audio):
            break
        seg = slice(start, start + len(envelope))
        # Where attenuated regions overlap, keep the deeper cut.
        gain_curve[seg] = np.minimum(gain_curve[seg], envelope)

    return audio * gain_curve
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def apply_vocal_cleanup(
    audio: np.ndarray,
    sr: int,
    reduce_sibilance_enabled: bool = True,
    reduce_breath_enabled: bool = True,
    sibilance_reduction_db: float = 4.0,
    breath_reduction_db: float = 8.0
) -> np.ndarray:
    """
    Run the full vocal clean-up chain (breath reduction, then de-essing).

    Args:
        audio: mono audio samples
        sr: sample rate in Hz
        reduce_sibilance_enabled: apply de-essing
        reduce_breath_enabled: apply breath-noise reduction
        sibilance_reduction_db: de-esser depth (dB)
        breath_reduction_db: breath-reduction depth (dB)

    Returns:
        Processed copy of the audio.
    """
    cleaned = audio.copy()

    # Breaths are the lowest-energy events, so they are attenuated first,
    # before de-essing changes any frame gains.
    if reduce_breath_enabled:
        cleaned = reduce_breath_noise(cleaned, sr, breath_reduction_db)

    if reduce_sibilance_enabled:
        cleaned = reduce_sibilance(cleaned, sr, sibilance_reduction_db)

    return cleaned
|
lib/vocoder_fix.py
ADDED
|
@@ -0,0 +1,385 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Vocoder伪影修复 - 针对呼吸音电音和长音撕裂
|
| 4 |
+
基于RVC社区反馈和研究文献
|
| 5 |
+
"""
|
| 6 |
+
import numpy as np
|
| 7 |
+
from scipy import signal
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def fix_phase_discontinuity(audio: np.ndarray, sr: int, chunk_boundaries: Optional[list] = None) -> np.ndarray:
    """
    Repair tearing caused by phase discontinuities.

    Reference: "Prosody-Guided Harmonic Attention for Phase-Coherent Neural
    Vocoding". Vocoders can drift in phase on sustained notes; detected
    phase jumps are smoothed back in over ~20 ms.

    Args:
        audio: mono audio samples
        sr: sample rate in Hz
        chunk_boundaries: chunk-boundary sample indices. Currently unused;
            kept for interface compatibility with callers.

    Returns:
        Repaired audio as float32, or the input unchanged when no
        discontinuity is found.
    """
    # Instantaneous phase/amplitude via the analytic (Hilbert) signal.
    analytic_signal = signal.hilbert(audio)
    instantaneous_phase = np.unwrap(np.angle(analytic_signal))
    amplitude = np.abs(analytic_signal)

    # A "jump" is a phase step far outside the typical range:
    # 2.5x the 99th percentile of absolute phase steps.
    phase_diff = np.diff(instantaneous_phase)
    phase_diff_threshold = np.percentile(np.abs(phase_diff), 99) * 2.5

    discontinuities = np.where(np.abs(phase_diff) > phase_diff_threshold)[0]

    if len(discontinuities) == 0:
        return audio

    phase_corrected = instantaneous_phase.copy()

    for disc_idx in discontinuities:
        phase_jump = phase_diff[disc_idx]

        # Bleed the correction out linearly over up to 20 ms after the jump.
        correction_length = min(int(0.02 * sr), len(phase_corrected) - disc_idx - 1)
        if correction_length > 0:
            correction_curve = np.linspace(phase_jump, 0, correction_length)
            phase_corrected[disc_idx + 1:disc_idx + 1 + correction_length] -= correction_curve

    # Resynthesize from the original amplitude and the corrected phase.
    # (Fix: removed a dead `result = audio.copy()` that was unconditionally
    # overwritten before use.)
    corrected_signal = amplitude * np.exp(1j * phase_corrected)
    return np.real(corrected_signal).astype(np.float32)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def reduce_breath_electric_noise(audio: np.ndarray, sr: int, f0: Optional[np.ndarray] = None) -> np.ndarray:
    """
    Reduce electric-sounding noise in breath segments.

    Reference: GitHub issue "Artefacting when speech has breath" — vocoders
    tend to produce electronic noise in unvoiced (F0 = 0) regions.

    Args:
        audio: mono audio samples
        sr: sample rate in Hz
        f0: optional F0 track; frames with F0 > 0 are treated as pitched and
            excluded from noise cleanup

    Returns:
        Processed audio (the high-pass of step 1 is always applied; the
        spectral cleanup of step 2 only runs when enough noise is detected).
    """
    # Step 1: remove DC offset and sub-80 Hz rumble — a common vocoder
    # low-frequency leakage artifact.
    from scipy import signal as scipy_signal

    # 4th-order Butterworth high-pass at 80 Hz (zero-state sosfilt).
    nyquist = sr / 2
    cutoff = 80 / nyquist

    sos = scipy_signal.butter(4, cutoff, btype='highpass', output='sos')
    audio = scipy_signal.sosfilt(sos, audio)

    # Step 2: detect and clean broadband / high-frequency noise.
    # 20 ms frames with a 10 ms hop.
    frame_length = int(0.02 * sr)
    hop_length = int(0.01 * sr)

    n_frames = 1 + (len(audio) - frame_length) // hop_length

    # Per-frame features: energy, spectral flatness (noise-likeness),
    # and the fraction of spectral energy above 4 kHz.
    energy = np.zeros(n_frames)
    spectral_flatness = np.zeros(n_frames)
    high_freq_ratio = np.zeros(n_frames)

    for i in range(n_frames):
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            break

        frame = audio[start:end]

        energy[i] = np.sum(frame ** 2)

        # Spectral flatness = geometric mean / arithmetic mean of the
        # magnitude spectrum; near 1 for noise, near 0 for tones.
        fft = np.abs(np.fft.rfft(frame))
        if np.sum(fft) > 1e-10:
            geometric_mean = np.exp(np.mean(np.log(fft + 1e-10)))
            arithmetic_mean = np.mean(fft)
            spectral_flatness[i] = geometric_mean / (arithmetic_mean + 1e-10)

        # High-band (>= 4 kHz) energy share of this frame.
        freqs = np.fft.rfftfreq(len(frame), 1/sr)
        high_freq_mask = freqs >= 4000
        high_freq_energy = np.sum(fft[high_freq_mask] ** 2)
        total_freq_energy = np.sum(fft ** 2)
        high_freq_ratio[i] = high_freq_energy / (total_freq_energy + 1e-10)

    energy_db = 10 * np.log10(energy + 1e-10)

    # Adaptive noise-floor detection: the quietest 5% of frames are noise
    # candidates, then the spectral features decide which really are noise.
    candidate_threshold = np.percentile(energy_db, 5)

    # Noise type 1: broadband hiss (flatness > 0.35).
    # Noise type 2: high-frequency "electric" whine (high-band share > 0.15).
    is_candidate = energy_db < candidate_threshold
    is_wideband_noise = is_candidate & (spectral_flatness > 0.35)
    is_highfreq_noise = is_candidate & (high_freq_ratio > 0.15)

    is_noise = is_wideband_noise | is_highfreq_noise

    # If fewer than 1% of frames look noisy the audio is clean — skip cleanup.
    noise_ratio = is_noise.sum() / len(is_noise)
    if noise_ratio < 0.01:
        return audio

    # When an F0 track is available, pitched frames (F0 > 0) are not noise.
    if f0 is not None and len(f0) > 0:
        # Map analysis-frame index to F0-frame index by simple ratio.
        f0_per_audio_frame = len(f0) / n_frames
        for i in range(n_frames):
            if not is_noise[i]:
                continue

            f0_idx = int(i * f0_per_audio_frame)
            if f0_idx < len(f0):
                # A positive F0 means voiced content, not floor noise.
                if f0[f0_idx] > 0:
                    is_noise[i] = False

    # Alias kept from an earlier revision that targeted breaths specifically.
    is_breath = is_noise

    # Cleanup aggressiveness scales with how noisy the audio is:
    # more noise implies a worse vocoder pass and a heavier hand.
    if noise_ratio < 0.05:
        # Little noise (1-5%): gentle cleanup.
        spectral_threshold_percentile = 85  # keep top 15% of bins
        magnitude_attenuation = 0.2         # attenuate the rest to 20%
        mix_ratio = 0.5                     # 50% cleaned / 50% original
    elif noise_ratio < 0.15:
        # Moderate noise (5-15%).
        spectral_threshold_percentile = 90  # keep top 10% of bins
        magnitude_attenuation = 0.1         # attenuate to 10%
        mix_ratio = 0.7                     # 70% cleaned
    else:
        # Heavy noise (> 15%): aggressive cleanup.
        spectral_threshold_percentile = 95  # keep top 5% of bins
        magnitude_attenuation = 0.05        # attenuate to 5%
        mix_ratio = 0.85                    # 85% cleaned

    # Apply per-frame spectral denoising to the flagged regions.
    result = audio.copy()

    for i in range(n_frames):
        if is_breath[i]:
            start = i * hop_length
            end = start + frame_length
            if end > len(audio):
                break

            # Spectral-gate denoise of this frame.
            frame = audio[start:end]

            fft = np.fft.rfft(frame)
            magnitude = np.abs(fft)
            phase = np.angle(fft)
            freqs = np.fft.rfftfreq(len(frame), 1/sr)

            # Decide whether this frame is high-frequency whine or
            # broadband hiss and treat it accordingly.
            high_freq_mask = freqs >= 4000
            high_freq_energy = np.sum(magnitude[high_freq_mask] ** 2)
            total_freq_energy = np.sum(magnitude ** 2)
            frame_high_ratio = high_freq_energy / (total_freq_energy + 1e-10)

            if frame_high_ratio > 0.15:
                # Electric whine: crush the high band, soften the mids.
                magnitude[high_freq_mask] *= 0.05  # high band down to 5%
                mid_freq_mask = (freqs >= 1000) & (freqs < 4000)
                magnitude[mid_freq_mask] *= 0.3
            else:
                # Broadband hiss: percentile spectral gate.
                threshold = np.percentile(magnitude, spectral_threshold_percentile)
                magnitude = np.where(magnitude > threshold, magnitude, magnitude * magnitude_attenuation)

            # Resynthesize the frame from gated magnitude + original phase.
            fft_cleaned = magnitude * np.exp(1j * phase)
            frame_cleaned = np.fft.irfft(fft_cleaned, n=len(frame))

            # Short fades avoid clicks at the frame edges.
            fade_length = min(hop_length // 2, len(frame) // 4)
            if fade_length > 0:
                fade_in = np.linspace(0, 1, fade_length)
                fade_out = np.linspace(1, 0, fade_length)

                frame_cleaned[:fade_length] *= fade_in
                frame_cleaned[-fade_length:] *= fade_out

            # Crossfade cleaned frame with the original by mix_ratio.
            # NOTE(review): overlapping noise frames overwrite each other
            # here (hop < frame) — confirm this is the intended behavior.
            result[start:end] = frame * (1 - mix_ratio) + frame_cleaned * mix_ratio

    return result
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def stabilize_sustained_notes(audio: np.ndarray, sr: int, f0: Optional[np.ndarray] = None) -> np.ndarray:
    """
    Stabilize sustained notes to reduce amplitude "tearing".

    Reference: "Mel Spectrogram Inversion with Stable Pitch" (Apple ML
    research) — vocoders tend to drift during long held notes.

    Args:
        audio: mono audio samples
        sr: sample rate in Hz
        f0: F0 track used to locate sustained notes; if None or empty, the
            audio is returned unchanged

    Returns:
        Audio with the amplitude envelope of sustained regions smoothed.
    """
    if f0 is None or len(f0) == 0:
        return audio

    # Detect sustained regions: F0 present and stable over the window.
    frame_length = int(0.02 * sr)
    hop_length = int(0.01 * sr)

    # Map analysis frames onto the F0 track by simple length ratio.
    n_audio_frames = 1 + (len(audio) - frame_length) // hop_length
    f0_per_audio_frame = len(f0) / n_audio_frames

    is_sustained = np.zeros(n_audio_frames, dtype=bool)

    # Examine a ~200 ms window (20 frames at the 10 ms hop) around each frame.
    window_size = 20
    for i in range(window_size, n_audio_frames - window_size):
        f0_idx = int(i * f0_per_audio_frame)
        if f0_idx >= len(f0):
            break

        # F0 values in the window (clamped to the track bounds).
        f0_window_start = max(0, f0_idx - window_size)
        f0_window_end = min(len(f0), f0_idx + window_size)
        f0_window = f0[f0_window_start:f0_window_end]

        # Ignore unvoiced samples (F0 == 0).
        f0_voiced = f0_window[f0_window > 0]

        if len(f0_voiced) > window_size * 0.8:  # at least 80% voiced
            # Relative F0 spread within the window.
            f0_std = np.std(f0_voiced)
            f0_mean = np.mean(f0_voiced)

            # Under 5% relative deviation counts as a sustained note.
            if f0_std / (f0_mean + 1e-6) < 0.05:
                is_sustained[i] = True

    # Smooth the amplitude envelope of each sustained run.
    result = audio.copy()

    i = 0
    while i < n_audio_frames:
        if is_sustained[i]:
            # Walk to the end of this contiguous sustained run.
            start_frame = i
            while i < n_audio_frames and is_sustained[i]:
                i += 1
            end_frame = i

            # Convert the frame run to a sample range.
            start_sample = start_frame * hop_length
            end_sample = min(end_frame * hop_length + frame_length, len(audio))

            # Too short to process; i already points past the run.
            if end_sample - start_sample < frame_length:
                continue

            sustained_segment = audio[start_sample:end_sample]

            # Amplitude envelope via the analytic (Hilbert) signal.
            envelope = np.abs(signal.hilbert(sustained_segment))

            # Low-pass (2nd-order Butterworth, 50 Hz) smoothed envelope,
            # zero-phase via filtfilt.
            b, a = signal.butter(2, 50 / (sr / 2), btype='low')
            smoothed_envelope = signal.filtfilt(b, a, envelope)

            # Only correct where the raw envelope deviates strongly from
            # the smoothed one (top quartile of deviations).
            envelope_variation = np.abs(envelope - smoothed_envelope)
            variation_threshold = np.percentile(envelope_variation, 75)

            # Blend mask: 1 where variation is large (use smoothed),
            # 0 where it is small (keep original).
            blend_mask = np.clip(envelope_variation / (variation_threshold + 1e-6), 0, 1)

            target_envelope = smoothed_envelope * blend_mask + envelope * (1 - blend_mask)

            # Re-shape the segment toward the target envelope; gain is
            # clamped to [0.5, 2.0] so noise is never amplified wildly.
            if np.max(envelope) > 1e-6:
                gain = target_envelope / (envelope + 1e-6)
                gain = np.clip(gain, 0.5, 2.0)
                result[start_sample:end_sample] = sustained_segment * gain

        # NOTE(review): after a sustained run, i already points one past the
        # run, so this increment skips an extra frame — confirm intended.
        i += 1

    return result
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
def apply_vocoder_artifact_fix(
    audio: np.ndarray,
    sr: int,
    f0: Optional[np.ndarray] = None,
    chunk_boundaries: Optional[list] = None,
    fix_phase: bool = True,
    fix_breath: bool = True,
    fix_sustained: bool = True
) -> np.ndarray:
    """
    Run the full vocoder-artifact repair chain.

    Args:
        audio: mono audio samples
        sr: sample rate in Hz
        f0: optional F0 track used by the breath and sustained-note stages
        chunk_boundaries: chunk-boundary sample indices for the phase stage
        fix_phase: repair phase discontinuities (note tearing)
        fix_breath: clean electric noise in breath segments
        fix_sustained: stabilize long sustained notes

    Returns:
        Repaired copy of the audio.
    """
    repaired = audio.copy()

    # Stage 1: phase-discontinuity repair (tearing on long notes).
    if fix_phase:
        repaired = fix_phase_discontinuity(repaired, sr, chunk_boundaries)

    # Stage 2: electric noise in breath / unvoiced segments.
    if fix_breath:
        repaired = reduce_breath_electric_noise(repaired, sr, f0)

    # Stage 3: sustained-note stabilization.
    if fix_sustained:
        repaired = stabilize_sustained_notes(repaired, sr, f0)

    return repaired
|