# -*- coding: utf-8 -*-
"""
Vocoder伪影修复 - 针对呼吸音电音和长音撕裂
基于RVC社区反馈和研究文献
"""
import numpy as np
from scipy import signal
from typing import Optional
def fix_phase_discontinuity(audio: np.ndarray, sr: int, chunk_boundaries: Optional[list] = None) -> np.ndarray:
    """Repair tearing artifacts caused by phase discontinuities.

    Vocoders can produce phase discontinuities on sustained notes, which
    are audible as tearing.  This detects abrupt jumps in the instantaneous
    phase (via the Hilbert analytic signal) and smooths each jump out over
    a short linear transition.

    Reference: "Prosody-Guided Harmonic Attention for Phase-Coherent Neural
    Vocoding" (arXiv:2601.14472).

    Args:
        audio: 1-D audio samples.
        sr: sample rate in Hz.
        chunk_boundaries: chunk boundary sample indices; currently unused,
            kept for interface compatibility.

    Returns:
        The repaired audio (float32), or the input array unchanged when no
        discontinuity is found.
    """
    # hilbert() fails on empty input and np.diff needs at least 2 samples.
    if len(audio) < 2:
        return audio
    # Instantaneous phase / amplitude from the analytic signal.
    analytic_signal = signal.hilbert(audio)
    instantaneous_phase = np.unwrap(np.angle(analytic_signal))
    amplitude = np.abs(analytic_signal)
    # Detect abrupt phase jumps: anything well beyond the 99th percentile.
    phase_diff = np.diff(instantaneous_phase)
    phase_diff_threshold = np.percentile(np.abs(phase_diff), 99) * 2.5
    discontinuities = np.where(np.abs(phase_diff) > phase_diff_threshold)[0]
    if len(discontinuities) == 0:
        return audio
    # Apply a cumulative correction after each discontinuity.
    phase_corrected = instantaneous_phase.copy()
    for disc_idx in discontinuities:
        phase_jump = phase_diff[disc_idx]
        # Spread the correction over up to 20 ms following the jump.
        correction_length = min(int(0.02 * sr), len(phase_corrected) - disc_idx - 1)
        if correction_length > 0:
            # Linear taper from full correction down to zero.
            correction_curve = np.linspace(phase_jump, 0, correction_length)
            phase_corrected[disc_idx + 1:disc_idx + 1 + correction_length] -= correction_curve
    # Resynthesize with the corrected phase and the original amplitude envelope.
    corrected_signal = amplitude * np.exp(1j * phase_corrected)
    return np.real(corrected_signal).astype(np.float32)
def reduce_breath_electric_noise(audio: np.ndarray, sr: int, f0: Optional[np.ndarray] = None) -> np.ndarray:
    """Suppress electronic noise in breath / low-energy regions.

    Reference: GitHub issue #65 "Artefacting when speech has breath".
    Vocoders tend to emit electronic noise in regions where F0 == 0.

    Pipeline:
      1. High-pass at 80 Hz to remove DC offset / low-frequency leakage.
      2. Frame-wise detection of noise-floor frames (low energy combined
         with either high spectral flatness or a large high-frequency
         energy share).
      3. Spectral-gate denoising of the detected frames, with strength
         scaled by how much noise was found.

    Args:
        audio: 1-D audio samples.
        sr: sample rate in Hz.
        f0: optional F0 track; voiced frames (F0 > 0) are never treated
            as noise.

    Returns:
        The processed audio.  Inputs shorter than one analysis frame are
        returned after high-pass filtering only.
    """
    # -- Step 1: 4th-order Butterworth high-pass, 80 Hz cutoff. --
    # Uses the module-level `signal` import; the original re-imported scipy here.
    nyquist = sr / 2
    cutoff = 80 / nyquist
    sos = signal.butter(4, cutoff, btype='highpass', output='sos')
    audio = signal.sosfilt(sos, audio)

    # -- Step 2: frame-wise noise detection. --
    frame_length = int(0.02 * sr)  # 20 ms
    hop_length = int(0.01 * sr)    # 10 ms
    # Too short for even one analysis frame: nothing more we can do
    # (the original crashed here with np.zeros(negative)).
    if len(audio) < frame_length:
        return audio
    n_frames = 1 + (len(audio) - frame_length) // hop_length

    energy = np.zeros(n_frames)
    spectral_flatness = np.zeros(n_frames)
    high_freq_ratio = np.zeros(n_frames)  # share of energy above 4 kHz
    for i in range(n_frames):
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            break
        frame = audio[start:end]
        energy[i] = np.sum(frame ** 2)
        # Spectral flatness (geometric / arithmetic mean): close to 1 for noise.
        fft = np.abs(np.fft.rfft(frame))
        if np.sum(fft) > 1e-10:
            geometric_mean = np.exp(np.mean(np.log(fft + 1e-10)))
            arithmetic_mean = np.mean(fft)
            spectral_flatness[i] = geometric_mean / (arithmetic_mean + 1e-10)
        # High-frequency (>= 4 kHz) energy share.
        freqs = np.fft.rfftfreq(len(frame), 1 / sr)
        high_freq_mask = freqs >= 4000
        high_freq_energy = np.sum(fft[high_freq_mask] ** 2)
        total_freq_energy = np.sum(fft ** 2)
        high_freq_ratio[i] = high_freq_energy / (total_freq_energy + 1e-10)

    # Frame energy in dB (not normalized; used only for ranking frames).
    energy_db = 10 * np.log10(energy + 1e-10)
    # Candidate noise-floor frames: bottom 5% by energy.
    candidate_threshold = np.percentile(energy_db, 5)
    is_candidate = energy_db < candidate_threshold
    # Noise type 1: wideband noise (spectral flatness > 0.35).
    is_wideband_noise = is_candidate & (spectral_flatness > 0.35)
    # Noise type 2: high-frequency hiss (high-frequency share > 0.15).
    is_highfreq_noise = is_candidate & (high_freq_ratio > 0.15)
    is_noise = is_wideband_noise | is_highfreq_noise

    # Almost no noise frames (< 1%): the audio is clean, leave it alone.
    noise_ratio = is_noise.sum() / len(is_noise)
    if noise_ratio < 0.01:
        return audio

    # When F0 is available, voiced frames (F0 > 0) are not noise.
    if f0 is not None and len(f0) > 0:
        f0_per_audio_frame = len(f0) / n_frames
        for i in range(n_frames):
            if not is_noise[i]:
                continue
            f0_idx = int(i * f0_per_audio_frame)
            if f0_idx < len(f0) and f0[f0_idx] > 0:
                is_noise[i] = False

    # -- Step 3: scale the cleanup strength by how noisy the audio is. --
    # More noise frames imply a worse vocoder output, so clean harder.
    if noise_ratio < 0.05:
        # Little noise (1-5%): gentle cleanup.
        spectral_threshold_percentile = 85  # keep the top 15% of bins
        magnitude_attenuation = 0.2         # attenuate the rest to 20%
        mix_ratio = 0.5                     # 50% cleaned signal in the blend
    elif noise_ratio < 0.15:
        # Moderate noise (5-15%).
        spectral_threshold_percentile = 90
        magnitude_attenuation = 0.1
        mix_ratio = 0.7
    else:
        # Heavy noise (> 15%): aggressive cleanup.
        spectral_threshold_percentile = 95
        magnitude_attenuation = 0.05
        mix_ratio = 0.85

    result = audio.copy()
    for i in range(n_frames):
        if not is_noise[i]:
            continue
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            break
        frame = audio[start:end]
        fft = np.fft.rfft(frame)
        magnitude = np.abs(fft)
        phase = np.angle(fft)
        freqs = np.fft.rfftfreq(len(frame), 1 / sr)
        # Classify this frame: high-frequency hiss vs wideband noise.
        high_freq_mask = freqs >= 4000
        high_freq_energy = np.sum(magnitude[high_freq_mask] ** 2)
        total_freq_energy = np.sum(magnitude ** 2)
        frame_high_ratio = high_freq_energy / (total_freq_energy + 1e-10)
        if frame_high_ratio > 0.15:
            # High-frequency hiss: attenuate the high band hard ...
            magnitude[high_freq_mask] *= 0.05
            # ... and the mid band (1-4 kHz) gently.
            mid_freq_mask = (freqs >= 1000) & (freqs < 4000)
            magnitude[mid_freq_mask] *= 0.3
        else:
            # Wideband noise: spectral gate at the adaptive percentile.
            threshold = np.percentile(magnitude, spectral_threshold_percentile)
            magnitude = np.where(magnitude > threshold, magnitude, magnitude * magnitude_attenuation)
        # Resynthesize the cleaned frame.
        fft_cleaned = magnitude * np.exp(1j * phase)
        frame_cleaned = np.fft.irfft(fft_cleaned, n=len(frame))
        # Fade the cleaned frame's edges to avoid clicks at frame borders.
        fade_length = min(hop_length // 2, len(frame) // 4)
        if fade_length > 0:
            frame_cleaned[:fade_length] *= np.linspace(0, 1, fade_length)
            frame_cleaned[-fade_length:] *= np.linspace(1, 0, fade_length)
        # Blend original and cleaned frame by the adaptive mix ratio.
        result[start:end] = frame * (1 - mix_ratio) + frame_cleaned * mix_ratio
    return result
def stabilize_sustained_notes(audio: np.ndarray, sr: int, f0: Optional[np.ndarray] = None) -> np.ndarray:
    """Stabilize sustained notes to prevent tearing.

    Reference: "Mel Spectrogram Inversion with Stable Pitch" (Apple
    Research).  Vocoders drift in phase on long sustained notes; this
    detects regions with a stable F0 and smooths their amplitude envelope
    where it fluctuates abruptly.

    Args:
        audio: 1-D audio samples.
        sr: sample rate in Hz.
        f0: F0 track used to locate sustained notes; when absent, the
            audio is returned unchanged.

    Returns:
        The stabilized audio.
    """
    if f0 is None or len(f0) == 0:
        return audio
    frame_length = int(0.02 * sr)  # 20 ms
    hop_length = int(0.01 * sr)    # 10 ms
    # Too short for even one analysis frame: nothing to stabilize
    # (the original crashed here with np.zeros(negative) / ZeroDivisionError).
    if len(audio) < frame_length:
        return audio
    # Map F0 indices onto audio analysis frames.
    n_audio_frames = 1 + (len(audio) - frame_length) // hop_length
    f0_per_audio_frame = len(f0) / n_audio_frames
    is_sustained = np.zeros(n_audio_frames, dtype=bool)
    # A frame is "sustained" when F0 is mostly voiced and stable
    # (std/mean < 5%) within a 200 ms window around it.
    window_size = 20  # frames, i.e. 200 ms
    for i in range(window_size, n_audio_frames - window_size):
        f0_idx = int(i * f0_per_audio_frame)
        if f0_idx >= len(f0):
            break
        f0_window_start = max(0, f0_idx - window_size)
        f0_window_end = min(len(f0), f0_idx + window_size)
        f0_window = f0[f0_window_start:f0_window_end]
        # Ignore unvoiced (F0 == 0) samples.
        f0_voiced = f0_window[f0_window > 0]
        if len(f0_voiced) > window_size * 0.8:  # at least 80% voiced
            f0_std = np.std(f0_voiced)
            f0_mean = np.mean(f0_voiced)
            # F0 varying less than 5% counts as a sustained note.
            if f0_std / (f0_mean + 1e-6) < 0.05:
                is_sustained[i] = True
    # Smooth the amplitude envelope inside each sustained run.
    result = audio.copy()
    i = 0
    while i < n_audio_frames:
        if is_sustained[i]:
            # Find the extent of this sustained run.
            start_frame = i
            while i < n_audio_frames and is_sustained[i]:
                i += 1
            end_frame = i
            start_sample = start_frame * hop_length
            end_sample = min(end_frame * hop_length + frame_length, len(audio))
            if end_sample - start_sample < frame_length:
                continue
            sustained_segment = audio[start_sample:end_sample]
            # Amplitude envelope via the Hilbert analytic signal, then
            # low-pass (50 Hz) smoothing.
            envelope = np.abs(signal.hilbert(sustained_segment))
            b, a = signal.butter(2, 50 / (sr / 2), btype='low')
            smoothed_envelope = signal.filtfilt(b, a, envelope)
            # Only flatten where the envelope deviates strongly from its
            # smoothed version; elsewhere keep the original dynamics.
            envelope_variation = np.abs(envelope - smoothed_envelope)
            variation_threshold = np.percentile(envelope_variation, 75)
            blend_mask = np.clip(envelope_variation / (variation_threshold + 1e-6), 0, 1)
            target_envelope = smoothed_envelope * blend_mask + envelope * (1 - blend_mask)
            if np.max(envelope) > 1e-6:
                # Multiplicative gain (no division by the raw envelope,
                # which would amplify noise), clamped to [0.5, 2.0].
                gain = target_envelope / (envelope + 1e-6)
                gain = np.clip(gain, 0.5, 2.0)
                result[start_sample:end_sample] = sustained_segment * gain
        i += 1
    return result
def apply_vocoder_artifact_fix(
    audio: np.ndarray,
    sr: int,
    f0: Optional[np.ndarray] = None,
    chunk_boundaries: Optional[list] = None,
    fix_phase: bool = True,
    fix_breath: bool = True,
    fix_sustained: bool = True
) -> np.ndarray:
    """Run the full vocoder-artifact repair pipeline.

    Stages, applied in order when enabled:
      1. phase-discontinuity repair (tearing on sustained notes),
      2. breath electric-noise reduction,
      3. sustained-note stabilization.

    Args:
        audio: 1-D audio samples.
        sr: sample rate in Hz.
        f0: optional F0 track forwarded to the breath and sustained stages.
        chunk_boundaries: chunk boundary indices forwarded to the phase stage.
        fix_phase: enable phase-discontinuity repair.
        fix_breath: enable breath electric-noise reduction.
        fix_sustained: enable sustained-note stabilization.

    Returns:
        The repaired audio.
    """
    stages = (
        (fix_phase, lambda x: fix_phase_discontinuity(x, sr, chunk_boundaries)),
        (fix_breath, lambda x: reduce_breath_electric_noise(x, sr, f0)),
        (fix_sustained, lambda x: stabilize_sustained_notes(x, sr, f0)),
    )
    processed = audio.copy()
    for enabled, stage in stages:
        if enabled:
            processed = stage(processed)
    return processed