# Origin: Spaces / mason369 / AI-RVC (status: Running) - file size 7,426 bytes, revision b6f9c90
# -*- coding: utf-8 -*-
"""

音频后处理模块 - 齿音和呼吸音处理

基于研究文献的最佳实践

"""
import numpy as np
from scipy import signal
from typing import Optional


def detect_sibilance_frames(audio: np.ndarray, sr: int, threshold_db: float = -20.0) -> np.ndarray:
    """
    Detect sibilant frames (high-frequency consonants such as s, sh, ch, z).

    Reference: "Managing Sibilance" - Sound on Sound.
    Sibilance energy is concentrated mainly in the 4-10 kHz band.

    The signal is band-pass filtered to 4-10 kHz and analysed in 20 ms frames
    with a 10 ms hop. A frame is flagged when its band energy (sum of squared
    samples, expressed in dB) exceeds ``threshold_db`` AND the band carries
    more than 30% of the frame's total energy.

    Args:
        audio: Audio samples, framed along the first axis (assumed to be a
            1-D mono signal - confirm against callers).
        sr: Sample rate in Hz.
        threshold_db: Threshold for the per-frame high-band energy (dB).

    Returns:
        Boolean array with one entry per frame; True marks a sibilant frame.
        The array is empty when the audio is shorter than one frame, and all
        False when the sample rate is too low to represent the 4 kHz band edge.
    """
    frame_length = int(0.02 * sr)  # 20 ms frame
    hop_length = int(0.01 * sr)    # 10 ms hop

    # Not even one full frame (or a degenerate sample rate): report no frames
    # instead of computing a negative frame count or dividing by zero.
    if frame_length < 1 or hop_length < 1 or len(audio) < frame_length:
        return np.zeros(0, dtype=bool)

    n_frames = 1 + (len(audio) - frame_length) // hop_length

    # Band edges normalised to Nyquist; the upper edge is clamped below 1.0
    # because the filter design requires 0 < Wn < 1.
    nyquist = sr / 2
    low_freq = 4000 / nyquist
    high_freq = min(10000 / nyquist, 0.99)

    # At low sample rates the 4 kHz edge reaches or passes the (clamped) upper
    # edge, so the band does not exist and the filter design would raise.
    # Such a signal cannot contain sibilance in this band.
    if low_freq >= high_freq:
        return np.zeros(n_frames, dtype=bool)

    # 4th-order Butterworth band-pass isolating the sibilance band
    sos = signal.butter(4, [low_freq, high_freq], btype='band', output='sos')
    high_freq_audio = signal.sosfilt(sos, audio)

    # Per-frame energy of the band-limited and of the full-band signal.
    # n_frames is chosen so that every frame lies fully inside the signal.
    high_energy = np.zeros(n_frames)
    total_energy = np.zeros(n_frames)
    for i in range(n_frames):
        start = i * hop_length
        end = start + frame_length
        high_energy[i] = np.sum(high_freq_audio[start:end] ** 2)
        total_energy[i] = np.sum(audio[start:end] ** 2)

    # Share of the frame energy that sits in the sibilance band; frames that
    # are effectively silent keep a ratio of 0 to avoid dividing by ~0.
    high_ratio = np.zeros_like(high_energy)
    mask = total_energy > 1e-10
    high_ratio[mask] = high_energy[mask] / total_energy[mask]

    # Band energy in dB (small offset keeps log10 finite for silent frames)
    high_energy_db = 10 * np.log10(high_energy + 1e-10)

    # Sibilance: strong high-band energy that also dominates the frame
    is_sibilance = (high_energy_db > threshold_db) & (high_ratio > 0.3)

    return is_sibilance


def reduce_sibilance(audio: np.ndarray, sr: int, reduction_db: float = 6.0) -> np.ndarray:
    """
    Reduce sibilance (de-essing).

    Reference: "Advanced Sibilance Control" - Mike's Mix Master.

    Frames flagged by ``detect_sibilance_frames`` are attenuated by a
    broadband gain curve applied in the time domain. The signal is not split
    into frequency bands, which avoids the phase issues of band separation.

    Args:
        audio: Audio samples (assumed to be a 1-D mono signal - confirm
            against callers).
        sr: Sample rate in Hz.
        reduction_db: Attenuation applied to sibilant frames (dB).

    Returns:
        The processed audio. When the audio is shorter than one 20 ms frame
        or no sibilant frame is detected, the input array itself is returned
        unchanged (not a copy).
    """
    frame_length = int(0.02 * sr)  # 20 ms frame, matching the detector
    hop_length = int(0.01 * sr)    # 10 ms hop, matching the detector

    # Nothing can be analysed in less than one frame; return early rather
    # than handing the detector an input it cannot frame.
    if frame_length < 1 or hop_length < 1 or len(audio) < frame_length:
        return audio

    # Detect sibilant frames
    sibilance_frames = detect_sibilance_frames(audio, sr)

    if not np.any(sibilance_frames):
        return audio

    reduction_factor = 10 ** (-reduction_db / 20)

    # Per-frame gain envelope: fade down, hold, fade back up. It is identical
    # for every flagged frame, so it is built once outside the loop.
    fade_length = frame_length // 4
    envelope = np.concatenate([
        np.linspace(1.0, reduction_factor, fade_length),
        np.full(frame_length // 2, reduction_factor),
        np.linspace(reduction_factor, 1.0, fade_length),
    ])

    gain_curve = np.ones(len(audio))
    for i in np.flatnonzero(sibilance_frames):
        start = i * hop_length
        if start + frame_length > len(audio):
            break

        # Overlapping frames keep the strongest attenuation at each sample,
        # so consecutive flagged frames merge into one continuous dip.
        stop = start + len(envelope)
        gain_curve[start:stop] = np.minimum(gain_curve[start:stop], envelope)

    # Apply the gain directly in the time domain (no band splitting)
    return audio * gain_curve


def detect_breath_frames(audio: np.ndarray, sr: int, threshold_db: float = -40.0) -> np.ndarray:
    """
    Detect breath-noise frames.

    Breath characteristics targeted here:
    - low energy
    - broadband, noise-like spectrum
    - usually located between phrases

    The signal is analysed in 20 ms frames with a 10 ms hop. A frame is
    flagged when its energy (sum of squared samples, in dB) is below
    ``threshold_db`` AND its spectral flatness (geometric mean divided by
    arithmetic mean of the magnitude spectrum) exceeds 0.5. Note that
    all-zero frames also satisfy both conditions and are flagged.

    Args:
        audio: Audio samples (assumed to be a 1-D mono signal - confirm
            against callers).
        sr: Sample rate in Hz.
        threshold_db: Energy threshold (dB); frames below it are candidates.

    Returns:
        Boolean array with one entry per frame; True marks a breath frame.
        The array is empty when the audio is shorter than one frame.
    """
    frame_length = int(0.02 * sr)  # 20 ms frame
    hop_length = int(0.01 * sr)    # 10 ms hop

    # Not even one full frame (or a degenerate sample rate): report no frames
    # instead of computing a negative frame count or dividing by zero.
    if frame_length < 1 or hop_length < 1 or len(audio) < frame_length:
        return np.zeros(0, dtype=bool)

    # n_frames is chosen so that every frame lies fully inside the signal
    n_frames = 1 + (len(audio) - frame_length) // hop_length
    is_breath = np.zeros(n_frames, dtype=bool)

    for i in range(n_frames):
        start = i * hop_length
        frame = audio[start:start + frame_length]

        # Frame energy in dB (small offset keeps log10 finite for silence)
        energy = np.sum(frame ** 2)
        energy_db = 10 * np.log10(energy + 1e-10)

        # Spectral flatness as a noisiness measure: close to 1 for a flat,
        # noise-like spectrum and close to 0 for a peaky, tonal one.
        fft = np.abs(np.fft.rfft(frame))
        geometric_mean = np.exp(np.mean(np.log(fft + 1e-10)))
        arithmetic_mean = np.mean(fft)
        spectral_flatness = geometric_mean / (arithmetic_mean + 1e-10)

        # Breath: low energy combined with a flat spectrum
        is_breath[i] = (energy_db < threshold_db) and (spectral_flatness > 0.5)

    return is_breath


def reduce_breath_noise(audio: np.ndarray, sr: int, reduction_db: float = 12.0) -> np.ndarray:
    """
    Reduce breath noise.

    Reference: "How to REALLY Clean Vocals" - Waves.

    Frames flagged by ``detect_breath_frames`` are attenuated by a gain curve
    applied in the time domain, with short fades at the frame edges.

    Args:
        audio: Audio samples (assumed to be a 1-D mono signal - confirm
            against callers).
        sr: Sample rate in Hz.
        reduction_db: Attenuation applied to breath frames (dB).

    Returns:
        The processed audio. When the audio is shorter than one 20 ms frame
        or no breath frame is detected, the input array itself is returned
        unchanged (not a copy).
    """
    frame_length = int(0.02 * sr)  # 20 ms frame, matching the detector
    hop_length = int(0.01 * sr)    # 10 ms hop, matching the detector

    # Nothing can be analysed in less than one frame; return early rather
    # than handing the detector an input it cannot frame.
    if frame_length < 1 or hop_length < 1 or len(audio) < frame_length:
        return audio

    # Detect breath frames
    breath_frames = detect_breath_frames(audio, sr)

    if not np.any(breath_frames):
        return audio

    reduction_factor = 10 ** (-reduction_db / 20)

    # Per-frame gain envelope with smooth edges to avoid clicks. It spans the
    # whole frame and is identical for every flagged frame, so it is built
    # once outside the loop.
    fade_length = frame_length // 4
    envelope = np.concatenate([
        np.linspace(1.0, reduction_factor, fade_length),
        np.full(frame_length - 2 * fade_length, reduction_factor),
        np.linspace(reduction_factor, 1.0, fade_length),
    ])

    gain_curve = np.ones(len(audio))
    for i in np.flatnonzero(breath_frames):
        start = i * hop_length
        if start + frame_length > len(audio):
            break

        # Overlapping frames keep the strongest attenuation at each sample,
        # so consecutive flagged frames merge into one continuous dip.
        stop = start + len(envelope)
        gain_curve[start:stop] = np.minimum(gain_curve[start:stop], envelope)

    # Apply the gain curve
    return audio * gain_curve


def apply_vocal_cleanup(
    audio: np.ndarray,
    sr: int,
    reduce_sibilance_enabled: bool = True,
    reduce_breath_enabled: bool = True,
    sibilance_reduction_db: float = 4.0,
    breath_reduction_db: float = 8.0,
) -> np.ndarray:
    """
    Run the complete vocal clean-up chain.

    Breath reduction runs first, followed by sibilance reduction; each stage
    can be switched off independently.

    Args:
        audio: Audio samples.
        sr: Sample rate in Hz.
        reduce_sibilance_enabled: Whether to run sibilance reduction.
        reduce_breath_enabled: Whether to run breath-noise reduction.
        sibilance_reduction_db: Sibilance attenuation (dB).
        breath_reduction_db: Breath-noise attenuation (dB).

    Returns:
        The processed audio; a copy of the input when both stages are off.
    """
    # Start from a copy of the input rather than the caller's array
    cleaned = audio.copy()

    # Breaths go first because their energy is lower
    if reduce_breath_enabled:
        cleaned = reduce_breath_noise(cleaned, sr, reduction_db=breath_reduction_db)

    # Then tame the sibilants
    if reduce_sibilance_enabled:
        cleaned = reduce_sibilance(cleaned, sr, reduction_db=sibilance_reduction_db)

    return cleaned