AI-RVC / lib /vocal_cleanup.py
mason369's picture
Upload folder using huggingface_hub
b6f9c90 verified
# -*- coding: utf-8 -*-
"""
音频后处理模块 - 齿音和呼吸音处理
基于研究文献的最佳实践
"""
import numpy as np
from scipy import signal
from typing import Optional
def detect_sibilance_frames(audio: np.ndarray, sr: int, threshold_db: float = -20.0) -> np.ndarray:
"""
检测齿音帧 (s, sh, ch, z 等高频辅音)
参考: "Managing Sibilance" - Sound on Sound
齿音主要集中在 4-10kHz 频段
Args:
audio: 音频数据
sr: 采样率
threshold_db: 高频能量阈值 (dB)
Returns:
布尔数组,True 表示齿音帧
"""
# 设计高通滤波器提取高频成分 (4-10kHz)
nyquist = sr / 2
low_freq = 4000 / nyquist
high_freq = min(10000 / nyquist, 0.99)
# 带通滤波器
sos = signal.butter(4, [low_freq, high_freq], btype='band', output='sos')
high_freq_audio = signal.sosfilt(sos, audio)
# 计算帧能量
frame_length = int(0.02 * sr) # 20ms 帧
hop_length = int(0.01 * sr) # 10ms 跳跃
n_frames = 1 + (len(audio) - frame_length) // hop_length
high_energy = np.zeros(n_frames)
total_energy = np.zeros(n_frames)
for i in range(n_frames):
start = i * hop_length
end = start + frame_length
if end > len(audio):
break
# 高频能量
high_energy[i] = np.sum(high_freq_audio[start:end] ** 2)
# 总能量
total_energy[i] = np.sum(audio[start:end] ** 2)
# 计算高频能量比例
high_ratio = np.zeros_like(high_energy)
mask = total_energy > 1e-10
high_ratio[mask] = high_energy[mask] / total_energy[mask]
# 转换为 dB
high_energy_db = 10 * np.log10(high_energy + 1e-10)
# 齿音检测:高频能量高且高频比例大
is_sibilance = (high_energy_db > threshold_db) & (high_ratio > 0.3)
return is_sibilance
def reduce_sibilance(audio: np.ndarray, sr: int, reduction_db: float = 6.0) -> np.ndarray:
"""
减少齿音 (De-essing)
参考: "Advanced Sibilance Control" - Mike's Mix Master
使用多频段动态压缩技术
Args:
audio: 音频数据
sr: 采样率
reduction_db: 齿音衰减量 (dB)
Returns:
处理后的音频
"""
# 检测齿音帧
sibilance_frames = detect_sibilance_frames(audio, sr)
if not np.any(sibilance_frames):
return audio
# 计算衰减增益曲线(在时域应用,避免频段分离的相位问题)
frame_length = int(0.02 * sr)
hop_length = int(0.01 * sr)
gain_curve = np.ones(len(audio))
reduction_factor = 10 ** (-reduction_db / 20)
for i, is_sib in enumerate(sibilance_frames):
if is_sib:
start = i * hop_length
end = start + frame_length
if end > len(audio):
break
# 平滑过渡
fade_in = np.linspace(1.0, reduction_factor, frame_length // 4)
sustain = np.full(frame_length // 2, reduction_factor)
fade_out = np.linspace(reduction_factor, 1.0, frame_length // 4)
envelope = np.concatenate([fade_in, sustain, fade_out])
# 应用增益
gain_curve[start:start+len(envelope)] = np.minimum(
gain_curve[start:start+len(envelope)],
envelope
)
# 直接在时域应用增益(避免频段分离)
result = audio * gain_curve
return result
def detect_breath_frames(audio: np.ndarray, sr: int, threshold_db: float = -40.0) -> np.ndarray:
"""
检测呼吸音帧
呼吸音特征:
- 低能量
- 宽频噪声
- 通常在乐句之间
Args:
audio: 音频数据
sr: 采样率
threshold_db: 能量阈值 (dB)
Returns:
布尔数组,True 表示呼吸音帧
"""
frame_length = int(0.02 * sr) # 20ms
hop_length = int(0.01 * sr) # 10ms
n_frames = 1 + (len(audio) - frame_length) // hop_length
is_breath = np.zeros(n_frames, dtype=bool)
for i in range(n_frames):
start = i * hop_length
end = start + frame_length
if end > len(audio):
break
frame = audio[start:end]
# 计算能量
energy = np.sum(frame ** 2)
energy_db = 10 * np.log10(energy + 1e-10)
# 计算频谱平坦度 (噪声特征)
fft = np.abs(np.fft.rfft(frame))
geometric_mean = np.exp(np.mean(np.log(fft + 1e-10)))
arithmetic_mean = np.mean(fft)
spectral_flatness = geometric_mean / (arithmetic_mean + 1e-10)
# 呼吸音:低能量 + 高频谱平坦度
is_breath[i] = (energy_db < threshold_db) and (spectral_flatness > 0.5)
return is_breath
def reduce_breath_noise(audio: np.ndarray, sr: int, reduction_db: float = 12.0) -> np.ndarray:
"""
减少呼吸音噪声
参考: "How to REALLY Clean Vocals" - Waves
Args:
audio: 音频数据
sr: 采样率
reduction_db: 呼吸音衰减量 (dB)
Returns:
处理后的音频
"""
# 检测呼吸音帧
breath_frames = detect_breath_frames(audio, sr)
if not np.any(breath_frames):
return audio
# 计算衰减增益曲线
frame_length = int(0.02 * sr)
hop_length = int(0.01 * sr)
gain_curve = np.ones(len(audio))
reduction_factor = 10 ** (-reduction_db / 20)
for i, is_breath in enumerate(breath_frames):
if is_breath:
start = i * hop_length
end = start + frame_length
if end > len(audio):
break
# 平滑过渡,避免咔嗒声
fade_length = frame_length // 4
fade_in = np.linspace(1.0, reduction_factor, fade_length)
sustain = np.full(frame_length - 2 * fade_length, reduction_factor)
fade_out = np.linspace(reduction_factor, 1.0, fade_length)
envelope = np.concatenate([fade_in, sustain, fade_out])
# 应用增益
gain_curve[start:start+len(envelope)] = np.minimum(
gain_curve[start:start+len(envelope)],
envelope
)
# 应用增益曲线
result = audio * gain_curve
return result
def apply_vocal_cleanup(
audio: np.ndarray,
sr: int,
reduce_sibilance_enabled: bool = True,
reduce_breath_enabled: bool = True,
sibilance_reduction_db: float = 4.0,
breath_reduction_db: float = 8.0
) -> np.ndarray:
"""
应用完整的人声清理处理
Args:
audio: 音频数据
sr: 采样率
reduce_sibilance_enabled: 是否减少齿音
reduce_breath_enabled: 是否减少呼吸音
sibilance_reduction_db: 齿音衰减量 (dB)
breath_reduction_db: 呼吸音衰减量 (dB)
Returns:
处理后的音频
"""
result = audio.copy()
# 减少呼吸音(先处理,因为能量更低)
if reduce_breath_enabled:
result = reduce_breath_noise(result, sr, breath_reduction_db)
# 减少齿音
if reduce_sibilance_enabled:
result = reduce_sibilance(result, sr, sibilance_reduction_db)
return result