| |
|
| | """
|
| | 音频后处理模块 - 齿音和呼吸音处理
|
| | 基于研究文献的最佳实践
|
| | """
|
| | import numpy as np
|
| | from scipy import signal
|
| | from typing import Optional
|
| |
|
| |
|
def detect_sibilance_frames(audio: np.ndarray, sr: int, threshold_db: float = -20.0) -> np.ndarray:
    """Flag the analysis frames that contain sibilance (s, sh, ch, z, ...).

    The signal is band-passed to 4-10 kHz (the range where sibilance is
    concentrated, per "Managing Sibilance" - Sound on Sound) and analysed in
    20 ms frames with a 10 ms hop. A frame is reported as sibilant when both
    conditions hold:

    * its band-limited energy, in dB, is above ``threshold_db``;
    * the band holds more than 30% of the frame's total energy.

    Args:
        audio: 1-D array of audio samples.
        sr: Sample rate in Hz.
        threshold_db: Minimum band energy (dB) for a frame to count.

    Returns:
        Boolean array with one entry per frame, True for sibilant frames.
        The array is empty when ``audio`` is shorter than one frame, and all
        False when ``sr`` is too low for the 4-10 kHz band to be represented.
    """
    frame_length = int(0.02 * sr)
    hop_length = int(0.01 * sr)

    # Floor division of a negative numerator can push the count below zero
    # for very short inputs, which np.zeros() rejects; clamp it instead.
    n_frames = max(0, 1 + (len(audio) - frame_length) // hop_length)
    if n_frames == 0:
        return np.zeros(0, dtype=bool)

    # Band edges normalised to Nyquist, as expected by signal.butter.
    nyquist = sr / 2
    low_freq = 4000 / nyquist
    high_freq = min(10000 / nyquist, 0.99)
    if low_freq >= high_freq:
        # The sibilance band lies at or above Nyquist: butter() would raise
        # and there is nothing to detect at this sample rate.
        return np.zeros(n_frames, dtype=bool)

    sos = signal.butter(4, [low_freq, high_freq], btype='band', output='sos')
    high_freq_audio = signal.sosfilt(sos, audio)

    # Per-frame energy of the band-limited and of the full-band signal.
    high_energy = np.zeros(n_frames)
    total_energy = np.zeros(n_frames)
    for i in range(n_frames):
        start = i * hop_length
        end = start + frame_length
        high_energy[i] = np.sum(high_freq_audio[start:end] ** 2)
        total_energy[i] = np.sum(audio[start:end] ** 2)

    # Share of the energy that sits in the band; near-silent frames keep a
    # ratio of 0 so they never divide by (almost) zero.
    high_ratio = np.zeros_like(high_energy)
    mask = total_energy > 1e-10
    high_ratio[mask] = high_energy[mask] / total_energy[mask]

    high_energy_db = 10 * np.log10(high_energy + 1e-10)

    return (high_energy_db > threshold_db) & (high_ratio > 0.3)
|
| |
|
| |
|
def reduce_sibilance(audio: np.ndarray, sr: int, reduction_db: float = 6.0) -> np.ndarray:
    """Attenuate the frames flagged as sibilant (de-essing).

    Reference: "Advanced Sibilance Control" - Mike's Mix Master.

    Frames reported by ``detect_sibilance_frames`` are turned down by
    ``reduction_db``. Each flagged frame contributes a fade-in / sustain /
    fade-out gain envelope; overlapping envelopes are combined by taking the
    lowest gain at each sample. The gain is applied to the full-band signal.

    Args:
        audio: 1-D array of audio samples.
        sr: Sample rate in Hz.
        reduction_db: Attenuation applied to sibilant frames, in dB.

    Returns:
        The processed audio. When no sibilant frame is found the input array
        itself is returned unchanged (not a copy). Floating-point inputs keep
        their dtype.
    """
    sibilance_frames = detect_sibilance_frames(audio, sr)
    if not np.any(sibilance_frames):
        return audio

    frame_length = int(0.02 * sr)
    hop_length = int(0.01 * sr)
    reduction_factor = 10 ** (-reduction_db / 20)

    # The envelope is the same for every frame, so build it once. Sizing the
    # sustain as "whatever the fades leave over" keeps it exactly one frame
    # long even when frame_length is not a multiple of 4.
    fade_length = frame_length // 4
    envelope = np.concatenate([
        np.linspace(1.0, reduction_factor, fade_length),
        np.full(frame_length - 2 * fade_length, reduction_factor),
        np.linspace(reduction_factor, 1.0, fade_length),
    ])

    gain_curve = np.ones(len(audio))
    for i, is_sib in enumerate(sibilance_frames):
        if not is_sib:
            continue
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            # Defensive: never write a partial envelope past the signal end.
            break
        # Frames overlap by one hop; keep the strongest attenuation.
        gain_curve[start:end] = np.minimum(gain_curve[start:end], envelope)

    result = audio * gain_curve
    # The float64 gain curve would otherwise upcast e.g. float32 audio, making
    # the output dtype depend on whether anything was detected.
    if np.issubdtype(audio.dtype, np.floating):
        result = result.astype(audio.dtype, copy=False)
    return result
|
| |
|
| |
|
def detect_breath_frames(audio: np.ndarray, sr: int, threshold_db: float = -40.0) -> np.ndarray:
    """Flag the analysis frames that look like breath noise.

    The signal is analysed in 20 ms frames with a 10 ms hop. A frame is
    reported as breath when both conditions hold:

    * low energy: the frame energy, in dB, is below ``threshold_db``;
    * noise-like spectrum: the spectral flatness (geometric mean divided by
      arithmetic mean of the magnitude spectrum) is above 0.5.

    Args:
        audio: 1-D array of audio samples.
        sr: Sample rate in Hz.
        threshold_db: Energy ceiling (dB) for a frame to count as breath.

    Returns:
        Boolean array with one entry per frame, True for breath frames.
        The array is empty when ``audio`` is shorter than one frame.
    """
    frame_length = int(0.02 * sr)
    hop_length = int(0.01 * sr)

    # Floor division of a negative numerator can push the count below zero
    # for very short inputs, which np.zeros() rejects; clamp it instead.
    n_frames = max(0, 1 + (len(audio) - frame_length) // hop_length)
    is_breath = np.zeros(n_frames, dtype=bool)

    for i in range(n_frames):
        start = i * hop_length
        frame = audio[start:start + frame_length]

        energy_db = 10 * np.log10(np.sum(frame ** 2) + 1e-10)
        if not energy_db < threshold_db:
            # Too loud to be a breath; the entry stays False, skip the FFT.
            continue

        # Spectral flatness; the 1e-10 offsets guard log(0) and division by 0.
        magnitude = np.abs(np.fft.rfft(frame))
        geometric_mean = np.exp(np.mean(np.log(magnitude + 1e-10)))
        arithmetic_mean = np.mean(magnitude)
        spectral_flatness = geometric_mean / (arithmetic_mean + 1e-10)

        is_breath[i] = spectral_flatness > 0.5

    return is_breath
|
| |
|
| |
|
def reduce_breath_noise(audio: np.ndarray, sr: int, reduction_db: float = 12.0) -> np.ndarray:
    """Attenuate the frames flagged as breath noise.

    Reference: "How to REALLY Clean Vocals" - Waves.

    Frames reported by ``detect_breath_frames`` are turned down by
    ``reduction_db``. Each flagged frame contributes a fade-in / sustain /
    fade-out gain envelope; overlapping envelopes are combined by taking the
    lowest gain at each sample.

    Args:
        audio: 1-D array of audio samples.
        sr: Sample rate in Hz.
        reduction_db: Attenuation applied to breath frames, in dB.

    Returns:
        The processed audio. When no breath frame is found the input array
        itself is returned unchanged (not a copy). Floating-point inputs keep
        their dtype.
    """
    breath_frames = detect_breath_frames(audio, sr)
    if not np.any(breath_frames):
        return audio

    frame_length = int(0.02 * sr)
    hop_length = int(0.01 * sr)
    reduction_factor = 10 ** (-reduction_db / 20)

    # The envelope is the same for every frame, so build it once; the sustain
    # takes whatever the two fades leave over, making it one frame long.
    fade_length = frame_length // 4
    envelope = np.concatenate([
        np.linspace(1.0, reduction_factor, fade_length),
        np.full(frame_length - 2 * fade_length, reduction_factor),
        np.linspace(reduction_factor, 1.0, fade_length),
    ])

    gain_curve = np.ones(len(audio))
    for i, is_breath in enumerate(breath_frames):
        if not is_breath:
            continue
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            # Defensive: never write a partial envelope past the signal end.
            break
        # Frames overlap by one hop; keep the strongest attenuation.
        gain_curve[start:end] = np.minimum(gain_curve[start:end], envelope)

    result = audio * gain_curve
    # The float64 gain curve would otherwise upcast e.g. float32 audio, making
    # the output dtype depend on whether anything was detected.
    if np.issubdtype(audio.dtype, np.floating):
        result = result.astype(audio.dtype, copy=False)
    return result
|
| |
|
| |
|
def apply_vocal_cleanup(
    audio: np.ndarray,
    sr: int,
    reduce_sibilance_enabled: bool = True,
    reduce_breath_enabled: bool = True,
    sibilance_reduction_db: float = 4.0,
    breath_reduction_db: float = 8.0
) -> np.ndarray:
    """Run the full vocal clean-up chain on a copy of ``audio``.

    Breath reduction runs first, then sibilance reduction; each stage can be
    switched off independently.

    Args:
        audio: Audio samples; the caller's array is not modified.
        sr: Sample rate in Hz.
        reduce_sibilance_enabled: Whether to run the sibilance stage.
        reduce_breath_enabled: Whether to run the breath stage.
        sibilance_reduction_db: Attenuation for sibilant frames, in dB.
        breath_reduction_db: Attenuation for breath frames, in dB.

    Returns:
        The processed audio.
    """
    # Ordered pipeline. Each stage is wrapped in a lambda so that a disabled
    # stage is never evaluated.
    pipeline = (
        (reduce_breath_enabled,
         lambda samples: reduce_breath_noise(samples, sr, breath_reduction_db)),
        (reduce_sibilance_enabled,
         lambda samples: reduce_sibilance(samples, sr, sibilance_reduction_db)),
    )

    cleaned = audio.copy()
    for enabled, stage in pipeline:
        if enabled:
            cleaned = stage(cleaned)
    return cleaned
|
| |
|