# AI-RVC / lib / vocoder_fix.py
# Uploaded via huggingface_hub by mason369 (revision b6f9c90, verified)
# -*- coding: utf-8 -*-
"""
Vocoder伪影修复 - 针对呼吸音电音和长音撕裂
基于RVC社区反馈和研究文献
"""
import numpy as np
from scipy import signal
from typing import Optional
def fix_phase_discontinuity(audio: np.ndarray, sr: int, chunk_boundaries: Optional[list] = None) -> np.ndarray:
    """Repair tearing artifacts caused by phase discontinuities.

    Uses the Hilbert transform to obtain the instantaneous phase, detects
    abnormal phase jumps with an adaptive threshold, fades a correction
    over ~20 ms after each jump, and resynthesizes the signal from the
    corrected phase.

    Reference: "Prosody-Guided Harmonic Attention for Phase-Coherent Neural
    Vocoding" (arXiv:2601.14472). Vocoders tend to produce phase
    discontinuities on sustained notes, which is audible as tearing.

    Args:
        audio: 1-D audio samples.
        sr: sample rate in Hz.
        chunk_boundaries: chunk boundary sample indices (currently unused;
            kept for interface compatibility).

    Returns:
        Audio with phase discontinuities smoothed (float32). The input is
        returned unchanged when it is too short to analyze or when no
        discontinuity is detected.
    """
    # Guard: diff/percentile below need at least a few samples.
    if audio.size < 3:
        return audio

    # Instantaneous phase and amplitude via the analytic signal.
    analytic_signal = signal.hilbert(audio)
    instantaneous_phase = np.unwrap(np.angle(analytic_signal))
    amplitude = np.abs(analytic_signal)

    # A jump is "abnormal" when it exceeds 2.5x the 99th percentile of the
    # absolute phase increments (adaptive, scale-free threshold).
    phase_diff = np.diff(instantaneous_phase)
    phase_diff_threshold = np.percentile(np.abs(phase_diff), 99) * 2.5
    discontinuities = np.where(np.abs(phase_diff) > phase_diff_threshold)[0]
    if len(discontinuities) == 0:
        return audio

    # Apply a fading correction after each discontinuity.
    phase_corrected = instantaneous_phase.copy()
    for disc_idx in discontinuities:
        phase_jump = phase_diff[disc_idx]
        # Fade the correction from the full jump down to zero over 20 ms so
        # the fix itself does not introduce a new discontinuity.
        correction_length = min(int(0.02 * sr), len(phase_corrected) - disc_idx - 1)
        if correction_length > 0:
            correction_curve = np.linspace(phase_jump, 0, correction_length)
            phase_corrected[disc_idx + 1:disc_idx + 1 + correction_length] -= correction_curve

    # Resynthesize from the original amplitude and the corrected phase.
    corrected_signal = amplitude * np.exp(1j * phase_corrected)
    return np.real(corrected_signal).astype(np.float32)
def reduce_breath_electric_noise(audio: np.ndarray, sr: int, f0: Optional[np.ndarray] = None) -> np.ndarray:
    """Suppress electronic noise in breath / background segments.

    Reference: GitHub issue #65 "Artefacting when speech has breath".
    Problem: vocoders emit electronic noise in unvoiced (F0 = 0) regions.

    Pipeline:
      1. 80 Hz high-pass to remove DC offset and low-frequency leakage
         (a common vocoder artifact).
      2. Frame-wise detection of low-energy noise frames — wide-band noise
         via spectral flatness, high-frequency hiss via the >= 4 kHz energy
         ratio — optionally vetoed by voiced (F0 > 0) frames.
      3. Spectral-gate attenuation of the detected frames, with cleaning
         strength scaled to the overall noise ratio.

    Args:
        audio: 1-D audio samples.
        sr: sample rate in Hz.
        f0: optional F0 contour used to protect voiced frames.

    Returns:
        Processed audio. Note the 80 Hz high-pass is applied even when no
        noise frames are found.
    """
    # --- Step 1: 4th-order Butterworth high-pass at 80 Hz ---
    nyquist = sr / 2
    cutoff = 80 / nyquist
    sos = signal.butter(4, cutoff, btype='highpass', output='sos')
    audio = signal.sosfilt(sos, audio)

    # --- Step 2: frame-wise noise detection ---
    frame_length = int(0.02 * sr)  # 20 ms analysis window
    hop_length = int(0.01 * sr)    # 10 ms hop
    # Guard: audio shorter than one frame cannot be analyzed frame-wise.
    if len(audio) < frame_length:
        return audio
    n_frames = 1 + (len(audio) - frame_length) // hop_length

    # The frequency axis is identical for every frame; compute it once.
    freqs = np.fft.rfftfreq(frame_length, 1 / sr)
    high_freq_mask = freqs >= 4000

    energy = np.zeros(n_frames)
    spectral_flatness = np.zeros(n_frames)
    high_freq_ratio = np.zeros(n_frames)  # share of spectral energy above 4 kHz

    for i in range(n_frames):
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            break
        frame = audio[start:end]
        energy[i] = np.sum(frame ** 2)
        fft = np.abs(np.fft.rfft(frame))
        if np.sum(fft) > 1e-10:
            # Spectral flatness (geometric / arithmetic mean of magnitudes)
            # is close to 1 for noise-like spectra, low for tonal content.
            geometric_mean = np.exp(np.mean(np.log(fft + 1e-10)))
            arithmetic_mean = np.mean(fft)
            spectral_flatness[i] = geometric_mean / (arithmetic_mean + 1e-10)
        high_freq_energy = np.sum(fft[high_freq_mask] ** 2)
        total_freq_energy = np.sum(fft ** 2)
        high_freq_ratio[i] = high_freq_energy / (total_freq_energy + 1e-10)

    energy_db = 10 * np.log10(energy + 1e-10)

    # Adaptive noise-floor detection: the quietest 5% of frames are
    # candidates; within them, classify by spectral shape.
    candidate_threshold = np.percentile(energy_db, 5)
    is_candidate = energy_db < candidate_threshold
    # Noise type 1: wide-band noise (spectral flatness > 0.35).
    is_wideband_noise = is_candidate & (spectral_flatness > 0.35)
    # Noise type 2: high-frequency hiss (>= 4 kHz share > 0.15).
    is_highfreq_noise = is_candidate & (high_freq_ratio > 0.15)
    is_noise = is_wideband_noise | is_highfreq_noise

    # Fewer than 1% noise frames: the audio is already clean, skip cleaning.
    noise_ratio = is_noise.sum() / len(is_noise)
    if noise_ratio < 0.01:
        return audio

    # If an F0 contour is available, voiced frames (F0 > 0) are never noise.
    if f0 is not None and len(f0) > 0:
        f0_per_audio_frame = len(f0) / n_frames
        for i in range(n_frames):
            if not is_noise[i]:
                continue
            f0_idx = int(i * f0_per_audio_frame)
            if f0_idx < len(f0) and f0[f0_idx] > 0:
                is_noise[i] = False

    # --- Step 3: cleaning strength scales with how noisy the audio is ---
    # More noise implies a worse vocoder output, warranting harder cleaning.
    if noise_ratio < 0.05:
        # 1-5% noise: gentle cleaning.
        spectral_threshold_percentile = 85  # keep the top 15% of bins
        magnitude_attenuation = 0.2         # attenuate the rest to 20%
        mix_ratio = 0.5                     # blend in 50% cleaned signal
    elif noise_ratio < 0.15:
        # 5-15% noise: moderate cleaning.
        spectral_threshold_percentile = 90  # keep the top 10%
        magnitude_attenuation = 0.1         # attenuate to 10%
        mix_ratio = 0.7                     # 70% cleaned
    else:
        # >15% noise: aggressive cleaning.
        spectral_threshold_percentile = 95  # keep the top 5%
        magnitude_attenuation = 0.05        # attenuate to 5%
        mix_ratio = 0.85                    # 85% cleaned

    result = audio.copy()
    for i in range(n_frames):
        if not is_noise[i]:
            continue
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            break
        frame = audio[start:end]
        fft = np.fft.rfft(frame)
        magnitude = np.abs(fft)
        phase = np.angle(fft)
        # Classify this frame: high-frequency hiss vs wide-band noise.
        high_freq_energy = np.sum(magnitude[high_freq_mask] ** 2)
        total_freq_energy = np.sum(magnitude ** 2)
        frame_high_ratio = high_freq_energy / (total_freq_energy + 1e-10)
        if frame_high_ratio > 0.15:
            # High-frequency hiss: attenuate highs hard, mids (1-4 kHz) gently.
            magnitude[high_freq_mask] *= 0.05
            mid_freq_mask = (freqs >= 1000) & (freqs < 4000)
            magnitude[mid_freq_mask] *= 0.3
        else:
            # Wide-band noise: spectral gating at the adaptive percentile.
            threshold = np.percentile(magnitude, spectral_threshold_percentile)
            magnitude = np.where(magnitude > threshold, magnitude, magnitude * magnitude_attenuation)
        # Resynthesize the cleaned frame with the original phase.
        fft_cleaned = magnitude * np.exp(1j * phase)
        frame_cleaned = np.fft.irfft(fft_cleaned, n=len(frame))
        # Fade the frame edges to avoid clicks at frame boundaries.
        fade_length = min(hop_length // 2, len(frame) // 4)
        if fade_length > 0:
            frame_cleaned[:fade_length] *= np.linspace(0, 1, fade_length)
            frame_cleaned[-fade_length:] *= np.linspace(1, 0, fade_length)
        # Blend cleaned and original frames at the adaptive mix ratio.
        result[start:end] = frame * (1 - mix_ratio) + frame_cleaned * mix_ratio
    return result
def stabilize_sustained_notes(audio: np.ndarray, sr: int, f0: Optional[np.ndarray] = None) -> np.ndarray:
    """Stabilize sustained notes to prevent tearing artifacts.

    Reference: "Mel Spectrogram Inversion with Stable Pitch" (Apple
    Research). Vocoders tend to drift during long sustained notes; this
    smooths the amplitude envelope in regions where F0 is stable over an
    extended window.

    Args:
        audio: 1-D audio samples.
        sr: sample rate in Hz.
        f0: F0 contour used to locate sustained notes; without it the
            audio is returned unchanged.

    Returns:
        Audio with smoothed amplitude envelopes in sustained regions.
    """
    if f0 is None or len(f0) == 0:
        return audio

    frame_length = int(0.02 * sr)  # 20 ms
    hop_length = int(0.01 * sr)    # 10 ms
    # Guard: audio shorter than one frame cannot be analyzed frame-wise.
    if len(audio) < frame_length:
        return audio

    # Map F0 indices onto audio analysis frames.
    n_audio_frames = 1 + (len(audio) - frame_length) // hop_length
    f0_per_audio_frame = len(f0) / n_audio_frames

    # Mark frames whose surrounding ~200 ms F0 window is voiced and flat.
    is_sustained = np.zeros(n_audio_frames, dtype=bool)
    window_size = 20  # frames to each side, i.e. ~200 ms of context
    for i in range(window_size, n_audio_frames - window_size):
        f0_idx = int(i * f0_per_audio_frame)
        if f0_idx >= len(f0):
            break
        f0_window_start = max(0, f0_idx - window_size)
        f0_window_end = min(len(f0), f0_idx + window_size)
        f0_window = f0[f0_window_start:f0_window_end]
        # Drop unvoiced (F0 = 0) samples.
        f0_voiced = f0_window[f0_window > 0]
        # Require more than 0.8 * window_size voiced samples (~40% of the
        # full two-sided window).
        if len(f0_voiced) > window_size * 0.8:
            f0_std = np.std(f0_voiced)
            f0_mean = np.mean(f0_voiced)
            # Relative F0 deviation below 5% counts as a sustained note.
            if f0_std / (f0_mean + 1e-6) < 0.05:
                is_sustained[i] = True

    # Apply envelope stabilization to each contiguous sustained run.
    result = audio.copy()
    i = 0
    while i < n_audio_frames:
        if is_sustained[i]:
            # Collect the whole contiguous sustained run.
            start_frame = i
            while i < n_audio_frames and is_sustained[i]:
                i += 1
            end_frame = i
            start_sample = start_frame * hop_length
            end_sample = min(end_frame * hop_length + frame_length, len(audio))
            if end_sample - start_sample < frame_length:
                continue  # too short to filter; i is already past the run
            sustained_segment = audio[start_sample:end_sample]
            # Hilbert amplitude envelope, smoothed with a 50 Hz zero-phase
            # low-pass.
            envelope = np.abs(signal.hilbert(sustained_segment))
            b, a = signal.butter(2, 50 / (sr / 2), btype='low')
            smoothed_envelope = signal.filtfilt(b, a, envelope)
            # Blend toward the smoothed envelope only where the raw
            # envelope deviates strongly (top quartile of deviations);
            # elsewhere the original envelope is kept.
            envelope_variation = np.abs(envelope - smoothed_envelope)
            variation_threshold = np.percentile(envelope_variation, 75)
            blend_mask = np.clip(envelope_variation / (variation_threshold + 1e-6), 0, 1)
            target_envelope = smoothed_envelope * blend_mask + envelope * (1 - blend_mask)
            if np.max(envelope) > 1e-6:
                # Per-sample gain toward the target envelope, clipped to
                # [0.5, 2.0] so the correction cannot blow up near-silent
                # samples.
                gain = target_envelope / (envelope + 1e-6)
                gain = np.clip(gain, 0.5, 2.0)
                result[start_sample:end_sample] = sustained_segment * gain
        i += 1
    return result
def apply_vocoder_artifact_fix(
    audio: np.ndarray,
    sr: int,
    f0: Optional[np.ndarray] = None,
    chunk_boundaries: Optional[list] = None,
    fix_phase: bool = True,
    fix_breath: bool = True,
    fix_sustained: bool = True
) -> np.ndarray:
    """Run the complete vocoder-artifact repair pipeline.

    Applies, in order: phase-discontinuity repair (sustained-note
    tearing), breath electric-noise reduction, and sustained-note
    stabilization. Each stage can be toggled independently.

    Args:
        audio: 1-D audio samples.
        sr: sample rate in Hz.
        f0: optional F0 contour forwarded to the breath and sustain stages.
        chunk_boundaries: chunk boundary indices forwarded to the phase stage.
        fix_phase: enable phase-discontinuity repair.
        fix_breath: enable breath electric-noise reduction.
        fix_sustained: enable sustained-note stabilization.

    Returns:
        The repaired audio.
    """
    # Stage table: (enabled flag, processing callable), run in order.
    pipeline = [
        (fix_phase, lambda x: fix_phase_discontinuity(x, sr, chunk_boundaries)),
        (fix_breath, lambda x: reduce_breath_electric_noise(x, sr, f0)),
        (fix_sustained, lambda x: stabilize_sustained_notes(x, sr, f0)),
    ]
    processed = audio.copy()
    for enabled, stage in pipeline:
        if enabled:
            processed = stage(processed)
    return processed