# -*- coding: utf-8 -*-
"""
Vocoder伪影修复 - 针对呼吸音电音和长音撕裂
基于RVC社区反馈和研究文献
"""
import numpy as np
from scipy import signal
from typing import Optional
def fix_phase_discontinuity(audio: np.ndarray, sr: int, chunk_boundaries: Optional[list] = None) -> np.ndarray:
    """Repair tearing artifacts caused by phase discontinuities.

    Vocoders can produce phase discontinuities on sustained notes, which
    are audible as tearing.  This detects abrupt jumps in the instantaneous
    phase (via the Hilbert analytic signal) and smooths each jump out over
    a short linear transition.

    Reference: "Prosody-Guided Harmonic Attention for Phase-Coherent Neural
    Vocoding" (arXiv:2601.14472).

    Args:
        audio: 1-D audio samples.
        sr: sample rate in Hz.
        chunk_boundaries: chunk boundary sample indices; currently unused,
            kept for interface compatibility.

    Returns:
        The repaired audio (float32), or the input array unchanged when no
        discontinuity is found.
    """
    # hilbert() fails on empty input and np.diff needs at least 2 samples.
    if len(audio) < 2:
        return audio
    # Instantaneous phase / amplitude from the analytic signal.
    analytic_signal = signal.hilbert(audio)
    instantaneous_phase = np.unwrap(np.angle(analytic_signal))
    amplitude = np.abs(analytic_signal)
    # Detect abrupt phase jumps: anything well beyond the 99th percentile.
    phase_diff = np.diff(instantaneous_phase)
    phase_diff_threshold = np.percentile(np.abs(phase_diff), 99) * 2.5
    discontinuities = np.where(np.abs(phase_diff) > phase_diff_threshold)[0]
    if len(discontinuities) == 0:
        return audio
    # Apply a cumulative correction after each discontinuity.
    phase_corrected = instantaneous_phase.copy()
    for disc_idx in discontinuities:
        phase_jump = phase_diff[disc_idx]
        # Spread the correction over up to 20 ms following the jump.
        correction_length = min(int(0.02 * sr), len(phase_corrected) - disc_idx - 1)
        if correction_length > 0:
            # Linear taper from full correction down to zero.
            correction_curve = np.linspace(phase_jump, 0, correction_length)
            phase_corrected[disc_idx + 1:disc_idx + 1 + correction_length] -= correction_curve
    # Resynthesize with the corrected phase and the original amplitude envelope.
    corrected_signal = amplitude * np.exp(1j * phase_corrected)
    return np.real(corrected_signal).astype(np.float32)
def reduce_breath_electric_noise(audio: np.ndarray, sr: int, f0: Optional[np.ndarray] = None) -> np.ndarray:
    """Suppress electronic noise in breath / low-energy regions.

    Reference: GitHub issue #65 "Artefacting when speech has breath".
    Vocoders tend to emit electronic noise in regions where F0 == 0.

    Pipeline:
      1. High-pass at 80 Hz to remove DC offset / low-frequency leakage.
      2. Frame-wise detection of noise-floor frames (low energy combined
         with either high spectral flatness or a large high-frequency
         energy share).
      3. Spectral-gate denoising of the detected frames, with strength
         scaled by how much noise was found.

    Args:
        audio: 1-D audio samples.
        sr: sample rate in Hz.
        f0: optional F0 track; voiced frames (F0 > 0) are never treated
            as noise.

    Returns:
        The processed audio.  Inputs shorter than one analysis frame are
        returned after high-pass filtering only.
    """
    # -- Step 1: 4th-order Butterworth high-pass, 80 Hz cutoff. --
    # Uses the module-level `signal` import; the original re-imported scipy here.
    nyquist = sr / 2
    cutoff = 80 / nyquist
    sos = signal.butter(4, cutoff, btype='highpass', output='sos')
    audio = signal.sosfilt(sos, audio)

    # -- Step 2: frame-wise noise detection. --
    frame_length = int(0.02 * sr)  # 20 ms
    hop_length = int(0.01 * sr)    # 10 ms
    # Too short for even one analysis frame: nothing more we can do
    # (the original crashed here with np.zeros(negative)).
    if len(audio) < frame_length:
        return audio
    n_frames = 1 + (len(audio) - frame_length) // hop_length

    energy = np.zeros(n_frames)
    spectral_flatness = np.zeros(n_frames)
    high_freq_ratio = np.zeros(n_frames)  # share of energy above 4 kHz
    for i in range(n_frames):
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            break
        frame = audio[start:end]
        energy[i] = np.sum(frame ** 2)
        # Spectral flatness (geometric / arithmetic mean): close to 1 for noise.
        fft = np.abs(np.fft.rfft(frame))
        if np.sum(fft) > 1e-10:
            geometric_mean = np.exp(np.mean(np.log(fft + 1e-10)))
            arithmetic_mean = np.mean(fft)
            spectral_flatness[i] = geometric_mean / (arithmetic_mean + 1e-10)
        # High-frequency (>= 4 kHz) energy share.
        freqs = np.fft.rfftfreq(len(frame), 1 / sr)
        high_freq_mask = freqs >= 4000
        high_freq_energy = np.sum(fft[high_freq_mask] ** 2)
        total_freq_energy = np.sum(fft ** 2)
        high_freq_ratio[i] = high_freq_energy / (total_freq_energy + 1e-10)

    # Frame energy in dB (not normalized; used only for ranking frames).
    energy_db = 10 * np.log10(energy + 1e-10)
    # Candidate noise-floor frames: bottom 5% by energy.
    candidate_threshold = np.percentile(energy_db, 5)
    is_candidate = energy_db < candidate_threshold
    # Noise type 1: wideband noise (spectral flatness > 0.35).
    is_wideband_noise = is_candidate & (spectral_flatness > 0.35)
    # Noise type 2: high-frequency hiss (high-frequency share > 0.15).
    is_highfreq_noise = is_candidate & (high_freq_ratio > 0.15)
    is_noise = is_wideband_noise | is_highfreq_noise

    # Almost no noise frames (< 1%): the audio is clean, leave it alone.
    noise_ratio = is_noise.sum() / len(is_noise)
    if noise_ratio < 0.01:
        return audio

    # When F0 is available, voiced frames (F0 > 0) are not noise.
    if f0 is not None and len(f0) > 0:
        f0_per_audio_frame = len(f0) / n_frames
        for i in range(n_frames):
            if not is_noise[i]:
                continue
            f0_idx = int(i * f0_per_audio_frame)
            if f0_idx < len(f0) and f0[f0_idx] > 0:
                is_noise[i] = False

    # -- Step 3: scale the cleanup strength by how noisy the audio is. --
    # More noise frames imply a worse vocoder output, so clean harder.
    if noise_ratio < 0.05:
        # Little noise (1-5%): gentle cleanup.
        spectral_threshold_percentile = 85  # keep the top 15% of bins
        magnitude_attenuation = 0.2         # attenuate the rest to 20%
        mix_ratio = 0.5                     # 50% cleaned signal in the blend
    elif noise_ratio < 0.15:
        # Moderate noise (5-15%).
        spectral_threshold_percentile = 90
        magnitude_attenuation = 0.1
        mix_ratio = 0.7
    else:
        # Heavy noise (> 15%): aggressive cleanup.
        spectral_threshold_percentile = 95
        magnitude_attenuation = 0.05
        mix_ratio = 0.85

    result = audio.copy()
    for i in range(n_frames):
        if not is_noise[i]:
            continue
        start = i * hop_length
        end = start + frame_length
        if end > len(audio):
            break
        frame = audio[start:end]
        fft = np.fft.rfft(frame)
        magnitude = np.abs(fft)
        phase = np.angle(fft)
        freqs = np.fft.rfftfreq(len(frame), 1 / sr)
        # Classify this frame: high-frequency hiss vs wideband noise.
        high_freq_mask = freqs >= 4000
        high_freq_energy = np.sum(magnitude[high_freq_mask] ** 2)
        total_freq_energy = np.sum(magnitude ** 2)
        frame_high_ratio = high_freq_energy / (total_freq_energy + 1e-10)
        if frame_high_ratio > 0.15:
            # High-frequency hiss: attenuate the high band hard ...
            magnitude[high_freq_mask] *= 0.05
            # ... and the mid band (1-4 kHz) gently.
            mid_freq_mask = (freqs >= 1000) & (freqs < 4000)
            magnitude[mid_freq_mask] *= 0.3
        else:
            # Wideband noise: spectral gate at the adaptive percentile.
            threshold = np.percentile(magnitude, spectral_threshold_percentile)
            magnitude = np.where(magnitude > threshold, magnitude, magnitude * magnitude_attenuation)
        # Resynthesize the cleaned frame.
        fft_cleaned = magnitude * np.exp(1j * phase)
        frame_cleaned = np.fft.irfft(fft_cleaned, n=len(frame))
        # Fade the cleaned frame's edges to avoid clicks at frame borders.
        fade_length = min(hop_length // 2, len(frame) // 4)
        if fade_length > 0:
            frame_cleaned[:fade_length] *= np.linspace(0, 1, fade_length)
            frame_cleaned[-fade_length:] *= np.linspace(1, 0, fade_length)
        # Blend original and cleaned frame by the adaptive mix ratio.
        result[start:end] = frame * (1 - mix_ratio) + frame_cleaned * mix_ratio
    return result
def stabilize_sustained_notes(audio: np.ndarray, sr: int, f0: Optional[np.ndarray] = None) -> np.ndarray:
    """Stabilize sustained notes to prevent tearing.

    Reference: "Mel Spectrogram Inversion with Stable Pitch" (Apple
    Research).  Vocoders drift in phase on long sustained notes; this
    detects regions with a stable F0 and smooths their amplitude envelope
    where it fluctuates abruptly.

    Args:
        audio: 1-D audio samples.
        sr: sample rate in Hz.
        f0: F0 track used to locate sustained notes; when absent, the
            audio is returned unchanged.

    Returns:
        The stabilized audio.
    """
    if f0 is None or len(f0) == 0:
        return audio
    frame_length = int(0.02 * sr)  # 20 ms
    hop_length = int(0.01 * sr)    # 10 ms
    # Too short for even one analysis frame: nothing to stabilize
    # (the original crashed here with np.zeros(negative) / ZeroDivisionError).
    if len(audio) < frame_length:
        return audio
    # Map F0 indices onto audio analysis frames.
    n_audio_frames = 1 + (len(audio) - frame_length) // hop_length
    f0_per_audio_frame = len(f0) / n_audio_frames
    is_sustained = np.zeros(n_audio_frames, dtype=bool)
    # A frame is "sustained" when F0 is mostly voiced and stable
    # (std/mean < 5%) within a 200 ms window around it.
    window_size = 20  # frames, i.e. 200 ms
    for i in range(window_size, n_audio_frames - window_size):
        f0_idx = int(i * f0_per_audio_frame)
        if f0_idx >= len(f0):
            break
        f0_window_start = max(0, f0_idx - window_size)
        f0_window_end = min(len(f0), f0_idx + window_size)
        f0_window = f0[f0_window_start:f0_window_end]
        # Ignore unvoiced (F0 == 0) samples.
        f0_voiced = f0_window[f0_window > 0]
        if len(f0_voiced) > window_size * 0.8:  # at least 80% voiced
            f0_std = np.std(f0_voiced)
            f0_mean = np.mean(f0_voiced)
            # F0 varying less than 5% counts as a sustained note.
            if f0_std / (f0_mean + 1e-6) < 0.05:
                is_sustained[i] = True
    # Smooth the amplitude envelope inside each sustained run.
    result = audio.copy()
    i = 0
    while i < n_audio_frames:
        if is_sustained[i]:
            # Find the extent of this sustained run.
            start_frame = i
            while i < n_audio_frames and is_sustained[i]:
                i += 1
            end_frame = i
            start_sample = start_frame * hop_length
            end_sample = min(end_frame * hop_length + frame_length, len(audio))
            if end_sample - start_sample < frame_length:
                continue
            sustained_segment = audio[start_sample:end_sample]
            # Amplitude envelope via the Hilbert analytic signal, then
            # low-pass (50 Hz) smoothing.
            envelope = np.abs(signal.hilbert(sustained_segment))
            b, a = signal.butter(2, 50 / (sr / 2), btype='low')
            smoothed_envelope = signal.filtfilt(b, a, envelope)
            # Only flatten where the envelope deviates strongly from its
            # smoothed version; elsewhere keep the original dynamics.
            envelope_variation = np.abs(envelope - smoothed_envelope)
            variation_threshold = np.percentile(envelope_variation, 75)
            blend_mask = np.clip(envelope_variation / (variation_threshold + 1e-6), 0, 1)
            target_envelope = smoothed_envelope * blend_mask + envelope * (1 - blend_mask)
            if np.max(envelope) > 1e-6:
                # Multiplicative gain (no division by the raw envelope,
                # which would amplify noise), clamped to [0.5, 2.0].
                gain = target_envelope / (envelope + 1e-6)
                gain = np.clip(gain, 0.5, 2.0)
                result[start_sample:end_sample] = sustained_segment * gain
        i += 1
    return result
def apply_vocoder_artifact_fix(
    audio: np.ndarray,
    sr: int,
    f0: Optional[np.ndarray] = None,
    chunk_boundaries: Optional[list] = None,
    fix_phase: bool = True,
    fix_breath: bool = True,
    fix_sustained: bool = True
) -> np.ndarray:
    """Run the full vocoder-artifact repair pipeline.

    Stages, applied in order when enabled:
      1. phase-discontinuity repair (tearing on sustained notes),
      2. breath electric-noise reduction,
      3. sustained-note stabilization.

    Args:
        audio: 1-D audio samples.
        sr: sample rate in Hz.
        f0: optional F0 track forwarded to the breath and sustained stages.
        chunk_boundaries: chunk boundary indices forwarded to the phase stage.
        fix_phase: enable phase-discontinuity repair.
        fix_breath: enable breath electric-noise reduction.
        fix_sustained: enable sustained-note stabilization.

    Returns:
        The repaired audio.
    """
    stages = (
        (fix_phase, lambda x: fix_phase_discontinuity(x, sr, chunk_boundaries)),
        (fix_breath, lambda x: reduce_breath_electric_noise(x, sr, f0)),
        (fix_sustained, lambda x: stabilize_sustained_notes(x, sr, f0)),
    )
    processed = audio.copy()
    for enabled, stage in stages:
        if enabled:
            processed = stage(processed)
    return processed