File size: 7,426 Bytes
b6f9c90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# -*- coding: utf-8 -*-
"""

音频后处理模块 - 齿音和呼吸音处理

基于研究文献的最佳实践

"""
import numpy as np
from scipy import signal
from typing import Optional


def detect_sibilance_frames(audio: np.ndarray, sr: int, threshold_db: float = -20.0) -> np.ndarray:
    """
    Flag the frames that contain sibilance (high-frequency consonants such as s, sh, ch, z).

    Reference: "Managing Sibilance" - Sound on Sound.
    Sibilance energy is concentrated mainly in the 4-10 kHz band.

    The signal is band-pass filtered to 4-10 kHz (the upper edge is capped at
    0.99 * Nyquist) and cut into 20 ms frames with a 10 ms hop. A frame is
    flagged when its band energy exceeds ``threshold_db`` AND the band holds
    more than 30 % of the frame's total energy.

    Args:
        audio: Audio samples; converted to float64 internally so integer
            input cannot overflow when squared.
            NOTE(review): framing indexes the first axis, so this presumably
            expects mono 1-D audio - confirm against callers.
        sr: Sample rate in Hz.
        threshold_db: Band-energy threshold in dB, compared against
            10 * log10 of the summed squared band samples of a frame.

    Returns:
        Boolean array with one entry per frame; True marks a sibilant frame.
        The array is empty when the audio is shorter than one frame (or the
        sample rate is too low to form a hop), and all False when the sample
        rate cannot represent the 4 kHz lower band edge.
    """
    frame_length = int(0.02 * sr)  # 20 ms frame
    hop_length = int(0.01 * sr)    # 10 ms hop

    # Too little audio for even one frame (or a degenerate hop): report no
    # frames instead of asking NumPy for a negative-length array or dividing
    # by a zero hop.
    if hop_length <= 0 or len(audio) < frame_length:
        return np.zeros(0, dtype=bool)

    n_frames = 1 + (len(audio) - frame_length) // hop_length

    # Normalised band edges for the 4-10 kHz band-pass filter.
    nyquist = sr / 2
    low_edge = 4000 / nyquist
    high_edge = min(10000 / nyquist, 0.99)
    if low_edge >= high_edge:
        # The sample rate is too low to contain the sibilance band, and
        # butter() would reject these edges; nothing can be sibilant here.
        return np.zeros(n_frames, dtype=bool)

    samples = np.asarray(audio, dtype=np.float64)
    sos = signal.butter(4, [low_edge, high_edge], btype='band', output='sos')
    band_audio = signal.sosfilt(sos, samples)

    # Per-frame energy of the sibilance band and of the full signal.
    # n_frames is chosen so every frame lies fully inside the signal.
    band_energy = np.zeros(n_frames)
    total_energy = np.zeros(n_frames)
    for i in range(n_frames):
        start = i * hop_length
        end = start + frame_length
        band_energy[i] = np.sum(band_audio[start:end] ** 2)
        total_energy[i] = np.sum(samples[start:end] ** 2)

    # Share of each frame's energy that sits in the band; frames that are
    # effectively silent keep a ratio of 0 to avoid dividing by ~0.
    high_ratio = np.zeros_like(band_energy)
    audible = total_energy > 1e-10
    high_ratio[audible] = band_energy[audible] / total_energy[audible]

    # Band energy in dB (the small offset keeps log10 finite for silence).
    band_energy_db = 10 * np.log10(band_energy + 1e-10)

    # Sibilance: strong band energy that also dominates the frame.
    return (band_energy_db > threshold_db) & (high_ratio > 0.3)


def reduce_sibilance(audio: np.ndarray, sr: int, reduction_db: float = 6.0) -> np.ndarray:
    """
    Attenuate sibilant passages (de-essing).

    Reference: "Advanced Sibilance Control" - Mike's Mix Master.

    Frames reported by ``detect_sibilance_frames`` each receive a gain dip
    (fade down, hold, fade up) that is applied to the whole signal in the
    time domain, so no band splitting is involved.

    Args:
        audio: Audio samples.
        sr: Sample rate in Hz.
        reduction_db: Attenuation applied at the bottom of the dip, in dB.

    Returns:
        The processed audio. When no frame is flagged, the input array
        itself is returned unchanged.
    """
    flagged = detect_sibilance_frames(audio, sr)
    if not np.any(flagged):
        return audio

    n_samples = len(audio)
    win = int(0.02 * sr)   # 20 ms frame, same framing as the detector
    hop = int(0.01 * sr)   # 10 ms hop
    floor_gain = 10 ** (-reduction_db / 20)

    # The dip has the same shape for every frame, so build it once:
    # quarter-frame fade down, half-frame hold, quarter-frame fade up.
    quarter = win // 4
    dip = np.concatenate([
        np.linspace(1.0, floor_gain, quarter),
        np.full(win // 2, floor_gain),
        np.linspace(floor_gain, 1.0, quarter),
    ])

    gains = np.ones(n_samples)
    for idx in np.flatnonzero(flagged):
        begin = idx * hop
        if begin + win > n_samples:
            break
        # Overlapping frames keep whichever gain is lower at each sample.
        segment = gains[begin:begin + dip.size]
        np.minimum(segment, dip, out=segment)

    # Apply the gain curve directly in the time domain.
    return audio * gains


def detect_breath_frames(audio: np.ndarray, sr: int, threshold_db: float = -40.0) -> np.ndarray:
    """
    Flag the frames that look like breath noise.

    Breath characteristics used here:
    - low energy
    - broadband, noise-like spectrum
    (breaths usually occur between phrases)

    The signal is cut into 20 ms frames with a 10 ms hop. A frame is flagged
    when its energy is below ``threshold_db`` AND the spectral flatness of
    its magnitude spectrum (geometric mean / arithmetic mean) exceeds 0.5.

    Args:
        audio: Audio samples; converted to float64 internally so integer
            input cannot overflow when squared.
            NOTE(review): framing indexes the first axis, so this presumably
            expects mono 1-D audio - confirm against callers.
        sr: Sample rate in Hz.
        threshold_db: Energy threshold in dB, compared against 10 * log10 of
            the summed squared samples of a frame.

    Returns:
        Boolean array with one entry per frame; True marks a breath frame.
        The array is empty when the audio is shorter than one frame (or the
        sample rate is too low to form a hop).
    """
    frame_length = int(0.02 * sr)  # 20 ms frame
    hop_length = int(0.01 * sr)    # 10 ms hop

    # Too little audio for even one frame (or a degenerate hop): report no
    # frames instead of asking NumPy for a negative-length array or dividing
    # by a zero hop.
    if hop_length <= 0 or len(audio) < frame_length:
        return np.zeros(0, dtype=bool)

    samples = np.asarray(audio, dtype=np.float64)
    n_frames = 1 + (len(samples) - frame_length) // hop_length
    is_breath = np.zeros(n_frames, dtype=bool)

    # n_frames is chosen so every frame lies fully inside the signal.
    for i in range(n_frames):
        start = i * hop_length
        frame = samples[start:start + frame_length]

        # Frame energy in dB (the small offset keeps log10 finite for silence).
        energy_db = 10 * np.log10(np.sum(frame ** 2) + 1e-10)
        if energy_db >= threshold_db:
            # Too loud to be a breath; skip the spectrum computation.
            continue

        # Spectral flatness: near 1 for noise-like spectra, near 0 for tonal ones.
        spectrum = np.abs(np.fft.rfft(frame))
        geometric_mean = np.exp(np.mean(np.log(spectrum + 1e-10)))
        arithmetic_mean = np.mean(spectrum)
        spectral_flatness = geometric_mean / (arithmetic_mean + 1e-10)

        # Breath: low energy combined with a flat spectrum.
        is_breath[i] = spectral_flatness > 0.5

    return is_breath


def reduce_breath_noise(audio: np.ndarray, sr: int, reduction_db: float = 12.0) -> np.ndarray:
    """
    Attenuate breath noise.

    Reference: "How to REALLY Clean Vocals" - Waves.

    Frames reported by ``detect_breath_frames`` each receive a gain dip
    (fade down, hold, fade up) that is multiplied onto the signal.

    Args:
        audio: Audio samples.
        sr: Sample rate in Hz.
        reduction_db: Attenuation applied at the bottom of the dip, in dB.

    Returns:
        The processed audio. When no frame is flagged, the input array
        itself is returned unchanged.
    """
    breath_mask = detect_breath_frames(audio, sr)
    if not np.any(breath_mask):
        return audio

    total = len(audio)
    frame_len = int(0.02 * sr)  # 20 ms frame, same framing as the detector
    hop_len = int(0.01 * sr)    # 10 ms hop
    attenuation = 10 ** (-reduction_db / 20)

    # One shared envelope for every frame; the fades avoid audible clicks
    # and the hold fills whatever the two fades leave of the frame.
    ramp = frame_len // 4
    hold = frame_len - 2 * ramp
    envelope = np.concatenate([
        np.linspace(1.0, attenuation, ramp),
        np.full(hold, attenuation),
        np.linspace(attenuation, 1.0, ramp),
    ])
    span = envelope.size

    gain = np.ones(total)
    for frame_idx, flagged in enumerate(breath_mask):
        if not flagged:
            continue
        offset = frame_idx * hop_len
        if offset + frame_len > total:
            break
        # Where frames overlap, keep the stronger attenuation.
        gain[offset:offset + span] = np.minimum(gain[offset:offset + span], envelope)

    # Apply the gain curve.
    return audio * gain


def apply_vocal_cleanup(
    audio: np.ndarray,
    sr: int,
    reduce_sibilance_enabled: bool = True,
    reduce_breath_enabled: bool = True,
    sibilance_reduction_db: float = 4.0,
    breath_reduction_db: float = 8.0
) -> np.ndarray:
    """
    Run the full vocal clean-up chain on a copy of the input.

    Breath reduction runs first, then sibilance reduction; each stage can be
    switched off independently.

    Args:
        audio: Audio samples (copied before processing).
        sr: Sample rate in Hz.
        reduce_sibilance_enabled: Whether to run the sibilance stage.
        reduce_breath_enabled: Whether to run the breath stage.
        sibilance_reduction_db: Sibilance attenuation in dB.
        breath_reduction_db: Breath attenuation in dB.

    Returns:
        The processed audio.
    """
    cleaned = audio.copy()

    # Breaths are handled first because they carry the lower energy.
    if reduce_breath_enabled:
        cleaned = reduce_breath_noise(cleaned, sr, breath_reduction_db)

    if not reduce_sibilance_enabled:
        return cleaned

    return reduce_sibilance(cleaned, sr, sibilance_reduction_db)