File size: 7,956 Bytes
e7bf1e6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 |
"""
Audio Preprocessing Module
Enhances audio quality before source separation and transcription.
Preprocessing Steps:
1. Spectral denoising - Remove background noise and artifacts
2. Peak normalization - Normalize volume to consistent level
3. High-pass filtering - Remove rumble and DC offset
4. Resampling - Ensure consistent sample rate
Target: +2-5% accuracy improvement on noisy/compressed YouTube audio
"""
from pathlib import Path
from typing import Optional
import numpy as np
import librosa
import soundfile as sf
class AudioPreprocessor:
    """
    Audio preprocessing for improving transcription accuracy.

    Mitigates common issues with YouTube audio:
    - Compression artifacts (lossy codecs)
    - Background noise (ambient, microphone noise)
    - Inconsistent levels (quiet vs loud recordings)
    - Low-frequency rumble (not musical, degrades separation)

    Every processing stage works on arrays shaped (channels, samples);
    mono input is temporarily promoted to a single-channel 2-D array
    inside ``preprocess`` so the stages can treat both cases uniformly.
    """

    def __init__(
        self,
        enable_denoising: bool = True,
        enable_normalization: bool = True,
        enable_highpass: bool = True,
        target_sample_rate: int = 44100
    ):
        """
        Initialize audio preprocessor.

        Args:
            enable_denoising: Enable spectral denoising
            enable_normalization: Enable peak normalization
            enable_highpass: Enable high-pass filter (remove rumble)
            target_sample_rate: Target sample rate (Hz)
        """
        self.enable_denoising = enable_denoising
        self.enable_normalization = enable_normalization
        self.enable_highpass = enable_highpass
        self.target_sample_rate = target_sample_rate

    def preprocess(
        self,
        audio_path: Path,
        output_dir: Optional[Path] = None
    ) -> Path:
        """
        Preprocess audio file for improved transcription quality.

        Runs the enabled stages in order (denoise, normalize, high-pass,
        resample) and writes the result as ``<stem>_preprocessed.wav``.

        Args:
            audio_path: Input audio file
            output_dir: Output directory (default: same as input); it is
                created if it does not exist

        Returns:
            Path to preprocessed audio file
        """
        # Accept plain strings as well as Path objects.
        audio_path = Path(audio_path)
        output_dir = audio_path.parent if output_dir is None else Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        output_path = output_dir / f"{audio_path.stem}_preprocessed.wav"

        print(f" Preprocessing audio: {audio_path.name}")

        # Load audio at its native sample rate (preserve stereo if present)
        y, sr = librosa.load(str(audio_path), sr=None, mono=False)

        # Handle stereo vs mono: any 2-D result is treated as multi-channel
        is_stereo = y.ndim == 2
        if is_stereo:
            print(f" Input: stereo, {sr}Hz")
        else:
            print(f" Input: mono, {sr}Hz")
            # Make it (1, samples) for uniform processing
            y = np.expand_dims(y, axis=0)

        # 1. Spectral denoising
        if self.enable_denoising:
            print(" Applying spectral denoising...")
            y = self._denoise(y, sr, is_stereo)

        # 2. Peak normalization
        # NOTE: the filter and resampler below run after this step, so the
        # final peak may deviate slightly from the normalization target.
        if self.enable_normalization:
            print(" Normalizing volume...")
            y = self._normalize(y)

        # 3. High-pass filter (remove rumble <30Hz)
        if self.enable_highpass:
            print(" Applying high-pass filter (30Hz cutoff)...")
            y = self._highpass_filter(y, sr)

        # 4. Resample to target sample rate
        if sr != self.target_sample_rate:
            print(f" Resampling: {sr}Hz → {self.target_sample_rate}Hz")
            y = self._resample(y, sr, self.target_sample_rate)
            sr = self.target_sample_rate

        # Convert back to mono if input was mono
        if not is_stereo:
            y = y[0]  # Remove channel dimension

        # Save preprocessed audio; multi-channel data is transposed to
        # (samples, channels) before being handed to soundfile.
        sf.write(output_path, y.T if is_stereo else y, sr)
        print(f" ✓ Preprocessed audio saved: {output_path.name}")
        return output_path

    def _denoise(self, y: np.ndarray, sr: int, is_stereo: bool) -> np.ndarray:
        """
        Apply spectral denoising using noisereduce library.

        If ``noisereduce`` is not installed a warning is printed and the
        input is returned unchanged (denoising is best-effort).

        Args:
            y: Audio data (channels, samples)
            sr: Sample rate
            is_stereo: Whether audio is stereo (currently unused; channels
                are always processed one by one)

        Returns:
            Denoised audio, same shape and dtype as ``y``
        """
        try:
            import noisereduce as nr
        except ImportError:
            print(" ⚠ noisereduce not installed, skipping denoising")
            return y

        # Apply denoising per channel
        y_denoised = np.zeros_like(y)
        for ch in range(y.shape[0]):
            y_denoised[ch] = nr.reduce_noise(
                y=y[ch],
                sr=sr,
                stationary=True,  # Assume noise is stationary (consistent background)
                prop_decrease=0.8  # Aggressiveness (0-1, higher = more aggressive)
            )
        return y_denoised

    def _normalize(self, y: np.ndarray, target_db: float = -1.0) -> np.ndarray:
        """
        Normalize audio to target peak level.

        A single gain is applied to all channels so the inter-channel
        balance is preserved. Empty or all-zero input is returned unchanged.

        Args:
            y: Audio data
            target_db: Target peak level in dB (default: -1dB = almost full scale)

        Returns:
            Normalized audio
        """
        # max() of a zero-size array raises, so bail out early
        if y.size == 0:
            return y

        # Find peak across all channels
        peak = np.abs(y).max()
        if peak == 0:
            return y  # Avoid division by zero

        # Calculate gain to reach target peak (dB -> linear amplitude)
        target_linear = 10 ** (target_db / 20.0)
        gain = target_linear / peak
        return y * gain

    def _highpass_filter(
        self,
        y: np.ndarray,
        sr: int,
        cutoff_hz: float = 30.0
    ) -> np.ndarray:
        """
        Apply high-pass filter to remove low-frequency rumble.

        Args:
            y: Audio data (channels, samples)
            sr: Sample rate
            cutoff_hz: Cutoff frequency (Hz); scipy rejects cutoffs at or
                above the Nyquist frequency (sr / 2)

        Returns:
            Filtered audio, same shape and dtype as ``y``
        """
        from scipy.signal import butter, sosfilt

        # Design 4th-order Butterworth high-pass filter (second-order
        # sections for numerical stability at low cutoff frequencies)
        sos = butter(4, cutoff_hz, 'hp', fs=sr, output='sos')

        # Filter along the sample axis; each channel is filtered independently
        y_filtered = sosfilt(sos, y, axis=-1)

        # sosfilt computes in double precision; hand back the input dtype
        return y_filtered.astype(y.dtype, copy=False)

    def _resample(
        self,
        y: np.ndarray,
        orig_sr: int,
        target_sr: int
    ) -> np.ndarray:
        """
        Resample audio to target sample rate.

        The output length is whatever librosa produces for each channel.
        It is deliberately not precomputed: librosa rounds the length up,
        so a truncating estimate comes up one sample short for non-integer
        ratios (e.g. 48000 -> 44100) and the copy into a preallocated
        buffer fails.

        Args:
            y: Audio data (channels, samples)
            orig_sr: Original sample rate
            target_sr: Target sample rate

        Returns:
            Resampled audio (channels, new_samples)
        """
        channels = [
            librosa.resample(y[ch], orig_sr=orig_sr, target_sr=target_sr)
            for ch in range(y.shape[0])
        ]
        if not channels:
            # No channels to stack; return an empty array of matching length
            n_out = int(np.ceil(y.shape[1] * target_sr / orig_sr))
            return np.zeros((0, n_out), dtype=y.dtype)
        return np.stack(channels, axis=0)
if __name__ == "__main__":
    # Command-line smoke test: run the preprocessor on a single file.
    import argparse

    cli = argparse.ArgumentParser(description="Test Audio Preprocessor")
    cli.add_argument("audio_file", type=str, help="Path to audio file")
    cli.add_argument(
        "--output",
        type=str,
        default="./output_audio",
        help="Output directory for preprocessed audio",
    )
    # Each switch turns off one stage; every stage is enabled by default.
    for flag, help_text in (
        ("--no-denoise", "Disable denoising"),
        ("--no-normalize", "Disable normalization"),
        ("--no-highpass", "Disable high-pass filter"),
    ):
        cli.add_argument(flag, action="store_true", help=help_text)
    opts = cli.parse_args()

    source_file = Path(opts.audio_file)
    destination_dir = Path(opts.output)

    runner = AudioPreprocessor(
        enable_denoising=not opts.no_denoise,
        enable_normalization=not opts.no_normalize,
        enable_highpass=not opts.no_highpass,
    )

    # Preprocess and report where the result was written
    result_path = runner.preprocess(source_file, destination_dir)
    print(f"\n✓ Preprocessing complete: {result_path}")
|