"""
Audio Preprocessing Module

Enhances audio quality before source separation and transcription.

Preprocessing Steps:
1. Spectral denoising - Remove background noise and artifacts
2. Peak normalization - Normalize volume to consistent level
3. High-pass filtering - Remove rumble and DC offset
4. Resampling - Ensure consistent sample rate

Target: +2-5% accuracy improvement on noisy/compressed YouTube audio
"""
|
|
|
|
|
from pathlib import Path |
|
|
from typing import Optional |
|
|
import numpy as np |
|
|
import librosa |
|
|
import soundfile as sf |
|
|
|
|
|
|
|
|
class AudioPreprocessor:
    """
    Audio preprocessing for improving transcription accuracy.

    Mitigates common issues with YouTube audio:
    - Compression artifacts (lossy codecs)
    - Background noise (ambient, microphone noise)
    - Inconsistent levels (quiet vs loud recordings)
    - Low-frequency rumble (not musical, degrades separation)

    Every stage operates on arrays laid out as (channels, samples); mono
    input is temporarily given a leading channel axis so that the same code
    path serves both mono and multichannel audio.
    """

    def __init__(
        self,
        enable_denoising: bool = True,
        enable_normalization: bool = True,
        enable_highpass: bool = True,
        target_sample_rate: int = 44100
    ):
        """
        Initialize audio preprocessor.

        Args:
            enable_denoising: Enable spectral denoising
            enable_normalization: Enable peak normalization
            enable_highpass: Enable high-pass filter (remove rumble)
            target_sample_rate: Target sample rate (Hz)
        """
        self.enable_denoising = enable_denoising
        self.enable_normalization = enable_normalization
        self.enable_highpass = enable_highpass
        self.target_sample_rate = target_sample_rate

    def preprocess(
        self,
        audio_path: Path,
        output_dir: Optional[Path] = None
    ) -> Path:
        """
        Preprocess audio file for improved transcription quality.

        Runs the enabled stages in a fixed order (denoise, normalize,
        high-pass, resample) and writes the result next to the input, or into
        ``output_dir``, as ``<input stem>_preprocessed.wav``.

        Args:
            audio_path: Input audio file (a plain string path is accepted too)
            output_dir: Output directory (default: same as input); created if
                it does not exist

        Returns:
            Path to preprocessed audio file
        """
        # Coerce so that callers handing in plain strings do not fail on
        # ``.parent`` / ``.mkdir`` below.
        audio_path = Path(audio_path)
        output_dir = audio_path.parent if output_dir is None else Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_dir / f"{audio_path.stem}_preprocessed.wav"

        print(f" Preprocessing audio: {audio_path.name}")

        # sr=None keeps the file's native rate; mono=False keeps all channels.
        y, sr = librosa.load(str(audio_path), sr=None, mono=False)

        # A 2-D result is treated as stereo; a 1-D result is mono and gets a
        # channel axis so every stage can assume (channels, samples).
        is_stereo = y.ndim == 2
        if is_stereo:
            print(f" Input: stereo, {sr}Hz")
        else:
            print(f" Input: mono, {sr}Hz")
            y = np.expand_dims(y, axis=0)

        if self.enable_denoising:
            print(" Applying spectral denoising...")
            y = self._denoise(y, sr, is_stereo)

        if self.enable_normalization:
            print(" Normalizing volume...")
            y = self._normalize(y)

        # NOTE(review): filtering and resampling run after normalization, so
        # the final peak is not guaranteed to equal the normalization target.
        if self.enable_highpass:
            print(" Applying high-pass filter (30Hz cutoff)...")
            y = self._highpass_filter(y, sr)

        if sr != self.target_sample_rate:
            print(f" Resampling: {sr}Hz → {self.target_sample_rate}Hz")
            y = self._resample(y, sr, self.target_sample_rate)
            sr = self.target_sample_rate

        # Undo the internal layout for writing: stereo is transposed to
        # (samples, channels), mono drops the synthetic channel axis.
        samples = y.T if is_stereo else y[0]
        sf.write(output_path, samples, sr)
        print(f" ✓ Preprocessed audio saved: {output_path.name}")

        return output_path

    def _denoise(self, y: np.ndarray, sr: int, is_stereo: bool) -> np.ndarray:
        """
        Apply spectral denoising using noisereduce library.

        noisereduce is an optional dependency: when it cannot be imported a
        warning is printed and the input is returned unchanged.

        Args:
            y: Audio data (channels, samples)
            sr: Sample rate
            is_stereo: Whether audio is stereo (currently unused; each channel
                is processed independently regardless)

        Returns:
            Denoised audio
        """
        try:
            import noisereduce as nr
        except ImportError:
            print(" ⚠ noisereduce not installed, skipping denoising")
            return y

        # Same shape and dtype as the input; filled one channel at a time.
        y_denoised = np.zeros_like(y)
        for ch, channel in enumerate(y):
            y_denoised[ch] = nr.reduce_noise(
                y=channel,
                sr=sr,
                stationary=True,
                prop_decrease=0.8
            )

        return y_denoised

    def _normalize(self, y: np.ndarray, target_db: float = -1.0) -> np.ndarray:
        """
        Normalize audio to target peak level.

        A single gain is computed from the largest absolute sample across all
        channels, so the balance between channels is preserved.

        Args:
            y: Audio data
            target_db: Target peak level in dB (default: -1dB = almost full scale)

        Returns:
            Normalized audio; the input itself is returned when it is empty
            or entirely silent
        """
        # max() of a zero-size array raises, and there is nothing to scale.
        if y.size == 0:
            return y

        peak = np.abs(y).max()

        # Pure silence: avoid dividing by zero.
        if peak == 0:
            return y

        # Convert the dB target to a linear amplitude, then scale so the
        # current peak lands exactly on it.
        target_linear = 10 ** (target_db / 20.0)
        gain = target_linear / peak

        return y * gain

    def _highpass_filter(
        self,
        y: np.ndarray,
        sr: int,
        cutoff_hz: float = 30.0
    ) -> np.ndarray:
        """
        Apply high-pass filter to remove low-frequency rumble.

        Uses a 4th-order Butterworth filter in second-order-section form.

        Args:
            y: Audio data (channels, samples)
            sr: Sample rate
            cutoff_hz: Cutoff frequency (Hz)

        Returns:
            Filtered audio, same shape and dtype as the input

        Raises:
            ValueError: Propagated from scipy when ``cutoff_hz`` is not
                strictly between 0 and ``sr / 2``.
        """
        from scipy.signal import butter, sosfilt

        sos = butter(4, cutoff_hz, 'hp', fs=sr, output='sos')

        # Filter along the sample axis so all channels are handled in one
        # call, then cast back so the output dtype matches the input.
        return sosfilt(sos, y, axis=-1).astype(y.dtype, copy=False)

    def _resample(
        self,
        y: np.ndarray,
        orig_sr: int,
        target_sr: int
    ) -> np.ndarray:
        """
        Resample audio to target sample rate.

        Args:
            y: Audio data (channels, samples)
            orig_sr: Original sample rate
            target_sr: Target sample rate

        Returns:
            Resampled audio (channels, new_samples); the input itself when
            both rates are equal
        """
        if orig_sr == target_sr:
            return y

        # Let the resampler decide the output length. Preallocating
        # int(n * target_sr / orig_sr) samples truncates, whereas the
        # resampler rounds up, so the sizes disagree for most rate pairs.
        resampled_channels = [
            librosa.resample(channel, orig_sr=orig_sr, target_sr=target_sr)
            for channel in y
        ]

        return np.stack(resampled_channels)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Small command-line harness: run the preprocessor on a single file.
    import argparse

    cli = argparse.ArgumentParser(description="Test Audio Preprocessor")
    cli.add_argument("audio_file", type=str, help="Path to audio file")
    cli.add_argument(
        "--output",
        type=str,
        default="./output_audio",
        help="Output directory for preprocessed audio",
    )
    # Every stage is enabled by default; each switch turns one stage off.
    for flag, description in (
        ("--no-denoise", "Disable denoising"),
        ("--no-normalize", "Disable normalization"),
        ("--no-highpass", "Disable high-pass filter"),
    ):
        cli.add_argument(flag, action="store_true", help=description)
    options = cli.parse_args()

    runner = AudioPreprocessor(
        enable_denoising=not options.no_denoise,
        enable_normalization=not options.no_normalize,
        enable_highpass=not options.no_highpass,
    )

    result_path = runner.preprocess(Path(options.audio_file), Path(options.output))
    print(f"\n✓ Preprocessing complete: {result_path}")
|
|
|