Spaces:
Sleeping
Sleeping
File size: 4,516 Bytes
"""Audio preprocessing: resampling, normalization, silence trimming."""
import logging
import numpy as np
import librosa
import pyloudnorm as pyln
from src.ingestion.schemas import PreprocessedAudio
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class AudioPreprocessor:
    """Preprocess audio for feature extraction.

    :meth:`process` runs up to three steps, in this order: resample to
    ``target_sr``, loudness-normalize to ``target_loudness``, and trim
    leading/trailing silence. Normalization and trimming are best-effort:
    if either step raises, a warning is logged and the step is skipped.
    """

    def __init__(
        self,
        target_sr: int = 16000,
        normalize_loudness: bool = True,
        target_loudness: float = -23.0,
        trim_silence: bool = True,
        top_db: float = 30.0,
    ):
        """
        Initialize preprocessor.

        Args:
            target_sr: Target sample rate (Hz)
            normalize_loudness: Apply loudness normalization (ITU-R BS.1770)
            target_loudness: Target loudness in LUFS
            trim_silence: Trim leading/trailing silence
            top_db: Threshold for silence trimming (dB)
        """
        self.target_sr = target_sr
        self.normalize_loudness = normalize_loudness
        self.target_loudness = target_loudness
        self.trim_silence = trim_silence
        self.top_db = top_db

        # Always define the attribute so it exists even when normalization is
        # disabled at construction time; process() creates the meter lazily
        # if normalization is switched on afterwards.
        self.meter = pyln.Meter(target_sr) if normalize_loudness else None

    def process(
        self, waveform: np.ndarray, sr: int, original_duration: float
    ) -> "PreprocessedAudio":
        """
        Preprocess audio waveform.

        The input array is copied before processing, so the caller's array is
        not modified. Sample counts are taken with ``len()``, i.e. along the
        first axis (presumably a 1-D mono signal -- confirm against callers).

        Args:
            waveform: Input waveform
            sr: Input sample rate
            original_duration: Original duration (seconds); passed through
                unchanged to the result

        Returns:
            PreprocessedAudio with the processed waveform, its sample rate
            and duration, and flags recording which steps were applied
        """
        logger.info("Preprocessing audio: %s samples at %sHz", len(waveform), sr)

        processed_waveform = waveform.copy()
        resampled = False
        normalized = False
        trimmed = False

        # 1. Resample to target sample rate
        if sr != self.target_sr:
            logger.debug("Resampling from %sHz to %sHz", sr, self.target_sr)
            processed_waveform = librosa.resample(
                processed_waveform, orig_sr=sr, target_sr=self.target_sr
            )
            sr = self.target_sr
            resampled = True

        # 2. Normalize loudness (ITU-R BS.1770)
        if self.normalize_loudness:
            try:
                if self.meter is None:
                    # Normalization was enabled after construction.
                    self.meter = pyln.Meter(self.target_sr)

                # Measure loudness
                loudness = self.meter.integrated_loudness(processed_waveform)
                logger.debug("Original loudness: %.2f LUFS", loudness)

                if np.isfinite(loudness):
                    # Normalize to target loudness
                    processed_waveform = pyln.normalize.loudness(
                        processed_waveform, loudness, self.target_loudness
                    )
                    normalized = True
                    logger.debug("Normalized to %s LUFS", self.target_loudness)
                else:
                    # A non-finite reading (e.g. from digital silence) would
                    # turn into an infinite gain and fill the waveform with
                    # NaN/inf, so leave the audio untouched instead.
                    logger.warning(
                        "Loudness normalization skipped: measured loudness "
                        "is %s LUFS.",
                        loudness,
                    )
            except Exception as e:
                # Deliberately broad: normalization is best-effort.
                logger.warning("Loudness normalization failed: %s. Skipping.", e)

        # 3. Trim silence
        if self.trim_silence:
            try:
                # Length right before trimming, so the log line reflects what
                # trimming removed rather than the effect of resampling.
                samples_before_trim = len(processed_waveform)
                processed_waveform, _ = librosa.effects.trim(
                    processed_waveform, top_db=self.top_db
                )
                trimmed = True
                logger.debug(
                    "Trimmed silence: %s → %s samples",
                    samples_before_trim,
                    len(processed_waveform),
                )
            except Exception as e:
                # Deliberately broad: trimming is best-effort.
                logger.warning("Silence trimming failed: %s. Skipping.", e)

        # Calculate final duration
        duration_sec = len(processed_waveform) / sr

        logger.info(
            "Preprocessing complete: %.2fs (resampled=%s, normalized=%s, trimmed=%s)",
            duration_sec,
            resampled,
            normalized,
            trimmed,
        )

        return PreprocessedAudio(
            waveform=processed_waveform,
            sample_rate=sr,
            duration_sec=duration_sec,
            original_duration_sec=original_duration,
            resampled=resampled,
            normalized=normalized,
            trimmed=trimmed,
            vad_applied=False,  # VAD is applied separately
        )

    @staticmethod
    def ensure_mono(waveform: np.ndarray) -> np.ndarray:
        """Convert stereo to mono if needed.

        A 2-D array is averaged along axis 0 (assumes a channels-first
        ``(channels, samples)`` layout -- confirm against callers). Any other
        input is returned unchanged.
        """
        if waveform.ndim == 2:
            return np.mean(waveform, axis=0)
        return waveform

    @staticmethod
    def normalize_amplitude(waveform: np.ndarray, target_peak: float = 0.95) -> np.ndarray:
        """Normalize amplitude to target peak.

        Scales the waveform so that its largest absolute sample equals
        ``target_peak``. Empty and all-zero inputs are returned unchanged.
        """
        # .max() raises ValueError on a zero-size array, so handle it first.
        if waveform.size == 0:
            return waveform

        peak = np.abs(waveform).max()
        if peak > 0:
            return waveform * (target_peak / peak)
        return waveform
|