Spaces:
Sleeping
Sleeping
| """Audio preprocessing: resampling, normalization, silence trimming.""" | |
| import logging | |
| import numpy as np | |
| import librosa | |
| import pyloudnorm as pyln | |
| from src.ingestion.schemas import PreprocessedAudio | |
| logger = logging.getLogger(__name__) | |
class AudioPreprocessor:
    """Preprocess audio for feature extraction.

    The pipeline runs up to three steps in a fixed order: resampling to
    ``target_sr``, loudness normalization, and leading/trailing silence
    trimming. Normalization and trimming are best-effort: if either one
    raises, a warning is logged and that step is skipped.
    """

    def __init__(
        self,
        target_sr: int = 16000,
        normalize_loudness: bool = True,
        target_loudness: float = -23.0,
        trim_silence: bool = True,
        top_db: float = 30.0,
    ):
        """
        Initialize preprocessor.

        Args:
            target_sr: Target sample rate (Hz)
            normalize_loudness: Apply loudness normalization (ITU-R BS.1770)
            target_loudness: Target loudness in LUFS
            trim_silence: Trim leading/trailing silence
            top_db: Threshold for silence trimming (dB)
        """
        self.target_sr = target_sr
        self.normalize_loudness = normalize_loudness
        self.target_loudness = target_loudness
        self.trim_silence = trim_silence
        self.top_db = top_db

        # The meter is configured for the target rate because loudness is
        # measured after resampling. The attribute always exists (None when
        # normalization is disabled) so later code can test it safely.
        self.meter = pyln.Meter(target_sr) if normalize_loudness else None

    def process(
        self, waveform: np.ndarray, sr: int, original_duration: float
    ) -> PreprocessedAudio:
        """
        Preprocess audio waveform.

        The input array is copied first, so the caller's waveform is never
        modified.

        Args:
            waveform: Input waveform
            sr: Input sample rate
            original_duration: Original duration (seconds); passed through
                to the result unchanged

        Returns:
            PreprocessedAudio with preprocessed waveform, the final sample
            rate and duration, and flags recording which steps were applied
        """
        logger.info(f"Preprocessing audio: {len(waveform)} samples at {sr}Hz")

        processed_waveform = waveform.copy()
        resampled = False
        normalized = False
        trimmed = False

        # 1. Resample to target sample rate
        if sr != self.target_sr:
            logger.debug(f"Resampling from {sr}Hz to {self.target_sr}Hz")
            processed_waveform = librosa.resample(
                processed_waveform, orig_sr=sr, target_sr=self.target_sr
            )
            sr = self.target_sr
            resampled = True

        # 2. Normalize loudness (ITU-R BS.1770)
        if self.normalize_loudness:
            try:
                # Build the meter on demand in case normalization was
                # switched on after construction.
                if self.meter is None:
                    self.meter = pyln.Meter(self.target_sr)

                # Measure loudness
                loudness = self.meter.integrated_loudness(processed_waveform)
                logger.debug(f"Original loudness: {loudness:.2f} LUFS")

                if np.isfinite(loudness):
                    # Normalize to target loudness
                    processed_waveform = pyln.normalize.loudness(
                        processed_waveform, loudness, self.target_loudness
                    )
                    normalized = True
                    logger.debug(f"Normalized to {self.target_loudness} LUFS")
                else:
                    # A non-finite measurement (which the meter may report
                    # for silent input) would turn into a non-finite gain
                    # and corrupt every sample, so leave the audio as is.
                    logger.warning(
                        f"Measured loudness is not finite ({loudness}). "
                        "Skipping loudness normalization."
                    )
            except Exception as e:
                logger.warning(f"Loudness normalization failed: {e}. Skipping.")

        # 3. Trim silence
        if self.trim_silence:
            try:
                # Capture the length right before trimming so the log below
                # reports the effect of trimming only, not of resampling.
                samples_before_trim = len(processed_waveform)
                processed_waveform, _ = librosa.effects.trim(
                    processed_waveform, top_db=self.top_db
                )
                trimmed = True
                logger.debug(
                    f"Trimmed silence: {samples_before_trim} → {len(processed_waveform)} samples"
                )
            except Exception as e:
                logger.warning(f"Silence trimming failed: {e}. Skipping.")

        # Calculate final duration
        # NOTE(review): len() is used as the sample count, which presumes a
        # 1-D waveform — confirm callers always pass mono audio.
        duration_sec = len(processed_waveform) / sr

        logger.info(
            f"Preprocessing complete: {duration_sec:.2f}s "
            f"(resampled={resampled}, normalized={normalized}, trimmed={trimmed})"
        )

        return PreprocessedAudio(
            waveform=processed_waveform,
            sample_rate=sr,
            duration_sec=duration_sec,
            original_duration_sec=original_duration,
            resampled=resampled,
            normalized=normalized,
            trimmed=trimmed,
            vad_applied=False,  # VAD is applied separately
        )
def ensure_mono(waveform: np.ndarray) -> np.ndarray:
    """Collapse a two-dimensional waveform to one dimension.

    A 2-D input is averaged along axis 0. Any other input is handed back
    as the very same object, without copying.

    Args:
        waveform: Audio samples. NOTE(review): averaging over axis 0
            presumes a (channels, samples) layout — confirm with callers.

    Returns:
        The axis-0 mean for 2-D input, otherwise ``waveform`` itself.
    """
    # Guard clause: anything that is not exactly 2-D passes through untouched.
    if waveform.ndim != 2:
        return waveform

    return np.mean(waveform, axis=0)
def normalize_amplitude(waveform: np.ndarray, target_peak: float = 0.95) -> np.ndarray:
    """Normalize amplitude to target peak.

    Scales the waveform so that its largest absolute sample equals
    ``target_peak``.

    Args:
        waveform: Input samples.
        target_peak: Desired maximum absolute value after scaling.

    Returns:
        A scaled copy of ``waveform``. The input itself is returned
        unchanged when it is empty or when its peak is not greater than
        zero, because there is nothing to scale.
    """
    # An empty array has no maximum; reducing it would raise ValueError.
    if waveform.size == 0:
        return waveform

    peak = np.abs(waveform).max()
    if peak > 0:
        return waveform * (target_peak / peak)
    return waveform