File size: 4,516 Bytes
4e9a3bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""Audio preprocessing: resampling, normalization, silence trimming."""

import logging
import numpy as np
import librosa
import pyloudnorm as pyln

from src.ingestion.schemas import PreprocessedAudio

logger = logging.getLogger(__name__)


class AudioPreprocessor:
    """Preprocess audio for feature extraction."""

    def __init__(
        self,
        target_sr: int = 16000,
        normalize_loudness: bool = True,
        target_loudness: float = -23.0,
        trim_silence: bool = True,
        top_db: float = 30.0,
    ):
        """
        Initialize preprocessor.

        Args:
            target_sr: Target sample rate (Hz)
            normalize_loudness: Apply loudness normalization (ITU-R BS.1770)
            target_loudness: Target loudness in LUFS
            trim_silence: Trim leading/trailing silence
            top_db: Threshold for silence trimming (dB)
        """
        self.target_sr = target_sr
        self.normalize_loudness = normalize_loudness
        self.target_loudness = target_loudness
        self.trim_silence = trim_silence
        self.top_db = top_db

        # Initialize loudness meter
        if normalize_loudness:
            self.meter = pyln.Meter(target_sr)

    def process(
        self, waveform: np.ndarray, sr: int, original_duration: float
    ) -> PreprocessedAudio:
        """
        Preprocess audio waveform.

        Args:
            waveform: Input waveform
            sr: Input sample rate
            original_duration: Original duration (seconds)

        Returns:
            PreprocessedAudio with preprocessed waveform
        """
        logger.info(f"Preprocessing audio: {len(waveform)} samples at {sr}Hz")

        processed_waveform = waveform.copy()
        resampled = False
        normalized = False
        trimmed = False

        # 1. Resample to target sample rate
        if sr != self.target_sr:
            logger.debug(f"Resampling from {sr}Hz to {self.target_sr}Hz")
            processed_waveform = librosa.resample(
                processed_waveform, orig_sr=sr, target_sr=self.target_sr
            )
            sr = self.target_sr
            resampled = True

        # 2. Normalize loudness (ITU-R BS.1770)
        if self.normalize_loudness:
            try:
                # Measure loudness
                loudness = self.meter.integrated_loudness(processed_waveform)
                logger.debug(f"Original loudness: {loudness:.2f} LUFS")

                # Normalize to target loudness
                processed_waveform = pyln.normalize.loudness(
                    processed_waveform, loudness, self.target_loudness
                )
                normalized = True
                logger.debug(f"Normalized to {self.target_loudness} LUFS")
            except Exception as e:
                logger.warning(f"Loudness normalization failed: {e}. Skipping.")

        # 3. Trim silence
        if self.trim_silence:
            try:
                processed_waveform, _ = librosa.effects.trim(
                    processed_waveform, top_db=self.top_db
                )
                trimmed = True
                logger.debug(
                    f"Trimmed silence: {len(waveform)}{len(processed_waveform)} samples"
                )
            except Exception as e:
                logger.warning(f"Silence trimming failed: {e}. Skipping.")

        # Calculate final duration
        duration_sec = len(processed_waveform) / sr

        logger.info(
            f"Preprocessing complete: {duration_sec:.2f}s "
            f"(resampled={resampled}, normalized={normalized}, trimmed={trimmed})"
        )

        return PreprocessedAudio(
            waveform=processed_waveform,
            sample_rate=sr,
            duration_sec=duration_sec,
            original_duration_sec=original_duration,
            resampled=resampled,
            normalized=normalized,
            trimmed=trimmed,
            vad_applied=False,  # VAD is applied separately
        )

    @staticmethod
    def ensure_mono(waveform: np.ndarray) -> np.ndarray:
        """Convert stereo to mono if needed."""
        if waveform.ndim == 2:
            return np.mean(waveform, axis=0)
        return waveform

    @staticmethod
    def normalize_amplitude(waveform: np.ndarray, target_peak: float = 0.95) -> np.ndarray:
        """Normalize amplitude to target peak."""
        peak = np.abs(waveform).max()
        if peak > 0:
            return waveform * (target_peak / peak)
        return waveform