File size: 9,412 Bytes
6b408d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
"""

Audio processing service for VoiceAuth API.



Handles Base64 decoding, format conversion, and audio preprocessing.

"""

import base64
import io
from typing import TYPE_CHECKING

import numpy as np
from pydub import AudioSegment

from app.config import get_settings
from app.utils.constants import MP3_MAGIC_BYTES
from app.utils.constants import TARGET_SAMPLE_RATE
from app.utils.exceptions import AudioDecodeError
from app.utils.exceptions import AudioDurationError
from app.utils.exceptions import AudioFormatError
from app.utils.exceptions import AudioProcessingError
from app.utils.logger import get_logger

if TYPE_CHECKING:
    import torch

logger = get_logger(__name__)


class AudioProcessor:
    """

    Audio processing service for preparing audio for ML inference.



    Handles the complete pipeline from Base64-encoded MP3 to

    normalized numpy arrays suitable for Wav2Vec2.

    """

    def __init__(self) -> None:
        """Create a processor bound to the app settings and target sample rate."""
        # Rate (Hz) that decoded audio is resampled to and that duration
        # calculations fall back on when no explicit rate is given.
        self.target_sample_rate = TARGET_SAMPLE_RATE
        # Application settings; the duration limits are read from here.
        self.settings = get_settings()

    def decode_base64_audio(self, base64_string: str) -> bytes:
        """
        Decode a Base64 string to raw audio bytes.

        Whitespace anywhere in the input (for example MIME-style line
        breaks) is removed and the trailing ``=`` padding is rebuilt before
        decoding, so line-wrapped or unpadded payloads are accepted. The
        cleaned payload is then decoded strictly: characters outside the
        standard Base64 alphabet are rejected instead of being silently
        dropped, which would otherwise yield corrupted audio bytes.

        Args:
            base64_string: Base64-encoded audio data

        Returns:
            Raw audio bytes

        Raises:
            AudioDecodeError: If the input is not valid Base64 or decodes
                to fewer than 100 bytes
        """
        try:
            # Tolerate bytes input, which the decoder itself would accept.
            if isinstance(base64_string, (bytes, bytearray)):
                base64_string = base64_string.decode("ascii")

            # Remove ALL whitespace before measuring the length; embedded
            # newlines would otherwise skew the padding calculation.
            payload = "".join(base64_string.split()).rstrip("=")

            # Re-pad to a multiple of four characters (0-3 '=' signs).
            payload += "=" * (-len(payload) % 4)

            audio_bytes = base64.b64decode(payload, validate=True)
        except (ValueError, TypeError, AttributeError) as e:
            # binascii.Error and UnicodeDecodeError are ValueError subclasses;
            # TypeError/AttributeError cover non-string input such as None.
            raise AudioDecodeError(
                f"Failed to decode Base64 audio: {e}",
                details={"error": str(e)},
            ) from e

        # Anything this small cannot be a real audio clip.
        if len(audio_bytes) < 100:
            raise AudioDecodeError(
                "Decoded audio data is too small",
                details={"size_bytes": len(audio_bytes)},
            )

        logger.debug(
            "Decoded base64 audio",
            size_bytes=len(audio_bytes),
        )
        return audio_bytes

    def validate_mp3_format(self, audio_bytes: bytes) -> bool:
        """
        Check that the audio bytes begin with a known MP3 signature.

        Only the leading bytes are inspected; the rest of the payload is
        not parsed here.

        Args:
            audio_bytes: Raw audio bytes

        Returns:
            True if the data starts with one of the MP3 magic byte sequences

        Raises:
            AudioFormatError: If no known MP3 signature matches
        """
        # Accept as soon as any known signature matches the start of the data.
        for signature in MP3_MAGIC_BYTES:
            if audio_bytes.startswith(signature):
                return True

        # No signature matched: report the first bytes to aid debugging.
        raise AudioFormatError(
            "Invalid MP3 format: file does not have valid MP3 header",
            details={"header_bytes": audio_bytes[:10].hex()},
        )

    def convert_mp3_to_wav_array(self, mp3_bytes: bytes) -> np.ndarray:
        """
        Convert MP3 bytes to a mono float32 sample array.

        The audio is decoded with pydub, downmixed to one channel,
        resampled to ``self.target_sample_rate`` and scaled by the
        full-scale value of the decoded sample width.

        Args:
            mp3_bytes: Raw MP3 audio bytes

        Returns:
            float32 numpy array of audio samples scaled to [-1, 1]

        Raises:
            AudioProcessingError: If decoding or conversion fails
        """
        try:
            # Load MP3 using pydub
            audio_segment = AudioSegment.from_mp3(io.BytesIO(mp3_bytes))

            # Remember the source layout before downmixing so the log entry
            # below reports the real input channel count, not the mono result.
            original_channels = audio_segment.channels

            # Convert to mono if stereo
            if original_channels > 1:
                audio_segment = audio_segment.set_channels(1)

            # Resample to target sample rate
            if audio_segment.frame_rate != self.target_sample_rate:
                audio_segment = audio_segment.set_frame_rate(self.target_sample_rate)

            # Convert to numpy array
            samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)

            # Scale by the full-scale value for the segment's actual sample
            # width (32768 for 16-bit audio) instead of assuming 16-bit.
            full_scale = float(audio_segment.max_possible_amplitude)
            samples = samples / full_scale

            logger.debug(
                "Converted MP3 to WAV array",
                original_channels=original_channels,
                sample_rate=self.target_sample_rate,
                num_samples=len(samples),
            )

            return samples

        except Exception as e:
            # Boundary wrapper: any decoder/ffmpeg failure becomes a domain error.
            raise AudioProcessingError(
                f"Failed to convert MP3 to WAV: {e}",
                details={"error": str(e)},
            ) from e

    def validate_audio_duration(
        self,
        audio_array: np.ndarray,
        sample_rate: int | None = None,
    ) -> float:
        """
        Ensure the clip length lies within the configured duration limits.

        Args:
            audio_array: Numpy array of audio samples
            sample_rate: Samples per second; falls back to
                ``self.target_sample_rate`` when omitted

        Returns:
            Duration in seconds

        Raises:
            AudioDurationError: If the clip is shorter than
                ``MIN_AUDIO_DURATION`` or longer than ``MAX_AUDIO_DURATION``
        """
        effective_rate = self.target_sample_rate if sample_rate is None else sample_rate
        duration = len(audio_array) / effective_rate

        # Lower bound is checked first, then the upper bound.
        too_short = duration < self.settings.MIN_AUDIO_DURATION
        if too_short:
            raise AudioDurationError(
                f"Audio too short: {duration:.2f}s (minimum: {self.settings.MIN_AUDIO_DURATION}s)",
                duration=duration,
                min_duration=self.settings.MIN_AUDIO_DURATION,
            )

        too_long = duration > self.settings.MAX_AUDIO_DURATION
        if too_long:
            raise AudioDurationError(
                f"Audio too long: {duration:.2f}s (maximum: {self.settings.MAX_AUDIO_DURATION}s)",
                duration=duration,
                max_duration=self.settings.MAX_AUDIO_DURATION,
            )

        rounded_seconds = round(duration, 2)
        logger.debug("Audio duration validated", duration_seconds=rounded_seconds)
        return duration

    def normalize_audio(self, audio_array: np.ndarray) -> np.ndarray:
        """

        Normalize audio amplitude to [-1, 1] range.



        Applies peak normalization to maximize dynamic range.



        Args:

            audio_array: Input audio array



        Returns:

            Normalized audio array

        """
        # Avoid division by zero for silent audio
        max_amplitude = np.abs(audio_array).max()

        if max_amplitude < 1e-8:
            logger.warning("Audio appears to be silent or near-silent")
            return audio_array

        normalized = audio_array / max_amplitude
        return normalized

    def extract_audio_metadata(

        self,

        audio_array: np.ndarray,

        sample_rate: int | None = None,

    ) -> dict:
        """

        Extract metadata from audio for explainability.



        Args:

            audio_array: Numpy array of audio samples

            sample_rate: Sample rate



        Returns:

            Dictionary of audio metadata

        """
        if sample_rate is None:
            sample_rate = self.target_sample_rate

        duration = len(audio_array) / sample_rate

        # Calculate RMS energy
        rms_energy = float(np.sqrt(np.mean(audio_array**2)))

        # Calculate zero crossing rate
        zero_crossings = np.sum(np.abs(np.diff(np.sign(audio_array)))) / 2
        zcr = float(zero_crossings / len(audio_array))

        # Calculate peak amplitude
        peak_amplitude = float(np.abs(audio_array).max())

        return {
            "duration_seconds": round(duration, 3),
            "num_samples": len(audio_array),
            "sample_rate": sample_rate,
            "rms_energy": round(rms_energy, 6),
            "zero_crossing_rate": round(zcr, 6),
            "peak_amplitude": round(peak_amplitude, 6),
        }

    def process_audio(self, audio_base64: str) -> tuple[np.ndarray, dict]:
        """
        Run the full preprocessing pipeline on a Base64-encoded MP3.

        Steps, in order: Base64 decode, MP3 header check, conversion to a
        sample array, duration check, peak normalization, and metadata
        extraction (computed on the normalized samples).

        Args:
            audio_base64: Base64-encoded MP3 audio

        Returns:
            Tuple of (normalized audio array, metadata dict)

        Raises:
            AudioDecodeError: If Base64 decoding fails
            AudioFormatError: If not valid MP3
            AudioDurationError: If duration out of bounds
            AudioProcessingError: If processing fails
        """
        logger.info("Starting audio processing pipeline")

        # Base64 text -> raw bytes, then confirm they look like MP3 data.
        raw_bytes = self.decode_base64_audio(audio_base64)
        self.validate_mp3_format(raw_bytes)

        # Raw MP3 -> sample array, rejected if its length is out of bounds.
        samples = self.convert_mp3_to_wav_array(raw_bytes)
        self.validate_audio_duration(samples)

        # Peak-normalize, then describe the normalized signal.
        normalized_audio = self.normalize_audio(samples)
        metadata = self.extract_audio_metadata(normalized_audio)

        logger.info(
            "Audio processing complete",
            duration=metadata["duration_seconds"],
            samples=metadata["num_samples"],
        )

        return normalized_audio, metadata