""" Audio preprocessing for Wav2Vec2 model. Handles conversion from audio arrays to model input tensors. """ import numpy as np import torch from transformers import Wav2Vec2Processor from app.utils.constants import TARGET_SAMPLE_RATE from app.utils.logger import get_logger logger = get_logger(__name__) class AudioPreprocessor: """ Preprocessor for preparing audio data for Wav2Vec2 model. Converts numpy audio arrays into the tensor format expected by the Wav2Vec2ForSequenceClassification model. """ def __init__( self, processor: Wav2Vec2Processor, device: str = "cpu", ) -> None: """ Initialize AudioPreprocessor. Args: processor: Wav2Vec2Processor instance device: Target device for tensors (cpu/cuda) """ self.processor = processor self.device = device self.sample_rate = TARGET_SAMPLE_RATE def validate_input(self, audio_array: np.ndarray) -> bool: """ Validate audio array for processing. Args: audio_array: Input audio array Returns: True if valid Raises: ValueError: If validation fails """ if not isinstance(audio_array, np.ndarray): raise ValueError(f"Expected numpy array, got {type(audio_array)}") if audio_array.ndim != 1: raise ValueError(f"Expected 1D array, got {audio_array.ndim}D") if len(audio_array) == 0: raise ValueError("Audio array is empty") if np.isnan(audio_array).any(): raise ValueError("Audio array contains NaN values") if np.isinf(audio_array).any(): raise ValueError("Audio array contains infinite values") return True def preprocess( self, audio_array: np.ndarray, return_attention_mask: bool = True, ) -> dict[str, torch.Tensor]: """ Preprocess audio array for model inference. Args: audio_array: 1D numpy array of audio samples (16kHz, normalized) return_attention_mask: Whether to return attention mask Returns: Dictionary with input_values and optionally attention_mask """ # Validate input self.validate_input(audio_array) # Ensure float32 audio_array = audio_array.astype(np.float32) # Process through Wav2Vec2Processor inputs = self.processor( audio_array, sampling_rate=self.sample_rate, return_tensors="pt", padding=True, return_attention_mask=return_attention_mask, ) # Move to target device inputs = {key: value.to(self.device) for key, value in inputs.items()} logger.debug( "Audio preprocessed for model", input_length=inputs["input_values"].shape[-1], device=self.device, ) return inputs def preprocess_batch( self, audio_arrays: list[np.ndarray], return_attention_mask: bool = True, ) -> dict[str, torch.Tensor]: """ Preprocess a batch of audio arrays. Args: audio_arrays: List of 1D numpy arrays return_attention_mask: Whether to return attention mask Returns: Dictionary with batched input_values and optionally attention_mask """ # Validate all inputs for i, audio in enumerate(audio_arrays): try: self.validate_input(audio) except ValueError as e: raise ValueError(f"Invalid audio at index {i}: {e}") from e # Ensure float32 audio_arrays = [audio.astype(np.float32) for audio in audio_arrays] # Process batch through Wav2Vec2Processor inputs = self.processor( audio_arrays, sampling_rate=self.sample_rate, return_tensors="pt", padding=True, return_attention_mask=return_attention_mask, ) # Move to target device inputs = {key: value.to(self.device) for key, value in inputs.items()} logger.debug( "Batch preprocessed for model", batch_size=len(audio_arrays), device=self.device, ) return inputs