| | """
|
| | Audio preprocessing for Wav2Vec2 model.
|
| |
|
| | Handles conversion from audio arrays to model input tensors.
|
| | """
|
| |
|
| | import numpy as np
|
| | import torch
|
| | from transformers import Wav2Vec2Processor
|
| |
|
| | from app.utils.constants import TARGET_SAMPLE_RATE
|
| | from app.utils.logger import get_logger
|
| |
|
| | logger = get_logger(__name__)
|
| |
|
| |
|
class AudioPreprocessor:
    """
    Preprocessor for preparing audio data for Wav2Vec2 model.

    Converts 1D numpy audio arrays into the padded tensor format expected
    by the Wav2Vec2ForSequenceClassification model, moving the resulting
    tensors to the configured device.
    """

    def __init__(
        self,
        processor: Wav2Vec2Processor,
        device: str = "cpu",
    ) -> None:
        """
        Initialize AudioPreprocessor.

        Args:
            processor: Wav2Vec2Processor instance used for feature extraction
            device: Target device for tensors (cpu/cuda)
        """
        self.processor = processor
        self.device = device
        # Inputs are assumed to already be resampled to this rate; the
        # processor is told so via sampling_rate on every call.
        self.sample_rate = TARGET_SAMPLE_RATE

    def validate_input(self, audio_array: np.ndarray) -> bool:
        """
        Validate audio array for processing.

        Args:
            audio_array: Input audio array

        Returns:
            True if valid

        Raises:
            ValueError: If the input is not a non-empty, finite 1D numpy array
        """
        if not isinstance(audio_array, np.ndarray):
            raise ValueError(f"Expected numpy array, got {type(audio_array)}")

        if audio_array.ndim != 1:
            raise ValueError(f"Expected 1D array, got {audio_array.ndim}D")

        if len(audio_array) == 0:
            raise ValueError("Audio array is empty")

        if np.isnan(audio_array).any():
            raise ValueError("Audio array contains NaN values")

        if np.isinf(audio_array).any():
            raise ValueError("Audio array contains infinite values")

        return True

    def _run_processor(
        self,
        audio: np.ndarray | list[np.ndarray],
        return_attention_mask: bool,
    ) -> dict[str, torch.Tensor]:
        """
        Run the Wav2Vec2 processor on validated audio and move the
        resulting tensors to ``self.device``.

        Shared by :meth:`preprocess` and :meth:`preprocess_batch` so the
        processor arguments stay in one place.

        Args:
            audio: A single float32 array or a list of float32 arrays
            return_attention_mask: Whether to return attention mask

        Returns:
            Dictionary with input_values and optionally attention_mask
        """
        inputs = self.processor(
            audio,
            sampling_rate=self.sample_rate,
            return_tensors="pt",
            padding=True,
            return_attention_mask=return_attention_mask,
        )
        # BatchFeature supports .items(); rebuild as a plain dict with
        # every tensor on the target device.
        return {key: value.to(self.device) for key, value in inputs.items()}

    def preprocess(
        self,
        audio_array: np.ndarray,
        return_attention_mask: bool = True,
    ) -> dict[str, torch.Tensor]:
        """
        Preprocess audio array for model inference.

        Args:
            audio_array: 1D numpy array of audio samples (16kHz, normalized)
            return_attention_mask: Whether to return attention mask

        Returns:
            Dictionary with input_values and optionally attention_mask

        Raises:
            ValueError: If the audio array fails validation
        """
        self.validate_input(audio_array)

        # The processor expects float32 samples regardless of source dtype.
        inputs = self._run_processor(
            audio_array.astype(np.float32),
            return_attention_mask,
        )

        logger.debug(
            "Audio preprocessed for model",
            input_length=inputs["input_values"].shape[-1],
            device=self.device,
        )

        return inputs

    def preprocess_batch(
        self,
        audio_arrays: list[np.ndarray],
        return_attention_mask: bool = True,
    ) -> dict[str, torch.Tensor]:
        """
        Preprocess a batch of audio arrays.

        Args:
            audio_arrays: List of 1D numpy arrays
            return_attention_mask: Whether to return attention mask

        Returns:
            Dictionary with batched input_values and optionally attention_mask

        Raises:
            ValueError: If any array fails validation; the message includes
                the offending index
        """
        # Validate each element individually so the error pinpoints which
        # item of the batch is bad.
        for i, audio in enumerate(audio_arrays):
            try:
                self.validate_input(audio)
            except ValueError as e:
                raise ValueError(f"Invalid audio at index {i}: {e}") from e

        # The processor expects float32 samples regardless of source dtype.
        audio_arrays = [audio.astype(np.float32) for audio in audio_arrays]

        inputs = self._run_processor(audio_arrays, return_attention_mask)

        logger.debug(
            "Batch preprocessed for model",
            batch_size=len(audio_arrays),
            device=self.device,
        )

        return inputs