Spaces:
Running
Running
| from logging import Logger, getLogger | |
| from pathlib import Path | |
| from typing import Literal, Self | |
| from librosa import load as librosa_load | |
| from numpy import pad as np_pad | |
| from torch import Tensor | |
| from torch import device as torch_device | |
| from torch import inference_mode as torch_inferece_mode | |
| from torch.cuda import is_available as cuda_is_available | |
| from transformers import AutoFeatureExtractor, AutoModelForAudioClassification | |
| from transformers.modeling_outputs import SequenceClassifierOutput | |
| from ..settings import ModelsSettings | |
| from ..utils import timeit | |
| class Classifier: | |
| def __init__( | |
| self: Self, | |
| settings: ModelsSettings, | |
| /, | |
| audio_max_duration: int | None = 30, | |
| ): | |
| self._log: Logger = getLogger(__name__) | |
| self._log.debug("Selecting model device") | |
| self._device = torch_device("cuda" if cuda_is_available() else "cpu") | |
| self._log.info(f"Model will be initiated in {self._device}") | |
| self._log.debug(f"Get pretrained {settings.MODEL_ID} to {self._device}") | |
| self._model = AutoModelForAudioClassification.from_pretrained( | |
| settings.MODEL_ID | |
| ).to(self._device) | |
| self._log.info(f"Model {settings.MODEL_ID} available in {self._device}") | |
| self._log.debug( | |
| f"Get feature extractor {settings.FEATURE_EXTRACTOR_ID or settings.MODEL_ID}" | |
| ) | |
| self._feat_extractor = AutoFeatureExtractor.from_pretrained( | |
| settings.FEATURE_EXTRACTOR_ID or settings.MODEL_ID | |
| ) | |
| self._max_length = int(self._feat_extractor.sampling_rate * audio_max_duration) | |
| def id2label(self: Self) -> dict[int, str]: | |
| return self._model.config.id2label | |
| def label2id(self: Self) -> dict[str, int]: | |
| return self._model.config.label2id | |
| def _preprocess( | |
| self: Self, | |
| audio_path: Path | str, | |
| /, | |
| truncation: bool = True, | |
| return_tensors_type: Literal["pt", "tf"] = "pt", | |
| ) -> dict[str, Tensor]: | |
| self._log.debug(f"Preprocessing audio: {audio_path}") | |
| audio, _ = librosa_load(audio_path, sr=None) | |
| audio = ( | |
| audio[: self._max_length] | |
| if len(audio) > self._max_length | |
| else np_pad(audio, (0, self._max_length)) | |
| ) | |
| return self._feat_extractor( | |
| audio, | |
| sampling_rate=self._feat_extractor.sampling_rate, | |
| max_length=self._max_length, | |
| truncation=truncation, | |
| return_tensors=return_tensors_type, | |
| ) | |
| def _predict(self: Self, audio_sample: dict[str, Tensor]) -> Tensor: | |
| with torch_inferece_mode(): | |
| prediction: SequenceClassifierOutput = self._model(**audio_sample) | |
| self._log.debug(f"Prediction logits: {prediction.logits}") | |
| return prediction.logits | |
| def predict( | |
| self: Self, | |
| audio: Path | str, | |
| /, | |
| return_labeled_probs: bool = False, | |
| ) -> dict[int, float] | Tensor: | |
| probs: Tensor = ( | |
| self._predict( | |
| { | |
| key: value.to(self._device) | |
| for key, value in self._preprocess(audio).items() | |
| } | |
| ) | |
| .cpu() | |
| .softmax(dim=-1)[0] | |
| ) | |
| self._log.info(f"Prediction probabilities: {probs}") | |
| return ( | |
| {idx: prob.item() for idx, prob in enumerate(probs)} | |
| if return_labeled_probs | |
| else probs | |
| ) | |