Instructions to use MahmoodAnaam/MSP-Audio with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use MahmoodAnaam/MSP-Audio with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="MahmoodAnaam/MSP-Audio", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoModelForCTC

model = AutoModelForCTC.from_pretrained("MahmoodAnaam/MSP-Audio", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| """ | |
| Speech processor class for MSPAudio | |
| """ | |
| from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack | |
| from transformers.tokenization_utils_base import ( | |
| AudioInput, | |
| PreTokenizedInput, | |
| TextInput, | |
| ) | |
class MSPAudioProcessorKwargs(ProcessingKwargs, total=False):
    """Keyword-argument schema used to type the `**kwargs` of `MSPAudioProcessor.__call__`."""

    # No processor-specific default overrides are declared; the mapping is
    # intentionally empty.
    _defaults = {}
class MSPAudioProcessor(ProcessorMixin):
    """
    Processor that wraps an `MSPAudioFeatureExtractor` and a tokenizer into a single object.

    Audio inputs are forwarded to the feature extractor and text inputs to the tokenizer. When both are given, the
    tokenizer's `input_ids` are stored in the feature extractor output under the `labels` key.
    """

    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "MSPAudioFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, feature_extractor, tokenizer):
        """Store the feature extractor and tokenizer through `ProcessorMixin.__init__`."""
        super().__init__(feature_extractor, tokenizer)

    def __call__(
        self,
        audio: AudioInput | None = None,
        text: str | list[str] | TextInput | PreTokenizedInput | None = None,
        **kwargs: Unpack[MSPAudioProcessorKwargs],
    ):
        r"""
        Run the feature extractor on `audio` and/or the tokenizer on `text`.

        Args:
            audio:
                Audio input forwarded to the feature extractor together with the merged `audio_kwargs`.
            text:
                Text input forwarded to the tokenizer together with the merged `text_kwargs`.
            kwargs:
                Extra keyword arguments, merged through `_merge_kwargs` before being dispatched.

        Returns:
            This method returns the results of each `call` method. If only `audio` is given, the feature extractor
            output is returned; if only `text` is given, the tokenizer output is returned. If both are used, the
            feature extractor output is returned with the tokenizer's `input_ids` added under `labels`.

        Raises:
            ValueError: If neither `audio` nor `text` is provided.
        """
        if audio is None and text is None:
            raise ValueError(
                "You need to specify either an `audio` or `text` input to process."
            )

        output_kwargs = self._merge_kwargs(
            MSPAudioProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        # Each modality is only processed when it was actually supplied.
        if audio is not None:
            inputs = self.feature_extractor(audio, **output_kwargs["audio_kwargs"])
        if text is not None:
            encodings = self.tokenizer(text, **output_kwargs["text_kwargs"])

        if text is None:
            return inputs
        if audio is None:
            return encodings

        # Both modalities present: expose the token ids as training labels.
        inputs["labels"] = encodings["input_ids"]
        return inputs

    def pad(self, *args, **kwargs):
        """
        This method operates on batches of extracted features and/or tokenized text. It forwards all arguments to
        [`MSPAudioFeatureExtractor.pad`] and/or [`PreTrainedTokenizer.pad`] depending on the input modality and
        returns their outputs. If both modalities are passed, [`MSPAudioFeatureExtractor.pad`] and
        [`PreTrainedTokenizer.pad`] are called.

        Args:
            input_features:
                When the first argument is a dictionary containing a batch of tensors, or the `input_features`
                argument is present, it is passed to [`MSPAudioFeatureExtractor.pad`].
            labels:
                When the `labels` argument is present, it is passed to [`PreTrainedTokenizer.pad`].

        Returns:
            This method returns the results of each `pad` method. If both are used, the output is the padded
            features with the padded `input_ids` added under `labels`. If neither input is given, `None` is
            returned.
        """
        input_features = kwargs.pop("input_features", None)
        labels = kwargs.pop("labels", None)

        # A first positional argument takes precedence over the `input_features` keyword.
        if len(args) > 0:
            input_features = args[0]
            args = args[1:]

        if input_features is not None:
            input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
        if labels is not None:
            # Remaining positional arguments are only meant for the feature extractor.
            labels = self.tokenizer.pad(labels, **kwargs)

        if labels is None:
            return input_features
        if input_features is None:
            return labels

        input_features["labels"] = labels["input_ids"]
        return input_features

    @property
    def model_input_names(self):
        """
        Names of the inputs produced by this processor: the feature extractor's input names plus `labels`.

        Declared as a property so that it overrides `ProcessorMixin.model_input_names` (itself a property) and
        `processor.model_input_names` evaluates to a list rather than a bound method.
        """
        # The processor doesn't return text ids and the model seems to not need them
        feature_extractor_input_names = self.feature_extractor.model_input_names
        return feature_extractor_input_names + ["labels"]