| | """Processor class for MERaLiON3.""" |
| |
|
| | from typing import List, Optional, Union |
| |
|
| | import numpy as np |
| |
|
| | from transformers.feature_extraction_utils import BatchFeature |
| | from transformers.processing_utils import ProcessorMixin |
| | from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput |
| |
|
| |
|
| | |
class MERaLiON3Processor(ProcessorMixin):
    r"""
    Constructs a MERaLiON3 processor which wraps a whisper feature extractor and a gemma tokenizer into a single
    processor.

    [`MERaLiON3Processor`] offers all the functionalities of [`WhisperFeatureExtractor`] and [`GemmaTokenizer`]. See the
    [`~MERaLiON3Processor.__call__`] and [`~MERaLiON3Processor.decode`] for more information.

    Args:
        feature_extractor ([`WhisperFeatureExtractor`], *optional*):
            The feature extractor is a required input.
        tokenizer ([`GemmaTokenizer`], *optional*):
            The tokenizer is a required input.
        fixed_speech_embeds_length (`int`, *optional*, defaults to 100):
            Number of speech placeholder tokens written into the text for every audio chunk.
        speech_token_index (`int`, *optional*, defaults to 255999):
            Key into the tokenizer's `added_tokens_decoder` used to look up the speech placeholder token.
        time_duration_limit (`int`, *optional*, defaults to 300):
            Maximum audio duration kept per input, in the same unit as `whisper_chunk_size`. Audio beyond this
            limit is dropped during chunking.
        whisper_chunk_size (`int`, *optional*, defaults to 30):
            Duration of one audio chunk. It is multiplied by the feature extractor's sampling rate to obtain the
            chunk size in samples.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Default normalization flag passed to the feature extractor when `__call__` does not override it.
        chat_template (`Optional[str]`, *optional*):
            The Jinja template to use for formatting the conversation. If not provided, the default chat template
            is used.
    """

    attributes = ["feature_extractor", "tokenizer"]
    feature_extractor_class = "WhisperFeatureExtractor"
    tokenizer_class = "AutoTokenizer"
    valid_kwargs = [
        "fixed_speech_embeds_length",
        "speech_token_index",
        "time_duration_limit",
        "whisper_chunk_size",
        "do_normalize",
    ]

    def __init__(
        self,
        feature_extractor=None,
        tokenizer=None,
        fixed_speech_embeds_length=100,
        speech_token_index=255999,
        time_duration_limit=300,
        whisper_chunk_size=30,
        do_normalize=True,
        chat_template=None,
        **kwargs,
    ):
        self.fixed_speech_embeds_length = fixed_speech_embeds_length
        self.speech_token_index = speech_token_index
        self.time_duration_limit = time_duration_limit
        self.whisper_chunk_size = whisper_chunk_size
        # Maximum number of chunks kept per audio; anything longer is truncated when chunking.
        self.number_chunk_limit = self.time_duration_limit // self.whisper_chunk_size
        self.do_normalize = do_normalize

        super().__init__(feature_extractor, tokenizer, chat_template=chat_template, **kwargs)

        # These two depend on the tokenizer / feature extractor, so they are derived after `super().__init__`.
        self.speech_token = self.tokenizer.added_tokens_decoder[self.speech_token_index].content
        # Chunk size expressed in audio samples.
        self.feature_chunk_size = self.whisper_chunk_size * self.feature_extractor.sampling_rate

    def _process_text(self, text: List[str], audio_number_chunks: np.ndarray):
        """
        Expand the speech placeholder in each text so it matches the audio at the same index.

        Every occurrence of `self.speech_token` in `text[i]` is replaced by the token repeated
        `fixed_speech_embeds_length * audio_number_chunks[i]` times.

        Args:
            text (`List[str]`): Prompts; `text[i]` is paired with `audio_number_chunks[i]`.
            audio_number_chunks (`np.ndarray`): Number of chunks for each audio.

        Returns:
            `List[str]`: The prompts with the expanded placeholders.
        """
        expanded_texts = []
        for i, item in enumerate(text):
            repeats = self.fixed_speech_embeds_length * int(audio_number_chunks[i])
            expanded_texts.append(item.replace(self.speech_token, self.speech_token * repeats))
        return expanded_texts

    def _get_number_chunks(self, audios: List[np.ndarray]):
        """
        Compute how many chunks each audio is split into.

        The count is the ceiling of `len(audio) / self.feature_chunk_size`, capped at `self.number_chunk_limit`.

        Args:
            audios (`List[np.ndarray]`): Audio arrays; the first axis is taken as the sample axis.

        Returns:
            `np.ndarray`: One chunk count per audio.
        """
        audio_lengths = np.array([audio.shape[0] for audio in audios])
        # Ceiling division written with integer arithmetic.
        number_chunks = ((audio_lengths - 1) // self.feature_chunk_size) + 1
        return np.clip(number_chunks, a_min=None, a_max=self.number_chunk_limit)

    def _get_chunked_audios(self, audios: Union[np.ndarray, List[np.ndarray]]):
        """
        Split every audio into consecutive slices of at most `self.feature_chunk_size` samples.

        Samples beyond `self.number_chunk_limit` chunks are dropped.

        Args:
            audios (`np.ndarray`, `List[np.ndarray]`): A single audio or a list of audios.

        Returns:
            `Tuple[np.ndarray, List[np.ndarray]]`: The per-audio chunk counts and the flat list of chunks, ordered
            audio by audio.
        """
        if isinstance(audios, np.ndarray):
            audios = [audios]

        audio_number_chunks = self._get_number_chunks(audios)
        chunk_size = self.feature_chunk_size
        chunked_audios = []

        for audio_idx, audio in enumerate(audios):
            for cid in range(audio_number_chunks[audio_idx]):
                chunked_audios.append(audio[cid * chunk_size: (cid + 1) * chunk_size])
        return audio_number_chunks, chunked_audios

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        audios: Union[np.ndarray, List[np.ndarray]] = None,
        padding: Union[bool, str, PaddingStrategy] = True,
        sampling_rate: Optional[int] = None,
        do_normalize: Optional[bool] = None,
        **kwargs,
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequence(s) and audio(s). The `text` is encoded with
        the tokenizer. If `audios` is not `None`, each audio is split into chunks, the speech placeholder in the
        paired text is expanded accordingly, and the chunks are passed to the feature extractor. Please refer to
        the docstring of GemmaTokenizer's [`~GemmaTokenizer.__call__`] and WhisperFeatureExtractor's
        [`~WhisperFeatureExtractor.__call__`] for more information.

        Args:
            text (`str`, `List[str]`):
                The sequence or batch of sequences to be encoded. `text[i]` is paired with `audios[i]`.
            audios (`np.ndarray`, `List[np.ndarray]`, *optional*):
                The audio or batch of audios to be prepared. Each audio must be a one-dimensional NumPy array.
                If `None`, only the text is tokenized.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:
                - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only
                  a single sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
                  lengths).
            sampling_rate (`int`, *optional*):
                The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). Defaults
                to the feature extractor's `sampling_rate`.
            do_normalize (`bool`, *optional*):
                Whether or not to zero-mean unit-variance normalize the input. Defaults to the processor's
                `do_normalize` attribute.
            **kwargs:
                Accepted for call-site compatibility; not used by this method.

        Returns:
            [`BatchFeature`]: Contains `input_ids` and `attention_mask` from the tokenizer. When `audios` is given,
            it also contains the feature extractor's outputs, with its `attention_mask` renamed to
            `feature_attention_mask`.

        Raises:
            ValueError: If `text` is `None`, or if any audio has more than one dimension.
        """
        if text is None:
            raise ValueError("You need to specify either a `text` input to process.")
        if not isinstance(text, list):
            text = [text]
        # Only wrap a real audio; wrapping `None` would defeat the text-only path below.
        if audios is not None and not isinstance(audios, list):
            audios = [audios]
        if sampling_rate is None:
            sampling_rate = self.feature_extractor.sampling_rate
        if do_normalize is None:
            do_normalize = self.do_normalize

        inputs_dict = {}

        if audios is not None:
            for i, audio in enumerate(audios):
                if audio.ndim > 1:
                    raise ValueError(
                        f"MERaLiON3 only accepts mono channel audio, "
                        f"but audio {i + 1} has {audio.ndim} dimensions."
                    )

            audio_number_chunks, chunked_audios = self._get_chunked_audios(audios)
            text = self._process_text(text, audio_number_chunks)

            audio_inputs = self.feature_extractor(
                chunked_audios,
                sampling_rate=sampling_rate,
                return_tensors="pt",
                return_attention_mask=True,
                padding="max_length",
                # Honor the per-call override rather than always using the instance default.
                do_normalize=do_normalize,
            )
            # Rename so it does not collide with the tokenizer's `attention_mask`.
            audio_inputs["feature_attention_mask"] = audio_inputs.pop("attention_mask")
            inputs_dict.update(audio_inputs)

        text_input = self.tokenizer(
            text=text,
            return_tensors="pt",
            add_special_tokens=False,
            return_attention_mask=True,
            padding=padding,
        )

        inputs_dict["input_ids"] = text_input.input_ids
        inputs_dict["attention_mask"] = text_input.attention_mask

        return BatchFeature(data={**inputs_dict})

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to GemmaTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to GemmaTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        """
        Names of the model inputs: the tokenizer's names, then the feature extractor's names, then
        `"feature_attention_mask"`, with duplicates removed and first-seen order preserved.
        """
        tokenizer_input_names = self.tokenizer.model_input_names
        feature_extractor_input_names = self.feature_extractor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + feature_extractor_input_names + ["feature_attention_mask"]))