| from typing import Union | |
| from transformers import CLIPProcessor, CLIPTokenizer, CLIPTokenizerFast | |
class LongCLIPProcessor(CLIPProcessor):
    """CLIP processor that can additionally tokenize a ``short_text`` input.

    The regular ``text`` / ``images`` inputs are handled by the parent
    ``CLIPProcessor``; when ``short_text`` is supplied it is tokenized with
    ``self.tokenizer`` and its ids and attention mask are added to the
    returned encoding under separate keys.
    """

    tokenizer: Union[CLIPTokenizer, CLIPTokenizerFast]

    def __call__(
        self, text=None, short_text=None, images=None, return_tensors=None, **kwargs
    ):
        """Encode ``text`` / ``images`` and, optionally, ``short_text``.

        Args:
            text: Forwarded to the parent processor as ``text``.
            short_text: Optional extra text. When not ``None`` it is
                tokenized with ``self.tokenizer``.
            images: Forwarded to the parent processor as ``images``.
            return_tensors: Forwarded to the parent processor and, when
                ``short_text`` is given, to the tokenizer as well.
            **kwargs: Extra keyword arguments. The same ``kwargs`` are passed
                both to the parent processor and to the ``short_text``
                tokenizer call.

        Returns:
            The encoding produced by the parent processor. When
            ``short_text`` is not ``None`` it also contains the keys
            ``"short_input_ids"`` and ``"short_attention_mask"``.
        """
        # Forward by keyword rather than by position so the call does not
        # depend on the order in which the parent declares its parameters.
        encoding = super().__call__(
            text=text, images=images, return_tensors=return_tensors, **kwargs
        )

        if short_text is not None:
            short_text_encoding = self.tokenizer(
                short_text, return_tensors=return_tensors, **kwargs
            )
            encoding["short_input_ids"] = short_text_encoding.input_ids
            encoding["short_attention_mask"] = short_text_encoding.attention_mask

        return encoding