|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" |
|
|
Image/Text processor class for ALIGN |
|
|
""" |
|
|
|
|
|
from ...processing_utils import ProcessingKwargs, ProcessorMixin |
|
|
|
|
|
|
|
|
class AlignProcessorKwargs(ProcessingKwargs, total=False):
    """Keyword-argument schema for `AlignProcessor` calls.

    Extends `ProcessingKwargs` with ALIGN-specific defaults: text inputs are
    padded to a fixed `max_length` of 64 tokens unless the caller overrides
    these values via `text_kwargs`.
    """

    _defaults = {
        "text_kwargs": {"padding": "max_length", "max_length": 64},
    }
|
|
|
|
|
|
|
|
class AlignProcessor(ProcessorMixin):
    r"""
    Constructs an ALIGN processor which wraps [`EfficientNetImageProcessor`] and
    [`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that inherits both the image processor and
    tokenizer functionalities. See the [`~AlignProcessor.__call__`] and [`~AlignProcessor.decode`] for more
    information.
    The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
    ```python
    from transformers import AlignProcessor
    from PIL import Image
    model_id = "kakaobrain/align-base"
    processor = AlignProcessor.from_pretrained(model_id)

    processor(
        images=your_pil_image,
        text=["What is that?"],
        images_kwargs = {"crop_size": {"height": 224, "width": 224}},
        text_kwargs = {"padding": "do_not_pad"},
        common_kwargs = {"return_tensors": "pt"},
    )
    ```

    Args:
        image_processor ([`EfficientNetImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`BertTokenizer`, `BertTokenizerFast`]):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "EfficientNetImageProcessor"
    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
    valid_processor_kwargs = AlignProcessorKwargs

    def __init__(self, image_processor, tokenizer):
        """Store the two required sub-processors via `ProcessorMixin`.

        Args:
            image_processor (`EfficientNetImageProcessor`):
                The image processor is a required input.
            tokenizer (`BertTokenizer` or `BertTokenizerFast`):
                The tokenizer is a required input.
        """
        super().__init__(image_processor, tokenizer)
|
|
|
|
|
|
|
|
__all__ = ["AlignProcessor"] |
|
|
|