File size: 2,343 Bytes

5ca5652

# coding=utf-8

from transformers.utils import TensorType, is_vision_available, logging

from .image_processing_movqgan import MoVQImageProcessor

# Module-level logger, created through the `logging` helper imported from
# `transformers.utils` and named after this module.
logger = logging.get_logger(__name__)


class DualViTokImageProcessor(MoVQImageProcessor):
    r"""
    Constructs a DualViTok image processor that dynamically resizes images based on the original images.

    This image processor is based on MoVQImageProcessor with spatial_factor of 16. Every argument other than
    `spatial_factor` is forwarded unchanged to `MoVQImageProcessor`.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
            in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        min_pixels (`int`, *optional*, defaults to `512 * 512`):
            The minimum number of pixels used when resizing the image.
        max_pixels (`int`, *optional*, defaults to `1024 * 1024`):
            The maximum number of pixels used when resizing the image.
        spatial_factor (`int`, *optional*, defaults to 16):
            The spatial downsample factor by which the image will be downsampled in the feature extracting phase.
    """

    model_input_names = ["pixel_values"]

    def __init__(self, *args, spatial_factor: int = 16, **kwargs) -> None:
        """Initialize the parent processor, supplying `spatial_factor` (16 unless overridden)."""
        # `spatial_factor` is a named keyword-only parameter here, so it can never
        # be present in `kwargs`; merging it in is equivalent to passing it by keyword.
        parent_kwargs = {**kwargs, "spatial_factor": spatial_factor}
        super().__init__(*args, **parent_kwargs)