| |
|
| |
|
| | from transformers.utils import TensorType, is_vision_available, logging
|
| |
|
| | from .image_processing_movqgan import MoVQImageProcessor
|
| |
|
| | logger = logging.get_logger(__name__)
|
| |
|
| |
|
class DualViTokImageProcessor(MoVQImageProcessor):
    r"""
    Constructs a DualViTok image processor that dynamically resizes images based on the original images.

    This image processor is based on `MoVQImageProcessor` with a `spatial_factor` of 16. Every argument other than
    `spatial_factor` is forwarded unchanged to `MoVQImageProcessor.__init__`, which is where the defaults listed
    below for those arguments are actually defined.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
            Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats for each channel
            in the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        min_pixels (`int`, *optional*, defaults to `512 * 512`):
            The minimum number of pixels allowed when resizing the image.
        max_pixels (`int`, *optional*, defaults to `1024 * 1024`):
            The maximum number of pixels allowed when resizing the image.
        spatial_factor (`int`, *optional*, defaults to 16):
            The spatial downsampling factor applied to the image during the feature extraction phase.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        *args,
        spatial_factor: int = 16,
        **kwargs,
    ) -> None:
        # The only thing this subclass changes is the default `spatial_factor` (16);
        # all other arguments are handed to the parent constructor untouched.
        super().__init__(*args, spatial_factor=spatial_factor, **kwargs)
|
| |
|