Diffusers · PyTorch · custom_code

huangrh9 committed · Commit 5ca5652 · verified · 1 Parent(s): 3886cb9

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -1,35 +1,35 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,65 @@
# DualViTok

<div align="center">

<img src="https://illume-unified-mllm.github.io/static/images/logo.png" width="100em"></img>

📄 [Paper](https://arxiv.org/abs/2504.01934) |
🌐 [Project-Page](https://illume-unified-mllm.github.io/) |
📦 [Github](https://github.com/illume-unified-mllm/ILLUME_plus)

</div>

## Introduction

**DualViTok** (Dual Vision Tokenizer) is a dual-branch vision tokenizer designed to capture both deep semantics and fine-grained textures. It is proposed in [ILLUME+](https://arxiv.org/abs/2504.01934). The semantic branch uses a pre-trained, text-aligned vision encoder for semantic feature extraction, supervised by a feature-reconstruction loss. In parallel, the pixel branch integrates quantized features from both the semantic encoder and a CNN-based pixel encoder to enhance pixel-level reconstruction. To improve robustness against incorrect token predictions in autoregressive generation, noise is injected during training by randomly perturbing visual tokens. Despite its simplicity, DualViTok is designed specifically for unified models, preserving both semantics and texture while keeping token decoding robust.

<div align="center">
<img src="https://illume-unified-mllm.github.io/static/images/tokenizer_framework.png" width="80%"></img>
</div>
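The noise injection mentioned above can be pictured with a small sketch (illustrative only, not the actual ILLUME+ training code; the perturbation probability `p` and uniform resampling scheme are assumptions):

```python
import torch

def perturb_tokens(indices: torch.Tensor, codebook_size: int, p: float = 0.1) -> torch.Tensor:
    """Randomly replace a fraction `p` of visual token indices with random codebook entries."""
    mask = torch.rand(indices.shape, device=indices.device) < p   # which tokens to corrupt
    random_codes = torch.randint_like(indices, codebook_size)     # uniform random code ids
    return torch.where(mask, random_codes, indices)
```

Training the decoder on perturbed tokens of this kind is what makes decoding tolerant to occasional wrong token predictions from the autoregressive model.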
## Quickstart for Autoencoding
```python
from PIL import Image
import torch
from transformers import AutoModel, AutoImageProcessor

MODEL_HUB = "ILLUME-MLLM/dualvitok/"

model = AutoModel.from_pretrained(MODEL_HUB, trust_remote_code=True).eval().cuda()
processor = AutoImageProcessor.from_pretrained(MODEL_HUB, trust_remote_code=True)

# Load the diffusion decoder (optional).
# diffusion_decoder = model.build_sdxl_decoder('ILLUME-MLLM/dualvitok-sdxl-decoder')

# TODO: you need to modify the path here.
IMAGE_PATH = "YOUR_IMAGE_PATH"

image = Image.open(IMAGE_PATH)

image = processor(image, return_tensors="pt")["pixel_values"]
image = image.unsqueeze(0).cuda()

with torch.no_grad():
    (quant_semantic, diff_semantic, indices_semantic, _), \
    (quant_pixel, diff_pixel, indices_pixel) = model.encode(image)

    recon = model.decode(quant_semantic, quant_pixel)

# Alternatively, decode directly from the codes.
# recon = model.decode_code(indices_semantic, indices_pixel)

print(recon.shape)
recon_image = processor.postprocess(recon)["pixel_values"][0]
recon_image.save("recon_image.png")

# The diffusion decoder only supports 11 resolutions; check `diffusion_decoder.resolution_group`.
# (`height` and `width` below are placeholders for your target size.)
# diffusion_recon = diffusion_decoder(  # pass either vq_indices or vq_embeds
#     vq_indices=(indices_semantic, indices_pixel),
#     vq_embeds=(quant_semantic, quant_pixel),
#     height=height * 2,
#     width=width * 2,
#     num_inference_steps=50,
#     guidance_scale=1.5,
# )
# diffusion_recon.images[0].save("diffusion_recon_image.png")
```
config.json ADDED
@@ -0,0 +1,75 @@
{
  "auto_map": {
    "AutoConfig": "configuration_dualvitok.DualViTokConfig",
    "AutoModel": "modeling_dualvitok.DualViTok"
  },
  "architectures": [
    "DualViTok"
  ],
  "semantic_encoder": {
    "pretrained_semantic_encoder": "Emova-ollm/qwen2vit600m",
    "z_channels": 32,
    "num_blocks": 4,
    "out_layer": "linear",
    "embed_dim": 1280,
    "target_mlp": "norm"
  },
  "semantic_decoder": {
    "z_channels": 32,
    "num_blocks": 4,
    "embed_dim": 1280,
    "out_layer": "linear_norm",
    "out_channels": 3584
  },
  "semantic_quantizer_type": "simvq",
  "pixel_quantizer_type": "simvq",
  "semantic_quantizer_codebook_size": 32768,
  "pixel_quantizer_codebook_size": 98304,
  "attn_implementation": "eager",
  "pixel_encoder": {
    "codebook_size": 98304,
    "embed_dim": 32,
    "z_channels": 32,
    "double_z": false,
    "in_channels": 3,
    "out_channels": 3,
    "ch": 128,
    "ch_mult": [1, 1, 2, 2, 4],
    "num_res_blocks": 2,
    "attn_resolutions": [4],
    "dropout": 0.0,
    "use_dc_up_down_blocks": true
  },
  "pixel_decoder": {
    "codebook_size": 98304,
    "embed_dim": 64,
    "z_channels": 64,
    "double_z": false,
    "in_channels": 3,
    "out_channels": 3,
    "ch": 384,
    "ch_mult": [1, 1, 2, 2, 4],
    "num_res_blocks": 2,
    "attn_resolutions": [4],
    "dropout": 0.0,
    "use_dc_up_down_blocks": true
  },
  "torch_dtype": "float16",
  "transformers_version": "4.44.2"
}
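For reference, the `auto_map` above is what routes `AutoConfig`/`AutoModel` to the custom classes in this repo when `trust_remote_code=True`. A minimal load sketch (hub id as in the quickstart):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("ILLUME-MLLM/dualvitok", trust_remote_code=True)
print(config.semantic_quantizer_codebook_size)  # 32768
print(config.pixel_quantizer_codebook_size)     # 98304
```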
configuration_dualvitok.py ADDED
@@ -0,0 +1,153 @@
import os
from typing import List, Union

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

from .configuration_movqgan import MoVQConfig
from .modeling_rope_utils import rope_config_validation
from .configuration_qwen2vit import Qwen2VLVisionConfig

logger = logging.get_logger(__name__)


class SemanticEncoderConfig(PretrainedConfig):
    model_type = "DualViTokSemanticEncoder"

    def __init__(
        self,
        pretrained_semantic_encoder='Emova-ollm/qwen2vit600m',
        z_channels=32,
        num_blocks=4,
        embed_dim=1280,
        out_layer='linear',
        target_mlp='norm',
        **kwargs
    ):
        super().__init__(**kwargs)
        self.pretrained_semantic_encoder = pretrained_semantic_encoder
        self.z_channels = z_channels
        self.num_blocks = num_blocks
        self.out_layer = out_layer
        self.embed_dim = embed_dim
        self.target_mlp = target_mlp


class SemanticDecoderConfig(PretrainedConfig):
    model_type = "DualViTokSemanticDecoder"

    def __init__(
        self,
        z_channels=32,
        num_blocks=4,
        embed_dim=1280,
        out_layer='linear_norm',
        out_channels=3584,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.z_channels = z_channels
        self.num_blocks = num_blocks
        self.embed_dim = embed_dim
        self.out_layer = out_layer
        self.out_channels = out_channels


class DualViTokConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DualViTok`]. It is used to instantiate a
    DualViTok model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a configuration similar to the tokenizer presented in the ILLUME+ paper.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        semantic_encoder (`dict`, *optional*):
            Configuration of the semantic-branch encoder; see [`SemanticEncoderConfig`].
        semantic_decoder (`dict`, *optional*):
            Configuration of the semantic-branch decoder; see [`SemanticDecoderConfig`].
        pixel_encoder (`dict`, *optional*):
            Configuration of the pixel-branch encoder; see [`MoVQConfig`].
        pixel_decoder (`dict`, *optional*):
            Configuration of the pixel-branch decoder. Defaults to the pixel-encoder configuration.
        semantic_quantizer_type (`str`, *optional*, defaults to `"simvq"`):
            Quantizer type used for the semantic branch.
        pixel_quantizer_type (`str`, *optional*, defaults to `"simvq"`):
            Quantizer type used for the pixel branch.
        semantic_quantizer_codebook_size (`int`, *optional*, defaults to 32768):
            Codebook size of the semantic quantizer.
        pixel_quantizer_codebook_size (`int`, *optional*, defaults to 98304):
            Codebook size of the pixel quantizer.
        attn_implementation (`str`, *optional*, defaults to `"sdpa"`):
            Attention implementation to use.

    ```python
    >>> from transformers import DualViTok, DualViTokConfig

    >>> # Initializing a DualViTok configuration
    >>> configuration = DualViTokConfig()

    >>> # Initializing a model from the configuration
    >>> model = DualViTok(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "DualViTok"

    def __init__(
        self,
        semantic_encoder=None,
        semantic_decoder=None,
        pixel_encoder=None,
        pixel_decoder=None,
        semantic_quantizer_type='simvq',
        pixel_quantizer_type='simvq',
        semantic_quantizer_codebook_size=32768,
        pixel_quantizer_codebook_size=98304,
        attn_implementation='sdpa',
        **kwargs,
    ):
        super().__init__(**kwargs)
        if semantic_encoder is None:
            self.semantic_encoder = SemanticEncoderConfig()
        else:
            self.semantic_encoder = SemanticEncoderConfig(**semantic_encoder)
        if semantic_decoder is None:
            self.semantic_decoder = SemanticDecoderConfig()
        else:
            self.semantic_decoder = SemanticDecoderConfig(**semantic_decoder)

        self.semantic_quantizer_type = semantic_quantizer_type
        self.pixel_quantizer_type = pixel_quantizer_type
        self.semantic_quantizer_codebook_size = semantic_quantizer_codebook_size
        self.pixel_quantizer_codebook_size = pixel_quantizer_codebook_size

        if pixel_encoder is None:
            self.pixel_encoder = MoVQConfig()
        else:
            self.pixel_encoder = MoVQConfig(**pixel_encoder)

        self.pixel_decoder = self.pixel_encoder if pixel_decoder is None else MoVQConfig(**pixel_decoder)

        self.attn_implementation = attn_implementation

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], attn_implementation='sdpa', **kwargs) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, attn_implementation=attn_implementation, **kwargs)
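Sub-configs can be passed as plain dicts, mirroring `config.json`. A sketch (values copied from the `config.json` above; assumes the module is importable locally):

```python
config = DualViTokConfig(
    semantic_encoder={"embed_dim": 1280, "num_blocks": 4},
    pixel_encoder={"ch": 128, "ch_mult": [1, 1, 2, 2, 4], "use_dc_up_down_blocks": True},
    attn_implementation="eager",
)
print(type(config.pixel_encoder).__name__)           # MoVQConfig
print(config.pixel_decoder is config.pixel_encoder)  # True when pixel_decoder is omitted
```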
configuration_movqgan.py ADDED
@@ -0,0 +1,90 @@
""" MoVQ model configuration """

from typing import List

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class MoVQConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MoVQ`]. It is used to instantiate a MoVQ
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with
    the defaults will yield a configuration similar to the VQ model presented in the paper.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        codebook_size (`int`, *optional*, defaults to 32768):
            Codebook size of the VQ model.
        embed_dim (`int`, *optional*, defaults to 4):
            Dimension of the quantized vectors in the codebook.
        z_channels (`int`, *optional*, defaults to 4):
            Dimension of the output channels of the encoder and the input channels of the decoder.
        double_z (`bool`, *optional*, defaults to `False`):
            Whether to double the output dim of the encoder.
        in_channels (`int`, *optional*, defaults to 3):
            Number of input channels of the encoder.
        out_channels (`int`, *optional*, defaults to 3):
            Number of output channels of the decoder.
        ch (`int`, *optional*, defaults to 256):
            Base channel count of the intermediate blocks.
        ch_mult (`List[int]`, *optional*, defaults to `[1, 2, 2, 4]`):
            Channel scaling factors of the intermediate blocks.
        num_res_blocks (`int`, *optional*, defaults to 2):
            Number of residual blocks in each stage.
        attn_resolutions (`List[int]`, *optional*, defaults to `[3]`):
            Stage indices at which to apply attention.
        dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability.
        use_dc_up_down_blocks (`bool`, *optional*, defaults to `False`):
            Whether to use DC up/down blocks in the encoder and decoder.

    ```python
    >>> from transformers import MoVQ, MoVQConfig

    >>> # Initializing a MoVQ configuration
    >>> configuration = MoVQConfig()

    >>> # Initializing a model from the configuration
    >>> model = MoVQ(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "MoVQ"

    def __init__(
        self,
        codebook_size: int = 32768,
        embed_dim: int = 4,
        z_channels: int = 4,
        double_z: bool = False,
        in_channels: int = 3,
        out_channels: int = 3,
        ch: int = 256,
        ch_mult: List[int] = [1, 2, 2, 4],
        num_res_blocks: int = 2,
        attn_resolutions: List[int] = [3],
        dropout: float = 0.0,
        use_dc_up_down_blocks=False,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.codebook_size = codebook_size
        self.embed_dim = embed_dim
        self.z_channels = z_channels
        self.double_z = double_z
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.ch = ch
        self.ch_mult = ch_mult
        self.num_res_blocks = num_res_blocks
        self.attn_resolutions = attn_resolutions
        self.dropout = dropout
        self.use_dc_up_down_blocks = use_dc_up_down_blocks
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Qwen2VL model configuration"""
16
+
17
+ import os
18
+ from typing import Union
19
+
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.utils import logging
22
+ from .modeling_rope_utils import rope_config_validation
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+
27
+ class Qwen2VLVisionConfig(PretrainedConfig):
28
+ model_type = "qwen2_vl"
29
+
30
+ def __init__(
31
+ self,
32
+ depth=32,
33
+ embed_dim=1280,
34
+ hidden_size=3584,
35
+ hidden_act="quick_gelu",
36
+ mlp_ratio=4,
37
+ num_heads=16,
38
+ in_channels=3,
39
+ patch_size=14,
40
+ spatial_merge_size=2,
41
+ temporal_patch_size=2,
42
+ attn_implementation='eager',
43
+ init_weights=False,
44
+ **kwargs,
45
+ ):
46
+ super().__init__(**kwargs)
47
+
48
+ self.depth = depth
49
+ self.embed_dim = embed_dim
50
+ self.hidden_size = hidden_size
51
+ self.hidden_act = hidden_act
52
+ self.mlp_ratio = mlp_ratio
53
+ self.num_heads = num_heads
54
+ self.in_channels = in_channels
55
+ self.patch_size = patch_size
56
+ self.spatial_merge_size = spatial_merge_size
57
+ self.temporal_patch_size = temporal_patch_size
58
+ self.attn_implementation = attn_implementation if attn_implementation else 'eager'
59
+
60
+ self.init_weights = init_weights
61
+
62
+ @classmethod
63
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
64
+ cls._set_token_in_kwargs(kwargs)
65
+
66
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
67
+
68
+ # if config_dict.get("model_type") == "qwen2_vl":
69
+ # config_dict = config_dict["vision_config"]
70
+
71
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
72
+ logger.warning(
73
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
74
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
75
+ )
76
+
77
+ return cls.from_dict(config_dict, **kwargs)
78
+
79
+
80
+ class Qwen2VLConfig(PretrainedConfig):
81
+ r"""
82
+ This is the configuration class to store the configuration of a [`Qwen2VLModel`]. It is used to instantiate a
83
+ Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration
84
+ with the defaults will yield a similar configuration to that of
85
+ Qwen2-VL-7B-Instruct [Qwen/Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).
86
+
87
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
88
+ documentation from [`PretrainedConfig`] for more information.
89
+
90
+
91
+ Args:
92
+ vocab_size (`int`, *optional*, defaults to 152064):
93
+ Vocabulary size of the Qwen2VL model. Defines the number of different tokens that can be represented by the
94
+ `inputs_ids` passed when calling [`Qwen2VLModel`]
95
+ hidden_size (`int`, *optional*, defaults to 8192):
96
+ Dimension of the hidden representations.
97
+ intermediate_size (`int`, *optional*, defaults to 29568):
98
+ Dimension of the MLP representations.
99
+ num_hidden_layers (`int`, *optional*, defaults to 80):
100
+ Number of hidden layers in the Transformer encoder.
101
+ num_attention_heads (`int`, *optional*, defaults to 64):
102
+ Number of attention heads for each attention layer in the Transformer encoder.
103
+ num_key_value_heads (`int`, *optional*, defaults to 8):
104
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
105
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
106
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
107
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
108
+ by meanpooling all the original heads within that group. For more details checkout [this
109
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `32`.
110
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
111
+ The non-linear activation function (function or string) in the decoder.
112
+ max_position_embeddings (`int`, *optional*, defaults to 32768):
113
+ The maximum sequence length that this model might ever be used with.
114
+ initializer_range (`float`, *optional*, defaults to 0.02):
115
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
116
+ rms_norm_eps (`float`, *optional*, defaults to 1e-05):
117
+ The epsilon used by the rms normalization layers.
118
+ use_cache (`bool`, *optional*, defaults to `True`):
119
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
120
+ relevant if `config.is_decoder=True`.
121
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
122
+ Whether the model's input and output word embeddings should be tied.
123
+ rope_theta (`float`, *optional*, defaults to 1000000.0):
124
+ The base period of the RoPE embeddings.
125
+ use_sliding_window (`bool`, *optional*, defaults to `False`):
126
+ Whether to use sliding window attention.
127
+ sliding_window (`int`, *optional*, defaults to 4096):
128
+ Sliding window attention (SWA) window size. If not specified, will default to `4096`.
129
+ max_window_layers (`int`, *optional*, defaults to 80):
130
+ The number of layers that use SWA (Sliding Window Attention). The bottom layers use SWA while the top use full attention.
131
+ attention_dropout (`float`, *optional*, defaults to 0.0):
132
+ The dropout ratio for the attention probabilities.
133
+ vision_config (`Dict`, *optional*):
134
+ The config for the visual encoder initialization.
135
+ rope_scaling (`Dict`, *optional*):
136
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
137
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
138
+ accordingly.
139
+ Expected contents:
140
+ `rope_type` (`str`):
141
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
142
+ 'llama3'], with 'default' being the original RoPE implementation.
143
+ `factor` (`float`, *optional*):
144
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
145
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
146
+ original maximum pre-trained length.
147
+ `original_max_position_embeddings` (`int`, *optional*):
148
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
149
+ pretraining.
150
+ `attention_factor` (`float`, *optional*):
151
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
152
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
153
+ `factor` field to infer the suggested value.
154
+ `beta_fast` (`float`, *optional*):
155
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
156
+ ramp function. If unspecified, it defaults to 32.
157
+ `beta_slow` (`float`, *optional*):
158
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
159
+ ramp function. If unspecified, it defaults to 1.
160
+ `short_factor` (`List[float]`, *optional*):
161
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
162
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
163
+ size divided by the number of attention heads divided by 2
164
+ `long_factor` (`List[float]`, *optional*):
165
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
166
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
167
+ size divided by the number of attention heads divided by 2
168
+ `low_freq_factor` (`float`, *optional*):
169
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
170
+ `high_freq_factor` (`float`, *optional*):
171
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
172
+
173
+ ```python
174
+ >>> from transformers import Qwen2VLForConditionalGeneration, Qwen2VLConfig
175
+
176
+ >>> # Initializing a Qwen2VL style configuration
177
+ >>> configuration = Qwen2VLConfig()
178
+
179
+ >>> # Initializing a model from the Qwen2-VL-7B style configuration
180
+ >>> model = Qwen2VLForConditionalGeneration(configuration)
181
+
182
+ >>> # Accessing the model configuration
183
+ >>> configuration = model.config
184
+ ```"""
185
+
186
+ model_type = "qwen2_vl"
187
+ keys_to_ignore_at_inference = ["past_key_values"]
188
+
189
+ def __init__(
190
+ self,
191
+ vocab_size=152064,
192
+ hidden_size=8192,
193
+ intermediate_size=29568,
194
+ num_hidden_layers=80,
195
+ num_attention_heads=64,
196
+ num_key_value_heads=8,
197
+ hidden_act="silu",
198
+ max_position_embeddings=32768,
199
+ initializer_range=0.02,
200
+ rms_norm_eps=1e-05,
201
+ use_cache=True,
202
+ tie_word_embeddings=False,
203
+ rope_theta=1000000.0,
204
+ use_sliding_window=False,
205
+ sliding_window=4096,
206
+ max_window_layers=80,
207
+ attention_dropout=0.0,
208
+ vision_config=None,
209
+ rope_scaling=None,
210
+ **kwargs,
211
+ ):
212
+ if isinstance(vision_config, dict):
213
+ self.vision_config = Qwen2VLVisionConfig(**vision_config)
214
+ elif vision_config is None:
215
+ self.vision_config = Qwen2VLVisionConfig()
216
+
217
+ self.vocab_size = vocab_size
218
+ self.max_position_embeddings = max_position_embeddings
219
+ self.hidden_size = hidden_size
220
+ self.intermediate_size = intermediate_size
221
+ self.num_hidden_layers = num_hidden_layers
222
+ self.num_attention_heads = num_attention_heads
223
+ self.use_sliding_window = use_sliding_window
224
+ self.sliding_window = sliding_window
225
+ self.max_window_layers = max_window_layers
226
+
227
+ # for backward compatibility
228
+ if num_key_value_heads is None:
229
+ num_key_value_heads = num_attention_heads
230
+
231
+ self.num_key_value_heads = num_key_value_heads
232
+ self.hidden_act = hidden_act
233
+ self.initializer_range = initializer_range
234
+ self.rms_norm_eps = rms_norm_eps
235
+ self.use_cache = use_cache
236
+ self.rope_theta = rope_theta
237
+ self.attention_dropout = attention_dropout
238
+ self.rope_scaling = rope_scaling
239
+
240
+ # Validate the correctness of rotary position embeddings parameters
241
+ # BC: if there is a 'type' field, move it to 'rope_type'.
242
+ # and change type from 'mrope' to 'default'
243
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
244
+ if self.rope_scaling["type"] == "mrope":
245
+ self.rope_scaling["type"] = "default"
246
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
247
+ rope_config_validation(self)
248
+
249
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
image_processing_dualvitok.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+
3
+ from transformers.utils import TensorType, is_vision_available, logging
4
+
5
+ from .image_processing_movqgan import MoVQImageProcessor
6
+
7
+ logger = logging.get_logger(__name__)
8
+
9
+
10
+ class DualViTokImageProcessor(MoVQImageProcessor):
11
+ r"""
12
+ Constructs a DualViTok image processor that dynamically resizes images based on the original images.
13
+ This image processor is based on MoVQImageProcessor with spatial_factor of 16.
14
+
15
+ Args:
16
+ do_resize (`bool`, *optional*, defaults to `True`):
17
+ Whether to resize the image's (height, width) dimensions.
18
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
19
+ Resampling filter to use when resizing the image.
20
+ do_rescale (`bool`, *optional*, defaults to `True`):
21
+ Whether to rescale the image by the specified scale `rescale_factor`.
22
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
23
+ Scale factor to use if rescaling the image.
24
+ do_normalize (`bool`, *optional*, defaults to `True`):
25
+ Whether to normalize the image.
26
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
27
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
28
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
29
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
30
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
31
+ Whether to convert the image to RGB.
32
+ min_pixels (`int`, *optional*, defaults to `512 * 512`):
33
+ The min pixels of the image to resize the image.
34
+ max_pixels (`int`, *optional*, defaults to `1024 * 1024`):
35
+ The max pixels of the image to resize the image.
36
+ spatial_factor (`int`, *optional*, defautls to 8):
37
+ The spatial downsample factor the image will be downsampled in feature extracting phase
38
+ """
39
+
40
+ model_input_names = ["pixel_values"]
41
+
42
+ def __init__(
43
+ self,
44
+ *args,
45
+ spatial_factor: int = 16,
46
+ **kwargs,
47
+ ) -> None:
48
+ super().__init__(*args, spatial_factor=spatial_factor, **kwargs)
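With `spatial_factor=16`, the processor snaps each image to dimensions divisible by 16, within the inherited `min_pixels=32 * 32` / `max_pixels=1024 * 1024` budget. A quick sanity-check sketch (assumes the module is importable locally):

```python
import numpy as np
from PIL import Image

processor = DualViTokImageProcessor()
image = Image.fromarray(np.zeros((500, 333, 3), dtype=np.uint8))
pixel_values = processor(image, return_tensors="pt")["pixel_values"]
print(pixel_values.shape)  # torch.Size([1, 3, 496, 336]): 500 -> 496, 333 -> 336
```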
image_processing_movqgan.py ADDED
@@ -0,0 +1,428 @@
"""Image processor class for MoVQ."""


import math
from typing import Dict, List, Optional, Union

import numpy as np

from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.image_transforms import (
    convert_to_rgb,
    resize,
    to_channel_dimension_format,
)
from transformers.image_utils import (
    IMAGENET_STANDARD_MEAN,
    IMAGENET_STANDARD_STD,
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_preprocess_arguments,
)
from transformers.utils import TensorType, is_vision_available, logging


logger = logging.get_logger(__name__)


if is_vision_available():
    from PIL import Image


def smart_resize(
    height: int, width: int, factor: int = 8, min_pixels: int = 512 * 512, max_pixels: int = 1024 * 1024
):
    """Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.

    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].

    3. The aspect ratio of the image is maintained as closely as possible.

    """
    # if height < factor or width < factor:
    #     raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
    # elif max(height, width) / min(height, width) > 5:
    #     raise ValueError(
    #         f"absolute aspect ratio must be smaller than 5, got {max(height, width) / min(height, width)}"
    #     )

    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    if h_bar * w_bar > max_pixels:
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif h_bar * w_bar < min_pixels:
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor

    return max(h_bar, factor), max(w_bar, factor)


class MoVQImageProcessor(BaseImageProcessor):
    r"""
    Constructs a MoVQ image processor that dynamically resizes images based on the original images.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use when resizing the image.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
            Mean to use if normalizing the image. This is a float or a list of floats, one per channel.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
            Standard deviation to use if normalizing the image. This is a float or a list of floats, one per channel.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        min_pixels (`int`, *optional*, defaults to `32 * 32`):
            The minimum number of pixels the image is resized to.
        max_pixels (`int`, *optional*, defaults to `1024 * 1024`):
            The maximum number of pixels the image is resized to.
        spatial_factor (`int`, *optional*, defaults to 8):
            The spatial downsampling factor applied to the image in the feature-extraction phase.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        min_pixels: int = 32 * 32,
        max_pixels: int = 1024 * 1024,
        spatial_factor: int = 8,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.do_resize = do_resize
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
        self.do_convert_rgb = do_convert_rgb
        self.spatial_factor = spatial_factor

    def _preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        spatial_factor: Optional[int] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        output_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
    ):
        """
        Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.

        Args:
            images (`ImageInput`):
                Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Scale factor to use if rescaling the image.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            spatial_factor (`int`, *optional*, defaults to `self.spatial_factor`):
                The spatial downsampling factor applied to the image in the feature-extraction phase.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            output_data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
        """
        spatial_factor = spatial_factor if spatial_factor is not None else self.spatial_factor

        images = make_list_of_images(images)
        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        height, width = get_image_size(images[0], channel_dim=input_data_format)
        resized_height, resized_width = height, width
        processed_images = []
        for image in images:
            if do_resize:
                resized_height, resized_width = smart_resize(
                    height,
                    width,
                    factor=spatial_factor,
                    min_pixels=self.min_pixels,
                    max_pixels=self.max_pixels,
                )
                image = resize(
                    image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
                )

            if do_rescale:
                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)

            if do_normalize:
                image = self.normalize(
                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
                )

            image = to_channel_dimension_format(image, output_data_format, input_channel_dim=input_data_format)
            processed_images.append(image)

        image = np.array(processed_images)
        return image

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        spatial_factor: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        output_data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
    ):
        """
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            spatial_factor (`int`, *optional*, defaults to `self.spatial_factor`):
                The spatial downsampling factor applied to the image in the feature-extraction phase.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            output_data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
        spatial_factor = spatial_factor if spatial_factor is not None else self.spatial_factor

        images = make_list_of_images(images)
        if images is None or not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=self.size,
            resample=resample,
        )

        pixel_values = []
        for image in images:
            norm_image = self._preprocess(
                image,
                do_resize=do_resize,
                resample=resample,
                do_rescale=do_rescale,
                rescale_factor=rescale_factor,
                do_normalize=do_normalize,
                image_mean=image_mean,
                image_std=image_std,
                do_convert_rgb=do_convert_rgb,
                spatial_factor=spatial_factor,
                input_data_format=input_data_format,
                output_data_format=output_data_format,
            )
            pixel_values.extend(norm_image)
        pixel_values = np.array(pixel_values)
        data = {"pixel_values": pixel_values}

        return BatchFeature(data=data, tensor_type=return_tensors)

    def postprocess(
        self,
        images: ImageInput,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        return_tensors: Optional[Union[str, TensorType]] = "PIL.Image.Image",
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Postprocess an image or a batch of image tensors. Postprocess is the reverse of `preprocess`: the
        parameters should be the same as those used in `preprocess`.

        Args:
            images (`ImageInput`):
                Image to postprocess. Expects a single or batch of images with pixel values ranging from -1 to 1.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        rescale_factor = 1 / rescale_factor

        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        image_mean, image_std = self.inverse_meanstd(image_mean, image_std)

        images = make_list_of_images(images)
        if isinstance(images[0], Image.Image):
            return images if len(images) > 1 else images[0]

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        pixel_values = []
        for image in images:
            image = to_numpy_array(image)
            if do_normalize:
                image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)

            if do_rescale:
                image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
                image = image.clip(0, 255).astype(np.uint8)

            if do_normalize and do_rescale and return_tensors == "PIL.Image.Image":
                image = to_channel_dimension_format(image, ChannelDimension.LAST, input_channel_dim=input_data_format)
                pixel_values.append(Image.fromarray(image))
            else:
                pixel_values.extend(image)

        data = {"pixel_values": pixel_values}
        return_tensors = return_tensors if return_tensors != "PIL.Image.Image" else None

        return BatchFeature(data=data, tensor_type=return_tensors)

    def inverse_meanstd(self, image_mean, image_std):
        image_mean = self.to_tuple(image_mean)
        image_std = self.to_tuple(image_std)

        rev_image_mean = tuple(-m / s for m, s in zip(image_mean, image_std))
        rev_image_std = tuple(1 / s for s in image_std)

        return rev_image_mean, rev_image_std

    def to_tuple(self, value, dim=3):
        if isinstance(value, (int, float)):
            return (value,) * dim

        return tuple(value)
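To make the resizing rule concrete, a worked example with `smart_resize`'s own defaults (`factor=8`, `min_pixels=512*512`, `max_pixels=1024*1024`); note `preprocess` actually calls it with the instance's `min_pixels`/`max_pixels`, which default to `32*32`/`1024*1024`:

```python
# A 500x333 image has 166,500 pixels < 512*512 = 262,144, so it is scaled up by
# beta = sqrt(262144 / 166500) ≈ 1.2548 and each side is snapped up to a multiple of 8.
print(smart_resize(500, 333))  # (632, 424); 632 * 424 = 267,968 >= 262,144

# `postprocess` undoes normalization by reusing `normalize` with the parameters from
# `inverse_meanstd`: (x - (-m/s)) / (1/s) = s*x + m, the exact inverse of (x - m)/s.
```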
image_processing_qwen2vit.py ADDED
@@ -0,0 +1,476 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """Image processor class for Qwen2-VL."""
21
+
22
+ import math
23
+ from typing import Dict, List, Optional, Union
24
+
25
+ import numpy as np
26
+ import torch
27
+ from torch import nn
28
+
29
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
30
+ from transformers.image_transforms import (
31
+ convert_to_rgb,
32
+ resize,
33
+ to_channel_dimension_format,
34
+ )
35
+ from .image_utils import (
36
+ OPENAI_CLIP_MEAN,
37
+ OPENAI_CLIP_STD,
38
+ ChannelDimension,
39
+ ImageInput,
40
+ PILImageResampling,
41
+ VideoInput,
42
+ get_image_size,
43
+ infer_channel_dimension_format,
44
+ is_scaled_image,
45
+ is_valid_image,
46
+ make_list_of_images,
47
+ to_numpy_array,
48
+ valid_images,
49
+ validate_preprocess_arguments,
50
+ )
51
+ from transformers.utils import TensorType, is_vision_available, logging
52
+
53
+
54
+ logger = logging.get_logger(__name__)
55
+
56
+
57
+ if is_vision_available():
58
+ from PIL import Image
59
+
60
+
61
+ def make_batched_images(images) -> List[List[ImageInput]]:
62
+ """
63
+ Accepts images in list or nested list format, and makes a list of images for preprocessing.
64
+
65
+ Args:
66
+ images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
67
+ The input image.
68
+
69
+ Returns:
70
+ list: A list of images.
71
+ """
72
+ if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
73
+ return [img for img_list in images for img in img_list]
74
+
75
+ elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
76
+ return images
77
+
78
+ elif is_valid_image(images):
79
+ return [images]
80
+
81
+ raise ValueError(f"Could not make batched images from {images}")
82
+
83
+
84
+ # Copied from transformers.models.llava_next_video.image_processing_llava_next_video.make_batched_videos
85
+ def make_batched_videos(videos) -> List[VideoInput]:
86
+ if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
87
+ return videos
88
+
89
+ elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
90
+ if isinstance(videos[0], Image.Image):
91
+ return [videos]
92
+ elif len(videos[0].shape) == 4:
93
+ return [list(video) for video in videos]
94
+
95
+ elif is_valid_image(videos) and len(videos.shape) == 4:
96
+ return [list(videos)]
97
+
98
+ raise ValueError(f"Could not make batched video from {videos}")
99
+
100
+
101
+ def smart_resize(
102
+ height: int, width: int, factor: int = 28, min_pixels: int = 56 * 56, max_pixels: int = 14 * 14 * 4 * 1280
103
+ ):
104
+ """Rescales the image so that the following conditions are met:
105
+
106
+ 1. Both dimensions (height and width) are divisible by 'factor'.
107
+
108
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
109
+
110
+ 3. The aspect ratio of the image is maintained as closely as possible.
111
+
112
+ """
113
+ if height < factor or width < factor:
115
+ if height < width:
116
+ h_bar = factor
117
+ w_bar = round(width / height * factor)
118
+ else:
119
+ h_bar = round(height / width * factor)
120
+ w_bar = factor
122
+ height, width = h_bar, w_bar
124
+ elif max(height, width) / min(height, width) > 200:
125
+ raise ValueError(
126
+ f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
127
+ )
128
+ h_bar = round(height / factor) * factor
129
+ w_bar = round(width / factor) * factor
130
+ if h_bar * w_bar > max_pixels:
131
+ beta = math.sqrt((height * width) / max_pixels)
132
+ h_bar = math.floor(height / beta / factor) * factor
133
+ w_bar = math.floor(width / beta / factor) * factor
134
+ elif h_bar * w_bar < min_pixels:
135
+ beta = math.sqrt(min_pixels / (height * width))
136
+ h_bar = math.ceil(height * beta / factor) * factor
137
+ w_bar = math.ceil(width * beta / factor) * factor
138
+ return h_bar, w_bar
139
+
140
+
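
For a concrete feel of the contract documented in the `smart_resize` docstring, a small sketch (assuming `smart_resize` as defined above is in scope; the input shape is illustrative):

```python
# smart_resize with its defaults (factor=28, CLIP-style pixel bounds).
h_bar, w_bar = smart_resize(1000, 500)
assert h_bar % 28 == 0 and w_bar % 28 == 0              # condition 1: divisible by factor
assert 56 * 56 <= h_bar * w_bar <= 14 * 14 * 4 * 1280   # condition 2: pixel count in range
print(h_bar, w_bar)  # 1008 504 -- the 2:1 aspect ratio is preserved (condition 3)
```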
141
+ class Qwen2VLImageProcessor(BaseImageProcessor):
142
+ r"""
143
+ Constructs a Qwen2-VL image processor that dynamically resizes images based on the original images.
144
+
145
+ Args:
146
+ do_resize (`bool`, *optional*, defaults to `True`):
147
+ Whether to resize the image's (height, width) dimensions.
148
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
149
+ Resampling filter to use when resizing the image.
150
+ do_rescale (`bool`, *optional*, defaults to `True`):
151
+ Whether to rescale the image by the specified scale `rescale_factor`.
152
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
153
+ Scale factor to use if rescaling the image.
154
+ do_normalize (`bool`, *optional*, defaults to `True`):
155
+ Whether to normalize the image.
156
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
157
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
158
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
159
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
160
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
161
+ Whether to convert the image to RGB.
162
+ min_pixels (`int`, *optional*, defaults to `56 * 56`):
163
+ The minimum number of pixels allowed in the resized image.
164
+ max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
165
+ The maximum number of pixels allowed in the resized image.
166
+ patch_size (`int`, *optional*, defaults to 14):
167
+ The spatial patch size of the vision encoder.
168
+ temporal_patch_size (`int`, *optional*, defaults to 2):
169
+ The temporal patch size of the vision encoder.
170
+ merge_size (`int`, *optional*, defaults to 2):
171
+ The spatial merge factor applied when mapping vision-encoder patches to LLM tokens.
172
+ """
173
+
174
+ model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"]
175
+
176
+ def __init__(
177
+ self,
178
+ do_resize: bool = True,
179
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
180
+ do_rescale: bool = True,
181
+ rescale_factor: Union[int, float] = 1 / 255,
182
+ do_normalize: bool = True,
183
+ image_mean: Optional[Union[float, List[float]]] = None,
184
+ image_std: Optional[Union[float, List[float]]] = None,
185
+ do_convert_rgb: bool = True,
186
+ min_pixels: int = 56 * 56,
187
+ max_pixels: int = 28 * 28 * 1280,
188
+ patch_size: int = 14,
189
+ temporal_patch_size: int = 2,
190
+ merge_size: int = 2,
191
+ shifted_patch_tokenize: bool = False,
192
+ **kwargs,
193
+ ) -> None:
194
+ super().__init__(**kwargs)
195
+ self.do_resize = do_resize
196
+ self.resample = resample
197
+ self.do_rescale = do_rescale
198
+ self.rescale_factor = rescale_factor
199
+ self.do_normalize = do_normalize
200
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
201
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
202
+ self.min_pixels = min_pixels
203
+ self.max_pixels = max_pixels
204
+ self.patch_size = patch_size
205
+ self.temporal_patch_size = temporal_patch_size
206
+ self.merge_size = merge_size
207
+ self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
208
+ self.do_convert_rgb = do_convert_rgb
209
+ self.shifted_patch_tokenize = shifted_patch_tokenize
210
+
211
+ def _preprocess(
212
+ self,
213
+ images: Union[ImageInput, VideoInput],
214
+ do_resize: bool = None,
215
+ resample: PILImageResampling = None,
216
+ do_rescale: bool = None,
217
+ rescale_factor: float = None,
218
+ do_normalize: bool = None,
219
+ image_mean: Optional[Union[float, List[float]]] = None,
220
+ image_std: Optional[Union[float, List[float]]] = None,
221
+ do_convert_rgb: bool = None,
222
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
223
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
224
+ ):
225
+ """
226
+ Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
227
+
228
+ Args:
229
+ images (`ImageInput`):
230
+ Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
233
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
234
+ Whether to resize the image.
235
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
236
+ Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
237
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
238
+ Whether to rescale the image.
239
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
240
+ Scale factor to use if rescaling the image.
241
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
242
+ Whether to normalize the image.
243
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
244
+ Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
245
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
246
+ Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
247
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
248
+ Whether to convert the image to RGB.
249
+ data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
250
+ The channel dimension format for the output image. Can be one of:
251
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
252
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
253
+ - Unset: Use the channel dimension format of the input image.
254
+ input_data_format (`ChannelDimension` or `str`, *optional*):
255
+ The channel dimension format for the input image. Can be one of:
256
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
257
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
258
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
259
+ """
264
+ images = make_list_of_images(images)
265
+
266
+ if do_convert_rgb:
267
+ images = [convert_to_rgb(image) for image in images]
268
+
269
+ # All transformations expect numpy arrays.
270
+ images = [to_numpy_array(image) for image in images]
271
+
272
+ if is_scaled_image(images[0]) and do_rescale:
273
+ logger.warning_once(
274
+ "It looks like you are trying to rescale already rescaled images. If the input"
275
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
276
+ )
277
+ if input_data_format is None:
278
+ # We assume that all images have the same channel dimension format.
279
+ input_data_format = infer_channel_dimension_format(images[0])
280
+
281
+ height, width = get_image_size(images[0], channel_dim=input_data_format)
282
+ resized_height, resized_width = height, width
283
+ processed_images = []
284
+ for image in images:
285
+ if do_resize:
286
+ resized_height, resized_width = smart_resize(
287
+ height,
288
+ width,
289
+ factor=self.patch_size * self.merge_size,
290
+ min_pixels=self.min_pixels,
291
+ max_pixels=self.max_pixels,
292
+ )
293
+ image = resize(
294
+ image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
295
+ )
296
+
297
+ if do_rescale:
298
+ image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
299
+
300
+ if do_normalize:
301
+ image = self.normalize(
302
+ image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
303
+ )
304
+
305
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
306
+ processed_images.append(image)
307
+
308
+ patches = np.array(processed_images)
309
+ if data_format == ChannelDimension.LAST:
310
+ patches = patches.transpose(0, 3, 1, 2)
311
+ if patches.shape[0] == 1:
312
+ patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
313
+ channel = patches.shape[1]
314
+ grid_t = patches.shape[0] // self.temporal_patch_size
315
+ grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
316
+ patches = patches.reshape(
317
+ grid_t,
318
+ self.temporal_patch_size,
319
+ channel,
320
+ grid_h // self.merge_size,
321
+ self.merge_size,
322
+ self.patch_size,
323
+ grid_w // self.merge_size,
324
+ self.merge_size,
325
+ self.patch_size,
326
+ )
327
+ patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)
328
+ flatten_patches = patches.reshape(
329
+ grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size
330
+ )
331
+
332
+ return flatten_patches, (grid_t, grid_h, grid_w)
333
+
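
The reshape/transpose pair at the end of `_preprocess` is easiest to follow with concrete shapes. A standalone sketch with illustrative sizes (224x224 RGB, `patch_size=14`, `merge_size=2`, `temporal_patch_size=2`):

```python
import numpy as np

frames = np.zeros((1, 3, 224, 224), dtype=np.float32)   # one image, channels first
frames = np.tile(frames, (2, 1, 1, 1))                  # repeat along time to fill temporal_patch_size=2
grid_t, grid_h, grid_w = 1, 224 // 14, 224 // 14        # (1, 16, 16)
patches = frames.reshape(grid_t, 2, 3, grid_h // 2, 2, 14, grid_w // 2, 2, 14)
patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8)  # group each 2x2 merge window together
flat = patches.reshape(grid_t * grid_h * grid_w, 3 * 2 * 14 * 14)
print(flat.shape)  # (256, 1176): 256 patch tokens, each a 1176-dim vector
```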
334
+ def preprocess(
335
+ self,
336
+ images: ImageInput,
337
+ videos: VideoInput = None,
338
+ do_resize: bool = None,
339
+ size: Dict[str, int] = None,
340
+ resample: PILImageResampling = None,
341
+ do_rescale: bool = None,
342
+ rescale_factor: float = None,
343
+ do_normalize: bool = None,
344
+ image_mean: Optional[Union[float, List[float]]] = None,
345
+ image_std: Optional[Union[float, List[float]]] = None,
346
+ do_convert_rgb: bool = None,
347
+ return_tensors: Optional[Union[str, TensorType]] = None,
348
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
349
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
350
+ ):
351
+ """
352
+ Args:
353
+ images (`ImageInput`):
354
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
355
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
356
+ videos (`VideoInput`):
357
+ Video to preprocess. Expects a single or batch of videos with pixel values ranging from 0 to 255. If
358
+ passing in videos with pixel values between 0 and 1, set `do_rescale=False`.
359
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
360
+ Whether to resize the image.
361
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
362
+ Pixel-count bounds used for resizing, as `{"min_pixels": ..., "max_pixels": ...}`. The image is resized
363
+ so that its total pixel count falls within this range while preserving the aspect ratio.
364
+ resample (`int`, *optional*, defaults to `self.resample`):
365
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
366
+ has an effect if `do_resize` is set to `True`.
367
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
368
+ Whether to rescale the image.
369
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
370
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
371
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
372
+ Whether to normalize the image.
373
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
374
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
375
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
376
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
377
+ `True`.
378
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
379
+ Whether to convert the image to RGB.
380
+ return_tensors (`str` or `TensorType`, *optional*):
381
+ The type of tensors to return. Can be one of:
382
+ - Unset: Return a list of `np.ndarray`.
383
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
384
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
385
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
386
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
387
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
388
+ The channel dimension format for the output image. Can be one of:
389
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
390
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
391
+ - Unset: Use the channel dimension format of the input image.
392
+ input_data_format (`ChannelDimension` or `str`, *optional*):
393
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
394
+ from the input image. Can be one of:
395
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
396
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
397
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
398
+
399
+ """
400
+ do_resize = do_resize if do_resize is not None else self.do_resize
401
+ size = size if size is not None else self.size
402
+ resample = resample if resample is not None else self.resample
403
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
404
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
405
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
406
+ image_mean = image_mean if image_mean is not None else self.image_mean
407
+ image_std = image_std if image_std is not None else self.image_std
408
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
409
+
410
+ if images is not None:
411
+ images = make_batched_images(images)
412
+ if videos is not None:
413
+ videos = make_batched_videos(videos)
414
+
415
+ if images is not None and not valid_images(images):
416
+ raise ValueError(
417
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
418
+ "torch.Tensor, tf.Tensor or jax.ndarray."
419
+ )
420
+
421
+ validate_preprocess_arguments(
422
+ rescale_factor=rescale_factor,
423
+ do_normalize=do_normalize,
424
+ image_mean=image_mean,
425
+ image_std=image_std,
426
+ do_resize=do_resize,
427
+ size=size,
428
+ resample=resample,
429
+ )
430
+
431
+ if images is not None:
432
+ pixel_values, vision_grid_thws = [], []
433
+ for image in images:
434
+ patches, image_grid_thw = self._preprocess(
435
+ image,
436
+ do_resize=do_resize,
437
+ resample=resample,
438
+ do_rescale=do_rescale,
439
+ rescale_factor=rescale_factor,
440
+ do_normalize=do_normalize,
441
+ image_mean=image_mean,
442
+ image_std=image_std,
443
+ data_format=data_format,
444
+ do_convert_rgb=do_convert_rgb,
445
+ input_data_format=input_data_format,
446
+ )
447
+ pixel_values.extend(patches)
448
+ vision_grid_thws.append(image_grid_thw)
449
+ pixel_values = np.array(pixel_values)
450
+ vision_grid_thws = np.array(vision_grid_thws)
451
+ data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
452
+
453
+ if videos is not None:
454
+ pixel_values, vision_grid_thws = [], []
455
+ for images in videos:
456
+ patches, video_grid_thw = self._preprocess(
457
+ images,
458
+ do_resize=do_resize,
459
+ resample=resample,
460
+ do_rescale=do_rescale,
461
+ rescale_factor=rescale_factor,
462
+ do_normalize=do_normalize,
463
+ image_mean=image_mean,
464
+ image_std=image_std,
465
+ data_format=data_format,
466
+ do_convert_rgb=do_convert_rgb,
467
+ input_data_format=input_data_format,
468
+ )
469
+ pixel_values.extend(patches)
470
+ vision_grid_thws.append(video_grid_thw)
471
+ pixel_values = np.array(pixel_values)
472
+ vision_grid_thws = np.array(vision_grid_thws)
473
+ data = {"pixel_values_videos": pixel_values, "video_grid_thw": vision_grid_thws}
474
+
475
+ return BatchFeature(data=data, tensor_type=return_tensors)
476
+
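
A hypothetical end-to-end call of the processor defined above; the random input image and the printed values are illustrative, not taken from this repository:

```python
import numpy as np
from PIL import Image

processor = Qwen2VLImageProcessor()  # defaults: patch_size=14, merge_size=2
img = Image.fromarray(np.random.randint(0, 256, (300, 500, 3), dtype=np.uint8))
out = processor.preprocess(images=img, return_tensors="np")
print(out["pixel_values"].shape)  # (grid_h * grid_w, 1176) flattened patches
print(out["image_grid_thw"])      # e.g. [[1, 22, 36]] for a 300x500 input
```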
image_utils.py ADDED
@@ -0,0 +1,812 @@
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import base64
17
+ import os
18
+ from io import BytesIO
19
+ from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
20
+
21
+ import numpy as np
22
+ import requests
23
+ from packaging import version
24
+
25
+
26
+ from transformers.utils import (
27
+ ExplicitEnum,
28
+ is_jax_tensor,
29
+ is_numpy_array,
30
+ is_tf_tensor,
31
+ is_torch_available,
32
+ is_torch_tensor,
33
+ is_torchvision_available,
34
+ is_vision_available,
35
+ logging,
36
+ requires_backends,
37
+ to_numpy,
38
+ )
39
+ from transformers.utils.constants import ( # noqa: F401
40
+ IMAGENET_DEFAULT_MEAN,
41
+ IMAGENET_DEFAULT_STD,
42
+ IMAGENET_STANDARD_MEAN,
43
+ IMAGENET_STANDARD_STD,
44
+ OPENAI_CLIP_MEAN,
45
+ OPENAI_CLIP_STD,
46
+ )
47
+
48
+
49
+ if is_vision_available():
50
+ import PIL.Image
51
+ import PIL.ImageOps
52
+
53
+ if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"):
54
+ PILImageResampling = PIL.Image.Resampling
55
+ else:
56
+ PILImageResampling = PIL.Image
57
+
58
+ if is_torchvision_available():
59
+ from torchvision.transforms import InterpolationMode
60
+
61
+ pil_torch_interpolation_mapping = {
62
+ PILImageResampling.NEAREST: InterpolationMode.NEAREST,
63
+ PILImageResampling.BOX: InterpolationMode.BOX,
64
+ PILImageResampling.BILINEAR: InterpolationMode.BILINEAR,
65
+ PILImageResampling.HAMMING: InterpolationMode.HAMMING,
66
+ PILImageResampling.BICUBIC: InterpolationMode.BICUBIC,
67
+ PILImageResampling.LANCZOS: InterpolationMode.LANCZOS,
68
+ }
69
+
70
+
71
+ if TYPE_CHECKING:
72
+ if is_torch_available():
73
+ import torch
74
+
75
+
76
+ logger = logging.get_logger(__name__)
77
+
78
+
79
+ ImageInput = Union[
80
+ "PIL.Image.Image", np.ndarray, "torch.Tensor", List["PIL.Image.Image"], List[np.ndarray], List["torch.Tensor"]
81
+ ] # noqa
82
+
83
+
84
+ VideoInput = Union[
85
+ List["PIL.Image.Image"],
86
+ "np.ndarray",
87
+ "torch.Tensor",
88
+ List["np.ndarray"],
89
+ List["torch.Tensor"],
90
+ List[List["PIL.Image.Image"]],
91
+ List[List["np.ndarrray"]],
92
+ List[List["torch.Tensor"]],
93
+ ] # noqa
94
+
95
+
96
+ class ChannelDimension(ExplicitEnum):
97
+ FIRST = "channels_first"
98
+ LAST = "channels_last"
99
+
100
+
101
+ class AnnotationFormat(ExplicitEnum):
102
+ COCO_DETECTION = "coco_detection"
103
+ COCO_PANOPTIC = "coco_panoptic"
104
+
105
+
106
+ class AnnotionFormat(ExplicitEnum):
107
+ COCO_DETECTION = AnnotationFormat.COCO_DETECTION.value
108
+ COCO_PANOPTIC = AnnotationFormat.COCO_PANOPTIC.value
109
+
110
+
111
+ AnnotationType = Dict[str, Union[int, str, List[Dict]]]
112
+
113
+
114
+ def is_pil_image(img):
115
+ return is_vision_available() and isinstance(img, PIL.Image.Image)
116
+
117
+
118
+ class ImageType(ExplicitEnum):
119
+ PIL = "pillow"
120
+ TORCH = "torch"
121
+ NUMPY = "numpy"
122
+ TENSORFLOW = "tensorflow"
123
+ JAX = "jax"
124
+
125
+
126
+ def get_image_type(image):
127
+ if is_pil_image(image):
128
+ return ImageType.PIL
129
+ if is_torch_tensor(image):
130
+ return ImageType.TORCH
131
+ if is_numpy_array(image):
132
+ return ImageType.NUMPY
133
+ if is_tf_tensor(image):
134
+ return ImageType.TENSORFLOW
135
+ if is_jax_tensor(image):
136
+ return ImageType.JAX
137
+ raise ValueError(f"Unrecognised image type {type(image)}")
138
+
139
+
140
+ def is_valid_image(img):
141
+ return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img) or is_tf_tensor(img) or is_jax_tensor(img)
142
+
143
+
144
+ def valid_images(imgs):
145
+ # If we have a list of images, make sure every image is valid
146
+ if isinstance(imgs, (list, tuple)):
147
+ for img in imgs:
148
+ if not valid_images(img):
149
+ return False
150
+ # If not a list or tuple, we have been given a single image or batched tensor of images
151
+ elif not is_valid_image(imgs):
152
+ return False
153
+ return True
154
+
155
+
156
+ def is_batched(img):
157
+ if isinstance(img, (list, tuple)):
158
+ return is_valid_image(img[0])
159
+ return False
160
+
161
+
162
+ def is_scaled_image(image: np.ndarray) -> bool:
163
+ """
164
+ Checks to see whether the pixel values have already been rescaled to [0, 1].
165
+ """
166
+ if image.dtype == np.uint8:
167
+ return False
168
+
169
+ # It's possible the image has pixel values in [0, 255] but is of floating type
170
+ return np.min(image) >= 0 and np.max(image) <= 1
171
+
172
+
173
+ def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]:
174
+ """
175
+ Ensure that the input is a list of images. If the input is a single image, it is converted to a list of length 1.
176
+ If the input is a batch of images, it is converted to a list of images.
177
+
178
+ Args:
179
+ images (`ImageInput`):
180
+ Image or images to turn into a list of images.
181
+ expected_ndims (`int`, *optional*, defaults to 3):
182
+ Expected number of dimensions for a single input image. If the input image has a different number of
183
+ dimensions, an error is raised.
184
+ """
185
+ if is_batched(images):
186
+ return images
187
+
188
+ # If the input is a single image, we create a list of length 1
189
+ if isinstance(images, PIL.Image.Image):
190
+ # PIL images are never batched
191
+ return [images]
192
+
193
+ if is_valid_image(images):
194
+ if images.ndim == expected_ndims + 1:
195
+ # Batch of images
196
+ images = list(images)
197
+ elif images.ndim == expected_ndims:
198
+ # Single image
199
+ images = [images]
200
+ else:
201
+ raise ValueError(
202
+ f"Invalid image shape. Expected either {expected_ndims + 1} or {expected_ndims} dimensions, but got"
203
+ f" {images.ndim} dimensions."
204
+ )
205
+ return images
206
+ raise ValueError(
207
+ "Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or "
208
+ f"jax.ndarray, but got {type(images)}."
209
+ )
210
+
211
+
212
+ def to_numpy_array(img) -> np.ndarray:
213
+ if not is_valid_image(img):
214
+ raise ValueError(f"Invalid image type: {type(img)}")
215
+
216
+ if is_vision_available() and isinstance(img, PIL.Image.Image):
217
+ return np.array(img)
218
+ return to_numpy(img)
219
+
220
+
221
+ def infer_channel_dimension_format(
222
+ image: np.ndarray, num_channels: Optional[Union[int, Tuple[int, ...]]] = None
223
+ ) -> ChannelDimension:
224
+ """
225
+ Infers the channel dimension format of `image`.
226
+
227
+ Args:
228
+ image (`np.ndarray`):
229
+ The image to infer the channel dimension of.
230
+ num_channels (`int` or `Tuple[int, ...]`, *optional*, defaults to `(1, 3)`):
231
+ The number of channels of the image.
232
+
233
+ Returns:
234
+ The channel dimension of the image.
235
+ """
236
+ num_channels = num_channels if num_channels is not None else (1, 3)
237
+ num_channels = (num_channels,) if isinstance(num_channels, int) else num_channels
238
+
239
+ if image.ndim == 3:
240
+ first_dim, last_dim = 0, 2
241
+ elif image.ndim == 4:
242
+ first_dim, last_dim = 1, 3
243
+ else:
244
+ raise ValueError(f"Unsupported number of image dimensions: {image.ndim}")
245
+
246
+ if image.shape[first_dim] in num_channels and image.shape[last_dim] in num_channels:
247
+ logger.warning(
248
+ f"The channel dimension is ambiguous. Got image shape {image.shape}. Assuming channels are the first dimension."
249
+ )
250
+ return ChannelDimension.FIRST
251
+ elif image.shape[first_dim] in num_channels:
252
+ return ChannelDimension.FIRST
253
+ elif image.shape[last_dim] in num_channels:
254
+ return ChannelDimension.LAST
255
+ raise ValueError("Unable to infer channel dimension format")
256
+
257
+
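
A small sketch of how the inference above resolves common shapes (assuming the helpers in this file are in scope):

```python
import numpy as np

chw = np.zeros((3, 224, 224))
hwc = np.zeros((224, 224, 3))
print(infer_channel_dimension_format(chw))  # ChannelDimension.FIRST
print(infer_channel_dimension_format(hwc))  # ChannelDimension.LAST
# An ambiguous shape such as (3, 224, 3) logs a warning and defaults to FIRST.
```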
258
+ def get_channel_dimension_axis(
259
+ image: np.ndarray, input_data_format: Optional[Union[ChannelDimension, str]] = None
260
+ ) -> int:
261
+ """
262
+ Returns the channel dimension axis of the image.
263
+
264
+ Args:
265
+ image (`np.ndarray`):
266
+ The image to get the channel dimension axis of.
267
+ input_data_format (`ChannelDimension` or `str`, *optional*):
268
+ The channel dimension format of the image. If `None`, will infer the channel dimension from the image.
269
+
270
+ Returns:
271
+ The channel dimension axis of the image.
272
+ """
273
+ if input_data_format is None:
274
+ input_data_format = infer_channel_dimension_format(image)
275
+ if input_data_format == ChannelDimension.FIRST:
276
+ return image.ndim - 3
277
+ elif input_data_format == ChannelDimension.LAST:
278
+ return image.ndim - 1
279
+ raise ValueError(f"Unsupported data format: {input_data_format}")
280
+
281
+
282
+ def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = None) -> Tuple[int, int]:
283
+ """
284
+ Returns the (height, width) dimensions of the image.
285
+
286
+ Args:
287
+ image (`np.ndarray`):
288
+ The image to get the dimensions of.
289
+ channel_dim (`ChannelDimension`, *optional*):
290
+ Which dimension the channel dimension is in. If `None`, will infer the channel dimension from the image.
291
+
292
+ Returns:
293
+ A tuple of the image's height and width.
294
+ """
295
+ if channel_dim is None:
296
+ channel_dim = infer_channel_dimension_format(image)
297
+
298
+ if channel_dim == ChannelDimension.FIRST:
299
+ return image.shape[-2], image.shape[-1]
300
+ elif channel_dim == ChannelDimension.LAST:
301
+ return image.shape[-3], image.shape[-2]
302
+ else:
303
+ raise ValueError(f"Unsupported data format: {channel_dim}")
304
+
305
+
306
+ def is_valid_annotation_coco_detection(annotation: Dict[str, Union[List, Tuple]]) -> bool:
307
+ if (
308
+ isinstance(annotation, dict)
309
+ and "image_id" in annotation
310
+ and "annotations" in annotation
311
+ and isinstance(annotation["annotations"], (list, tuple))
312
+ and (
313
+ # an image can have no annotations
314
+ len(annotation["annotations"]) == 0 or isinstance(annotation["annotations"][0], dict)
315
+ )
316
+ ):
317
+ return True
318
+ return False
319
+
320
+
321
+ def is_valid_annotation_coco_panoptic(annotation: Dict[str, Union[List, Tuple]]) -> bool:
322
+ if (
323
+ isinstance(annotation, dict)
324
+ and "image_id" in annotation
325
+ and "segments_info" in annotation
326
+ and "file_name" in annotation
327
+ and isinstance(annotation["segments_info"], (list, tuple))
328
+ and (
329
+ # an image can have no segments
330
+ len(annotation["segments_info"]) == 0 or isinstance(annotation["segments_info"][0], dict)
331
+ )
332
+ ):
333
+ return True
334
+ return False
335
+
336
+
337
+ def valid_coco_detection_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool:
338
+ return all(is_valid_annotation_coco_detection(ann) for ann in annotations)
339
+
340
+
341
+ def valid_coco_panoptic_annotations(annotations: Iterable[Dict[str, Union[List, Tuple]]]) -> bool:
342
+ return all(is_valid_annotation_coco_panoptic(ann) for ann in annotations)
343
+
344
+
345
+ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = None) -> "PIL.Image.Image":
346
+ """
347
+ Loads `image` to a PIL Image.
348
+
349
+ Args:
350
+ image (`str` or `PIL.Image.Image`):
351
+ The image to convert to the PIL Image format.
352
+ timeout (`float`, *optional*):
353
+ The timeout value in seconds for the URL request.
354
+
355
+ Returns:
356
+ `PIL.Image.Image`: A PIL Image.
357
+ """
358
+ requires_backends(load_image, ["vision"])
359
+ if isinstance(image, str):
360
+ if image.startswith("http://") or image.startswith("https://"):
361
+ # We need to actually check for a real protocol, otherwise it's impossible to use a local file
362
+ # like http_huggingface_co.png
363
+ image = PIL.Image.open(BytesIO(requests.get(image, timeout=timeout).content))
364
+ elif os.path.isfile(image):
365
+ image = PIL.Image.open(image)
366
+ else:
367
+ if image.startswith("data:image/"):
368
+ image = image.split(",")[1]
369
+
370
+ # Try to load as base64
371
+ try:
372
+ b64 = base64.decodebytes(image.encode())
373
+ image = PIL.Image.open(BytesIO(b64))
374
+ except Exception as e:
375
+ raise ValueError(
376
+ f"Incorrect image source. Must be a valid URL starting with `http://` or `https://`, a valid path to an image file, or a base64 encoded string. Got {image}. Failed with {e}"
377
+ )
378
+ elif isinstance(image, PIL.Image.Image):
379
+ pass  # already a PIL image
380
+ else:
381
+ raise TypeError(
382
+ "Incorrect format used for image. Should be an url linking to an image, a base64 string, a local path, or a PIL image."
383
+ )
384
+ image = PIL.ImageOps.exif_transpose(image)
385
+ image = image.convert("RGB")
386
+ return image
387
+
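
The three input forms `load_image` accepts, sketched below; the URL and file paths are placeholders, not real assets:

```python
import base64

img_from_url = load_image("https://example.com/cat.png", timeout=5.0)  # fetched over HTTP(S)
img_from_path = load_image("/tmp/cat.png")                             # read from disk
with open("/tmp/cat.png", "rb") as f:
    payload = base64.b64encode(f.read()).decode()
img_from_b64 = load_image("data:image/png;base64," + payload)          # decoded from base64
# All three paths return an EXIF-transposed RGB PIL.Image.Image.
```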
388
+
389
+ def validate_preprocess_arguments(
390
+ do_rescale: Optional[bool] = None,
391
+ rescale_factor: Optional[float] = None,
392
+ do_normalize: Optional[bool] = None,
393
+ image_mean: Optional[Union[float, List[float]]] = None,
394
+ image_std: Optional[Union[float, List[float]]] = None,
395
+ do_pad: Optional[bool] = None,
396
+ size_divisibility: Optional[int] = None,
397
+ do_center_crop: Optional[bool] = None,
398
+ crop_size: Optional[Dict[str, int]] = None,
399
+ do_resize: Optional[bool] = None,
400
+ size: Optional[Dict[str, int]] = None,
401
+ resample: Optional["PILImageResampling"] = None,
402
+ ):
403
+ """
404
+ Checks validity of typically used arguments in an `ImageProcessor` `preprocess` method.
405
+ Raises a `ValueError` if any incompatibility between the arguments is found.
406
+ Many incompatibilities are model-specific. `do_pad` sometimes needs `size_divisor`,
407
+ sometimes `size_divisibility`, and sometimes `size`. New models and processors added should follow
408
+ existing arguments when possible.
409
+
410
+ """
411
+ if do_rescale and rescale_factor is None:
412
+ raise ValueError("`rescale_factor` must be specified if `do_rescale` is `True`.")
413
+
414
+ if do_pad and size_divisibility is None:
415
+ # Here, size_divisor might be passed as the value of size
416
+ raise ValueError(
417
+ "Depending on the model, `size_divisibility`, `size_divisor`, `pad_size` or `size` must be specified if `do_pad` is `True`."
418
+ )
419
+
420
+ if do_normalize and (image_mean is None or image_std is None):
421
+ raise ValueError("`image_mean` and `image_std` must both be specified if `do_normalize` is `True`.")
422
+
423
+ if do_center_crop and crop_size is None:
424
+ raise ValueError("`crop_size` must be specified if `do_center_crop` is `True`.")
425
+
426
+ if do_resize and (size is None or resample is None):
427
+ raise ValueError("`size` and `resample` must be specified if `do_resize` is `True`.")
428
+
429
+
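
A sketch of the validator above rejecting an inconsistent combination (assuming the function is in scope):

```python
try:
    validate_preprocess_arguments(do_rescale=True, rescale_factor=None)
except ValueError as err:
    print(err)  # `rescale_factor` must be specified if `do_rescale` is `True`.
```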
430
+ # In the future we can add a TF implementation here when we have TF models.
431
+ class ImageFeatureExtractionMixin:
432
+ """
433
+ Mixin that contain utilities for preparing image features.
434
+ """
435
+
436
+ def _ensure_format_supported(self, image):
437
+ if not isinstance(image, (PIL.Image.Image, np.ndarray)) and not is_torch_tensor(image):
438
+ raise ValueError(
439
+ f"Got type {type(image)} which is not supported, only `PIL.Image.Image`, `np.array` and "
440
+ "`torch.Tensor` are."
441
+ )
442
+
443
+ def to_pil_image(self, image, rescale=None):
444
+ """
445
+ Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
446
+ needed.
447
+
448
+ Args:
449
+ image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
450
+ The image to convert to the PIL Image format.
451
+ rescale (`bool`, *optional*):
452
+ Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will
453
+ default to `True` if the image type is a floating type, `False` otherwise.
454
+ """
455
+ self._ensure_format_supported(image)
456
+
457
+ if is_torch_tensor(image):
458
+ image = image.numpy()
459
+
460
+ if isinstance(image, np.ndarray):
461
+ if rescale is None:
462
+ # rescale defaults to True when the array is of floating type.
463
+ rescale = isinstance(image.flat[0], np.floating)
464
+ # If the channel has been moved to the first dim, we put it back at the end.
465
+ if image.ndim == 3 and image.shape[0] in [1, 3]:
466
+ image = image.transpose(1, 2, 0)
467
+ if rescale:
468
+ image = image * 255
469
+ image = image.astype(np.uint8)
470
+ return PIL.Image.fromarray(image)
471
+ return image
472
+
473
+ def convert_rgb(self, image):
474
+ """
475
+ Converts `PIL.Image.Image` to RGB format.
476
+
477
+ Args:
478
+ image (`PIL.Image.Image`):
479
+ The image to convert.
480
+ """
481
+ self._ensure_format_supported(image)
482
+ if not isinstance(image, PIL.Image.Image):
483
+ return image
484
+
485
+ return image.convert("RGB")
486
+
487
+ def rescale(self, image: np.ndarray, scale: Union[float, int]) -> np.ndarray:
488
+ """
489
+ Rescale a numpy image by scale amount
490
+ """
491
+ self._ensure_format_supported(image)
492
+ return image * scale
493
+
494
+ def to_numpy_array(self, image, rescale=None, channel_first=True):
495
+ """
496
+ Converts `image` to a numpy array. Optionally rescales it and puts the channel dimension as the first
497
+ dimension.
498
+
499
+ Args:
500
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
501
+ The image to convert to a NumPy array.
502
+ rescale (`bool`, *optional*):
503
+ Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will
504
+ default to `True` if the image is a PIL Image or an array/tensor of integers, `False` otherwise.
505
+ channel_first (`bool`, *optional*, defaults to `True`):
506
+ Whether or not to permute the dimensions of the image to put the channel dimension first.
507
+ """
508
+ self._ensure_format_supported(image)
509
+
510
+ if isinstance(image, PIL.Image.Image):
511
+ image = np.array(image)
512
+
513
+ if is_torch_tensor(image):
514
+ image = image.numpy()
515
+
516
+ rescale = isinstance(image.flat[0], np.integer) if rescale is None else rescale
517
+
518
+ if rescale:
519
+ image = self.rescale(image.astype(np.float32), 1 / 255.0)
520
+
521
+ if channel_first and image.ndim == 3:
522
+ image = image.transpose(2, 0, 1)
523
+
524
+ return image
525
+
526
+ def expand_dims(self, image):
527
+ """
528
+ Expands 2-dimensional `image` to 3 dimensions.
529
+
530
+ Args:
531
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
532
+ The image to expand.
533
+ """
534
+ self._ensure_format_supported(image)
535
+
536
+ # Do nothing if PIL image
537
+ if isinstance(image, PIL.Image.Image):
538
+ return image
539
+
540
+ if is_torch_tensor(image):
541
+ image = image.unsqueeze(0)
542
+ else:
543
+ image = np.expand_dims(image, axis=0)
544
+ return image
545
+
546
+ def normalize(self, image, mean, std, rescale=False):
547
+ """
548
+ Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of `image` to a NumPy array
549
+ if it's a PIL Image.
550
+
551
+ Args:
552
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
553
+ The image to normalize.
554
+ mean (`List[float]` or `np.ndarray` or `torch.Tensor`):
555
+ The mean (per channel) to use for normalization.
556
+ std (`List[float]` or `np.ndarray` or `torch.Tensor`):
557
+ The standard deviation (per channel) to use for normalization.
558
+ rescale (`bool`, *optional*, defaults to `False`):
559
+ Whether or not to rescale the image to be between 0 and 1. If a PIL image is provided, scaling will
560
+ happen automatically.
561
+ """
562
+ self._ensure_format_supported(image)
563
+
564
+ if isinstance(image, PIL.Image.Image):
565
+ image = self.to_numpy_array(image, rescale=True)
566
+ # If the input image is a PIL image, it automatically gets rescaled. If it's another
567
+ # type it may need rescaling.
568
+ elif rescale:
569
+ if isinstance(image, np.ndarray):
570
+ image = self.rescale(image.astype(np.float32), 1 / 255.0)
571
+ elif is_torch_tensor(image):
572
+ image = self.rescale(image.float(), 1 / 255.0)
573
+
574
+ if isinstance(image, np.ndarray):
575
+ if not isinstance(mean, np.ndarray):
576
+ mean = np.array(mean).astype(image.dtype)
577
+ if not isinstance(std, np.ndarray):
578
+ std = np.array(std).astype(image.dtype)
579
+ elif is_torch_tensor(image):
580
+ import torch
581
+
582
+ if not isinstance(mean, torch.Tensor):
583
+ if isinstance(mean, np.ndarray):
584
+ mean = torch.from_numpy(mean)
585
+ else:
586
+ mean = torch.tensor(mean)
587
+ if not isinstance(std, torch.Tensor):
588
+ if isinstance(std, np.ndarray):
589
+ std = torch.from_numpy(std)
590
+ else:
591
+ std = torch.tensor(std)
592
+
593
+ if image.ndim == 3 and image.shape[0] in [1, 3]:
594
+ return (image - mean[:, None, None]) / std[:, None, None]
595
+ else:
596
+ return (image - mean) / std
597
+
598
+ def resize(self, image, size, resample=None, default_to_square=True, max_size=None):
599
+ """
600
+ Resizes `image`. Enforces conversion of input to PIL.Image.
601
+
602
+ Args:
603
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
604
+ The image to resize.
605
+ size (`int` or `Tuple[int, int]`):
606
+ The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be
607
+ matched to this.
608
+
609
+ If `size` is an int and `default_to_square` is `True`, then image will be resized to (size, size). If
610
+ `size` is an int and `default_to_square` is `False`, then smaller edge of the image will be matched to
611
+ this number, i.e., if height > width, then the image will be rescaled to (size * height / width, size).
612
+ resample (`int`, *optional*, defaults to `PILImageResampling.BILINEAR`):
613
+ The filter to use for resampling.
614
+ default_to_square (`bool`, *optional*, defaults to `True`):
615
+ How to convert `size` when it is a single int. If set to `True`, the `size` will be converted to a
616
+ square (`size`,`size`). If set to `False`, will replicate
617
+ [`torchvision.transforms.Resize`](https://pytorch.org/vision/stable/transforms.html#torchvision.transforms.Resize)
618
+ with support for resizing only the smallest edge and providing an optional `max_size`.
619
+ max_size (`int`, *optional*, defaults to `None`):
620
+ The maximum allowed for the longer edge of the resized image: if the longer edge of the image is
621
+ greater than `max_size` after being resized according to `size`, then the image is resized again so
622
+ that the longer edge is equal to `max_size`. As a result, `size` might be overruled, i.e the smaller
623
+ edge may be shorter than `size`. Only used if `default_to_square` is `False`.
624
+
625
+ Returns:
626
+ image: A resized `PIL.Image.Image`.
627
+ """
628
+ resample = resample if resample is not None else PILImageResampling.BILINEAR
629
+
630
+ self._ensure_format_supported(image)
631
+
632
+ if not isinstance(image, PIL.Image.Image):
633
+ image = self.to_pil_image(image)
634
+
635
+ if isinstance(size, list):
636
+ size = tuple(size)
637
+
638
+ if isinstance(size, int) or len(size) == 1:
639
+ if default_to_square:
640
+ size = (size, size) if isinstance(size, int) else (size[0], size[0])
641
+ else:
642
+ width, height = image.size
643
+ # specified size only for the smallest edge
644
+ short, long = (width, height) if width <= height else (height, width)
645
+ requested_new_short = size if isinstance(size, int) else size[0]
646
+
647
+ if short == requested_new_short:
648
+ return image
649
+
650
+ new_short, new_long = requested_new_short, int(requested_new_short * long / short)
651
+
652
+ if max_size is not None:
653
+ if max_size <= requested_new_short:
654
+ raise ValueError(
655
+ f"max_size = {max_size} must be strictly greater than the requested "
656
+ f"size for the smaller edge size = {size}"
657
+ )
658
+ if new_long > max_size:
659
+ new_short, new_long = int(max_size * new_short / new_long), max_size
660
+
661
+ size = (new_short, new_long) if width <= height else (new_long, new_short)
662
+
663
+ return image.resize(size, resample=resample)
664
+
665
+ def center_crop(self, image, size):
666
+ """
667
+ Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to the
668
+ size given, it will be padded (so the returned result has the size asked).
669
+
670
+ Args:
671
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape (n_channels, height, width) or (height, width, n_channels)):
672
+ The image to resize.
673
+ size (`int` or `Tuple[int, int]`):
674
+ The size to which crop the image.
675
+
676
+ Returns:
677
+ new_image: A center cropped `PIL.Image.Image` or `np.ndarray` or `torch.Tensor` of shape: (n_channels,
678
+ height, width).
679
+ """
680
+ self._ensure_format_supported(image)
681
+
682
+ if not isinstance(size, tuple):
683
+ size = (size, size)
684
+
685
+ # PIL Image.size is (width, height) but NumPy array and torch Tensors have (height, width)
686
+ if is_torch_tensor(image) or isinstance(image, np.ndarray):
687
+ if image.ndim == 2:
688
+ image = self.expand_dims(image)
689
+ image_shape = image.shape[1:] if image.shape[0] in [1, 3] else image.shape[:2]
690
+ else:
691
+ image_shape = (image.size[1], image.size[0])
692
+
693
+ top = (image_shape[0] - size[0]) // 2
694
+ bottom = top + size[0] # In case size is odd, (image_shape[0] + size[0]) // 2 won't give the proper result.
695
+ left = (image_shape[1] - size[1]) // 2
696
+ right = left + size[1] # In case size is odd, (image_shape[1] + size[1]) // 2 won't give the proper result.
697
+
698
+ # For PIL Images we have a method to crop directly.
699
+ if isinstance(image, PIL.Image.Image):
700
+ return image.crop((left, top, right, bottom))
701
+
702
+ # Check if image is in (n_channels, height, width) or (height, width, n_channels) format
703
+ channel_first = image.shape[0] in [1, 3]
704
+
705
+ # Transpose (height, width, n_channels) format images
706
+ if not channel_first:
707
+ if isinstance(image, np.ndarray):
708
+ image = image.transpose(2, 0, 1)
709
+ if is_torch_tensor(image):
710
+ image = image.permute(2, 0, 1)
711
+
712
+ # Check if cropped area is within image boundaries
713
+ if top >= 0 and bottom <= image_shape[0] and left >= 0 and right <= image_shape[1]:
714
+ return image[..., top:bottom, left:right]
715
+
716
+ # Otherwise, we may need to pad if the image is too small. Oh joy...
717
+ new_shape = image.shape[:-2] + (max(size[0], image_shape[0]), max(size[1], image_shape[1]))
718
+ if isinstance(image, np.ndarray):
719
+ new_image = np.zeros_like(image, shape=new_shape)
720
+ elif is_torch_tensor(image):
721
+ new_image = image.new_zeros(new_shape)
722
+
723
+ top_pad = (new_shape[-2] - image_shape[0]) // 2
724
+ bottom_pad = top_pad + image_shape[0]
725
+ left_pad = (new_shape[-1] - image_shape[1]) // 2
726
+ right_pad = left_pad + image_shape[1]
727
+ new_image[..., top_pad:bottom_pad, left_pad:right_pad] = image
728
+
729
+ top += top_pad
730
+ bottom += top_pad
731
+ left += left_pad
732
+ right += left_pad
733
+
734
+ new_image = new_image[
735
+ ..., max(0, top) : min(new_image.shape[-2], bottom), max(0, left) : min(new_image.shape[-1], right)
736
+ ]
737
+
738
+ return new_image
739
+
740
+ def flip_channel_order(self, image):
741
+ """
742
+ Flips the channel order of `image` from RGB to BGR, or vice versa. Note that this will trigger a conversion of
743
+ `image` to a NumPy array if it's a PIL Image.
744
+
745
+ Args:
746
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
747
+ The image whose color channels to flip. If `np.ndarray` or `torch.Tensor`, the channel dimension should
748
+ be first.
749
+ """
750
+ self._ensure_format_supported(image)
751
+
752
+ if isinstance(image, PIL.Image.Image):
753
+ image = self.to_numpy_array(image)
754
+
755
+ return image[::-1, :, :]
756
+
757
+ def rotate(self, image, angle, resample=None, expand=0, center=None, translate=None, fillcolor=None):
758
+ """
759
+ Returns a rotated copy of `image`. This method returns a copy of `image`, rotated the given number of degrees
760
+ counter clockwise around its centre.
761
+
762
+ Args:
763
+ image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
764
+ The image to rotate. If `np.ndarray` or `torch.Tensor`, will be converted to `PIL.Image.Image` before
765
+ rotating.
766
+
767
+ Returns:
768
+ image: A rotated `PIL.Image.Image`.
769
+ """
770
+ resample = resample if resample is not None else PIL.Image.NEAREST
771
+
772
+ self._ensure_format_supported(image)
773
+
774
+ if not isinstance(image, PIL.Image.Image):
775
+ image = self.to_pil_image(image)
776
+
777
+ return image.rotate(
778
+ angle, resample=resample, expand=expand, center=center, translate=translate, fillcolor=fillcolor
779
+ )
780
+
781
+
782
+ def validate_annotations(
783
+ annotation_format: AnnotationFormat,
784
+ supported_annotation_formats: Tuple[AnnotationFormat, ...],
785
+ annotations: List[Dict],
786
+ ) -> None:
787
+ if annotation_format not in supported_annotation_formats:
788
+ raise ValueError(f"Unsupported annotation format: {format} must be one of {supported_annotation_formats}")
789
+
790
+ if annotation_format is AnnotationFormat.COCO_DETECTION:
791
+ if not valid_coco_detection_annotations(annotations):
792
+ raise ValueError(
793
+ "Invalid COCO detection annotations. Annotations must a dict (single image) or list of dicts "
794
+ "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
795
+ "being a list of annotations in the COCO format."
796
+ )
797
+
798
+ if annotation_format is AnnotationFormat.COCO_PANOPTIC:
799
+ if not valid_coco_panoptic_annotations(annotations):
800
+ raise ValueError(
801
+ "Invalid COCO panoptic annotations. Annotations must a dict (single image) or list of dicts "
802
+ "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
803
+ "the latter being a list of annotations in the COCO format."
804
+ )
805
+
806
+
807
+ def validate_kwargs(valid_processor_keys: List[str], captured_kwargs: List[str]):
808
+ unused_keys = set(captured_kwargs).difference(set(valid_processor_keys))
809
+ if unused_keys:
810
+ unused_key_str = ", ".join(unused_keys)
811
+ # TODO raise a warning here instead of simply logging?
812
+ logger.warning(f"Unused or unrecognized kwargs: {unused_key_str}.")
modeling_dualvitok.py ADDED
@@ -0,0 +1,653 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sys
5
+ import math
6
+ from typing import Optional, Tuple, Union, List, Callable
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from torch.nn import Module
12
+
13
+ from einops import rearrange, repeat, pack, unpack
14
+ from einx import get_at
15
+
16
+ from torch.utils.checkpoint import checkpoint
17
+ from transformers import AutoImageProcessor
18
+ from transformers.modeling_utils import PreTrainedModel, get_parameter_device, get_parameter_dtype
19
+
20
+ from .configuration_dualvitok import DualViTokConfig
21
+ from .modeling_movqgan import MoVQModel, MoVQEncoder, MoVQDecoder, Decoder
22
+
23
+ from .configuration_qwen2vit import Qwen2VLVisionConfig
24
+ from .modeling_qwen2vit import Qwen2VisionTransformerPretrainedModel, \
25
+ VisionRotaryEmbedding, Qwen2VLBatchVisionBlock
26
+
27
+ try:
28
+ import xformers.ops as xops
29
+
30
+ is_xformers_available = True
31
+ except Exception:
32
+ is_xformers_available = False
33
+
34
+ from packaging import version as _version
+ # Compare versions numerically; plain string comparison misorders e.g. "2.10.0" vs "2.9.0".
+ if _version.parse(torch.__version__) > _version.parse("2.1.2"):
35
+ IS_SDPA_AVAILABLE = True
36
+ else:
37
+ IS_SDPA_AVAILABLE = False
38
+
39
+ cur_dir = os.path.dirname(os.path.abspath(__file__))
40
+ sys.path.append(cur_dir)
41
+
42
+
43
+ # helper functions
44
+
45
+ def exists(v):
46
+ return v is not None
47
+
48
+
49
+ def identity(t):
50
+ return t
51
+
52
+
53
+ def default(v, d):
54
+ return v if exists(v) else d
55
+
56
+
57
+ def pack_one(t, pattern):
58
+ packed, packed_shape = pack([t], pattern)
59
+
60
+ def inverse(out, inv_pattern=None):
61
+ inv_pattern = default(inv_pattern, pattern)
62
+ out, = unpack(out, packed_shape, inv_pattern)
63
+ return out
64
+
65
+ return packed, inverse
66
+
67
+
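
A small sketch of the `pack_one` helper above: it flattens all middle dimensions and hands back a closure that restores them:

```python
import torch

t = torch.randn(2, 4, 4, 8)
packed, inverse = pack_one(t, 'b * d')   # packed: (2, 16, 8)
restored = inverse(packed)               # back to (2, 4, 4, 8)
assert restored.shape == t.shape
```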
68
+ # class
69
+
70
+
71
+ class SimVQ(Module):
72
+ def __init__(
73
+ self,
74
+ dim,
75
+ codebook_size,
76
+ codebook_transform: Module | None = None,
77
+ init_fn: Callable = identity,
78
+ channel_first=True,
79
+ input_to_quantize_commit_loss_weight=0.25,
80
+ commitment_weight=1.,
81
+ frozen_codebook_dim=None  # the frozen codebook may have a different dimensionality than the projection
82
+ ):
83
+ super().__init__()
84
+ self.codebook_size = codebook_size
85
+ self.channel_first = channel_first
86
+
87
+ frozen_codebook_dim = default(frozen_codebook_dim, dim)
88
+ codebook = torch.randn(codebook_size, frozen_codebook_dim) * (frozen_codebook_dim ** -0.5)
89
+ codebook = init_fn(codebook)
90
+
91
+ # the codebook is actually implicit from a linear layer from frozen gaussian or uniform
92
+
93
+ if not exists(codebook_transform):
94
+ codebook_transform = nn.Linear(frozen_codebook_dim, dim, bias=False)
95
+
96
+ self.code_transform = codebook_transform
97
+
98
+ self.register_buffer('frozen_codebook', codebook)
99
+
100
+ # commit loss weighting - weighing input to quantize a bit less is crucial for it to work
101
+ self.input_to_quantize_commit_loss_weight = input_to_quantize_commit_loss_weight
102
+
103
+ # total commitment loss weight
104
+ self.commitment_weight = commitment_weight
105
+
106
+ @property
107
+ def codebook(self):
108
+ return self.code_transform(self.frozen_codebook)
109
+
110
+ def indices_to_codes(
111
+ self,
112
+ indices
113
+ ):
115
+
116
+ frozen_codes = get_at('[c] d, b ... -> b ... d', self.frozen_codebook, indices)
117
+ quantized = self.code_transform(frozen_codes)
118
+
119
+ if self.channel_first:
120
+ quantized = rearrange(quantized, 'b ... d -> b d ...')
121
+
122
+ return quantized
123
+
124
+ def forward(
125
+ self,
126
+ x
127
+ ):
128
+ if self.channel_first:
129
+ x = rearrange(x, 'b d ... -> b ... d')
130
+
131
+ x, inverse_pack = pack_one(x, 'b * d')
132
+
133
+ implicit_codebook = self.codebook
134
+
135
+ with torch.no_grad():
136
+ dist = torch.cdist(x, implicit_codebook)
137
+ indices = dist.argmin(dim=-1)
138
+
139
+ # select codes
140
+
141
+ quantized = get_at('[c] d, b n -> b n d', implicit_codebook, indices)
142
+
143
+ # commit loss and straight through, as was done in the paper
144
+
145
+ commit_loss = (
146
+ F.mse_loss(x.detach(), quantized) +
147
+ F.mse_loss(x, quantized.detach()) * self.input_to_quantize_commit_loss_weight
148
+ )
149
+
150
+ quantized = (quantized - x).detach() + x
151
+
152
+ quantized = inverse_pack(quantized)
153
+ indices = inverse_pack(indices, 'b *')
154
+
155
+ if self.channel_first:
156
+ quantized = rearrange(quantized, 'b ... d -> b d ...')
157
+
158
+ return quantized, commit_loss * self.commitment_weight, indices
159
+
160
+
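A minimal usage sketch for the SimVQ quantizer above (shapes are illustrative, not from the original file):

    quantizer = SimVQ(dim=32, codebook_size=1024, channel_first=True)
    feats = torch.randn(2, 32, 8, 8)                # [batch, dim, h, w] feature map
    quantized, commit_loss, indices = quantizer(feats)
    assert quantized.shape == feats.shape           # straight-through output
    assert indices.shape == (2, 8, 8)               # one code id per spatial position
    rebuilt = quantizer.indices_to_codes(indices)   # [2, 32, 8, 8] codes from ids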
161
+ def init_weights(m):
162
+ if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.LayerNorm):
163
+ if m.weight is not None:
164
+ nn.init.constant_(m.weight, 1)
165
+ if m.bias is not None:
166
+ nn.init.constant_(m.bias, 0)
167
+ elif isinstance(m, nn.Linear):
168
+ nn.init.xavier_uniform_(m.weight)
169
+ if m.bias is not None:
170
+ nn.init.constant_(m.bias, 0)
171
+ elif isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d) \
172
+ or isinstance(m, nn.Conv3d) or isinstance(m, nn.ConvTranspose3d):
173
+ w = m.weight.data
174
+ nn.init.xavier_uniform_(w)
175
+ if m.bias is not None:
176
+ nn.init.constant_(m.bias, 0)
177
+ elif isinstance(m, nn.Embedding):
178
+ nn.init.normal_(m.weight, mean=0, std=1)
179
+
180
+
181
+ class ScalingLayerForQwen2ViT:
182
+ def __init__(
183
+ self,
184
+ min_pixels: int = 56 * 56,
185
+ max_pixels: int = 28 * 28 * 1280,
186
+ patch_size: int = 14,
187
+ temporal_patch_size: int = 2,
188
+ merge_size: int = 2,
189
+ **kwargs,
190
+ ) -> None:
191
+ super().__init__()  # plain class; forwarding **kwargs to object.__init__ would raise a TypeError
192
+ OPENAI_CLIP_MEAN = torch.as_tensor([0.48145466, 0.4578275, 0.40821073])[None, :, None, None]
193
+ OPENAI_CLIP_STD = torch.as_tensor([0.26862954, 0.26130258, 0.27577711])[None, :, None, None]
194
+
195
+ self.image_mean = OPENAI_CLIP_MEAN
196
+ self.image_std = OPENAI_CLIP_STD
197
+ self.min_pixels = min_pixels
198
+ self.max_pixels = max_pixels
199
+ self.patch_size = patch_size
200
+ self.temporal_patch_size = temporal_patch_size
201
+ self.merge_size = merge_size
202
+ self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels}
203
+
204
+ def __call__(self, images):
205
+ if images.ndim == 4:
206
+ images = images.unsqueeze(1)
207
+ batch_size, temporal, channel, height, width = images.shape
208
+
209
+ factor = self.patch_size * self.merge_size
210
+
211
+ resized_height, resized_width = height // factor * factor, width // factor * factor
212
+
213
+ images = (images + 1) / 2 # rescale to [0, 1.]
214
+
215
+ images = torch.nn.functional.interpolate(
216
+ images.flatten(0, 1).float(),
217
+ size=(resized_height, resized_width),
218
+ mode='bicubic',
219
+ align_corners=False,
220
+ antialias=True
221
+ ).to(images.dtype)
222
+
223
+ images = images.clamp(0, 1) # rescale to [0, 1.]
224
+ images = ((images - self.image_mean.to(images)) / self.image_std.to(images))
225
+
226
+ images = rearrange(images, '(b t) c h w -> b t c h w', b=batch_size, t=temporal)
227
+ if temporal == 1:
228
+ images = images.repeat_interleave(self.temporal_patch_size, dim=1)
229
+ temporal = self.temporal_patch_size
230
+
231
+ grid_t = temporal // self.temporal_patch_size
232
+ grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
233
+
234
+ images = images.reshape(
235
+ batch_size * grid_t,
236
+ self.temporal_patch_size,
237
+ channel,
238
+ -1
239
+ )
240
+
241
+ images = rearrange(images, 'b p c n -> b n (c p)')
242
+ images = images.reshape(
243
+ batch_size * grid_t,
244
+ grid_h // self.merge_size,
245
+ self.merge_size,
246
+ self.patch_size,
247
+ grid_w // self.merge_size,
248
+ self.merge_size,
249
+ self.patch_size,
250
+ -1
251
+ )
252
+ images = rearrange(images, 'b h k s1 w l s2 n -> (b h w k l) (n s1 s2)')
253
+
254
+ return dict(image=images, image_grid_thw=torch.as_tensor([[grid_t, grid_h, grid_w] for _ in range(batch_size)]))
255
+
256
+
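A worked shape example for the scaling layer above (values illustrative): with patch_size=14 and merge_size=2 the resize factor is 28, so a 448x448 input keeps its size and yields a 32x32 patch grid per image.

    layer = ScalingLayerForQwen2ViT()            # defaults: patch 14, merge 2
    imgs = torch.rand(2, 3, 448, 448) * 2 - 1    # inputs are expected in [-1, 1]
    out = layer(imgs)
    out['image'].shape                           # [2*1*32*32, 3*2*14*14] = [2048, 1176]
    out['image_grid_thw']                        # tensor([[1, 32, 32], [1, 32, 32]])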
257
+ class SemanticEncoder(nn.Module):
258
+ def __init__(self,
259
+ semantic_encoder,
260
+ z_channels=4,
261
+ num_blocks=2,
262
+ embed_dim=1280,
263
+ proj_layer='linear',
264
+ attn_implementation='xformers',
265
+ target_mlp='identity',
266
+ ):
267
+ super().__init__()
268
+ self.embed_dim = embed_dim
269
+
270
+ if isinstance(semantic_encoder, str):
271
+ self.model = Qwen2VisionTransformerPretrainedModel.from_pretrained(
272
+ semantic_encoder,
273
+ attn_implementation=attn_implementation
274
+ )
275
+ elif isinstance(semantic_encoder, dict):
276
+ config = Qwen2VLVisionConfig(**semantic_encoder, attn_implementation=attn_implementation)
277
+ self.model = Qwen2VisionTransformerPretrainedModel(config)
278
+ else:
279
+ raise ValueError(f"Invalid semantic_encoder: {semantic_encoder}")
280
+ input_channels = self.model.config.hidden_size
281
+
282
+ for p in self.model.parameters():
283
+ p.requires_grad = False
284
+
285
+ self.proj_in = nn.Conv2d(input_channels, embed_dim, 1, 1) if input_channels != embed_dim else nn.Identity()
286
+
287
+ config = Qwen2VLVisionConfig(depth=num_blocks,
288
+ embed_dim=embed_dim, )
289
+ head_dim = config.embed_dim // config.num_heads
290
+ self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
291
+
292
+ self.blocks = nn.ModuleList(
293
+ [Qwen2VLBatchVisionBlock(config, attn_implementation) for _ in range(num_blocks)]
294
+ )
295
+
296
+ if proj_layer == 'norm_linear':
297
+ self.proj_out = nn.Sequential(
298
+ nn.LayerNorm(embed_dim),
299
+ nn.Linear(
300
+ embed_dim,
301
+ z_channels,
302
+ )
303
+ )
304
+ elif proj_layer == 'linear':
305
+ self.proj_out = nn.Sequential(
306
+ nn.Linear(
307
+ embed_dim,
308
+ z_channels,
309
+ )
310
+ )
311
+ elif proj_layer == 'mlp':
312
+ self.proj_out = nn.Sequential(
313
+ nn.Linear(embed_dim, embed_dim),
314
+ nn.Tanh(),
315
+ nn.Linear(embed_dim, z_channels),
316
+ )
317
+ else:
318
+ raise RuntimeError(f"Wrong proj layer. Got {proj_layer}")
319
+
320
+ if target_mlp == 'identity':
321
+ self.target_mlp = nn.Sequential(
322
+ nn.Identity(),
323
+ )
324
+ elif target_mlp == 'norm':
325
+ self.target_mlp = nn.Sequential(
326
+ nn.LayerNorm(input_channels, eps=1e-6, elementwise_affine=False),
327
+ )
328
+ self.init_weight()
329
+
330
+ def init_weight(self):
331
+ self.proj_in.apply(init_weights)
332
+ self.blocks.apply(init_weights)
333
+ self.proj_out.apply(init_weights)
334
+ self.target_mlp.apply(init_weights)
335
+
336
+ def rot_pos_emb(self, grid_thw, max_seq_len):
337
+ pos_ids = torch.zeros((len(grid_thw), max_seq_len, 2), dtype=torch.long)
338
+ for idx, (t, h, w) in enumerate(grid_thw):
339
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
340
+ hpos_ids = hpos_ids.flatten()
341
+
342
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
343
+ wpos_ids = wpos_ids.flatten()
344
+
345
+ current_pos_ids = torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)
346
+ pos_ids[idx, :current_pos_ids.shape[0]] = current_pos_ids
347
+ max_grid_size = grid_thw[:, 1:].max()
348
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
349
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(2)
350
+ return rotary_pos_emb
351
+
352
+ def forward(self, x, grid_thw):
353
+ x = self.model(x, grid_thw=grid_thw)
354
+
355
+ x = x_target = self.target_mlp(x)
356
+
357
+ x = F.linear(x,
358
+ self.proj_in.weight.view(self.proj_in.weight.shape[0], -1),
359
+ self.proj_in.bias)
360
+
361
+ new_grid_thw = torch.as_tensor([[t, h // 2, w // 2] for t, h, w in grid_thw])
362
+
363
+ seq_lens = [t_i * h_i * w_i for t_i, h_i, w_i in new_grid_thw]
364
+ max_seq_len = max(seq_lens)
365
+
366
+ x = rearrange(x, '(b h w) c -> b (h w) c', h=new_grid_thw[0, 1], w=new_grid_thw[0, 2])
367
+
368
+ rotary_pos_emb = self.rot_pos_emb(new_grid_thw, max_seq_len)
369
+
370
+ for blk in self.blocks:
371
+ x = blk(x, rotary_pos_emb=rotary_pos_emb)
372
+
373
+ x = self.proj_out(x) # [b, max_length, d]
374
+
375
+ t, h, w = new_grid_thw[0]
376
+ b = len(grid_thw)
377
+ x = rearrange(x, 'b (h w) c -> b c h w', b=b, h=h, w=w)
378
+ x_target = rearrange(x_target, '(b h w) c -> b c h w', b=b, h=h, w=w)
379
+ return x, x_target
380
+
381
+
382
+ class SemanticDecoder(nn.Module):
383
+ def __init__(self,
384
+ z_channels=4,
385
+ embed_dim=1280,
386
+ num_blocks=2,
387
+ output_channels=1280,
388
+ attn_implementation='xformers',
389
+ proj_layer='linear_norm'):
390
+ super().__init__()
391
+ self.proj_in = nn.Linear(z_channels, embed_dim)
392
+
393
+ self.output_channels = output_channels
394
+ config = Qwen2VLVisionConfig(depth=num_blocks, embed_dim=embed_dim)
395
+
396
+ self.blocks = nn.ModuleList(
397
+ [Qwen2VLBatchVisionBlock(config, attn_implementation) for _ in range(num_blocks)]
398
+ )
399
+ head_dim = config.embed_dim // config.num_heads
400
+ self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
401
+
402
+ if proj_layer == 'norm_linear':
403
+ self.proj_out = nn.Sequential(
404
+ nn.LayerNorm(embed_dim),
405
+ nn.Linear(embed_dim, output_channels),
406
+ )
407
+ elif proj_layer == 'linear':
408
+ self.proj_out = nn.Sequential(
409
+ nn.Linear(embed_dim, output_channels)
410
+ )
411
+ elif proj_layer == 'mlp':
412
+ self.proj_out = nn.Sequential(
413
+ nn.Linear(embed_dim, embed_dim),
414
+ nn.Tanh(),
415
+ nn.Linear(embed_dim, output_channels),
416
+ )
417
+ elif proj_layer == 'linear_norm':
418
+ self.proj_out = nn.Sequential(
419
+ nn.Linear(embed_dim, output_channels),
420
+ nn.LayerNorm(output_channels),
421
+ )
422
+
423
+ self.apply(init_weights)
424
+
425
+ @property
426
+ def last_layer(self):
427
+ return self.proj_out[-1].weight
428
+
429
+ def rot_pos_emb(self, grid_thw, max_seq_len):
430
+ pos_ids = torch.zeros((len(grid_thw), max_seq_len, 2), dtype=torch.long)
431
+ for idx, (t, h, w) in enumerate(grid_thw):
432
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
433
+ hpos_ids = hpos_ids.flatten()
434
+
435
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
436
+ wpos_ids = wpos_ids.flatten()
437
+
438
+ current_pos_ids = torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1)
439
+ pos_ids[idx, :current_pos_ids.shape[0]] = current_pos_ids
440
+ max_grid_size = grid_thw[:, 1:].max()
441
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
442
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(2)
443
+ return rotary_pos_emb
444
+
445
+ def forward(self, z: torch.Tensor):
446
+ x = z
447
+
448
+ b, c, h, w = x.shape
449
+
450
+ x = rearrange(x, 'b c h w -> b (h w) c')
451
+
452
+ grid_thw = torch.as_tensor([[1, h, w] for _ in range(b)])
453
+ seq_lens = [t * h * w for t, h, w in grid_thw]
454
+ max_seq_len = max(seq_lens)
455
+
456
+ x = self.proj_in(x)
457
+
458
+ rotary_pos_emb = self.rot_pos_emb(grid_thw, max_seq_len)
459
+
460
+ for blk in self.blocks:
461
+ x = blk(x, rotary_pos_emb=rotary_pos_emb)
462
+
463
+ x = self.proj_out(x)
464
+ x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w)
465
+ return x
466
+
467
+
468
+ class DualViTokPretrainModel(PreTrainedModel):
469
+ """
470
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
471
+ models.
472
+ """
473
+
474
+ config_class = DualViTokConfig
475
+ base_model_prefix = "dualvitok"
476
+ main_input_name = "pixel_values"
477
+ _no_split_modules = ["BatchQwen2VLVisionBlock", "MoVQResnetBlock", "MoVQAttnBlock", "MoVQResnetTemporalBlock"]
478
+ _supports_flash_attn_2 = True
479
+ _supports_sdpa = True
480
+ _supports_cache_class = True
481
+ _supports_static_cache = True
482
+
483
+ def _init_weights(self, module):
484
+ if isinstance(module, (nn.Conv2d, nn.Conv3d)):
485
+ nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
486
+ # copied from the `reset_parameters` method of `class Linear(Module)` in `torch`.
487
+ elif isinstance(module, nn.Linear):
488
+ nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
489
+ if module.bias is not None:
490
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
491
+ bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
492
+ nn.init.uniform_(module.bias, -bound, bound)
493
+ elif isinstance(module, (nn.BatchNorm2d, nn.BatchNorm3d, nn.GroupNorm)):
494
+ nn.init.constant_(module.weight, 1)
495
+ nn.init.constant_(module.bias, 0)
496
+
497
+
498
+ class DualViTok(DualViTokPretrainModel):
499
+ def __init__(self, config: DualViTokConfig):
500
+ super().__init__(config)
501
+ self.config = config
502
+
503
+ self._semantic_channel = config.semantic_encoder.z_channels
504
+ self._pixel_channel = config.pixel_encoder.z_channels
505
+
506
+ self.semantic_encoder = SemanticEncoder(
507
+ semantic_encoder=config.semantic_encoder.pretrained_semantic_encoder,
508
+ z_channels=config.semantic_encoder.z_channels,
509
+ num_blocks=config.semantic_encoder.num_blocks,
510
+ embed_dim=config.semantic_encoder.embed_dim,
511
+ proj_layer=config.semantic_encoder.out_layer,
512
+ attn_implementation=config.attn_implementation,
513
+ target_mlp=config.semantic_encoder.target_mlp, )
514
+ self.semantic_decoder = SemanticDecoder(
515
+ z_channels=config.semantic_decoder.z_channels,
516
+ embed_dim=config.semantic_decoder.embed_dim,
517
+ num_blocks=config.semantic_decoder.num_blocks,
518
+ output_channels=config.semantic_decoder.out_channels,
519
+ attn_implementation=config.attn_implementation,
520
+ proj_layer=config.semantic_decoder.out_layer,
521
+ )
522
+
523
+ if config.semantic_quantizer_type.lower() == 'simvq':
524
+ self.semantic_quantizer = SimVQ(
525
+ dim=config.semantic_encoder.z_channels,
526
+ codebook_size=config.semantic_quantizer_codebook_size,
527
+ )
528
+ elif config.semantic_quantizer_type.lower() == 'vq':
529
+ raise NotImplementedError
530
+ self.semantic_quantizer = VQ(
531
+ dim=config.semantic_encoder.z_channels,
532
+ codebook_size=config.semantic_quantizer_codebook_size,
533
+ )
534
+
535
+ self.pixel_encoder = MoVQEncoder(config.pixel_encoder)
536
+ self.pixel_quant_conv = nn.Conv2d(config.pixel_encoder.z_channels, config.pixel_encoder.embed_dim, 1)
537
+
538
+ if config.pixel_quantizer_type.lower() == 'simvq':
539
+ self.pixel_quantizer = SimVQ(
540
+ dim=config.pixel_encoder.z_channels,
541
+ codebook_size=config.pixel_quantizer_codebook_size,
542
+ )
543
+ elif config.pixel_quantizer_type.lower() == 'vq':
544
+ raise NotImplementedError
545
+ self.pixel_quantizer = VQ(
546
+ dim=config.pixel_encoder.z_channels,
547
+ codebook_size=config.pixel_quantizer_codebook_size,
548
+ )
549
+
550
+ self.pixel_post_quant_conv = nn.Conv2d(config.pixel_decoder.embed_dim,
551
+ config.pixel_decoder.z_channels, 1)
552
+
553
+ self.pixel_decoder = MoVQDecoder(config.pixel_decoder)
554
+
555
+ self.scaling_layer = ScalingLayerForQwen2ViT()
556
+
557
+ @property
558
+ def device(self):
559
+ return get_parameter_device(self)
560
+
561
+ @property
562
+ def dtype(self):
563
+ return get_parameter_dtype(self)
564
+
565
+ @property
566
+ def pixel_channel(self):
567
+ return self._pixel_channel
568
+
569
+ @property
570
+ def semantic_channel(self):
571
+ return self._semantic_channel
572
+
573
+ def encode(self, image: torch.FloatTensor):
574
+ scale_output = self.scaling_layer(image)
575
+ image, image_grid_thw, image_gen = scale_output['image'], scale_output['image_grid_thw'], image
576
+
577
+ h_semantic, target_semantic = self.semantic_encoder(image, image_grid_thw)
578
+ quant_semantic, emb_loss_semantic, info_semantic = self.semantic_quantizer(h_semantic.float())
579
+
580
+ h_pixel = self.pixel_encoder(image_gen)
581
+ h_pixel = self.pixel_quant_conv(h_pixel)
582
+
583
+ quant_pixel, emb_loss_pixel, info_pixel = self.pixel_quantizer(h_pixel.float())
584
+
585
+ return (quant_semantic, emb_loss_semantic, info_semantic, target_semantic), \
586
+ (quant_pixel, emb_loss_pixel, info_pixel)
587
+
588
+ def encode_code(self, *args, **kwargs):
589
+ (_, _, semantic_indices, _), \
590
+ (_, _, pixel_indices) = self.encode(*args, **kwargs)
591
+ return semantic_indices, pixel_indices
592
+
593
+ def indices_to_codes(self, semantic_indices, pixel_indices):
594
+ quant_semantic = self.semantic_quantizer.indices_to_codes(semantic_indices)
595
+ quant_pixel = self.pixel_quantizer.indices_to_codes(pixel_indices)
596
+ return quant_semantic, quant_pixel
597
+
598
+ def encode_semantic(self, image: torch.FloatTensor):
599
+ scale_output = self.scaling_layer(image)
600
+ image, image_grid_thw, image_gen = scale_output['image'], scale_output['image_grid_thw'], image
601
+
602
+ h_semantic, target_semantic = self.semantic_encoder(image, image_grid_thw)
603
+ quant_semantic, emb_loss_semantic, info_semantic = self.semantic_quantizer(h_semantic.float())
604
+ return quant_semantic, emb_loss_semantic, info_semantic, target_semantic
605
+
606
+ def merge_quants(self, quant_semantic: torch.Tensor, quant_pixel: torch.Tensor):
607
+ quant_semantic_resized = F.interpolate(
608
+ quant_semantic, quant_pixel.shape[-2:], mode='bicubic'
609
+ ).to(quant_semantic.dtype)
610
+ quant_semantic = quant_semantic_resized
611
+
612
+ quant = torch.cat([quant_semantic, quant_pixel], dim=1)
613
+
614
+ return quant
615
+
616
+ def decode(self, quant_semantic: torch.Tensor, quant_pixel: torch.Tensor):
617
+ quant = self.merge_quants(quant_semantic, quant_pixel)
618
+ quant2 = self.pixel_post_quant_conv(quant)
619
+ x = self.pixel_decoder(quant2, quant)
620
+ return x
621
+
622
+ def decode_code(self, semantic_indices, pixel_indices):
623
+ quant_semantic = self.semantic_quantizer.indices_to_codes(semantic_indices)
624
+ quant_pixel = self.pixel_quantizer.indices_to_codes(pixel_indices)
625
+ return self.decode(quant_semantic, quant_pixel)
626
+
627
+ def decode_semantic(self, x: List[torch.Tensor]):
628
+ return self.semantic_decoder(x)
629
+
630
+ def forward(self, pixel_values: torch.FloatTensor):
631
+ (quant_semantic, diff_semantic, _, target_semantic), \
632
+ (quant_pixel, diff_pixel, _) = self.encode(pixel_values)
633
+ dec = self.decode(quant_semantic, quant_pixel)
634
+ dec_semantic = self.decode_semantic(quant_semantic)
635
+ return (dec_semantic, diff_semantic, target_semantic), (dec, diff_pixel)
636
+
637
+ def build_sdxl_decoder(self, path='ILLUME-MLLM/dualvitok-sdxl-decoder',
638
+ image_processor=None,
639
+ torch_dtype=torch.float16,
640
+ add_watermarker=False,
641
+ device='cuda',
642
+ ):
643
+ from .sdxl_decoder_pipe import StableDiffusionXLDecoderPipeline
644
+
645
+ if image_processor is None:
646
+ image_processor = AutoImageProcessor.from_pretrained('ILLUME-MLLM/dualvitok', trust_remote_code=True)
647
+
648
+ return StableDiffusionXLDecoderPipeline.from_pretrained(path,
649
+ torch_dtype=torch_dtype,
650
+ add_watermarker=add_watermarker,
651
+ vq_model=self,
652
+ vq_image_processor=image_processor).to(device)
653
+
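An end-to-end round-trip sketch for DualViTok (module and checkpoint names are assumptions, not confirmed by this file):

    import torch
    from modeling_dualvitok import DualViTok  # assumed module name

    model = DualViTok.from_pretrained('ILLUME-MLLM/dualvitok').eval()  # assumed repo id
    pixel_values = torch.rand(1, 3, 256, 256) * 2 - 1                  # [-1, 1] range
    with torch.no_grad():
        semantic_ids, pixel_ids = model.encode_code(pixel_values)      # two token streams
        recon = model.decode_code(semantic_ids, pixel_ids)             # reconstructed image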
modeling_movqgan.py ADDED
@@ -0,0 +1,828 @@
1
+ """ MoVQ model """
2
+
3
+ import math
4
+ from typing import Optional, Tuple, Union
5
+
6
+ import torch
7
+ from einops import rearrange, repeat
8
+ from torch import nn
9
+ from torch.nn import functional as F
10
+ from torch.utils.checkpoint import checkpoint
11
+ from transformers.modeling_utils import PreTrainedModel
12
+
13
+ from .configuration_movqgan import MoVQConfig
14
+
15
+ try:
16
+ import xformers.ops as xops
17
+
18
+ is_xformers_available = True
19
+ except Exception as e:
20
+ is_xformers_available = False
21
+
22
+ if torch.__version__ > "2.1.2":
23
+ IS_SDPA_AVAILABLE = True
24
+ else:
25
+ IS_SDPA_AVAILABLE = False
26
+
27
+
28
+ class MoVQActivation(nn.Module):
29
+
30
+ def __init__(self):
31
+ super().__init__()
32
+
33
+ def __call__(self, x: torch.Tensor):
34
+ return x * torch.sigmoid(x)
35
+
36
+
37
+ class MoVQUpsample(nn.Module):
38
+
39
+ def __init__(self, in_channels: int):
40
+ super().__init__()
41
+ self.conv = nn.Conv2d(
42
+ in_channels,
43
+ in_channels,
44
+ kernel_size=3,
45
+ stride=1,
46
+ padding=1,
47
+ )
48
+
49
+ def forward(self, x: torch.Tensor):
50
+ x = F.interpolate(x.float(), scale_factor=2.0, mode="nearest").to(x.dtype)
51
+ x = self.conv(x)
52
+ return x
53
+
54
+
55
+ class DCDownBlock2d(nn.Module):
56
+ def __init__(self, in_channels: int, out_channels: int = None, downsample: bool = True,
57
+ shortcut: bool = True) -> None:
58
+ super().__init__()
59
+ out_channels = out_channels if out_channels else in_channels
60
+
61
+ self.downsample = downsample
62
+ self.factor = 2
63
+ self.stride = 1 if downsample else 2
64
+ self.group_size = in_channels * self.factor ** 2 // out_channels
65
+ self.shortcut = shortcut
66
+
67
+ out_ratio = self.factor ** 2
68
+ if downsample:
69
+ assert out_channels % out_ratio == 0
70
+ out_channels = out_channels // out_ratio
71
+
72
+ self.conv = nn.Conv2d(
73
+ in_channels,
74
+ out_channels,
75
+ kernel_size=3,
76
+ stride=self.stride,
77
+ padding=1,
78
+ )
79
+
80
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
81
+ x = self.conv(hidden_states)
82
+ if self.downsample:
83
+ x = F.pixel_unshuffle(x, self.factor)
84
+
85
+ if self.shortcut:
86
+ y = F.pixel_unshuffle(hidden_states, self.factor)
87
+ y = y.unflatten(1, (-1, self.group_size))
88
+ y = y.mean(dim=2)
89
+ hidden_states = x + y
90
+ else:
91
+ hidden_states = x
92
+
93
+ return hidden_states
94
+
95
+
96
+ class DCUpBlock2d(nn.Module):
97
+ def __init__(
98
+ self,
99
+ in_channels: int,
100
+ out_channels: int = None,
101
+ interpolate: bool = False,
102
+ shortcut: bool = True,
103
+ interpolation_mode: str = "nearest",
104
+ ) -> None:
105
+ super().__init__()
106
+ out_channels = out_channels if out_channels else in_channels
107
+
108
+ self.interpolate = interpolate
109
+ self.interpolation_mode = interpolation_mode
110
+ self.shortcut = shortcut
111
+ self.factor = 2
112
+ self.repeats = out_channels * self.factor ** 2 // in_channels
113
+
114
+ out_ratio = self.factor ** 2
115
+
116
+ if not interpolate:
117
+ out_channels = out_channels * out_ratio
118
+
119
+ self.conv = nn.Conv2d(in_channels, out_channels, 3, 1, 1)
120
+
121
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
122
+ if self.interpolate:
123
+ x = F.interpolate(hidden_states, scale_factor=self.factor, mode=self.interpolation_mode)
124
+ x = self.conv(x)
125
+ else:
126
+ x = self.conv(hidden_states)
127
+ x = F.pixel_shuffle(x, self.factor)
128
+
129
+ if self.shortcut:
130
+ y = hidden_states.repeat_interleave(self.repeats, dim=1)
131
+ y = F.pixel_shuffle(y, self.factor)
132
+ hidden_states = x + y
133
+ else:
134
+ hidden_states = x
135
+
136
+ return hidden_states
137
+
138
+
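A quick shape check for the DC down/up blocks above (channel choices assumed): pixel_unshuffle trades a factor² of spatial resolution for channels, and the averaged/repeated shortcut keeps channel counts aligned.

    down = DCDownBlock2d(in_channels=64, out_channels=128)  # group_size = 64*4 // 128 = 2
    up = DCUpBlock2d(in_channels=128, out_channels=64)      # repeats = 64*4 // 128 = 2
    x = torch.randn(1, 64, 32, 32)
    y = down(x)                      # [1, 128, 16, 16]: conv to 32 ch, unshuffle x4
    assert up(y).shape == x.shape    # shuffle path restores [1, 64, 32, 32]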
139
+ class MoVQDownsample(nn.Module):
140
+
141
+ def __init__(self, in_channels: int):
142
+ super().__init__()
143
+ self.conv = nn.Conv2d(
144
+ in_channels,
145
+ in_channels,
146
+ kernel_size=3,
147
+ stride=2,
148
+ padding=0,
149
+ )
150
+
151
+ def forward(self, x: torch.Tensor):
152
+ pad = (0, 1, 0, 1)
153
+ x = F.pad(x, pad, mode="constant", value=0)
154
+ x = self.conv(x)
155
+ return x
156
+
157
+
158
+ class MoVQSpatialNorm(nn.Module):
159
+
160
+ def __init__(
161
+ self,
162
+ f_channels: int,
163
+ zq_channels: int,
164
+ norm_layer: nn.Module = nn.GroupNorm,
165
+ add_conv: bool = False,
166
+ num_groups: int = 32,
167
+ eps: float = 1e-6,
168
+ affine: bool = True,
169
+ ):
170
+ super().__init__()
171
+ self.norm_layer = norm_layer(
172
+ num_channels=f_channels,
173
+ num_groups=num_groups,
174
+ eps=eps,
175
+ affine=affine,
176
+ )
177
+
178
+ self.add_conv = add_conv
179
+ if self.add_conv:
180
+ self.conv = nn.Conv2d(
181
+ zq_channels,
182
+ zq_channels,
183
+ kernel_size=3,
184
+ stride=1,
185
+ padding=1,
186
+ )
187
+
188
+ self.conv_y = nn.Conv2d(
189
+ zq_channels,
190
+ f_channels,
191
+ kernel_size=1,
192
+ stride=1,
193
+ padding=0,
194
+ )
195
+ self.conv_b = nn.Conv2d(
196
+ zq_channels,
197
+ f_channels,
198
+ kernel_size=1,
199
+ stride=1,
200
+ padding=0,
201
+ )
202
+
203
+ def forward(self, x: torch.Tensor, zq: torch.Tensor):
204
+ zq = F.interpolate(zq.float(), size=x.shape[-2:], mode="nearest").to(zq.dtype)
205
+
206
+ if self.add_conv:
207
+ zq = self.conv(zq)
208
+
209
+ x = self.norm_layer(x)
210
+ x = x * self.conv_y(zq) + self.conv_b(zq)
211
+ return x
212
+
213
+
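In effect the spatial norm above computes x <- GroupNorm(x) * conv_y(zq) + conv_b(zq), with zq first resized to x's spatial size. A minimal sketch (channel sizes assumed):

    norm = MoVQSpatialNorm(f_channels=64, zq_channels=16)
    x = torch.randn(1, 64, 32, 32)
    zq = torch.randn(1, 16, 8, 8)    # upsampled to 32x32 inside forward
    assert norm(x, zq).shape == x.shape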
214
+ class MoVQResnetBlock(nn.Module):
215
+
216
+ def __init__(
217
+ self,
218
+ in_channels: int,
219
+ out_channels: Optional[int] = None,
220
+ conv_shortcut: bool = False,
221
+ dropout: float = 0.0,
222
+ zq_ch: Optional[int] = None,
223
+ add_conv: bool = False,
224
+ ):
225
+ super().__init__()
226
+ self.in_channels = in_channels
227
+ out_channels = in_channels if out_channels is None else out_channels
228
+ self.out_channels = out_channels
229
+ self.use_conv_shortcut = conv_shortcut
230
+ self.zq_ch = zq_ch
231
+
232
+ if zq_ch is None:
233
+ norm_kwargs = dict(num_groups=32, eps=1e-6, affine=True)
234
+ self.norm1 = nn.GroupNorm(num_channels=in_channels, **norm_kwargs)
235
+ self.norm2 = nn.GroupNorm(num_channels=out_channels, **norm_kwargs)
236
+ else:
237
+ self.norm1 = MoVQSpatialNorm(in_channels, zq_ch, add_conv=add_conv)
238
+ self.norm2 = MoVQSpatialNorm(out_channels, zq_ch, add_conv=add_conv)
239
+
240
+ self.conv1 = nn.Conv2d(
241
+ in_channels,
242
+ out_channels,
243
+ kernel_size=3,
244
+ stride=1,
245
+ padding=1,
246
+ )
247
+
248
+ self.dropout = nn.Dropout(dropout)
249
+ self.conv2 = nn.Conv2d(
250
+ out_channels,
251
+ out_channels,
252
+ kernel_size=3,
253
+ stride=1,
254
+ padding=1,
255
+ )
256
+
257
+ self.act = MoVQActivation()
258
+
259
+ if self.in_channels != self.out_channels:
260
+ if self.use_conv_shortcut:
261
+ self.conv_shortcut = nn.Conv2d(
262
+ in_channels,
263
+ out_channels,
264
+ kernel_size=3,
265
+ stride=1,
266
+ padding=1,
267
+ )
268
+ else:
269
+ self.nin_shortcut = nn.Conv2d(
270
+ in_channels,
271
+ out_channels,
272
+ kernel_size=1,
273
+ stride=1,
274
+ padding=0,
275
+ )
276
+
277
+ def forward(self, x: torch.Tensor, zq: Optional[torch.Tensor] = None):
278
+ norm_args = tuple() if self.zq_ch is None else (zq,)
279
+
280
+ h = self.norm1(x, *norm_args)
281
+ h = self.act(h)
282
+ h = self.conv1(h)
283
+
284
+ h = self.norm2(h, *norm_args)
285
+ h = self.act(h)
286
+ h = self.dropout(h)
287
+ h = self.conv2(h)
288
+
289
+ if self.in_channels != self.out_channels:
290
+ if self.use_conv_shortcut:
291
+ x = self.conv_shortcut(x)
292
+ else:
293
+ x = self.nin_shortcut(x)
294
+
295
+ return x + h
296
+
297
+
298
+ class MoVQAttnBlock(nn.Module):
299
+
300
+ def __init__(
301
+ self,
302
+ in_channels: int,
303
+ zq_ch: Optional[int] = None,
304
+ add_conv: bool = False,
305
+ num_heads=1,
306
+ ):
307
+ super().__init__()
308
+ self.in_channels = in_channels
309
+ self.zq_ch = zq_ch
310
+ self.num_heads = num_heads
311
+
312
+ if zq_ch is None:
313
+ norm_kwargs = dict(num_groups=32, eps=1e-6, affine=True)
314
+ self.norm = nn.GroupNorm(num_channels=in_channels, **norm_kwargs)
315
+ else:
316
+ self.norm = MoVQSpatialNorm(in_channels, zq_ch, add_conv=add_conv)
317
+
318
+ self.q = nn.Conv2d(
319
+ in_channels,
320
+ in_channels,
321
+ kernel_size=1,
322
+ stride=1,
323
+ padding=0,
324
+ )
325
+ self.k = nn.Conv2d(
326
+ in_channels,
327
+ in_channels,
328
+ kernel_size=1,
329
+ stride=1,
330
+ padding=0,
331
+ )
332
+ self.v = nn.Conv2d(
333
+ in_channels,
334
+ in_channels,
335
+ kernel_size=1,
336
+ stride=1,
337
+ padding=0,
338
+ )
339
+ self.proj_out = nn.Conv2d(
340
+ in_channels,
341
+ in_channels,
342
+ kernel_size=1,
343
+ stride=1,
344
+ padding=0,
345
+ )
346
+
347
+ def forward(self, x: torch.Tensor, zq: Optional[torch.Tensor] = None):
348
+ # x: [b, c1, h1, w1]
349
+ # zq: [b, c2, h2, w2]
350
+ # attention_mask: [b, 1, h3, w3]
351
+ norm_args = tuple() if self.zq_ch is None else (zq,)
352
+
353
+ # if context is not None:
354
+ # context = F.interpolate(context.float(), size=x.shape[-2:], mode="nearest").to(context.dtype)
355
+ # x = x + self.conv_context(context)
356
+
357
+ nx = self.norm(x, *norm_args)
358
+ q = self.q(nx)
359
+ k = self.k(nx)
360
+ v = self.v(nx)
361
+
362
+ b, c, h, w = q.shape
363
+ if is_xformers_available:
364
+ # If xformers is available, create attn_bias for xops.memory_efficient_attention.
365
+ attn_bias = None
366
+
367
+ v = xops.memory_efficient_attention(
368
+ rearrange(q, 'b (n c) h w -> b (h w) n c', n=self.num_heads).contiguous(),
369
+ rearrange(k, 'b (n c) h w -> b (h w) n c', n=self.num_heads).contiguous(),
370
+ rearrange(v, 'b (n c) h w -> b (h w) n c', n=self.num_heads).contiguous(),
371
+ scale=1.0 / math.sqrt(c // self.num_heads),
372
+ attn_bias=attn_bias,
373
+ )
374
+ v = rearrange(v, 'b (h w) n c -> b (n c) h w', h=h, w=w).contiguous()
375
+ elif IS_SDPA_AVAILABLE:
376
+ # compute attention
377
+ q = rearrange(q, 'b (n c) h w -> b n (h w) c', n=self.num_heads).contiguous()
378
+ k = rearrange(k, 'b (n c) h w -> b n (h w) c', n=self.num_heads).contiguous()
379
+ v = rearrange(v, 'b (n c) h w -> b n (h w) c', n=self.num_heads).contiguous()
380
+
381
+ attn_bias = None
382
+
383
+ v = F.scaled_dot_product_attention(q, k, v, attn_bias, dropout_p=0.0)
384
+ v = v.transpose(1, 2)
385
+ v = rearrange(v, 'b (h w) n c -> b (n c) h w', h=h, w=w)
386
+ else:
387
+ # compute attention
388
+ q = rearrange(q, 'b (n c) h w -> b n c (h w)', n=self.num_heads).contiguous()
389
+ k = rearrange(k, 'b (n c) h w -> b n c (h w)', n=self.num_heads).contiguous()
390
+ v = rearrange(v, 'b (n c) h w -> b n c (h w)', n=self.num_heads).contiguous()
391
+
392
+ # score = torch.bmm(q.permute(0, 2, 1), k)
393
+ score = torch.einsum('b n c k, b n c l -> b n k l', q, k)
394
+ score = score / math.sqrt(c // self.num_heads)
395
+
396
+ score = F.softmax(score, dim=-1)  # normalize over key positions (the l axis)
397
+
398
+ # attend to values
399
+ # v = v.reshape(b, c, h * w)
400
+ # v = torch.bmm(v, score.permute(0, 2, 1))
401
+ v = torch.einsum('b n c l, b n k l -> b n c k', v, score)
402
+ v = v.reshape(b, c, h, w)
403
+
404
+ v = self.proj_out(v)
405
+
406
+ return x + v
407
+
408
+
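The three branches above (xformers, SDPA, manual einsum) are intended to compute the same attention; a small float32 sketch checking manual softmax attention against SDPA:

    q = torch.randn(1, 1, 16, 8)     # [batch, heads, seq, head_dim]
    k = torch.randn(1, 1, 16, 8)
    v = torch.randn(1, 1, 16, 8)
    ref = F.scaled_dot_product_attention(q, k, v)
    attn = torch.softmax(q @ k.transpose(-2, -1) / 8 ** 0.5, dim=-1) @ v
    assert torch.allclose(ref, attn, atol=1e-5)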
409
+ class MoVQVectorQuantizer(nn.Module):
410
+
411
+ def __init__(self, config: MoVQConfig):
412
+ super().__init__()
413
+ self.embedding = nn.Embedding(config.codebook_size, config.embed_dim)
414
+ self.embedding.weight.data.uniform_(-1.0 / config.codebook_size, 1.0 / config.codebook_size)
415
+
416
+ def forward(self, x: torch.Tensor):
417
+ # b t c h w -> b t h w c
418
+ b, t, c, h, w = x.shape
419
+ x = x.permute(0, 1, 3, 4, 2).contiguous()
420
+ x_flattened = x.view(-1, c)
421
+
422
+ codebook = self.embedding.weight
423
+
424
+ d = torch.sum(x_flattened ** 2, dim=1, keepdim=True) + \
425
+ torch.sum(codebook ** 2, dim=1) - 2 * \
426
+ torch.einsum('bd,dn->bn', x_flattened, codebook.permute(1, 0))
427
+
428
+ indices = torch.argmin(d, dim=1)
429
+ indices = indices.view(b, t, h, w)
430
+ return indices
431
+
432
+
433
+ class MoVQPretrainedModel(PreTrainedModel):
434
+ """
435
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
436
+ models.
437
+ """
438
+
439
+ config_class = MoVQConfig
440
+ base_model_prefix = "movq"
441
+ main_input_name = "pixel_values"
442
+ _no_split_modules = ["MoVQResnetBlock", "MoVQAttnBlock"]
443
+
444
+ def _init_weights(self, module):
445
+ if isinstance(module, (nn.Conv2d, nn.Conv3d)):
446
+ nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
447
+ # copied from the `reset_parameters` method of `class Linear(Module)` in `torch`.
448
+ elif isinstance(module, nn.Linear):
449
+ nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
450
+ if module.bias is not None:
451
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
452
+ bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
453
+ nn.init.uniform_(module.bias, -bound, bound)
454
+ elif isinstance(module, (nn.BatchNorm2d, nn.BatchNorm3d, nn.GroupNorm)):
455
+ nn.init.constant_(module.weight, 1)
456
+ nn.init.constant_(module.bias, 0)
457
+
458
+
459
+ class MoVQEncoder(nn.Module):
460
+ def __init__(self, config: MoVQConfig):
461
+ super().__init__()
462
+ self.config = config
463
+ self.ch = config.ch
464
+ self.num_resolutions = len(config.ch_mult)
465
+ self.num_res_blocks = config.num_res_blocks
466
+ self.in_channels = config.in_channels
467
+
468
+ # downsampling
469
+ self.conv_in = nn.Conv2d(
470
+ self.in_channels,
471
+ self.ch,
472
+ kernel_size=3,
473
+ stride=1,
474
+ padding=1
475
+ )
476
+
477
+ in_ch_mult = (1,) + tuple(config.ch_mult)
478
+ self.down = nn.ModuleList()
479
+ for i_level in range(self.num_resolutions):
480
+ block = nn.ModuleList()
481
+ attn = nn.ModuleList()
482
+ block_in = config.ch * in_ch_mult[i_level]
483
+ block_out = config.ch * config.ch_mult[i_level]
484
+ for i_block in range(self.num_res_blocks):
485
+ block.append(
486
+ MoVQResnetBlock(
487
+ in_channels=block_in,
488
+ out_channels=block_out,
489
+ dropout=config.dropout,
490
+ )
491
+ )
492
+ block_in = block_out
493
+ if i_level in config.attn_resolutions:
494
+ attn.append(MoVQAttnBlock(block_in))
495
+
496
+ down = nn.Module()
497
+ down.block = block
498
+ down.attn = attn
499
+ if i_level != self.num_resolutions - 1:
500
+ if config.use_dc_up_down_blocks:
501
+ down.downsample = DCDownBlock2d(block_in)
502
+ else:
503
+ down.downsample = MoVQDownsample(block_in)
504
+
505
+ self.down.append(down)
506
+
507
+ # middle
508
+ self.mid = nn.Module()
509
+ self.mid.block_1 = MoVQResnetBlock(
510
+ in_channels=block_in,
511
+ out_channels=block_in,
512
+ dropout=config.dropout,
513
+ )
514
+ self.mid.attn_1 = MoVQAttnBlock(block_in)
515
+ self.mid.block_2 = MoVQResnetBlock(
516
+ in_channels=block_in,
517
+ out_channels=block_in,
518
+ dropout=config.dropout,
519
+ )
520
+
521
+ # end
522
+
523
+ self.norm_out = nn.GroupNorm(num_channels=block_in, num_groups=32, eps=1e-6, affine=True)
524
+
525
+ self.act = MoVQActivation()
526
+
527
+ out_z_channels = 2 * config.z_channels if config.double_z else config.z_channels
528
+ self.conv_out = nn.Conv2d(
529
+ block_in,
530
+ out_z_channels,
531
+ kernel_size=3,
532
+ stride=1,
533
+ padding=1,
534
+ )
535
+
536
+ self.out_shortcut_average_group_size = block_in // out_z_channels
537
+
538
+ def forward(self, x: torch.Tensor):
539
+
540
+ # downsampling
541
+ h = self.conv_in(x)
542
+ for i_level in range(self.num_resolutions):
543
+ for i_block in range(self.num_res_blocks):
544
+ h = self.down[i_level].block[i_block](h)
545
+ if len(self.down[i_level].attn) > 0:
546
+ h = self.down[i_level].attn[i_block](h)
547
+
548
+ if i_level != self.num_resolutions - 1:
549
+ h = self.down[i_level].downsample(h)
550
+
551
+ h = self.mid.block_1(h)
552
+ h = self.mid.attn_1(h)
553
+ h = self.mid.block_2(h)
554
+
555
+ # end
556
+ h = self.norm_out(h)
557
+ h = self.act(h)
558
+
559
+ if self.config.use_dc_up_down_blocks:
560
+ x = h.unflatten(1, (-1, self.out_shortcut_average_group_size))
561
+ x = x.mean(dim=2)
562
+ h = self.conv_out(h) + x
563
+ else:
564
+ h = self.conv_out(h)
565
+ return h
566
+
567
+
568
+ class MoVQDecoder(nn.Module):
569
+ def __init__(self, config: MoVQConfig):
570
+ super().__init__()
571
+ self.config = config
572
+ self.ch = config.ch
573
+ self.num_resolutions = len(config.ch_mult)
574
+ self.num_res_blocks = config.num_res_blocks
575
+
576
+ in_ch_mult = (1,) + tuple(config.ch_mult)
577
+ zq_ch = config.embed_dim
578
+
579
+ block_in = config.ch * config.ch_mult[-1]
580
+
581
+ self.in_shortcut_repeats = block_in // config.embed_dim
582
+
583
+ self.conv_in = nn.Conv2d(
584
+ config.z_channels,
585
+ block_in,
586
+ kernel_size=3,
587
+ stride=1,
588
+ padding=1,
589
+ )
590
+
591
+ # middle
592
+ self.mid = nn.Module()
593
+ self.mid.block_1 = MoVQResnetBlock(
594
+ in_channels=block_in,
595
+ out_channels=block_in,
596
+ dropout=config.dropout,
597
+ zq_ch=zq_ch,
598
+ )
599
+ self.mid.attn_1 = MoVQAttnBlock(block_in, zq_ch)
600
+ self.mid.block_2 = MoVQResnetBlock(
601
+ in_channels=block_in,
602
+ out_channels=block_in,
603
+ dropout=config.dropout,
604
+ zq_ch=zq_ch,
605
+ )
606
+
607
+ # upsampling
608
+ self.up = nn.ModuleList()
609
+ for i_level in reversed(range(self.num_resolutions)):
610
+ block = nn.ModuleList()
611
+ attn = nn.ModuleList()
612
+ block_out = config.ch * config.ch_mult[i_level]
613
+ for i_block in range(self.num_res_blocks + 1):
614
+ block.append(
615
+ MoVQResnetBlock(
616
+ in_channels=block_in,
617
+ out_channels=block_out,
618
+ dropout=config.dropout,
619
+ zq_ch=zq_ch,
620
+ )
621
+ )
622
+ block_in = block_out
623
+ if i_level in config.attn_resolutions:
624
+ attn.append(MoVQAttnBlock(block_in, zq_ch))
625
+
626
+ up = nn.Module()
627
+ up.block = block
628
+ up.attn = attn
629
+ if i_level != 0:
630
+ if config.use_dc_up_down_blocks:
631
+ up.upsample = DCUpBlock2d(block_in)
632
+ else:
633
+ up.upsample = MoVQUpsample(block_in)
634
+
635
+ self.up.insert(0, up)
636
+
637
+ self.act = MoVQActivation()
638
+
639
+ self.norm_out = MoVQSpatialNorm(block_in, zq_ch)
640
+ self.conv_out = nn.Conv2d(
641
+ block_in,
642
+ config.out_channels,
643
+ kernel_size=3,
644
+ stride=1,
645
+ padding=1,
646
+ )
647
+
648
+ @property
649
+ def last_layer(self):
650
+ return self.conv_out.weight
651
+
652
+ def forward(self, z: torch.Tensor, zq: torch.Tensor):
653
+ h = z
654
+
655
+ if self.config.use_dc_up_down_blocks:
656
+ h = h.repeat_interleave(self.in_shortcut_repeats, dim=1)
657
+ h = self.conv_in(z) + h
658
+ else:
659
+ h = self.conv_in(h)
660
+
661
+ # middle
662
+ h = self.mid.block_1(h, zq)
663
+ h = self.mid.attn_1(h, zq)
664
+ h = self.mid.block_2(h, zq)
665
+
666
+ # upsampling
667
+ for i_level in reversed(range(self.num_resolutions)):
668
+ for i_block in range(self.num_res_blocks + 1):
669
+ h = self.up[i_level].block[i_block](h, zq)
670
+ if len(self.up[i_level].attn) > 0:
671
+ h = self.up[i_level].attn[i_block](h, zq)
672
+
673
+ if i_level != 0:
674
+ h = self.up[i_level].upsample(h)
675
+
676
+ h = self.norm_out(h, zq)
677
+ h = self.act(h)
678
+ h = self.conv_out(h)
679
+
680
+ return h
681
+
682
+
683
+ class Decoder(nn.Module):
684
+ def __init__(self, config: MoVQConfig):
685
+ super().__init__()
686
+ self.config = config
687
+ self.ch = config.ch
688
+ self.num_resolutions = len(config.ch_mult)
689
+ self.num_res_blocks = config.num_res_blocks
690
+
691
+ in_ch_mult = (1,) + tuple(config.ch_mult)
692
+
693
+ block_in = config.ch * config.ch_mult[-1]
694
+
695
+ self.conv_in = nn.Conv2d(
696
+ config.z_channels,
697
+ block_in,
698
+ kernel_size=3,
699
+ stride=1,
700
+ padding=1,
701
+ )
702
+
703
+ # middle
704
+ self.mid = nn.Module()
705
+ self.mid.block_1 = MoVQResnetBlock(
706
+ in_channels=block_in,
707
+ out_channels=block_in,
708
+ dropout=config.dropout,
709
+ )
710
+ self.mid.attn_1 = MoVQAttnBlock(block_in)
711
+ self.mid.block_2 = MoVQResnetBlock(
712
+ in_channels=block_in,
713
+ out_channels=block_in,
714
+ dropout=config.dropout,
715
+ )
716
+
717
+ # upsampling
718
+ self.up = nn.ModuleList()
719
+ for i_level in reversed(range(self.num_resolutions)):
720
+ block = nn.ModuleList()
721
+ attn = nn.ModuleList()
722
+ block_out = config.ch * config.ch_mult[i_level]
723
+ for i_block in range(self.num_res_blocks + 1):
724
+ block.append(
725
+ MoVQResnetBlock(
726
+ in_channels=block_in,
727
+ out_channels=block_out,
728
+ dropout=config.dropout,
729
+ )
730
+ )
731
+ block_in = block_out
732
+ if i_level in config.attn_resolutions:
733
+ attn.append(MoVQAttnBlock(block_in))
734
+
735
+ up = nn.Module()
736
+ up.block = block
737
+ up.attn = attn
738
+ if i_level != 0:
739
+ up.upsample = MoVQUpsample(block_in)
740
+
741
+ self.up.insert(0, up)
742
+
743
+ self.act = MoVQActivation()
744
+
745
+ norm_kwargs = dict(num_groups=32, eps=1e-6, affine=True)
746
+ self.norm_out = nn.GroupNorm(num_channels=block_in, **norm_kwargs)
747
+ self.conv_out = nn.Conv2d(
748
+ block_in,
749
+ config.out_channels,
750
+ kernel_size=3,
751
+ stride=1,
752
+ padding=1,
753
+ )
754
+
755
+ @property
756
+ def last_layer(self):
757
+ return self.conv_out.weight
758
+
759
+ def forward(self, z: torch.Tensor, zq: torch.Tensor):
760
+ h = z
761
+ h = self.conv_in(h)
762
+
763
+ # middle
764
+ h = self.mid.block_1(h)
765
+ h = self.mid.attn_1(h)
766
+ h = self.mid.block_2(h)
767
+
768
+ # upsampling
769
+ for i_level in reversed(range(self.num_resolutions)):
770
+ for i_block in range(self.num_res_blocks + 1):
771
+ h = self.up[i_level].block[i_block](h)
772
+ if len(self.up[i_level].attn) > 0:
773
+ h = self.up[i_level].attn[i_block](h)
774
+
775
+ if i_level != 0:
776
+ h = self.up[i_level].upsample(h)
777
+
778
+ h = self.norm_out(h)
779
+ h = self.act(h)
780
+ h = self.conv_out(h)
781
+
782
+ return h
783
+
784
+
785
+ class MoVQModel(MoVQPretrainedModel):
786
+
787
+ def __init__(self, config):
788
+ super().__init__(config)
789
+ self.config = config
790
+
791
+ self.encoder = MoVQEncoder(config)
792
+ self.decoder = MoVQDecoder(config)
793
+ self.quantize = MoVQVectorQuantizer(config)
794
+
795
+ self.quant_conv = nn.Conv2d(config.z_channels, config.embed_dim, 1)
796
+ self.post_quant_conv = nn.Conv2d(config.embed_dim, config.z_channels, 1)
797
+
798
+ self.spatial_scale_factor = 2 ** (len(config.ch_mult) - 1)
799
+
800
+ self.post_init()
801
+
802
+ def encode(self, x: torch.Tensor):
803
+ h = self.encoder(x)
804
+ h = self.quant_conv(h)
805
+ codes = self.quantize(h)
806
+ return codes
807
+
808
+ def decode(self, x: torch.Tensor):
809
+ b, h, w = x.shape  # assumes a [b, h, w] grid of code indices
810
+ quant = self.quantize.embedding(x.flatten())  # -> [b*h*w, c]; a 2-D tensor cannot unpack as (b, h, w, c)
811
+ quant = quant.view(b, h, w, -1).permute(0, 3, 1, 2).contiguous()
812
+ quant2 = self.post_quant_conv(quant)
813
+ image = self.decoder(quant2, quant)
814
+ image = image.reshape(
815
+ b,
816
+ self.config.out_channels,
817
+ h * self.spatial_scale_factor,
818
+ w * self.spatial_scale_factor,
819
+ )
820
+ return image
821
+
822
+ @property
823
+ def device(self):
824
+ return next(self.parameters()).device
825
+
826
+ @property
827
+ def dtype(self):
828
+ return next(self.parameters()).dtype
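A decode-only sketch for the standalone MoVQ model above (config defaults assumed compatible). Note that encode() feeds the 4-D encoder output to a quantizer written for 5-D [b, t, c, h, w] inputs, so only decode is exercised here:

    config = MoVQConfig()                                      # defaults assumed
    movq = MoVQModel(config).eval()
    ids = torch.randint(0, config.codebook_size, (1, 16, 16))  # [b, h, w] token grid
    with torch.no_grad():
        img = movq.decode(ids)  # [1, out_channels, 16*s, 16*s], s = 2**(len(ch_mult)-1)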
modeling_qwen2vit.py ADDED
@@ -0,0 +1,842 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """PyTorch Qwen2-VL model."""
21
+
22
+ import math
23
+ from dataclasses import dataclass
24
+ from typing import Any, Dict, List, Optional, Tuple, Union
25
+
26
+ from torch import Tensor
27
+
28
+ import torch
29
+ import torch.nn as nn
30
+ import torch.nn.functional as F
31
+ import torch.utils.checkpoint
32
+
33
+ from transformers.activations import ACT2FN
34
+ from transformers.cache_utils import Cache, StaticCache
35
+ from transformers.modeling_attn_mask_utils import (
36
+ AttentionMaskConverter,
37
+ )
38
+ from transformers.modeling_outputs import (
39
+ BaseModelOutputWithPast,
40
+ ModelOutput,
41
+ )
42
+ from transformers.modeling_utils import PreTrainedModel
43
+ from transformers.utils import (
44
+ add_start_docstrings,
45
+ add_start_docstrings_to_model_forward,
46
+ is_torch_npu_available,
47
+ is_flash_attn_2_available,
48
+ is_flash_attn_greater_or_equal_2_10,
49
+ logging,
50
+ replace_return_docstrings,
51
+ )
52
+ from .configuration_qwen2vit import Qwen2VLConfig, Qwen2VLVisionConfig
53
+ from .modeling_rope_utils import ROPE_INIT_FUNCTIONS
54
+
55
+ from einops import rearrange
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+ _CONFIG_FOR_DOC = "Qwen2VLConfig"
60
+
61
+ try:
62
+ import xformers.ops as xops
63
+
64
+ is_xformers_available = True
65
+ except Exception as e:
66
+ is_xformers_available = False
67
+
68
+ if is_flash_attn_2_available():
69
+ from flash_attn import flash_attn_varlen_func
70
+
71
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
72
+ else:
73
+ flash_attn_varlen_func = None
74
+
75
+
76
+ def init_weights(m):
77
+ if isinstance(m, nn.Linear):
78
+ # we use xavier_uniform following official JAX ViT:
79
+ torch.nn.init.xavier_uniform_(m.weight)
80
+ if m.bias is not None:
81
+ nn.init.constant_(m.bias, 0)
82
+ elif isinstance(m, nn.LayerNorm):
83
+ nn.init.constant_(m.bias, 0)
84
+ nn.init.constant_(m.weight, 1.0)
85
+ elif isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
86
+ w = m.weight.data
87
+ torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
88
+
89
+
90
+ @dataclass
91
+ class Qwen2VLCausalLMOutputWithPast(ModelOutput):
92
+ """
93
+ Base class for Qwen2VL causal language model (or autoregressive) outputs.
94
+
95
+ Args:
96
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
97
+ Language modeling loss (for next-token prediction).
98
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
99
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
100
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
101
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
102
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
103
+
104
+ Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
105
+ `past_key_values` input) to speed up sequential decoding.
106
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
107
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
108
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
109
+
110
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
111
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
112
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
113
+ sequence_length)`.
114
+
115
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
116
+ heads.
117
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
118
+ The rope index difference between sequence length and multimodal rope.
119
+ """
120
+
121
+ loss: Optional[torch.FloatTensor] = None
122
+ logits: torch.FloatTensor = None
123
+ past_key_values: Optional[List[torch.FloatTensor]] = None
124
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
125
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
126
+ rope_deltas: Optional[torch.LongTensor] = None
127
+
128
+
129
+ class Qwen2VLRotaryEmbedding(nn.Module):
130
+ def __init__(
131
+ self,
132
+ dim=None,
133
+ max_position_embeddings=2048,
134
+ base=10000,
135
+ device=None,
136
+ scaling_factor=1.0,
137
+ rope_type="default",
138
+ config: Optional[Qwen2VLConfig] = None,
139
+ ):
140
+ super().__init__()
141
+ # TODO (joao): remove the `if` below, only used for BC
142
+ self.rope_kwargs = {}
143
+ if config is None:
144
+ logger.warning_once(
145
+ "`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the "
146
+ "`config` argument. All other arguments will be removed in v4.46"
147
+ )
148
+ self.rope_kwargs = {
149
+ "rope_type": rope_type,
150
+ "factor": scaling_factor,
151
+ "dim": dim,
152
+ "base": base,
153
+ "max_position_embeddings": max_position_embeddings,
154
+ }
155
+ self.rope_type = rope_type
156
+ self.max_seq_len_cached = max_position_embeddings
157
+ self.original_max_seq_len = max_position_embeddings
158
+ else:
159
+ # BC: "rope_type" was originally "type"
160
+ if config.rope_scaling is not None:
161
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
162
+ else:
163
+ self.rope_type = "default"
164
+ self.max_seq_len_cached = config.max_position_embeddings
165
+ self.original_max_seq_len = config.max_position_embeddings
166
+
167
+ self.config = config
168
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
169
+
170
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
171
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
172
+ self.original_inv_freq = self.inv_freq
173
+
174
+ def _dynamic_frequency_update(self, position_ids, device):
175
+ """
176
+ dynamic RoPE layers should recompute `inv_freq` in the following situations:
177
+ 1 - growing beyond the cached sequence length (allow scaling)
178
+ 2 - the current sequence length is in the original scale (avoid losing precision with small sequences)
179
+ """
180
+ seq_len = torch.max(position_ids) + 1
181
+ if seq_len > self.max_seq_len_cached: # growth
182
+ inv_freq, self.attention_scaling = self.rope_init_fn(
183
+ self.config, device, seq_len=seq_len, **self.rope_kwargs
184
+ )
185
+ self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
186
+ self.max_seq_len_cached = seq_len
187
+
188
+ if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset
189
+ self.register_buffer("inv_freq", self.original_inv_freq, persistent=False)
190
+ self.max_seq_len_cached = self.original_max_seq_len
191
+
192
+ @torch.no_grad()
193
+ def forward(self, x, position_ids):
194
+ if "dynamic" in self.rope_type:
195
+ self._dynamic_frequency_update(position_ids, device=x.device)
196
+
197
+ # Core RoPE block. In contrast to other models, Qwen2_VL has different position ids for thw grids
198
+ # So we expand the inv_freq to shape (3, ...)
199
+ inv_freq_expanded = self.inv_freq[None, None, :, None].float().expand(3, position_ids.shape[1], -1, 1)
200
+ position_ids_expanded = position_ids[:, :, None, :].float() # shape (3, bs, 1, positions)
201
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
202
+ device_type = x.device.type
203
+ device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
204
+ with torch.autocast(device_type=device_type, enabled=False):
205
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(2, 3)
206
+ emb = torch.cat((freqs, freqs), dim=-1)
207
+ cos = emb.cos()
208
+ sin = emb.sin()
209
+
210
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
211
+ cos = cos * self.attention_scaling
212
+ sin = sin * self.attention_scaling
213
+
214
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
215
+
216
+
217
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
218
+ def rotate_half(x):
219
+ """Rotates half the hidden dims of the input."""
220
+ x1 = x[..., : x.shape[-1] // 2]
221
+ x2 = x[..., x.shape[-1] // 2:]
222
+ return torch.cat((-x2, x1), dim=-1)
223
+
224
+
225
+ def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
226
+ orig_dtype = tensor.dtype
227
+ tensor = tensor.float()
228
+ cos = freqs.cos()
229
+ sin = freqs.sin()
230
+ cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
231
+ sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
232
+ output = (tensor * cos) + (rotate_half(tensor) * sin)
233
+ output = output.to(orig_dtype)
234
+ return output
235
+
236
+
237
+ def apply_rotary_pos_emb_vision_batch(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
238
+ orig_dtype = tensor.dtype
239
+ tensor = tensor.float()
240
+ cos = freqs.cos()
241
+ sin = freqs.sin()
242
+ cos = cos.repeat(1, 1, 1, 2).float()
243
+ sin = sin.repeat(1, 1, 1, 2).float()
244
+ output = (tensor * cos) + (rotate_half(tensor) * sin)
245
+ output = output.to(orig_dtype)
246
+ return output
247
+
248
+
249
+ class VisionRotaryEmbedding(nn.Module):
250
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
251
+ super().__init__()
252
+ inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
253
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
254
+
255
+ def forward(self, seqlen: int, scale_factor: float = 1.0) -> torch.Tensor:
256
+ # Dynamically rescale inv_freq with scale_factor
257
+ scaled_inv_freq = self.inv_freq * scale_factor
258
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
259
+ freqs = torch.outer(seq, scaled_inv_freq)
260
+ return freqs
261
+
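Usage sketch (editorial; sizes are illustrative): the module returns the outer product of positions and inverse frequencies, one row per position, and `scale_factor` rescales every angle uniformly.

# Minimal sketch: the rotary table has shape (seqlen, dim // 2).
import torch

rope = VisionRotaryEmbedding(dim=8)
freqs = rope(seqlen=5)
assert freqs.shape == (5, 4)
assert torch.allclose(rope(5, scale_factor=0.5), 0.5 * freqs)  # angles scale linearly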
262
+
263
+ class PatchEmbed(nn.Module):
264
+ def __init__(
265
+ self,
266
+ patch_size: int = 14,
267
+ temporal_patch_size: int = 2,
268
+ in_channels: int = 3,
269
+ embed_dim: int = 1152,
270
+ ) -> None:
271
+ super().__init__()
272
+ self.patch_size = patch_size
273
+ self.temporal_patch_size = temporal_patch_size
274
+ self.in_channels = in_channels
275
+ self.embed_dim = embed_dim
276
+
277
+ kernel_size = [temporal_patch_size, patch_size, patch_size]
278
+ self.proj = nn.Conv3d(in_channels, embed_dim, kernel_size=kernel_size, stride=kernel_size, bias=False)
279
+
280
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
281
+ target_dtype = self.proj.weight.dtype
282
+ if is_torch_npu_available():
283
284
+ hidden_states = F.linear(hidden_states, self.proj.weight.view(self.embed_dim, -1))
285
+ else:
286
+ hidden_states = hidden_states.view(
287
+ -1, self.in_channels, self.temporal_patch_size, self.patch_size, self.patch_size
288
+ )
289
+ hidden_states = self.proj(hidden_states.to(dtype=target_dtype)).view(-1, self.embed_dim)
290
+ return hidden_states
291
+
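Shape arithmetic, for reference: each patch carries in_channels * temporal_patch_size * patch_size**2 values, and because the Conv3d stride equals its kernel size it acts as a per-patch linear projection, which is exactly why the NPU branch can substitute F.linear on the flattened weight. A hedged sketch with the default sizes:

# Minimal sketch: 3*2*14*14 = 1176 values per patch in, one 1152-dim embedding out.
import torch

pe = PatchEmbed()                                # defaults: 14, 2, 3, 1152
flat_patches = torch.randn(4, 3 * 2 * 14 * 14)   # 4 flattened patches
assert pe(flat_patches).shape == (4, 1152)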
292
+
293
+ class PatchMerger(nn.Module):
294
+ def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
295
+ super().__init__()
296
+ self.hidden_size = context_dim * (spatial_merge_size ** 2)
297
+ self.ln_q = nn.LayerNorm(context_dim, eps=1e-6)
298
+ self.mlp = nn.Sequential(
299
+ nn.Linear(self.hidden_size, self.hidden_size),
300
+ nn.GELU(),
301
+ nn.Linear(self.hidden_size, dim),
302
+ )
303
+
304
+ def forward(self, x: torch.Tensor, grid_thw) -> torch.Tensor:
305
+ x = self.mlp(self.ln_q(x).view(-1, self.hidden_size))
306
+ return x
307
+
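PatchMerger trades tokens for width: with spatial_merge_size=2, every 2x2 group of patch tokens is folded into one token of width context_dim * 4 before the MLP maps it to `dim`. A hedged sketch (sizes illustrative):

# Minimal sketch: 16 patch tokens -> 4 merged tokens.
import torch

merger = PatchMerger(dim=3584, context_dim=1152, spatial_merge_size=2)
tokens = torch.randn(16, 1152)            # token count must be a multiple of 4
assert merger(tokens, grid_thw=None).shape == (4, 3584)   # grid_thw is unused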
308
+
309
+ class VisionMlp(nn.Module):
310
+ def __init__(self, dim: int, hidden_dim: int, hidden_act: str) -> None:
311
+ super().__init__()
312
+ self.fc1 = nn.Linear(dim, hidden_dim)
313
+ self.act = ACT2FN[hidden_act]
314
+ self.fc2 = nn.Linear(hidden_dim, dim)
315
+
316
+ def forward(self, x) -> torch.Tensor:
317
+ return self.fc2(self.act(self.fc1(x)))
318
+
319
+
320
+ class VisionAttention(nn.Module):
321
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
322
+ super().__init__()
323
+ self.num_heads = num_heads
324
+ self.head_dim = dim // num_heads
325
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
326
+ self.proj = nn.Linear(dim, dim)
327
+
328
+ def forward(
329
+ self,
330
+ hidden_states: torch.Tensor,
331
+ cu_seqlens: torch.Tensor,
332
+ rotary_pos_emb: torch.Tensor = None
333
+ ) -> torch.Tensor:
334
+ seq_length = hidden_states.shape[0]
335
+ q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
336
+ q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
337
+ k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
338
+
339
+ attention_mask = torch.full(
340
+ [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
341
+ )
342
+ for i in range(1, len(cu_seqlens)):
343
+ attention_mask[..., cu_seqlens[i - 1]: cu_seqlens[i], cu_seqlens[i - 1]: cu_seqlens[i]] = 0
344
+
345
+ q = q.transpose(0, 1)
346
+ k = k.transpose(0, 1)
347
+ v = v.transpose(0, 1)
348
+ attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
349
+ attn_weights = attn_weights + attention_mask
350
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
351
+ attn_output = torch.matmul(attn_weights, v)
352
+ attn_output = attn_output.transpose(0, 1)
353
+ attn_output = attn_output.reshape(seq_length, -1)
354
+ attn_output = self.proj(attn_output)
355
+ return attn_output
356
+
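The `cu_seqlens` convention used by all of these packed-attention variants is cumulative sequence lengths with a leading zero, so consecutive entries delimit the images/videos packed into one flat token stream; the loop above zeroes exactly the block-diagonal entries, keeping attention within each image. A small illustration:

# Minimal sketch: cu_seqlens = [0, 3, 5] packs a 3-token and a 2-token image.
import torch

cu_seqlens = torch.tensor([0, 3, 5])
mask = torch.full((1, 5, 5), float("-inf"))
for i in range(1, len(cu_seqlens)):
    mask[..., cu_seqlens[i - 1]:cu_seqlens[i], cu_seqlens[i - 1]:cu_seqlens[i]] = 0
# Tokens 0-2 attend only to 0-2; tokens 3-4 attend only to 3-4.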
357
+
358
+ class BatchVisionAttention(nn.Module):
359
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
360
+ super().__init__()
361
+ self.num_heads = num_heads
362
+ self.head_dim = dim // num_heads
363
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
364
+ self.proj = nn.Linear(dim, dim)
365
+
366
+ def forward(
367
+ self,
368
+ hidden_states: torch.Tensor, # [batch_size, seq_len, dim]
369
+ attention_mask: torch.Tensor, # [batch_size, 1, 1, seq_len]
370
+ rotary_pos_emb: torch.Tensor = None # [batch_size, seq_len, head_dim//2]
371
+ ) -> torch.Tensor:
372
+ batch_size, seq_len, _ = hidden_states.shape
373
+
374
+ q, k, v = self.qkv(hidden_states).reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4).unbind(0)
378
+ # [batch_size, num_heads, seq_len, head_dim]
379
+
380
+ if rotary_pos_emb is not None:
381
+ rotary_pos_emb = rotary_pos_emb.unsqueeze(1) # [batch_size, 1, seq_len, head_dim//2]
382
+ q = apply_rotary_pos_emb_vision_batch(q, rotary_pos_emb)
383
+ k = apply_rotary_pos_emb_vision_batch(k, rotary_pos_emb)
384
+
385
+ attn_weights = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
386
+ if attention_mask is not None:
387
+ attn_weights = attn_weights + attention_mask
388
+
389
+ # Softmax
390
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
391
+
392
+ attn_output = torch.matmul(attn_weights, v) # [batch_size, num_heads, seq_len, head_dim]
393
+ attn_output = attn_output.transpose(1, 2).reshape(batch_size, seq_len, -1)
394
+ return self.proj(attn_output)
395
+
396
+
397
+ class VisionXformerAttention(nn.Module):
398
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
399
+ super().__init__()
400
+ self.num_heads = num_heads
401
+ self.head_dim = dim // num_heads
402
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
403
+ self.proj = nn.Linear(dim, dim)
404
+
405
+ def forward(
406
+ self,
407
+ hidden_states: torch.Tensor,
408
+ cu_seqlens: torch.Tensor,
409
+ rotary_pos_emb: torch.Tensor = None
410
+ ) -> torch.Tensor:
411
+ seq_length = hidden_states.shape[0]
412
+
413
+ q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
414
+
415
+ q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb)
416
+ k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb)
417
+
418
+ seqlens = [cu_seqlens[0]] + [cu_seqlens[i] - cu_seqlens[i - 1] for i in range(1, len(cu_seqlens))]
419
+ attn_bias = xops.fmha.BlockDiagonalMask.from_seqlens(seqlens)
420
+
421
+ attn_output = xops.memory_efficient_attention(
422
+ q, k, v.unsqueeze(0),
423
+ attn_bias=attn_bias,
424
+ scale=1.0 / math.sqrt(self.head_dim)
425
+ )
426
+ attn_output = attn_output.reshape(seq_length, -1)
427
+ attn_output = self.proj(attn_output)
428
+ return attn_output
429
+
430
+
431
+ class BatchVisionXformerAttention(nn.Module):
432
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
433
+ super().__init__()
434
+ self.num_heads = num_heads
435
+ self.head_dim = dim // num_heads
436
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
437
+ self.proj = nn.Linear(dim, dim)
438
+
439
+ def forward(
440
+ self,
441
+ hidden_states: torch.Tensor,
442
+ attention_mask: torch.Tensor, # [batch_size, 1, 1, seq_len]
443
+ rotary_pos_emb: torch.Tensor = None
444
+ ) -> torch.Tensor:
445
+ batch_size, seq_len, _ = hidden_states.shape
447
+
448
+ q, k, v = self.qkv(hidden_states).reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4).unbind(0)
452
+ # [batch_size, num_heads, seq_len, head_dim]
453
+
454
+ if rotary_pos_emb is not None:
455
+ rotary_pos_emb = rotary_pos_emb.unsqueeze(1) # [batch_size, 1, seq_len, head_dim//2]
456
+ q = apply_rotary_pos_emb_vision_batch(q, rotary_pos_emb)
457
+ k = apply_rotary_pos_emb_vision_batch(k, rotary_pos_emb)
458
+
459
+ attn_output = xops.memory_efficient_attention(
460
+ q, k, v,
461
+ attn_bias=attention_mask,
462
+ scale=1.0 / math.sqrt(self.head_dim)
463
+ )
464
+ attn_output = attn_output.reshape(batch_size, seq_len, -1)
465
+ return self.proj(attn_output)
466
+
467
+
468
+ class VisionFlashAttention2(nn.Module):
469
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
470
+ super().__init__()
471
+ self.num_heads = num_heads
472
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
473
+ self.proj = nn.Linear(dim, dim)
474
+
475
+ def forward(
476
+ self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
477
+ ) -> torch.Tensor:
478
+ seq_length = hidden_states.shape[0]
479
+ q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
480
+ q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
481
+ k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
482
+
483
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
484
+ attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
485
+ seq_length, -1
486
+ )
487
+ attn_output = self.proj(attn_output)
488
+ return attn_output
489
+
490
+
491
+ class BatchVisionFlashAttention2(nn.Module):
492
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
493
+ super().__init__()
494
+ self.num_heads = num_heads
495
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
496
+ self.proj = nn.Linear(dim, dim)
497
+
498
+ def forward(
499
+ self, hidden_states: torch.Tensor, attention_mask: torch.Tensor = None, rotary_pos_emb: torch.Tensor = None
500
+ ) -> torch.Tensor:
501
+ batch_size, seq_len, _ = hidden_states.shape
502
+
503
+ q, k, v = self.qkv(hidden_states).reshape(batch_size, seq_len, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4).unbind(0)
505
+
506
+ if rotary_pos_emb is not None:
507
+ rotary_pos_emb = rotary_pos_emb.unsqueeze(1) # [batch_size, 1, seq_len, head_dim//2]
508
+ q = apply_rotary_pos_emb_vision_batch(q, rotary_pos_emb)
509
+ k = apply_rotary_pos_emb_vision_batch(k, rotary_pos_emb)
510
+
511
+ q = rearrange(q, 'b h l d -> b l h d')
512
+ k = rearrange(k, 'b h l d -> b l h d')
513
+ v = rearrange(v, 'b h l d -> b l h d')
514
+
515
+ # `_flash_attention_forward` also takes the padding mask (2D, or None for full attention) and the query length.
+ attn_output = _flash_attention_forward(q, k, v, attention_mask, seq_len, is_causal=False).reshape(batch_size, seq_len, -1)
516
+ attn_output = self.proj(attn_output)
517
+ return attn_output
518
+
519
+
520
+ class VisionSdpaAttention(nn.Module):
521
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
522
+ super().__init__()
523
+ self.num_heads = num_heads
524
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
525
+ self.proj = nn.Linear(dim, dim)
526
+
527
+ def forward(
528
+ self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor, rotary_pos_emb: torch.Tensor = None
529
+ ) -> torch.Tensor:
530
+ seq_length = hidden_states.shape[0]
531
+ q, k, v = self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0)
532
+ q = apply_rotary_pos_emb_vision(q.unsqueeze(0), rotary_pos_emb).squeeze(0)
533
+ k = apply_rotary_pos_emb_vision(k.unsqueeze(0), rotary_pos_emb).squeeze(0)
534
+
535
+ attention_mask = torch.zeros([1, seq_length, seq_length], device=q.device, dtype=torch.bool)
536
+ for i in range(1, len(cu_seqlens)):
537
+ attention_mask[..., cu_seqlens[i - 1]: cu_seqlens[i], cu_seqlens[i - 1]: cu_seqlens[i]] = True
538
+ q = q.transpose(0, 1)
539
+ k = k.transpose(0, 1)
540
+ v = v.transpose(0, 1)
541
+ attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
542
+ attn_output = attn_output.transpose(0, 1)
543
+ attn_output = attn_output.reshape(seq_length, -1)
544
+ attn_output = self.proj(attn_output)
545
+ return attn_output
546
+
547
+
548
+ class BatchVisionSdpaAttention(nn.Module):
549
+ def __init__(self, dim: int, num_heads: int = 16) -> None:
550
+ super().__init__()
551
+ self.num_heads = num_heads
552
+ self.qkv = nn.Linear(dim, dim * 3, bias=True)
553
+ self.proj = nn.Linear(dim, dim)
554
+
555
+ def forward(
556
+ self,
557
+ hidden_states: torch.Tensor, # [batch_size, seq_len, dim]
558
+ attention_mask: torch.Tensor = None, # [batch_size, 1, 1, seq_len]
559
+ rotary_pos_emb: torch.Tensor = None # [batch_size, seq_len, head_dim//2]
560
+ ) -> torch.Tensor:
561
+ batch_size, seq_len, _ = hidden_states.shape
562
+ q, k, v = self.qkv(hidden_states).reshape(batch_size, seq_len, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4).unbind(0)
564
+ # [batch_size, num_heads, seq_len, head_dim]
565
+
566
+ if rotary_pos_emb is not None:
567
+ rotary_pos_emb = rotary_pos_emb.unsqueeze(1) # [batch_size, 1, seq_len, head_dim//2]
568
+ q = apply_rotary_pos_emb_vision_batch(q, rotary_pos_emb)
569
+ k = apply_rotary_pos_emb_vision_batch(k, rotary_pos_emb)
570
+
571
+ attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
572
+ attn_output = attn_output.transpose(1, 2)
573
+ attn_output = attn_output.reshape(batch_size, seq_len, -1)
574
+ attn_output = self.proj(attn_output)
575
+ return attn_output
576
+
577
+
578
+ QWEN2_VL_VISION_ATTENTION_CLASSES = {
579
+ "eager": VisionAttention,
580
+ "flash_attention_2": VisionFlashAttention2,
581
+ "sdpa": VisionSdpaAttention,
582
+ "xformers": VisionXformerAttention,
583
+ }
584
+
585
+ QWEN2_VL_VISION_BATCH_ATTENTION_CLASSES = {
586
+ "eager": BatchVisionAttention,
587
+ "flash_attention_2": VisionFlashAttention2,
588
+ "sdpa": BatchVisionSdpaAttention,
589
+ }
590
+
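These registries follow the usual transformers pattern of dispatching on the configured attention implementation; a hedged usage sketch (sizes illustrative):

# Minimal sketch: resolve an attention backend by name, as the blocks below do.
attn_cls = QWEN2_VL_VISION_ATTENTION_CLASSES["sdpa"]
attn = attn_cls(dim=1152, num_heads=16)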
591
+
592
+ class Qwen2VLVisionBlock(nn.Module):
593
+ def __init__(self, config, attn_implementation: str = "sdpa") -> None:
594
+ super().__init__()
595
+
596
+ self.norm1 = nn.LayerNorm(config.embed_dim, eps=1e-6)
597
+ self.norm2 = nn.LayerNorm(config.embed_dim, eps=1e-6)
598
+ mlp_hidden_dim = int(config.embed_dim * config.mlp_ratio)
599
+
600
+ self.attn = QWEN2_VL_VISION_ATTENTION_CLASSES[attn_implementation](
601
+ config.embed_dim, num_heads=config.num_heads,
602
+ )
603
+ self.mlp = VisionMlp(dim=config.embed_dim, hidden_dim=mlp_hidden_dim, hidden_act=config.hidden_act)
604
+
605
+ def forward(self, hidden_states, cu_seqlens, rotary_pos_emb, grid_thw) -> torch.Tensor:
606
+ hidden_states = hidden_states + self.attn(
607
+ self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
608
+ )
609
+ hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
610
+ return hidden_states
611
+
612
+
613
+ class Qwen2VLBatchVisionBlock(nn.Module):
614
+ def __init__(self, config, attn_implementation: str = "sdpa") -> None:
615
+ super().__init__()
616
+ self.norm1 = nn.LayerNorm(config.embed_dim, eps=1e-6)
617
+ self.norm2 = nn.LayerNorm(config.embed_dim, eps=1e-6)
618
+ mlp_hidden_dim = int(config.embed_dim * config.mlp_ratio)
619
+
620
+ self.attn = QWEN2_VL_VISION_BATCH_ATTENTION_CLASSES[attn_implementation](
621
+ config.embed_dim, num_heads=config.num_heads,
622
+ )
623
+ self.mlp = VisionMlp(config.embed_dim, hidden_dim=mlp_hidden_dim, hidden_act=config.hidden_act)
624
+
625
+ def forward(
626
+ self,
627
+ hidden_states: torch.Tensor, # [batch_size, seq_len, dim]
628
+ attention_mask: torch.Tensor = None, # [batch_size, 1, 1, seq_len]
629
+ rotary_pos_emb: torch.Tensor = None # [batch_size, seq_len, head_dim//2]
630
+ ) -> torch.Tensor:
631
+ # Attention
632
+ hidden_states = hidden_states + self.attn(
633
+ self.norm1(hidden_states), attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb
634
+ )
635
+ # MLP
636
+ hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
637
+ return hidden_states
638
+
639
+
640
+ # Copied from transformers.models.llama.modeling_llama._prepare_4d_causal_attention_mask_with_cache_position
641
+ def _prepare_4d_causal_attention_mask_with_cache_position(
642
+ attention_mask: torch.Tensor,
643
+ sequence_length: int,
644
+ target_length: int,
645
+ dtype: torch.dtype,
646
+ device: torch.device,
647
+ min_dtype: float,
648
+ cache_position: torch.Tensor,
649
+ batch_size: int,
650
+ ):
651
+ """
652
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
653
+ `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
654
+
655
+ Args:
656
+ attention_mask (`torch.Tensor`):
657
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
658
+ sequence_length (`int`):
659
+ The sequence length being processed.
660
+ target_length (`int`):
661
+ The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
662
+ dtype (`torch.dtype`):
663
+ The dtype to use for the 4D attention mask.
664
+ device (`torch.device`):
665
+ The device to place the 4D attention mask on.
666
+ min_dtype (`float`):
667
+ The minimum value representable with the dtype `dtype`.
668
+ cache_position (`torch.Tensor`):
669
+ Indices depicting the position of the input sequence tokens in the sequence.
670
+ batch_size (`int`):
671
+ Batch size.
672
+ """
673
+ if attention_mask is not None and attention_mask.dim() == 4:
674
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
675
+ causal_mask = attention_mask
676
+ else:
677
+ causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
678
+ if sequence_length != 1:
679
+ causal_mask = torch.triu(causal_mask, diagonal=1)
680
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
681
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
682
+ if attention_mask is not None:
683
+ causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
684
+ mask_length = attention_mask.shape[-1]
685
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
686
+ padding_mask = padding_mask == 0
687
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
688
+ padding_mask, min_dtype
689
+ )
690
+
691
+ return causal_mask
692
+
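A toy invocation (editorial sketch): with no padding, the helper reduces to the familiar causal mask, min_dtype above the diagonal and 0 on or below it, broadcast to (batch, 1, q_len, kv_len).

# Minimal sketch: one sequence of 4 tokens, no padding.
import torch

mask = _prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask=torch.ones(1, 4),
    sequence_length=4,
    target_length=4,
    dtype=torch.float32,
    device=torch.device("cpu"),
    min_dtype=torch.finfo(torch.float32).min,
    cache_position=torch.arange(4),
    batch_size=1,
)
assert mask.shape == (1, 1, 4, 4)
assert mask[0, 0, 0, 1] < 0 and mask[0, 0, 3, 0] == 0   # future masked, past visible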
693
+
694
+ # Copied from transformers.models.qwen2.modeling_qwen2.Qwen2RMSNorm
695
+ class Qwen2RMSNorm(nn.Module):
696
+ def __init__(self, hidden_size, eps=1e-6):
697
+ """
698
+ Qwen2RMSNorm is equivalent to T5LayerNorm
699
+ """
700
+ super().__init__()
701
+ self.weight = nn.Parameter(torch.ones(hidden_size))
702
+ self.variance_epsilon = eps
703
+
704
+ def forward(self, hidden_states):
705
+ input_dtype = hidden_states.dtype
706
+ hidden_states = hidden_states.to(torch.float32)
707
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
708
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
709
+ return self.weight * hidden_states.to(input_dtype)
710
+
711
+ def extra_repr(self):
712
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
713
+
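Numerically, this is y = w * x / sqrt(mean(x**2) + eps), with no mean subtraction and no bias, unlike LayerNorm. A quick check against the formula:

# Minimal sketch: Qwen2RMSNorm against its closed form (weight is ones at init).
import torch

norm = Qwen2RMSNorm(hidden_size=4, eps=1e-6)
x = torch.randn(2, 4)
expected = x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
assert torch.allclose(norm(x), expected, atol=1e-6)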
714
+
715
+ # Copied from transformers.models.qwen2.modeling_qwen2.Qwen2MLP
716
+ class Qwen2MLP(nn.Module):
717
+ def __init__(self, config):
718
+ super().__init__()
719
+ self.hidden_size = config.hidden_size
720
+ self.intermediate_size = config.intermediate_size
721
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
722
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
723
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
724
+ self.act_fn = ACT2FN[config.hidden_act]
725
+
726
+ def forward(self, hidden_state):
727
+ return self.down_proj(self.act_fn(self.gate_proj(hidden_state)) * self.up_proj(hidden_state))
728
+
729
+
730
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
731
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
732
+ """
733
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
734
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
735
+ """
736
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
737
+ if n_rep == 1:
738
+ return hidden_states
739
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
740
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
741
+
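A shape-only sketch of `repeat_kv` for grouped-query attention: each key/value head is duplicated n_rep times along the head axis so it can serve n_rep query heads.

# Minimal sketch: 2 KV heads expanded for 8 query heads (n_rep = 4).
import torch

kv = torch.randn(1, 2, 5, 64)             # (batch, kv_heads, seq_len, head_dim)
out = repeat_kv(kv, n_rep=4)
assert out.shape == (1, 8, 5, 64)
assert torch.equal(out[0, 0], out[0, 3])  # heads 0-3 all copy KV head 0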
742
+
743
+ class Qwen2VLPreTrainedModel(PreTrainedModel):
744
+ config_class = Qwen2VLConfig
745
+ base_model_prefix = "model"
746
+ supports_gradient_checkpointing = True
747
+ _no_split_modules = ["Qwen2VLDecoderLayer", "Qwen2VLVisionBlock"]
748
+ _skip_keys_device_placement = "past_key_values"
749
+ _supports_flash_attn_2 = True
750
+ _supports_sdpa = True
751
+ _supports_cache_class = True
752
+ _supports_static_cache = True
753
+
754
+ def _init_weights(self, module):
755
+ std = self.config.initializer_range
756
+ if isinstance(module, (nn.Linear, nn.Conv3d)):
757
+ module.weight.data.normal_(mean=0.0, std=std)
758
+ if module.bias is not None:
759
+ module.bias.data.zero_()
760
+ elif isinstance(module, nn.Embedding):
761
+ module.weight.data.normal_(mean=0.0, std=std)
762
+ if module.padding_idx is not None:
763
+ module.weight.data[module.padding_idx].zero_()
764
+
765
+
766
+ class Qwen2VisionTransformerPretrainedModel(Qwen2VLPreTrainedModel):
767
+ config_class = Qwen2VLVisionConfig
768
+ _no_split_modules = ["Qwen2VLVisionBlock"]
769
+
770
+ def __init__(self, config) -> None:
771
+ super().__init__(config)
772
+ self.spatial_merge_size = config.spatial_merge_size
773
+
774
+ self.patch_embed = PatchEmbed(
775
+ patch_size=config.patch_size,
776
+ temporal_patch_size=config.temporal_patch_size,
777
+ in_channels=config.in_channels,
778
+ embed_dim=config.embed_dim,
779
+ )
780
+
781
+ head_dim = config.embed_dim // config.num_heads
782
+ self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
783
+
784
+ self.blocks = nn.ModuleList(
785
+ [Qwen2VLVisionBlock(config, config.attn_implementation) for _ in range(config.depth)]
786
+ )
787
+ self.merger = PatchMerger(
788
+ dim=config.hidden_size, context_dim=config.embed_dim, spatial_merge_size=config.spatial_merge_size
789
+ )
790
+
791
+ def get_dtype(self) -> torch.dtype:
792
+ return self.blocks[0].mlp.fc2.weight.dtype
793
+
794
+ def get_device(self) -> torch.device:
795
+ return self.blocks[0].mlp.fc2.weight.device
796
+
797
+ def rot_pos_emb(self, grid_thw):
798
+ pos_ids = []
799
+ for t, h, w in grid_thw:
800
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
801
+ hpos_ids = hpos_ids.reshape(
802
+ h // self.spatial_merge_size,
803
+ self.spatial_merge_size,
804
+ w // self.spatial_merge_size,
805
+ self.spatial_merge_size,
806
+ )
807
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3)
808
+ hpos_ids = hpos_ids.flatten()
809
+
810
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
811
+ wpos_ids = wpos_ids.reshape(
812
+ h // self.spatial_merge_size,
813
+ self.spatial_merge_size,
814
+ w // self.spatial_merge_size,
815
+ self.spatial_merge_size,
816
+ )
817
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3)
818
+ wpos_ids = wpos_ids.flatten()
819
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
820
+ pos_ids = torch.cat(pos_ids, dim=0)
821
+ max_grid_size = grid_thw[:, 1:].max()
822
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
823
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
824
+ return rotary_pos_emb
825
+
826
+ def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor,
827
+ output_hidden_states=False, org_forward=False) -> torch.Tensor:
828
+ hidden_states = self.patch_embed(hidden_states)
829
+ rotary_pos_emb = self.rot_pos_emb(grid_thw)
830
+
831
+ cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
832
+ dim=0, dtype=torch.int32
833
+ )
834
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
835
+
836
+ for blk in self.blocks:
837
+ hidden_states = blk(hidden_states,
838
+ cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb,
839
+ grid_thw=grid_thw)
840
+
841
+ hidden_states = self.merger(hidden_states, grid_thw)
842
+ return hidden_states
modeling_rope_utils.py ADDED
@@ -0,0 +1,561 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+ from typing import Optional, Tuple
17
+
18
+
19
+ from transformers.configuration_utils import PretrainedConfig
20
+ from transformers.utils import is_torch_available, logging
21
+
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+
26
+ if is_torch_available():
27
+ import torch
28
+
29
+
30
+ def _compute_default_rope_parameters(
31
+ config: Optional[PretrainedConfig] = None,
32
+ device: Optional["torch.device"] = None,
33
+ seq_len: Optional[int] = None,
34
+ **rope_kwargs,
35
+ ) -> Tuple["torch.Tensor", float]:
36
+ """
37
+ Computes the inverse frequencies according to the original RoPE implementation
38
+ Args:
39
+ config ([`~transformers.PretrainedConfig`]):
40
+ The model configuration.
41
+ device (`torch.device`):
42
+ The device to use for initialization of the inverse frequencies.
43
+ seq_len (`int`, *optional*):
44
+ The current sequence length. Unused for this type of RoPE.
45
+ rope_kwargs (`Dict`, *optional*):
46
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
47
+ Returns:
48
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
49
+ post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
50
+ """
51
+ if config is not None and len(rope_kwargs) > 0:
52
+ raise ValueError(
53
+ "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
54
+ f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
55
+ )
56
+ if len(rope_kwargs) > 0:
57
+ base = rope_kwargs["base"]
58
+ dim = rope_kwargs["dim"]
59
+ elif config is not None:
60
+ base = config.rope_theta
61
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
62
+ head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
63
+ dim = int(head_dim * partial_rotary_factor)
64
+
65
+ attention_factor = 1.0 # Unused in this type of RoPE
66
+
67
+ # Compute the inverse frequencies
68
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
69
+ return inv_freq, attention_factor
70
+
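The function can also be driven through the BC `rope_kwargs` path, without a config; a hedged sketch of the geometric spectrum it returns:

# Minimal sketch: inv_freq[i] = 1 / base**(2i / dim), decaying geometrically from 1.
inv_freq, scaling = _compute_default_rope_parameters(base=10000, dim=8)
assert inv_freq.shape == (4,) and inv_freq[0] == 1.0 and scaling == 1.0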
71
+
72
+ def _compute_linear_scaling_rope_parameters(
73
+ config: Optional[PretrainedConfig] = None,
74
+ device: Optional["torch.device"] = None,
75
+ seq_len: Optional[int] = None,
76
+ **rope_kwargs,
77
+ ) -> Tuple["torch.Tensor", float]:
78
+ """
79
+ Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev
80
+ Args:
81
+ config ([`~transformers.PretrainedConfig`]):
82
+ The model configuration.
83
+ device (`torch.device`):
84
+ The device to use for initialization of the inverse frequencies.
85
+ seq_len (`int`, *optional*):
86
+ The current sequence length. Unused for this type of RoPE.
87
+ rope_kwargs (`Dict`, *optional*):
88
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
89
+ Returns:
90
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
91
+ post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
92
+ """
93
+ if config is not None and len(rope_kwargs) > 0:
94
+ raise ValueError(
95
+ "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
96
+ f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
97
+ )
98
+ if len(rope_kwargs) > 0:
99
+ factor = rope_kwargs["factor"]
100
+ elif config is not None:
101
+ factor = config.rope_scaling["factor"]
102
+
103
+ # Gets the default RoPE parameters
104
+ inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
105
+
106
+ # Then applies linear scaling to the frequencies.
107
+ # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so
108
+ # applying scaling to the inverse frequencies is equivalent.
109
+ inv_freq /= factor
110
+ return inv_freq, attention_factor
111
+
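Making the NOTE above concrete: every angle is position * inv_freq, so dividing the frequencies by `factor` is identical to dividing the position ids by `factor`. A quick check:

# Minimal sketch: scaling frequencies == scaling positions.
import torch

inv_freq, _ = _compute_default_rope_parameters(base=10000, dim=8)
pos = torch.arange(16).float()
assert torch.allclose(torch.outer(pos, inv_freq / 4.0), torch.outer(pos / 4.0, inv_freq))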
112
+
113
+ def _compute_dynamic_ntk_parameters(
114
+ config: Optional[PretrainedConfig] = None,
115
+ device: Optional["torch.device"] = None,
116
+ seq_len: Optional[int] = None,
117
+ **rope_kwargs,
118
+ ) -> Tuple["torch.Tensor", float]:
119
+ """
120
+ Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla
121
+ Args:
122
+ config ([`~transformers.PretrainedConfig`]):
123
+ The model configuration.
124
+ device (`torch.device`):
125
+ The device to use for initialization of the inverse frequencies.
126
+ seq_len (`int`, *optional*):
127
+ The current sequence length, used to update the dynamic RoPE at inference time.
128
+ rope_kwargs (`Dict`, *optional*):
129
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
130
+ Returns:
131
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
132
+ post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
133
+ """
134
+ # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
135
+ if config is not None and len(rope_kwargs) > 0:
136
+ raise ValueError(
137
+ "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in "
138
+ f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}"
139
+ )
140
+ if len(rope_kwargs) > 0:
141
+ base = rope_kwargs["base"]
142
+ dim = rope_kwargs["dim"]
143
+ max_position_embeddings = rope_kwargs["max_position_embeddings"]
144
+ factor = rope_kwargs["factor"]
145
+ elif config is not None:
146
+ base = config.rope_theta
147
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
148
+ head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
149
+ dim = int(head_dim * partial_rotary_factor)
150
+ max_position_embeddings = config.max_position_embeddings
151
+ factor = config.rope_scaling["factor"]
152
+
153
+ attention_factor = 1.0 # Unused in this type of RoPE
154
+
155
+ # seq_len: default to max_position_embeddings, e.g. at init time
156
+ seq_len = seq_len if seq_len is not None and seq_len > max_position_embeddings else max_position_embeddings
157
+
158
+ # Compute the inverse frequencies
159
+ base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2))
160
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float().to(device) / dim))
161
+ return inv_freq, attention_factor
162
+
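The one line doing real work above is the base update, base * (factor * seq_len / max_pos - (factor - 1)) ** (dim / (dim - 2)): at seq_len == max_position_embeddings the bracket equals 1 and nothing changes; beyond it the base grows, stretching the long wavelengths. A hedged sketch through the kwargs path:

# Minimal sketch: the base only moves once seq_len exceeds the training length.
import torch

kwargs = dict(base=10000, dim=8, max_position_embeddings=2048, factor=2.0)
inv_short, _ = _compute_dynamic_ntk_parameters(seq_len=1024, **kwargs)
inv_long, _ = _compute_dynamic_ntk_parameters(seq_len=4096, **kwargs)
inv_base, _ = _compute_default_rope_parameters(base=10000, dim=8)
assert torch.allclose(inv_short, inv_base)      # clamped: behaves like default RoPE
assert (inv_long[1:] < inv_base[1:]).all()      # lower freqs = longer wavelengths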
163
+
164
+ def _compute_yarn_parameters(
165
+ config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
166
+ ) -> Tuple["torch.Tensor", float]:
167
+ """
168
+ Computes the inverse frequencies with YaRN scaling. Please refer to the
169
+ [original paper](https://arxiv.org/abs/2309.00071)
170
+ Args:
171
+ config ([`~transformers.PretrainedConfig`]):
172
+ The model configuration.
173
+ device (`torch.device`):
174
+ The device to use for initialization of the inverse frequencies.
175
+ seq_len (`int`, *optional*):
176
+ The current sequence length. Unused for this type of RoPE.
177
+ rope_kwargs (`Dict`, *optional*):
178
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
179
+ Returns:
180
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
181
+ post-processing scaling factor applied to the computed cos/sin.
182
+ """
183
+ # No need to keep BC with yarn, unreleased when this new pattern was created.
184
+ if len(rope_kwargs) > 0:
185
+ raise ValueError(
186
+ f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}"
187
+ )
188
+
189
+ base = config.rope_theta
190
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
191
+ head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
192
+ dim = int(head_dim * partial_rotary_factor)
193
+ max_position_embeddings = config.max_position_embeddings
194
+ factor = config.rope_scaling["factor"]
195
+
196
+ # Sets the attention factor as suggested in the paper
197
+ attention_factor = config.rope_scaling.get("attention_factor")
198
+ if attention_factor is None:
199
+ attention_factor = 0.1 * math.log(factor) + 1.0
200
+
201
+ # Optional config options
202
+ # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly)
203
+ beta_fast = config.rope_scaling.get("beta_fast") or 32
204
+ beta_slow = config.rope_scaling.get("beta_slow") or 1
205
+
206
+ # Compute the inverse frequencies
207
+ def find_correction_dim(num_rotations, dim, base, max_position_embeddings):
208
+ """Inverse dimension formula to find the dimension based on the number of rotations"""
209
+ return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base))
210
+
211
+ def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings):
212
+ """Find dimension range bounds based on rotations"""
213
+ low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings))
214
+ high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings))
215
+ return max(low, 0), min(high, dim - 1)
216
+
217
+ def linear_ramp_factor(min, max, dim):
218
+ if min == max:
219
+ max += 0.001 # Prevent singularity
220
+
221
+ linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min)
222
+ ramp_func = torch.clamp(linear_func, 0, 1)
223
+ return ramp_func
224
+
225
+ # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs
226
+ # to expand the possible context length. In other words, interpolation = apply scaling factor.
227
+ pos_freqs = base ** (torch.arange(0, dim, 2).float().to(device) / dim)
228
+ inv_freq_extrapolation = 1.0 / pos_freqs
229
+ inv_freq_interpolation = 1.0 / (factor * pos_freqs)
230
+
231
+ low, high = find_correction_range(beta_fast, beta_slow, dim, base, max_position_embeddings)
232
+
233
+ # Get n-dimensional rotational scaling corrected for extrapolation
234
+ inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).float().to(device)
235
+ inv_freq = (
236
+ inv_freq_interpolation * (1 - inv_freq_extrapolation_factor)
237
+ + inv_freq_extrapolation * inv_freq_extrapolation_factor
238
+ )
239
+
240
+ return inv_freq, attention_factor
241
+
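When `attention_factor` is not set, the default follows the YaRN paper's mscale, 0.1 * ln(factor) + 1; e.g. an 8x extension scales cos/sin by about 1.208. A quick arithmetic check:

# Minimal sketch: the default YaRN attention (mscale) factor for factor = 8.
import math

assert abs((0.1 * math.log(8.0) + 1.0) - 1.2079) < 1e-3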
242
+
243
+ def _compute_longrope_parameters(
244
+ config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
245
+ ) -> Tuple["torch.Tensor", float]:
246
+ """
247
+ Computes the inverse frequencies with LongRoPE scaling. Please refer to the
248
+ [original implementation](https://github.com/microsoft/LongRoPE)
249
+ Args:
250
+ config ([`~transformers.PretrainedConfig`]):
251
+ The model configuration.
252
+ device (`torch.device`):
253
+ The device to use for initialization of the inverse frequencies.
254
+ seq_len (`int`, *optional*):
255
+ The current sequence length. Unused for this type of RoPE.
256
+ rope_kwargs (`Dict`, *optional*):
257
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
258
+ Returns:
259
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
260
+ post-processing scaling factor applied to the computed cos/sin.
261
+ """
262
+ # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
263
+ # No need to keep BC with longrope, unreleased when this new pattern was created.
264
+ if len(rope_kwargs) > 0:
265
+ raise ValueError(
266
+ "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got "
267
+ f"{rope_kwargs}"
268
+ )
269
+
270
+ base = config.rope_theta
271
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
272
+ head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
273
+ dim = int(head_dim * partial_rotary_factor)
274
+ long_factor = config.rope_scaling["long_factor"]
275
+ short_factor = config.rope_scaling["short_factor"]
276
+ factor = config.rope_scaling.get("factor")
277
+ attention_factor = config.rope_scaling.get("attention_factor")
278
+
279
+ # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a
280
+ # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two
281
+ # values to compute the default attention scaling factor, instead of using `factor`.
282
+ if hasattr(config, "original_max_position_embeddings"):
283
+ max_position_embeddings = config.original_max_position_embeddings
284
+ expanded_max_position_embeddings = config.max_position_embeddings
285
+ factor = expanded_max_position_embeddings / max_position_embeddings
286
+ else:
287
+ max_position_embeddings = config.max_position_embeddings
288
+ expanded_max_position_embeddings = max_position_embeddings * factor
289
+
290
+ # Sets the attention factor as suggested in the paper
291
+ if attention_factor is None:
292
+ if factor <= 1.0:
293
+ attention_factor = 1.0
294
+ else:
295
+ attention_factor = math.sqrt(1 + math.log(factor) / math.log(max_position_embeddings))
296
+
297
+ # Compute the inverse frequencies -- scaled based on the target sequence length
298
+ if expanded_max_position_embeddings > max_position_embeddings:
299
+ ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device)
300
+ else:
301
+ ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device)
302
+ inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim
303
+ inv_freq = 1.0 / (ext_factors * base**inv_freq_shape)
304
+
305
+ return inv_freq, attention_factor
306
+
307
+
308
+ def _compute_llama3_parameters(
309
+ config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs
310
+ ) -> Tuple["torch.Tensor", float]:
311
+ """
312
+ Computes the inverse frequencies for llama 3.1.
313
+
314
+ Args:
315
+ config ([`~transformers.PretrainedConfig`]):
316
+ The model configuration.
317
+ device (`torch.device`):
318
+ The device to use for initialization of the inverse frequencies.
319
+ seq_len (`int`, *optional*):
320
+ The current sequence length. Unused for this type of RoPE.
321
+ rope_kwargs (`Dict`, *optional*):
322
+ BC compatibility with the previous RoPE class instantiation, will be removed in v4.45.
323
+ Returns:
324
+ Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
325
+ post-processing scaling factor applied to the computed cos/sin.
326
+ """
327
+ # Gets the default RoPE parameters
328
+ inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs)
329
+
330
+ factor = config.rope_scaling["factor"] # `8` in the original implementation
331
+ low_freq_factor = config.rope_scaling["low_freq_factor"] # `1` in the original implementation
332
+ high_freq_factor = config.rope_scaling["high_freq_factor"] # `4` in the original implementation
333
+ old_context_len = config.rope_scaling["original_max_position_embeddings"] # `8192` in the original implementation
334
+
335
+ low_freq_wavelen = old_context_len / low_freq_factor
336
+ high_freq_wavelen = old_context_len / high_freq_factor
337
+
338
+ wavelen = 2 * math.pi / inv_freq
339
+ # wavelen < high_freq_wavelen: do nothing
340
+ # wavelen > low_freq_wavelen: divide by factor
341
+ inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
342
+ # otherwise: interpolate between the two, using a smooth factor
343
+ smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
344
+ smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama
345
+ is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen)
346
+ inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama)
347
+
348
+ return inv_freq_llama, attention_factor
349
+
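The llama3 rule splits frequencies into three wavelength bands relative to the old context length: short wavelengths (below old_context_len / high_freq_factor) pass through, long ones (above old_context_len / low_freq_factor) are divided by `factor`, and the band in between is blended via `smooth_factor`. A hedged check of the two extremes:

# Minimal sketch: the two extreme bands of the llama3 rule.
import math
import torch

inv_freq = torch.tensor([1.0, 1e-4])            # short and long wavelength
factor, low_ff, old_ctx = 8.0, 1.0, 8192
wavelen = 2 * math.pi / inv_freq
scaled = torch.where(wavelen > old_ctx / low_ff, inv_freq / factor, inv_freq)
assert scaled[0] == inv_freq[0]                 # wavelen ~6.3: untouched
assert scaled[1] == inv_freq[1] / factor        # wavelen ~62832: divided by 8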
350
+
351
+ # This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters
352
+ # from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE
353
+ # parameterizations, as long as the callable has the same signature.
354
+ ROPE_INIT_FUNCTIONS = {
355
+ "default": _compute_default_rope_parameters,
356
+ "linear": _compute_linear_scaling_rope_parameters,
357
+ "dynamic": _compute_dynamic_ntk_parameters,
358
+ "yarn": _compute_yarn_parameters,
359
+ "longrope": _compute_longrope_parameters,
360
+ "llama3": _compute_llama3_parameters,
361
+ }
362
+
363
+
364
+ def _check_received_keys(rope_type: str, received_keys: set, required_keys: set, optional_keys: Optional[set] = None):
365
+ """Compare the received keys in `config.rope_scaling` against the expected and optional keys"""
366
+ # BC: "rope_type" was originally "type" -- let's check for "rope_type" when "type" is present
367
+ if "type" in received_keys:
368
+ received_keys -= {"type"}
369
+ required_keys.add("rope_type")
370
+
371
+ missing_keys = required_keys - received_keys
372
+ if missing_keys:
373
+ raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}")
374
+
375
+ if optional_keys is not None:
376
+ unused_keys = received_keys - required_keys - optional_keys
377
+ else:
378
+ unused_keys = received_keys - required_keys
379
+ if unused_keys:
380
+ logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}")
381
+
382
+
383
+ def _validate_default_rope_parameters(config: PretrainedConfig):
384
+ rope_scaling = config.rope_scaling
385
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
386
+ required_keys = {"rope_type"}
387
+ received_keys = set(rope_scaling.keys())
388
+ _check_received_keys(rope_type, received_keys, required_keys)
389
+
390
+
391
+ def _validate_linear_scaling_rope_parameters(config: PretrainedConfig):
392
+ rope_scaling = config.rope_scaling
393
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
394
+ required_keys = {"rope_type", "factor"}
395
+ received_keys = set(rope_scaling.keys())
396
+ _check_received_keys(rope_type, received_keys, required_keys)
397
+
398
+ factor = rope_scaling["factor"]
399
+ if factor is None or not isinstance(factor, float) or factor < 1.0:
400
+ logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
401
+
402
+
403
+ def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig):
404
+ rope_scaling = config.rope_scaling
405
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
406
+ required_keys = {"rope_type", "factor"}
407
+ # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
408
+ optional_keys = {"original_max_position_embeddings"}
409
+ received_keys = set(rope_scaling.keys())
410
+ _check_received_keys(rope_type, received_keys, required_keys, optional_keys)
411
+
412
+ factor = rope_scaling["factor"]
413
+ if factor is None or not isinstance(factor, float) or factor < 1.0:
414
+ logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
415
+
416
+
417
+ def _validate_yarn_parameters(config: PretrainedConfig):
418
+ rope_scaling = config.rope_scaling
419
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
420
+ required_keys = {"rope_type", "factor"}
421
+ optional_keys = {"attention_factor", "beta_fast", "beta_slow"}
422
+ received_keys = set(rope_scaling.keys())
423
+ _check_received_keys(rope_type, received_keys, required_keys, optional_keys)
424
+
425
+ factor = rope_scaling["factor"]
426
+ if factor is None or not isinstance(factor, float) or factor < 1.0:
427
+ logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
428
+
429
+ attention_factor = rope_scaling.get("attention_factor")
430
+ if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0):
431
+ logger.warning(
432
+ f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
433
+ )
434
+ beta_fast = rope_scaling.get("beta_fast")
435
+ if beta_fast is not None and not isinstance(beta_fast, float):
436
+ logger.warning(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}")
437
+ beta_slow = rope_scaling.get("beta_slow")
438
+ if beta_slow is not None and not isinstance(beta_slow, float):
439
+ logger.warning(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}")
440
+
441
+ if (beta_fast or 32) < (beta_slow or 1):
442
+ logger.warning(
443
+ f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} "
444
+ f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)"
445
+ )
446
+
447
+
448
+ def _validate_longrope_parameters(config: PretrainedConfig):
449
+ rope_scaling = config.rope_scaling
450
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
451
+ required_keys = {"rope_type", "short_factor", "long_factor"}
452
+ # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
453
+ optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"}
454
+ received_keys = set(rope_scaling.keys())
455
+ _check_received_keys(rope_type, received_keys, required_keys, optional_keys)
456
+
457
+ partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0
458
+ head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
459
+ dim = int(head_dim * partial_rotary_factor)
460
+
461
+ short_factor = rope_scaling.get("short_factor")
462
+ if not (isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor)):
463
+ logger.warning(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}")
464
+ if not len(short_factor) == dim // 2:
465
+ logger.warning(f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}")
466
+
467
+ long_factor = rope_scaling.get("long_factor")
468
+ if not (isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor)):
469
+ logger.warning(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}")
470
+ if not len(long_factor) == dim // 2:
471
+ logger.warning(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}")
472
+
473
+ # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over
474
+ # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is
475
+ # unique to longrope (= undesirable)
476
+ if hasattr(config, "original_max_position_embeddings"):
477
+ logger.warning_once(
478
+ "This model has set a `original_max_position_embeddings` field, to be used together with "
479
+ "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`"
480
+ "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, "
481
+ "as it is compatible with most model architectures."
482
+ )
483
+ else:
484
+ factor = rope_scaling.get("factor")
485
+ if factor is None:
486
+ logger.warning("Missing required keys in `rope_scaling`: 'factor'")
487
+ elif not isinstance(factor, float) or factor < 1.0:
488
+ logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
489
+
490
+ attention_factor = rope_scaling.get("attention_factor")
491
+ if attention_factor is not None:
492
+ if not isinstance(attention_factor, float) or attention_factor < 0.0:
493
+ logger.warning(
494
+ f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}"
495
+ )
496
+
497
+
498
+ def _validate_llama3_parameters(config: PretrainedConfig):
499
+ rope_scaling = config.rope_scaling
500
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type"
501
+ required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"}
502
+ received_keys = set(rope_scaling.keys())
503
+ _check_received_keys(rope_type, received_keys, required_keys)
504
+
505
+ factor = rope_scaling["factor"]
506
+ if factor is None or not isinstance(factor, float) or factor < 1.0:
507
+ logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}")
508
+
509
+ low_freq_factor = rope_scaling["low_freq_factor"]
510
+ high_freq_factor = rope_scaling["high_freq_factor"]
511
+ if low_freq_factor is None or not isinstance(low_freq_factor, float):
512
+ logger.warning(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}")
513
+ if high_freq_factor is None or not isinstance(high_freq_factor, float):
514
+ logger.warning(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}")
515
+ if high_freq_factor <= low_freq_factor:
516
+ logger.warning(
517
+ "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor="
518
+ f"{high_freq_factor} and low_freq_factor={low_freq_factor}"
519
+ )
520
+
521
+ original_max_position_embeddings = rope_scaling["original_max_position_embeddings"]
522
+ if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int):
523
+ logger.warning(
524
+ "`rope_scaling`'s original_max_position_embeddings field must be an integer, got "
525
+ f"{original_max_position_embeddings}"
526
+ )
527
+ if original_max_position_embeddings >= config.max_position_embeddings:
528
+ logger.warning(
529
+ "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got "
530
+ f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}"
531
+ )
532
+
533
+
534
+ # Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types.
535
+ ROPE_VALIDATION_FUNCTIONS = {
536
+ "default": _validate_default_rope_parameters,
537
+ "linear": _validate_linear_scaling_rope_parameters,
538
+ "dynamic": _validate_dynamic_scaling_rope_parameters,
539
+ "yarn": _validate_yarn_parameters,
540
+ "longrope": _validate_longrope_parameters,
541
+ "llama3": _validate_llama3_parameters,
542
+ }
543
+
544
+
545
+ def rope_config_validation(config: PretrainedConfig):
546
+ """
547
+ Validate the RoPE config arguments, given a `PretrainedConfig` object
548
+ """
549
+ rope_scaling = getattr(config, "rope_scaling", None) # not a default parameter in `PretrainedConfig`
550
+ if rope_scaling is None:
551
+ return
552
+
553
+ # BC: "rope_type" was originally "type"
554
+ rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default"))
555
+ validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type)
556
+ if validation_fn is not None:
557
+ validation_fn(config)
558
+ else:
559
+ logger.warning(
560
+ f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'"
561
+ )
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_dualvitok.DualViTokImageProcessor"
4
+ },
5
+ "do_convert_rgb": true,
6
+ "do_normalize": true,
7
+ "do_rescale": true,
8
+ "do_resize": true,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_processor_type": "DualViTokImageProcessor",
15
+ "image_std": [
16
+ 0.5,
17
+ 0.5,
18
+ 0.5
19
+ ],
20
+ "max_pixels": 1048576,
21
+ "min_pixels": 1024,
22
+ "resample": 3,
23
+ "rescale_factor": 0.00392156862745098,
24
+ "size": {
25
+ "max_pixels": 1048576,
26
+ "min_pixels": 1024
27
+ },
28
+ "spatial_factor": 16
29
+ }
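Because `auto_map` points at a custom `DualViTokImageProcessor`, loading this config goes through remote code; a hedged sketch (the repo id is a placeholder):

# Minimal sketch: load the custom image processor declared via auto_map.
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained(
    "org/this-repo",            # placeholder repo id
    trust_remote_code=True,     # required for DualViTokImageProcessor
)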
processing_qwen2vit.py ADDED
@@ -0,0 +1,186 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """
21
+ Processor class for Qwen2-VL.
22
+ """
23
+
24
+ from typing import List, Union
25
+
26
+
27
+ try:
28
+ from typing import Unpack
29
+ except ImportError:
30
+ from typing_extensions import Unpack
31
+
32
+
33
+ from transformers.feature_extraction_utils import BatchFeature
34
+ from .image_utils import ImageInput, VideoInput
35
+ from transformers.processing_utils import (
36
+ ProcessingKwargs,
37
+ ProcessorMixin,
38
+ )
39
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
40
+ from transformers.utils import logging
41
+
42
+
43
+ logger = logging.get_logger(__name__)
44
+
45
+
46
+ class Qwen2VLProcessorKwargs(ProcessingKwargs, total=False):
47
+ _defaults = {
48
+ "text_kwargs": {
49
+ "padding": False,
50
+ },
51
+ }
52
+
53
+
54
+ class Qwen2VLProcessor(ProcessorMixin):
55
+ r"""
56
+ Constructs a Qwen2-VL processor which wraps a Qwen2-VL image processor and a Qwen2 tokenizer into a single processor.
57
+ [`Qwen2VLProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
58
+ [`~Qwen2VLProcessor.__call__`] and [`~Qwen2VLProcessor.decode`] for more information.
59
+ Args:
60
+ image_processor ([`Qwen2VLImageProcessor`], *optional*):
61
+ The image processor is a required input.
62
+ tokenizer ([`Qwen2TokenizerFast`], *optional*):
63
+ The tokenizer is a required input.
64
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
65
+ in a chat into a tokenizable string.
66
+ """
67
+
68
+ attributes = ["image_processor", "tokenizer"]
69
+ valid_kwargs = ["chat_template"]
70
+ image_processor_class = "Qwen2VLImageProcessor"
71
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
72
+
73
+ def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
74
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
75
+
76
+ def __call__(
77
+ self,
78
+ images: ImageInput = None,
79
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
80
+ videos: VideoInput = None,
81
+ **kwargs: Unpack[Qwen2VLProcessorKwargs],
82
+ ) -> BatchFeature:
83
+ """
84
+ Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
85
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
86
+ the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwargs` arguments to
87
+ Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
88
+
89
+ Args:
90
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
91
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
92
+ tensor. Both channels-first and channels-last formats are supported.
93
+ text (`str`, `List[str]`, `List[List[str]]`):
94
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
95
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
96
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
97
+ videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
98
+ The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
99
+ tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
100
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
101
+ If set, will return tensors of a particular framework. Acceptable values are:
102
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
103
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
104
+ - `'np'`: Return NumPy `np.ndarray` objects.
105
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
106
+
107
+ Returns:
108
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
109
+
110
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
111
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
112
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
113
+ `None`).
114
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
115
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
116
+ - **image_grid_thw** -- List of 3D image grids (temporal, height, width) fed to the LLM. Returned when `images` is not `None`.
117
+ - **video_grid_thw** -- List of 3D video grids (temporal, height, width) fed to the LLM. Returned when `videos` is not `None`.
118
+ """
119
+ output_kwargs = self._merge_kwargs(
120
+ Qwen2VLProcessorKwargs,
121
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
122
+ **kwargs,
123
+ )
124
+ if images is not None:
125
+ image_inputs = self.image_processor(images=images, videos=None, **output_kwargs["images_kwargs"])
126
+ image_grid_thw = image_inputs["image_grid_thw"]
127
+ else:
128
+ image_inputs = {}
129
+ image_grid_thw = None
130
+
131
+ if videos is not None:
132
+ videos_inputs = self.image_processor(images=None, videos=videos, **output_kwargs["videos_kwargs"])
133
+ video_grid_thw = videos_inputs["video_grid_thw"]
134
+ else:
135
+ videos_inputs = {}
136
+ video_grid_thw = None
137
+
138
+ if not isinstance(text, list):
139
+ text = [text]
140
+
141
+ if image_grid_thw is not None:
142
+ merge_length = self.image_processor.merge_size**2
143
+ index = 0
144
+ for i in range(len(text)):
145
+ while "<|image_pad|>" in text[i]:
146
+ text[i] = text[i].replace(
147
+ "<|image_pad|>", "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length), 1
148
+ )
149
+ index += 1
150
+ text[i] = text[i].replace("<|placeholder|>", "<|image_pad|>")
151
+
152
+ if video_grid_thw is not None:
153
+ merge_length = self.image_processor.merge_size**2
154
+ index = 0
155
+ for i in range(len(text)):
156
+ while "<|video_pad|>" in text[i]:
157
+ text[i] = text[i].replace(
158
+ "<|video_pad|>", "<|placeholder|>" * (video_grid_thw[index].prod() // merge_length), 1
159
+ )
160
+ index += 1
161
+ text[i] = text[i].replace("<|placeholder|>", "<|video_pad|>")
162
+
163
+ _ = output_kwargs["text_kwargs"].pop("padding_side", None)
164
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
165
+
166
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
167
+
168
+ def batch_decode(self, *args, **kwargs):
169
+ """
170
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
171
+ refer to the docstring of this method for more information.
172
+ """
173
+ return self.tokenizer.batch_decode(*args, **kwargs)
174
+
175
+ def decode(self, *args, **kwargs):
176
+ """
177
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
178
+ the docstring of this method for more information.
179
+ """
180
+ return self.tokenizer.decode(*args, **kwargs)
181
+
182
+ @property
183
+ def model_input_names(self):
184
+ tokenizer_input_names = self.tokenizer.model_input_names
185
+ image_processor_input_names = self.image_processor.model_input_names
186
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
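A hedged sketch of the placeholder-expansion arithmetic in `__call__` above (the grid values are illustrative; the real `image_grid_thw` comes from the image processor):

import torch

# One image whose vision grid is (t, h, w) = (1, 32, 32); with merge_size = 2
# the processor replaces each "<|image_pad|>" with t*h*w // merge_size**2
# = 1024 // 4 = 256 pad tokens before tokenization.
image_grid_thw = torch.tensor([[1, 32, 32]])
merge_length = 2 ** 2
num_pads = int(image_grid_thw[0].prod()) // merge_length  # 256
text = "Describe this image: <|image_pad|>"
expanded = text.replace("<|image_pad|>", "<|placeholder|>" * num_pads, 1)
expanded = expanded.replace("<|placeholder|>", "<|image_pad|>")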
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4da82e1c8624d84fc3120cc25001da7ad348fbcac426cc5c6b50375657c2a86
3
+ size 5401021086
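The three lines above are a Git LFS pointer, not the weights themselves; a hedged sketch of materializing the real file ("user/repo" is a placeholder for this repository's Hub id):

from huggingface_hub import hf_hub_download

# Resolves the LFS pointer to the actual ~5.4 GB weights file; the sha256
# above identifies the blob on the LFS server.
weights_path = hf_hub_download(repo_id="user/repo", filename="pytorch_model.bin")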
sdxl_decoder_pipe.py ADDED
@@ -0,0 +1,901 @@
1
+ # Modified from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
2
+ import inspect
3
+ from dataclasses import dataclass
4
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+ from einops import repeat, rearrange
12
+
13
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
14
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
15
+
16
+ from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
17
+ from diffusers.schedulers import KarrasDiffusionSchedulers
18
+
19
+ from diffusers.utils.torch_utils import randn_tensor
20
+ import PIL.Image
21
+
22
+ from diffusers.models.attention_processor import (
23
+ AttnProcessor2_0,
24
+ FusedAttnProcessor2_0,
25
+ XFormersAttnProcessor,
26
+ )
27
+
28
+ from diffusers.utils import (
29
+ USE_PEFT_BACKEND,
30
+ deprecate,
31
+ is_invisible_watermark_available,
32
+ is_torch_xla_available,
33
+ logging,
34
+ replace_example_docstring,
35
+ scale_lora_layers,
36
+ unscale_lora_layers,
37
+ )
38
+
39
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin
40
+ from diffusers.loaders import (
41
+ FromSingleFileMixin,
42
+ IPAdapterMixin,
43
+ StableDiffusionXLLoraLoaderMixin,
44
+ TextualInversionLoaderMixin,
45
+ )
46
+
47
+ if is_invisible_watermark_available():
48
+ from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
49
+
50
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
51
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import StableDiffusionXLPipeline, \
52
+ retrieve_timesteps, rescale_noise_cfg
53
+
54
+ from torchvision.transforms import Compose, Resize, CenterCrop, Normalize, InterpolationMode
55
+
56
+ if is_torch_xla_available():
57
+ import torch_xla.core.xla_model as xm
58
+
59
+ XLA_AVAILABLE = True
60
+ else:
61
+ XLA_AVAILABLE = False
62
+
63
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
64
+
65
+
66
+ @dataclass
67
+ class StableDiffusionXLDecoderPipelineOutput(StableDiffusionXLPipelineOutput):
68
+ images: Union[List[PIL.Image.Image], np.ndarray]
69
+ indices_semantic: Optional[torch.Tensor] = None
70
+ indices_pixel: Optional[torch.Tensor] = None
71
+
72
+
73
+ def expand_dims_like(x, y):
74
+ while x.dim() != y.dim():
75
+ x = x.unsqueeze(-1)
76
+ return x
77
+
78
+
79
+ class AbstractEmbModel(nn.Module):
80
+ def __init__(self):
81
+ super().__init__()
82
+ self._is_trainable = None
83
+ self._ucg_rate = None
84
+ self._input_key = None
85
+
86
+ @property
87
+ def is_trainable(self) -> bool:
88
+ return self._is_trainable
89
+
90
+ @property
91
+ def ucg_rate(self) -> Union[float, torch.Tensor]:
92
+ return self._ucg_rate
93
+
94
+ @property
95
+ def input_key(self) -> str:
96
+ return self._input_key
97
+
98
+ @is_trainable.setter
99
+ def is_trainable(self, value: bool):
100
+ self._is_trainable = value
101
+
102
+ @ucg_rate.setter
103
+ def ucg_rate(self, value: Union[float, torch.Tensor]):
104
+ self._ucg_rate = value
105
+
106
+ @input_key.setter
107
+ def input_key(self, value: str):
108
+ self._input_key = value
109
+
110
+ @is_trainable.deleter
111
+ def is_trainable(self):
112
+ del self._is_trainable
113
+
114
+ @ucg_rate.deleter
115
+ def ucg_rate(self):
116
+ del self._ucg_rate
117
+
118
+ @input_key.deleter
119
+ def input_key(self):
120
+ del self._input_key
121
+
122
+
123
+ class DualViTok2ImageEmbedder(AbstractEmbModel):
124
+ def __init__(
125
+ self,
126
+ image_processor=None,
127
+ vq_model=None,
128
+ device="cuda",
129
+ dtype=torch.float32,
130
+ freeze=True,
131
+ image_size=0,
132
+ resize_factor=1,
133
+ not_bicubic=True,
134
+ return_sequence=False,
135
+ grid_feature_scale=1,
136
+ texture_drop_prob=0,
137
+ semantic_drop_prob=0,
138
+ pixel_channel=32,
139
+ semantic_channel=32,
140
+ ):
141
+ super().__init__()
142
+ vq_model.to(device=device, dtype=dtype)
143
+ vq_model.eval()
144
+
145
+ self.processor = image_processor
146
+
147
+ self.model = vq_model
148
+ self.device = device
149
+ if freeze:
150
+ self.freeze()
151
+
152
+ if image_size > 0:
153
+ preprocessor = [
154
+ Resize(image_size) if not_bicubic else Resize(image_size, interpolation=InterpolationMode.BICUBIC)]
155
+ preprocessor += [
156
+ CenterCrop(image_size),
157
+ ]
158
+ self.preprocessor = Compose(preprocessor)
159
+ self.image_size = image_size
160
+ self.resize_factor = resize_factor
161
+ self.not_bicubic = not_bicubic
162
+ self.return_sequence = return_sequence
163
+ self.grid_feature_scale = grid_feature_scale
164
+ self.texture_drop_prob = texture_drop_prob
165
+ self.semantic_drop_prob = semantic_drop_prob
166
+ self.pixel_channel = pixel_channel
167
+ self.semantic_channel = semantic_channel
168
+
169
+ def freeze(self):
170
+ self.model = self.model.eval()
171
+ for param in self.parameters():
172
+ param.requires_grad = False
173
+
174
+ def vq_encode(self, image):
175
+ if image.ndim == 5:
176
+ assert image.size(1) == 1
177
+ image = image.squeeze(1)
178
+ bs, _, h, w = image.shape
179
+
180
+ if self.image_size > 0:
181
+ image = self.preprocessor(image)
182
+ else:
183
+ assert self.resize_factor > 0
184
+ target_size = (int(h * self.resize_factor), int(w * self.resize_factor))
+ interpolation = InterpolationMode.BILINEAR if self.not_bicubic else InterpolationMode.BICUBIC
+ preprocessor = Resize(target_size, interpolation=interpolation)
187
+ image = preprocessor(image)
188
+
189
+ inputs = dict(image=image)
190
+ inputs = self.model.get_input(inputs)
191
+
192
+ (quant_semantic, diff_semantic, indices_semantic, target_semantic), \
193
+ (quant_pixel, diff_pixel, indices_pixel) = self.model.encode(**inputs)
194
+ return indices_semantic, indices_pixel
195
+
196
+ def vq_encode_code(self, image):
197
+ # `vq_encode` above already returns just the two index tensors.
+ indices_semantic, indices_pixel = self.vq_encode(image)
199
+ return indices_semantic, indices_pixel
200
+
201
+ def vq_decode_code(self, indices_semantic, indices_pixel):
202
+ return self.model.decode_code(indices_semantic, indices_pixel)
203
+
204
+ def forward(self, image, return_indices=False):
205
+ if image.ndim == 5:
206
+ assert image.size(1) == 1
207
+ image = image.squeeze(1)
208
+ bs, _, h, w = image.shape
209
+
210
+ if self.image_size > 0:
211
+ image = self.preprocessor(image)
212
+ else:
213
+ assert self.resize_factor > 0
214
+ target_size = (int(h * self.resize_factor), int(w * self.resize_factor))
+ interpolation = InterpolationMode.BILINEAR if self.not_bicubic else InterpolationMode.BICUBIC
+ preprocessor = Resize(target_size, interpolation=interpolation)
217
+ image = preprocessor(image)
218
+
219
+ inputs = dict(image=image)
220
+ inputs = self.model.get_input(inputs)
221
+
222
+ (quant_semantic, diff_semantic, indices_semantic, target_semantic), \
223
+ (quant_pixel, diff_pixel, indices_pixel) = self.model.encode(**inputs)
224
+
225
+ feature = self.model.merge_quants(quant_semantic, quant_pixel)
226
+
227
+ if self.return_sequence:
228
+ feature = rearrange(feature, 'b c h w -> b h w c')
229
+ _, this_h, this_w, _ = feature.shape
230
+ feature = feature.reshape(bs, this_h * this_w, -1)  # h*w tokens, not w*w; reshape since rearrange may return a non-contiguous tensor
231
+ else:
232
+ feature = feature * self.grid_feature_scale
233
+
234
+ if return_indices:
235
+ return feature, indices_semantic, indices_pixel
236
+
237
+ return feature
238
+
239
+ def encode(self, img):
240
+ return self(img)
241
+
242
+ def indices_to_codes(self, semantic_indices, texture_indices):
243
+ quant_semantic, quant_texture = self.model.indices_to_codes(semantic_indices, texture_indices)
244
+ return self.model.merge_quants(quant_semantic, quant_texture)
245
+
246
+
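+ # Note (illustrative, not in the original file): `merge_quants` appears to
+ # concatenate the semantic and pixel quantized maps along channels, which is
+ # why the pipeline below sets dualvitok_channels = pixel_channel +
+ # semantic_channel (32 + 32 = 64) and concatenates the result onto the
+ # 4-channel latent input of the UNet.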
247
+ class StableDiffusionXLDecoderPipeline(
248
+ DiffusionPipeline,
249
+ StableDiffusionMixin,
250
+ FromSingleFileMixin,
251
+ StableDiffusionXLLoraLoaderMixin,
252
+ TextualInversionLoaderMixin,
253
+ ):
254
+ model_cpu_offload_seq = "vq_model_embedder->unet->vae"
255
+ _optional_components = [
256
+ "vq_model_embedder",
257
+ ]
258
+ _callback_tensor_inputs = [
259
+ "latents",
260
+ "prompt_embeds",
261
+ "negative_prompt_embeds",
262
+ "add_text_embeds",
263
+ "add_time_ids",
264
+ "negative_pooled_prompt_embeds",
265
+ "negative_add_time_ids",
266
+ ]
267
+
268
+ def __init__(
269
+ self,
270
+ vae: AutoencoderKL,
271
+ unet: UNet2DConditionModel,
272
+ scheduler: KarrasDiffusionSchedulers,
273
+ force_zeros_for_empty_prompt: bool = True,
274
+ add_watermarker: Optional[bool] = None,
275
+ vq_image_processor=None,
276
+ vq_model=None,
277
+ ):
278
+ super().__init__()
279
+
280
+ self.register_modules(
281
+ vae=vae,
282
+ unet=unet,
283
+ scheduler=scheduler,
284
+ )
285
+ self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
286
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
287
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
288
+
289
+ self.default_sample_size = self.unet.config.sample_size
290
+
291
+ add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available()
292
+
293
+ if add_watermarker:
294
+ self.watermark = StableDiffusionXLWatermarker()
295
+ else:
296
+ self.watermark = None
297
+
298
+ self.empty_prompt_embeds = torch.zeros([1, 77, 2048]).to(device=unet.device, dtype=unet.dtype)
299
+ self.empty_pooled_prompt_embeds = torch.zeros([1, 1280]).to(device=unet.device, dtype=unet.dtype)
300
+ self.dualvitok_channels = vq_model.pixel_channel + vq_model.semantic_channel
301
+
302
+ self.resolution_group = ['(1024, 1024)', '(768, 1024)', '(1024, 768)', '(512, 2048)', '(2048, 512)',
303
+ '(640, 1920)', '(1920, 640)', '(768, 1536)', '(1536, 768)', '(768, 1152)',
304
+ '(1152, 768)', '(512, 512)']
305
+
306
+ embedder_kwargs = dict(image_size=0,
307
+ resize_factor=1,
308
+ return_sequence=False,
309
+ grid_feature_scale=1)
310
+ if isinstance(vq_model, DualViTok2ImageEmbedder):
311
+ self.vq_model_embedder = vq_model
312
+ else:
313
+ self.vq_model_embedder = DualViTok2ImageEmbedder(vq_image_processor, vq_model, **embedder_kwargs)
314
+
315
+ def vq_encode(self, image):
316
+ return self.vq_model_embedder.encode(image)
317
+
318
+ def vq_encode_code(self, image):
319
+ return self.vq_model_embedder.vq_encode_code(image)
320
+
321
+ def vq_decode_code(self, *args, **kwargs):
322
+ return self.vq_model_embedder.vq_decode_code(*args, **kwargs)
323
+
324
+ def indices_to_codes(self, *args, **kwargs):
325
+ return self.vq_model_embedder.indices_to_codes(*args, **kwargs)
326
+
327
+ def _get_add_time_ids(
328
+ self, original_size, crops_coords_top_left, target_size, dtype, text_encoder_projection_dim=None,
329
+ resolution_index=None,
330
+ ):
331
+ add_time_ids = [resolution_index] * 6
332
+
333
+ passed_add_embed_dim = (
334
+ self.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
335
+ )
336
+ expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features
337
+
338
+ if expected_add_embed_dim != passed_add_embed_dim:
339
+ raise ValueError(
340
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
341
+ )
342
+
343
+ add_time_ids = torch.tensor([add_time_ids], dtype=dtype)
344
+ return add_time_ids
345
+
346
+ def check_inputs(
347
+ self,
348
+ height,
349
+ width,
350
+ callback_steps,
351
+ callback_on_step_end_tensor_inputs=None,
352
+ ):
353
+ if height % 8 != 0 or width % 8 != 0:
354
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
355
+
356
+ if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0):
357
+ raise ValueError(
358
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
359
+ f" {type(callback_steps)}."
360
+ )
361
+
362
+ if callback_on_step_end_tensor_inputs is not None and not all(
363
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
364
+ ):
365
+ raise ValueError(
366
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
367
+ )
368
+
369
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
370
+ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
371
+ shape = (
372
+ batch_size,
373
+ num_channels_latents,
374
+ int(height) // self.vae_scale_factor,
375
+ int(width) // self.vae_scale_factor,
376
+ )
377
+ if isinstance(generator, list) and len(generator) != batch_size:
378
+ raise ValueError(
379
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
380
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
381
+ )
382
+
383
+ if latents is None:
384
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
385
+ else:
386
+ latents = latents.to(device)
387
+
388
+ # scale the initial noise by the standard deviation required by the scheduler
389
+ latents = latents * self.scheduler.init_noise_sigma
390
+ return latents
391
+
392
+ def upcast_vae(self):
393
+ dtype = self.vae.dtype
394
+ self.vae.to(dtype=torch.float32)
395
+ use_torch_2_0_or_xformers = isinstance(
396
+ self.vae.decoder.mid_block.attentions[0].processor,
397
+ (
398
+ AttnProcessor2_0,
399
+ XFormersAttnProcessor,
400
+ FusedAttnProcessor2_0,
401
+ ),
402
+ )
403
+ # if xformers or torch_2_0 is used attention block does not need
404
+ # to be in float32 which can save lots of memory
405
+ if use_torch_2_0_or_xformers:
406
+ self.vae.post_quant_conv.to(dtype)
407
+ self.vae.decoder.conv_in.to(dtype)
408
+ self.vae.decoder.mid_block.to(dtype)
409
+
410
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
411
+ def get_guidance_scale_embedding(
412
+ self, w: torch.Tensor, embedding_dim: int = 512, dtype: torch.dtype = torch.float32
413
+ ) -> torch.Tensor:
414
+ """
415
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
416
+
417
+ Args:
418
+ w (`torch.Tensor`):
419
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
420
+ embedding_dim (`int`, *optional*, defaults to 512):
421
+ Dimension of the embeddings to generate.
422
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
423
+ Data type of the generated embeddings.
424
+
425
+ Returns:
426
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
427
+ """
428
+ assert len(w.shape) == 1
429
+ w = w * 1000.0
430
+
431
+ half_dim = embedding_dim // 2
432
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
433
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
434
+ emb = w.to(dtype)[:, None] * emb[None, :]
435
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
436
+ if embedding_dim % 2 == 1: # zero pad
437
+ emb = torch.nn.functional.pad(emb, (0, 1))
438
+ assert emb.shape == (w.shape[0], embedding_dim)
439
+ return emb
440
+
441
+ @property
442
+ def guidance_scale(self):
443
+ return self._guidance_scale
444
+
445
+ @property
446
+ def guidance_rescale(self):
447
+ return self._guidance_rescale
448
+
449
+ @property
450
+ def clip_skip(self):
451
+ return self._clip_skip
452
+
453
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
454
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
455
+ # corresponds to doing no classifier free guidance.
456
+ @property
457
+ def do_classifier_free_guidance(self):
458
+ return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
459
+
460
+ @property
461
+ def cross_attention_kwargs(self):
462
+ return self._cross_attention_kwargs
463
+
464
+ @property
465
+ def denoising_end(self):
466
+ return self._denoising_end
467
+
468
+ @property
469
+ def num_timesteps(self):
470
+ return self._num_timesteps
471
+
472
+ @property
473
+ def interrupt(self):
474
+ return self._interrupt
475
+
476
+ def prepare_extra_step_kwargs(self, generator, eta):
477
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
478
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
479
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
480
+ # and should be between [0, 1]
481
+
482
+ accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
483
+ extra_step_kwargs = {}
484
+ if accepts_eta:
485
+ extra_step_kwargs["eta"] = eta
486
+
487
+ # check if the scheduler accepts generator
488
+ accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
489
+ if accepts_generator:
490
+ extra_step_kwargs["generator"] = generator
491
+ return extra_step_kwargs
492
+
493
+ @torch.no_grad()
494
+ def __call__(
495
+ self,
496
+ vq_indices: Optional[List] = None,
497
+ vq_embeds: Optional[torch.Tensor] = None,
498
+ images: Optional[PipelineImageInput] = None,
499
+ height: Optional[int] = None,
500
+ width: Optional[int] = None,
501
+ num_inference_steps: int = 50,
502
+ timesteps: List[int] = None,
503
+ sigmas: List[float] = None,
504
+ denoising_end: Optional[float] = None,
505
+ guidance_scale: float = 2.0,
506
+ eta: float = 0.0,
507
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
508
+ latents: Optional[torch.Tensor] = None,
509
+ output_type: Optional[str] = "pil",
510
+ return_dict: bool = True,
511
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
512
+ guidance_rescale: float = 0.0,
513
+ original_size: Optional[Tuple[int, int]] = None,
514
+ crops_coords_top_left: Tuple[int, int] = (0, 0),
515
+ target_size: Optional[Tuple[int, int]] = None,
516
+ negative_original_size: Optional[Tuple[int, int]] = None,
517
+ negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
518
+ negative_target_size: Optional[Tuple[int, int]] = None,
519
+ clip_skip: Optional[int] = None,
520
+ callback_on_step_end: Optional[
521
+ Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
522
+ ] = None,
523
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
524
+ **kwargs,
525
+ ):
526
+ r"""
527
+ Function invoked when calling the pipeline for generation.
528
+
529
+ Args:
530
+ vq_indices (`Optional[PipelineImageInput]`, *optional*):
531
+ The VQ indices for semantic and pixel tokens. Should be a tuple of (semantic_indices, pixel_indices).
532
+ images (`Optional[PipelineImageInput]`, *optional*):
533
+ Input images in range [-1, 1] as torch.Tensor with shape (batch_size, channels, height, width).
534
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
535
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
536
+ Anything below 512 pixels won't work well for
537
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
538
+ and checkpoints that are not specifically fine-tuned on low resolutions.
539
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
540
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
541
+ Anything below 512 pixels won't work well for
542
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
543
+ and checkpoints that are not specifically fine-tuned on low resolutions.
544
+ num_inference_steps (`int`, *optional*, defaults to 50):
545
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
546
+ expense of slower inference.
547
+ timesteps (`List[int]`, *optional*):
548
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
549
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
550
+ passed will be used. Must be in descending order.
551
+ sigmas (`List[float]`, *optional*):
552
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
553
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
554
+ will be used.
555
+ denoising_end (`float`, *optional*):
556
+ When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
557
+ completed before it is intentionally prematurely terminated. As a result, the returned sample will
558
+ still retain a substantial amount of noise as determined by the discrete timesteps selected by the
559
+ scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
560
+ "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
561
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
562
+ guidance_scale (`float`, *optional*, defaults to 2.0):
563
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
564
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
565
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
566
+ 1`. A higher guidance scale encourages the model to generate images that are closely linked to the text `prompt`,
567
+ usually at the expense of lower image quality.
568
+ eta (`float`, *optional*, defaults to 0.0):
569
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
570
+ [`schedulers.DDIMScheduler`], will be ignored for others.
571
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
572
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
573
+ to make generation deterministic.
574
+ latents (`torch.Tensor`, *optional*):
575
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
576
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
577
+ tensor will be generated by sampling using the supplied random `generator`.
578
+ output_type (`str`, *optional*, defaults to `"pil"`):
579
+ The output format of the generated image. Choose between
580
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
581
+ return_dict (`bool`, *optional*, defaults to `True`):
582
+ Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
583
+ of a plain tuple.
584
+ cross_attention_kwargs (`dict`, *optional*):
585
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
586
+ `self.processor` in
587
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
588
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
589
+ Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
590
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
591
+ [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
592
+ Guidance rescale factor should fix overexposure when using zero terminal SNR.
593
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
594
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
595
+ `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
596
+ explained in section 2.2 of
597
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
598
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
599
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
600
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
601
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
602
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
603
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
604
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
605
+ not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
606
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
607
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
608
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
609
+ each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
610
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
611
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
612
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
613
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
614
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
615
+ `._callback_tensor_inputs` attribute of your pipeline class.
616
+
617
+ Examples:
618
+
619
+ Returns:
620
+ [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
621
+ [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
622
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
623
+ """
624
+
625
+ callback = kwargs.pop("callback", None)
626
+ callback_steps = kwargs.pop("callback_steps", None)
627
+
628
+ if callback is not None:
629
+ deprecate(
630
+ "callback",
631
+ "1.0.0",
632
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
633
+ )
634
+ if callback_steps is not None:
635
+ deprecate(
636
+ "callback_steps",
637
+ "1.0.0",
638
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
639
+ )
640
+
641
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
642
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
643
+
644
+ # 0. Default height and width to unet
645
+ height = height or self.default_sample_size * self.vae_scale_factor
646
+ width = width or self.default_sample_size * self.vae_scale_factor
647
+
648
+ original_size = original_size or (height, width)
649
+ target_size = target_size or (height, width)
650
+
651
+ # 1. Check inputs. Raise error if not correct
652
+ self.check_inputs(
653
+ height,
654
+ width,
655
+ callback_steps,
656
+ callback_on_step_end_tensor_inputs,
657
+ )
658
+
659
+ self._guidance_scale = guidance_scale
660
+ self._guidance_rescale = guidance_rescale
661
+ self._clip_skip = clip_skip
662
+ self._cross_attention_kwargs = cross_attention_kwargs
663
+ self._denoising_end = denoising_end
664
+ self._interrupt = False
665
+
666
+ # 2. encode vq_embeds
667
+ assert images is not None or vq_indices is not None or vq_embeds is not None
668
+ batch_size = len(images) if images is not None else (len(vq_indices[0]) if vq_indices is not None else len(vq_embeds))
669
+
670
+ if images is not None:  # avoid ambiguous truth value when `images` is a tensor
671
+ vq_embeds, indices_semantic, indices_pixel = self.vq_model_embedder(images, return_indices=True)
672
+ elif vq_indices is not None:
673
+ indices_semantic, indices_pixel = vq_indices[0], vq_indices[1]
674
+ vq_embeds = self.vq_model_embedder.indices_to_codes(vq_indices[0], vq_indices[1])
675
+ elif vq_embeds is not None:  # tensor truthiness would raise for multi-element tensors
676
+ if isinstance(vq_embeds, list):
677
+ vq_embeds = self.vq_model_embedder.model.merge_quants(*vq_embeds)  # merge_quants lives on the wrapped vq model
678
+ indices_semantic, indices_pixel = None, None
679
+ else:
680
+ raise ValueError("No valid input provided")
681
+
682
+ device = self._execution_device
683
+
684
+ # 3. Encode input prompt
685
+ lora_scale = (
686
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
687
+ )
688
+
689
+ prompt_embeds = repeat(self.empty_prompt_embeds, '1 l c -> b l c', b=batch_size)
690
+ pooled_prompt_embeds = repeat(self.empty_pooled_prompt_embeds, '1 c -> b c', b=batch_size)
691
+
692
+ negative_prompt_embeds = prompt_embeds
693
+ negative_pooled_prompt_embeds = pooled_prompt_embeds
694
+
695
+ # 4. Prepare timesteps
696
+ timesteps, num_inference_steps = retrieve_timesteps(
697
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
698
+ )
699
+
700
+ # 5. Prepare latent variables
701
+ # num_channels_latents = self.unet.config.in_channels
702
+ num_channels_latents = 4
703
+ latents = self.prepare_latents(
704
+ batch_size,
705
+ num_channels_latents,
706
+ height,
707
+ width,
708
+ prompt_embeds.dtype,
709
+ device,
710
+ generator,
711
+ latents,
712
+ )
713
+
714
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
715
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
716
+
717
+ # 7. Prepare added time ids & embeddings
718
+ add_text_embeds = pooled_prompt_embeds
719
+ text_encoder_projection_dim = 1280
720
+
721
+ resolution = f'({width}, {height})'
722
+ assert resolution in self.resolution_group, f"resolution is not in the resolution group. Got {resolution}. Candidates: {self.resolution_group}"
723
+ resolution_index = self.resolution_group.index(resolution)
724
+ # resolution_index = None
725
+
726
+ add_time_ids = self._get_add_time_ids(
727
+ original_size,
728
+ crops_coords_top_left,
729
+ target_size,
730
+ dtype=prompt_embeds.dtype,
731
+ text_encoder_projection_dim=text_encoder_projection_dim,
732
+ resolution_index=resolution_index,
733
+ )
734
+ if negative_original_size is not None and negative_target_size is not None:
735
+ negative_add_time_ids = self._get_add_time_ids(
736
+ negative_original_size,
737
+ negative_crops_coords_top_left,
738
+ negative_target_size,
739
+ dtype=prompt_embeds.dtype,
740
+ text_encoder_projection_dim=text_encoder_projection_dim,
741
+ )
742
+ else:
743
+ negative_add_time_ids = add_time_ids
744
+
745
+ if self.do_classifier_free_guidance:
746
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
747
+ add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
748
+ add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
749
+
750
+ prompt_embeds = prompt_embeds.to(device)
751
+ add_text_embeds = add_text_embeds.to(device)
752
+ add_time_ids = add_time_ids.to(device).repeat(batch_size, 1)
753
+
754
+ # 8. Denoising loop
755
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
756
+
757
+ # 8.1 Apply denoising_end
758
+ if (
759
+ self.denoising_end is not None
760
+ and isinstance(self.denoising_end, float)
761
+ and self.denoising_end > 0
762
+ and self.denoising_end < 1
763
+ ):
764
+ discrete_timestep_cutoff = int(
765
+ round(
766
+ self.scheduler.config.num_train_timesteps
767
+ - (self.denoising_end * self.scheduler.config.num_train_timesteps)
768
+ )
769
+ )
770
+ num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
771
+ timesteps = timesteps[:num_inference_steps]
772
+
773
+ # 9. Optionally get Guidance Scale Embedding
774
+ timestep_cond = None
775
+ if self.unet.config.time_cond_proj_dim is not None:
776
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size)
777
+ timestep_cond = self.get_guidance_scale_embedding(
778
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
779
+ ).to(device=device, dtype=latents.dtype)
780
+
781
+ self._num_timesteps = len(timesteps)
782
+ # with self.progress_bar(total=num_inference_steps) as progress_bar:
783
+ for i, t in enumerate(timesteps):
784
+ if self.interrupt:
785
+ continue
786
+
787
+ # expand the latents if we are doing classifier free guidance
788
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
789
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
790
+
791
+ if vq_embeds.size(-1) == latent_model_input.size(-1):
+ vq_embeds = vq_embeds.to(latent_model_input)
+ else:
+ vq_embeds = torch.nn.functional.interpolate(vq_embeds.to(latent_model_input),
+ size=latent_model_input.shape[-2:])
796
+ vq_embeds_input = torch.cat([torch.zeros_like(vq_embeds),
797
+ vq_embeds]) if self.do_classifier_free_guidance else vq_embeds
798
+
799
+ # predict the noise residual
800
+ added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
801
+
802
+ latent_model_input = torch.cat([latent_model_input, vq_embeds_input], dim=1)
803
+ noise_pred = self.unet(
804
+ latent_model_input,
805
+ t,
806
+ encoder_hidden_states=prompt_embeds,
807
+ timestep_cond=timestep_cond,
808
+ cross_attention_kwargs=self.cross_attention_kwargs,
809
+ added_cond_kwargs=added_cond_kwargs,
810
+ return_dict=False,
811
+ )[0]
812
+
813
+ # perform guidance
814
+ if self.do_classifier_free_guidance:
815
+ noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
816
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
817
+
818
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
819
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
820
+ noise_pred = rescale_noise_cfg(noise_pred, noise_pred_cond, guidance_rescale=self.guidance_rescale)
821
+
822
+ # compute the previous noisy sample x_t -> x_t-1
823
+ latents_dtype = latents.dtype
824
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
825
+ if latents.dtype != latents_dtype:
826
+ if torch.backends.mps.is_available():
827
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
828
+ latents = latents.to(latents_dtype)
829
+
830
+ if callback_on_step_end is not None:
831
+ callback_kwargs = {}
832
+ for k in callback_on_step_end_tensor_inputs:
833
+ callback_kwargs[k] = locals()[k]
834
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
835
+
836
+ latents = callback_outputs.pop("latents", latents)
837
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
838
+ add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
839
+ add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
840
+
841
+ # call the callback, if provided
842
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
843
+ # progress_bar.update()
844
+ if callback is not None and i % callback_steps == 0:
845
+ step_idx = i // getattr(self.scheduler, "order", 1)
846
+ callback(step_idx, t, latents)
847
+
848
+ if XLA_AVAILABLE:
849
+ xm.mark_step()
850
+
851
+ if not output_type == "latent":
852
+ # make sure the VAE is in float32 mode, as it overflows in float16
853
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
854
+
855
+ if needs_upcasting:
856
+ self.upcast_vae()
857
+ latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
858
+ elif latents.dtype != self.vae.dtype:
859
+ if torch.backends.mps.is_available():
860
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
861
+ self.vae = self.vae.to(latents.dtype)
862
+
863
+ # unscale/denormalize the latents
864
+ # denormalize with the mean and std if available and not None
865
+ has_latents_mean = hasattr(self.vae.config, "latents_mean") and self.vae.config.latents_mean is not None
866
+ has_latents_std = hasattr(self.vae.config, "latents_std") and self.vae.config.latents_std is not None
867
+ if has_latents_mean and has_latents_std:
868
+ latents_mean = (
869
+ torch.tensor(self.vae.config.latents_mean).view(1, 4, 1, 1).to(latents.device, latents.dtype)
870
+ )
871
+ latents_std = (
872
+ torch.tensor(self.vae.config.latents_std).view(1, 4, 1, 1).to(latents.device, latents.dtype)
873
+ )
874
+ latents = latents * latents_std / self.vae.config.scaling_factor + latents_mean
875
+ else:
876
+ latents = latents / self.vae.config.scaling_factor
877
+
878
+ image = self.vae.decode(latents, return_dict=False)[0]
879
+
880
+ # cast back to fp16 if needed
881
+ if needs_upcasting:
882
+ self.vae.to(dtype=torch.float16)
883
+ else:
884
+ image = latents
885
+
886
+ if not output_type == "latent":
887
+ # apply watermark if available
888
+ if self.watermark is not None:
889
+ image = self.watermark.apply_watermark(image)
890
+
891
+ image = self.image_processor.postprocess(image, output_type=output_type)
892
+
893
+ # Offload all models
894
+ self.maybe_free_model_hooks()
895
+
896
+ if not return_dict:
897
+ return (image,)
898
+
899
+ return StableDiffusionXLDecoderPipelineOutput(images=image,
900
+ indices_semantic=indices_semantic,
901
+ indices_pixel=indices_pixel)
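A hedged end-to-end sketch of driving this pipeline ("user/repo" is a placeholder Hub id; `indices_semantic`/`indices_pixel` are assumed to be precomputed, e.g. from `vq_encode_code` above or an autoregressive generator):

import torch
from diffusers import DiffusionPipeline

# Placeholder repo id; trust_remote_code lets diffusers import this module.
pipe = DiffusionPipeline.from_pretrained(
    "user/repo", trust_remote_code=True, torch_dtype=torch.float16
).to("cuda")

# height/width must format to an entry of pipe.resolution_group, e.g. '(1024, 1024)'.
# indices_semantic / indices_pixel: precomputed DualViTok token maps.
out = pipe(vq_indices=(indices_semantic, indices_pixel),
           height=1024, width=1024, num_inference_steps=50, guidance_scale=2.0)
image = out.images[0]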