Zuyan committed on
Commit 6cae37b · verified · 1 Parent(s): 6ca609d

Upload folder using huggingface_hub
config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "architectures": [
+     "HunyuanViTModel"
+   ],
+   "_attn_implementation": "flash_attention_2",
+   "initializer_factor": 1.0,
+   "model_type": "HunyuanViT",
+   "torch_dtype": "float32",
+   "transformers_version": "4.52.4",
+   "vision_config": {
+     "attention_dropout": 0.0,
+     "hidden_act": "gelu_pytorch_tanh",
+     "hidden_size": 1536,
+     "intermediate_size": 6144,
+     "layer_norm_eps": 1e-06,
+     "model_type": "HunyuanViT_vision_model",
+     "num_attention_heads": 16,
+     "num_channels": 3,
+     "num_hidden_layers": 40,
+     "num_patches": 16384,
+     "patch_size": 16
+   }
+ }
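The nested "vision_config" block above is the part that HunyuanViTConfig (defined in configuration_HunyuanViT.py below) forwards to HunyuanViTVisionConfig. A minimal sketch of reading it back, assuming this folder is the working directory and on sys.path (the loading path itself is illustrative, not part of the commit):

import json
from configuration_HunyuanViT import HunyuanViTConfig

with open("config.json") as f:
    raw = json.load(f)

config = HunyuanViTConfig(vision_config=raw["vision_config"])
print(config.model_type, config.vision_config.num_hidden_layers)  # "HunyuanViT" 40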
configuration_HunyuanViT.py ADDED
@@ -0,0 +1,179 @@
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+ # This file was automatically generated from src/transformers/models/HunyuanViT/modular_HunyuanViT.py.
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
+ # the file from the modular. If any change should be done, please apply the change to the
+ # modular_HunyuanViT.py file directly. One of our CI enforces this.
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+ # coding=utf-8
+ # Copyright 2025 The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ class HunyuanViTVisionConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`HunyuanViTVisionModel`]. It is used to instantiate a
+     HunyuanViT vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+     configuration with the defaults will yield a similar configuration to that of the vision encoder of the HunyuanViT
+     [google/HunyuanViT-base-patch16-naflex](https://huggingface.co/google/HunyuanViT-base-patch16-naflex) architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         hidden_size (`int`, *optional*, defaults to 768):
+             Dimensionality of the encoder layers and the pooler layer.
+         intermediate_size (`int`, *optional*, defaults to 3072):
+             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+         num_hidden_layers (`int`, *optional*, defaults to 12):
+             Number of hidden layers in the Transformer encoder.
+         num_attention_heads (`int`, *optional*, defaults to 12):
+             Number of attention heads for each attention layer in the Transformer encoder.
+         num_channels (`int`, *optional*, defaults to 3):
+             Number of channels in the input images.
+         num_patches (`int`, *optional*, defaults to 256):
+             The number of patches in the image, each of size (`patch_size`, `patch_size`).
+             The image is resized to use at most this number of patches while preserving
+             the aspect ratio. If the resulting number of patches is lower, the image is
+             padded along the patch dimension.
+         patch_size (`int`, *optional*, defaults to 16):
+             The size (resolution) of each patch.
+         hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+             `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the layer normalization layers.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+
+     Example:
+
+     ```python
+     >>> from transformers import HunyuanViTVisionConfig, HunyuanViTVisionModel
+
+     >>> # Initializing a HunyuanViTVisionConfig with google/HunyuanViT-base-patch16-naflex style configuration
+     >>> configuration = HunyuanViTVisionConfig()
+
+     >>> # Initializing a HunyuanViTVisionModel (with random weights) from the google/HunyuanViT-base-patch16-naflex style configuration
+     >>> model = HunyuanViTVisionModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "HunyuanViT_vision_model"
+     base_config_key = "vision_config"
+
+     def __init__(
+         self,
+         hidden_size=768,
+         intermediate_size=3072,
+         num_hidden_layers=12,
+         num_attention_heads=12,
+         num_channels=3,
+         num_patches=256,
+         patch_size=16,
+         hidden_act="gelu_pytorch_tanh",
+         layer_norm_eps=1e-6,
+         attention_dropout=0.0,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_channels = num_channels
+         self.patch_size = patch_size
+         self.attention_dropout = attention_dropout
+         self.layer_norm_eps = layer_norm_eps
+         self.hidden_act = hidden_act
+         self.num_patches = num_patches
+
+
+ class HunyuanViTConfig(PretrainedConfig):
+     r"""
+     [`HunyuanViTConfig`] is the configuration class to store the configuration of a [`HunyuanViTModel`]. It is used to
+     instantiate a HunyuanViT model according to the specified arguments, defining the text model and vision model configs.
+     Instantiating a configuration with the defaults will yield a similar configuration to that of the HunyuanViT
+     [google/HunyuanViT-base-patch16-224](https://huggingface.co/google/HunyuanViT-base-patch16-224) architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         text_config (`dict`, *optional*):
+             Dictionary of configuration options used to initialize [`HunyuanViTTextConfig`].
+         vision_config (`dict`, *optional*):
+             Dictionary of configuration options used to initialize [`HunyuanViTVisionConfig`].
+         kwargs (*optional*):
+             Dictionary of keyword arguments.
+
+     Example:
+
+     ```python
+     >>> from transformers import HunyuanViTConfig, HunyuanViTModel
+
+     >>> # Initializing a HunyuanViTConfig with google/HunyuanViT-base-patch16-224 style configuration
+     >>> configuration = HunyuanViTConfig()
+
+     >>> # Initializing a HunyuanViTModel (with random weights) from the google/HunyuanViT-base-patch16-224 style configuration
+     >>> model = HunyuanViTModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+
+     >>> # We can also initialize a HunyuanViTConfig from a HunyuanViTTextConfig and a HunyuanViTVisionConfig
+     >>> from transformers import HunyuanViTTextConfig, HunyuanViTVisionConfig
+
+     >>> # Initializing a HunyuanViTText and HunyuanViTVision configuration
+     >>> config_text = HunyuanViTTextConfig()
+     >>> config_vision = HunyuanViTVisionConfig()
+
+     >>> config = HunyuanViTConfig.from_text_vision_configs(config_text, config_vision)
+     ```"""
+
+     model_type = "HunyuanViT"
+     sub_configs = {"vision_config": HunyuanViTVisionConfig}
+
+     def __init__(self, text_config=None, vision_config=None, **kwargs):
+         super().__init__(**kwargs)
+
+         if vision_config is None:
+             vision_config = {}
+             logger.info("`vision_config` is `None`. Initializing the `HunyuanViTVisionConfig` with default values.")
+
+         self.vision_config = HunyuanViTVisionConfig(**vision_config)
+
+         self.initializer_factor = 1.0
+
+     @classmethod
+     def from_text_vision_configs(cls, vision_config: HunyuanViTVisionConfig, **kwargs):
+         r"""
+         Instantiate a [`HunyuanViTConfig`] (or a derived class) from HunyuanViT text model configuration and HunyuanViT vision
+         model configuration.
+
+         Returns:
+             [`HunyuanViTConfig`]: An instance of a configuration object
+         """
+
+         return cls(vision_config=vision_config.to_dict(), **kwargs)
+
+
+ __all__ = ["HunyuanViTConfig", "HunyuanViTVisionConfig"]
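A small illustrative check (not part of the commit) of the sizes this configuration implies for the checkpoint above: 16 heads over a 1536-wide hidden state give 96-dimensional heads, the 16384 learned position embeddings form a 128 x 128 grid, and each patch token carries 3 * 16 * 16 = 768 input features.

from configuration_HunyuanViT import HunyuanViTVisionConfig

vision_config = HunyuanViTVisionConfig(
    hidden_size=1536, intermediate_size=6144, num_hidden_layers=40,
    num_attention_heads=16, num_patches=16384, patch_size=16,
)
head_dim = vision_config.hidden_size // vision_config.num_attention_heads     # 96
pos_grid = int(vision_config.num_patches ** 0.5)                              # 128
patch_features = vision_config.num_channels * vision_config.patch_size ** 2   # 768
print(head_dim, pos_grid, patch_features)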
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4ca9916451580f64b600eada187e5f4d5203cf428bacc0f4be780fe9b9527155
+ size 4638523464
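As a rough sanity check (an estimate, not metadata from the commit), the LFS size is consistent with the vision tower described in config.json stored in float32; the small remainder is the safetensors header.

# Hypothetical back-of-the-envelope parameter count for the config.json sizes (float32 = 4 bytes).
hidden, inter, layers, num_patches, patch_in = 1536, 6144, 40, 16384, 3 * 16 * 16
per_layer = (
    4 * (hidden * hidden + hidden)   # q/k/v/out projections
    + (hidden * inter + inter)       # MLP fc1
    + (inter * hidden + hidden)      # MLP fc2
    + 2 * 2 * hidden                 # two LayerNorms
)
params = (
    layers * per_layer
    + num_patches * hidden           # position embedding
    + (patch_in * hidden + hidden)   # patch embedding (Linear)
    + 2 * hidden                     # post_layernorm
)
print(params, 4 * params)            # ~1.16e9 parameters, ~4.64e9 bytes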
modeling_HunyuanViT.py ADDED
@@ -0,0 +1,838 @@
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+ # This file was automatically generated from src/transformers/models/HunyuanViT/modular_HunyuanViT.py.
+ # Do NOT edit this file manually as any edits will be overwritten by the generation of
+ # the file from the modular. If any change should be done, please apply the change to the
+ # modular_HunyuanViT.py file directly. One of our CI enforces this.
+ # 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+ # coding=utf-8
+ # Copyright 2025 The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import math
+ import warnings
+ from dataclasses import dataclass
+ from typing import Any, Callable, Optional, Tuple, Union
+
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+ from torch.nn.init import _calculate_fan_in_and_fan_out
+
+ from transformers.activations import ACT2FN
+ from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
+ from transformers.modeling_layers import GradientCheckpointingLayer
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+ from transformers.utils import ModelOutput, auto_docstring, can_return_tuple, logging
+ from configuration_HunyuanViT import HunyuanViTConfig, HunyuanViTVisionConfig
+
+
+ logger = logging.get_logger(__name__)
+
+
+ @dataclass
+ class HunyuanViTVisionOutput(ModelOutput):
+     """
+     Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
+
+     Args:
+         image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+             The image embeddings obtained by applying the projection layer to the pooler_output.
+         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+             Sequence of hidden-states at the output of the last layer of the model.
+         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+             one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+             Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+         attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+             Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+             sequence_length)`.
+
+             Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+             heads.
+     """
+
+     image_embeds: Optional[torch.FloatTensor] = None
+     last_hidden_state: Optional[torch.FloatTensor] = None
+     hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
+     attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
+
+ @dataclass
+ class HunyuanViTOutput(ModelOutput):
+     """
+     Args:
+         loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
+             Contrastive loss for image-text similarity.
+         logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
+             The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
+             similarity scores.
+         logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
+             The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
+             similarity scores.
+         text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
+             The text embeddings obtained by applying the projection layer to the pooled output of [`HunyuanViTTextModel`].
+         image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
+             The image embeddings obtained by applying the projection layer to the pooled output of [`HunyuanViTVisionModel`].
+         text_model_output (`BaseModelOutputWithPooling`):
+             The output of the [`HunyuanViTTextModel`].
+         vision_model_output (`BaseModelOutputWithPooling`):
+             The output of the [`HunyuanViTVisionModel`].
+     """
+
+     loss: Optional[torch.FloatTensor] = None
+     logits_per_image: Optional[torch.FloatTensor] = None
+     logits_per_text: Optional[torch.FloatTensor] = None
+     image_embeds: Optional[torch.FloatTensor] = None
+
+     def to_tuple(self) -> Tuple[Any]:
+         return tuple(
+             self[k] if k not in ["vision_model_output"] else getattr(self, k).to_tuple()
+             for k in self.keys()
+         )
+
+
+ class HunyuanViTVisionEmbeddings(nn.Module):
+     def __init__(self, config: HunyuanViTVisionConfig):
+         super().__init__()
+         self.config = config
+         self.embed_dim = config.hidden_size
+         self.patch_size = config.patch_size
+
+         self.patch_embedding = nn.Linear(
+             in_features=config.num_channels * self.patch_size * self.patch_size,
+             out_features=self.embed_dim,
+         )
+
+         self.num_patches = config.num_patches
+         self.position_embedding_size = int(self.num_patches**0.5)
+         self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)
+
+     @staticmethod
+     def resize_positional_embeddings(
+         positional_embeddings: torch.Tensor,
+         spatial_shapes: torch.LongTensor,
+         max_length: int,
+     ) -> torch.Tensor:
+         """
+         Resize positional embeddings to image-specific size and pad to a fixed size.
+
+         Args:
+             positional_embeddings (`torch.Tensor`):
+                 Position embeddings of shape (height, width, embed_dim)
+             spatial_shapes (`torch.LongTensor`):
+                 Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
+             max_length (`int`):
+                 Maximum length of the positional embeddings to pad resized positional embeddings to
+
+         Returns:
+             `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
+         """
+         batch_size = spatial_shapes.shape[0]
+         embed_dim = positional_embeddings.shape[-1]
+         source_dtype = positional_embeddings.dtype
+
+         resulted_positional_embeddings = torch.empty(
+             (batch_size, max_length, embed_dim),
+             device=positional_embeddings.device,
+             dtype=source_dtype,
+         )
+
+         # (height, width, embed_dim) -> (1, embed_dim, height, width) for interpolation
+         positional_embeddings = positional_embeddings.permute(2, 0, 1).unsqueeze(0)
+
+         # Upcast to float32 on CPU because antialias is not supported for bfloat16/float16 on CPU
+         if positional_embeddings.device.type == "cpu":
+             positional_embeddings = positional_embeddings.to(torch.float32)
+
+         for i in range(batch_size):
+             # (1, dim, height, width) -> (1, dim, target_height, target_width)
+             height, width = spatial_shapes[i]
+             resized_embeddings = F.interpolate(
+                 positional_embeddings,
+                 size=(height, width),
+                 mode="bilinear",
+                 align_corners=False,
+                 antialias=True,
+             )
+
+             # (1, dim, target_height, target_width) -> (target_height * target_width, dim)
+             resized_embeddings = resized_embeddings.reshape(embed_dim, height * width).transpose(0, 1)
+
+             # Cast to original dtype
+             resized_embeddings = resized_embeddings.to(source_dtype)
+
+             resulted_positional_embeddings[i, : height * width] = resized_embeddings
+             resulted_positional_embeddings[i, height * width :] = resized_embeddings[0]
+
+         return resulted_positional_embeddings
+
+     def forward(self, pixel_values: torch.FloatTensor, spatial_shapes: torch.LongTensor) -> torch.Tensor:
+         """
+         Args:
+             pixel_values (`torch.FloatTensor`):
+                 Pixel values of shape (batch_size, max_num_patches, num_channels * patch_size * patch_size)
+             spatial_shapes (`List[Tuple[int, int]]`):
+                 Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
+         """
+
+         # Apply patch embeddings to already patchified pixel values
+         target_dtype = self.patch_embedding.weight.dtype
+         patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
+
+         # Get positional resized and padded positional embeddings
+         positional_embeddings = self.position_embedding.weight.reshape(
+             self.position_embedding_size, self.position_embedding_size, -1
+         )
+         resized_positional_embeddings = self.resize_positional_embeddings(
+             positional_embeddings, spatial_shapes, max_length=pixel_values.shape[1]
+         )
+
+         # Add positional embeddings to patch embeddings
+         embeddings = patch_embeds + resized_positional_embeddings
+         return embeddings
+
+
+ def eager_attention_forward(
+     module: nn.Module,
+     query: torch.Tensor,
+     key: torch.Tensor,
+     value: torch.Tensor,
+     attention_mask: Optional[torch.Tensor],
+     scaling: float,
+     dropout: float = 0.0,
+     **kwargs,
+ ):
+     attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
+     if attention_mask is not None:
+         attn_weights = attn_weights + attention_mask
+
+     attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+     attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+     attn_output = torch.matmul(attn_weights, value)
+     attn_output = attn_output.transpose(1, 2).contiguous()
+
+     return attn_output, attn_weights
+
+
+ class HunyuanViTAttention(nn.Module):
+     """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.embed_dim = config.hidden_size
+         self.num_heads = config.num_attention_heads
+         self.head_dim = self.embed_dim // self.num_heads
+         if self.head_dim * self.num_heads != self.embed_dim:
+             raise ValueError(
+                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                 f" {self.num_heads})."
+             )
+         self.scale = self.head_dim**-0.5
+         self.dropout = config.attention_dropout
+         self.is_causal = False
+
+         self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+         self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+         self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+         self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = False,
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+         """Input shape: Batch x Time x Channel"""
+
+         batch_size, seq_length, embed_dim = hidden_states.shape
+
+         queries = self.q_proj(hidden_states)
+         keys = self.k_proj(hidden_states)
+         values = self.v_proj(hidden_states)
+
+         queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+         keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+         values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+
+         attention_interface: Callable = eager_attention_forward
+         if self.config._attn_implementation != "eager":
+             if self.config._attn_implementation == "sdpa" and output_attentions:
+                 logger.warning_once(
+                     "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to "
+                     'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+                 )
+             else:
+                 attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+         attn_output, attn_weights = attention_interface(
+             self,
+             queries,
+             keys,
+             values,
+             attention_mask,
+             is_causal=self.is_causal,
+             scaling=self.scale,
+             dropout=0.0 if not self.training else self.dropout,
+         )
+
+         attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
+         attn_output = self.out_proj(attn_output)
+
+         if not output_attentions:
+             attn_weights = None
+
+         return attn_output, attn_weights
+
+
+ class HunyuanViTMLP(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.config = config
+         self.activation_fn = ACT2FN[config.hidden_act]
+         self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+         self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+         hidden_states = self.fc1(hidden_states)
+         hidden_states = self.activation_fn(hidden_states)
+         hidden_states = self.fc2(hidden_states)
+         return hidden_states
+
+
+ class HunyuanViTEncoderLayer(GradientCheckpointingLayer):
+     def __init__(self, config: HunyuanViTVisionConfig):
+         super().__init__()
+         self.embed_dim = config.hidden_size
+         self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+         self.self_attn = HunyuanViTAttention(config)
+         self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+         self.mlp = HunyuanViTMLP(config)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: torch.Tensor,
+         output_attentions: Optional[bool] = False,
+     ) -> Tuple[torch.FloatTensor]:
+         """
+         Args:
+             hidden_states (`torch.FloatTensor`):
+                 Input to the layer of shape `(batch, seq_len, embed_dim)`.
+             attention_mask (`torch.FloatTensor`):
+                 Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
+             output_attentions (`bool`, *optional*, defaults to `False`):
+                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                 returned tensors for more detail.
+         """
+         residual = hidden_states
+
+         hidden_states = self.layer_norm1(hidden_states)
+         hidden_states, attn_weights = self.self_attn(
+             hidden_states=hidden_states,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+         )
+         hidden_states = residual + hidden_states
+
+         residual = hidden_states
+         hidden_states = self.layer_norm2(hidden_states)
+         hidden_states = self.mlp(hidden_states)
+         hidden_states = residual + hidden_states
+
+         outputs = (hidden_states,)
+
+         if output_attentions:
+             outputs += (attn_weights,)
+
+         return outputs
+
+
+ class HunyuanViTEncoder(nn.Module):
+     """
+     Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+     [`HunyuanViTEncoderLayer`].
+
+     Args:
+         config: HunyuanViTConfig
+     """
+
+     def __init__(self, config: HunyuanViTConfig):
+         super().__init__()
+         self.config = config
+         self.layers = nn.ModuleList([HunyuanViTEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+         self.gradient_checkpointing = False
+
+     # Ignore copy
+     @can_return_tuple
+     def forward(
+         self,
+         inputs_embeds,
+         attention_mask: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+     ) -> BaseModelOutput:
+         r"""
+         Args:
+             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                 Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                 This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                 than the model's internal embedding lookup matrix.
+             attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                 Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                 - 1 for tokens that are **not masked**,
+                 - 0 for tokens that are **masked**.
+
+                 [What are attention masks?](../glossary#attention-mask)
+             output_attentions (`bool`, *optional*):
+                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                 returned tensors for more detail.
+             output_hidden_states (`bool`, *optional*):
+                 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                 for more detail.
+             return_dict (`bool`, *optional*):
+                 Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+         """
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+
+         encoder_states = () if output_hidden_states else None
+         all_attentions = () if output_attentions else None
+
+         hidden_states = inputs_embeds
+         for encoder_layer in self.layers:
+             if output_hidden_states:
+                 encoder_states = encoder_states + (hidden_states,)
+
+             layer_outputs = encoder_layer(
+                 hidden_states,
+                 attention_mask,
+                 output_attentions=output_attentions,
+             )
+
+             hidden_states = layer_outputs[0]
+
+             if output_attentions:
+                 all_attentions = all_attentions + (layer_outputs[1],)
+
+         if output_hidden_states:
+             encoder_states = encoder_states + (hidden_states,)
+
+         return BaseModelOutput(
+             last_hidden_state=hidden_states,
+             hidden_states=encoder_states,
+             attentions=all_attentions,
+         )
+
+
+ class HunyuanViTVisionTransformer(nn.Module):
+     def __init__(self, config: HunyuanViTVisionConfig):
+         super().__init__()
+         self.config = config
+         embed_dim = config.hidden_size
+
+         self.embeddings = HunyuanViTVisionEmbeddings(config)
+         self.encoder = HunyuanViTEncoder(config)
+         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+         # self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head
+         # if self.use_head:
+         #     self.head = HunyuanViTMultiheadAttentionPoolingHead(config)
+         self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+
+     @can_return_tuple
+     @auto_docstring
+     def forward(
+         self,
+         pixel_values: torch.FloatTensor,
+         attention_mask: torch.Tensor,
+         spatial_shapes: torch.LongTensor,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+     ) -> BaseModelOutput:
+         r"""
+         spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
+             Tensor containing the spatial dimensions (height, width) of the input images.
+         """
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+
+         hidden_states = self.embeddings(pixel_values, spatial_shapes)
+
+         if attention_mask is not None and not self._use_flash_attention_2:
+             # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
+             encoder_attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
+         else:
+             encoder_attention_mask = attention_mask
+
+         encoder_outputs: BaseModelOutput = self.encoder(
+             inputs_embeds=hidden_states,
+             attention_mask=encoder_attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+         )
+
+         last_hidden_state = encoder_outputs.last_hidden_state
+         last_hidden_state = self.post_layernorm(last_hidden_state)
+
+         # pooler_output = self.head(last_hidden_state, attention_mask) if self.use_head else None
+
+         return BaseModelOutput(
+             last_hidden_state=last_hidden_state,
+             # pooler_output=pooler_output,
+             hidden_states=encoder_outputs.hidden_states,
+             attentions=encoder_outputs.attentions,
+         )
+
+
+ def _trunc_normal_(tensor, mean, std, a, b):
+     # Cut & paste from PyTorch official master until it's in a few official releases - RW
+     # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+     def norm_cdf(x):
+         # Computes standard normal cumulative distribution function
+         return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+
+     if (mean < a - 2 * std) or (mean > b + 2 * std):
+         warnings.warn(
+             "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+             "The distribution of values may be incorrect.",
+             stacklevel=2,
+         )
+
+     # Values are generated by using a truncated uniform distribution and
+     # then using the inverse CDF for the normal distribution.
+     # Get upper and lower cdf values
+     l = norm_cdf((a - mean) / std)
+     u = norm_cdf((b - mean) / std)
+
+     # Uniformly fill tensor with values from [l, u], then translate to
+     # [2l-1, 2u-1].
+     tensor.uniform_(2 * l - 1, 2 * u - 1)
+
+     # Use inverse cdf transform for normal distribution to get truncated
+     # standard normal
+     tensor.erfinv_()
+
+     # Transform to proper mean, std
+     tensor.mul_(std * math.sqrt(2.0))
+     tensor.add_(mean)
+
+     # Clamp to ensure it's in the proper range
+     tensor.clamp_(min=a, max=b)
+
+
+ def trunc_normal_tf_(
+     tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
+ ) -> torch.Tensor:
+     """Fills the input Tensor with values drawn from a truncated
+     normal distribution. The values are effectively drawn from the
+     normal distribution :math:`\\mathcal{N}(\text{mean}, \text{std}^2)`
+     with values outside :math:`[a, b]` redrawn until they are within
+     the bounds. The method used for generating the random values works
+     best when :math:`a \\leq \text{mean} \\leq b`.
+
+     NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
+     bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
+     and the result is subsequently scaled and shifted by the mean and std args.
+
+     Args:
+         tensor: an n-dimensional `torch.Tensor`
+         mean: the mean of the normal distribution
+         std: the standard deviation of the normal distribution
+         a: the minimum cutoff value
+         b: the maximum cutoff value
+     """
+     with torch.no_grad():
+         _trunc_normal_(tensor, 0, 1.0, a, b)
+         tensor.mul_(std).add_(mean)
+
+
+ def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
+     fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
+     if mode == "fan_in":
+         denom = fan_in
+     elif mode == "fan_out":
+         denom = fan_out
+     elif mode == "fan_avg":
+         denom = (fan_in + fan_out) / 2
+
+     variance = scale / denom
+
+     if distribution == "truncated_normal":
+         # constant is stddev of standard normal truncated to (-2, 2)
+         trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
+     elif distribution == "normal":
+         with torch.no_grad():
+             tensor.normal_(std=math.sqrt(variance))
+     elif distribution == "uniform":
+         bound = math.sqrt(3 * variance)
+         with torch.no_grad():
+             tensor.uniform_(-bound, bound)
+     else:
+         raise ValueError(f"invalid distribution {distribution}")
+
+
+ def lecun_normal_(tensor):
+     variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
+
+
+ def default_flax_embed_init(tensor):
+     variance_scaling_(tensor, mode="fan_in", distribution="normal")
+
+ @auto_docstring
+ class HunyuanViTPreTrainedModel(PreTrainedModel):
+     config_class = HunyuanViTConfig
+     base_model_prefix = "HunyuanViT"
+     supports_gradient_checkpointing = True
+
+     _no_split_modules = [
+         "HunyuanViTTextEmbeddings",
+         "HunyuanViTEncoderLayer",
+         "HunyuanViTVisionEmbeddings",
+         "HunyuanViTEncoderLayer",
+         # "HunyuanViTMultiheadAttentionPoolingHead",
+     ]
+     _supports_flash_attn_2 = True
+     _supports_sdpa = True
+     _supports_flex_attn = True
+     _supports_attention_backend = True
+
+     def _init_weights(self, module):
+         """Initialize the weights"""
+         if isinstance(module, HunyuanViTVisionEmbeddings):
+             width = (
+                 self.config.vision_config.hidden_size
+                 if isinstance(self.config, HunyuanViTConfig)
+                 else self.config.hidden_size
+             )
+             nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
+         elif isinstance(module, nn.Embedding):
+             default_flax_embed_init(module.weight)
+         elif isinstance(module, HunyuanViTAttention):
+             nn.init.xavier_uniform_(module.q_proj.weight)
+             nn.init.xavier_uniform_(module.k_proj.weight)
+             nn.init.xavier_uniform_(module.v_proj.weight)
+             nn.init.xavier_uniform_(module.out_proj.weight)
+             nn.init.zeros_(module.q_proj.bias)
+             nn.init.zeros_(module.k_proj.bias)
+             nn.init.zeros_(module.v_proj.bias)
+             nn.init.zeros_(module.out_proj.bias)
+         elif isinstance(module, HunyuanViTMLP):
+             nn.init.xavier_uniform_(module.fc1.weight)
+             nn.init.xavier_uniform_(module.fc2.weight)
+             nn.init.normal_(module.fc1.bias, std=1e-6)
+             nn.init.normal_(module.fc2.bias, std=1e-6)
+         # elif isinstance(module, HunyuanViTMultiheadAttentionPoolingHead):
+         #     nn.init.xavier_uniform_(module.probe.data)
+         #     nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
+         #     nn.init.zeros_(module.attention.in_proj_bias.data)
+         elif isinstance(module, (nn.Linear, nn.Conv2d)):
+             lecun_normal_(module.weight)
+             if module.bias is not None:
+                 nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.LayerNorm):
+             module.bias.data.zero_()
+             module.weight.data.fill_(1.0)
+
+
+ # class HunyuanViTMultiheadAttentionPoolingHead(nn.Module):
+ #     """Multihead Attention Pooling."""
+
+ #     def __init__(self, config: HunyuanViTVisionConfig):
+ #         super().__init__()
+
+ #         self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+ #         self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
+ #         self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ #         self.mlp = HunyuanViTMLP(config)
+ #         self.num_heads = config.num_attention_heads
+
+ #     def forward(self, hidden_state: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+ #         batch_size = hidden_state.shape[0]
+ #         probe = self.probe.repeat(batch_size, 1, 1)
+
+ #         if attention_mask is not None:
+ #             target_len, source_len = probe.shape[1], hidden_state.shape[1]
+ #             attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_state.dtype, target_len)
+ #             attention_mask = attention_mask.repeat(1, self.num_heads, target_len, 1)
+ #             attention_mask = attention_mask.reshape(-1, target_len, source_len)
+
+ #         hidden_state = self.attention(probe, hidden_state, hidden_state, attn_mask=attention_mask)[0]
+
+ #         residual = hidden_state
+ #         hidden_state = self.layernorm(hidden_state)
+ #         hidden_state = residual + self.mlp(hidden_state)
+
+ #         return hidden_state[:, 0]
+
+
+ @auto_docstring(
+     custom_intro="""
+     The vision model from HunyuanViT without any head or projection on top.
+     """
+ )
+ class HunyuanViTVisionModel(HunyuanViTPreTrainedModel):
+     config_class = HunyuanViTVisionConfig
+     main_input_name = "pixel_values"
+
+     def __init__(self, config: HunyuanViTVisionConfig):
+         super().__init__(config)
+
+         self.vision_model = HunyuanViTVisionTransformer(config)
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     def get_input_embeddings(self) -> nn.Module:
+         return self.vision_model.embeddings.patch_embedding
+
+     @can_return_tuple
+     @auto_docstring
+     def forward(
+         self,
+         pixel_values: torch.FloatTensor,
+         pixel_attention_mask: torch.Tensor,
+         spatial_shapes: torch.LongTensor,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+     ) -> BaseModelOutputWithPooling:
+         r"""
+         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
+             Mask to avoid performing attention on padding pixel indices.
+         spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
+             Tensor containing the spatial dimensions (height, width) of the input images.
+
+         Examples:
+
+         ```python
+         >>> from PIL import Image
+         >>> import requests
+         >>> from transformers import AutoProcessor, HunyuanViTVisionModel
+
+         >>> model = HunyuanViTVisionModel.from_pretrained("google/HunyuanViT-base-patch16-224")
+         >>> processor = AutoProcessor.from_pretrained("google/HunyuanViT-base-patch16-224")
+
+         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+         >>> image = Image.open(requests.get(url, stream=True).raw)
+
+         >>> inputs = processor(images=image, return_tensors="pt")
+
+         >>> outputs = model(**inputs)
+         >>> last_hidden_state = outputs.last_hidden_state
+         >>> pooled_output = outputs.pooler_output  # pooled features
+         ```"""
+         return self.vision_model(
+             pixel_values=pixel_values,
+             attention_mask=pixel_attention_mask,
+             spatial_shapes=spatial_shapes,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+         )
+
+
+ @auto_docstring
+ class HunyuanViTModel(HunyuanViTPreTrainedModel):
+     config_class = HunyuanViTConfig
+
+     def __init__(self, config: HunyuanViTConfig):
+         super().__init__(config)
+
+         if not isinstance(config.vision_config, HunyuanViTVisionConfig):
+             raise TypeError(
+                 "config.vision_config is expected to be of type HunyuanViTVisionConfig but is of type"
+                 f" {type(config.vision_config)}."
+             )
+
+         vision_config = config.vision_config
+
+         # First, initialize the text and vision models with proper attention implementation
+         vision_model = HunyuanViTVisionModel._from_config(vision_config)
+
+         # Second, get the text and vision submodules (for backward compatibility)
+         self.vision_model = vision_model.vision_model
+
+         # Initialize weights and apply final processing
+         self.post_init()
+
+     @auto_docstring
+     def get_image_features(
+         self,
+         pixel_values: Optional[torch.FloatTensor] = None,
+         pixel_attention_mask: Optional[torch.Tensor] = None,
+         spatial_shapes: Optional[torch.LongTensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+     ) -> torch.FloatTensor:
+         r"""
+         pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
+             Mask to avoid performing attention on padding pixel indices.
+         spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
+             Tensor containing the spatial dimensions (height, width) of the input images.
+
+         Returns:
+             image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
+             applying the projection layer to the pooled output of [`HunyuanViTVisionModel`].
+
+         Examples:
+
+         ```python
+         >>> from PIL import Image
+         >>> import requests
+         >>> from transformers import AutoProcessor, AutoModel
+         >>> import torch
+
+         >>> model = AutoModel.from_pretrained("google/HunyuanViT-base-patch16-224")
+         >>> processor = AutoProcessor.from_pretrained("google/HunyuanViT-base-patch16-224")
+
+         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+         >>> image = Image.open(requests.get(url, stream=True).raw)
+
+         >>> inputs = processor(images=image, return_tensors="pt")
+
+         >>> with torch.no_grad():
+         ...     image_features = model.get_image_features(**inputs)
+         ```
+         """
+         # Use HunyuanViTModel's config for some fields (if specified) instead of those of vision & text components.
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+
+         vision_outputs: BaseModelOutputWithPooling = self.vision_model(
+             pixel_values=pixel_values,
+             attention_mask=pixel_attention_mask,
+             spatial_shapes=spatial_shapes,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+         )
+
+         pooled_output = vision_outputs.pooler_output
+
+         return pooled_output
+
+
+ __all__ = [
+     "HunyuanViTModel",
+     "HunyuanViTPreTrainedModel",
+     "HunyuanViTVisionModel",
+     # "HunyuanViTForImageClassification",  # not defined in this file
+ ]
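A minimal smoke-test sketch, assuming this folder is on sys.path; the tiny config values, random weights, and dummy tensors below are illustrative only (real use would load the safetensors weights and a matching image processor). It shows the calling convention: pixel values arrive already patchified as (batch, max_num_patches, channels * patch_size**2), with a patch-level attention mask and per-image (height, width) patch grids.

import torch
from configuration_HunyuanViT import HunyuanViTVisionConfig
from modeling_HunyuanViT import HunyuanViTVisionModel

config = HunyuanViTVisionConfig(
    hidden_size=64, intermediate_size=128, num_hidden_layers=2,
    num_attention_heads=4, num_patches=64, patch_size=16,
    attn_implementation="eager",
)
model = HunyuanViTVisionModel(config).eval()

batch, max_patches = 1, 16
pixel_values = torch.randn(batch, max_patches, 3 * 16 * 16)               # patchified pixels
pixel_attention_mask = torch.ones(batch, max_patches, dtype=torch.long)   # all patches valid
spatial_shapes = torch.tensor([[4, 4]])                                    # 4 x 4 patch grid

with torch.no_grad():
    outputs = model(
        pixel_values=pixel_values,
        pixel_attention_mask=pixel_attention_mask,
        spatial_shapes=spatial_shapes,
    )
print(outputs.last_hidden_state.shape)  # torch.Size([1, 16, 64])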
preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
+ {
+   "do_convert_rgb": null,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "image_processor_type": "HunyuanViTImageProcessor",
+   "image_std": [
+     0.5,
+     0.5,
+     0.5
+   ],
+   "processor_class": "HunyuanViTProcessor",
+   "resample": 2,
+   "rescale_factor": 0.00392156862745098,
+   "base_size": 0,
+   "maxres": 1536,
+   "minres": 0
+ }
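With these values an input pixel is first rescaled by rescale_factor = 1/255 and then normalized with a per-channel mean and std of 0.5, i.e. mapped from [0, 255] into roughly [-1, 1]; resample = 2 corresponds to PIL's bilinear filter. A tiny worked example (illustrative only):

x = 200                                   # a uint8 pixel value
rescaled = x * 0.00392156862745098        # ≈ 0.784
normalized = (rescaled - 0.5) / 0.5       # ≈ 0.569
print(rescaled, normalized)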