townwish commited on
Commit
e4469bb
·
verified ·
1 Parent(s): 4f10f7b

Upload folder using huggingface_hub

Browse files
face_clip_model/config.json ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "architectures": [
4
+ "EvaCLIPModel"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_evaclip.EvaCLIPConfig",
8
+ "AutoModel": "modeling_evaclip.EvaCLIPModel"
9
+ },
10
+ "initializer_factor": 1.0,
11
+ "logit_scale_init_value": 2.6592,
12
+ "model_type": "clip",
13
+ "projection_dim": 768,
14
+ "text_config": {
15
+ "_attn_implementation_autoset": false,
16
+ "_name_or_path": "",
17
+ "add_cross_attention": false,
18
+ "architectures": null,
19
+ "attention_dropout": 0.0,
20
+ "bad_words_ids": null,
21
+ "begin_suppress_tokens": null,
22
+ "bos_token_id": 0,
23
+ "chunk_size_feed_forward": 0,
24
+ "cross_attention_hidden_size": null,
25
+ "decoder_start_token_id": null,
26
+ "diversity_penalty": 0.0,
27
+ "do_sample": false,
28
+ "dropout": 0.0,
29
+ "early_stopping": false,
30
+ "encoder_no_repeat_ngram_size": 0,
31
+ "eos_token_id": 2,
32
+ "exponential_decay_length_penalty": null,
33
+ "finetuning_task": null,
34
+ "forced_bos_token_id": null,
35
+ "forced_eos_token_id": null,
36
+ "hidden_act": "gelu",
37
+ "hidden_size": 768,
38
+ "id2label": {
39
+ "0": "LABEL_0",
40
+ "1": "LABEL_1"
41
+ },
42
+ "initializer_factor": 1.0,
43
+ "initializer_range": 0.02,
44
+ "intermediate_size": 3072,
45
+ "is_decoder": false,
46
+ "is_encoder_decoder": false,
47
+ "k_bias": true,
48
+ "label2id": {
49
+ "LABEL_0": 0,
50
+ "LABEL_1": 1
51
+ },
52
+ "layer_norm_eps": 1e-05,
53
+ "length_penalty": 1.0,
54
+ "max_length": 20,
55
+ "max_position_embeddings": 77,
56
+ "min_length": 0,
57
+ "model_type": "clip_text_model",
58
+ "no_repeat_ngram_size": 0,
59
+ "num_attention_heads": 12,
60
+ "num_beam_groups": 1,
61
+ "num_beams": 1,
62
+ "num_hidden_layers": 12,
63
+ "num_return_sequences": 1,
64
+ "output_attentions": false,
65
+ "output_hidden_states": false,
66
+ "output_scores": false,
67
+ "pad_token_id": 1,
68
+ "post_layernorm": false,
69
+ "prefix": null,
70
+ "problem_type": null,
71
+ "projection_dim": 768,
72
+ "pruned_heads": {},
73
+ "q_bias": true,
74
+ "remove_invalid_values": false,
75
+ "repetition_penalty": 1.0,
76
+ "return_dict": true,
77
+ "return_dict_in_generate": false,
78
+ "rope_theta": 10000,
79
+ "sep_token_id": null,
80
+ "suppress_tokens": null,
81
+ "task_specific_params": null,
82
+ "temperature": 1.0,
83
+ "tf_legacy_loss": false,
84
+ "tie_encoder_decoder": false,
85
+ "tie_word_embeddings": true,
86
+ "tokenizer_class": null,
87
+ "top_k": 50,
88
+ "top_p": 1.0,
89
+ "torch_dtype": null,
90
+ "torchscript": false,
91
+ "transformers_version": "4.50.0",
92
+ "typical_p": 1.0,
93
+ "use_bfloat16": true,
94
+ "use_rms_norm": false,
95
+ "use_rope": false,
96
+ "use_sub_ln": false,
97
+ "use_swiglu_mlp": false,
98
+ "v_bias": true,
99
+ "vocab_size": 49408
100
+ },
101
+ "torch_dtype": "float32",
102
+ "transformers_version": "4.28.1",
103
+ "vision_config": {
104
+ "_attn_implementation_autoset": false,
105
+ "_name_or_path": "",
106
+ "add_cross_attention": false,
107
+ "architectures": null,
108
+ "attention_dropout": 0.0,
109
+ "bad_words_ids": null,
110
+ "begin_suppress_tokens": null,
111
+ "bos_token_id": null,
112
+ "chunk_size_feed_forward": 0,
113
+ "cross_attention_hidden_size": null,
114
+ "decoder_start_token_id": null,
115
+ "diversity_penalty": 0.0,
116
+ "do_sample": false,
117
+ "dropout": 0.0,
118
+ "early_stopping": false,
119
+ "encoder_no_repeat_ngram_size": 0,
120
+ "eos_token_id": null,
121
+ "exponential_decay_length_penalty": null,
122
+ "finetuning_task": null,
123
+ "forced_bos_token_id": null,
124
+ "forced_eos_token_id": null,
125
+ "hidden_act": "gelu",
126
+ "hidden_size": 1024,
127
+ "id2label": {
128
+ "0": "LABEL_0",
129
+ "1": "LABEL_1"
130
+ },
131
+ "image_size": 336,
132
+ "initializer_factor": 1.0,
133
+ "initializer_range": 0.02,
134
+ "intermediate_size": 2730,
135
+ "is_decoder": false,
136
+ "is_encoder_decoder": false,
137
+ "k_bias": false,
138
+ "label2id": {
139
+ "LABEL_0": 0,
140
+ "LABEL_1": 1
141
+ },
142
+ "layer_norm_eps": 1e-06,
143
+ "length_penalty": 1.0,
144
+ "max_length": 20,
145
+ "min_length": 0,
146
+ "model_type": "clip_vision_model",
147
+ "no_repeat_ngram_size": 0,
148
+ "num_attention_heads": 16,
149
+ "num_beam_groups": 1,
150
+ "num_beams": 1,
151
+ "num_channels": 3,
152
+ "num_hidden_layers": 24,
153
+ "num_return_sequences": 1,
154
+ "output_attentions": false,
155
+ "output_hidden_states": false,
156
+ "output_scores": false,
157
+ "pad_token_id": null,
158
+ "patch_size": 14,
159
+ "post_layernorm": false,
160
+ "prefix": null,
161
+ "pretrained_seq_len": 16,
162
+ "problem_type": null,
163
+ "projection_dim": 768,
164
+ "pruned_heads": {},
165
+ "q_bias": true,
166
+ "remove_invalid_values": false,
167
+ "repetition_penalty": 1.0,
168
+ "return_dict": true,
169
+ "return_dict_in_generate": false,
170
+ "rope_theta": 10000,
171
+ "sep_token_id": null,
172
+ "suppress_tokens": null,
173
+ "task_specific_params": null,
174
+ "temperature": 1.0,
175
+ "tf_legacy_loss": false,
176
+ "tie_encoder_decoder": false,
177
+ "tie_word_embeddings": true,
178
+ "tokenizer_class": null,
179
+ "top_k": 50,
180
+ "top_p": 1.0,
181
+ "torch_dtype": null,
182
+ "torchscript": false,
183
+ "transformers_version": "4.50.0",
184
+ "typical_p": 1.0,
185
+ "use_bfloat16": true,
186
+ "use_rms_norm": false,
187
+ "use_rope": true,
188
+ "use_sub_ln": true,
189
+ "use_swiglu_mlp": true,
190
+ "v_bias": true
191
+ }
192
+ }
face_clip_model/configuration_evaclip.py ADDED
@@ -0,0 +1,429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ """ EvaCLIP model configuration"""
3
+ # Code mainly copied here: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/configuration_clip.py
4
+ # and adjusted for evaclip
5
+
6
+ import copy
7
+ import os
8
+ from collections import OrderedDict
9
+ from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
10
+
11
+
12
+ if TYPE_CHECKING:
13
+ from transformers.processing_utils import ProcessorMixin
14
+ from transformers.utils import TensorType
15
+
16
+ from transformers.configuration_utils import PretrainedConfig
17
+ from transformers.utils import logging
18
+
19
+
20
+ logger = logging.get_logger(__name__)
21
+
22
+
23
+ class EvaCLIPTextConfig(PretrainedConfig):
24
+ r"""
25
+ This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
26
+ text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
27
+ with the defaults will yield a similar configuration to that of the text encoder of the CLIP
28
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
29
+
30
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
31
+ documentation from [`PretrainedConfig`] for more information.
32
+
33
+ Args:
34
+ vocab_size (`int`, *optional*, defaults to 49408):
35
+ Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
36
+ the `inputs_ids` passed when calling [`CLIPModel`].
37
+ hidden_size (`int`, *optional*, defaults to 512):
38
+ Dimensionality of the encoder layers and the pooler layer.
39
+ intermediate_size (`int`, *optional*, defaults to 2048):
40
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
41
+ num_hidden_layers (`int`, *optional*, defaults to 12):
42
+ Number of hidden layers in the Transformer encoder.
43
+ num_attention_heads (`int`, *optional*, defaults to 8):
44
+ Number of attention heads for each attention layer in the Transformer encoder.
45
+ max_position_embeddings (`int`, *optional*, defaults to 77):
46
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
47
+ just in case (e.g., 512 or 1024 or 2048).
48
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
49
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
50
+ `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
51
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
52
+ The epsilon used by the layer normalization layers.
53
+ attention_dropout (`float`, *optional*, defaults to 0.0):
54
+ The dropout ratio for the attention probabilities.
55
+ initializer_range (`float`, *optional*, defaults to 0.02):
56
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
57
+ initializer_factor (`float`, *optional*, defaults to 1):
58
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
59
+ testing).
60
+
61
+ Example:
62
+
63
+ ```python
64
+ >>> from transformers import CLIPTextConfig, CLIPTextModel
65
+
66
+ >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
67
+ >>> configuration = CLIPTextConfig()
68
+
69
+ >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
70
+ >>> model = CLIPTextModel(configuration)
71
+
72
+ >>> # Accessing the model configuration
73
+ >>> configuration = model.config
74
+ ```"""
75
+ model_type = "clip_text_model"
76
+
77
+ def __init__(
78
+ self,
79
+ vocab_size=49408,
80
+ hidden_size=512,
81
+ intermediate_size=2048,
82
+ projection_dim=512,
83
+ num_hidden_layers=12,
84
+ num_attention_heads=8,
85
+ max_position_embeddings=77,
86
+ hidden_act="gelu",
87
+ layer_norm_eps=1e-5,
88
+ attention_dropout=0.0,
89
+ initializer_range=0.02,
90
+ initializer_factor=1.0,
91
+ q_bias=True,
92
+ k_bias=True,
93
+ v_bias=True,
94
+ post_layernorm=False,
95
+ pad_token_id=1,
96
+ bos_token_id=0,
97
+ eos_token_id=2,
98
+ rope_theta=10000,
99
+ use_rms_norm=False,
100
+ use_rope=False,
101
+ use_sub_ln=False,
102
+ use_swiglu_mlp=False,
103
+ **kwargs,
104
+ ):
105
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
106
+
107
+ self.vocab_size = vocab_size
108
+ self.hidden_size = hidden_size
109
+ self.intermediate_size = intermediate_size
110
+ self.projection_dim = projection_dim
111
+ self.num_hidden_layers = num_hidden_layers
112
+ self.num_attention_heads = num_attention_heads
113
+ self.max_position_embeddings = max_position_embeddings
114
+ self.layer_norm_eps = layer_norm_eps
115
+ self.hidden_act = hidden_act
116
+ self.initializer_range = initializer_range
117
+ self.initializer_factor = initializer_factor
118
+ self.q_bias=q_bias
119
+ self.k_bias=k_bias
120
+ self.v_bias=v_bias
121
+ self.post_layernorm = post_layernorm
122
+ self.attention_dropout = attention_dropout
123
+ self.rope_theta = rope_theta
124
+ self.use_rms_norm = use_rms_norm
125
+ self.use_rope = use_rope
126
+ self.use_sub_ln = use_sub_ln
127
+ self.use_swiglu_mlp=use_swiglu_mlp
128
+
129
+ @classmethod
130
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
131
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
132
+
133
+ # get the text config dict if we are loading from CLIPConfig
134
+ if config_dict.get("model_type") == "clip":
135
+ config_dict = config_dict["text_config"]
136
+
137
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
138
+ logger.warning(
139
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
140
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
141
+ )
142
+
143
+ return cls.from_dict(config_dict, **kwargs)
144
+
145
+
146
+ class EvaCLIPVisionConfig(PretrainedConfig):
147
+ r"""
148
+ This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
149
+ CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
150
+ configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
151
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
152
+
153
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
154
+ documentation from [`PretrainedConfig`] for more information.
155
+
156
+ Args:
157
+ hidden_size (`int`, *optional*, defaults to 768):
158
+ Dimensionality of the encoder layers and the pooler layer.
159
+ intermediate_size (`int`, *optional*, defaults to 3072):
160
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
161
+ num_hidden_layers (`int`, *optional*, defaults to 12):
162
+ Number of hidden layers in the Transformer encoder.
163
+ num_attention_heads (`int`, *optional*, defaults to 12):
164
+ Number of attention heads for each attention layer in the Transformer encoder.
165
+ image_size (`int`, *optional*, defaults to 224):
166
+ The size (resolution) of each image.
167
+ patch_size (`int`, *optional*, defaults to 32):
168
+ The size (resolution) of each patch.
169
+ hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
170
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
171
+ `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
172
+ layer_norm_eps (`float`, *optional*, defaults to 1e-5):
173
+ The epsilon used by the layer normalization layers.
174
+ attention_dropout (`float`, *optional*, defaults to 0.0):
175
+ The dropout ratio for the attention probabilities.
176
+ initializer_range (`float`, *optional*, defaults to 0.02):
177
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
178
+ initializer_factor (`float`, *optional*, defaults to 1):
179
+ A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
180
+ testing).
181
+
182
+ Example:
183
+
184
+ ```python
185
+ >>> from transformers import CLIPVisionConfig, CLIPVisionModel
186
+
187
+ >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
188
+ >>> configuration = CLIPVisionConfig()
189
+
190
+ >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
191
+ >>> model = CLIPVisionModel(configuration)
192
+
193
+ >>> # Accessing the model configuration
194
+ >>> configuration = model.config
195
+ ```"""
196
+
197
+ model_type = "clip_vision_model"
198
+
199
+ def __init__(
200
+ self,
201
+ hidden_size=768,
202
+ intermediate_size=3072,
203
+ projection_dim=512,
204
+ num_hidden_layers=12,
205
+ num_attention_heads=12,
206
+ num_channels=3,
207
+ image_size=224,
208
+ patch_size=32,
209
+ hidden_act="gelu",
210
+ layer_norm_eps=1e-5,
211
+ attention_dropout=0.0,
212
+ initializer_range=0.02,
213
+ initializer_factor=1.0,
214
+ q_bias=True,
215
+ k_bias=True,
216
+ v_bias=True,
217
+ post_layernorm=False,
218
+ pretrained_seq_len=16,
219
+ rope_theta=10000,
220
+ use_rms_norm=True,
221
+ use_rope=False,
222
+ use_sub_ln=False,
223
+ use_swiglu_mlp=False,
224
+ **kwargs,
225
+ ):
226
+ super().__init__(**kwargs)
227
+
228
+ self.hidden_size = hidden_size
229
+ self.intermediate_size = intermediate_size
230
+ self.projection_dim = projection_dim
231
+ self.num_hidden_layers = num_hidden_layers
232
+ self.num_attention_heads = num_attention_heads
233
+ self.num_channels = num_channels
234
+ self.patch_size = patch_size
235
+ self.image_size = image_size
236
+ self.initializer_range = initializer_range
237
+ self.initializer_factor = initializer_factor
238
+ self.q_bias=q_bias
239
+ self.k_bias=k_bias
240
+ self.v_bias=v_bias
241
+ self.post_layernorm = post_layernorm
242
+ self.pretrained_seq_len = pretrained_seq_len
243
+ self.attention_dropout = attention_dropout
244
+ self.layer_norm_eps = layer_norm_eps
245
+ self.hidden_act = hidden_act
246
+ self.rope_theta = rope_theta
247
+ self.use_rms_norm = use_rms_norm
248
+ self.use_rope = use_rope
249
+ self.use_sub_ln = use_sub_ln
250
+ self.use_swiglu_mlp=use_swiglu_mlp
251
+
252
+ @classmethod
253
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
254
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
255
+
256
+ # get the vision config dict if we are loading from CLIPConfig
257
+ if config_dict.get("model_type") == "clip":
258
+ config_dict = config_dict["vision_config"]
259
+
260
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
261
+ logger.warning(
262
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
263
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
264
+ )
265
+
266
+ return cls.from_dict(config_dict, **kwargs)
267
+
268
+
269
+ class EvaCLIPConfig(PretrainedConfig):
270
+ r"""
271
+ [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate
272
+ a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
273
+ a configuration with the defaults will yield a similar configuration to that of the CLIP
274
+ [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
275
+
276
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
277
+ documentation from [`PretrainedConfig`] for more information.
278
+
279
+ Args:
280
+ text_config (`dict`, *optional*):
281
+ Dictionary of configuration options used to initialize [`CLIPTextConfig`].
282
+ vision_config (`dict`, *optional*):
283
+ Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
284
+ projection_dim (`int`, *optional*, defaults to 512):
285
+ Dimentionality of text and vision projection layers.
286
+ logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
287
+ The inital value of the *logit_scale* paramter. Default is used as per the original CLIP implementation.
288
+ kwargs (*optional*):
289
+ Dictionary of keyword arguments.
290
+
291
+ Example:
292
+
293
+ ```python
294
+ >>> from transformers import CLIPConfig, CLIPModel
295
+
296
+ >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
297
+ >>> configuration = CLIPConfig()
298
+
299
+ >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
300
+ >>> model = CLIPModel(configuration)
301
+
302
+ >>> # Accessing the model configuration
303
+ >>> configuration = model.config
304
+
305
+ >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
306
+ >>> from transformers import CLIPTextConfig, CLIPVisionConfig
307
+
308
+ >>> # Initializing a CLIPText and CLIPVision configuration
309
+ >>> config_text = CLIPTextConfig()
310
+ >>> config_vision = CLIPVisionConfig()
311
+
312
+ >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
313
+ ```"""
314
+
315
+ model_type = "clip"
316
+ is_composition = True
317
+
318
+ def __init__(
319
+ self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
320
+ ):
321
+ # If `_config_dict` exist, we use them for the backward compatibility.
322
+ # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
323
+ # of confusion!).
324
+ text_config_dict = kwargs.pop("text_config_dict", None)
325
+ vision_config_dict = kwargs.pop("vision_config_dict", None)
326
+
327
+ super().__init__(**kwargs)
328
+
329
+ # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
330
+ # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
331
+ # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
332
+ if text_config_dict is not None:
333
+ if text_config is None:
334
+ text_config = {}
335
+
336
+ # This is the complete result when using `text_config_dict`.
337
+ _text_config_dict = EvaCLIPTextConfig(**text_config_dict).to_dict()
338
+
339
+ # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
340
+ for key, value in _text_config_dict.items():
341
+ if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
342
+ # If specified in `text_config_dict`
343
+ if key in text_config_dict:
344
+ message = (
345
+ f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
346
+ f'The value `text_config_dict["{key}"]` will be used instead.'
347
+ )
348
+ # If inferred from default argument values (just to be super careful)
349
+ else:
350
+ message = (
351
+ f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
352
+ f'value `text_config["{key}"]` will be overriden.'
353
+ )
354
+ logger.warning(message)
355
+
356
+ # Update all values in `text_config` with the ones in `_text_config_dict`.
357
+ text_config.update(_text_config_dict)
358
+
359
+ if vision_config_dict is not None:
360
+ if vision_config is None:
361
+ vision_config = {}
362
+
363
+ # This is the complete result when using `vision_config_dict`.
364
+ _vision_config_dict = EvaCLIPVisionConfig(**vision_config_dict).to_dict()
365
+ # convert keys to string instead of integer
366
+ if "id2label" in _vision_config_dict:
367
+ _vision_config_dict["id2label"] = {
368
+ str(key): value for key, value in _vision_config_dict["id2label"].items()
369
+ }
370
+
371
+ # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different.
372
+ for key, value in _vision_config_dict.items():
373
+ if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
374
+ # If specified in `vision_config_dict`
375
+ if key in vision_config_dict:
376
+ message = (
377
+ f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
378
+ f'values. The value `vision_config_dict["{key}"]` will be used instead.'
379
+ )
380
+ # If inferred from default argument values (just to be super careful)
381
+ else:
382
+ message = (
383
+ f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
384
+ f'The value `vision_config["{key}"]` will be overriden.'
385
+ )
386
+ logger.warning(message)
387
+
388
+ # Update all values in `vision_config` with the ones in `_vision_config_dict`.
389
+ vision_config.update(_vision_config_dict)
390
+
391
+ if text_config is None:
392
+ text_config = {}
393
+ logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")
394
+
395
+ if vision_config is None:
396
+ vision_config = {}
397
+ logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.")
398
+
399
+ self.text_config = EvaCLIPTextConfig(**text_config)
400
+ self.vision_config = EvaCLIPVisionConfig(**vision_config)
401
+
402
+ self.projection_dim = projection_dim
403
+ self.logit_scale_init_value = logit_scale_init_value
404
+ self.initializer_factor = 1.0
405
+
406
+ @classmethod
407
+ def from_text_vision_configs(cls, text_config: EvaCLIPTextConfig, vision_config: EvaCLIPVisionConfig, **kwargs):
408
+ r"""
409
+ Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
410
+ configuration.
411
+
412
+ Returns:
413
+ [`CLIPConfig`]: An instance of a configuration object
414
+ """
415
+
416
+ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
417
+
418
+ def to_dict(self):
419
+ """
420
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
421
+
422
+ Returns:
423
+ `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
424
+ """
425
+ output = copy.deepcopy(self.__dict__)
426
+ output["text_config"] = self.text_config.to_dict()
427
+ output["vision_config"] = self.vision_config.to_dict()
428
+ output["model_type"] = self.__class__.model_type
429
+ return output
face_clip_model/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
face_clip_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d8095b15c3a51b8170879aa11242d8788b0b4549d1f849e9edaa84d04b410edc
3
+ size 1712418940
face_clip_model/modeling_evaclip.py ADDED
@@ -0,0 +1,1061 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2025 The OpenAI Team Authors, The HuggingFace Team,
3
+ # The BAAI Team Authors and The Huawei Team Authors. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """PyTorch EvaCLIP model."""
17
+
18
+ from dataclasses import dataclass
19
+ from typing import Any, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.utils.checkpoint
23
+ from torch import nn
24
+
25
+ from transformers.activations import ACT2FN
26
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
27
+ from transformers.modeling_utils import PreTrainedModel
28
+ from transformers.utils import (
29
+ ModelOutput,
30
+ logging,
31
+ )
32
+ from .configuration_evaclip import EvaCLIPConfig, EvaCLIPTextConfig, EvaCLIPVisionConfig
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
+
37
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
38
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
39
+ """
40
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
41
+ """
42
+ bsz, src_len = mask.size()
43
+ tgt_len = tgt_len if tgt_len is not None else src_len
44
+
45
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
46
+
47
+ inverted_mask = 1.0 - expanded_mask
48
+
49
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
50
+
51
+
52
+ # contrastive loss function, adapted from
53
+ # https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
54
+ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
55
+ return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))
56
+
57
+
58
+ def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
59
+ caption_loss = contrastive_loss(similarity)
60
+ image_loss = contrastive_loss(similarity.t())
61
+ return (caption_loss + image_loss) / 2.0
62
+
63
+
64
+ @dataclass
65
+ class EvaCLIPVisionModelOutput(ModelOutput):
66
+ image_embeds: Optional[torch.FloatTensor] = None
67
+ last_hidden_state: torch.FloatTensor = None
68
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
69
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
70
+
71
+
72
+ @dataclass
73
+ class EvaCLIPTextModelOutput(ModelOutput):
74
+ text_embeds: Optional[torch.FloatTensor] = None
75
+ last_hidden_state: torch.FloatTensor = None
76
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
77
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
78
+
79
+
80
@dataclass
class EvaCLIPOutput(ModelOutput):
    """
    Combined CLIP output: optional contrastive loss, the two pairwise logit matrices,
    the projected embeddings, and the raw outputs of both sub-models.
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: torch.FloatTensor = None
    logits_per_text: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        """Convert to a plain tuple, recursively converting the two nested sub-model outputs."""
        nested_fields = ("text_model_output", "vision_model_output")
        return tuple(
            getattr(self, key).to_tuple() if key in nested_fields else self[key]
            for key in self.keys()
        )
95
+
96
+
97
class RMSNorm(nn.Module):
    """
    Root-mean-square layer norm, adapted from the transformers T5LayerNorm: rescales by the
    inverse RMS of the activations — no mean subtraction and no bias.
    See https://arxiv.org/abs/1910.07467.
    """

    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a T5-style norm with a learnable per-channel scale initialised to 1.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Accumulate the "variance" (mean of squares, no mean subtraction) in fp32 so that
        # half-precision inputs stay numerically stable.
        variance = hidden_states.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
        normed = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # Cast back down when the parameters live in fp16/bf16.
        if self.weight.dtype in (torch.float16, torch.bfloat16):
            normed = normed.to(self.weight.dtype)

        return self.weight * normed
123
+
124
+
125
class EvaCLIPAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        # EVA-CLIP lets each projection's bias be toggled independently via the config.
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.k_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.v_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.q_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)

        # Rotary position embedding for vision tokens; applied in forward() to all but the
        # leading token (assumed to be [CLS] — see the slicing there).
        if config.use_rope:
            self.rope = VisionRotaryEmbedding(config)
        else:
            self.rope = None

        # "Sub-LN": an extra normalisation of the attention output, applied before out_proj.
        if not config.use_sub_ln:
            self.inner_attn_ln = nn.Identity()
        elif config.use_rms_norm:
            self.inner_attn_ln = RMSNorm(self.embed_dim, eps=config.layer_norm_eps)
        else:
            self.inner_attn_ln = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        # [bsz, seq_len, embed_dim] -> [bsz, num_heads, seq_len, head_dim]
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""
        # NOTE(review): the annotation promises a 3-tuple but the function returns 2 values
        # (attn_output, attn_weights_reshaped) — callers in this file unpack 2.

        bsz, tgt_len, embed_dim = hidden_states.size()
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)

        query_states = self._shape(self.q_proj(hidden_states), -1, bsz)
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        # RoPE
        # Rotate only the patch tokens (index 1..); the first token is left untouched.
        if self.rope:
            query_states_cls, query_states_rest = query_states[:, :, :1], query_states[:, :, 1:]
            key_states_cls, key_states_rest = key_states[:, :, :1], key_states[:, :, 1:]

            query_states = torch.cat(
                [query_states_cls, self.rope(query_states_rest)], dim=-2
            ).type_as(value_states)
            key_states = torch.cat(
                [key_states_cls, self.rope(key_states_rest)], dim=-2
            ).type_as(value_states)

        # Fold heads into the batch dimension for batched matmul.
        query_states = query_states.view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        # Scale queries before the matmul (equivalent to scaling the logits).
        attn_weights = torch.bmm(query_states * self.scale, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # apply the causal_attention_mask first
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            # Additive mask: broadcast over heads, then flatten back for bmm.
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # this operation is a bit akward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        # NOTE(review): the error message prints (bsz, num_heads, ...) while the check uses
        # bsz * num_heads — message is slightly misleading but the check itself is right.
        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # Un-fold heads and merge them back into the embedding dimension.
        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        # Sub-LN (or identity) before the output projection.
        attn_output = self.inner_attn_ln(attn_output)
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped
251
+
252
+
253
class EvaCLIPVisionEmbeddings(nn.Module):
    """Patch projection + prepended [CLS] token + learned absolute position embeddings."""

    def __init__(self, config: EvaCLIPVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        # Learnable [CLS] token prepended to the patch sequence.
        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        # Non-overlapping patchify: stride == kernel == patch size.
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=True,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1  # +1 for the [CLS] slot
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        # TODO: This is a temporary limitations and we will figure out how to handle this in a more elegant way later
        height, width = pixel_values.shape[-2], pixel_values.shape[-1]
        if width != self.image_size or height != self.image_size:
            raise ValueError(
                f"Input pixel_values should have height and width ({self.image_size}, {self.image_size}),"
                f" but got {pixel_values.shape}."
            )

        # [B, C, H, W] -> [B, embed_dim, grid, grid] -> [B, grid*grid, embed_dim]
        patch_tokens = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2)

        cls_tokens = self.class_embedding.expand(pixel_values.shape[0], 1, -1)
        tokens = torch.cat([cls_tokens, patch_tokens], dim=1)
        return tokens + self.position_embedding(self.position_ids)
292
+
293
+
294
class VisionRotaryEmbedding(nn.Module):
    """
    2D rotary position embedding for vision tokens: precomputes cos/sin tables over the
    (row, col) patch grid — rescaled to `pretrained_seq_len` — and applies the standard
    rotate-half RoPE formulation in `forward`.
    """

    def __init__(self, config):
        super().__init__()
        grid = config.image_size // config.patch_size
        half_head_dim = config.hidden_size // config.num_attention_heads // 2

        # Grid coordinates rescaled to the resolution the rotary table was pretrained at.
        positions = torch.arange(grid) / grid * config.pretrained_seq_len
        inv_freq = 1.0 / (
            config.rope_theta ** (torch.arange(0, half_head_dim, 2)[: (half_head_dim // 2)].float() / half_head_dim)
        )
        angles = positions.unsqueeze(-1) * inv_freq.unsqueeze(0)
        angles = angles.repeat_interleave(2, dim=-1)  # [grid, half_head_dim]
        # Pair every row angle with every column angle -> one table entry per (row, col) patch.
        angles = torch.cat(
            [angles.unsqueeze(1).expand(-1, grid, -1), angles.unsqueeze(0).expand(grid, -1, -1)],
            dim=-1,
        )

        self.register_buffer("freqs_cos", angles.cos().view(-1, angles.shape[-1]), persistent=False)
        self.register_buffer("freqs_sin", angles.sin().view(-1, angles.shape[-1]), persistent=False)

    def rotate_half(self, x):
        """Rotate adjacent pairs (x1, x2) -> (-x2, x1) along the last dimension."""
        pairs = x.view(*x.shape[:-1], -1, 2)
        first, second = pairs.unbind(dim=-1)
        return torch.stack((-second, first), dim=-1).flatten(start_dim=-2)

    def forward(self, x):
        # Classic RoPE application: x * cos + rotate_half(x) * sin.
        return x * self.freqs_cos + self.rotate_half(x) * self.freqs_sin
322
+
323
+
324
class EvaCLIPTextEmbeddings(nn.Module):
    """Token embeddings plus learned absolute position embeddings for the text tower."""

    def __init__(self, config: EvaCLIPTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        """Sum token embeddings (or caller-provided `inputs_embeds`) with position embeddings."""
        if input_ids is not None:
            seq_length = input_ids.shape[-1]
        else:
            seq_length = inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        return inputs_embeds + self.position_embedding(position_ids)
353
+
354
+
355
class EvaCLIPMLP(nn.Module):
    """Standard transformer MLP: fc1 -> activation -> (optional sub-LN) -> fc2."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
        # Consistency fix: build ffn_ln exactly like EvaCLIPSwiGLUMLP/EvaCLIPAttention do —
        # honour config.layer_norm_eps and config.use_rms_norm instead of a bare nn.LayerNorm.
        # Backward-compatible: state dict keys are unchanged, and for configs with
        # layer_norm_eps == 1e-5 (the nn.LayerNorm default) behaviour is identical.
        if not config.use_sub_ln:
            self.ffn_ln = nn.Identity()
        elif config.use_rms_norm:
            self.ffn_ln = RMSNorm(config.intermediate_size, eps=config.layer_norm_eps)
        else:
            self.ffn_ln = nn.LayerNorm(config.intermediate_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Up-project, activate, optionally normalise ("sub-LN"), then down-project."""
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.ffn_ln(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states
370
+
371
+
372
class EvaCLIPSwiGLUMLP(nn.Module):
    """SwiGLU MLP: SiLU(fc1(x)) * fc2(x) -> (optional sub-LN) -> fc3."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = nn.SiLU()
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc3 = nn.Linear(config.intermediate_size, config.hidden_size)

        # Optional "sub layer norm" before the down-projection; RMSNorm vs LayerNorm per config.
        if not config.use_sub_ln:
            self.ffn_ln = nn.Identity()
        elif config.use_rms_norm:
            self.ffn_ln = RMSNorm(config.intermediate_size, eps=config.layer_norm_eps)
        else:
            self.ffn_ln = nn.LayerNorm(config.intermediate_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Gated feed-forward: elementwise product of the activated gate and the value branch."""
        gate = self.activation_fn(self.fc1(hidden_states))
        value = self.fc2(hidden_states)
        return self.fc3(self.ffn_ln(gate * value))
395
+
396
+
397
class EvaCLIPEncoderLayer(nn.Module):
    """
    One transformer block (attention + MLP, each with a residual connection).

    `self.post_layernorm` selects norm placement: False (default) gives the usual pre-norm
    (normalise the sub-layer *input*), True gives post-norm (normalise the sub-layer
    *output* before it is added to the residual).
    """

    def __init__(self, config: EvaCLIPConfig):
        super().__init__()
        # RMSNorm or LayerNorm for both norms, per config.
        norm_layer = RMSNorm if config.use_rms_norm else nn.LayerNorm

        self.config = config
        self.embed_dim = config.hidden_size
        # Treat a missing/None config value as pre-norm.
        self.post_layernorm = config.post_layernorm if config.post_layernorm is not None else False
        self.self_attn = EvaCLIPAttention(config)
        self.layer_norm1 = norm_layer(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = EvaCLIPSwiGLUMLP(config) if config.use_swiglu_mlp else EvaCLIPMLP(config)
        self.layer_norm2 = norm_layer(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Returns `(hidden_states,)`, plus `(attn_weights,)` when `output_attentions=True`.
        Both masks are additive masks of shape `[bsz, 1, tgt_len, src_len]` (or None).
        """
        residual = hidden_states

        # --- attention sub-layer: norm either before (pre-norm) or after (post-norm) ---
        if not self.post_layernorm:
            hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        if self.post_layernorm:
            hidden_states = self.layer_norm1(hidden_states)
        hidden_states = residual + hidden_states
        # --- MLP sub-layer, same norm-placement scheme ---
        residual = hidden_states
        if not self.post_layernorm:
            hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        if self.post_layernorm:
            hidden_states = self.layer_norm2(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs
444
+
445
+
446
class EvaCLIPEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`EvaCLIPEncoderLayer`].

    Args:
        config: EvaCLIPConfig
    """

    def __init__(self, config: EvaCLIPConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([EvaCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        # Toggled externally (see _set_gradient_checkpointing / gradient_checkpointing_enable).
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        """
        Run all encoder layers over `inputs_embeds`, optionally collecting per-layer hidden
        states and attention weights. `None` flags fall back to the config defaults.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            # Hidden states are recorded *before* each layer (the input to layer idx)...
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                # Recompute activations in backward to save memory; output_attentions is
                # closed over because checkpoint() only forwards tensor arguments cleanly.
                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(encoder_layer),
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # ...and once more after the loop, so encoder_states has num_layers + 1 entries.
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )
518
+
519
+
520
class EvaCLIPPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = EvaCLIPConfig
    base_model_prefix = "clip"
    supports_gradient_checkpointing = True
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def _init_weights(self, module):
        """Initialize the weights"""
        # Scheme mirrors transformers' CLIPPreTrainedModel: normal init with std scaled by
        # initializer_factor, depth-aware std for attention/MLP projections, then a final
        # generic pass zeroing biases and resetting LayerNorms.
        factor = self.config.initializer_factor
        if isinstance(module, EvaCLIPTextEmbeddings):
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, EvaCLIPVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, EvaCLIPAttention):
            factor = self.config.initializer_factor
            # std shrinks with depth (2 * num_hidden_layers) for the in-projections.
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, EvaCLIPMLP):
            # NOTE(review): only EvaCLIPMLP is matched here; EvaCLIPSwiGLUMLP falls through
            # to the generic pass below — confirm this is intentional.
            factor = self.config.initializer_factor
            in_proj_std = (
                (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            )
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, EvaCLIPModel):
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, EvaCLIPVisionModelWithProjection):
            nn.init.normal_(
                module.visual_projection.weight,
                std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, EvaCLIPTextModelWithProjection):
            nn.init.normal_(
                module.text_projection.weight,
                std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
            )

        # Generic pass: applies to every module, including those matched above.
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        # Only the encoder supports checkpointing; embeddings/projections are cheap.
        if isinstance(module, EvaCLIPEncoder):
            module.gradient_checkpointing = value
587
+
588
+
589
class EvaCLIPVisionTransformer(nn.Module):
    """Vision tower: patch embeddings -> transformer encoder -> final norm on the pooled [CLS] token."""

    def __init__(self, config: EvaCLIPVisionConfig):
        super().__init__()
        self.config = config

        self.embeddings = EvaCLIPVisionEmbeddings(config)
        self.encoder = EvaCLIPEncoder(config)
        # Final norm flavour follows the config, matching the rest of the tower.
        if config.use_rms_norm:
            self.post_layernorm = RMSNorm(config.hidden_size, eps=config.layer_norm_eps)
        else:
            self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def gradient_checkpointing_enable(self):
        self.encoder.gradient_checkpointing = True

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """Encode images; `None` flags fall back to the config defaults. Raises if `pixel_values` is None."""
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        encoder_outputs = self.encoder(
            inputs_embeds=self.embeddings(pixel_values),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        # Pool the [CLS] token (index 0); the post-norm is applied to the pooled vector
        # only, not to last_hidden_state.
        pooled_output = self.post_layernorm(last_hidden_state[:, 0, :])

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
639
+
640
+
641
class EvaCLIPTextTransformer(nn.Module):
    """
    Text tower: token/position embeddings -> causally-masked transformer encoder ->
    final norm -> pooling at the end-of-text token position.
    """

    def __init__(self, config: EvaCLIPTextConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        norm_layer = RMSNorm if config.use_rms_norm else nn.LayerNorm
        self.embeddings = EvaCLIPTextEmbeddings(config)
        self.encoder = EvaCLIPEncoder(config)
        self.final_layer_norm = norm_layer(embed_dim, eps=config.layer_norm_eps)

    def gradient_checkpointing_enable(self):
        # Forwarded to the encoder, which checks this flag per layer.
        self.encoder.gradient_checkpointing = True

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """
        Encode token ids; raises if `input_ids` is None. Pooled output is the hidden state
        at the position of each sequence's highest token id (assumed to be the EOT token).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        # Flatten any leading batch dims to [bsz, seq_len].
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        bsz, seq_len = input_shape
        # CLIP's text model uses causal mask, prepare it here.
        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
        causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
            hidden_states.device
        )
        # expand attention_mask
        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _expand_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        # text_embeds.shape = [batch_size, sequence_length, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
        ]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def _build_causal_attention_mask(self, bsz, seq_len, dtype):
        # lazily create causal attention mask, with full attention between the vision tokens
        # pytorch uses additive attention mask; fill with -inf
        # Result: [bsz, 1, seq_len, seq_len]; strictly-upper triangle holds dtype-min,
        # lower triangle and diagonal hold 0 (attend).
        mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
        mask.fill_(torch.tensor(torch.finfo(dtype).min))
        mask.triu_(1)  # zero out the lower diagonal
        mask = mask.unsqueeze(1)  # expand mask
        return mask
726
+
727
+
728
class EvaCLIPVisionModel(EvaCLIPPreTrainedModel):
    """Standalone vision tower wrapped as a `PreTrainedModel` (no projection head)."""

    config_class = EvaCLIPVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: EvaCLIPVisionConfig):
        super().__init__(config)
        self.vision_model = EvaCLIPVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """Delegate straight to the inner `EvaCLIPVisionTransformer`."""
        if return_dict is None:
            return_dict = self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
758
+
759
+
760
class EvaCLIPTextModel(EvaCLIPPreTrainedModel):
    """Standalone text tower wrapped as a `PreTrainedModel` (no projection head)."""

    config_class = EvaCLIPTextConfig

    _no_split_modules = ["EvaCLIPEncoderLayer"]

    def __init__(self, config: EvaCLIPTextConfig):
        super().__init__(config)
        self.text_model = EvaCLIPTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """Delegate straight to the inner `EvaCLIPTextTransformer`."""
        if return_dict is None:
            return_dict = self.config.use_return_dict

        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
796
+
797
+
798
+ class EvaCLIPModel(EvaCLIPPreTrainedModel):
799
+ config_class = EvaCLIPConfig
800
+
801
+ def __init__(self, config: EvaCLIPConfig):
802
+ super().__init__(config)
803
+
804
+ if not (type(config.text_config).__name__ == "EvaCLIPTextConfig"):
805
+ raise ValueError(
806
+ "config.text_config is expected to be of type EvaCLIPTextConfig but is of type"
807
+ f" {type(config.text_config)}."
808
+ )
809
+
810
+ if not (type(config.vision_config).__name__ == "EvaCLIPVisionConfig"):
811
+ raise ValueError(
812
+ "config.vision_config is expected to be of type EvaCLIPVisionConfig but is of type"
813
+ f" {type(config.vision_config)}."
814
+ )
815
+
816
+ text_config = config.text_config
817
+ vision_config = config.vision_config
818
+
819
+ self.projection_dim = config.projection_dim
820
+ self.text_embed_dim = text_config.hidden_size
821
+ self.vision_embed_dim = vision_config.hidden_size
822
+
823
+ self.text_model = EvaCLIPTextTransformer(text_config)
824
+ self.vision_model = EvaCLIPVisionTransformer(vision_config)
825
+
826
+ self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim)
827
+ self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
828
+ self.logit_scale = nn.Parameter(torch.tensor(config.logit_scale_init_value))
829
+
830
+ # Initialize weights and apply final processing
831
+ self.post_init()
832
+
833
+ def encode_text(
834
+ self,
835
+ input_ids: Optional[torch.Tensor] = None,
836
+ attention_mask: Optional[torch.Tensor] = None,
837
+ position_ids: Optional[torch.Tensor] = None,
838
+ output_attentions: Optional[bool] = None,
839
+ output_hidden_states: Optional[bool] = None,
840
+ return_dict: Optional[bool] = None,
841
+ ) -> torch.FloatTensor:
842
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
843
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
844
+ output_hidden_states = (
845
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
846
+ )
847
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
848
+
849
+ text_outputs = self.text_model(
850
+ input_ids=input_ids,
851
+ attention_mask=attention_mask,
852
+ position_ids=position_ids,
853
+ output_attentions=output_attentions,
854
+ output_hidden_states=output_hidden_states,
855
+ return_dict=return_dict,
856
+ )
857
+
858
+ pooled_output = text_outputs[1]
859
+ text_features = self.text_projection(pooled_output)
860
+
861
+ return text_features
862
+
863
+ def encode_image(
864
+ self,
865
+ pixel_values: Optional[torch.FloatTensor] = None,
866
+ output_attentions: Optional[bool] = None,
867
+ output_hidden_states: Optional[bool] = None,
868
+ return_dict: Optional[bool] = None,
869
+ ) -> torch.FloatTensor:
870
+
871
+ # Use EvaCLIP model's config for some fields (if specified) instead of those of vision & text components.
872
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
873
+ output_hidden_states = (
874
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
875
+ )
876
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
877
+
878
+ vision_outputs = self.vision_model(
879
+ pixel_values=pixel_values,
880
+ output_attentions=output_attentions,
881
+ output_hidden_states=output_hidden_states,
882
+ return_dict=return_dict,
883
+ )
884
+
885
+ pooled_output = vision_outputs[1] # pooled_output
886
+ image_features = self.visual_projection(pooled_output)
887
+
888
+ return image_features
889
+
890
+ def forward(
891
+ self,
892
+ input_ids: Optional[torch.LongTensor] = None,
893
+ pixel_values: Optional[torch.FloatTensor] = None,
894
+ attention_mask: Optional[torch.Tensor] = None,
895
+ position_ids: Optional[torch.LongTensor] = None,
896
+ return_loss: Optional[bool] = None,
897
+ output_attentions: Optional[bool] = None,
898
+ output_hidden_states: Optional[bool] = None,
899
+ return_dict: Optional[bool] = None,
900
+ ) -> Union[Tuple, EvaCLIPOutput]:
901
+ # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
902
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
903
+ output_hidden_states = (
904
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
905
+ )
906
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
907
+
908
+ vision_outputs = self.vision_model(
909
+ pixel_values=pixel_values,
910
+ output_attentions=output_attentions,
911
+ output_hidden_states=output_hidden_states,
912
+ return_dict=return_dict,
913
+ )
914
+
915
+ text_outputs = self.text_model(
916
+ input_ids=input_ids,
917
+ attention_mask=attention_mask,
918
+ position_ids=position_ids,
919
+ output_attentions=output_attentions,
920
+ output_hidden_states=output_hidden_states,
921
+ return_dict=return_dict,
922
+ )
923
+
924
+ image_embeds = vision_outputs[1]
925
+ image_embeds = self.visual_projection(image_embeds)
926
+
927
+ text_embeds = text_outputs[1]
928
+ text_embeds = self.text_projection(text_embeds)
929
+
930
+ # normalized features
931
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
932
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
933
+
934
+ # cosine similarity as logits
935
+ logit_scale = self.logit_scale.exp()
936
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
937
+ logits_per_image = logits_per_text.t()
938
+
939
+ loss = None
940
+ if return_loss:
941
+ loss = clip_loss(logits_per_text)
942
+
943
+ if not return_dict:
944
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
945
+ return ((loss,) + output) if loss is not None else output
946
+
947
+ return EvaCLIPOutput(
948
+ loss=loss,
949
+ logits_per_image=logits_per_image,
950
+ logits_per_text=logits_per_text,
951
+ text_embeds=text_embeds,
952
+ image_embeds=image_embeds,
953
+ text_model_output=text_outputs,
954
+ vision_model_output=vision_outputs,
955
+ )
956
+
957
+
958
+ class EvaCLIPVisionModelWithProjection(EvaCLIPPreTrainedModel):
959
+ config_class = EvaCLIPVisionConfig
960
+ main_input_name = "pixel_values"
961
+
962
+ def __init__(self, config: EvaCLIPVisionConfig):
963
+ super().__init__(config)
964
+
965
+ vision_model = EvaCLIPVisionModel._from_config(config)
966
+ self.vision_model = vision_model.vision_model
967
+
968
+ self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim)
969
+
970
+ # Initialize weights and apply final processing
971
+ self.post_init()
972
+
973
+ def get_input_embeddings(self) -> nn.Module:
974
+ return self.vision_model.embeddings.patch_embedding
975
+
976
+ def forward(
977
+ self,
978
+ pixel_values: Optional[torch.FloatTensor] = None,
979
+ output_attentions: Optional[bool] = None,
980
+ output_hidden_states: Optional[bool] = None,
981
+ return_dict: Optional[bool] = None,
982
+ ) -> Union[Tuple, EvaCLIPVisionModelOutput]:
983
+
984
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
985
+
986
+ vision_outputs = self.vision_model(
987
+ pixel_values=pixel_values,
988
+ output_attentions=output_attentions,
989
+ output_hidden_states=output_hidden_states,
990
+ return_dict=return_dict,
991
+ )
992
+
993
+ pooled_output = vision_outputs[1] # pooled_output
994
+
995
+ image_embeds = self.visual_projection(pooled_output)
996
+
997
+ if not return_dict:
998
+ outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:]
999
+ return tuple(output for output in outputs if output is not None)
1000
+
1001
+ return EvaCLIPVisionModelOutput(
1002
+ image_embeds=image_embeds,
1003
+ last_hidden_state=vision_outputs.last_hidden_state,
1004
+ hidden_states=vision_outputs.hidden_states,
1005
+ attentions=vision_outputs.attentions,
1006
+ )
1007
+
1008
+
1009
+ class EvaCLIPTextModelWithProjection(EvaCLIPPreTrainedModel):
1010
+ config_class = EvaCLIPTextConfig
1011
+
1012
+ _no_split_modules = ["EvaCLIPEncoderLayer"]
1013
+
1014
+ def __init__(self, config: EvaCLIPTextConfig):
1015
+ super().__init__(config)
1016
+
1017
+ self.text_model = EvaCLIPTextTransformer(config)
1018
+
1019
+ self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
1020
+
1021
+ def get_input_embeddings(self) -> nn.Module:
1022
+ return self.text_model.embeddings.token_embedding
1023
+
1024
+ def set_input_embeddings(self, value):
1025
+ self.text_model.embeddings.token_embedding = value
1026
+
1027
+ def forward(
1028
+ self,
1029
+ input_ids: Optional[torch.Tensor] = None,
1030
+ attention_mask: Optional[torch.Tensor] = None,
1031
+ position_ids: Optional[torch.Tensor] = None,
1032
+ output_attentions: Optional[bool] = None,
1033
+ output_hidden_states: Optional[bool] = None,
1034
+ return_dict: Optional[bool] = None,
1035
+ ) -> Union[Tuple, EvaCLIPTextModelOutput]:
1036
+
1037
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1038
+
1039
+ text_outputs = self.text_model(
1040
+ input_ids=input_ids,
1041
+ attention_mask=attention_mask,
1042
+ position_ids=position_ids,
1043
+ output_attentions=output_attentions,
1044
+ output_hidden_states=output_hidden_states,
1045
+ return_dict=return_dict,
1046
+ )
1047
+
1048
+ pooled_output = text_outputs[1]
1049
+
1050
+ text_embeds = self.text_projection(pooled_output)
1051
+
1052
+ if not return_dict:
1053
+ outputs = (text_embeds, text_outputs[0]) + text_outputs[2:]
1054
+ return tuple(output for output in outputs if output is not None)
1055
+
1056
+ return EvaCLIPTextModelOutput(
1057
+ text_embeds=text_embeds,
1058
+ last_hidden_state=text_outputs.last_hidden_state,
1059
+ hidden_states=text_outputs.hidden_states,
1060
+ attentions=text_outputs.attentions,
1061
+ )
face_clip_model/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
face_clip_model/tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "__type": "AddedToken",
4
+ "content": "<|startoftext|>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eos_token": {
11
+ "__type": "AddedToken",
12
+ "content": "<|endoftext|>",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "errors": "replace",
19
+ "model_max_length": 1000000000000000019884624838656,
20
+ "pad_token": "<|endoftext|>",
21
+ "special_tokens_map_file": null,
22
+ "tokenizer_class": "CLIPTokenizer",
23
+ "unk_token": {
24
+ "__type": "AddedToken",
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": true,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
face_clip_model/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
face_helper_1/detection_Resnet50_Final.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4df91fe5a65250752e9c953810d87013afe6a3f94525a494dfb4402dc098d2e
3
+ size 109430110
face_helper_1/detection_mobilenet0.25_Final.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f44cb2531779098efdfe225e694662424aea5f27995768029fdf508052f0c395
3
+ size 1747774
face_helper_1/parsing_bisenet.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4a6e4d352dd5a36b797c7997a58a52132ebda5696b63fe5147ec92d6d324ec20
3
+ size 53263053
face_helper_1/parsing_parsenet.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae07646730ef4f3d5f2bfe5ebaaf979f3425ff7bd8ee68007afc9887a611b8fe
3
+ size 85293541
models/1k3d68.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df5c06b8a0c12e422b2ed8947b8869faa4105387f199c477af038aa01f9a45cc
3
+ size 143607619
models/2d106det.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f001b856447c413801ef5c42091ed0cd516fcd21f2d6b79635b1e733a7109dbf
3
+ size 5030888
models/genderage.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fde69b1c810857b88c64a335084f1c3fe8f01246c9a191b48c7bb756d6652fb
3
+ size 1322532
models/glintr100.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ab1d6435d639628a6f3e5008dd4f929edf4c4124b1a7169e1048f9fef534cdf
3
+ size 260665334
models/scrfd_10g_bnkps.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5838f7fe053675b1c7a08b633df49e7af5495cee0493c7dcf6697200b85b5b91
3
+ size 16923827