jordan0811 committed
Commit 4afec3a · verified · 1 Parent(s): e8121a6

Delete files m*.py with huggingface_hub

Files changed (2)
  1. modeling_minicpmv.py +0 -447
  2. modeling_navit_siglip.py +0 -937
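
The commit message says the files were removed with huggingface_hub. For context, here is a minimal sketch of how such a deletion is typically issued with that library's `HfApi`; the repo id below is a placeholder, not taken from this page:

```python
from huggingface_hub import HfApi, CommitOperationDelete

api = HfApi()
# Batch both deletions into a single commit; repo_id is hypothetical.
api.create_commit(
    repo_id="<user>/<model-repo>",
    operations=[
        CommitOperationDelete(path_in_repo="modeling_minicpmv.py"),
        CommitOperationDelete(path_in_repo="modeling_navit_siglip.py"),
    ],
    commit_message="Delete files m*.py with huggingface_hub",
)
```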
modeling_minicpmv.py DELETED
@@ -1,447 +0,0 @@
import math
from typing import List, Optional
import json
import torch
import torchvision

from threading import Thread
from copy import deepcopy
from PIL import Image
from transformers import AutoProcessor, TextIteratorStreamer

from .configuration_minicpm import MiniCPMVConfig
from transformers import LlamaForCausalLM, LlamaPreTrainedModel
from .modeling_navit_siglip import SiglipVisionTransformer
from .resampler import Resampler


class MiniCPMVPreTrainedModel(LlamaPreTrainedModel):
    config_class = MiniCPMVConfig


class MiniCPMV(MiniCPMVPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.llm = LlamaForCausalLM(config)
        self.vpm = self.init_vision_module()
        self.vision_dim = self.vpm.embed_dim
        self.embed_dim = self.llm.config.hidden_size
        self.resampler = self.init_resampler(self.embed_dim, self.vision_dim)
        self.processor = None

        self.terminators = ['<|im_end|>', '</s>']

    def init_vision_module(self):
        # same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit, with tgt_sizes added
        if self.config._attn_implementation == 'flash_attention_2':
            self.config.vision_config._attn_implementation = 'flash_attention_2'
        else:
            # sdpa is not supported
            self.config.vision_config._attn_implementation = 'eager'
        model = SiglipVisionTransformer(self.config.vision_config)
        if self.config.drop_vision_last_layer:
            model.encoder.layers = model.encoder.layers[:-1]

        setattr(model, 'embed_dim', model.embeddings.embed_dim)
        setattr(model, 'patch_size', model.embeddings.patch_size)

        return model

    def init_resampler(self, embed_dim, vision_dim):
        return Resampler(
            num_queries=self.config.query_num,
            embed_dim=embed_dim,
            num_heads=embed_dim // 128,
            kv_dim=vision_dim,
            adaptive=True
        )

    def get_input_embeddings(self):
        return self.llm.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.llm.embed_tokens = value

    def get_output_embeddings(self):
        return self.llm.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.llm.lm_head = new_embeddings

    def set_decoder(self, decoder):
        self.llm = decoder

    def get_decoder(self):
        return self.llm

    def get_vllm_embedding(self, data):
        if 'vision_hidden_states' not in data:
            dtype = self.llm.model.embed_tokens.weight.dtype
            device = self.llm.model.embed_tokens.weight.device
            tgt_sizes = data['tgt_sizes']
            pixel_values_list = data['pixel_values']
            vision_hidden_states = []
            all_pixel_values = []
            img_cnt = []
            for pixel_values in pixel_values_list:
                img_cnt.append(len(pixel_values))
                all_pixel_values.extend([i.flatten(end_dim=1).permute(1, 0) for i in pixel_values])

            # at least one image exists
            if all_pixel_values:
                tgt_sizes = [tgt_size for tgt_size in tgt_sizes if isinstance(tgt_size, torch.Tensor)]
                tgt_sizes = torch.vstack(tgt_sizes).type(torch.int32)

                max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1])

                all_pixel_values = torch.nn.utils.rnn.pad_sequence(all_pixel_values, batch_first=True,
                                                                   padding_value=0.0)
                B, L, _ = all_pixel_values.shape
                all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)

                patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool, device=device)
                for i in range(B):
                    patch_attn_mask[i, 0, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True

                vision_batch_size = self.config.vision_batch_size
                all_pixel_values = all_pixel_values.type(dtype).to(device=device)
                if B > vision_batch_size:
                    hs = []
                    for i in range(0, B, vision_batch_size):
                        start_idx = i
                        end_idx = i + vision_batch_size
                        tmp_hs = self.vpm(all_pixel_values[start_idx:end_idx], patch_attention_mask=patch_attn_mask[start_idx:end_idx], tgt_sizes=tgt_sizes[start_idx:end_idx]).last_hidden_state
                        hs.append(tmp_hs)
                    vision_embedding = torch.cat(hs, dim=0)
                else:
                    vision_embedding = self.vpm(all_pixel_values, patch_attention_mask=patch_attn_mask, tgt_sizes=tgt_sizes).last_hidden_state
                vision_embedding = self.resampler(vision_embedding, tgt_sizes)

                start = 0
                for pixel_values in pixel_values_list:
                    img_cnt = len(pixel_values)
                    if img_cnt > 0:
                        vision_hidden_states.append(vision_embedding[start: start + img_cnt])
                        start += img_cnt
                    else:
                        vision_hidden_states.append([])
            else:  # no image
                if self.training:
                    dummy_image = torch.zeros(
                        (1, 3, 224, 224),
                        device=device, dtype=dtype
                    )
                    tgt_sizes = torch.Tensor([[(224 // self.config.patch_size), math.ceil(224 / self.config.patch_size)]]).type(torch.int32)
                    dummy_feature = self.resampler(self.vpm(dummy_image).last_hidden_state, tgt_sizes)
                else:
                    dummy_feature = []
                for _ in range(len(pixel_values_list)):
                    vision_hidden_states.append(dummy_feature)

        else:
            vision_hidden_states = data['vision_hidden_states']

        if hasattr(self.llm.config, 'scale_emb'):
            vllm_embedding = self.llm.model.embed_tokens(data['input_ids']) * self.llm.config.scale_emb
        else:
            vllm_embedding = self.llm.model.embed_tokens(data['input_ids'])

        vision_hidden_states = [i.type(vllm_embedding.dtype) if isinstance(
            i, torch.Tensor) else i for i in vision_hidden_states]

        bs = len(data['input_ids'])
        device = vllm_embedding.device
        embed_dim = vllm_embedding.shape[-1]

        new_vllm_embeddings = []

        for i in range(bs):
            cur_vs_hs = vision_hidden_states[i]
            cur_vllm_emb = vllm_embedding[i]

            if len(cur_vs_hs) == 0:
                new_vllm_embeddings.append(cur_vllm_emb)
                continue

            cur_image_bound = data['image_bound'][i]

            if len(cur_image_bound) > 0:
                image_indices = torch.stack([
                    torch.arange(r[0], r[1], dtype=torch.long)
                    for r in cur_image_bound
                ], dim=0).flatten().to(device)

                indices_expanded = image_indices.view(-1, 1).expand(-1, embed_dim)
                vision_features = cur_vs_hs.view(-1, embed_dim)

                updated_emb = cur_vllm_emb.scatter(0, indices_expanded, vision_features)
                new_vllm_embeddings.append(updated_emb)
            elif self.training:
                dummy_term = cur_vs_hs[0].sum() * 0
                new_vllm_embeddings.append(cur_vllm_emb + dummy_term)
            else:
                new_vllm_embeddings.append(cur_vllm_emb)

        vllm_embedding = torch.stack(new_vllm_embeddings, dim=0)

        return vllm_embedding, vision_hidden_states

    def forward(self, data=None, **kwargs):
        if isinstance(data, torch.Tensor):
            attention_mask = torch.ones_like(data, dtype=torch.bool)
            kwargs = {'attention_mask': attention_mask}
            return self.llm(
                input_ids=data,
                **kwargs
            )

        if data is None:
            data = {
                "input_ids": kwargs.pop("input_ids", None),
                "pixel_values": kwargs.pop("pixel_values", None),
                "image_bound": kwargs.pop("image_bound", None),
                "tgt_sizes": kwargs.pop("tgt_sizes", None),
                "position_ids": kwargs.pop("position_ids", None),
            }
        else:
            kwargs.pop("input_ids", None)
            kwargs.pop("pixel_values", None)
            kwargs.pop("image_bound", None)
            kwargs.pop("tgt_sizes", None)
            kwargs.pop("position_ids", None)
            kwargs.pop("inputs_embeds", None)

        vllm_embedding, vision_hidden_states = self.get_vllm_embedding(data)
        position_ids = data["position_ids"]
        if position_ids.dtype != torch.int64:
            position_ids = position_ids.long()

        return self.llm(
            input_ids=None,
            position_ids=position_ids,
            inputs_embeds=vllm_embedding,
            **kwargs
        )

    def _decode(self, inputs_embeds, tokenizer, attention_mask, decode_text=False, **kwargs):
        terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
        output = self.llm.generate(
            inputs_embeds=inputs_embeds,
            pad_token_id=0,
            eos_token_id=terminators,
            attention_mask=attention_mask,
            **kwargs
        )
        if decode_text:
            return self._decode_text(output, tokenizer)
        return output

    def _decode_stream(self, inputs_embeds, tokenizer, **kwargs):
        terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
        streamer = TextIteratorStreamer(tokenizer=tokenizer)
        generation_kwargs = {
            'inputs_embeds': inputs_embeds,
            'pad_token_id': 0,
            'eos_token_id': terminators,
            'streamer': streamer
        }
        generation_kwargs.update(kwargs)

        thread = Thread(target=self.llm.generate, kwargs=generation_kwargs)
        thread.start()

        return streamer

    def _decode_text(self, result_ids, tokenizer):
        terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
        result_text = []
        for result in result_ids:
            result = result[result != 0]
            if result[0] == tokenizer.bos_id:
                result = result[1:]
            if result[-1] in terminators:
                result = result[:-1]
            result_text.append(tokenizer.decode(result).strip())
        return result_text

    def generate(
        self,
        input_ids=None,
        pixel_values=None,
        tgt_sizes=None,
        image_bound=None,
        attention_mask=None,
        tokenizer=None,
        vision_hidden_states=None,
        return_vision_hidden_states=False,
        stream=False,
        decode_text=False,
        **kwargs
    ):
        assert input_ids is not None
        assert len(input_ids) == len(pixel_values)

        model_inputs = {
            "input_ids": input_ids,
            "image_bound": image_bound,
        }

        if vision_hidden_states is None:
            model_inputs["pixel_values"] = pixel_values
            model_inputs['tgt_sizes'] = tgt_sizes
        else:
            model_inputs["vision_hidden_states"] = vision_hidden_states

        with torch.inference_mode():
            (
                model_inputs["inputs_embeds"],
                vision_hidden_states,
            ) = self.get_vllm_embedding(model_inputs)

            if stream:
                result = self._decode_stream(model_inputs["inputs_embeds"], tokenizer, **kwargs)
            else:
                result = self._decode(model_inputs["inputs_embeds"], tokenizer, attention_mask, decode_text=decode_text, **kwargs)

        if return_vision_hidden_states:
            return result, vision_hidden_states

        return result

    def chat(
        self,
        image=None,
        msgs=None,
        tokenizer=None,
        processor=None,
        vision_hidden_states=None,
        max_new_tokens=2048,
        min_new_tokens=0,
        sampling=True,
        max_inp_length=32768,
        system_prompt='',
        stream=False,
        max_slice_nums=None,
        use_image_id=None,
        **kwargs
    ):
        if isinstance(msgs[0], list):
            batched = True
        else:
            batched = False
        msgs_list = msgs
        images_list = image

        if batched is False:
            images_list, msgs_list = [images_list], [msgs_list]
        else:
            assert images_list is None, "Please integrate image to msgs when using batch inference."
            images_list = [None] * len(msgs_list)
        assert len(images_list) == len(msgs_list), "The batch dim of images_list and msgs_list should be the same."

        if processor is None:
            if self.processor is None:
                self.processor = AutoProcessor.from_pretrained(self.config._name_or_path, trust_remote_code=True)
            processor = self.processor

        assert self.config.query_num == processor.image_processor.image_feature_size, "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
        assert self.config.patch_size == processor.image_processor.patch_size, "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
        assert self.config.use_image_id == processor.image_processor.use_image_id, "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
        assert self.config.slice_config.max_slice_nums == processor.image_processor.max_slice_nums, "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
        assert self.config.slice_mode == processor.image_processor.slice_mode, "These two values should be the same. Check `config.json` and `preprocessor_config.json`."

        prompts_lists = []
        input_images_lists = []
        for image, msgs in zip(images_list, msgs_list):
            if isinstance(msgs, str):
                msgs = json.loads(msgs)
            copy_msgs = deepcopy(msgs)

            assert len(msgs) > 0, "msgs is empty"
            assert sampling or not stream, "if use stream mode, make sure sampling=True"

            if image is not None and isinstance(copy_msgs[0]["content"], str):
                copy_msgs[0]["content"] = [image, copy_msgs[0]["content"]]

            images = []
            for i, msg in enumerate(copy_msgs):
                role = msg["role"]
                content = msg["content"]
                assert role in ["user", "assistant"]
                if i == 0:
                    assert role == "user", "The role of first msg should be user"
                if isinstance(content, str):
                    content = [content]
                cur_msgs = []
                for c in content:
                    if isinstance(c, Image.Image):
                        images.append(c)
                        cur_msgs.append("(<image>./</image>)")
                    elif isinstance(c, str):
                        cur_msgs.append(c)
                msg["content"] = "\n".join(cur_msgs)

            if system_prompt:
                sys_msg = {'role': 'system', 'content': system_prompt}
                copy_msgs = [sys_msg] + copy_msgs

            prompts_lists.append(processor.tokenizer.apply_chat_template(copy_msgs, tokenize=False, add_generation_prompt=True))
            input_images_lists.append(images)

        inputs = processor(
            prompts_lists,
            input_images_lists,
            max_slice_nums=max_slice_nums,
            use_image_id=use_image_id,
            return_tensors="pt",
            max_length=max_inp_length
        ).to(self.device)

        if sampling:
            generation_config = {
                "top_p": 0.8,
                "top_k": 100,
                "temperature": 0.7,
                "do_sample": True,
                "repetition_penalty": 1.05
            }
        else:
            generation_config = {
                "num_beams": 3,
                "repetition_penalty": 1.2,
            }

        if min_new_tokens > 0:
            generation_config['min_new_tokens'] = min_new_tokens

        generation_config.update(
            (k, kwargs[k]) for k in generation_config.keys() & kwargs.keys()
        )

        inputs.pop("image_sizes")
        with torch.inference_mode():
            res = self.generate(
                **inputs,
                tokenizer=tokenizer,
                max_new_tokens=max_new_tokens,
                vision_hidden_states=vision_hidden_states,
                stream=stream,
                decode_text=True,
                **generation_config
            )

        if stream:
            def stream_gen():
                for text in res:
                    for term in self.terminators:
                        text = text.replace(term, '')
                    yield text
            return stream_gen()

        else:
            if batched:
                answer = res
            else:
                answer = res[0]
            return answer
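
For reference, the `chat` entry point defined above is normally driven through `AutoModel` with `trust_remote_code=True`. A minimal usage sketch, assuming a checkpoint that ships this custom code; the repo id, image path, and question below are placeholders:

```python
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

model_id = "<user>/<minicpmv-repo>"  # placeholder; any repo shipping this code
model = AutoModel.from_pretrained(model_id, trust_remote_code=True,
                                  torch_dtype=torch.float16).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

image = Image.open("example.jpg").convert("RGB")
msgs = [{"role": "user", "content": "What is in this image?"}]

# chat() wraps preprocessing, get_vllm_embedding, and generate() as defined above.
answer = model.chat(image=image, msgs=msgs, tokenizer=tokenizer)
print(answer)
```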
modeling_navit_siglip.py DELETED
@@ -1,937 +0,0 @@
# coding=utf-8
# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Siglip model."""
# Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit, with tgt_sizes added


import os
import math
import warnings
from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn.init import _calculate_fan_in_and_fan_out

from transformers.activations import ACT2FN
from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    is_flash_attn_2_available,
    logging,
    replace_return_docstrings,
)

logger = logging.get_logger(__name__)

class SiglipVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
    Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
    Example:
    ```python
    >>> from transformers import SiglipVisionConfig, SiglipVisionModel

    >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
    >>> configuration = SiglipVisionConfig()

    >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
    >>> model = SiglipVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "siglip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=16,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # get the vision config dict if we are loading from SiglipConfig
        if config_dict.get("model_type") == "siglip":
            config_dict = config_dict["vision_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)


_CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"

SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "google/siglip-base-patch16-224",
    # See all SigLIP models at https://huggingface.co/models?filter=siglip
]

if is_flash_attn_2_available():
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa


# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


def _trunc_normal_(tensor, mean, std, a, b):
    # Cut & paste from PyTorch official master until it's in a few official releases - RW
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    l = norm_cdf((a - mean) / std)
    u = norm_cdf((b - mean) / std)

    # Uniformly fill tensor with values from [l, u], then translate to
    # [2l-1, 2u-1].
    tensor.uniform_(2 * l - 1, 2 * u - 1)

    # Use inverse cdf transform for normal distribution to get truncated
    # standard normal
    if tensor.dtype in [torch.float16, torch.bfloat16]:
        # The `erfinv_` op is not (yet?) defined in float16+cpu, bfloat16+gpu
        og_dtype = tensor.dtype
        tensor = tensor.to(torch.float32)
        tensor.erfinv_()
        tensor = tensor.to(og_dtype)
    else:
        tensor.erfinv_()

    # Transform to proper mean, std
    tensor.mul_(std * math.sqrt(2.0))
    tensor.add_(mean)

    # Clamp to ensure it's in the proper range
    if tensor.dtype == torch.float16:
        # The `clamp_` op is not (yet?) defined in float16+cpu
        tensor = tensor.to(torch.float32)
        tensor.clamp_(min=a, max=b)
        tensor = tensor.to(torch.float16)
    else:
        tensor.clamp_(min=a, max=b)


def trunc_normal_tf_(
    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
) -> torch.Tensor:
    r"""Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \leq \text{mean} \leq b`.
    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
    and the result is subsequently scaled and shifted by the mean and std args.
    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    """
    with torch.no_grad():
        _trunc_normal_(tensor, 0, 1.0, a, b)
        tensor.mul_(std).add_(mean)


def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    if mode == "fan_in":
        denom = fan_in
    elif mode == "fan_out":
        denom = fan_out
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2

    variance = scale / denom

    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
    else:
        raise ValueError(f"invalid distribution {distribution}")


def lecun_normal_(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")


def default_flax_embed_init(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="normal")


@dataclass
# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Siglip
class SiglipVisionModelOutput(ModelOutput):
    """
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


class SiglipVisionEmbeddings(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            padding="valid",
        )

        self.num_patches_per_side = self.image_size // self.patch_size
        self.num_patches = self.num_patches_per_side**2
        self.num_positions = self.num_patches
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)

    def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor, tgt_sizes: Optional[torch.IntTensor] = None) -> torch.Tensor:
        batch_size = pixel_values.size(0)

        patch_embeds = self.patch_embedding(pixel_values)
        embeddings = patch_embeds.flatten(2).transpose(1, 2)

        max_im_h, max_im_w = pixel_values.size(2), pixel_values.size(3)
        max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size
        boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
        position_ids = torch.full(
            size=(
                batch_size,
                max_nb_patches_h * max_nb_patches_w,
            ),
            fill_value=0,
        )

        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
            if tgt_sizes is not None:
                nb_patches_h = tgt_sizes[batch_idx][0]
                nb_patches_w = tgt_sizes[batch_idx][1]
            else:
                nb_patches_h = p_attn_mask[:, 0].sum()
                nb_patches_w = p_attn_mask[0].sum()

            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)

            bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
            bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)

            pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids

        position_ids = position_ids.to(self.position_embedding.weight.device)

        embeddings = embeddings + self.position_embedding(position_ids)
        return embeddings


class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)

        k_v_seq_len = key_states.shape[-2]
        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale

        if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class SiglipFlashAttention2(SiglipAttention):
    """
    Siglip flash attention module. This module inherits from `SiglipAttention` as the weights of the module stay
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.is_causal = False  # Hack to make sure we don't use a causal mask

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        output_attentions = False

        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # Flash attention requires the input to have the shape
        # batch_size x seq_length x head_dim x hidden_dim
        # therefore we just need to keep the original shape
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        # cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        # query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        # if past_key_value is not None:
        #     cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
        #     key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
        # to be able to avoid many of these transpose/reshape/view.
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        dropout_rate = self.dropout if self.training else 0.0

        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
        # therefore the input hidden states gets silently casted in float32. Hence, we need
        # cast them back in the correct dtype just to be sure everything works as expected.
        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
        # in fp32. (LlamaRMSNorm handles it correctly)

        input_dtype = query_states.dtype
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = torch.get_autocast_gpu_dtype()
            # Handle the case where the model is quantized
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.q_proj.weight.dtype

            logger.warning_once(
                "The input hidden states seems to be silently casted in float32, this might be related to the fact"
                " you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_states = query_states.to(target_dtype)
            key_states = key_states.to(target_dtype)
            value_states = value_states.to(target_dtype)

        attn_output = self._flash_attention_forward(
            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
        )

        attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights

    def _flash_attention_forward(
        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
    ):
        """
        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
        first unpad the input, then computes the attention scores and pad the final attention scores.
        Args:
            query_states (`torch.Tensor`):
                Input query states to be passed to Flash Attention API
            key_states (`torch.Tensor`):
                Input key states to be passed to Flash Attention API
            value_states (`torch.Tensor`):
                Input value states to be passed to Flash Attention API
            attention_mask (`torch.Tensor`):
                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
                position of padding tokens and 1 for the position of non-padding tokens.
            dropout (`float`, *optional*):
                Attention dropout
            softmax_scale (`float`, *optional*):
                The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim)
        """

        # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
        causal = self.is_causal and query_length != 1

        # Contains at least one padding token in the sequence
        if attention_mask is not None:
            batch_size = query_states.shape[0]
            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
                query_states, key_states, value_states, attention_mask, query_length
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

            attn_output_unpad = flash_attn_varlen_func(
                query_states,
                key_states,
                value_states,
                cu_seqlens_q=cu_seqlens_q,
                cu_seqlens_k=cu_seqlens_k,
                max_seqlen_q=max_seqlen_in_batch_q,
                max_seqlen_k=max_seqlen_in_batch_k,
                dropout_p=dropout,
                softmax_scale=softmax_scale,
                causal=causal,
            )

            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
        else:
            attn_output = flash_attn_func(
                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
            )

        return attn_output

    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

        key_layer = index_first_axis(
            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        value_layer = index_first_axis(
            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
        )
        if query_length == kv_seq_len:
            query_layer = index_first_axis(
                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
            )
            cu_seqlens_q = cu_seqlens_k
            max_seqlen_in_batch_q = max_seqlen_in_batch_k
            indices_q = indices_k
        elif query_length == 1:
            max_seqlen_in_batch_q = 1
            cu_seqlens_q = torch.arange(
                batch_size + 1, dtype=torch.int32, device=query_layer.device
            )  # There is a memcpy here, that is very bad.
            indices_q = cu_seqlens_q[:-1]
            query_layer = query_layer.squeeze(1)
        else:
            # The -q_len: slice assumes left padding.
            attention_mask = attention_mask[:, -query_length:]
            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

        return (
            query_layer,
            key_layer,
            value_layer,
            indices_q,
            (cu_seqlens_q, cu_seqlens_k),
            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
        )


# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
class SiglipMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


# Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
class SiglipEncoderLayer(nn.Module):
    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self.self_attn = (
            SiglipAttention(config)
            if not self._use_flash_attention_2
            else SiglipFlashAttention2(config)
        )
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class SiglipPreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = SiglipVisionConfig
    base_model_prefix = "siglip"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""

        if isinstance(module, SiglipVisionEmbeddings):
            width = self.config.hidden_size
            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
        elif isinstance(module, nn.Embedding):
            default_flax_embed_init(module.weight)
        elif isinstance(module, SiglipAttention):
            nn.init.normal_(module.q_proj.weight)
            nn.init.normal_(module.k_proj.weight)
            nn.init.normal_(module.v_proj.weight)
            nn.init.normal_(module.out_proj.weight)
            nn.init.zeros_(module.q_proj.bias)
            nn.init.zeros_(module.k_proj.bias)
            nn.init.zeros_(module.v_proj.bias)
            nn.init.zeros_(module.out_proj.bias)
        elif isinstance(module, SiglipMLP):
            nn.init.normal_(module.fc1.weight)
            nn.init.normal_(module.fc2.weight)
            nn.init.normal_(module.fc1.bias, std=1e-6)
            nn.init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            lecun_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


SIGLIP_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)
    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
    and behavior.
    Parameters:
        config ([`SiglipVisionConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


SIGLIP_VISION_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""


# Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
class SiglipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].
    Args:
        config: SiglipVisionConfig
    """

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    # Ignore copy
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )

@add_start_docstrings(
    """The vision model from SigLIP without any head or projection on top.""",
    SIGLIP_START_DOCSTRING,
)
class SiglipVisionTransformer(SiglipPreTrainedModel):
    config_class = SiglipVisionConfig
    main_input_name = "pixel_values"
    _supports_flash_attn_2 = True

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embedding

    @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
    def forward(
        self,
        pixel_values,
        patch_attention_mask: Optional[torch.BoolTensor] = None,
        tgt_sizes: Optional[torch.IntTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        r"""
        Returns:
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        batch_size = pixel_values.size(0)
        if patch_attention_mask is None:
            patch_attention_mask = torch.ones(
                size=(
                    batch_size,
                    pixel_values.size(2) // self.config.patch_size,
                    pixel_values.size(3) // self.config.patch_size,
                ),
                dtype=torch.bool,
                device=pixel_values.device,
            )

        hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, tgt_sizes=tgt_sizes)

        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
        # The call to `_upad_input` in `_flash_attention_forward` is expensive,
        # so when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
        # avoid passing the attention_mask, which is equivalent to attending to the full sequence.
        if not torch.any(~patch_attention_mask):
            attention_mask = None
        else:
            attention_mask = (
                _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
                if not self._use_flash_attention_2
                else patch_attention_mask
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.post_layernorm(last_hidden_state)

        if not return_dict:
            return (last_hidden_state, None) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=None,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
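
To make the NaViT-style variable-resolution path above concrete, here is a small sketch that instantiates a random-weight `SiglipVisionTransformer` and packs two images with different patch-grid sizes into one padded batch; the local import path and all sizes are illustrative assumptions, not the so400m checkpoint:

```python
import torch
from modeling_navit_siglip import SiglipVisionConfig, SiglipVisionTransformer  # hypothetical local import

config = SiglipVisionConfig(hidden_size=256, intermediate_size=512,
                            num_hidden_layers=2, num_attention_heads=4,
                            image_size=448, patch_size=14)
model = SiglipVisionTransformer(config).eval()

# Two padded patch sequences over a 32x32 patch grid: the first image uses the
# full grid, the second only the top 16 rows; the rest is masked out.
pixel_values = torch.randn(2, 3, 448, 448)
patch_attention_mask = torch.zeros(2, 32, 32, dtype=torch.bool)
patch_attention_mask[0] = True
patch_attention_mask[1, :16, :] = True
tgt_sizes = torch.tensor([[32, 32], [16, 32]], dtype=torch.int32)

with torch.no_grad():
    out = model(pixel_values,
                patch_attention_mask=patch_attention_mask,
                tgt_sizes=tgt_sizes)
print(out.last_hidden_state.shape)  # torch.Size([2, 1024, 256])
```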