tuandunghcmut commited on
Commit
9eb384f
·
verified ·
1 Parent(s): 329abda

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. open_clip/src/open_clip/__init__.py +18 -0
  2. open_clip/src/open_clip/coca_model.py +500 -0
  3. open_clip/src/open_clip/loss.py +448 -0
  4. open_clip/src/open_clip/model_configs/EVA01-g-14.json +18 -0
  5. open_clip/src/open_clip/model_configs/EVA02-E-14.json +18 -0
  6. open_clip/src/open_clip/model_configs/EVA02-L-14.json +18 -0
  7. open_clip/src/open_clip/model_configs/MobileCLIP-B.json +21 -0
  8. open_clip/src/open_clip/model_configs/MobileCLIP-S1.json +21 -0
  9. open_clip/src/open_clip/model_configs/MobileCLIP-S2.json +21 -0
  10. open_clip/src/open_clip/model_configs/RN101-quickgelu.json +22 -0
  11. open_clip/src/open_clip/model_configs/RN50-quickgelu.json +22 -0
  12. open_clip/src/open_clip/model_configs/RN50.json +21 -0
  13. open_clip/src/open_clip/model_configs/RN50x64-quickgelu.json +22 -0
  14. open_clip/src/open_clip/model_configs/ViT-B-16-SigLIP-384.json +29 -0
  15. open_clip/src/open_clip/model_configs/ViT-B-16-SigLIP-512.json +29 -0
  16. open_clip/src/open_clip/model_configs/ViT-B-16-SigLIP.json +29 -0
  17. open_clip/src/open_clip/model_configs/ViT-B-32-256.json +16 -0
  18. open_clip/src/open_clip/model_configs/ViT-H-14-378.json +17 -0
  19. open_clip/src/open_clip/model_configs/ViT-H-14-CLIPA-336.json +26 -0
  20. open_clip/src/open_clip/model_configs/ViT-H-14-quickgelu.json +18 -0
  21. open_clip/src/open_clip/model_configs/ViT-H-14.json +17 -0
  22. open_clip/src/open_clip/model_configs/ViT-L-14-280.json +16 -0
  23. open_clip/src/open_clip/model_configs/ViT-L-14-336-quickgelu.json +17 -0
  24. open_clip/src/open_clip/model_configs/ViT-L-14-quickgelu.json +17 -0
  25. open_clip/src/open_clip/model_configs/ViT-L-14.json +16 -0
  26. open_clip/src/open_clip/model_configs/ViT-L-16-320.json +16 -0
  27. open_clip/src/open_clip/model_configs/ViT-L-16-SigLIP-384.json +29 -0
  28. open_clip/src/open_clip/model_configs/ViT-M-16-alt.json +17 -0
  29. open_clip/src/open_clip/model_configs/ViT-M-32-alt.json +16 -0
  30. open_clip/src/open_clip/model_configs/ViT-M-32.json +16 -0
  31. open_clip/src/open_clip/model_configs/ViT-S-16.json +16 -0
  32. open_clip/src/open_clip/model_configs/ViT-SO400M-14-SigLIP-378.json +30 -0
  33. open_clip/src/open_clip/model_configs/ViT-SO400M-14-SigLIP-384.json +30 -0
  34. open_clip/src/open_clip/model_configs/ViT-bigG-14-CLIPA.json +27 -0
  35. open_clip/src/open_clip/model_configs/ViT-bigG-14-quickgelu.json +19 -0
  36. open_clip/src/open_clip/model_configs/ViT-bigG-14.json +18 -0
  37. open_clip/src/open_clip/model_configs/ViT-e-14.json +18 -0
  38. open_clip/src/open_clip/model_configs/ViTamin-B-LTT.json +20 -0
  39. open_clip/src/open_clip/model_configs/ViTamin-B.json +20 -0
  40. open_clip/src/open_clip/model_configs/ViTamin-L-336.json +20 -0
  41. open_clip/src/open_clip/model_configs/ViTamin-L-384.json +20 -0
  42. open_clip/src/open_clip/model_configs/ViTamin-L.json +20 -0
  43. open_clip/src/open_clip/model_configs/ViTamin-L2-256.json +20 -0
  44. open_clip/src/open_clip/model_configs/ViTamin-L2-336.json +20 -0
  45. open_clip/src/open_clip/model_configs/ViTamin-L2-384.json +20 -0
  46. open_clip/src/open_clip/model_configs/ViTamin-L2.json +20 -0
  47. open_clip/src/open_clip/model_configs/ViTamin-S-LTT.json +20 -0
  48. open_clip/src/open_clip/model_configs/ViTamin-S.json +20 -0
  49. open_clip/src/open_clip/model_configs/ViTamin-XL-256.json +20 -0
  50. open_clip/src/open_clip/model_configs/ViTamin-XL-336.json +20 -0
open_clip/src/open_clip/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .version import __version__
2
+
3
+ from .coca_model import CoCa
4
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
5
+ from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss
6
+ from .factory import list_models, add_model_config, get_model_config, load_checkpoint
7
+ from .loss import ClipLoss, DistillClipLoss, CoCaLoss
8
+ from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \
9
+ convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype, \
10
+ get_model_tokenize_cfg, get_model_preprocess_cfg, set_model_preprocess_cfg
11
+ from .openai import load_openai_model, list_openai_models
12
+ from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \
13
+ get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
14
+ from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub
15
+ from .tokenizer import SimpleTokenizer, tokenize, decode
16
+ from .transform import image_transform, AugmentationCfg
17
+ from .zero_shot_classifier import build_zero_shot_classifier, build_zero_shot_classifier_legacy
18
+ from .zero_shot_metadata import OPENAI_IMAGENET_TEMPLATES, SIMPLE_IMAGENET_TEMPLATES, IMAGENET_CLASSNAMES
open_clip/src/open_clip/coca_model.py ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ import numpy as np
7
+ from dataclasses import dataclass
8
+
9
+ from .transformer import (
10
+ LayerNormFp32,
11
+ LayerNorm,
12
+ QuickGELU,
13
+ MultimodalTransformer,
14
+ )
15
+ from .model import CLIPTextCfg, CLIPVisionCfg, _build_vision_tower, _build_text_tower
16
+
17
+ try:
18
+ from transformers import (
19
+ BeamSearchScorer,
20
+ LogitsProcessorList,
21
+ TopPLogitsWarper,
22
+ TopKLogitsWarper,
23
+ RepetitionPenaltyLogitsProcessor,
24
+ MinLengthLogitsProcessor,
25
+ MaxLengthCriteria,
26
+ StopStringCriteria,
27
+ EosTokenCriteria,
28
+ StoppingCriteriaList
29
+ )
30
+
31
+ GENERATION_TYPES = {
32
+ "top_k": TopKLogitsWarper,
33
+ "top_p": TopPLogitsWarper,
34
+ "beam_search": "beam_search"
35
+ }
36
+ _has_transformers = True
37
+ except ImportError as e:
38
+ GENERATION_TYPES = {
39
+ "top_k": None,
40
+ "top_p": None,
41
+ "beam_search": "beam_search"
42
+ }
43
+ _has_transformers = False
44
+
45
+
46
@dataclass
class MultimodalCfg(CLIPTextCfg):
    """Configuration for CoCa's multimodal text-decoder tower.

    Inherits all text-tower fields (context_length, width, layers,
    ls_init_value, ...) from CLIPTextCfg and adds decoder-specific knobs.
    """
    mlp_ratio: int = 4  # FFN hidden size as a multiple of width
    dim_head: int = 64  # per-head dim — not read by _build_text_decoder_tower; presumably used elsewhere, TODO confirm
    heads: int = 8  # attention heads in the multimodal decoder
    n_queries: int = 256  # presumably the attentional-pooler query count — not used in this file, TODO confirm
    attn_pooler_heads: int = 8  # heads for the attention pooler (consumed outside this file)
53
+
54
+
55
def _build_text_decoder_tower(
        embed_dim,
        multimodal_cfg,
        quick_gelu: bool = False,
        cast_dtype: Optional[torch.dtype] = None,
):
    """Construct the CoCa multimodal text decoder.

    Args:
        embed_dim: output (projection) dimension of the decoder.
        multimodal_cfg: MultimodalCfg instance or a dict of its fields.
        quick_gelu: use QuickGELU activation instead of nn.GELU.
        cast_dtype: when fp16/bf16, use a LayerNorm that casts to fp32 internally.

    Returns:
        A MultimodalTransformer configured from ``multimodal_cfg``.
    """
    if isinstance(multimodal_cfg, dict):
        multimodal_cfg = MultimodalCfg(**multimodal_cfg)

    act_layer = QuickGELU if quick_gelu else nn.GELU
    # low-precision models keep norm statistics in fp32 for stability
    if cast_dtype in (torch.float16, torch.bfloat16):
        norm_layer = LayerNormFp32
    else:
        norm_layer = LayerNorm

    return MultimodalTransformer(
        context_length=multimodal_cfg.context_length,
        width=multimodal_cfg.width,
        heads=multimodal_cfg.heads,
        layers=multimodal_cfg.layers,
        ls_init_value=multimodal_cfg.ls_init_value,
        output_dim=embed_dim,
        act_layer=act_layer,
        norm_layer=norm_layer,
    )
79
+
80
+
81
+ def _token_to_tensor(token_id, device: str = "cpu") -> torch.Tensor:
82
+ if not isinstance(token_id, torch.Tensor):
83
+ if isinstance(token_id, int):
84
+ token_id = [token_id]
85
+ token_id = torch.tensor(token_id, device=device)
86
+ return token_id
87
+
88
+
89
class CoCa(nn.Module):
    """CoCa (Contrastive Captioner) model: CLIP-style contrastive towers plus
    a multimodal text decoder for captioning.

    Holds a vision tower, a unimodal text tower, and a multimodal decoder that
    attends over image token embeddings to produce caption logits.
    """

    def __init__(
            self,
            embed_dim,
            multimodal_cfg: MultimodalCfg,
            text_cfg: CLIPTextCfg,
            vision_cfg: CLIPVisionCfg,
            quick_gelu: bool = False,
            init_logit_scale: float = np.log(1 / 0.07),
            init_logit_bias: Optional[float] = None,
            nonscalar_logit_scale: bool = False,
            cast_dtype: Optional[torch.dtype] = None,
            pad_id: int = 0,
    ):
        super().__init__()
        # accept plain dicts for all three configs (e.g. from JSON model configs)
        multimodal_cfg = MultimodalCfg(**multimodal_cfg) if isinstance(multimodal_cfg, dict) else multimodal_cfg
        text_cfg = CLIPTextCfg(**text_cfg) if isinstance(text_cfg, dict) else text_cfg
        vision_cfg = CLIPVisionCfg(**vision_cfg) if isinstance(vision_cfg, dict) else vision_cfg

        self.text = _build_text_tower(
            embed_dim=embed_dim,
            text_cfg=text_cfg,
            quick_gelu=quick_gelu,
            cast_dtype=cast_dtype,
        )

        # NOTE(review): both branches of this conditional return the same
        # attribute — the hf_model_name check is currently a no-op. Kept as-is
        # for checkpoint/behavior compatibility.
        vocab_size = (
            text_cfg.vocab_size  # for hf models
            if hasattr(text_cfg, "hf_model_name") and text_cfg.hf_model_name is not None
            else text_cfg.vocab_size
        )

        self.visual = _build_vision_tower(
            embed_dim=embed_dim,
            vision_cfg=vision_cfg,
            quick_gelu=quick_gelu,
            cast_dtype=cast_dtype,
        )

        # decoder projects to vocab_size: its output is per-token caption logits
        self.text_decoder = _build_text_decoder_tower(
            vocab_size,
            multimodal_cfg=multimodal_cfg,
            quick_gelu=quick_gelu,
            cast_dtype=cast_dtype,
        )

        # nonscalar_logit_scale keeps a shape-[1] parameter instead of a scalar
        lshape = [1] if nonscalar_logit_scale else []
        self.logit_scale = nn.Parameter(torch.ones(lshape) * init_logit_scale)
        if init_logit_bias is not None:
            self.logit_bias = nn.Parameter(torch.ones(lshape) * init_logit_bias)
        else:
            self.logit_bias = None
        self.pad_id = pad_id

        self.context_length = multimodal_cfg.context_length

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable: bool = True):
        """Toggle gradient checkpointing on all three towers."""
        self.visual.set_grad_checkpointing(enable)
        self.text.set_grad_checkpointing(enable)
        self.text_decoder.set_grad_checkpointing(enable)

    def _encode_image(self, images, normalize: bool = True):
        """Return (pooled image latent, image token embeddings)."""
        image_latent, tokens_embs = self.visual(images)
        image_latent = F.normalize(image_latent, dim=-1) if normalize else image_latent
        return image_latent, tokens_embs

    def _encode_text(self, text, normalize: bool = True):
        """Return (pooled text latent, per-token embeddings)."""
        text_latent, token_emb = self.text(text)
        text_latent = F.normalize(text_latent, dim=-1) if normalize else text_latent
        return text_latent, token_emb

    def encode_image(self, images, normalize: bool = True):
        """Pooled image features only (contrastive head)."""
        image_latent, _ = self._encode_image(images, normalize=normalize)
        return image_latent

    def encode_text(self, text, normalize: bool = True):
        """Pooled text features only (contrastive head)."""
        text_latent, _ = self._encode_text(text, normalize=normalize)
        return text_latent

    def forward(
            self,
            image,
            text: Optional[torch.Tensor] = None,
            image_latent: Optional[torch.Tensor] = None,
            image_embs: Optional[torch.Tensor] = None,
            output_labels: bool = True,
    ):
        """Full forward pass.

        Returns a dict with contrastive features, caption logits, logit scale,
        and (when output_labels) the shifted teacher-forcing labels. If ``text``
        is None only the image-side outputs are returned. Precomputed
        image_latent/image_embs may be passed to skip re-encoding (used by
        generate()).
        """
        if image_latent is None or image_embs is None:
            image_latent, image_embs = self._encode_image(image)

        if text is None:
            return {"image_features": image_latent, "image_embs": image_embs}

        text_latent, token_embs = self._encode_text(text)

        # FIXME this isn't an ideal solution, would like to improve -RW
        # labels are the input shifted left by one token (next-token prediction)
        labels: Optional[torch.Tensor] = text[:, 1:] if output_labels else None
        if output_labels:
            # align text_embs and thus logits with labels for teacher-forcing caption loss
            token_embs = token_embs[:, :-1]

        logits = self.text_decoder(image_embs, token_embs)
        out_dict = {
            "image_features": image_latent,
            "text_features": text_latent,
            "logits": logits,
            "logit_scale": self.logit_scale.exp()
        }
        if labels is not None:
            out_dict["labels"] = labels
        if self.logit_bias is not None:
            out_dict["logit_bias"] = self.logit_bias
        return out_dict

    def generate(
        self,
        image,
        text=None,
        seq_len=30,
        max_seq_len=77,
        temperature=1.,
        generation_type="beam_search",
        top_p=0.1,  # keep tokens in the 1 - top_p quantile
        top_k=1,  # keeps the top_k most probable tokens
        pad_token_id=None,
        eos_token_id=None,
        sot_token_id=None,
        num_beams=6,
        num_beam_groups=3,
        min_seq_len=5,
        stopping_criteria=None,
        repetition_penalty=1.0,
        fixed_output_length=False  # if True output.shape == (batch_size, seq_len)
    ):
        """Autoregressively generate caption token ids for ``image``.

        Supports "beam_search" (delegates to _generate_beamsearch), "top_p",
        and "top_k" sampling. Default SOT/EOS ids 49406/49407 match
        open_clip's SimpleTokenizer — TODO confirm for custom tokenizers.
        """
        # taking many ideas and components from HuggingFace GenerationMixin
        # https://huggingface.co/docs/transformers/main/en/main_classes/text_generation
        assert _has_transformers, "Please install transformers for generate functionality. `pip install transformers`."
        assert seq_len > min_seq_len, "seq_len must be larger than min_seq_len"
        device = image.device

        with torch.no_grad():
            sot_token_id = _token_to_tensor(49406 if sot_token_id is None else sot_token_id, device=device)
            eos_token_id = _token_to_tensor(49407 if eos_token_id is None else eos_token_id, device=device)
            pad_token_id = self.pad_id if pad_token_id is None else pad_token_id
            logit_processor = LogitsProcessorList(
                [
                    MinLengthLogitsProcessor(min_seq_len, eos_token_id),
                    RepetitionPenaltyLogitsProcessor(repetition_penalty),
                ]
            )

            if stopping_criteria is None:
                stopping_criteria = [MaxLengthCriteria(max_length=seq_len)]
            stopping_criteria = StoppingCriteriaList(stopping_criteria)

            if generation_type == "beam_search":
                output = self._generate_beamsearch(
                    image_inputs=image,
                    pad_token_id=pad_token_id,
                    eos_token_id=eos_token_id,
                    sot_token_id=sot_token_id,
                    num_beams=num_beams,
                    num_beam_groups=num_beam_groups,
                    min_seq_len=min_seq_len,
                    stopping_criteria=stopping_criteria,
                    logit_processor=logit_processor,
                )
                # right-pad beam-search output to seq_len when a fixed shape is requested
                if fixed_output_length and output.shape[1] < seq_len:
                    pad_len = seq_len - output.shape[1]
                    return torch.cat((
                            output,
                            torch.ones(output.shape[0], pad_len, device=device, dtype=output.dtype) * pad_token_id
                        ),
                        dim=1
                    )
                return output

            elif generation_type == "top_p":
                logit_warper = GENERATION_TYPES[generation_type](top_p)
            elif generation_type == "top_k":
                logit_warper = GENERATION_TYPES[generation_type](top_k)
            else:
                raise ValueError(
                    f"generation_type has to be one of "
                    f"{'| ' + ' | '.join(list(GENERATION_TYPES.keys())) + ' |'}."
                )

            # encode once; forward() reuses these on every decoding step
            image_latent, image_embs = self._encode_image(image)

            if text is None:
                text = torch.ones((image.shape[0], 1), device=device, dtype=torch.long) * sot_token_id

            was_training = self.training
            num_dims = len(text.shape)

            if num_dims == 1:
                text = text[None, :]

            self.eval()
            out = text

            while True:
                # only the last max_seq_len tokens fit the decoder context
                x = out[:, -max_seq_len:]
                cur_len = x.shape[1]
                logits = self(
                    image,
                    x,
                    image_latent=image_latent,
                    image_embs=image_embs,
                    output_labels=False,
                )["logits"][:, -1]
                # rows already finished (ended with EOS or PAD) keep emitting PAD
                mask = (out[:, -1] == eos_token_id) | (out[:, -1] == pad_token_id)
                sample = torch.ones((out.shape[0], 1), device=device, dtype=torch.long) * pad_token_id

                if mask.all():
                    if not fixed_output_length:
                        break
                else:
                    logits = logits[~mask, :]
                    filtered_logits = logit_processor(x[~mask, :], logits)
                    filtered_logits = logit_warper(x[~mask, :], filtered_logits)
                    probs = F.softmax(filtered_logits / temperature, dim=-1)

                    # force EOS on the final step so sequences terminate cleanly
                    if (cur_len + 1 == seq_len):
                        sample[~mask, :] = torch.ones((sum(~mask), 1), device=device, dtype=torch.long) * eos_token_id
                    else:
                        sample[~mask, :] = torch.multinomial(probs, 1)

                out = torch.cat((out, sample), dim=-1)

                cur_len += 1

                if all(stopping_criteria(out, None)):
                    break

            if num_dims == 1:
                out = out.squeeze(0)

            self.train(was_training)
            return out

    def _generate_beamsearch(
            self,
            image_inputs,
            pad_token_id=None,
            eos_token_id=None,
            sot_token_id=None,
            num_beams=6,
            num_beam_groups=3,
            min_seq_len=5,
            stopping_criteria=None,
            logit_processor=None,
            logit_warper=None,
    ):
        """Diverse (grouped) beam search decoding, adapted from HuggingFace's
        group_beam_search. Returns the finalized sequences tensor.
        """
        device = image_inputs.device
        batch_size = image_inputs.shape[0]
        # replicate each image num_beams times so every beam decodes against it
        image_inputs = torch.repeat_interleave(image_inputs, num_beams, dim=0)
        image_latent, image_embs = self._encode_image(image_inputs)

        input_ids = torch.ones((batch_size * num_beams, 1), device=device, dtype=torch.long)
        input_ids = input_ids * sot_token_id
        beam_scorer = BeamSearchScorer(
            batch_size=batch_size,
            num_beams=num_beams,
            device=device,
            num_beam_groups=num_beam_groups,
        )
        # instantiate logits processors
        logits_processor = (
            LogitsProcessorList([MinLengthLogitsProcessor(min_seq_len, eos_token_id=eos_token_id)])
            if logit_processor is None
            else logit_processor
        )

        num_beams = beam_scorer.num_beams
        num_beam_groups = beam_scorer.num_beam_groups
        num_sub_beams = num_beams // num_beam_groups
        batch_size = len(beam_scorer._beam_hyps) // num_beam_groups
        batch_beam_size, cur_len = input_ids.shape
        beam_indices = None

        if num_beams * batch_size != batch_beam_size:
            raise ValueError(
                f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
            )

        beam_scores = torch.full((batch_size, num_beams), -1e9, dtype=torch.float, device=device)
        # initialise score of first beam of each group with 0 and the rest with -1e9.
        # This ensures that the beams in the same group don't produce same tokens every time.
        beam_scores[:, ::num_sub_beams] = 0
        beam_scores = beam_scores.view((batch_size * num_beams,))

        while True:

            # predicted tokens in cur_len step
            current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device)

            # indices which will form the beams in the next time step
            reordering_indices = torch.zeros(batch_size * num_beams, dtype=torch.long, device=device)

            # do one decoder step on all beams of all sentences in batch
            model_inputs = prepare_inputs_for_generation(input_ids=input_ids, image_inputs=image_inputs)
            outputs = self(
                model_inputs['images'],
                model_inputs['text'],
                image_latent=image_latent,
                image_embs=image_embs,
                output_labels=False,
            )

            for beam_group_idx in range(num_beam_groups):
                group_start_idx = beam_group_idx * num_sub_beams
                group_end_idx = min(group_start_idx + num_sub_beams, num_beams)
                group_size = group_end_idx - group_start_idx

                # indices of beams of current group among all sentences in batch
                batch_group_indices = []

                for batch_idx in range(batch_size):
                    batch_group_indices.extend(
                        [batch_idx * num_beams + idx for idx in range(group_start_idx, group_end_idx)]
                    )
                group_input_ids = input_ids[batch_group_indices]

                # select outputs of beams of current group only
                next_token_logits = outputs['logits'][batch_group_indices, -1, :]
                vocab_size = next_token_logits.shape[-1]

                next_token_scores_processed = logits_processor(
                    group_input_ids, next_token_logits, current_tokens=current_tokens, beam_group_idx=beam_group_idx
                )
                next_token_scores = next_token_scores_processed + beam_scores[batch_group_indices].unsqueeze(-1)
                next_token_scores = next_token_scores.expand_as(next_token_scores_processed)

                # reshape for beam search
                next_token_scores = next_token_scores.view(batch_size, group_size * vocab_size)

                # keep 2 * group_size candidates so finished (EOS) beams can be replaced
                next_token_scores, next_tokens = torch.topk(
                    next_token_scores, 2 * group_size, dim=1, largest=True, sorted=True
                )

                next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
                next_tokens = next_tokens % vocab_size

                # stateless
                process_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
                beam_outputs = beam_scorer.process(
                    group_input_ids,
                    next_token_scores,
                    next_tokens,
                    next_indices,
                    pad_token_id=pad_token_id,
                    eos_token_id=eos_token_id,
                    beam_indices=process_beam_indices,
                    group_index=beam_group_idx,
                )
                beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"]
                beam_next_tokens = beam_outputs["next_beam_tokens"]
                beam_idx = beam_outputs["next_beam_indices"]

                input_ids[batch_group_indices] = group_input_ids[beam_idx]
                group_input_ids = torch.cat([group_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
                current_tokens[batch_group_indices] = group_input_ids[:, -1]

                # (beam_idx // group_size) -> batch_idx
                # (beam_idx % group_size) -> offset of idx inside the group
                reordering_indices[batch_group_indices] = (
                    num_beams * torch.div(beam_idx, group_size, rounding_mode="floor") + group_start_idx + (beam_idx % group_size)
                )

            input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1)

            # increase cur_len
            cur_len = cur_len + 1
            if beam_scorer.is_done or all(stopping_criteria(input_ids, None)):
                break

        final_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
        sequence_outputs = beam_scorer.finalize(
            input_ids,
            beam_scores,
            next_tokens,
            next_indices,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            max_length=stopping_criteria.max_length,
            beam_indices=final_beam_indices,
        )
        return sequence_outputs['sequences']
479
+
480
+
481
def prepare_inputs_for_generation(input_ids, image_inputs, past=None, **kwargs):
    """Assemble one decoding step's model inputs (HF GenerationMixin style).

    When ``past`` is truthy only the newest token is forwarded. Position ids
    are derived from ``attention_mask`` only when the caller did not supply
    them; NOTE: when that derivation does not apply, any caller-provided
    position_ids are discarded (set to None) — kept for compatibility.
    """
    if past:
        input_ids = input_ids[:, -1].unsqueeze(-1)

    attention_mask = kwargs.get("attention_mask", None)
    position_ids = kwargs.get("position_ids", None)

    can_derive_positions = attention_mask is not None and position_ids is None
    if can_derive_positions:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
    else:
        position_ids = None

    model_inputs = {
        "text": input_ids,
        "images": image_inputs,
        "past_key_values": past,
        "position_ids": position_ids,
        "attention_mask": attention_mask,
    }
    return model_inputs
open_clip/src/open_clip/loss.py ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.nn import functional as F
6
+
7
+ try:
8
+ import torch.distributed.nn
9
+ from torch import distributed as dist
10
+
11
+ has_distributed = True
12
+ except ImportError:
13
+ has_distributed = False
14
+
15
+ try:
16
+ import horovod.torch as hvd
17
+ except ImportError:
18
+ hvd = None
19
+
20
+
21
def gather_features(
        image_features,
        text_features,
        local_loss=False,
        gather_with_grad=False,
        rank=0,
        world_size=1,
        use_horovod=False
):
    """All-gather image/text feature tensors across distributed ranks.

    Args:
        image_features / text_features: local per-rank feature tensors.
        local_loss: when True, the gathered tensors do not need the local
            rank's slice replaced (local features are used separately).
        gather_with_grad: gather through autograd-aware collectives so
            gradients flow to all ranks.
        rank / world_size: this process's rank and total process count.
        use_horovod: use horovod's allgather instead of torch.distributed.

    Returns:
        (all_image_features, all_text_features), each of size
        world_size * local batch along dim 0.
    """
    assert has_distributed, 'torch.distributed did not import correctly, please use a PyTorch version with support.'
    if use_horovod:
        assert hvd is not None, 'Please install horovod'
        if gather_with_grad:
            all_image_features = hvd.allgather(image_features)
            all_text_features = hvd.allgather(text_features)
        else:
            with torch.no_grad():
                all_image_features = hvd.allgather(image_features)
                all_text_features = hvd.allgather(text_features)
            if not local_loss:
                # ensure grads for local rank when all_* features don't have a gradient
                gathered_image_features = list(all_image_features.chunk(world_size, dim=0))
                gathered_text_features = list(all_text_features.chunk(world_size, dim=0))
                gathered_image_features[rank] = image_features
                gathered_text_features[rank] = text_features
                all_image_features = torch.cat(gathered_image_features, dim=0)
                all_text_features = torch.cat(gathered_text_features, dim=0)
    else:
        # We gather tensors from all gpus
        if gather_with_grad:
            all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0)
            all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0)
        else:
            gathered_image_features = [torch.zeros_like(image_features) for _ in range(world_size)]
            gathered_text_features = [torch.zeros_like(text_features) for _ in range(world_size)]
            dist.all_gather(gathered_image_features, image_features)
            dist.all_gather(gathered_text_features, text_features)
            if not local_loss:
                # ensure grads for local rank when all_* features don't have a gradient
                gathered_image_features[rank] = image_features
                gathered_text_features[rank] = text_features
            all_image_features = torch.cat(gathered_image_features, dim=0)
            all_text_features = torch.cat(gathered_text_features, dim=0)

    return all_image_features, all_text_features
66
+
67
+
68
class ClipLoss(nn.Module):
    """Bidirectional InfoNCE loss for CLIP-style contrastive training.

    The image->text and text->image cross-entropy terms are averaged. In
    distributed runs (world_size > 1) features are gathered across ranks
    before computing logits; with ``local_loss`` only the local rows of the
    logit matrix are used, and the targets are offset by rank accordingly.
    """

    def __init__(
            self,
            local_loss=False,
            gather_with_grad=False,
            cache_labels=False,
            rank=0,
            world_size=1,
            use_horovod=False,
    ):
        super().__init__()
        self.local_loss = local_loss
        self.gather_with_grad = gather_with_grad
        self.cache_labels = cache_labels
        self.rank = rank
        self.world_size = world_size
        self.use_horovod = use_horovod

        # label cache: device -> target tensor, valid while num_logits is unchanged
        self.prev_num_logits = 0
        self.labels = {}

    def get_ground_truth(self, device, num_logits) -> torch.Tensor:
        """Return diagonal targets 0..num_logits-1, cached per device when enabled."""
        if self.prev_num_logits == num_logits and device in self.labels:
            return self.labels[device]
        labels = torch.arange(num_logits, device=device, dtype=torch.long)
        if self.world_size > 1 and self.local_loss:
            # each rank's positives sit in its own slice of the gathered features
            labels = labels + num_logits * self.rank
        if self.cache_labels:
            self.labels[device] = labels
            self.prev_num_logits = num_logits
        return labels

    def get_logits(self, image_features, text_features, logit_scale):
        """Compute scaled cosine-similarity logits in both directions."""
        if self.world_size <= 1:
            logits_per_image = logit_scale * image_features @ text_features.T
            logits_per_text = logit_scale * text_features @ image_features.T
            return logits_per_image, logits_per_text

        all_image_features, all_text_features = gather_features(
            image_features,
            text_features,
            local_loss=self.local_loss,
            gather_with_grad=self.gather_with_grad,
            rank=self.rank,
            world_size=self.world_size,
            use_horovod=self.use_horovod,
        )
        if self.local_loss:
            # local rows vs. globally gathered columns
            logits_per_image = logit_scale * image_features @ all_text_features.T
            logits_per_text = logit_scale * text_features @ all_image_features.T
        else:
            logits_per_image = logit_scale * all_image_features @ all_text_features.T
            logits_per_text = logits_per_image.T
        return logits_per_image, logits_per_text

    def forward(self, image_features, text_features, logit_scale, output_dict=False):
        """Average of the image->text and text->image cross-entropy losses."""
        logits_per_image, logits_per_text = self.get_logits(image_features, text_features, logit_scale)
        labels = self.get_ground_truth(image_features.device, logits_per_image.shape[0])

        i2t_loss = F.cross_entropy(logits_per_image, labels)
        t2i_loss = F.cross_entropy(logits_per_text, labels)
        total_loss = (i2t_loss + t2i_loss) / 2

        return {"contrastive_loss": total_loss} if output_dict else total_loss
140
+
141
+
142
class CoCaLoss(ClipLoss):
    """CoCa training loss: weighted contrastive (CLIP) + captioning terms.

    The captioning term is token-level cross entropy over the decoder
    logits, with pad tokens ignored via ``ignore_index``.
    """

    def __init__(
            self,
            caption_loss_weight,
            clip_loss_weight,
            pad_id=0,  # pad_token for open_clip custom tokenizer
            local_loss=False,
            gather_with_grad=False,
            cache_labels=False,
            rank=0,
            world_size=1,
            use_horovod=False,
    ):
        super().__init__(
            local_loss=local_loss,
            gather_with_grad=gather_with_grad,
            cache_labels=cache_labels,
            rank=rank,
            world_size=world_size,
            use_horovod=use_horovod
        )
        self.clip_loss_weight = clip_loss_weight
        self.caption_loss_weight = caption_loss_weight
        self.caption_loss = nn.CrossEntropyLoss(ignore_index=pad_id)

    def forward(self, image_features, text_features, logits, labels, logit_scale, output_dict=False):
        """Return (clip_loss, caption_loss), or a dict when output_dict is True."""
        if self.clip_loss_weight:
            clip_loss = self.clip_loss_weight * super().forward(image_features, text_features, logit_scale)
        else:
            # contrastive term disabled; keep a zero tensor so callers can still sum losses
            clip_loss = torch.tensor(0, device=logits.device)

        # CrossEntropyLoss expects (batch, vocab, seq) — move the vocab dim forward
        caption_loss = self.caption_loss_weight * self.caption_loss(
            logits.permute(0, 2, 1),
            labels,
        )

        if output_dict:
            return {"contrastive_loss": clip_loss, "caption_loss": caption_loss}

        return clip_loss, caption_loss
185
+
186
+
187
class DistillClipLoss(ClipLoss):
    """ClipLoss plus a distillation term against a teacher model's logits."""

    def dist_loss(self, teacher_logits, student_logits):
        """Soft cross entropy of student logits against teacher probabilities."""
        teacher_probs = teacher_logits.softmax(dim=1)
        student_logp = student_logits.log_softmax(dim=1)
        return -(teacher_probs * student_logp).sum(dim=1).mean(dim=0)

    def forward(
            self,
            image_features,
            text_features,
            logit_scale,
            dist_image_features,
            dist_text_features,
            dist_logit_scale,
            output_dict=False,
    ):
        """Return (contrastive_loss, distill_loss), or a dict when output_dict is True."""
        logits_per_image, logits_per_text = self.get_logits(
            image_features, text_features, logit_scale)
        dist_logits_per_image, dist_logits_per_text = self.get_logits(
            dist_image_features, dist_text_features, dist_logit_scale)

        labels = self.get_ground_truth(image_features.device, logits_per_image.shape[0])

        contrastive_loss = 0.5 * (
            F.cross_entropy(logits_per_image, labels)
            + F.cross_entropy(logits_per_text, labels)
        )

        distill_loss = 0.5 * (
            self.dist_loss(dist_logits_per_image, logits_per_image)
            + self.dist_loss(dist_logits_per_text, logits_per_text)
        )

        if output_dict:
            return {"contrastive_loss": contrastive_loss, "distill_loss": distill_loss}

        return contrastive_loss, distill_loss
224
+
225
+
226
def neighbour_exchange(from_rank, to_rank, tensor, group=None):
    """Send ``tensor`` to ``to_rank`` while receiving a same-shaped tensor from ``from_rank``.

    Both transfers are issued as one batched P2P round; blocks until complete.
    """
    tensor_recv = torch.zeros_like(tensor)
    ops = [
        torch.distributed.P2POp(torch.distributed.isend, tensor, to_rank, group=group),
        torch.distributed.P2POp(torch.distributed.irecv, tensor_recv, from_rank, group=group),
    ]
    for req in torch.distributed.batch_isend_irecv(ops):
        req.wait()
    return tensor_recv
244
+
245
+
246
def neighbour_exchange_bidir(left_rank, right_rank, tensor_to_left, tensor_to_right, group=None):
    """Exchange tensors with both ring neighbours in one batched P2P round.

    Sends ``tensor_to_left`` / ``tensor_to_right`` to the respective neighbours
    and returns ``(tensor_from_right, tensor_from_left)`` received from them.
    Note the receive buffers mirror the opposite-direction send shapes.
    """
    isend = torch.distributed.isend
    irecv = torch.distributed.irecv
    tensor_from_left = torch.zeros_like(tensor_to_right)
    tensor_from_right = torch.zeros_like(tensor_to_left)
    # Op order (sends first: right then left; then matching recvs) is kept
    # identical across ranks so the batched P2P ops pair up correctly.
    ops = [
        torch.distributed.P2POp(isend, tensor_to_right, right_rank, group=group),
        torch.distributed.P2POp(isend, tensor_to_left, left_rank, group=group),
        torch.distributed.P2POp(irecv, tensor_from_right, right_rank, group=group),
        torch.distributed.P2POp(irecv, tensor_from_left, left_rank, group=group),
    ]
    for req in torch.distributed.batch_isend_irecv(ops):
        req.wait()
    return tensor_from_right, tensor_from_left
277
+
278
+
279
class NeighbourExchange(torch.autograd.Function):
    """Autograd-aware single-direction neighbour exchange.

    Forward sends the tensor to ``to_rank`` and receives from ``from_rank``;
    backward routes the incoming gradient back along the reverse direction.
    """

    @staticmethod
    def forward(ctx, from_rank, to_rank, group, tensor):
        # Stash routing info so backward can send gradients the opposite way.
        ctx.group = group
        ctx.from_rank = from_rank
        ctx.to_rank = to_rank
        return neighbour_exchange(from_rank, to_rank, tensor, group=group)

    @staticmethod
    def backward(ctx, grad_output):
        # The first three inputs (ranks, group) are non-differentiable; the
        # tensor's gradient is exchanged with ranks swapped relative to forward.
        return (None, None, None) + (NeighbourExchange.apply(ctx.to_rank, ctx.from_rank, ctx.group, grad_output),)
290
+
291
+
292
def neighbour_exchange_with_grad(from_rank, to_rank, tensor, group=None):
    """Differentiable wrapper around :func:`neighbour_exchange`."""
    return NeighbourExchange.apply(from_rank, to_rank, group, tensor)
294
+
295
+
296
class NeighbourExchangeBidir(torch.autograd.Function):
    """Autograd-aware bidirectional neighbour exchange.

    Forward swaps a pair of tensors with the left/right neighbours; backward
    sends the two gradients back with the ring direction reversed.
    """

    @staticmethod
    def forward(ctx, left_rank, right_rank, group, tensor_to_left, tensor_to_right):
        # Save routing info so backward can reverse the exchange.
        ctx.group = group
        ctx.left_rank = left_rank
        ctx.right_rank = right_rank
        return neighbour_exchange_bidir(left_rank, right_rank, tensor_to_left, tensor_to_right, group=group)

    @staticmethod
    def backward(ctx, *grad_outputs):
        # Rank/group inputs receive no gradient; the two tensor gradients are
        # exchanged with left/right swapped relative to the forward pass.
        return (None, None, None) + \
            NeighbourExchangeBidir.apply(ctx.right_rank, ctx.left_rank, ctx.group, *grad_outputs)
308
+
309
+
310
def neighbour_exchange_bidir_with_grad(left_rank, right_rank, tensor_to_left, tensor_to_right, group=None):
    """Differentiable wrapper around :func:`neighbour_exchange_bidir`."""
    return NeighbourExchangeBidir.apply(left_rank, right_rank, group, tensor_to_left, tensor_to_right)
312
+
313
+
314
class SigLipLoss(nn.Module):
    """ Sigmoid Loss for Language Image Pre-Training (SigLIP) - https://arxiv.org/abs/2303.15343

    @article{zhai2023sigmoid,
        title={Sigmoid loss for language image pre-training},
        author={Zhai, Xiaohua and Mustafa, Basil and Kolesnikov, Alexander and Beyer, Lucas},
        journal={arXiv preprint arXiv:2303.15343},
        year={2023}
    }
    """
    def __init__(
            self,
            cache_labels: bool = False,
            rank: int = 0,
            world_size: int = 1,
            dist_impl: Optional[str] = None,
    ):
        # cache_labels: kept for interface parity; the label cache is not used yet (see FIXME below).
        # rank / world_size: this process's position in the distributed group.
        # dist_impl: strategy for collecting cross-rank negatives
        #   ('bidir', 'shift', 'reduce', 'gather').
        super().__init__()
        self.cache_labels = cache_labels
        self.rank = rank
        self.world_size = world_size
        self.dist_impl = dist_impl or 'bidir'  # default to bidir exchange for now, this will likely change
        assert self.dist_impl in ('bidir', 'shift', 'reduce', 'gather')

        # cache state FIXME cache not currently used, worthwhile?
        self.prev_num_logits = 0
        self.labels = {}

    def get_ground_truth(self, device, dtype, num_logits, negative_only=False) -> torch.Tensor:
        # Sigmoid-loss targets: -1 everywhere (negatives); +1 on the diagonal for
        # matched image/text pairs unless this block holds only negatives.
        labels = -torch.ones((num_logits, num_logits), device=device, dtype=dtype)
        if not negative_only:
            labels = 2 * torch.eye(num_logits, device=device, dtype=dtype) + labels
        return labels

    def get_logits(self, image_features, text_features, logit_scale, logit_bias=None):
        # Scaled pairwise similarity matrix, optionally shifted by a learned bias.
        logits = logit_scale * image_features @ text_features.T
        if logit_bias is not None:
            logits += logit_bias
        return logits

    def _loss(self, image_features, text_features, logit_scale, logit_bias=None, negative_only=False):
        # Pairwise sigmoid loss over all image/text pairs, normalized by the
        # local batch size (first dim of image_features).
        logits = self.get_logits(image_features, text_features, logit_scale, logit_bias)
        labels = self.get_ground_truth(
            image_features.device,
            image_features.dtype,
            image_features.shape[0],
            negative_only=negative_only,
        )
        loss = -F.logsigmoid(labels * logits).sum() / image_features.shape[0]
        return loss

    def forward(self, image_features, text_features, logit_scale, logit_bias, output_dict=False):
        # Local positives + local negatives first; cross-rank negatives are added below.
        loss = self._loss(image_features, text_features, logit_scale, logit_bias)

        if self.world_size > 1:
            if self.dist_impl == 'bidir':
                # Ring exchange in both directions: each step swaps text chunks
                # with both neighbours and scores them against local images as
                # negatives, halving the number of exchange rounds needed.
                right_rank = (self.rank + 1) % self.world_size
                left_rank = (self.rank - 1 + self.world_size) % self.world_size
                text_features_to_right = text_features_to_left = text_features
                num_bidir, remainder = divmod(self.world_size - 1, 2)
                for i in range(num_bidir):
                    text_features_recv = neighbour_exchange_bidir_with_grad(
                        left_rank,
                        right_rank,
                        text_features_to_left,
                        text_features_to_right,
                    )
                    for f in text_features_recv:
                        loss += self._loss(
                            image_features,
                            f,
                            logit_scale,
                            logit_bias,
                            negative_only=True,
                        )
                    text_features_to_left, text_features_to_right = text_features_recv

                if remainder:
                    # Even world size leaves one chunk: do a final single-direction hop.
                    text_features_recv = neighbour_exchange_with_grad(
                        left_rank,
                        right_rank,
                        text_features_to_right
                    )
                    loss += self._loss(
                        image_features,
                        text_features_recv,
                        logit_scale,
                        logit_bias,
                        negative_only=True,
                    )
            elif self.dist_impl == "shift":
                # Single-direction ring: shift text features one hop per step until
                # every rank has seen every other rank's chunk.
                right_rank = (self.rank + 1) % self.world_size
                left_rank = (self.rank - 1 + self.world_size) % self.world_size
                text_features_to_right = text_features
                for i in range(self.world_size - 1):
                    text_features_from_left = neighbour_exchange_with_grad(
                        left_rank,
                        right_rank,
                        text_features_to_right,
                    )
                    loss += self._loss(
                        image_features,
                        text_features_from_left,
                        logit_scale,
                        logit_bias,
                        negative_only=True,
                    )
                    text_features_to_right = text_features_from_left
            elif self.dist_impl == "reduce":
                # Broadcast-by-all-reduce: rank i's features are isolated by zeroing
                # every other rank's contribution before the SUM reduction.
                for i in range(self.world_size):
                    text_from_other = torch.distributed.nn.all_reduce(
                        text_features * (self.rank == i),
                        torch.distributed.ReduceOp.SUM,
                    )
                    # Skip the local chunk (already counted in the initial loss term).
                    loss += float(i != self.rank) * self._loss(
                        image_features,
                        text_from_other,
                        logit_scale,
                        logit_bias,
                        negative_only=True,
                    )
            elif self.dist_impl == "gather":
                # Gather all text features once, then score every non-local chunk.
                all_text = torch.distributed.nn.all_gather(text_features)
                for i in range(self.world_size):
                    loss += float(i != self.rank) * self._loss(
                        image_features,
                        all_text[i],
                        logit_scale,
                        logit_bias,
                        negative_only=True,
                    )
            else:
                assert False

        return {"contrastive_loss": loss} if output_dict else loss
open_clip/src/open_clip/model_configs/EVA01-g-14.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "timm_model_name": "eva_giant_patch14_224",
6
+ "timm_model_pretrained": false,
7
+ "timm_pool": "token",
8
+ "timm_proj": null
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 768,
14
+ "heads": 12,
15
+ "layers": 12
16
+ },
17
+ "custom_text": true
18
+ }
open_clip/src/open_clip/model_configs/EVA02-E-14.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "timm_model_name": "eva02_enormous_patch14_clip_224",
6
+ "timm_model_pretrained": false,
7
+ "timm_pool": "token",
8
+ "timm_proj": null
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 1024,
14
+ "heads": 16,
15
+ "layers": 24
16
+ },
17
+ "custom_text": true
18
+ }
open_clip/src/open_clip/model_configs/EVA02-L-14.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "timm_model_name": "eva02_large_patch14_clip_224",
6
+ "timm_model_pretrained": false,
7
+ "timm_pool": "token",
8
+ "timm_proj": null
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 768,
14
+ "heads": 12,
15
+ "layers": 12
16
+ },
17
+ "custom_text": true
18
+ }
open_clip/src/open_clip/model_configs/MobileCLIP-B.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vit_base_mci_224",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "token",
7
+ "timm_proj": null,
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.0,
10
+ "image_size": 224
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 512,
16
+ "heads": 8,
17
+ "layers": 12,
18
+ "no_causal_mask": false
19
+ },
20
+ "custom_text": true
21
+ }
open_clip/src/open_clip/model_configs/MobileCLIP-S1.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "timm_model_name": "fastvit_mci1",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "avg",
7
+ "timm_proj": null,
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.0,
10
+ "image_size": 256
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 512,
16
+ "heads": 8,
17
+ "layers": 12,
18
+ "no_causal_mask": true
19
+ },
20
+ "custom_text": true
21
+ }
open_clip/src/open_clip/model_configs/MobileCLIP-S2.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "timm_model_name": "fastvit_mci2",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "avg",
7
+ "timm_proj": null,
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.0,
10
+ "image_size": 256
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 512,
16
+ "heads": 8,
17
+ "layers": 12,
18
+ "no_causal_mask": true
19
+ },
20
+ "custom_text": true
21
+ }
open_clip/src/open_clip/model_configs/RN101-quickgelu.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 224,
6
+ "layers": [
7
+ 3,
8
+ 4,
9
+ 23,
10
+ 3
11
+ ],
12
+ "width": 64,
13
+ "patch_size": null
14
+ },
15
+ "text_cfg": {
16
+ "context_length": 77,
17
+ "vocab_size": 49408,
18
+ "width": 512,
19
+ "heads": 8,
20
+ "layers": 12
21
+ }
22
+ }
open_clip/src/open_clip/model_configs/RN50-quickgelu.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 224,
6
+ "layers": [
7
+ 3,
8
+ 4,
9
+ 6,
10
+ 3
11
+ ],
12
+ "width": 64,
13
+ "patch_size": null
14
+ },
15
+ "text_cfg": {
16
+ "context_length": 77,
17
+ "vocab_size": 49408,
18
+ "width": 512,
19
+ "heads": 8,
20
+ "layers": 12
21
+ }
22
+ }
open_clip/src/open_clip/model_configs/RN50.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": [
6
+ 3,
7
+ 4,
8
+ 6,
9
+ 3
10
+ ],
11
+ "width": 64,
12
+ "patch_size": null
13
+ },
14
+ "text_cfg": {
15
+ "context_length": 77,
16
+ "vocab_size": 49408,
17
+ "width": 512,
18
+ "heads": 8,
19
+ "layers": 12
20
+ }
21
+ }
open_clip/src/open_clip/model_configs/RN50x64-quickgelu.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 448,
6
+ "layers": [
7
+ 3,
8
+ 15,
9
+ 36,
10
+ 10
11
+ ],
12
+ "width": 128,
13
+ "patch_size": null
14
+ },
15
+ "text_cfg": {
16
+ "context_length": 77,
17
+ "vocab_size": 49408,
18
+ "width": 1024,
19
+ "heads": 16,
20
+ "layers": 12
21
+ }
22
+ }
open_clip/src/open_clip/model_configs/ViT-B-16-SigLIP-384.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 384,
7
+ "timm_model_name": "vit_base_patch16_siglip_384",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 768,
20
+ "heads": 12,
21
+ "layers": 12,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ }
28
+ }
29
+ }
open_clip/src/open_clip/model_configs/ViT-B-16-SigLIP-512.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 512,
7
+ "timm_model_name": "vit_base_patch16_siglip_512",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 768,
20
+ "heads": 12,
21
+ "layers": 12,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ }
28
+ }
29
+ }
open_clip/src/open_clip/model_configs/ViT-B-16-SigLIP.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 224,
7
+ "timm_model_name": "vit_base_patch16_siglip_224",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 768,
20
+ "heads": 12,
21
+ "layers": 12,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ }
28
+ }
29
+ }
open_clip/src/open_clip/model_configs/ViT-B-32-256.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "image_size": 256,
5
+ "layers": 12,
6
+ "width": 768,
7
+ "patch_size": 32
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 512,
13
+ "heads": 8,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-H-14-378.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 378,
5
+ "layers": 32,
6
+ "width": 1280,
7
+ "head_width": 80,
8
+ "patch_size": 14
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 1024,
14
+ "heads": 16,
15
+ "layers": 24
16
+ }
17
+ }
open_clip/src/open_clip/model_configs/ViT-H-14-CLIPA-336.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 336,
5
+ "layers": 32,
6
+ "width": 1280,
7
+ "head_width": 80,
8
+ "patch_size": 14,
9
+ "no_ln_pre": true,
10
+ "pool_type": "avg",
11
+ "final_ln_after_pool": true
12
+ },
13
+ "text_cfg": {
14
+ "context_length": 32,
15
+ "vocab_size": 32000,
16
+ "hf_tokenizer_name": "bert-base-uncased",
17
+ "tokenizer_kwargs": {
18
+ "strip_sep_token": true
19
+ },
20
+ "width": 1024,
21
+ "heads": 16,
22
+ "layers": 24,
23
+ "pool_type": "last",
24
+ "no_causal_mask": true
25
+ }
26
+ }
open_clip/src/open_clip/model_configs/ViT-H-14-quickgelu.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 224,
6
+ "layers": 32,
7
+ "width": 1280,
8
+ "head_width": 80,
9
+ "patch_size": 14
10
+ },
11
+ "text_cfg": {
12
+ "context_length": 77,
13
+ "vocab_size": 49408,
14
+ "width": 1024,
15
+ "heads": 16,
16
+ "layers": 24
17
+ }
18
+ }
open_clip/src/open_clip/model_configs/ViT-H-14.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 32,
6
+ "width": 1280,
7
+ "head_width": 80,
8
+ "patch_size": 14
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 1024,
14
+ "heads": 16,
15
+ "layers": 24
16
+ }
17
+ }
open_clip/src/open_clip/model_configs/ViT-L-14-280.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 280,
5
+ "layers": 24,
6
+ "width": 1024,
7
+ "patch_size": 14
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 768,
13
+ "heads": 12,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-L-14-336-quickgelu.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 336,
6
+ "layers": 24,
7
+ "width": 1024,
8
+ "patch_size": 14
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 768,
14
+ "heads": 12,
15
+ "layers": 12
16
+ }
17
+ }
open_clip/src/open_clip/model_configs/ViT-L-14-quickgelu.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 224,
6
+ "layers": 24,
7
+ "width": 1024,
8
+ "patch_size": 14
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 768,
14
+ "heads": 12,
15
+ "layers": 12
16
+ }
17
+ }
open_clip/src/open_clip/model_configs/ViT-L-14.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 24,
6
+ "width": 1024,
7
+ "patch_size": 14
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 768,
13
+ "heads": 12,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-L-16-320.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 320,
5
+ "layers": 24,
6
+ "width": 1024,
7
+ "patch_size": 16
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 768,
13
+ "heads": 12,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-L-16-SigLIP-384.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 384,
7
+ "timm_model_name": "vit_large_patch16_siglip_384",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 1024,
20
+ "heads": 16,
21
+ "layers": 24,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ }
28
+ }
29
+ }
open_clip/src/open_clip/model_configs/ViT-M-16-alt.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 384,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 12,
6
+ "width": 512,
7
+ "patch_size": 16,
8
+ "ls_init_value": 1e-4
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 384,
14
+ "heads": 6,
15
+ "layers": 12
16
+ }
17
+ }
open_clip/src/open_clip/model_configs/ViT-M-32-alt.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 384,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 12,
6
+ "width": 512,
7
+ "patch_size": 32
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 384,
13
+ "heads": 6,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-M-32.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 12,
6
+ "width": 512,
7
+ "patch_size": 32
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 512,
13
+ "heads": 8,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-S-16.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 384,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 12,
6
+ "width": 384,
7
+ "patch_size": 16
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 384,
13
+ "heads": 6,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-SO400M-14-SigLIP-378.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1152,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 378,
7
+ "timm_model_name": "vit_so400m_patch14_siglip_378",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 1152,
20
+ "heads": 16,
21
+ "layers": 27,
22
+ "mlp_ratio": 3.7362,
23
+ "no_causal_mask": true,
24
+ "proj_bias": true,
25
+ "pool_type": "last",
26
+ "norm_kwargs":{
27
+ "eps": 1e-6
28
+ }
29
+ }
30
+ }
open_clip/src/open_clip/model_configs/ViT-SO400M-14-SigLIP-384.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1152,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 384,
7
+ "timm_model_name": "vit_so400m_patch14_siglip_384",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 1152,
20
+ "heads": 16,
21
+ "layers": 27,
22
+ "mlp_ratio": 3.7362,
23
+ "no_causal_mask": true,
24
+ "proj_bias": true,
25
+ "pool_type": "last",
26
+ "norm_kwargs":{
27
+ "eps": 1e-6
28
+ }
29
+ }
30
+ }
open_clip/src/open_clip/model_configs/ViT-bigG-14-CLIPA.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1280,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 48,
6
+ "width": 1664,
7
+ "head_width": 104,
8
+ "mlp_ratio": 4.9231,
9
+ "patch_size": 14,
10
+ "no_ln_pre": true,
11
+ "pool_type": "avg",
12
+ "final_ln_after_pool": true
13
+ },
14
+ "text_cfg": {
15
+ "context_length": 32,
16
+ "vocab_size": 32000,
17
+ "hf_tokenizer_name": "bert-base-uncased",
18
+ "tokenizer_kwargs": {
19
+ "strip_sep_token": true
20
+ },
21
+ "width": 1280,
22
+ "heads": 20,
23
+ "layers": 32,
24
+ "pool_type": "last",
25
+ "no_causal_mask": true
26
+ }
27
+ }
open_clip/src/open_clip/model_configs/ViT-bigG-14-quickgelu.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1280,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 224,
6
+ "layers": 48,
7
+ "width": 1664,
8
+ "head_width": 104,
9
+ "mlp_ratio": 4.9231,
10
+ "patch_size": 14
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 1280,
16
+ "heads": 20,
17
+ "layers": 32
18
+ }
19
+ }
open_clip/src/open_clip/model_configs/ViT-bigG-14.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1280,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 48,
6
+ "width": 1664,
7
+ "head_width": 104,
8
+ "mlp_ratio": 4.9231,
9
+ "patch_size": 14
10
+ },
11
+ "text_cfg": {
12
+ "context_length": 77,
13
+ "vocab_size": 49408,
14
+ "width": 1280,
15
+ "heads": 20,
16
+ "layers": 32
17
+ }
18
+ }
open_clip/src/open_clip/model_configs/ViT-e-14.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1280,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 56,
6
+ "width": 1792,
7
+ "head_width": 112,
8
+ "mlp_ratio": 8.5715,
9
+ "patch_size": 14
10
+ },
11
+ "text_cfg": {
12
+ "context_length": 77,
13
+ "vocab_size": 49408,
14
+ "width": 1280,
15
+ "heads": 20,
16
+ "layers": 36
17
+ }
18
+ }
open_clip/src/open_clip/model_configs/ViTamin-B-LTT.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_base_224",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 224
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 768,
16
+ "heads": 12,
17
+ "layers": 12
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-B.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_base_224",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 224
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 512,
16
+ "heads": 8,
17
+ "layers": 12
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-L-336.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_large_336",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 336
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 768,
16
+ "heads": 12,
17
+ "layers": 12
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-L-384.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_large_384",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 384
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 768,
16
+ "heads": 12,
17
+ "layers": 12
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-L.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_large_224",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 224
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 768,
16
+ "heads": 12,
17
+ "layers": 12
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-L2-256.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_large2_256",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 256
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 1024,
16
+ "heads": 16,
17
+ "layers": 24
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-L2-336.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_large2_336",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 336
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 1024,
16
+ "heads": 16,
17
+ "layers": 24
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-L2-384.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_large2_384",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 384
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 1024,
16
+ "heads": 16,
17
+ "layers": 24
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-L2.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_large2_224",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 224
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 1024,
16
+ "heads": 16,
17
+ "layers": 24
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-S-LTT.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_small_224",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 224
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 768,
16
+ "heads": 12,
17
+ "layers": 12
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-S.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 384,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_small_224",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 224
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 384,
16
+ "heads": 6,
17
+ "layers": 12
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-XL-256.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1152,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_xlarge_256",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 256
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 1152,
16
+ "heads": 16,
17
+ "layers": 27
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-XL-336.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1152,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_xlarge_336",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 336
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 1152,
16
+ "heads": 16,
17
+ "layers": 27
18
+ },
19
+ "custom_text": true
20
+ }