tuandunghcmut commited on
Commit
9eb384f
·
verified ·
1 Parent(s): 329abda

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. open_clip/src/open_clip/__init__.py +18 -0
  2. open_clip/src/open_clip/coca_model.py +500 -0
  3. open_clip/src/open_clip/loss.py +448 -0
  4. open_clip/src/open_clip/model_configs/EVA01-g-14.json +18 -0
  5. open_clip/src/open_clip/model_configs/EVA02-E-14.json +18 -0
  6. open_clip/src/open_clip/model_configs/EVA02-L-14.json +18 -0
  7. open_clip/src/open_clip/model_configs/MobileCLIP-B.json +21 -0
  8. open_clip/src/open_clip/model_configs/MobileCLIP-S1.json +21 -0
  9. open_clip/src/open_clip/model_configs/MobileCLIP-S2.json +21 -0
  10. open_clip/src/open_clip/model_configs/RN101-quickgelu.json +22 -0
  11. open_clip/src/open_clip/model_configs/RN50-quickgelu.json +22 -0
  12. open_clip/src/open_clip/model_configs/RN50.json +21 -0
  13. open_clip/src/open_clip/model_configs/RN50x64-quickgelu.json +22 -0
  14. open_clip/src/open_clip/model_configs/ViT-B-16-SigLIP-384.json +29 -0
  15. open_clip/src/open_clip/model_configs/ViT-B-16-SigLIP-512.json +29 -0
  16. open_clip/src/open_clip/model_configs/ViT-B-16-SigLIP.json +29 -0
  17. open_clip/src/open_clip/model_configs/ViT-B-32-256.json +16 -0
  18. open_clip/src/open_clip/model_configs/ViT-H-14-378.json +17 -0
  19. open_clip/src/open_clip/model_configs/ViT-H-14-CLIPA-336.json +26 -0
  20. open_clip/src/open_clip/model_configs/ViT-H-14-quickgelu.json +18 -0
  21. open_clip/src/open_clip/model_configs/ViT-H-14.json +17 -0
  22. open_clip/src/open_clip/model_configs/ViT-L-14-280.json +16 -0
  23. open_clip/src/open_clip/model_configs/ViT-L-14-336-quickgelu.json +17 -0
  24. open_clip/src/open_clip/model_configs/ViT-L-14-quickgelu.json +17 -0
  25. open_clip/src/open_clip/model_configs/ViT-L-14.json +16 -0
  26. open_clip/src/open_clip/model_configs/ViT-L-16-320.json +16 -0
  27. open_clip/src/open_clip/model_configs/ViT-L-16-SigLIP-384.json +29 -0
  28. open_clip/src/open_clip/model_configs/ViT-M-16-alt.json +17 -0
  29. open_clip/src/open_clip/model_configs/ViT-M-32-alt.json +16 -0
  30. open_clip/src/open_clip/model_configs/ViT-M-32.json +16 -0
  31. open_clip/src/open_clip/model_configs/ViT-S-16.json +16 -0
  32. open_clip/src/open_clip/model_configs/ViT-SO400M-14-SigLIP-378.json +30 -0
  33. open_clip/src/open_clip/model_configs/ViT-SO400M-14-SigLIP-384.json +30 -0
  34. open_clip/src/open_clip/model_configs/ViT-bigG-14-CLIPA.json +27 -0
  35. open_clip/src/open_clip/model_configs/ViT-bigG-14-quickgelu.json +19 -0
  36. open_clip/src/open_clip/model_configs/ViT-bigG-14.json +18 -0
  37. open_clip/src/open_clip/model_configs/ViT-e-14.json +18 -0
  38. open_clip/src/open_clip/model_configs/ViTamin-B-LTT.json +20 -0
  39. open_clip/src/open_clip/model_configs/ViTamin-B.json +20 -0
  40. open_clip/src/open_clip/model_configs/ViTamin-L-336.json +20 -0
  41. open_clip/src/open_clip/model_configs/ViTamin-L-384.json +20 -0
  42. open_clip/src/open_clip/model_configs/ViTamin-L.json +20 -0
  43. open_clip/src/open_clip/model_configs/ViTamin-L2-256.json +20 -0
  44. open_clip/src/open_clip/model_configs/ViTamin-L2-336.json +20 -0
  45. open_clip/src/open_clip/model_configs/ViTamin-L2-384.json +20 -0
  46. open_clip/src/open_clip/model_configs/ViTamin-L2.json +20 -0
  47. open_clip/src/open_clip/model_configs/ViTamin-S-LTT.json +20 -0
  48. open_clip/src/open_clip/model_configs/ViTamin-S.json +20 -0
  49. open_clip/src/open_clip/model_configs/ViTamin-XL-256.json +20 -0
  50. open_clip/src/open_clip/model_configs/ViTamin-XL-336.json +20 -0
open_clip/src/open_clip/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .version import __version__
2
+
3
+ from .coca_model import CoCa
4
+ from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
5
+ from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_loss
6
+ from .factory import list_models, add_model_config, get_model_config, load_checkpoint
7
+ from .loss import ClipLoss, DistillClipLoss, CoCaLoss
8
+ from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \
9
+ convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype, \
10
+ get_model_tokenize_cfg, get_model_preprocess_cfg, set_model_preprocess_cfg
11
+ from .openai import load_openai_model, list_openai_models
12
+ from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \
13
+ get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
14
+ from .push_to_hf_hub import push_pretrained_to_hf_hub, push_to_hf_hub
15
+ from .tokenizer import SimpleTokenizer, tokenize, decode
16
+ from .transform import image_transform, AugmentationCfg
17
+ from .zero_shot_classifier import build_zero_shot_classifier, build_zero_shot_classifier_legacy
18
+ from .zero_shot_metadata import OPENAI_IMAGENET_TEMPLATES, SIMPLE_IMAGENET_TEMPLATES, IMAGENET_CLASSNAMES
open_clip/src/open_clip/coca_model.py ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ import numpy as np
7
+ from dataclasses import dataclass
8
+
9
+ from .transformer import (
10
+ LayerNormFp32,
11
+ LayerNorm,
12
+ QuickGELU,
13
+ MultimodalTransformer,
14
+ )
15
+ from .model import CLIPTextCfg, CLIPVisionCfg, _build_vision_tower, _build_text_tower
16
+
17
+ try:
18
+ from transformers import (
19
+ BeamSearchScorer,
20
+ LogitsProcessorList,
21
+ TopPLogitsWarper,
22
+ TopKLogitsWarper,
23
+ RepetitionPenaltyLogitsProcessor,
24
+ MinLengthLogitsProcessor,
25
+ MaxLengthCriteria,
26
+ StopStringCriteria,
27
+ EosTokenCriteria,
28
+ StoppingCriteriaList
29
+ )
30
+
31
+ GENERATION_TYPES = {
32
+ "top_k": TopKLogitsWarper,
33
+ "top_p": TopPLogitsWarper,
34
+ "beam_search": "beam_search"
35
+ }
36
+ _has_transformers = True
37
+ except ImportError as e:
38
+ GENERATION_TYPES = {
39
+ "top_k": None,
40
+ "top_p": None,
41
+ "beam_search": "beam_search"
42
+ }
43
+ _has_transformers = False
44
+
45
+
46
@dataclass
class MultimodalCfg(CLIPTextCfg):
    """Configuration for CoCa's multimodal text-decoder tower.

    Inherits all text-tower fields (context_length, width, layers,
    ls_init_value, ...) from CLIPTextCfg and adds decoder-specific knobs.
    """
    mlp_ratio: int = 4  # FFN hidden size as a multiple of width
    dim_head: int = 64  # per-head dim — not read by _build_text_decoder_tower; presumably used elsewhere, TODO confirm
    heads: int = 8  # attention heads in the multimodal decoder
    n_queries: int = 256  # presumably the attentional-pooler query count — not used in this file, TODO confirm
    attn_pooler_heads: int = 8  # heads for the attention pooler (consumed outside this file)
53
+
54
+
55
def _build_text_decoder_tower(
        embed_dim,
        multimodal_cfg,
        quick_gelu: bool = False,
        cast_dtype: Optional[torch.dtype] = None,
):
    """Construct the CoCa multimodal text decoder.

    Args:
        embed_dim: output (projection) dimension of the decoder.
        multimodal_cfg: MultimodalCfg instance or a dict of its fields.
        quick_gelu: use QuickGELU activation instead of nn.GELU.
        cast_dtype: when fp16/bf16, use a LayerNorm that casts to fp32 internally.

    Returns:
        A MultimodalTransformer configured from ``multimodal_cfg``.
    """
    if isinstance(multimodal_cfg, dict):
        multimodal_cfg = MultimodalCfg(**multimodal_cfg)

    act_layer = QuickGELU if quick_gelu else nn.GELU
    # low-precision models keep norm statistics in fp32 for stability
    if cast_dtype in (torch.float16, torch.bfloat16):
        norm_layer = LayerNormFp32
    else:
        norm_layer = LayerNorm

    return MultimodalTransformer(
        context_length=multimodal_cfg.context_length,
        width=multimodal_cfg.width,
        heads=multimodal_cfg.heads,
        layers=multimodal_cfg.layers,
        ls_init_value=multimodal_cfg.ls_init_value,
        output_dim=embed_dim,
        act_layer=act_layer,
        norm_layer=norm_layer,
    )
79
+
80
+
81
+ def _token_to_tensor(token_id, device: str = "cpu") -> torch.Tensor:
82
+ if not isinstance(token_id, torch.Tensor):
83
+ if isinstance(token_id, int):
84
+ token_id = [token_id]
85
+ token_id = torch.tensor(token_id, device=device)
86
+ return token_id
87
+
88
+
89
class CoCa(nn.Module):
    """CoCa (Contrastive Captioner) model: CLIP-style contrastive towers plus
    a multimodal text decoder for captioning.

    Holds a vision tower, a unimodal text tower, and a multimodal decoder that
    attends over image token embeddings to produce caption logits.
    """

    def __init__(
            self,
            embed_dim,
            multimodal_cfg: MultimodalCfg,
            text_cfg: CLIPTextCfg,
            vision_cfg: CLIPVisionCfg,
            quick_gelu: bool = False,
            init_logit_scale: float = np.log(1 / 0.07),
            init_logit_bias: Optional[float] = None,
            nonscalar_logit_scale: bool = False,
            cast_dtype: Optional[torch.dtype] = None,
            pad_id: int = 0,
    ):
        super().__init__()
        # accept plain dicts for all three configs (e.g. from JSON model configs)
        multimodal_cfg = MultimodalCfg(**multimodal_cfg) if isinstance(multimodal_cfg, dict) else multimodal_cfg
        text_cfg = CLIPTextCfg(**text_cfg) if isinstance(text_cfg, dict) else text_cfg
        vision_cfg = CLIPVisionCfg(**vision_cfg) if isinstance(vision_cfg, dict) else vision_cfg

        self.text = _build_text_tower(
            embed_dim=embed_dim,
            text_cfg=text_cfg,
            quick_gelu=quick_gelu,
            cast_dtype=cast_dtype,
        )

        # NOTE(review): both branches of this conditional return the same
        # attribute — the hf_model_name check is currently a no-op. Kept as-is
        # for checkpoint/behavior compatibility.
        vocab_size = (
            text_cfg.vocab_size  # for hf models
            if hasattr(text_cfg, "hf_model_name") and text_cfg.hf_model_name is not None
            else text_cfg.vocab_size
        )

        self.visual = _build_vision_tower(
            embed_dim=embed_dim,
            vision_cfg=vision_cfg,
            quick_gelu=quick_gelu,
            cast_dtype=cast_dtype,
        )

        # decoder projects to vocab_size: its output is per-token caption logits
        self.text_decoder = _build_text_decoder_tower(
            vocab_size,
            multimodal_cfg=multimodal_cfg,
            quick_gelu=quick_gelu,
            cast_dtype=cast_dtype,
        )

        # nonscalar_logit_scale keeps a shape-[1] parameter instead of a scalar
        lshape = [1] if nonscalar_logit_scale else []
        self.logit_scale = nn.Parameter(torch.ones(lshape) * init_logit_scale)
        if init_logit_bias is not None:
            self.logit_bias = nn.Parameter(torch.ones(lshape) * init_logit_bias)
        else:
            self.logit_bias = None
        self.pad_id = pad_id

        self.context_length = multimodal_cfg.context_length

    @torch.jit.ignore
    def set_grad_checkpointing(self, enable: bool = True):
        """Toggle gradient checkpointing on all three towers."""
        self.visual.set_grad_checkpointing(enable)
        self.text.set_grad_checkpointing(enable)
        self.text_decoder.set_grad_checkpointing(enable)

    def _encode_image(self, images, normalize: bool = True):
        """Return (pooled image latent, image token embeddings)."""
        image_latent, tokens_embs = self.visual(images)
        image_latent = F.normalize(image_latent, dim=-1) if normalize else image_latent
        return image_latent, tokens_embs

    def _encode_text(self, text, normalize: bool = True):
        """Return (pooled text latent, per-token embeddings)."""
        text_latent, token_emb = self.text(text)
        text_latent = F.normalize(text_latent, dim=-1) if normalize else text_latent
        return text_latent, token_emb

    def encode_image(self, images, normalize: bool = True):
        """Pooled image features only (contrastive head)."""
        image_latent, _ = self._encode_image(images, normalize=normalize)
        return image_latent

    def encode_text(self, text, normalize: bool = True):
        """Pooled text features only (contrastive head)."""
        text_latent, _ = self._encode_text(text, normalize=normalize)
        return text_latent

    def forward(
            self,
            image,
            text: Optional[torch.Tensor] = None,
            image_latent: Optional[torch.Tensor] = None,
            image_embs: Optional[torch.Tensor] = None,
            output_labels: bool = True,
    ):
        """Full forward pass.

        Returns a dict with contrastive features, caption logits, logit scale,
        and (when output_labels) the shifted teacher-forcing labels. If ``text``
        is None only the image-side outputs are returned. Precomputed
        image_latent/image_embs may be passed to skip re-encoding (used by
        generate()).
        """
        if image_latent is None or image_embs is None:
            image_latent, image_embs = self._encode_image(image)

        if text is None:
            return {"image_features": image_latent, "image_embs": image_embs}

        text_latent, token_embs = self._encode_text(text)

        # FIXME this isn't an ideal solution, would like to improve -RW
        # labels are the input shifted left by one token (next-token prediction)
        labels: Optional[torch.Tensor] = text[:, 1:] if output_labels else None
        if output_labels:
            # align text_embs and thus logits with labels for teacher-forcing caption loss
            token_embs = token_embs[:, :-1]

        logits = self.text_decoder(image_embs, token_embs)
        out_dict = {
            "image_features": image_latent,
            "text_features": text_latent,
            "logits": logits,
            "logit_scale": self.logit_scale.exp()
        }
        if labels is not None:
            out_dict["labels"] = labels
        if self.logit_bias is not None:
            out_dict["logit_bias"] = self.logit_bias
        return out_dict

    def generate(
        self,
        image,
        text=None,
        seq_len=30,
        max_seq_len=77,
        temperature=1.,
        generation_type="beam_search",
        top_p=0.1,  # keep tokens in the 1 - top_p quantile
        top_k=1,  # keeps the top_k most probable tokens
        pad_token_id=None,
        eos_token_id=None,
        sot_token_id=None,
        num_beams=6,
        num_beam_groups=3,
        min_seq_len=5,
        stopping_criteria=None,
        repetition_penalty=1.0,
        fixed_output_length=False  # if True output.shape == (batch_size, seq_len)
    ):
        """Autoregressively generate caption token ids for ``image``.

        Supports "beam_search" (delegates to _generate_beamsearch), "top_p",
        and "top_k" sampling. Default SOT/EOS ids 49406/49407 match
        open_clip's SimpleTokenizer — TODO confirm for custom tokenizers.
        """
        # taking many ideas and components from HuggingFace GenerationMixin
        # https://huggingface.co/docs/transformers/main/en/main_classes/text_generation
        assert _has_transformers, "Please install transformers for generate functionality. `pip install transformers`."
        assert seq_len > min_seq_len, "seq_len must be larger than min_seq_len"
        device = image.device

        with torch.no_grad():
            sot_token_id = _token_to_tensor(49406 if sot_token_id is None else sot_token_id, device=device)
            eos_token_id = _token_to_tensor(49407 if eos_token_id is None else eos_token_id, device=device)
            pad_token_id = self.pad_id if pad_token_id is None else pad_token_id
            logit_processor = LogitsProcessorList(
                [
                    MinLengthLogitsProcessor(min_seq_len, eos_token_id),
                    RepetitionPenaltyLogitsProcessor(repetition_penalty),
                ]
            )

            if stopping_criteria is None:
                stopping_criteria = [MaxLengthCriteria(max_length=seq_len)]
            stopping_criteria = StoppingCriteriaList(stopping_criteria)

            if generation_type == "beam_search":
                output = self._generate_beamsearch(
                    image_inputs=image,
                    pad_token_id=pad_token_id,
                    eos_token_id=eos_token_id,
                    sot_token_id=sot_token_id,
                    num_beams=num_beams,
                    num_beam_groups=num_beam_groups,
                    min_seq_len=min_seq_len,
                    stopping_criteria=stopping_criteria,
                    logit_processor=logit_processor,
                )
                # right-pad beam-search output to seq_len when a fixed shape is requested
                if fixed_output_length and output.shape[1] < seq_len:
                    pad_len = seq_len - output.shape[1]
                    return torch.cat((
                            output,
                            torch.ones(output.shape[0], pad_len, device=device, dtype=output.dtype) * pad_token_id
                        ),
                        dim=1
                    )
                return output

            elif generation_type == "top_p":
                logit_warper = GENERATION_TYPES[generation_type](top_p)
            elif generation_type == "top_k":
                logit_warper = GENERATION_TYPES[generation_type](top_k)
            else:
                raise ValueError(
                    f"generation_type has to be one of "
                    f"{'| ' + ' | '.join(list(GENERATION_TYPES.keys())) + ' |'}."
                )

            # encode once; forward() reuses these on every decoding step
            image_latent, image_embs = self._encode_image(image)

            if text is None:
                text = torch.ones((image.shape[0], 1), device=device, dtype=torch.long) * sot_token_id

            was_training = self.training
            num_dims = len(text.shape)

            if num_dims == 1:
                text = text[None, :]

            self.eval()
            out = text

            while True:
                # only the last max_seq_len tokens fit the decoder context
                x = out[:, -max_seq_len:]
                cur_len = x.shape[1]
                logits = self(
                    image,
                    x,
                    image_latent=image_latent,
                    image_embs=image_embs,
                    output_labels=False,
                )["logits"][:, -1]
                # rows already finished (ended with EOS or PAD) keep emitting PAD
                mask = (out[:, -1] == eos_token_id) | (out[:, -1] == pad_token_id)
                sample = torch.ones((out.shape[0], 1), device=device, dtype=torch.long) * pad_token_id

                if mask.all():
                    if not fixed_output_length:
                        break
                else:
                    logits = logits[~mask, :]
                    filtered_logits = logit_processor(x[~mask, :], logits)
                    filtered_logits = logit_warper(x[~mask, :], filtered_logits)
                    probs = F.softmax(filtered_logits / temperature, dim=-1)

                    # force EOS on the final step so sequences terminate cleanly
                    if (cur_len + 1 == seq_len):
                        sample[~mask, :] = torch.ones((sum(~mask), 1), device=device, dtype=torch.long) * eos_token_id
                    else:
                        sample[~mask, :] = torch.multinomial(probs, 1)

                out = torch.cat((out, sample), dim=-1)

                cur_len += 1

                if all(stopping_criteria(out, None)):
                    break

            if num_dims == 1:
                out = out.squeeze(0)

            self.train(was_training)
            return out

    def _generate_beamsearch(
            self,
            image_inputs,
            pad_token_id=None,
            eos_token_id=None,
            sot_token_id=None,
            num_beams=6,
            num_beam_groups=3,
            min_seq_len=5,
            stopping_criteria=None,
            logit_processor=None,
            logit_warper=None,
    ):
        """Diverse (grouped) beam search decoding, adapted from HuggingFace's
        group_beam_search. Returns the finalized sequences tensor.
        """
        device = image_inputs.device
        batch_size = image_inputs.shape[0]
        # replicate each image num_beams times so every beam decodes against it
        image_inputs = torch.repeat_interleave(image_inputs, num_beams, dim=0)
        image_latent, image_embs = self._encode_image(image_inputs)

        input_ids = torch.ones((batch_size * num_beams, 1), device=device, dtype=torch.long)
        input_ids = input_ids * sot_token_id
        beam_scorer = BeamSearchScorer(
            batch_size=batch_size,
            num_beams=num_beams,
            device=device,
            num_beam_groups=num_beam_groups,
        )
        # instantiate logits processors
        logits_processor = (
            LogitsProcessorList([MinLengthLogitsProcessor(min_seq_len, eos_token_id=eos_token_id)])
            if logit_processor is None
            else logit_processor
        )

        num_beams = beam_scorer.num_beams
        num_beam_groups = beam_scorer.num_beam_groups
        num_sub_beams = num_beams // num_beam_groups
        batch_size = len(beam_scorer._beam_hyps) // num_beam_groups
        batch_beam_size, cur_len = input_ids.shape
        beam_indices = None

        if num_beams * batch_size != batch_beam_size:
            raise ValueError(
                f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
            )

        beam_scores = torch.full((batch_size, num_beams), -1e9, dtype=torch.float, device=device)
        # initialise score of first beam of each group with 0 and the rest with -1e9.
        # This ensures that the beams in the same group don't produce same tokens every time.
        beam_scores[:, ::num_sub_beams] = 0
        beam_scores = beam_scores.view((batch_size * num_beams,))

        while True:

            # predicted tokens in cur_len step
            current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device)

            # indices which will form the beams in the next time step
            reordering_indices = torch.zeros(batch_size * num_beams, dtype=torch.long, device=device)

            # do one decoder step on all beams of all sentences in batch
            model_inputs = prepare_inputs_for_generation(input_ids=input_ids, image_inputs=image_inputs)
            outputs = self(
                model_inputs['images'],
                model_inputs['text'],
                image_latent=image_latent,
                image_embs=image_embs,
                output_labels=False,
            )

            for beam_group_idx in range(num_beam_groups):
                group_start_idx = beam_group_idx * num_sub_beams
                group_end_idx = min(group_start_idx + num_sub_beams, num_beams)
                group_size = group_end_idx - group_start_idx

                # indices of beams of current group among all sentences in batch
                batch_group_indices = []

                for batch_idx in range(batch_size):
                    batch_group_indices.extend(
                        [batch_idx * num_beams + idx for idx in range(group_start_idx, group_end_idx)]
                    )
                group_input_ids = input_ids[batch_group_indices]

                # select outputs of beams of current group only
                next_token_logits = outputs['logits'][batch_group_indices, -1, :]
                vocab_size = next_token_logits.shape[-1]

                next_token_scores_processed = logits_processor(
                    group_input_ids, next_token_logits, current_tokens=current_tokens, beam_group_idx=beam_group_idx
                )
                next_token_scores = next_token_scores_processed + beam_scores[batch_group_indices].unsqueeze(-1)
                next_token_scores = next_token_scores.expand_as(next_token_scores_processed)

                # reshape for beam search
                next_token_scores = next_token_scores.view(batch_size, group_size * vocab_size)

                # keep 2 * group_size candidates so finished (EOS) beams can be replaced
                next_token_scores, next_tokens = torch.topk(
                    next_token_scores, 2 * group_size, dim=1, largest=True, sorted=True
                )

                next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
                next_tokens = next_tokens % vocab_size

                # stateless
                process_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
                beam_outputs = beam_scorer.process(
                    group_input_ids,
                    next_token_scores,
                    next_tokens,
                    next_indices,
                    pad_token_id=pad_token_id,
                    eos_token_id=eos_token_id,
                    beam_indices=process_beam_indices,
                    group_index=beam_group_idx,
                )
                beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"]
                beam_next_tokens = beam_outputs["next_beam_tokens"]
                beam_idx = beam_outputs["next_beam_indices"]

                input_ids[batch_group_indices] = group_input_ids[beam_idx]
                group_input_ids = torch.cat([group_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
                current_tokens[batch_group_indices] = group_input_ids[:, -1]

                # (beam_idx // group_size) -> batch_idx
                # (beam_idx % group_size) -> offset of idx inside the group
                reordering_indices[batch_group_indices] = (
                    num_beams * torch.div(beam_idx, group_size, rounding_mode="floor") + group_start_idx + (beam_idx % group_size)
                )

            input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1)

            # increase cur_len
            cur_len = cur_len + 1
            if beam_scorer.is_done or all(stopping_criteria(input_ids, None)):
                break

        final_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
        sequence_outputs = beam_scorer.finalize(
            input_ids,
            beam_scores,
            next_tokens,
            next_indices,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            max_length=stopping_criteria.max_length,
            beam_indices=final_beam_indices,
        )
        return sequence_outputs['sequences']
479
+
480
+
481
def prepare_inputs_for_generation(input_ids, image_inputs, past=None, **kwargs):
    """Assemble one decoding step's model inputs (HF GenerationMixin style).

    When ``past`` is truthy only the newest token is forwarded. Position ids
    are derived from ``attention_mask`` only when the caller did not supply
    them; NOTE: when that derivation does not apply, any caller-provided
    position_ids are discarded (set to None) — kept for compatibility.
    """
    if past:
        input_ids = input_ids[:, -1].unsqueeze(-1)

    attention_mask = kwargs.get("attention_mask", None)
    position_ids = kwargs.get("position_ids", None)

    can_derive_positions = attention_mask is not None and position_ids is None
    if can_derive_positions:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
    else:
        position_ids = None

    model_inputs = {
        "text": input_ids,
        "images": image_inputs,
        "past_key_values": past,
        "position_ids": position_ids,
        "attention_mask": attention_mask,
    }
    return model_inputs
open_clip/src/open_clip/loss.py ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.nn import functional as F
6
+
7
+ try:
8
+ import torch.distributed.nn
9
+ from torch import distributed as dist
10
+
11
+ has_distributed = True
12
+ except ImportError:
13
+ has_distributed = False
14
+
15
+ try:
16
+ import horovod.torch as hvd
17
+ except ImportError:
18
+ hvd = None
19
+
20
+
21
def gather_features(
        image_features,
        text_features,
        local_loss=False,
        gather_with_grad=False,
        rank=0,
        world_size=1,
        use_horovod=False
):
    """All-gather image/text feature tensors across distributed ranks.

    Args:
        image_features / text_features: local per-rank feature tensors.
        local_loss: when True, the gathered tensors do not need the local
            rank's slice replaced (local features are used separately).
        gather_with_grad: gather through autograd-aware collectives so
            gradients flow to all ranks.
        rank / world_size: this process's rank and total process count.
        use_horovod: use horovod's allgather instead of torch.distributed.

    Returns:
        (all_image_features, all_text_features), each of size
        world_size * local batch along dim 0.
    """
    assert has_distributed, 'torch.distributed did not import correctly, please use a PyTorch version with support.'
    if use_horovod:
        assert hvd is not None, 'Please install horovod'
        if gather_with_grad:
            all_image_features = hvd.allgather(image_features)
            all_text_features = hvd.allgather(text_features)
        else:
            with torch.no_grad():
                all_image_features = hvd.allgather(image_features)
                all_text_features = hvd.allgather(text_features)
            if not local_loss:
                # ensure grads for local rank when all_* features don't have a gradient
                gathered_image_features = list(all_image_features.chunk(world_size, dim=0))
                gathered_text_features = list(all_text_features.chunk(world_size, dim=0))
                gathered_image_features[rank] = image_features
                gathered_text_features[rank] = text_features
                all_image_features = torch.cat(gathered_image_features, dim=0)
                all_text_features = torch.cat(gathered_text_features, dim=0)
    else:
        # We gather tensors from all gpus
        if gather_with_grad:
            all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0)
            all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0)
        else:
            gathered_image_features = [torch.zeros_like(image_features) for _ in range(world_size)]
            gathered_text_features = [torch.zeros_like(text_features) for _ in range(world_size)]
            dist.all_gather(gathered_image_features, image_features)
            dist.all_gather(gathered_text_features, text_features)
            if not local_loss:
                # ensure grads for local rank when all_* features don't have a gradient
                gathered_image_features[rank] = image_features
                gathered_text_features[rank] = text_features
            all_image_features = torch.cat(gathered_image_features, dim=0)
            all_text_features = torch.cat(gathered_text_features, dim=0)

    return all_image_features, all_text_features
66
+
67
+
68
class ClipLoss(nn.Module):
    """Bidirectional InfoNCE loss for CLIP-style contrastive training.

    The image->text and text->image cross-entropy terms are averaged. In
    distributed runs (world_size > 1) features are gathered across ranks
    before computing logits; with ``local_loss`` only the local rows of the
    logit matrix are used, and the targets are offset by rank accordingly.
    """

    def __init__(
            self,
            local_loss=False,
            gather_with_grad=False,
            cache_labels=False,
            rank=0,
            world_size=1,
            use_horovod=False,
    ):
        super().__init__()
        self.local_loss = local_loss
        self.gather_with_grad = gather_with_grad
        self.cache_labels = cache_labels
        self.rank = rank
        self.world_size = world_size
        self.use_horovod = use_horovod

        # label cache: device -> target tensor, valid while num_logits is unchanged
        self.prev_num_logits = 0
        self.labels = {}

    def get_ground_truth(self, device, num_logits) -> torch.Tensor:
        """Return diagonal targets 0..num_logits-1, cached per device when enabled."""
        if self.prev_num_logits == num_logits and device in self.labels:
            return self.labels[device]
        labels = torch.arange(num_logits, device=device, dtype=torch.long)
        if self.world_size > 1 and self.local_loss:
            # each rank's positives sit in its own slice of the gathered features
            labels = labels + num_logits * self.rank
        if self.cache_labels:
            self.labels[device] = labels
            self.prev_num_logits = num_logits
        return labels

    def get_logits(self, image_features, text_features, logit_scale):
        """Compute scaled cosine-similarity logits in both directions."""
        if self.world_size <= 1:
            logits_per_image = logit_scale * image_features @ text_features.T
            logits_per_text = logit_scale * text_features @ image_features.T
            return logits_per_image, logits_per_text

        all_image_features, all_text_features = gather_features(
            image_features,
            text_features,
            local_loss=self.local_loss,
            gather_with_grad=self.gather_with_grad,
            rank=self.rank,
            world_size=self.world_size,
            use_horovod=self.use_horovod,
        )
        if self.local_loss:
            # local rows vs. globally gathered columns
            logits_per_image = logit_scale * image_features @ all_text_features.T
            logits_per_text = logit_scale * text_features @ all_image_features.T
        else:
            logits_per_image = logit_scale * all_image_features @ all_text_features.T
            logits_per_text = logits_per_image.T
        return logits_per_image, logits_per_text

    def forward(self, image_features, text_features, logit_scale, output_dict=False):
        """Average of the image->text and text->image cross-entropy losses."""
        logits_per_image, logits_per_text = self.get_logits(image_features, text_features, logit_scale)
        labels = self.get_ground_truth(image_features.device, logits_per_image.shape[0])

        i2t_loss = F.cross_entropy(logits_per_image, labels)
        t2i_loss = F.cross_entropy(logits_per_text, labels)
        total_loss = (i2t_loss + t2i_loss) / 2

        return {"contrastive_loss": total_loss} if output_dict else total_loss
140
+
141
+
142
class CoCaLoss(ClipLoss):
    """CoCa training loss: weighted contrastive (CLIP) + captioning terms.

    The captioning term is token-level cross entropy over the decoder
    logits, with pad tokens ignored via ``ignore_index``.
    """

    def __init__(
            self,
            caption_loss_weight,
            clip_loss_weight,
            pad_id=0,  # pad_token for open_clip custom tokenizer
            local_loss=False,
            gather_with_grad=False,
            cache_labels=False,
            rank=0,
            world_size=1,
            use_horovod=False,
    ):
        super().__init__(
            local_loss=local_loss,
            gather_with_grad=gather_with_grad,
            cache_labels=cache_labels,
            rank=rank,
            world_size=world_size,
            use_horovod=use_horovod
        )
        self.clip_loss_weight = clip_loss_weight
        self.caption_loss_weight = caption_loss_weight
        self.caption_loss = nn.CrossEntropyLoss(ignore_index=pad_id)

    def forward(self, image_features, text_features, logits, labels, logit_scale, output_dict=False):
        """Return (clip_loss, caption_loss), or a dict when output_dict is True."""
        if self.clip_loss_weight:
            clip_loss = self.clip_loss_weight * super().forward(image_features, text_features, logit_scale)
        else:
            # contrastive term disabled; keep a zero tensor so callers can still sum losses
            clip_loss = torch.tensor(0, device=logits.device)

        # CrossEntropyLoss expects (batch, vocab, seq) — move the vocab dim forward
        caption_loss = self.caption_loss_weight * self.caption_loss(
            logits.permute(0, 2, 1),
            labels,
        )

        if output_dict:
            return {"contrastive_loss": clip_loss, "caption_loss": caption_loss}

        return clip_loss, caption_loss
185
+
186
+
187
class DistillClipLoss(ClipLoss):
    """ClipLoss plus a distillation term against a teacher model's logits."""

    def dist_loss(self, teacher_logits, student_logits):
        """Soft cross entropy of student logits against teacher probabilities."""
        teacher_probs = teacher_logits.softmax(dim=1)
        student_logp = student_logits.log_softmax(dim=1)
        return -(teacher_probs * student_logp).sum(dim=1).mean(dim=0)

    def forward(
            self,
            image_features,
            text_features,
            logit_scale,
            dist_image_features,
            dist_text_features,
            dist_logit_scale,
            output_dict=False,
    ):
        """Return (contrastive_loss, distill_loss), or a dict when output_dict is True."""
        logits_per_image, logits_per_text = self.get_logits(
            image_features, text_features, logit_scale)
        dist_logits_per_image, dist_logits_per_text = self.get_logits(
            dist_image_features, dist_text_features, dist_logit_scale)

        labels = self.get_ground_truth(image_features.device, logits_per_image.shape[0])

        contrastive_loss = 0.5 * (
            F.cross_entropy(logits_per_image, labels)
            + F.cross_entropy(logits_per_text, labels)
        )

        distill_loss = 0.5 * (
            self.dist_loss(dist_logits_per_image, logits_per_image)
            + self.dist_loss(dist_logits_per_text, logits_per_text)
        )

        if output_dict:
            return {"contrastive_loss": contrastive_loss, "distill_loss": distill_loss}

        return contrastive_loss, distill_loss
224
+
225
+
226
def neighbour_exchange(from_rank, to_rank, tensor, group=None):
    """Send ``tensor`` to ``to_rank`` while receiving a same-shaped tensor from ``from_rank``.

    Both transfers are issued as one batched P2P round; blocks until complete.
    """
    tensor_recv = torch.zeros_like(tensor)
    ops = [
        torch.distributed.P2POp(torch.distributed.isend, tensor, to_rank, group=group),
        torch.distributed.P2POp(torch.distributed.irecv, tensor_recv, from_rank, group=group),
    ]
    for req in torch.distributed.batch_isend_irecv(ops):
        req.wait()
    return tensor_recv
244
+
245
+
246
def neighbour_exchange_bidir(left_rank, right_rank, tensor_to_left, tensor_to_right, group=None):
    """Exchange tensors with both ring neighbours in one batched P2P round.

    Sends ``tensor_to_left`` / ``tensor_to_right`` to the respective neighbours
    and returns ``(tensor_from_right, tensor_from_left)`` received from them.
    Note the receive buffers mirror the opposite-direction send shapes.
    """
    isend = torch.distributed.isend
    irecv = torch.distributed.irecv
    tensor_from_left = torch.zeros_like(tensor_to_right)
    tensor_from_right = torch.zeros_like(tensor_to_left)
    # Op order (sends first: right then left; then matching recvs) is kept
    # identical across ranks so the batched P2P ops pair up correctly.
    ops = [
        torch.distributed.P2POp(isend, tensor_to_right, right_rank, group=group),
        torch.distributed.P2POp(isend, tensor_to_left, left_rank, group=group),
        torch.distributed.P2POp(irecv, tensor_from_right, right_rank, group=group),
        torch.distributed.P2POp(irecv, tensor_from_left, left_rank, group=group),
    ]
    for req in torch.distributed.batch_isend_irecv(ops):
        req.wait()
    return tensor_from_right, tensor_from_left
277
+
278
+
279
class NeighbourExchange(torch.autograd.Function):
    """Autograd-aware single-direction neighbour exchange.

    Forward sends the tensor to ``to_rank`` and receives from ``from_rank``;
    backward routes the incoming gradient back along the reverse direction.
    """

    @staticmethod
    def forward(ctx, from_rank, to_rank, group, tensor):
        # Stash routing info so backward can send gradients the opposite way.
        ctx.group = group
        ctx.from_rank = from_rank
        ctx.to_rank = to_rank
        return neighbour_exchange(from_rank, to_rank, tensor, group=group)

    @staticmethod
    def backward(ctx, grad_output):
        # The first three inputs (ranks, group) are non-differentiable; the
        # tensor's gradient is exchanged with ranks swapped relative to forward.
        return (None, None, None) + (NeighbourExchange.apply(ctx.to_rank, ctx.from_rank, ctx.group, grad_output),)
290
+
291
+
292
def neighbour_exchange_with_grad(from_rank, to_rank, tensor, group=None):
    """Differentiable wrapper around :func:`neighbour_exchange`."""
    return NeighbourExchange.apply(from_rank, to_rank, group, tensor)
294
+
295
+
296
class NeighbourExchangeBidir(torch.autograd.Function):
    """Autograd-aware bidirectional neighbour exchange.

    Forward swaps a pair of tensors with the left/right neighbours; backward
    sends the two gradients back with the ring direction reversed.
    """

    @staticmethod
    def forward(ctx, left_rank, right_rank, group, tensor_to_left, tensor_to_right):
        # Save routing info so backward can reverse the exchange.
        ctx.group = group
        ctx.left_rank = left_rank
        ctx.right_rank = right_rank
        return neighbour_exchange_bidir(left_rank, right_rank, tensor_to_left, tensor_to_right, group=group)

    @staticmethod
    def backward(ctx, *grad_outputs):
        # Rank/group inputs receive no gradient; the two tensor gradients are
        # exchanged with left/right swapped relative to the forward pass.
        return (None, None, None) + \
            NeighbourExchangeBidir.apply(ctx.right_rank, ctx.left_rank, ctx.group, *grad_outputs)
308
+
309
+
310
def neighbour_exchange_bidir_with_grad(left_rank, right_rank, tensor_to_left, tensor_to_right, group=None):
    """Differentiable wrapper around :func:`neighbour_exchange_bidir`."""
    return NeighbourExchangeBidir.apply(left_rank, right_rank, group, tensor_to_left, tensor_to_right)
312
+
313
+
314
class SigLipLoss(nn.Module):
    """ Sigmoid Loss for Language Image Pre-Training (SigLIP) - https://arxiv.org/abs/2303.15343

    @article{zhai2023sigmoid,
        title={Sigmoid loss for language image pre-training},
        author={Zhai, Xiaohua and Mustafa, Basil and Kolesnikov, Alexander and Beyer, Lucas},
        journal={arXiv preprint arXiv:2303.15343},
        year={2023}
    }
    """
    def __init__(
            self,
            cache_labels: bool = False,
            rank: int = 0,
            world_size: int = 1,
            dist_impl: Optional[str] = None,
    ):
        # cache_labels: kept for interface parity; the label cache is not used yet (see FIXME below).
        # rank / world_size: this process's position in the distributed group.
        # dist_impl: strategy for collecting cross-rank negatives
        #   ('bidir', 'shift', 'reduce', 'gather').
        super().__init__()
        self.cache_labels = cache_labels
        self.rank = rank
        self.world_size = world_size
        self.dist_impl = dist_impl or 'bidir'  # default to bidir exchange for now, this will likely change
        assert self.dist_impl in ('bidir', 'shift', 'reduce', 'gather')

        # cache state FIXME cache not currently used, worthwhile?
        self.prev_num_logits = 0
        self.labels = {}

    def get_ground_truth(self, device, dtype, num_logits, negative_only=False) -> torch.Tensor:
        # Sigmoid-loss targets: -1 everywhere (negatives); +1 on the diagonal for
        # matched image/text pairs unless this block holds only negatives.
        labels = -torch.ones((num_logits, num_logits), device=device, dtype=dtype)
        if not negative_only:
            labels = 2 * torch.eye(num_logits, device=device, dtype=dtype) + labels
        return labels

    def get_logits(self, image_features, text_features, logit_scale, logit_bias=None):
        # Scaled pairwise similarity matrix, optionally shifted by a learned bias.
        logits = logit_scale * image_features @ text_features.T
        if logit_bias is not None:
            logits += logit_bias
        return logits

    def _loss(self, image_features, text_features, logit_scale, logit_bias=None, negative_only=False):
        # Pairwise sigmoid loss over all image/text pairs, normalized by the
        # local batch size (first dim of image_features).
        logits = self.get_logits(image_features, text_features, logit_scale, logit_bias)
        labels = self.get_ground_truth(
            image_features.device,
            image_features.dtype,
            image_features.shape[0],
            negative_only=negative_only,
        )
        loss = -F.logsigmoid(labels * logits).sum() / image_features.shape[0]
        return loss

    def forward(self, image_features, text_features, logit_scale, logit_bias, output_dict=False):
        # Local positives + local negatives first; cross-rank negatives are added below.
        loss = self._loss(image_features, text_features, logit_scale, logit_bias)

        if self.world_size > 1:
            if self.dist_impl == 'bidir':
                # Ring exchange in both directions: each step swaps text chunks
                # with both neighbours and scores them against local images as
                # negatives, halving the number of exchange rounds needed.
                right_rank = (self.rank + 1) % self.world_size
                left_rank = (self.rank - 1 + self.world_size) % self.world_size
                text_features_to_right = text_features_to_left = text_features
                num_bidir, remainder = divmod(self.world_size - 1, 2)
                for i in range(num_bidir):
                    text_features_recv = neighbour_exchange_bidir_with_grad(
                        left_rank,
                        right_rank,
                        text_features_to_left,
                        text_features_to_right,
                    )
                    for f in text_features_recv:
                        loss += self._loss(
                            image_features,
                            f,
                            logit_scale,
                            logit_bias,
                            negative_only=True,
                        )
                    text_features_to_left, text_features_to_right = text_features_recv

                if remainder:
                    # Even world size leaves one chunk: do a final single-direction hop.
                    text_features_recv = neighbour_exchange_with_grad(
                        left_rank,
                        right_rank,
                        text_features_to_right
                    )
                    loss += self._loss(
                        image_features,
                        text_features_recv,
                        logit_scale,
                        logit_bias,
                        negative_only=True,
                    )
            elif self.dist_impl == "shift":
                # Single-direction ring: shift text features one hop per step until
                # every rank has seen every other rank's chunk.
                right_rank = (self.rank + 1) % self.world_size
                left_rank = (self.rank - 1 + self.world_size) % self.world_size
                text_features_to_right = text_features
                for i in range(self.world_size - 1):
                    text_features_from_left = neighbour_exchange_with_grad(
                        left_rank,
                        right_rank,
                        text_features_to_right,
                    )
                    loss += self._loss(
                        image_features,
                        text_features_from_left,
                        logit_scale,
                        logit_bias,
                        negative_only=True,
                    )
                    text_features_to_right = text_features_from_left
            elif self.dist_impl == "reduce":
                # Broadcast-by-all-reduce: rank i's features are isolated by zeroing
                # every other rank's contribution before the SUM reduction.
                for i in range(self.world_size):
                    text_from_other = torch.distributed.nn.all_reduce(
                        text_features * (self.rank == i),
                        torch.distributed.ReduceOp.SUM,
                    )
                    # Skip the local chunk (already counted in the initial loss term).
                    loss += float(i != self.rank) * self._loss(
                        image_features,
                        text_from_other,
                        logit_scale,
                        logit_bias,
                        negative_only=True,
                    )
            elif self.dist_impl == "gather":
                # Gather all text features once, then score every non-local chunk.
                all_text = torch.distributed.nn.all_gather(text_features)
                for i in range(self.world_size):
                    loss += float(i != self.rank) * self._loss(
                        image_features,
                        all_text[i],
                        logit_scale,
                        logit_bias,
                        negative_only=True,
                    )
            else:
                assert False

        return {"contrastive_loss": loss} if output_dict else loss
open_clip/src/open_clip/model_configs/EVA01-g-14.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "timm_model_name": "eva_giant_patch14_224",
6
+ "timm_model_pretrained": false,
7
+ "timm_pool": "token",
8
+ "timm_proj": null
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 768,
14
+ "heads": 12,
15
+ "layers": 12
16
+ },
17
+ "custom_text": true
18
+ }
open_clip/src/open_clip/model_configs/EVA02-E-14.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "timm_model_name": "eva02_enormous_patch14_clip_224",
6
+ "timm_model_pretrained": false,
7
+ "timm_pool": "token",
8
+ "timm_proj": null
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 1024,
14
+ "heads": 16,
15
+ "layers": 24
16
+ },
17
+ "custom_text": true
18
+ }
open_clip/src/open_clip/model_configs/EVA02-L-14.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "timm_model_name": "eva02_large_patch14_clip_224",
6
+ "timm_model_pretrained": false,
7
+ "timm_pool": "token",
8
+ "timm_proj": null
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 768,
14
+ "heads": 12,
15
+ "layers": 12
16
+ },
17
+ "custom_text": true
18
+ }
open_clip/src/open_clip/model_configs/MobileCLIP-B.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vit_base_mci_224",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "token",
7
+ "timm_proj": null,
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.0,
10
+ "image_size": 224
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 512,
16
+ "heads": 8,
17
+ "layers": 12,
18
+ "no_causal_mask": false
19
+ },
20
+ "custom_text": true
21
+ }
open_clip/src/open_clip/model_configs/MobileCLIP-S1.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "timm_model_name": "fastvit_mci1",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "avg",
7
+ "timm_proj": null,
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.0,
10
+ "image_size": 256
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 512,
16
+ "heads": 8,
17
+ "layers": 12,
18
+ "no_causal_mask": true
19
+ },
20
+ "custom_text": true
21
+ }
open_clip/src/open_clip/model_configs/MobileCLIP-S2.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "timm_model_name": "fastvit_mci2",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "avg",
7
+ "timm_proj": null,
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.0,
10
+ "image_size": 256
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 512,
16
+ "heads": 8,
17
+ "layers": 12,
18
+ "no_causal_mask": true
19
+ },
20
+ "custom_text": true
21
+ }
open_clip/src/open_clip/model_configs/RN101-quickgelu.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 224,
6
+ "layers": [
7
+ 3,
8
+ 4,
9
+ 23,
10
+ 3
11
+ ],
12
+ "width": 64,
13
+ "patch_size": null
14
+ },
15
+ "text_cfg": {
16
+ "context_length": 77,
17
+ "vocab_size": 49408,
18
+ "width": 512,
19
+ "heads": 8,
20
+ "layers": 12
21
+ }
22
+ }
open_clip/src/open_clip/model_configs/RN50-quickgelu.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 224,
6
+ "layers": [
7
+ 3,
8
+ 4,
9
+ 6,
10
+ 3
11
+ ],
12
+ "width": 64,
13
+ "patch_size": null
14
+ },
15
+ "text_cfg": {
16
+ "context_length": 77,
17
+ "vocab_size": 49408,
18
+ "width": 512,
19
+ "heads": 8,
20
+ "layers": 12
21
+ }
22
+ }
open_clip/src/open_clip/model_configs/RN50.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": [
6
+ 3,
7
+ 4,
8
+ 6,
9
+ 3
10
+ ],
11
+ "width": 64,
12
+ "patch_size": null
13
+ },
14
+ "text_cfg": {
15
+ "context_length": 77,
16
+ "vocab_size": 49408,
17
+ "width": 512,
18
+ "heads": 8,
19
+ "layers": 12
20
+ }
21
+ }
open_clip/src/open_clip/model_configs/RN50x64-quickgelu.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 448,
6
+ "layers": [
7
+ 3,
8
+ 15,
9
+ 36,
10
+ 10
11
+ ],
12
+ "width": 128,
13
+ "patch_size": null
14
+ },
15
+ "text_cfg": {
16
+ "context_length": 77,
17
+ "vocab_size": 49408,
18
+ "width": 1024,
19
+ "heads": 16,
20
+ "layers": 12
21
+ }
22
+ }
open_clip/src/open_clip/model_configs/ViT-B-16-SigLIP-384.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 384,
7
+ "timm_model_name": "vit_base_patch16_siglip_384",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 768,
20
+ "heads": 12,
21
+ "layers": 12,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ }
28
+ }
29
+ }
open_clip/src/open_clip/model_configs/ViT-B-16-SigLIP-512.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 512,
7
+ "timm_model_name": "vit_base_patch16_siglip_512",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 768,
20
+ "heads": 12,
21
+ "layers": 12,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ }
28
+ }
29
+ }
open_clip/src/open_clip/model_configs/ViT-B-16-SigLIP.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 224,
7
+ "timm_model_name": "vit_base_patch16_siglip_224",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 768,
20
+ "heads": 12,
21
+ "layers": 12,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ }
28
+ }
29
+ }
open_clip/src/open_clip/model_configs/ViT-B-32-256.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "image_size": 256,
5
+ "layers": 12,
6
+ "width": 768,
7
+ "patch_size": 32
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 512,
13
+ "heads": 8,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-H-14-378.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 378,
5
+ "layers": 32,
6
+ "width": 1280,
7
+ "head_width": 80,
8
+ "patch_size": 14
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 1024,
14
+ "heads": 16,
15
+ "layers": 24
16
+ }
17
+ }
open_clip/src/open_clip/model_configs/ViT-H-14-CLIPA-336.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 336,
5
+ "layers": 32,
6
+ "width": 1280,
7
+ "head_width": 80,
8
+ "patch_size": 14,
9
+ "no_ln_pre": true,
10
+ "pool_type": "avg",
11
+ "final_ln_after_pool": true
12
+ },
13
+ "text_cfg": {
14
+ "context_length": 32,
15
+ "vocab_size": 32000,
16
+ "hf_tokenizer_name": "bert-base-uncased",
17
+ "tokenizer_kwargs": {
18
+ "strip_sep_token": true
19
+ },
20
+ "width": 1024,
21
+ "heads": 16,
22
+ "layers": 24,
23
+ "pool_type": "last",
24
+ "no_causal_mask": true
25
+ }
26
+ }
open_clip/src/open_clip/model_configs/ViT-H-14-quickgelu.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 224,
6
+ "layers": 32,
7
+ "width": 1280,
8
+ "head_width": 80,
9
+ "patch_size": 14
10
+ },
11
+ "text_cfg": {
12
+ "context_length": 77,
13
+ "vocab_size": 49408,
14
+ "width": 1024,
15
+ "heads": 16,
16
+ "layers": 24
17
+ }
18
+ }
open_clip/src/open_clip/model_configs/ViT-H-14.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 32,
6
+ "width": 1280,
7
+ "head_width": 80,
8
+ "patch_size": 14
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 1024,
14
+ "heads": 16,
15
+ "layers": 24
16
+ }
17
+ }
open_clip/src/open_clip/model_configs/ViT-L-14-280.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 280,
5
+ "layers": 24,
6
+ "width": 1024,
7
+ "patch_size": 14
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 768,
13
+ "heads": 12,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-L-14-336-quickgelu.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 336,
6
+ "layers": 24,
7
+ "width": 1024,
8
+ "patch_size": 14
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 768,
14
+ "heads": 12,
15
+ "layers": 12
16
+ }
17
+ }
open_clip/src/open_clip/model_configs/ViT-L-14-quickgelu.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 224,
6
+ "layers": 24,
7
+ "width": 1024,
8
+ "patch_size": 14
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 768,
14
+ "heads": 12,
15
+ "layers": 12
16
+ }
17
+ }
open_clip/src/open_clip/model_configs/ViT-L-14.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 24,
6
+ "width": 1024,
7
+ "patch_size": 14
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 768,
13
+ "heads": 12,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-L-16-320.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "image_size": 320,
5
+ "layers": 24,
6
+ "width": 1024,
7
+ "patch_size": 16
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 768,
13
+ "heads": 12,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-L-16-SigLIP-384.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 384,
7
+ "timm_model_name": "vit_large_patch16_siglip_384",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 1024,
20
+ "heads": 16,
21
+ "layers": 24,
22
+ "no_causal_mask": true,
23
+ "proj_bias": true,
24
+ "pool_type": "last",
25
+ "norm_kwargs":{
26
+ "eps": 1e-6
27
+ }
28
+ }
29
+ }
open_clip/src/open_clip/model_configs/ViT-M-16-alt.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 384,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 12,
6
+ "width": 512,
7
+ "patch_size": 16,
8
+ "ls_init_value": 1e-4
9
+ },
10
+ "text_cfg": {
11
+ "context_length": 77,
12
+ "vocab_size": 49408,
13
+ "width": 384,
14
+ "heads": 6,
15
+ "layers": 12
16
+ }
17
+ }
open_clip/src/open_clip/model_configs/ViT-M-32-alt.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 384,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 12,
6
+ "width": 512,
7
+ "patch_size": 32
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 384,
13
+ "heads": 6,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-M-32.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 12,
6
+ "width": 512,
7
+ "patch_size": 32
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 512,
13
+ "heads": 8,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-S-16.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 384,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 12,
6
+ "width": 384,
7
+ "patch_size": 16
8
+ },
9
+ "text_cfg": {
10
+ "context_length": 77,
11
+ "vocab_size": 49408,
12
+ "width": 384,
13
+ "heads": 6,
14
+ "layers": 12
15
+ }
16
+ }
open_clip/src/open_clip/model_configs/ViT-SO400M-14-SigLIP-378.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1152,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 378,
7
+ "timm_model_name": "vit_so400m_patch14_siglip_378",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 1152,
20
+ "heads": 16,
21
+ "layers": 27,
22
+ "mlp_ratio": 3.7362,
23
+ "no_causal_mask": true,
24
+ "proj_bias": true,
25
+ "pool_type": "last",
26
+ "norm_kwargs":{
27
+ "eps": 1e-6
28
+ }
29
+ }
30
+ }
open_clip/src/open_clip/model_configs/ViT-SO400M-14-SigLIP-384.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1152,
3
+ "init_logit_bias": -10,
4
+ "custom_text": true,
5
+ "vision_cfg": {
6
+ "image_size": 384,
7
+ "timm_model_name": "vit_so400m_patch14_siglip_384",
8
+ "timm_model_pretrained": false,
9
+ "timm_pool": "map",
10
+ "timm_proj": "none"
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 64,
14
+ "vocab_size": 32000,
15
+ "hf_tokenizer_name": "timm/ViT-B-16-SigLIP",
16
+ "tokenizer_kwargs": {
17
+ "clean": "canonicalize"
18
+ },
19
+ "width": 1152,
20
+ "heads": 16,
21
+ "layers": 27,
22
+ "mlp_ratio": 3.7362,
23
+ "no_causal_mask": true,
24
+ "proj_bias": true,
25
+ "pool_type": "last",
26
+ "norm_kwargs":{
27
+ "eps": 1e-6
28
+ }
29
+ }
30
+ }
open_clip/src/open_clip/model_configs/ViT-bigG-14-CLIPA.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1280,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 48,
6
+ "width": 1664,
7
+ "head_width": 104,
8
+ "mlp_ratio": 4.9231,
9
+ "patch_size": 14,
10
+ "no_ln_pre": true,
11
+ "pool_type": "avg",
12
+ "final_ln_after_pool": true
13
+ },
14
+ "text_cfg": {
15
+ "context_length": 32,
16
+ "vocab_size": 32000,
17
+ "hf_tokenizer_name": "bert-base-uncased",
18
+ "tokenizer_kwargs": {
19
+ "strip_sep_token": true
20
+ },
21
+ "width": 1280,
22
+ "heads": 20,
23
+ "layers": 32,
24
+ "pool_type": "last",
25
+ "no_causal_mask": true
26
+ }
27
+ }
open_clip/src/open_clip/model_configs/ViT-bigG-14-quickgelu.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1280,
3
+ "quick_gelu": true,
4
+ "vision_cfg": {
5
+ "image_size": 224,
6
+ "layers": 48,
7
+ "width": 1664,
8
+ "head_width": 104,
9
+ "mlp_ratio": 4.9231,
10
+ "patch_size": 14
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 1280,
16
+ "heads": 20,
17
+ "layers": 32
18
+ }
19
+ }
open_clip/src/open_clip/model_configs/ViT-bigG-14.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1280,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 48,
6
+ "width": 1664,
7
+ "head_width": 104,
8
+ "mlp_ratio": 4.9231,
9
+ "patch_size": 14
10
+ },
11
+ "text_cfg": {
12
+ "context_length": 77,
13
+ "vocab_size": 49408,
14
+ "width": 1280,
15
+ "heads": 20,
16
+ "layers": 32
17
+ }
18
+ }
open_clip/src/open_clip/model_configs/ViT-e-14.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1280,
3
+ "vision_cfg": {
4
+ "image_size": 224,
5
+ "layers": 56,
6
+ "width": 1792,
7
+ "head_width": 112,
8
+ "mlp_ratio": 8.5715,
9
+ "patch_size": 14
10
+ },
11
+ "text_cfg": {
12
+ "context_length": 77,
13
+ "vocab_size": 49408,
14
+ "width": 1280,
15
+ "heads": 20,
16
+ "layers": 36
17
+ }
18
+ }
open_clip/src/open_clip/model_configs/ViTamin-B-LTT.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_base_224",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 224
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 768,
16
+ "heads": 12,
17
+ "layers": 12
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-B.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 512,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_base_224",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 224
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 512,
16
+ "heads": 8,
17
+ "layers": 12
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-L-336.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_large_336",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 336
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 768,
16
+ "heads": 12,
17
+ "layers": 12
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-L-384.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_large_384",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 384
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 768,
16
+ "heads": 12,
17
+ "layers": 12
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-L.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_large_224",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 224
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 768,
16
+ "heads": 12,
17
+ "layers": 12
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-L2-256.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_large2_256",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 256
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 1024,
16
+ "heads": 16,
17
+ "layers": 24
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-L2-336.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_large2_336",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 336
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 1024,
16
+ "heads": 16,
17
+ "layers": 24
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-L2-384.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_large2_384",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 384
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 1024,
16
+ "heads": 16,
17
+ "layers": 24
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-L2.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1024,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_large2_224",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 224
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 1024,
16
+ "heads": 16,
17
+ "layers": 24
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-S-LTT.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 768,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_small_224",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 224
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 768,
16
+ "heads": 12,
17
+ "layers": 12
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-S.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 384,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_small_224",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 224
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 384,
16
+ "heads": 6,
17
+ "layers": 12
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-XL-256.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1152,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_xlarge_256",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 256
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 1152,
16
+ "heads": 16,
17
+ "layers": 27
18
+ },
19
+ "custom_text": true
20
+ }
open_clip/src/open_clip/model_configs/ViTamin-XL-336.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "embed_dim": 1152,
3
+ "vision_cfg": {
4
+ "timm_model_name": "vitamin_xlarge_336",
5
+ "timm_model_pretrained": false,
6
+ "timm_pool": "",
7
+ "timm_proj": "linear",
8
+ "timm_drop": 0.0,
9
+ "timm_drop_path": 0.1,
10
+ "image_size": 336
11
+ },
12
+ "text_cfg": {
13
+ "context_length": 77,
14
+ "vocab_size": 49408,
15
+ "width": 1152,
16
+ "heads": 16,
17
+ "layers": 27
18
+ },
19
+ "custom_text": true
20
+ }