Zero-Shot Image Classification
Transformers
Safetensors
tipsv2
feature-extraction
vision
image-text
contrastive-learning
zero-shot
custom_code
Instructions to use google/tipsv2-b14 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use google/tipsv2-b14 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("zero-shot-image-classification", model="google/tipsv2-b14", trust_remote_code=True)
pipe(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/parrots.png",
    candidate_labels=["animals", "humans", "landscape"],
)
```

```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("google/tipsv2-b14", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| """TIPSv2 model configuration.""" | |
| from transformers import PretrainedConfig | |
# Lookup table keyed by a vision tower's (hidden_size, num_hidden_layers)
# pair; the value is the backbone name stored in ``TIPSv2Config.vision_fn``.
_VISION_FN_BY_GEOMETRY = dict(
    (
        ((768, 12), "vit_base"),
        ((1024, 24), "vit_large"),
        ((1152, 27), "vit_so400m"),
        ((1536, 40), "vit_giant2"),
    )
)
class TIPSv2Config(PretrainedConfig):
    """Configuration for TIPSv2 vision-language model.

    The nested ``vision_config`` and ``text_config`` dictionaries are
    flattened into plain attributes on the instance. Any key that is absent
    falls back to the default shown in parentheses below.

    Args:
        vision_config: Optional dict describing the vision tower. Keys read:
            ``hidden_size`` (768), ``num_hidden_layers`` (12),
            ``patch_size`` (14), ``image_size`` (448),
            ``use_swiglu_ffn`` (False), ``layerscale_value`` (1.0),
            ``num_register_tokens`` (1).
        text_config: Optional dict describing the text tower. Keys read:
            ``hidden_size`` (768), ``intermediate_size`` (3072),
            ``num_attention_heads`` (12), ``num_hidden_layers`` (12),
            ``vocab_size`` (32000), ``max_position_embeddings`` (64).
        temperature_init_value: Stored unchanged as ``temperature``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        KeyError: If the ``(hidden_size, num_hidden_layers)`` pair of the
            vision tower has no entry in ``_VISION_FN_BY_GEOMETRY``.
    """

    model_type = "tipsv2"

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        temperature_init_value=0.01,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # ``None`` (or any other falsy value) means "use every default".
        vision_config = vision_config or {}
        text_config = text_config or {}

        # NOTE(review): only the flattened attributes below are stored; the
        # nested dicts themselves are not kept on the instance. Confirm that a
        # config rebuilt from its own serialized form still receives
        # ``vision_config`` / ``text_config``, otherwise the defaults here win.

        # --- vision tower ---
        hidden_size = vision_config.get("hidden_size", 768)
        num_hidden_layers = vision_config.get("num_hidden_layers", 12)
        geometry = (hidden_size, num_hidden_layers)
        try:
            self.vision_fn = _VISION_FN_BY_GEOMETRY[geometry]
        except KeyError:
            # Same exception type as a plain dict miss, but with a message
            # that names the offending pair and the accepted ones.
            supported = ", ".join(str(key) for key in _VISION_FN_BY_GEOMETRY)
            raise KeyError(
                "Unsupported vision geometry (hidden_size, num_hidden_layers)="
                f"{geometry}; supported geometries: {supported}"
            ) from None
        self.embed_dim = hidden_size
        self.patch_size = vision_config.get("patch_size", 14)
        self.img_size = vision_config.get("image_size", 448)
        self.ffn_layer = "swiglu" if vision_config.get("use_swiglu_ffn", False) else "mlp"
        self.init_values = vision_config.get("layerscale_value", 1.0)
        self.num_register_tokens = vision_config.get("num_register_tokens", 1)

        # --- text tower ---
        self.text_hidden_size = text_config.get("hidden_size", 768)
        self.text_mlp_dim = text_config.get("intermediate_size", 3072)
        self.text_num_heads = text_config.get("num_attention_heads", 12)
        self.text_num_layers = text_config.get("num_hidden_layers", 12)
        self.vocab_size = text_config.get("vocab_size", 32000)
        self.max_len = text_config.get("max_position_embeddings", 64)

        # --- contrastive head ---
        self.temperature = temperature_init_value