Zero-Shot Image Classification
Transformers
Safetensors
tipsv2
feature-extraction
vision
image-text
contrastive-learning
zero-shot
custom_code
Instructions to use google/tipsv2-l14 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use google/tipsv2-l14 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("zero-shot-image-classification", model="google/tipsv2-l14", trust_remote_code=True)
pipe(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/parrots.png",
    candidate_labels=["animals", "humans", "landscape"],
)
```
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("google/tipsv2-l14", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
Update configuration_tips.py
Browse files
Uploads configuration_tips.py.
- configuration_tips.py +0 -3
configuration_tips.py
CHANGED
|
@@ -26,7 +26,6 @@ class TIPSv2Config(PretrainedConfig):
|
|
| 26 |
super().__init__(**kwargs)
|
| 27 |
vision_config = vision_config or {}
|
| 28 |
text_config = text_config or {}
|
| 29 |
-
# Vision encoder
|
| 30 |
hidden_size = vision_config.get("hidden_size", 768)
|
| 31 |
num_hidden_layers = vision_config.get("num_hidden_layers", 12)
|
| 32 |
self.vision_fn = _VISION_FN_BY_GEOMETRY[(hidden_size, num_hidden_layers)]
|
|
@@ -36,12 +35,10 @@ class TIPSv2Config(PretrainedConfig):
|
|
| 36 |
self.ffn_layer = "swiglu" if vision_config.get("use_swiglu_ffn", False) else "mlp"
|
| 37 |
self.init_values = vision_config.get("layerscale_value", 1.0)
|
| 38 |
self.num_register_tokens = vision_config.get("num_register_tokens", 1)
|
| 39 |
-
# Text encoder
|
| 40 |
self.text_hidden_size = text_config.get("hidden_size", 768)
|
| 41 |
self.text_mlp_dim = text_config.get("intermediate_size", 3072)
|
| 42 |
self.text_num_heads = text_config.get("num_attention_heads", 12)
|
| 43 |
self.text_num_layers = text_config.get("num_hidden_layers", 12)
|
| 44 |
self.vocab_size = text_config.get("vocab_size", 32000)
|
| 45 |
self.max_len = text_config.get("max_position_embeddings", 64)
|
| 46 |
-
# Contrastive
|
| 47 |
self.temperature = temperature_init_value
|
|
|
|
| 26 |
super().__init__(**kwargs)
|
| 27 |
vision_config = vision_config or {}
|
| 28 |
text_config = text_config or {}
|
|
|
|
| 29 |
hidden_size = vision_config.get("hidden_size", 768)
|
| 30 |
num_hidden_layers = vision_config.get("num_hidden_layers", 12)
|
| 31 |
self.vision_fn = _VISION_FN_BY_GEOMETRY[(hidden_size, num_hidden_layers)]
|
|
|
|
| 35 |
self.ffn_layer = "swiglu" if vision_config.get("use_swiglu_ffn", False) else "mlp"
|
| 36 |
self.init_values = vision_config.get("layerscale_value", 1.0)
|
| 37 |
self.num_register_tokens = vision_config.get("num_register_tokens", 1)
|
|
|
|
| 38 |
self.text_hidden_size = text_config.get("hidden_size", 768)
|
| 39 |
self.text_mlp_dim = text_config.get("intermediate_size", 3072)
|
| 40 |
self.text_num_heads = text_config.get("num_attention_heads", 12)
|
| 41 |
self.text_num_layers = text_config.get("num_hidden_layers", 12)
|
| 42 |
self.vocab_size = text_config.get("vocab_size", 32000)
|
| 43 |
self.max_len = text_config.get("max_position_embeddings", 64)
|
|
|
|
| 44 |
self.temperature = temperature_init_value
|