Zero-Shot Image Classification
Transformers
Safetensors
tipsv2
feature-extraction
vision
image-text
contrastive-learning
zero-shot
custom_code
Instructions to use google/tipsv2-b14 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use google/tipsv2-b14 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("zero-shot-image-classification", model="google/tipsv2-b14", trust_remote_code=True)
pipe(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/parrots.png",
    candidate_labels=["animals", "humans", "landscape"],
)
```
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("google/tipsv2-b14", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
Update files for transformers integration
#3
by guarin HF Staff - opened
- config.json +67 -16
- configuration_tips.py +29 -31
- processor_config.json +16 -0
- tokenizer_config.json +12 -0
config.json
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
{
|
| 2 |
-
"model_type": "tipsv2",
|
| 3 |
"architectures": [
|
| 4 |
"TIPSv2Model"
|
| 5 |
],
|
|
@@ -7,18 +6,70 @@
|
|
| 7 |
"AutoConfig": "configuration_tips.TIPSv2Config",
|
| 8 |
"AutoModel": "modeling_tips.TIPSv2Model"
|
| 9 |
},
|
| 10 |
-
"
|
| 11 |
-
"
|
| 12 |
-
"
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
{
|
|
|
|
| 2 |
"architectures": [
|
| 3 |
"TIPSv2Model"
|
| 4 |
],
|
|
|
|
| 6 |
"AutoConfig": "configuration_tips.TIPSv2Config",
|
| 7 |
"AutoModel": "modeling_tips.TIPSv2Model"
|
| 8 |
},
|
| 9 |
+
"model_type": "tipsv2",
|
| 10 |
+
"temperature_init_value": 0.005065968260169029,
|
| 11 |
+
"text_config": {
|
| 12 |
+
"attention_dropout": 0.0,
|
| 13 |
+
"bos_token_id": null,
|
| 14 |
+
"eos_token_id": null,
|
| 15 |
+
"hidden_act": "relu",
|
| 16 |
+
"hidden_size": 768,
|
| 17 |
+
"initializer_range": 0.02,
|
| 18 |
+
"intermediate_size": 3072,
|
| 19 |
+
"layer_norm_eps": 1e-05,
|
| 20 |
+
"max_position_embeddings": 64,
|
| 21 |
+
"model_type": "tipsv2_text_model",
|
| 22 |
+
"num_attention_heads": 12,
|
| 23 |
+
"num_hidden_layers": 12,
|
| 24 |
+
"pad_token_id": 0,
|
| 25 |
+
"pooling_epsilon": 1e-08,
|
| 26 |
+
"scale_sqrt_depth": true,
|
| 27 |
+
"vocab_size": 32000
|
| 28 |
+
},
|
| 29 |
+
"transformers_version": "5.10.0.dev0",
|
| 30 |
+
"vision_config": {
|
| 31 |
+
"apply_layernorm": true,
|
| 32 |
+
"attention_probs_dropout_prob": 0.0,
|
| 33 |
+
"drop_path_rate": 0.0,
|
| 34 |
+
"hidden_act": "gelu",
|
| 35 |
+
"hidden_dropout_prob": 0.0,
|
| 36 |
+
"hidden_size": 768,
|
| 37 |
+
"image_size": 448,
|
| 38 |
+
"initializer_range": 0.02,
|
| 39 |
+
"interpolate_antialias": true,
|
| 40 |
+
"interpolate_offset": 0.0,
|
| 41 |
+
"layer_norm_eps": 1e-06,
|
| 42 |
+
"layerscale_value": 1.0,
|
| 43 |
+
"mlp_ratio": 4,
|
| 44 |
+
"model_type": "tipsv2_vision_model",
|
| 45 |
+
"num_attention_heads": 12,
|
| 46 |
+
"num_channels": 3,
|
| 47 |
+
"num_hidden_layers": 12,
|
| 48 |
+
"num_register_tokens": 1,
|
| 49 |
+
"out_features": [
|
| 50 |
+
"stage12"
|
| 51 |
+
],
|
| 52 |
+
"out_indices": [
|
| 53 |
+
12
|
| 54 |
+
],
|
| 55 |
+
"patch_size": 14,
|
| 56 |
+
"qkv_bias": true,
|
| 57 |
+
"reshape_hidden_states": true,
|
| 58 |
+
"stage_names": [
|
| 59 |
+
"stem",
|
| 60 |
+
"stage1",
|
| 61 |
+
"stage2",
|
| 62 |
+
"stage3",
|
| 63 |
+
"stage4",
|
| 64 |
+
"stage5",
|
| 65 |
+
"stage6",
|
| 66 |
+
"stage7",
|
| 67 |
+
"stage8",
|
| 68 |
+
"stage9",
|
| 69 |
+
"stage10",
|
| 70 |
+
"stage11",
|
| 71 |
+
"stage12"
|
| 72 |
+
],
|
| 73 |
+
"use_swiglu_ffn": false
|
| 74 |
+
}
|
| 75 |
+
}
|
configuration_tips.py
CHANGED
|
@@ -3,6 +3,14 @@
|
|
| 3 |
from transformers import PretrainedConfig
|
| 4 |
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
class TIPSv2Config(PretrainedConfig):
|
| 7 |
"""Configuration for TIPSv2 vision-language model."""
|
| 8 |
|
|
@@ -10,37 +18,27 @@ class TIPSv2Config(PretrainedConfig):
|
|
| 10 |
|
| 11 |
def __init__(
|
| 12 |
self,
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
patch_size=14,
|
| 17 |
-
img_size=448,
|
| 18 |
-
ffn_layer="mlp",
|
| 19 |
-
init_values=1.0,
|
| 20 |
-
num_register_tokens=1,
|
| 21 |
-
# Text encoder
|
| 22 |
-
text_hidden_size=768,
|
| 23 |
-
text_mlp_dim=3072,
|
| 24 |
-
text_num_heads=12,
|
| 25 |
-
text_num_layers=12,
|
| 26 |
-
vocab_size=32000,
|
| 27 |
-
max_len=64,
|
| 28 |
-
# Contrastive
|
| 29 |
-
temperature=0.01,
|
| 30 |
**kwargs,
|
| 31 |
):
|
| 32 |
super().__init__(**kwargs)
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
self.
|
| 38 |
-
self.
|
| 39 |
-
self.
|
| 40 |
-
self.
|
| 41 |
-
self.
|
| 42 |
-
self.
|
| 43 |
-
self.
|
| 44 |
-
self.
|
| 45 |
-
self.
|
| 46 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from transformers import PretrainedConfig
|
| 4 |
|
| 5 |
|
| 6 |
+
_VISION_FN_BY_GEOMETRY = {
|
| 7 |
+
(768, 12): "vit_base",
|
| 8 |
+
(1024, 24): "vit_large",
|
| 9 |
+
(1152, 27): "vit_so400m",
|
| 10 |
+
(1536, 40): "vit_giant2",
|
| 11 |
+
}
|
| 12 |
+
|
| 13 |
+
|
| 14 |
class TIPSv2Config(PretrainedConfig):
|
| 15 |
"""Configuration for TIPSv2 vision-language model."""
|
| 16 |
|
|
|
|
| 18 |
|
| 19 |
def __init__(
|
| 20 |
self,
|
| 21 |
+
vision_config=None,
|
| 22 |
+
text_config=None,
|
| 23 |
+
temperature_init_value=0.01,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
**kwargs,
|
| 25 |
):
|
| 26 |
super().__init__(**kwargs)
|
| 27 |
+
vision_config = vision_config or {}
|
| 28 |
+
text_config = text_config or {}
|
| 29 |
+
hidden_size = vision_config.get("hidden_size", 768)
|
| 30 |
+
num_hidden_layers = vision_config.get("num_hidden_layers", 12)
|
| 31 |
+
self.vision_fn = _VISION_FN_BY_GEOMETRY[(hidden_size, num_hidden_layers)]
|
| 32 |
+
self.embed_dim = hidden_size
|
| 33 |
+
self.patch_size = vision_config.get("patch_size", 14)
|
| 34 |
+
self.img_size = vision_config.get("image_size", 448)
|
| 35 |
+
self.ffn_layer = "swiglu" if vision_config.get("use_swiglu_ffn", False) else "mlp"
|
| 36 |
+
self.init_values = vision_config.get("layerscale_value", 1.0)
|
| 37 |
+
self.num_register_tokens = vision_config.get("num_register_tokens", 1)
|
| 38 |
+
self.text_hidden_size = text_config.get("hidden_size", 768)
|
| 39 |
+
self.text_mlp_dim = text_config.get("intermediate_size", 3072)
|
| 40 |
+
self.text_num_heads = text_config.get("num_attention_heads", 12)
|
| 41 |
+
self.text_num_layers = text_config.get("num_hidden_layers", 12)
|
| 42 |
+
self.vocab_size = text_config.get("vocab_size", 32000)
|
| 43 |
+
self.max_len = text_config.get("max_position_embeddings", 64)
|
| 44 |
+
self.temperature = temperature_init_value
|
processor_config.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"image_processor": {
|
| 3 |
+
"do_convert_rgb": true,
|
| 4 |
+
"do_normalize": false,
|
| 5 |
+
"do_rescale": true,
|
| 6 |
+
"do_resize": true,
|
| 7 |
+
"image_processor_type": "Tipsv2ImageProcessor",
|
| 8 |
+
"resample": 2,
|
| 9 |
+
"rescale_factor": 0.00392156862745098,
|
| 10 |
+
"size": {
|
| 11 |
+
"height": 448,
|
| 12 |
+
"width": 448
|
| 13 |
+
}
|
| 14 |
+
},
|
| 15 |
+
"processor_class": "Tipsv2Processor"
|
| 16 |
+
}
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"backend": "tokenizers",
|
| 3 |
+
"bos_token": null,
|
| 4 |
+
"do_lower_case": true,
|
| 5 |
+
"eos_token": null,
|
| 6 |
+
"model_max_length": 64,
|
| 7 |
+
"pad_token": "<pad>",
|
| 8 |
+
"processor_class": "Tipsv2Processor",
|
| 9 |
+
"token_type_ids_pattern": "all_zeros",
|
| 10 |
+
"tokenizer_class": "Tipsv2Tokenizer",
|
| 11 |
+
"unk_token": "<unk>"
|
| 12 |
+
}
|