# Source: tipsv2-b14 / configuration_tips.py (Hugging Face Hub, 1.7 kB)
# Last commit: "Update configuration_tips.py" by guarin (HF Staff),
# revision 0e4a47a (verified).
"""TIPSv2 model configuration."""
from transformers import PretrainedConfig
_VISION_FN_BY_GEOMETRY = {
(768, 12): "vit_base",
(1024, 24): "vit_large",
(1152, 27): "vit_so400m",
(1536, 40): "vit_giant2",
}
class TIPSv2Config(PretrainedConfig):
    """Configuration for TIPSv2 vision-language model.

    The nested ``vision_config`` and ``text_config`` dictionaries are
    flattened into plain attributes on the config instance; the dictionaries
    themselves are not stored.

    Args:
        vision_config: Optional dict of vision settings. Keys read (default
            used when absent): ``hidden_size`` (768), ``num_hidden_layers``
            (12), ``patch_size`` (14), ``image_size`` (448),
            ``use_swiglu_ffn`` (False), ``layerscale_value`` (1.0),
            ``num_register_tokens`` (1). ``None`` or an empty dict selects
            all defaults.
        text_config: Optional dict of text settings. Keys read (default used
            when absent): ``hidden_size`` (768), ``intermediate_size``
            (3072), ``num_attention_heads`` (12), ``num_hidden_layers`` (12),
            ``vocab_size`` (32000), ``max_position_embeddings`` (64).
            ``None`` or an empty dict selects all defaults.
        temperature_init_value: Stored unchanged as ``self.temperature``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.

    Raises:
        KeyError: If the ``(hidden_size, num_hidden_layers)`` pair from
            ``vision_config`` has no entry in ``_VISION_FN_BY_GEOMETRY``.
    """

    model_type = "tipsv2"

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        temperature_init_value=0.01,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Treat None (and any other falsy value) as "use every default".
        vision_config = vision_config or {}
        text_config = text_config or {}

        hidden_size = vision_config.get("hidden_size", 768)
        num_hidden_layers = vision_config.get("num_hidden_layers", 12)

        # The backbone variant is chosen purely from its geometry. Re-raise
        # with a readable message so a bad config is easy to diagnose; the
        # exception type stays KeyError for callers that already catch it.
        geometry = (hidden_size, num_hidden_layers)
        try:
            self.vision_fn = _VISION_FN_BY_GEOMETRY[geometry]
        except KeyError:
            supported = ", ".join(str(key) for key in sorted(_VISION_FN_BY_GEOMETRY))
            raise KeyError(
                "Unsupported vision geometry (hidden_size, num_hidden_layers)="
                f"{geometry}; supported geometries: {supported}"
            ) from None

        # Vision attributes.
        self.embed_dim = hidden_size
        self.patch_size = vision_config.get("patch_size", 14)
        self.img_size = vision_config.get("image_size", 448)
        self.ffn_layer = "swiglu" if vision_config.get("use_swiglu_ffn", False) else "mlp"
        self.init_values = vision_config.get("layerscale_value", 1.0)
        self.num_register_tokens = vision_config.get("num_register_tokens", 1)

        # Text attributes.
        self.text_hidden_size = text_config.get("hidden_size", 768)
        self.text_mlp_dim = text_config.get("intermediate_size", 3072)
        self.text_num_heads = text_config.get("num_attention_heads", 12)
        self.text_num_layers = text_config.get("num_hidden_layers", 12)
        self.vocab_size = text_config.get("vocab_size", 32000)
        self.max_len = text_config.get("max_position_embeddings", 64)

        self.temperature = temperature_init_value