"""TIPSv2 model configuration.""" from transformers import PretrainedConfig class TIPSv2Config(PretrainedConfig): """Configuration for TIPSv2 vision-language model.""" model_type = "tipsv2" def __init__( self, # Vision encoder vision_fn="vit_base", embed_dim=768, patch_size=14, img_size=448, ffn_layer="mlp", init_values=1.0, num_register_tokens=1, # Text encoder text_hidden_size=768, text_mlp_dim=3072, text_num_heads=12, text_num_layers=12, vocab_size=32000, max_len=64, # Contrastive temperature=0.01, **kwargs, ): super().__init__(**kwargs) self.vision_fn = vision_fn self.embed_dim = embed_dim self.patch_size = patch_size self.img_size = img_size self.ffn_layer = ffn_layer self.init_values = init_values self.num_register_tokens = num_register_tokens self.text_hidden_size = text_hidden_size self.text_mlp_dim = text_mlp_dim self.text_num_heads = text_num_heads self.text_num_layers = text_num_layers self.vocab_size = vocab_size self.max_len = max_len self.temperature = temperature