File size: 824 Bytes
ff7fe7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from transformers import PretrainedConfig

class MultimodalConfig(PretrainedConfig):
    """Configuration for the ``multimodal_embedder`` model type.

    Each constructor argument is stored unchanged as an instance attribute
    of the same name. Any additional keyword arguments are forwarded to
    ``PretrainedConfig.__init__``. How the stored values are consumed is
    decided by the model code, which is not visible from this file.

    Args:
        text_model_name: Name of the text model (presumably a Hugging Face
            Hub checkpoint id -- confirm against the model code).
        vision_model_name: Name of the vision model (presumably a Hugging
            Face Hub checkpoint id -- confirm against the model code).
        text_dim: Dimension associated with the text side; defaults to 768.
        image_dim: Dimension associated with the image side; defaults to 768.
        embed_dim: Embedding dimension; defaults to 384 (presumably the size
            of the shared text/image space -- confirm).
        temperature_init: Initial temperature value; defaults to ``1 / 0.07``.
            NOTE(review): looks like an inverse temperature (logit scale)
            for a contrastive objective -- confirm against the model code.
        use_mean_pooling_for_text: Flag for text pooling; defaults to True.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.
    """

    model_type = "multimodal_embedder"

    def __init__(
        self,
        text_model_name: str = "newmindai/modernbert-base-tr-uncased-allnli-stsb",
        vision_model_name: str = "facebook/dinov2-base",
        text_dim: int = 768,
        image_dim: int = 768,
        embed_dim: int = 384,
        temperature_init: float = 1 / 0.07,
        use_mean_pooling_for_text: bool = True,
        **kwargs,
    ) -> None:
        # Base-class setup runs first, exactly as before, so the fields
        # below are assigned after PretrainedConfig has handled **kwargs.
        super().__init__(**kwargs)

        # Backbone names.
        self.text_model_name, self.vision_model_name = (
            text_model_name,
            vision_model_name,
        )

        # Dimensions.
        self.text_dim, self.image_dim, self.embed_dim = (
            text_dim,
            image_dim,
            embed_dim,
        )

        # Temperature and text-pooling settings.
        self.temperature_init, self.use_mean_pooling_for_text = (
            temperature_init,
            use_mean_pooling_for_text,
        )