from transformers import PretrainedConfig


class MultimodalConfig(PretrainedConfig):
    """Configuration holding the hyperparameters of the multimodal embedder.

    The constructor only records the values it is given; nothing is
    validated or loaded here. How each value is used is decided by the
    model code that reads this config, which is not part of this file.

    Args:
        text_model_name: Name of the text backbone (presumably a Hugging
            Face Hub id or local path -- confirm against the model code).
        vision_model_name: Name of the vision backbone (same caveat).
        text_dim: Dimension associated with the text side; presumably the
            text backbone's hidden size -- TODO confirm.
        image_dim: Dimension associated with the image side; presumably
            the vision backbone's hidden size -- TODO confirm.
        embed_dim: Dimension of the embedding; presumably the shared
            projection space for both modalities -- TODO confirm.
        temperature_init: Initial temperature value. Defaults to
            ``1 / 0.07`` (about 14.29).
        use_mean_pooling_for_text: Flag stored for the text branch;
            presumably selects mean pooling over token outputs -- verify
            against the model code.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "multimodal_embedder"

    def __init__(
        self,
        text_model_name: str = "newmindai/modernbert-base-tr-uncased-allnli-stsb",
        vision_model_name: str = "facebook/dinov2-base",
        text_dim: int = 768,
        image_dim: int = 768,
        embed_dim: int = 384,
        temperature_init: float = 1 / 0.07,
        use_mean_pooling_for_text: bool = True,
        **kwargs,
    ):
        # Base-class initialisation runs first, exactly as in the original.
        super().__init__(**kwargs)

        # Backbone names.
        self.text_model_name = text_model_name
        self.vision_model_name = vision_model_name

        # Dimensions.
        self.text_dim = text_dim
        self.image_dim = image_dim
        self.embed_dim = embed_dim

        # Remaining settings.
        self.temperature_init = temperature_init
        self.use_mean_pooling_for_text = use_mean_pooling_for_text