| from transformers import PretrainedConfig | |
class MultimodalConfig(PretrainedConfig):
    """Configuration container for the ``multimodal_embedder`` model type.

    The constructor records its arguments as instance attributes of the
    same name and forwards any remaining keyword arguments to
    ``PretrainedConfig.__init__``. No validation is performed here.
    """

    model_type = "multimodal_embedder"

    def __init__(
        self,
        text_model_name="newmindai/modernbert-base-tr-uncased-allnli-stsb",
        vision_model_name="facebook/dinov2-base",
        text_dim=768,
        image_dim=768,
        embed_dim=384,
        temperature_init=1 / 0.07,
        use_mean_pooling_for_text=True,
        **kwargs,
    ):
        """Store the multimodal settings on the config instance.

        Args:
            text_model_name: Identifier stored as ``self.text_model_name``
                (presumably a Hub checkpoint for the text encoder -- confirm
                against the model code).
            vision_model_name: Identifier stored as ``self.vision_model_name``
                (presumably a Hub checkpoint for the image encoder -- confirm).
            text_dim: Stored as ``self.text_dim``; defaults to 768.
            image_dim: Stored as ``self.image_dim``; defaults to 768.
            embed_dim: Stored as ``self.embed_dim``; defaults to 384.
            temperature_init: Stored as ``self.temperature_init``; defaults
                to ``1 / 0.07`` (about 14.29).
                NOTE(review): despite the name this looks like an inverse
                temperature / logit scale -- confirm how the model uses it.
            use_mean_pooling_for_text: Flag stored as
                ``self.use_mean_pooling_for_text``; defaults to ``True``.
            **kwargs: Passed through unchanged to the parent constructor.
        """
        # The parent constructor runs first, before the fields below are set.
        super().__init__(**kwargs)

        # Model identifiers.
        self.text_model_name, self.vision_model_name = (
            text_model_name,
            vision_model_name,
        )

        # Dimension settings.
        self.text_dim, self.image_dim, self.embed_dim = (
            text_dim,
            image_dim,
            embed_dim,
        )

        # Remaining scalar options.
        self.temperature_init = temperature_init
        self.use_mean_pooling_for_text = use_mean_pooling_for_text