|
|
from transformers import PretrainedConfig
|
|
class Talk2DINOConfig(PretrainedConfig):
    """HuggingFace configuration for the Talk2DINO model.

    Each constructor argument is stored verbatim as an instance
    attribute of the same name; any extra keyword arguments are
    forwarded unchanged to ``PretrainedConfig``.
    """

    model_type = "talk2dino"

    def __init__(
        self,
        avg_self_attn_token=False,
        clip_model_name="ViT-B/16",
        disentangled_self_attn_token=True,
        is_eval=True,
        keep_cls=False,
        keep_end_seq=False,
        loss=None,
        model_name="dinov2_vitb14_reg",
        pre_trained=True,
        proj_class="vitb_mlp_infonce",
        proj_model="ProjectionLayer",
        proj_name="vitb_mlp_infonce",
        resize_dim=518,
        type="DINOText",  # NOTE: shadows the builtin; name kept for config-key compatibility
        unfreeze_last_image_layer=False,
        unfreeze_last_text_layer=False,
        use_avg_text_token=False,
        with_bg_clean=False,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Mirror every argument onto the instance under the same name.
        for attr_name, attr_value in (
            ("avg_self_attn_token", avg_self_attn_token),
            ("clip_model_name", clip_model_name),
            ("disentangled_self_attn_token", disentangled_self_attn_token),
            ("is_eval", is_eval),
            ("keep_cls", keep_cls),
            ("keep_end_seq", keep_end_seq),
            ("loss", loss),
            ("model_name", model_name),
            ("pre_trained", pre_trained),
            ("proj_class", proj_class),
            ("proj_model", proj_model),
            ("proj_name", proj_name),
            ("resize_dim", resize_dim),
            ("type", type),
            ("unfreeze_last_image_layer", unfreeze_last_image_layer),
            ("unfreeze_last_text_layer", unfreeze_last_text_layer),
            ("use_avg_text_token", use_avg_text_token),
            ("with_bg_clean", with_bg_clean),
        ):
            setattr(self, attr_name, attr_value)
|
|