| """Raon-VisionEncoder configuration.""" | |
| from transformers import PretrainedConfig | |
| class RaonVEVisionConfig(PretrainedConfig): | |
| model_type = "raon_ve_vision" | |
| def __init__( | |
| self, | |
| image_size=256, | |
| timm_model_name="vit_so400m_patch16_siglip_256", | |
| timm_model_pretrained=False, | |
| timm_pool="map", | |
| timm_proj="none", | |
| **kwargs, | |
| ): | |
| super().__init__(**kwargs) | |
| self.image_size = image_size | |
| self.timm_model_name = timm_model_name | |
| self.timm_model_pretrained = timm_model_pretrained | |
| self.timm_pool = timm_pool | |
| self.timm_proj = timm_proj | |
| class RaonVETextConfig(PretrainedConfig): | |
| model_type = "raon_ve_text" | |
| def __init__( | |
| self, | |
| context_length=64, | |
| vocab_size=256000, | |
| width=1152, | |
| heads=16, | |
| layers=27, | |
| mlp_ratio=3.7362, | |
| no_causal_mask=True, | |
| proj_bias=True, | |
| pool_type="last", | |
| hf_tokenizer_name="timm/ViT-SO400M-16-SigLIP2-256", | |
| tokenizer_kwargs=None, | |
| norm_kwargs=None, | |
| act_kwargs=None, | |
| **kwargs, | |
| ): | |
| super().__init__(**kwargs) | |
| self.context_length = context_length | |
| self.vocab_size = vocab_size | |
| self.width = width | |
| self.heads = heads | |
| self.layers = layers | |
| self.mlp_ratio = mlp_ratio | |
| self.no_causal_mask = no_causal_mask | |
| self.proj_bias = proj_bias | |
| self.pool_type = pool_type | |
| self.hf_tokenizer_name = hf_tokenizer_name | |
| self.tokenizer_kwargs = tokenizer_kwargs or {"clean": "canonicalize"} | |
| self.norm_kwargs = norm_kwargs or {"eps": 1e-6} | |
| self.act_kwargs = act_kwargs or {"approximate": "tanh"} | |
| class RaonVEConfig(PretrainedConfig): | |
| model_type = "raon_ve" | |
| is_composition = True | |
| def __init__( | |
| self, | |
| embed_dim=1152, | |
| init_logit_bias=-10, | |
| vision_config=None, | |
| text_config=None, | |
| **kwargs, | |
| ): | |
| super().__init__(**kwargs) | |
| self.embed_dim = embed_dim | |
| self.init_logit_bias = init_logit_bias | |
| if isinstance(vision_config, dict): | |
| self.vision_config = RaonVEVisionConfig(**vision_config) | |
| elif vision_config is None: | |
| self.vision_config = RaonVEVisionConfig() | |
| else: | |
| self.vision_config = vision_config | |
| if isinstance(text_config, dict): | |
| self.text_config = RaonVETextConfig(**text_config) | |
| elif text_config is None: | |
| self.text_config = RaonVETextConfig() | |
| else: | |
| self.text_config = text_config | |
| def to_dict(self): | |
| output = super().to_dict() | |
| output["vision_config"] = self.vision_config.to_dict() | |
| output["text_config"] = self.text_config.to_dict() | |
| return output | |