Raon-VisionEncoder / configuration_raonve.py
ValentineKRAFTON's picture
initial commit
acd771b verified
"""Raon-VisionEncoder configuration."""
from transformers import PretrainedConfig
class RaonVEVisionConfig(PretrainedConfig):
    """Settings for the vision side of Raon-VisionEncoder.

    The constructor only records its arguments as attributes. How each value
    is consumed is decided by the modeling code, which is not part of this
    file, so the descriptions below are deliberately conservative.

    Args:
        image_size: Image size setting. Defaults to ``256``.
        timm_model_name: Model name setting. Judging by the ``timm_`` prefix
            this is presumably a timm architecture identifier -- confirm
            against the modeling code.
        timm_model_pretrained: Flag stored as-is; presumably controls whether
            pretrained timm weights are requested -- confirm against the
            modeling code. Defaults to ``False``.
        timm_pool: Pooling setting, stored as-is. Defaults to ``"map"``.
        timm_proj: Projection setting, stored as-is. Defaults to ``"none"``.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "raon_ve_vision"

    def __init__(
        self,
        image_size=256,
        timm_model_name="vit_so400m_patch16_siglip_256",
        timm_model_pretrained=False,
        timm_pool="map",
        timm_proj="none",
        **kwargs,
    ):
        # Let the base class consume the generic options first.
        super().__init__(**kwargs)

        self.image_size = image_size

        # timm-related settings, kept together.
        self.timm_model_name = timm_model_name
        self.timm_model_pretrained = timm_model_pretrained
        self.timm_pool = timm_pool
        self.timm_proj = timm_proj
class RaonVETextConfig(PretrainedConfig):
    """Settings for the text side of Raon-VisionEncoder.

    The constructor records its arguments as attributes. The three
    ``*_kwargs`` arguments fall back to a built-in default dict whenever the
    supplied value is falsy, which covers both ``None`` and an empty dict.

    Args:
        context_length: Context length setting. Defaults to ``64``.
        vocab_size: Vocabulary size setting. Defaults to ``256000``.
        width: Width setting. Defaults to ``1152``.
        heads: Head count setting. Defaults to ``16``.
        layers: Layer count setting. Defaults to ``27``.
        mlp_ratio: MLP ratio setting. Defaults to ``3.7362``.
        no_causal_mask: Flag stored as-is. Defaults to ``True``.
        proj_bias: Flag stored as-is. Defaults to ``True``.
        pool_type: Pooling setting, stored as-is. Defaults to ``"last"``.
        hf_tokenizer_name: Tokenizer name setting; presumably a Hugging Face
            Hub identifier -- confirm against the code that loads it.
        tokenizer_kwargs: Dict of tokenizer options. A falsy value is
            replaced by ``{"clean": "canonicalize"}``.
        norm_kwargs: Dict of normalization options. A falsy value is
            replaced by ``{"eps": 1e-6}``.
        act_kwargs: Dict of activation options. A falsy value is replaced by
            ``{"approximate": "tanh"}``.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "raon_ve_text"

    def __init__(
        self,
        context_length=64,
        vocab_size=256000,
        width=1152,
        heads=16,
        layers=27,
        mlp_ratio=3.7362,
        no_causal_mask=True,
        proj_bias=True,
        pool_type="last",
        hf_tokenizer_name="timm/ViT-SO400M-16-SigLIP2-256",
        tokenizer_kwargs=None,
        norm_kwargs=None,
        act_kwargs=None,
        **kwargs,
    ):
        # Let the base class consume the generic options first.
        super().__init__(**kwargs)

        # Plain values are stored exactly as given.
        self.context_length = context_length
        self.vocab_size = vocab_size
        self.width = width
        self.heads = heads
        self.layers = layers
        self.mlp_ratio = mlp_ratio
        self.no_causal_mask = no_causal_mask
        self.proj_bias = proj_bias
        self.pool_type = pool_type
        self.hf_tokenizer_name = hf_tokenizer_name

        # Truthiness (not an ``is None`` test) picks the fallback, so an
        # explicitly passed empty dict also gets the default. Each default
        # is a fresh dict per instance.
        self.tokenizer_kwargs = (
            tokenizer_kwargs if tokenizer_kwargs else {"clean": "canonicalize"}
        )
        self.norm_kwargs = norm_kwargs if norm_kwargs else {"eps": 1e-6}
        self.act_kwargs = act_kwargs if act_kwargs else {"approximate": "tanh"}
class RaonVEConfig(PretrainedConfig):
    """Top-level Raon-VisionEncoder configuration.

    Holds two scalar settings plus a vision sub-config and a text sub-config.

    Args:
        embed_dim: Embedding dimension setting. Defaults to ``1152``.
        init_logit_bias: Initial logit bias setting. Defaults to ``-10``.
        vision_config: ``None`` (use ``RaonVEVisionConfig`` defaults), a dict
            of keyword arguments for ``RaonVEVisionConfig``, or an object
            that is stored as given.
        text_config: ``None`` (use ``RaonVETextConfig`` defaults), a dict of
            keyword arguments for ``RaonVETextConfig``, or an object that is
            stored as given.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "raon_ve"
    is_composition = True

    def __init__(
        self,
        embed_dim=1152,
        init_logit_bias=-10,
        vision_config=None,
        text_config=None,
        **kwargs,
    ):
        # Let the base class consume the generic options first.
        super().__init__(**kwargs)

        self.embed_dim = embed_dim
        self.init_logit_bias = init_logit_bias
        self.vision_config = self._resolve_sub_config(
            vision_config, RaonVEVisionConfig
        )
        self.text_config = self._resolve_sub_config(text_config, RaonVETextConfig)

    @staticmethod
    def _resolve_sub_config(value, config_cls):
        """Return a sub-config built from ``value`` using ``config_cls``.

        ``None`` yields ``config_cls()``, a dict is expanded into keyword
        arguments, and any other object is returned untouched.
        """
        if value is None:
            return config_cls()
        if isinstance(value, dict):
            return config_cls(**value)
        # Anything else is stored as given; no type check is performed.
        return value

    def to_dict(self):
        """Return the base-class dict with both sub-configs serialized.

        The ``"vision_config"`` and ``"text_config"`` entries are set to the
        result of calling ``to_dict()`` on the respective sub-config.
        """
        serialized = super().to_dict()
        for key in ("vision_config", "text_config"):
            serialized[key] = getattr(self, key).to_dict()
        return serialized