| from typing import List, Tuple |
|
|
| from transformers import PretrainedConfig, AutoTokenizer |
|
|
|
|
class MolmoVisionConfig(PretrainedConfig):
    """Hyper-parameters of the Molmo image encoder.

    Every constructor argument is stored unchanged as an instance attribute
    of the same name; remaining keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.

    Args:
        image_default_input_size: ``(height, width)`` of the default input
            image; used together with ``image_patch_size`` to derive
            :attr:`image_num_patch`.
        image_patch_size: Side length of one patch; the divisor used when
            computing :attr:`image_num_patch`.
        image_pos_patch_size: Stored as-is (presumably the patch size used
            for positional embeddings -- confirm against the model code).
        image_emb_dim: Stored as-is.
        image_num_heads: Stored as-is.
        image_num_key_value_heads: Stored as-is.
        image_num_layers: Stored as-is.
        image_head_dim: Stored as-is.
        image_mlp_dim: Stored as-is.
        image_mlp_activations: Stored as-is.
        residual_dropout: Stored as-is.
        image_num_pos: Stored as-is.
        image_norm_eps: Stored as-is.
        float32_attention: Stored as-is.
        attention_type: Name of the attention implementation. Stored as-is.
        **kwargs: Forwarded to ``PretrainedConfig.__init__``.
    """

    def __init__(
        self,
        image_default_input_size: Tuple[int, int] = (336, 336),
        image_patch_size: int = 14,
        image_pos_patch_size: int = 14,
        image_emb_dim: int = 1024,
        image_num_heads: int = 16,
        image_num_key_value_heads: int = 16,
        image_num_layers: int = 23,
        image_head_dim: int = 64,
        image_mlp_dim: int = 4096,
        image_mlp_activations: str = "quick_gelu",
        residual_dropout: float = 0,
        image_num_pos: int = 577,
        image_norm_eps: float = 1e-5,
        float32_attention: bool = True,
        # Default was previously misspelled "spda"; "sdpa" matches the
        # default MolmoConfig uses for the same option.
        attention_type: str = "sdpa",
        **kwargs
    ):
        super().__init__(**kwargs)
        self.image_default_input_size = image_default_input_size
        self.image_patch_size = image_patch_size
        self.image_pos_patch_size = image_pos_patch_size
        self.image_emb_dim = image_emb_dim
        self.image_num_heads = image_num_heads
        self.image_num_key_value_heads = image_num_key_value_heads
        self.image_num_layers = image_num_layers
        self.image_head_dim = image_head_dim
        self.image_mlp_dim = image_mlp_dim
        self.image_mlp_activations = image_mlp_activations
        self.residual_dropout = residual_dropout
        self.image_num_pos = image_num_pos
        self.image_norm_eps = image_norm_eps
        self.float32_attention = float32_attention
        # Previously accepted but never stored, so the value was silently
        # dropped and never made it into the serialized config.
        self.attention_type = attention_type

    @property
    def image_num_patch(self) -> Tuple[int, int]:
        """Return the ``(rows, cols)`` patch grid of the default input size.

        Each dimension is floor-divided by ``image_patch_size``.
        """
        h, w = self.image_default_input_size
        return h // self.image_patch_size, w // self.image_patch_size
|
|
|
|
class MolmoConfig(PretrainedConfig):
    """Top-level configuration for a Molmo model.

    All constructor arguments are stored verbatim as attributes of the same
    name. ``vision_config`` is normalized to a ``MolmoVisionConfig`` first:
    a ``dict`` is expanded into keyword arguments, ``None`` yields a
    default-constructed instance, and any other object is kept as given.
    ``tie_word_embeddings`` and any extra keyword arguments are passed on to
    ``PretrainedConfig.__init__`` after the attributes have been set.
    """

    model_type = "molmo"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=50304,
        embedding_size=50304,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        float32_attention=True,
        max_position_embeddings=2048,
        initializer_range=0.02,
        use_cache=True,
        layer_norm_eps: float = 1e-5,
        rope_theta=10000.0,
        clip_qkv=None,
        activation_type="silu",
        qkv_bias: bool = False,
        weight_tying: bool = False,
        use_position_ids: bool = True,
        tie_word_embeddings: bool = True,
        bias_for_layer_norm: bool = False,
        qk_layer_norm: bool = False,
        norm_after: bool = False,
        layer_norm_type: str = "rms",
        vision_config: MolmoVisionConfig = None,
        vit_layers=(-2, -9),
        residual_dropout: float = 0.0,
        embedding_dropout: float = 0.0,
        attention_dropout: float = 0.0,
        image_feature_dropout: float = 0.0,
        additional_vocab_size=128,
        attention_type: str = "sdpa",
        image_padding_embed="pad_and_partial_pad",
        moe_num_experts=None,
        moe_top_k=None,
        normalize_input_embeds: bool = False,
        scale_logits: bool = False,
        **kwargs,
    ):
        # Normalize the vision sub-config to a MolmoVisionConfig instance.
        if vision_config is None:
            vision_config = MolmoVisionConfig()
        elif isinstance(vision_config, dict):
            vision_config = MolmoVisionConfig(**vision_config)
        self.vision_config = vision_config

        # Vocabulary / embedding sizes.
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.additional_vocab_size = additional_vocab_size
        self.weight_tying = weight_tying
        self.tie_word_embeddings = tie_word_embeddings
        self.normalize_input_embeds = normalize_input_embeds
        self.scale_logits = scale_logits

        # Transformer dimensions.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.max_position_embeddings = max_position_embeddings
        self.activation_type = activation_type
        self.initializer_range = initializer_range

        # Attention options.
        self.attention_type = attention_type
        self.float32_attention = float32_attention
        self.qkv_bias = qkv_bias
        self.clip_qkv = clip_qkv
        self.qk_layer_norm = qk_layer_norm
        self.rope_theta = rope_theta
        self.use_position_ids = use_position_ids
        self.use_cache = use_cache

        # Normalization options.
        self.layer_norm_type = layer_norm_type
        self.layer_norm_eps = layer_norm_eps
        self.bias_for_layer_norm = bias_for_layer_norm
        self.norm_after = norm_after

        # Mixture-of-experts options.
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k

        # Dropout rates.
        self.residual_dropout = residual_dropout
        self.embedding_dropout = embedding_dropout
        self.attention_dropout = attention_dropout
        self.image_feature_dropout = image_feature_dropout

        # Image-related options.
        self.vit_layers = vit_layers
        self.image_padding_embed = image_padding_embed

        # The base class is initialized last, after all attributes are set.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

    @property
    def effective_num_key_value_heads(self) -> int:
        """Return ``num_key_value_heads``, or ``num_attention_heads`` if it is ``None``."""
        kv_heads = self.num_key_value_heads
        return self.num_attention_heads if kv_heads is None else kv_heads

    @property
    def image_num_patch(self):
        """Return ``image_num_patch`` of the attached vision config."""
        vision = self.vision_config
        assert vision is not None
        return vision.image_num_patch
|
|
|
|
# Register both config classes with the auto-class machinery at import time.
# "AutoConfig" is the library default for this argument, spelled out here.
MolmoVisionConfig.register_for_auto_class(auto_class="AutoConfig")
MolmoConfig.register_for_auto_class(auto_class="AutoConfig")