| """ |
| MolmoAct configuration |
| """ |
|
|
| from typing import Tuple, Optional, Dict, Any |
|
|
| from transformers import PretrainedConfig |
| from transformers.modeling_rope_utils import rope_config_validation |
| from transformers.utils import logging |
|
|
| logger = logging.get_logger(__name__) |
|
|
|
|
| class MolmoActVitConfig(PretrainedConfig): |
| r""" |
| This is the configuration class to store the configuration of a [`MolmoActVisionTransformer`]. |
| It is used to instantiate a `MolmoActVisionTransformer` according to the specified arguments, |
| defining the model architecture. |
| |
| Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the |
| documentation from [`PretrainedConfig`] for more information. |
| |
| Example: |
| ```python |
| >>> from transformers import MolmoActVitConfig, MolmoActVisionTransformer |
| |
| >>> # Initializing a MolmoActVitConfig |
| >>> configuration = MolmoActVitConfig() |
| |
| >>> # Initializing a MolmoActVisionTransformer (with random weights) |
| >>> model = MolmoActVisionTransformer(configuration) |
| |
| >>> # Accessing the model configuration |
| >>> configuration = model.config |
| ```""" |
|
|
| model_type = "molmoact_vit" |
|
|
| def __init__( |
| self, |
| hidden_size: int = 1152, |
| intermediate_size: int = 4304, |
| num_hidden_layers: int = 27, |
| num_attention_heads: int = 16, |
| num_key_value_heads: int = 16, |
| head_dim: int = 72, |
| hidden_act: str = "gelu_pytorch_tanh", |
| layer_norm_eps: float = 1e-6, |
| image_default_input_size: Tuple[int, int] = (378, 378), |
| image_patch_size: int = 14, |
| image_num_pos: int = 577, |
| attention_dropout: float = 0.0, |
| residual_dropout: float = 0.0, |
| initializer_range: float = 0.02, |
| float32_attention: bool = True, |
| use_cls_token: bool = False, |
| patch_bias: bool = True, |
| pre_layernorm: bool = False, |
| **kwargs, |
| ): |
| super().__init__(**kwargs) |
| self.hidden_size = hidden_size |
| self.intermediate_size = intermediate_size |
| self.num_hidden_layers = num_hidden_layers |
| self.num_attention_heads = num_attention_heads |
| self.num_key_value_heads = num_key_value_heads |
| self.head_dim = head_dim |
| self.hidden_act = hidden_act |
| self.layer_norm_eps = layer_norm_eps |
| self.image_default_input_size = image_default_input_size |
| self.image_patch_size = image_patch_size |
| self.image_num_pos = image_num_pos |
| self.attention_dropout = attention_dropout |
| self.residual_dropout = residual_dropout |
| self.initializer_range = initializer_range |
| self.float32_attention = float32_attention |
| self.use_cls_token = use_cls_token |
| self.patch_bias = patch_bias |
| self.pre_layernorm = pre_layernorm |
|
|
    @property
    def image_num_patch(self):
        """Patch-grid shape (height, width) implied by the default input size and patch size."""
        h, w = self.image_default_input_size
        return h // self.image_patch_size, w // self.image_patch_size
|
|
|
|
| class MolmoActAdapterConfig(PretrainedConfig): |
| r""" |
    This is the configuration class to store the configuration of a [`MolmoActAdapter`]. Together with a
    [`MolmoActVitConfig`], it is used to instantiate a `MolmoActVisionBackbone` according to the specified arguments,
| defining the model architecture. |
| |
| Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the |
| documentation from [`PretrainedConfig`] for more information. |
| |
| Example: |
| |
| ```python |
| >>> from transformers import MolmoActVitConfig, MolmoActAdapterConfig, MolmoActVisionBackbone |
| |
| >>> # Initializing a MolmoActVitConfig and a MolmoActAdapterConfig |
| >>> vit_config = MolmoActVitConfig() |
    >>> adapter_config = MolmoActAdapterConfig()
| |
| >>> # Initializing a MolmoActVisionBackbone (with random weights) |
| >>> model = MolmoActVisionBackbone(vit_config, adapter_config) |
| |
| >>> # Accessing the model configuration |
| >>> vit_configuration = model.vit_config |
| >>> adapter_configuration = model.adapter_config |
| ```""" |
|
|
| def __init__( |
| self, |
        vit_layers: Tuple[int, ...] = (-3, -9),
| hidden_size: int = 1152, |
| num_attention_heads: int = 16, |
| num_key_value_heads: int = 16, |
| head_dim: int = 72, |
| float32_attention: bool = True, |
| attention_dropout: float = 0.0, |
| residual_dropout: float = 0.0, |
| hidden_act: str = "silu", |
| intermediate_size: int = 18944, |
| text_hidden_size: int = 3584, |
| image_feature_dropout: float = 0.0, |
| initializer_range: float = 0.02, |
| |
| image_padding_embed: Optional[str] = None, |
| **kwargs, |
| ): |
| super().__init__(**kwargs) |
| self.vit_layers = vit_layers |
| self.hidden_size = hidden_size |
| self.num_attention_heads = num_attention_heads |
| self.num_key_value_heads = num_key_value_heads |
| self.head_dim = head_dim |
| self.float32_attention = float32_attention |
| self.attention_dropout = attention_dropout |
| self.residual_dropout = residual_dropout |
| self.hidden_act = hidden_act |
| self.intermediate_size = intermediate_size |
| self.text_hidden_size = text_hidden_size |
| self.image_feature_dropout = image_feature_dropout |
| self.initializer_range = initializer_range |
| |
| self.image_padding_embed = image_padding_embed |
|
|
|
|
| class MolmoActLlmConfig(PretrainedConfig): |
| r""" |
| This is the configuration class to store the configuration of a [`MolmoActLlm`]. It is used to instantiate a |
| `MolmoActLlm` according to the specified arguments, defining the model architecture. |
| |
| Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the |
| documentation from [`PretrainedConfig`] for more information. |
| |
| Example: |
| ```python |
| >>> from transformers import MolmoActLlmConfig, MolmoActLlm |
| |
| >>> # Initializing a MolmoActLlmConfig |
| >>> configuration = MolmoActLlmConfig() |
| |
| >>> # Initializing a MolmoActLlm (with random weights) |
| >>> model = MolmoActLlm(configuration) |
| |
| >>> # Accessing the model configuration |
| >>> configuration = model.config |
| ```""" |
|
|
| model_type = "molmoact_llm" |
| keys_to_ignore_at_inference = ["past_key_values"] |
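    # Tensor-parallel plan: input projections (fused QKV, MLP up-projection) are sharded
    # column-wise; output projections (attention out, MLP down-projection) row-wise.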
| base_model_tp_plan = { |
| "blocks.*.self_attn.att_proj": "colwise", |
| "blocks.*.self_attn.attn_out": "rowwise", |
| "blocks.*.mlp.ff_proj": "colwise", |
| "blocks.*.mlp.ff_out": "rowwise", |
| } |
| base_model_pp_plan = { |
| "wte": (["input_ids"], ["inputs_embeds"]), |
| "blocks": (["hidden_states", "attention_mask"], ["hidden_states"]), |
| "ln_f": (["hidden_states"], ["hidden_states"]), |
| } |
|
|
| def __init__( |
| self, |
| hidden_size: int = 3584, |
| num_attention_heads: int = 28, |
| num_key_value_heads: Optional[int] = 4, |
| head_dim: int = 128, |
| vocab_size: int = 152064, |
| additional_vocab_size: int = 128, |
| qkv_bias: bool = True, |
| num_hidden_layers: int = 48, |
| intermediate_size: int = 18944, |
| hidden_act: str = "silu", |
        embedding_dropout: float = 0.0,
        attention_dropout: float = 0.0,
| residual_dropout: float = 0.0, |
| max_position_embeddings: int = 4096, |
| rope_theta: float = 1000000.0, |
        rope_scaling: Optional[Dict[str, Any]] = None,
| use_qk_norm: bool = False, |
| qk_norm_type: str = "olmo", |
        layer_norm_eps: float = 1e-6,
| norm_after: bool = False, |
| initializer_range: float = 0.02, |
        use_cache: bool = True,
        tie_word_embeddings: bool = False,
| **kwargs, |
| ): |
| super().__init__( |
| tie_word_embeddings=tie_word_embeddings, |
| **kwargs |
| ) |
| self.hidden_size = hidden_size |
| self.num_attention_heads = num_attention_heads |
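        # Fall back to standard multi-head attention (one KV head per query head)
        # when no grouped-query KV-head count is given.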
| if num_key_value_heads is None: |
| num_key_value_heads = num_attention_heads |
| self.num_key_value_heads = num_key_value_heads |
| self.head_dim = head_dim |
| self.vocab_size = vocab_size |
| self.additional_vocab_size = additional_vocab_size |
| self.qkv_bias = qkv_bias |
| self.num_hidden_layers = num_hidden_layers |
| self.intermediate_size = intermediate_size |
| self.hidden_act = hidden_act |
| self.embedding_dropout = embedding_dropout |
| self.attention_dropout = attention_dropout |
| self.residual_dropout = residual_dropout |
| self.max_position_embeddings = max_position_embeddings |
| self.rope_theta = rope_theta |
| self.rope_scaling = rope_scaling |
| self.use_qk_norm = use_qk_norm |
| self.qk_norm_type = qk_norm_type |
| self.layer_norm_eps = layer_norm_eps |
| self.norm_after = norm_after |
| self.initializer_range = initializer_range |
| self.use_cache = use_cache |
|
|
| |
        # Validate `rope_scaling` (if set) against the formats expected by HF's RoPE utilities.
        rope_config_validation(self)
|
|
|
|
| class MolmoActConfig(PretrainedConfig): |
| r""" |
    This is the configuration class to store the configuration of a [`MolmoActForActionReasoning`].
    It is used to instantiate a MolmoAct model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
| |
| Example: |
| |
| ```python |
    >>> from transformers import MolmoActConfig, MolmoActVitConfig, MolmoActAdapterConfig, MolmoActLlmConfig, MolmoActForActionReasoning
| |
| >>> # Initializing a MolmoActVitConfig |
| >>> vit_config = MolmoActVitConfig() |
| |
| >>> # Initializing a MolmoActAdapterConfig |
| >>> adapter_config = MolmoActAdapterConfig() |
| |
| >>> # Initializing a MolmoActLlmConfig |
| >>> llm_config = MolmoActLlmConfig() |
| |
| >>> # Initializing a MolmoActConfig |
| >>> configuration = MolmoActConfig(vit_config, adapter_config, llm_config, image_patch_id=152069) |
| |
| >>> # Initializing a model |
| >>> model = MolmoActForActionReasoning(configuration) |
| |
| >>> # Accessing the model configuration |
| >>> configuration = model.config |
| ```""" |
|
|
| model_type = "molmoact" |
| sub_configs = { |
| "llm_config": MolmoActLlmConfig, |
| "vit_config": MolmoActVitConfig, |
| "adapter_config": MolmoActAdapterConfig, |
| } |
|
|
| def __init__( |
| self, |
        vit_config: Optional[MolmoActVitConfig] = None,
        adapter_config: Optional[MolmoActAdapterConfig] = None,
        llm_config: Optional[MolmoActLlmConfig] = None,
        image_patch_id: Optional[int] = None,
        initializer_range: float = 0.02,
        n_action_bins: int = 256,
        norm_stats: Optional[Dict[str, Any]] = None,
| **kwargs, |
| ): |
| super().__init__(**kwargs) |
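        # Each sub-config may be passed as None (use defaults), a plain dict
        # (e.g. from a serialized checkpoint), or an already-built config instance.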
| if vit_config is None: |
| self.vit_config = MolmoActVitConfig() |
| elif isinstance(vit_config, dict): |
| self.vit_config = MolmoActVitConfig(**vit_config) |
| else: |
| self.vit_config = vit_config |
| if adapter_config is None: |
| self.adapter_config = MolmoActAdapterConfig() |
| elif isinstance(adapter_config, dict): |
| self.adapter_config = MolmoActAdapterConfig(**adapter_config) |
| else: |
| self.adapter_config = adapter_config |
| if llm_config is None: |
| self.llm_config = MolmoActLlmConfig() |
| elif isinstance(llm_config, dict): |
| self.llm_config = MolmoActLlmConfig(**llm_config) |
| else: |
| self.llm_config = llm_config |
| self.image_patch_id = image_patch_id |
| self.initializer_range = initializer_range |
|
|
        # `n_action_bins` sets the size of the discretized action vocabulary;
        # `norm_stats` holds per-dataset statistics used to normalize actions.
        # Avoid a shared mutable default by resolving None to a fresh dict here.
        self.n_action_bins = n_action_bins
        self.norm_stats = norm_stats if norm_stats is not None else {}
|
|
| @property |
| def image_num_patch(self): |
| assert self.vit_config is not None |
| return self.vit_config.image_num_patch |
| |
| @property |
| def num_attention_heads(self): |
| return self.llm_config.num_attention_heads |
| |
| @property |
| def num_key_value_heads(self): |
| return self.llm_config.num_key_value_heads |
|
|
| @property |
| def head_dim(self): |
| return self.llm_config.head_dim |
|
|
| @property |
| def num_hidden_layers(self): |
| return self.llm_config.num_hidden_layers |
| |
| @property |
| def hidden_size(self): |
| return self.llm_config.hidden_size |
| |
| @property |
| def vocab_size(self): |
| return self.llm_config.vocab_size |
| |
| @property |
| def max_position_embeddings(self): |
| return self.llm_config.max_position_embeddings |
|
|
|
|
# Register the configs so checkpoints shipped with custom code can be loaded via
# `AutoConfig.from_pretrained(..., trust_remote_code=True)` without importing this module directly.
MolmoActVitConfig.register_for_auto_class()
MolmoActAdapterConfig.register_for_auto_class()
MolmoActLlmConfig.register_for_auto_class()
MolmoActConfig.register_for_auto_class()
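

if __name__ == "__main__":
    # Minimal smoke-test sketch (illustrative only, not part of the library API):
    # build a default composite config, inspect a few properties delegated to the
    # LLM/ViT sub-configs, and round-trip it through a plain dict via the standard
    # `PretrainedConfig` helpers. `image_patch_id=152069` mirrors the docstring
    # example above.
    config = MolmoActConfig(image_patch_id=152069)
    print(config.model_type, config.hidden_size, config.image_num_patch)

    restored = MolmoActConfig.from_dict(config.to_dict())
    assert restored.llm_config.hidden_size == config.llm_config.hidden_size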