| """ |
| Molmo2 configuration |
| """ |
|
|
| from typing import Optional |
|
|
| from transformers import PretrainedConfig, LogitsProcessor |
| from transformers.utils import logging |
|
|
| from .configuration_molmo2 import Molmo2TextConfig, Molmo2VitConfig, \ |
| Molmo2AdapterConfig |
|
|
| logger = logging.get_logger(__name__) |
|
|
|
|
class MolmoPointAdapterConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of Molmo2Adapter. With Molmo2VitConfig,
    It is used to instantiate an Molmo2VisionBackbone according to the specified arguments,
    defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vit_layers (`tuple`, *optional*, defaults to `(-3, -9)`):
            Indices of the ViT layers whose hidden states are consumed by the adapter.
        pooling_attention_mask (`bool`, *optional*, defaults to `False`):
            Whether an attention mask is used during pooling.
        hidden_size (`int`, *optional*, defaults to 1152):
            Dimensionality of the adapter hidden states.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads.
        num_key_value_heads (`int`, *optional*, defaults to 16):
            Number of key/value heads (for grouped-query attention when smaller
            than `num_attention_heads`).
        head_dim (`int`, *optional*, defaults to 72):
            Dimensionality of each attention head.
        float32_attention (`bool`, *optional*, defaults to `True`):
            Whether attention is computed in float32.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for attention weights.
        residual_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability on residual connections.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            Activation function of the MLP.
        intermediate_size (`int`, *optional*, defaults to 18944):
            Dimensionality of the MLP intermediate layer.
        text_hidden_size (`int`, *optional*, defaults to 3584):
            Hidden size of the text model the adapter projects into.
        image_feature_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability applied to image features.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for weight initialization.
        attn_implementation (`str`, *optional*, defaults to `"eager"`):
            Attention implementation to use (e.g. `"eager"`, `"sdpa"`).
        positional_embeddings (`int`, *optional*, defaults to 16):
            Number of positional embeddings. NOTE(review): presumably the pooled
            grid size — confirm against the modeling code.
        attention_pooling_out_layer (`bool`, *optional*, defaults to `False`):
            Whether an output layer follows attention pooling.

    Example:

    ```python
    >>> from transformers import Molmo2VitConfig, Molmo2AdapterConfig, Molmo2VisionBackbone

    >>> # Initializing a Molmo2VitConfig and a MolmoPointAdapterConfig
    >>> vit_config = Molmo2VitConfig()
    >>> adapter_config = MolmoPointAdapterConfig()

    >>> # Initializing a Molmo2VisionBackbone (with random weights)
    >>> model = Molmo2VisionBackbone(vit_config, adapter_config)

    >>> # Accessing the model configuration
    >>> vit_configuration = model.vit_config
    >>> adapter_configuration = model.adapter_config
    ```"""

    # NOTE(review): same `model_type` as `MolmoPointConfig` below — confirm this
    # is intended, since both classes are registered for auto-class resolution.
    model_type = "molmo_point"
    base_config_key = "adapter_config"

    def __init__(
        self,
        vit_layers: tuple = (-3, -9),
        pooling_attention_mask: bool = False,
        hidden_size: int = 1152,
        num_attention_heads: int = 16,
        num_key_value_heads: int = 16,
        head_dim: int = 72,
        float32_attention: bool = True,
        attention_dropout: float = 0.0,
        residual_dropout: float = 0.0,
        hidden_act: str = "silu",
        intermediate_size: int = 18944,
        text_hidden_size: int = 3584,
        image_feature_dropout: float = 0.0,
        initializer_range: float = 0.02,
        attn_implementation: str = "eager",
        positional_embeddings: int = 16,
        attention_pooling_out_layer: bool = False,
        **kwargs,
    ):
        # Set the attribute explicitly before super().__init__ so it exists on the
        # instance even if the base class consumes the `attn_implementation` kwarg.
        self.attn_implementation = attn_implementation
        super().__init__(
            attn_implementation=attn_implementation,
            **kwargs
        )
        self.vit_layers = vit_layers
        self.pooling_attention_mask = pooling_attention_mask
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.float32_attention = float32_attention
        self.attention_dropout = attention_dropout
        self.residual_dropout = residual_dropout
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.text_hidden_size = text_hidden_size
        self.image_feature_dropout = image_feature_dropout
        self.initializer_range = initializer_range
        self.positional_embeddings = positional_embeddings
        self.attention_pooling_out_layer = attention_pooling_out_layer
|
|
|
|
class MolmoPointConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MolmoPointForConditionalGeneration`].
    It is used to instantiate an Molmo2 model according to the specified arguments, defining the model architecture.

    Example:

    ```python
    >>> from transformers import Molmo2VitConfig, Molmo2TextConfig

    >>> # Initializing a Molmo2VitConfig
    >>> vit_config = Molmo2VitConfig()

    >>> # Initializing a MolmoPointAdapterConfig
    >>> adapter_config = MolmoPointAdapterConfig()

    >>> # Initializing a Molmo2TextConfig
    >>> text_config = Molmo2TextConfig()

    >>> # Initializing a MolmoPointConfig
    >>> configuration = MolmoPointConfig(
    >>>     vit_config=vit_config,
    >>>     adapter_config=adapter_config,
    >>>     text_config=text_config,
    >>>     image_start_token_id=151936,
    >>>     image_end_token_id=151937,
    >>>     image_patch_id=151938,
    >>>     image_col_id=151939,
    >>>     low_res_image_start_token_id=151940,
    >>>     frame_start_token_id=151943,
    >>>     frame_end_token_id=151944,
    >>> )

    >>> # Initializing a model
    >>> model = MolmoPointForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "molmo_point"
    sub_configs = {
        "text_config": Molmo2TextConfig,
        "vit_config": Molmo2VitConfig,
        "adapter_config": MolmoPointAdapterConfig,
    }

    def __init__(
        self,
        vit_config: Molmo2VitConfig = None,
        adapter_config: MolmoPointAdapterConfig = None,
        text_config: Molmo2TextConfig = None,
        image_start_token_id: int = None,
        low_res_image_start_token_id: int = None,
        image_end_token_id: int = None,
        image_patch_id: int = None,
        image_non_indexable_patch_id: int = None,
        image_col_id: int = None,
        frame_start_token_id: int = None,
        frame_end_token_id: int = None,
        patch_token_id: int = None,
        subpatch_token_id: int = None,
        location_token_id: int = None,
        use_frame_special_tokens: bool = True,
        initializer_range: float = 0.02,

        # Pointing-head options
        patch_location: Optional[str] = "3x3",
        no_more_points_class: bool = False,
        patch_embed_dim: int = 256,
        patch_embedding_kind: str = "linear",
        embed_selected_vit_patch: Optional[str] = "linear",
        embed_location: bool = False,
        layer_norm_x: bool = True,
        norm_logits: bool = True,

        # Masking / prediction options
        mask_patches: Optional[str] = "always",
        mask_subpatches: str = "inference",
        mask_repeats: Optional[str] = "inference",
        token_prediction_rotary: bool = True,
        token_prediction_rotary_theta: Optional[float] = 50000,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Coerce each sub-config to the class declared in `sub_configs`, so that
        # `__init__` and dict-based deserialization build the same types.
        if vit_config is None:
            self.vit_config = Molmo2VitConfig()
        elif isinstance(vit_config, dict):
            self.vit_config = Molmo2VitConfig(**vit_config)
        else:
            self.vit_config = vit_config
        # FIX: previously instantiated `Molmo2AdapterConfig` here, which
        # contradicted `sub_configs["adapter_config"] = MolmoPointAdapterConfig`.
        if adapter_config is None:
            self.adapter_config = MolmoPointAdapterConfig()
        elif isinstance(adapter_config, dict):
            self.adapter_config = MolmoPointAdapterConfig(**adapter_config)
        else:
            self.adapter_config = adapter_config
        if text_config is None:
            self.text_config = Molmo2TextConfig()
        elif isinstance(text_config, dict):
            self.text_config = Molmo2TextConfig(**text_config)
        else:
            self.text_config = text_config
        # Special-token ids used to splice image/frame features into the text stream.
        self.image_start_token_id = image_start_token_id
        self.low_res_image_start_token_id = low_res_image_start_token_id
        self.image_end_token_id = image_end_token_id
        # NOTE(review): `image_high_res_id` deliberately aliases `image_patch_id`
        # (there is no separate high-res parameter) — confirm intended.
        self.image_high_res_id = image_patch_id
        self.image_non_indexable_patch_id = image_non_indexable_patch_id
        self.image_patch_id = image_patch_id
        self.image_col_id = image_col_id
        self.frame_start_token_id = frame_start_token_id
        self.frame_end_token_id = frame_end_token_id
        self.patch_token_id = patch_token_id
        self.subpatch_token_id = subpatch_token_id
        self.location_token_id = location_token_id
        self.use_frame_special_tokens = use_frame_special_tokens
        self.initializer_range = initializer_range
        # Pointing-head configuration.
        self.patch_location = patch_location
        self.no_more_points_class = no_more_points_class
        self.patch_embed_dim = patch_embed_dim
        self.patch_embedding_kind = patch_embedding_kind
        self.embed_selected_vit_patch = embed_selected_vit_patch
        self.embed_location = embed_location
        self.layer_norm_x = layer_norm_x
        self.norm_logits = norm_logits
        # Masking / token-prediction configuration.
        self.mask_patches = mask_patches
        self.mask_subpatches = mask_subpatches
        self.mask_repeats = mask_repeats
        self.token_prediction_rotary = token_prediction_rotary
        self.token_prediction_rotary_theta = token_prediction_rotary_theta

    # Convenience passthroughs so callers can read common dimensions from the
    # top-level config without reaching into the sub-configs.
    @property
    def image_num_patch(self):
        """Number of image patches, taken from the vision config."""
        assert self.vit_config is not None
        return self.vit_config.image_num_patch

    @property
    def num_attention_heads(self):
        """Number of attention heads of the text model."""
        return self.text_config.num_attention_heads

    @property
    def num_key_value_heads(self):
        """Number of key/value heads of the text model."""
        return self.text_config.num_key_value_heads

    @property
    def head_dim(self):
        """Attention head dimension of the text model."""
        return self.text_config.head_dim

    @property
    def num_hidden_layers(self):
        """Number of hidden layers of the text model."""
        return self.text_config.num_hidden_layers

    @property
    def hidden_size(self):
        """Hidden size of the text model."""
        return self.text_config.hidden_size

    @property
    def vocab_size(self):
        """Vocabulary size of the text model."""
        return self.text_config.vocab_size

    @property
    def max_position_embeddings(self):
        """Maximum sequence length of the text model."""
        return self.text_config.max_position_embeddings
|
|
|
|
# Register both configs so AutoConfig can resolve them from the Hub.
for _config_cls in (MolmoPointAdapterConfig, MolmoPointConfig):
    _config_cls.register_for_auto_class()