| from transformers import PretrainedConfig, PreTrainedModel |
| import json |
|
|
class Idefics2ConnectorConfig(PretrainedConfig):
    r"""
    Configuration for the Idefics2 connector (Perceiver Resampler + MLP projector).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_hidden_size (`int`, *optional*, defaults to 1152):
            Hidden size of the vision encoder whose features are fed to the connector.
        hidden_size (`int`, *optional*, defaults to 4096):
            Hidden size of the language model the connector projects into.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the perceiver block.
        resampler_n_latents (`int`, *optional*, defaults to 64):
            Number of latent embeddings to resample ("compress") the input sequence to (usually < 128).
        resampler_depth (`int`, *optional*, defaults to 3):
            Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (<= 3).
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            Epsilon used by the RMS normalization layers.
        resampler_n_heads (`int`, *optional*, defaults to 16):
            Number of heads in each Transformer block (for multi-headed self-attention).
        resampler_head_dim (`int`, *optional*, defaults to 96):
            Dimensionality of each head projection in the Transformer block.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            Number of key-value heads in the perceiver attention block.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP projector's intermediate (hidden) layer.
        integrate_sub_images (`bool`, *optional*):
            Whether sub-image (tiled crop) features are merged by the connector.
            NOTE(review): semantics inferred from the name — confirm against the model code.
        num_sub_images (`int`, *optional*):
            Number of sub-images per input image when `integrate_sub_images` is enabled.
            NOTE(review): semantics inferred from the name — confirm against the model code.
    """
    _auto_class = 'AutoConfig'
    model_type = "Idefics2ConnectorConfig"

    # Constructor keywords that may also appear in a serialized JSON config.
    # Used by `from_pretrained` to forward every recognized key instead of a
    # hard-coded subset.
    _INIT_KEYS = (
        "vision_hidden_size",
        "hidden_size",
        "hidden_act",
        "resampler_n_latents",
        "resampler_depth",
        "rms_norm_eps",
        "resampler_n_heads",
        "resampler_head_dim",
        "num_key_value_heads",
        "attention_dropout",
        "intermediate_size",
        "integrate_sub_images",
        "num_sub_images",
    )

    def __init__(
        self,
        vision_hidden_size=1152,
        hidden_size=4096,
        hidden_act="silu",
        resampler_n_latents=64,
        resampler_depth=3,
        rms_norm_eps=1e-05,
        resampler_n_heads=16,
        resampler_head_dim=96,
        num_key_value_heads=4,
        attention_dropout=0.0,
        intermediate_size=14336,
        integrate_sub_images=None,
        num_sub_images=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vision_hidden_size = vision_hidden_size
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.resampler_n_latents = resampler_n_latents
        self.resampler_depth = resampler_depth
        self.rms_norm_eps = rms_norm_eps
        self.resampler_n_heads = resampler_n_heads
        self.num_key_value_heads = num_key_value_heads
        self.resampler_head_dim = resampler_head_dim
        self.attention_dropout = attention_dropout
        self.intermediate_size = intermediate_size
        self.integrate_sub_images = integrate_sub_images
        self.num_sub_images = num_sub_images

        # Grouped-query attention requires at most as many KV heads as query heads.
        if self.num_key_value_heads > self.resampler_n_heads:
            raise ValueError(
                f"num_key_value_heads={self.num_key_value_heads} must be less than or equal to"
                f" resampler_n_heads={self.resampler_n_heads}"
            )

    @classmethod
    def from_pretrained(cls, config_path, **kwargs) -> "PretrainedConfig":
        """Build a connector config from a JSON file on disk.

        Args:
            config_path (`str` or `os.PathLike`):
                Path to a JSON file containing the serialized configuration.
            kwargs:
                Extra keyword arguments forwarded to the constructor; they
                override values read from the file.

        Returns:
            [`Idefics2ConnectorConfig`]: The loaded configuration.

        Raises:
            OSError: If `config_path` cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        with open(config_path, "r", encoding="utf-8") as f:
            config_dict = json.load(f)

        # Forward every recognized key present in the file (the previous
        # implementation silently dropped resampler_n_heads, resampler_head_dim,
        # num_key_value_heads and attention_dropout, and raised KeyError on any
        # missing key). Missing keys fall back to the constructor defaults;
        # explicit kwargs take precedence over the file.
        init_kwargs = {k: config_dict[k] for k in cls._INIT_KEYS if k in config_dict}
        init_kwargs.update(kwargs)
        return cls(**init_kwargs)
|
|