| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
"""HyperCLOVAX-SEED Vision Encoder configuration"""
|
|
| from typing import List, Optional |
|
|
| from transformers import AutoConfig |
| from transformers.configuration_utils import PretrainedConfig |
|
|
|
|
class HyperCLOVAXSeedVisionEncoderConfig(PretrainedConfig):
    r"""
    Configuration class for the HyperCLOVAX-SEED Vision Encoder.

    Args:
        depth (`int`, *optional*, defaults to 32):
            Number of transformer blocks.
        hidden_size (`int`, *optional*, defaults to 1280):
            Dimension of the hidden representations.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            Activation function used in the MLP blocks.
        intermediate_size (`int`, *optional*, defaults to 3420):
            Dimension of the MLP intermediate representations.
        num_heads (`int`, *optional*, defaults to 16):
            Number of attention heads.
        in_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        patch_size (`int`, *optional*, defaults to 14):
            Spatial patch size (height and width).
        spatial_merge_size (`int`, *optional*, defaults to 2):
            Number of patches to merge spatially before the language model.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            Number of frames to merge per temporal patch.
        tokens_per_second (`int`, *optional*, defaults to 4):
            Number of temporal tokens representing one second of video.
        window_size (`int`, *optional*, defaults to 112):
            Window size (in pixels) for window attention blocks.
        out_hidden_size (`int`, *optional*, defaults to 3584):
            Output hidden size after the patch merger (should match the LLM hidden size).
        fullatt_block_indexes (`List[int]`, *optional*, defaults to `[7, 15, 23, 31]`):
            Indices of transformer blocks that use full (global) attention.
            All other blocks use window attention.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for weight initialization.
        disable_merger (`bool`, *optional*, defaults to `False`):
            Whether to disable the patch merger. Only stored on the config here;
            the exact effect is defined by the modeling code (TODO: confirm).

    ```python
    >>> from transformers import AutoConfig

    >>> configuration = AutoConfig.from_pretrained("naver-hyperclovax/HyperCLOVAX-SEED-Think-4B")
    ```
    """

    model_type = "hyperclovax_seed_vision_encoder"

    def __init__(
        self,
        depth: int = 32,
        hidden_size: int = 1280,
        hidden_act: str = "silu",
        intermediate_size: int = 3420,
        num_heads: int = 16,
        in_channels: int = 3,
        patch_size: int = 14,
        spatial_merge_size: int = 2,
        temporal_patch_size: int = 2,
        tokens_per_second: int = 4,
        window_size: int = 112,
        out_hidden_size: int = 3584,
        fullatt_block_indexes: Optional[List[int]] = None,
        initializer_range: float = 0.02,
        disable_merger: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        self.out_hidden_size = out_hidden_size
        # `None` is used as the sentinel so the default list is not a shared
        # mutable default argument; the effective default is [7, 15, 23, 31].
        self.fullatt_block_indexes = fullatt_block_indexes if fullatt_block_indexes is not None else [7, 15, 23, 31]
        self.initializer_range = initializer_range
        self.disable_merger = disable_merger
|
|
|
|
# Register the custom config with the Auto* machinery so that
# AutoConfig.from_pretrained(...) can resolve the "hyperclovax_seed_vision_encoder"
# model_type to this class.
AutoConfig.register("hyperclovax_seed_vision_encoder", HyperCLOVAXSeedVisionEncoderConfig)

# Explicit public API of this module.
__all__ = ["HyperCLOVAXSeedVisionEncoderConfig"]
|
|