| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """HyperCLOVAX SEED CLIP model configuration. |
| |
| Config classes for HyperCLOVAX SEED CLIP with vision encoder + SiglipText encoder. |
| """ |
|
|
| from transformers import AutoConfig |
| from transformers.configuration_utils import PretrainedConfig |
| from transformers.models.siglip.configuration_siglip import SiglipTextConfig |
| from transformers.utils import logging |
|
|
# Module-level logger from the transformers logging utilities; used below to
# report when a sub-config falls back to its defaults.
logger = logging.get_logger(__name__)
|
|
|
|
class HyperCLOVAXSeedCLIPVisionConfig(PretrainedConfig):
    """Configuration of the HyperCLOVAX SEED CLIP vision tower.

    Holds the hyper-parameters of the vision encoder transformer together
    with those of the Siglip2-style attention pooling head. Every argument is
    stored unchanged as an attribute of the same name; the only derived value
    is `fullatt_block_indexes`, which is filled in when left as `None`.

    Args:
        depth (`int`, *optional*, defaults to 32):
            How many transformer blocks the encoder has.
        hidden_size (`int`, *optional*, defaults to 1280):
            Width of the hidden representations.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            Name of the activation function used in the MLP blocks.
        intermediate_size (`int`, *optional*, defaults to 3456):
            Width of the MLP intermediate representations.
        num_heads (`int`, *optional*, defaults to 16):
            Attention head count.
        in_channels (`int`, *optional*, defaults to 3):
            Channel count of the input images.
        patch_size (`int`, *optional*, defaults to 16):
            Side length of a spatial patch (applies to height and width).
        spatial_merge_size (`int`, *optional*, defaults to 2):
            How many patches are merged spatially.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            How many frames are merged into one temporal patch.
        tokens_per_second (`int`, *optional*, defaults to 4):
            Temporal tokens that represent one second of video.
        window_size (`int`, *optional*, defaults to 128):
            Window size, in pixels, for the window attention blocks.
        fullatt_block_indexes (`List[int]`, *optional*):
            Indices of the transformer blocks that use full (global)
            attention. When `None`, it becomes `list(range(depth))`, i.e.
            full attention in every block.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation used for weight initialization.
        attn_pool_heads (`int`, *optional*, defaults to 16):
            Attention head count of the pooling head.
        attn_pool_mlp_ratio (`float`, *optional*, defaults to 4.0):
            MLP expansion ratio of the pooling head.
        attn_implementation (`str`, *optional*, defaults to `"eager"`):
            Which attention implementation to use (`"eager"`, `"sdpa"`, or
            `"flash_attention_2"`).
        **kwargs:
            Forwarded as-is to `PretrainedConfig.__init__`.
    """

    model_type = "hyperclovax_seed_clip_vision"

    def __init__(
        self,
        depth=32,
        hidden_size=1280,
        hidden_act="silu",
        intermediate_size=3456,
        num_heads=16,
        in_channels=3,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
        tokens_per_second=4,
        window_size=128,
        fullatt_block_indexes=None,
        initializer_range=0.02,
        attn_pool_heads=16,
        attn_pool_mlp_ratio=4.0,
        attn_implementation="eager",
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Encoder transformer geometry.
        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads

        # Patch embedding / spatio-temporal tokenization.
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second

        # Attention layout: an unspecified index list means every block
        # attends globally.
        self.window_size = window_size
        if fullatt_block_indexes is None:
            fullatt_block_indexes = list(range(depth))
        self.fullatt_block_indexes = fullatt_block_indexes

        self.initializer_range = initializer_range

        # Attention pooling head.
        self.attn_pool_heads = attn_pool_heads
        self.attn_pool_mlp_ratio = attn_pool_mlp_ratio

        self.attn_implementation = attn_implementation
|
|
|
|
# Register the vision config with AutoConfig under its `model_type` string so
# that "hyperclovax_seed_clip_vision" resolves to this class.
AutoConfig.register("hyperclovax_seed_clip_vision", HyperCLOVAXSeedCLIPVisionConfig)
|
|
|
|
class HyperCLOVAXSeedCLIPConfig(PretrainedConfig):
    """Config for HyperCLOVAX SEED CLIP.

    Combines a SiglipTextConfig (text) and HyperCLOVAXSeedCLIPVisionConfig (vision).

    Args:
        text_config (`dict` or `SiglipTextConfig`, *optional*):
            Keyword arguments used to build a `SiglipTextConfig`, or an
            already-built `SiglipTextConfig` which is then stored as-is.
            When `None`, a `SiglipTextConfig` with default values is created.
        vision_config (`dict` or `HyperCLOVAXSeedCLIPVisionConfig`, *optional*):
            Keyword arguments used to build a
            `HyperCLOVAXSeedCLIPVisionConfig`, or an already-built instance
            which is then stored as-is. When `None`, a vision config with
            default values is created.
        **kwargs:
            Forwarded as-is to `PretrainedConfig.__init__`.
    """

    model_type = "hyperclovax_seed_clip"
    sub_configs = {"text_config": SiglipTextConfig, "vision_config": HyperCLOVAXSeedCLIPVisionConfig}

    def __init__(self, text_config=None, vision_config=None, **kwargs):
        super().__init__(**kwargs)

        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Using SiglipTextConfig defaults.")
        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. Using HyperCLOVAXSeedCLIPVisionConfig defaults.")

        # Accept ready-made config objects as well as plain dicts: unpacking a
        # config instance with `**` would raise a TypeError.
        if not isinstance(text_config, SiglipTextConfig):
            text_config = SiglipTextConfig(**text_config)
        if not isinstance(vision_config, HyperCLOVAXSeedCLIPVisionConfig):
            vision_config = HyperCLOVAXSeedCLIPVisionConfig(**vision_config)

        self.text_config = text_config
        self.vision_config = vision_config
        self.initializer_factor = 1.0

    @classmethod
    def from_text_vision_configs(
        cls,
        text_config: SiglipTextConfig,
        vision_config: HyperCLOVAXSeedCLIPVisionConfig,
        **kwargs,
    ):
        """Build a composite config from separate text and vision configs.

        Both configs are serialized with `to_dict()` and rebuilt inside
        `__init__`, so the returned config holds fresh sub-config objects
        rather than the ones passed in.

        Args:
            text_config (`SiglipTextConfig`): Text encoder configuration.
            vision_config (`HyperCLOVAXSeedCLIPVisionConfig`): Vision encoder
                configuration.
            **kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            An instance of `cls` combining the two configurations.
        """
        return cls(
            text_config=text_config.to_dict(),
            vision_config=vision_config.to_dict(),
            **kwargs,
        )
|
|