# coding=utf-8
# Copyright 2026 NAVER Cloud Corp. and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""HyperCLOVAX SEED CLIP model configuration.

Config classes for HyperCLOVAX SEED CLIP with vision encoder + SiglipText encoder.
"""

from transformers import AutoConfig
from transformers.configuration_utils import PretrainedConfig
from transformers.models.siglip.configuration_siglip import SiglipTextConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class HyperCLOVAXSeedCLIPVisionConfig(PretrainedConfig):
    """Vision config for HyperCLOVAX SEED CLIP.

    Includes parameters for the vision encoder transformer and the Siglip2-style
    attention pooling head.

    Args:
        depth (`int`, *optional*, defaults to 32):
            Number of transformer blocks.
        hidden_size (`int`, *optional*, defaults to 1280):
            Dimension of the hidden representations.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            Activation function used in the MLP blocks.
        intermediate_size (`int`, *optional*, defaults to 3456):
            Dimension of the MLP intermediate representations.
        num_heads (`int`, *optional*, defaults to 16):
            Number of attention heads.
        in_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        patch_size (`int`, *optional*, defaults to 16):
            Spatial patch size (height and width).
        spatial_merge_size (`int`, *optional*, defaults to 2):
            Number of patches to merge spatially.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            Number of frames to merge per temporal patch.
        tokens_per_second (`int`, *optional*, defaults to 4):
            Number of temporal tokens representing one second of video.
        window_size (`int`, *optional*, defaults to 128):
            Window size (in pixels) for window attention blocks.
        fullatt_block_indexes (`List[int]`, *optional*, defaults to all blocks):
            Indices of transformer blocks that use full (global) attention.
            Defaults to all blocks (full attention everywhere).
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for weight initialization.
        attn_pool_heads (`int`, *optional*, defaults to 16):
            Number of attention heads in the pooling head.
        attn_pool_mlp_ratio (`float`, *optional*, defaults to 4.0):
            MLP expansion ratio in the pooling head.
        attn_implementation (`str`, *optional*, defaults to `"eager"`):
            Attention implementation (`"eager"`, `"sdpa"`, or `"flash_attention_2"`).
    """

    model_type = "hyperclovax_seed_clip_vision"

    def __init__(
        self,
        depth=32,
        hidden_size=1280,
        hidden_act="silu",
        intermediate_size=3456,
        num_heads=16,
        in_channels=3,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
        tokens_per_second=4,
        window_size=128,
        fullatt_block_indexes=None,
        initializer_range=0.02,
        attn_pool_heads=16,
        attn_pool_mlp_ratio=4.0,
        attn_implementation="eager",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        # `None` means "full attention everywhere": every block index is listed.
        self.fullatt_block_indexes = (
            fullatt_block_indexes if fullatt_block_indexes is not None else list(range(depth))
        )
        self.initializer_range = initializer_range
        self.attn_pool_heads = attn_pool_heads
        self.attn_pool_mlp_ratio = attn_pool_mlp_ratio
        self.attn_implementation = attn_implementation


# Make the vision config resolvable through `AutoConfig.from_pretrained`.
AutoConfig.register("hyperclovax_seed_clip_vision", HyperCLOVAXSeedCLIPVisionConfig)


class HyperCLOVAXSeedCLIPConfig(PretrainedConfig):
    """Config for HyperCLOVAX SEED CLIP.

    Combines a SiglipTextConfig (text) and HyperCLOVAXSeedCLIPVisionConfig (vision).

    Args:
        text_config (`dict` or `SiglipTextConfig`, *optional*):
            Text sub-config (or kwargs dict for one). Defaults to `SiglipTextConfig()`.
        vision_config (`dict` or `HyperCLOVAXSeedCLIPVisionConfig`, *optional*):
            Vision sub-config (or kwargs dict for one). Defaults to
            `HyperCLOVAXSeedCLIPVisionConfig()`.
    """

    model_type = "hyperclovax_seed_clip"
    sub_configs = {"text_config": SiglipTextConfig, "vision_config": HyperCLOVAXSeedCLIPVisionConfig}

    def __init__(self, text_config=None, vision_config=None, **kwargs):
        super().__init__(**kwargs)

        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Using SiglipTextConfig defaults.")
        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. Using HyperCLOVAXSeedCLIPVisionConfig defaults.")

        # Accept already-built config objects as well as plain kwargs dicts
        # (the dict path is the original behavior; instances are normalized
        # through `to_dict()` so both paths produce fresh sub-config objects).
        if isinstance(text_config, SiglipTextConfig):
            text_config = text_config.to_dict()
        if isinstance(vision_config, HyperCLOVAXSeedCLIPVisionConfig):
            vision_config = vision_config.to_dict()

        self.text_config = SiglipTextConfig(**text_config)
        self.vision_config = HyperCLOVAXSeedCLIPVisionConfig(**vision_config)
        self.initializer_factor = 1.0

    @classmethod
    def from_text_vision_configs(
        cls,
        text_config: SiglipTextConfig,
        vision_config: HyperCLOVAXSeedCLIPVisionConfig,
        **kwargs,
    ):
        """Build a combined config from a text and a vision sub-config instance."""
        return cls(
            text_config=text_config.to_dict(),
            vision_config=vision_config.to_dict(),
            **kwargs,
        )