# coding=utf-8
# Copyright 2026 NAVER Cloud Corp. and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""HyperCLOVAX-SEED Vision Encoder configuration"""

from typing import List, Optional

from transformers import AutoConfig
from transformers.configuration_utils import PretrainedConfig


class HyperCLOVAXSeedVisionEncoderConfig(PretrainedConfig):
    r"""
    Configuration class for the HyperCLOVAX-SEED Vision Encoder.

    Args:
        depth (`int`, *optional*, defaults to 32):
            Number of transformer blocks.
        hidden_size (`int`, *optional*, defaults to 1280):
            Dimension of the hidden representations.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            Activation function used in the MLP blocks.
        intermediate_size (`int`, *optional*, defaults to 3420):
            Dimension of the MLP intermediate representations.
        num_heads (`int`, *optional*, defaults to 16):
            Number of attention heads.
        in_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        patch_size (`int`, *optional*, defaults to 14):
            Spatial patch size (height and width).
        spatial_merge_size (`int`, *optional*, defaults to 2):
            Number of patches to merge spatially before the language model.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            Number of frames to merge per temporal patch.
        tokens_per_second (`int`, *optional*, defaults to 4):
            Number of temporal tokens representing one second of video.
        window_size (`int`, *optional*, defaults to 112):
            Window size (in pixels) for window attention blocks.
        out_hidden_size (`int`, *optional*, defaults to 3584):
            Output hidden size after the patch merger (should match the LLM hidden size).
        fullatt_block_indexes (`List[int]`, *optional*, defaults to `[7, 15, 23, 31]`):
            Indices of transformer blocks that use full (global) attention. All other
            blocks use window attention.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for weight initialization.
        disable_merger (`bool`, *optional*, defaults to `False`):
            Whether to disable the patch merger (presumably causing the encoder to emit
            unmerged per-patch features; confirm against the encoder implementation).

    ```python
    >>> from transformers import AutoConfig

    >>> configuration = AutoConfig.from_pretrained("naver-hyperclovax/HyperCLOVAX-SEED-Think-4B")
    ```
    """

    model_type = "hyperclovax_seed_vision_encoder"

    def __init__(
        self,
        depth: int = 32,
        hidden_size: int = 1280,
        hidden_act: str = "silu",
        intermediate_size: int = 3420,
        num_heads: int = 16,
        in_channels: int = 3,
        patch_size: int = 14,
        spatial_merge_size: int = 2,
        temporal_patch_size: int = 2,
        tokens_per_second: int = 4,
        window_size: int = 112,
        out_hidden_size: int = 3584,
        fullatt_block_indexes: Optional[List[int]] = None,
        initializer_range: float = 0.02,
        disable_merger: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        self.out_hidden_size = out_hidden_size
        # Avoid a mutable default argument: the default full-attention block
        # indices are materialized here instead of in the signature.
        self.fullatt_block_indexes = fullatt_block_indexes if fullatt_block_indexes is not None else [7, 15, 23, 31]
        self.initializer_range = initializer_range
        self.disable_merger = disable_merger


# Make the config discoverable via AutoConfig.from_pretrained for checkpoints
# whose config.json declares this model_type.
AutoConfig.register("hyperclovax_seed_vision_encoder", HyperCLOVAXSeedVisionEncoderConfig)

__all__ = ["HyperCLOVAXSeedVisionEncoderConfig"]