# HyperCLOVAX-SEED-Think-4B / configuration_hyperclovax_seed_vision_encoder.py
# (Hugging Face Hub page header: uploaded by bigshanedogg via huggingface_hub,
#  revision 0c1d6f8, verified)
# coding=utf-8
# Copyright 2026 NAVER Cloud Corp. and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""HyperCLOVAX-SEED Vision Encoder configuration"""
from typing import List, Optional
from transformers import AutoConfig
from transformers.configuration_utils import PretrainedConfig
class HyperCLOVAXSeedVisionEncoderConfig(PretrainedConfig):
    r"""
    Configuration class for the HyperCLOVAX-SEED Vision Encoder.

    Args:
        depth (`int`, *optional*, defaults to 32):
            Number of transformer blocks.
        hidden_size (`int`, *optional*, defaults to 1280):
            Dimension of the hidden representations.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            Activation function used in the MLP blocks.
        intermediate_size (`int`, *optional*, defaults to 3420):
            Dimension of the MLP intermediate representations.
        num_heads (`int`, *optional*, defaults to 16):
            Number of attention heads.
        in_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        patch_size (`int`, *optional*, defaults to 14):
            Spatial patch size (height and width).
        spatial_merge_size (`int`, *optional*, defaults to 2):
            Number of patches to merge spatially before the language model.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            Number of frames to merge per temporal patch.
        tokens_per_second (`int`, *optional*, defaults to 4):
            Number of temporal tokens representing one second of video.
        window_size (`int`, *optional*, defaults to 112):
            Window size (in pixels) for window attention blocks.
        out_hidden_size (`int`, *optional*, defaults to 3584):
            Output hidden size after the patch merger (should match the LLM hidden size).
        fullatt_block_indexes (`List[int]`, *optional*, defaults to `[7, 15, 23, 31]`):
            Indices of transformer blocks that use full (global) attention.
            All other blocks use window attention. Passing `None` selects the
            default `[7, 15, 23, 31]`.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for weight initialization.
        disable_merger (`bool`, *optional*, defaults to `False`):
            Whether to disable the patch merger stage (presumably skipping the
            spatial merge before the language model — confirm against the
            model implementation).

    ```python
    >>> from transformers import AutoConfig

    >>> configuration = AutoConfig.from_pretrained("naver-hyperclovax/HyperCLOVAX-SEED-Think-4B")
    ```
    """

    # Identifier used for AutoConfig dispatch (see the register call at module level).
    model_type = "hyperclovax_seed_vision_encoder"

    def __init__(
        self,
        depth: int = 32,
        hidden_size: int = 1280,
        hidden_act: str = "silu",
        intermediate_size: int = 3420,
        num_heads: int = 16,
        in_channels: int = 3,
        patch_size: int = 14,
        spatial_merge_size: int = 2,
        temporal_patch_size: int = 2,
        tokens_per_second: int = 4,
        window_size: int = 112,
        out_hidden_size: int = 3584,
        fullatt_block_indexes: Optional[List[int]] = None,
        initializer_range: float = 0.02,
        disable_merger: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        self.out_hidden_size = out_hidden_size
        # `None` sentinel avoids a shared mutable default argument; resolve it
        # to the documented default block indices here.
        self.fullatt_block_indexes = fullatt_block_indexes if fullatt_block_indexes is not None else [7, 15, 23, 31]
        self.initializer_range = initializer_range
        self.disable_merger = disable_merger
# Make this config discoverable via AutoConfig.from_pretrained by mapping its
# `model_type` string to the class. NOTE(review): re-importing this module in
# the same process would raise on duplicate registration — confirm whether
# `exist_ok=True` is desired for the installed transformers version.
AutoConfig.register("hyperclovax_seed_vision_encoder", HyperCLOVAXSeedVisionEncoderConfig)

# Public API of this module.
__all__ = ["HyperCLOVAXSeedVisionEncoderConfig"]