# HyperCLOVAX-SEED-Think-4B / configuration_hyperclovax_seed_vision_encoder.py
# (Hugging Face Hub page header: uploaded by bigshanedogg via huggingface_hub,
#  revision 0c1d6f8, verified)
# coding=utf-8
# Copyright 2026 NAVER Cloud Corp. and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""HyperCLOVAX-SEED Vision Encoder configuration"""
from typing import List, Optional
from transformers import AutoConfig
from transformers.configuration_utils import PretrainedConfig
class HyperCLOVAXSeedVisionEncoderConfig(PretrainedConfig):
    r"""
    Configuration class for the HyperCLOVAX-SEED Vision Encoder.

    Args:
        depth (`int`, *optional*, defaults to 32):
            Number of transformer blocks.
        hidden_size (`int`, *optional*, defaults to 1280):
            Dimension of the hidden representations.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            Activation function used in the MLP blocks.
        intermediate_size (`int`, *optional*, defaults to 3420):
            Dimension of the MLP intermediate representations.
        num_heads (`int`, *optional*, defaults to 16):
            Number of attention heads.
        in_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        patch_size (`int`, *optional*, defaults to 14):
            Spatial patch size (height and width).
        spatial_merge_size (`int`, *optional*, defaults to 2):
            Number of patches to merge spatially before the language model.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            Number of frames to merge per temporal patch.
        tokens_per_second (`int`, *optional*, defaults to 4):
            Number of temporal tokens representing one second of video.
        window_size (`int`, *optional*, defaults to 112):
            Window size (in pixels) for window attention blocks.
        out_hidden_size (`int`, *optional*, defaults to 3584):
            Output hidden size after the patch merger (should match the LLM hidden size).
        fullatt_block_indexes (`List[int]`, *optional*, defaults to `[7, 15, 23, 31]`):
            Indices of transformer blocks that use full (global) attention.
            All other blocks use window attention. Passing `None` selects the
            default `[7, 15, 23, 31]`.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for weight initialization.
        disable_merger (`bool`, *optional*, defaults to `False`):
            Whether to disable the patch merger stage (presumably skipping the
            spatial merge before the language model — confirm against the
            model implementation).

    ```python
    >>> from transformers import AutoConfig

    >>> configuration = AutoConfig.from_pretrained("naver-hyperclovax/HyperCLOVAX-SEED-Think-4B")
    ```
    """

    # Identifier used for AutoConfig dispatch (see the register call at module level).
    model_type = "hyperclovax_seed_vision_encoder"

    def __init__(
        self,
        depth: int = 32,
        hidden_size: int = 1280,
        hidden_act: str = "silu",
        intermediate_size: int = 3420,
        num_heads: int = 16,
        in_channels: int = 3,
        patch_size: int = 14,
        spatial_merge_size: int = 2,
        temporal_patch_size: int = 2,
        tokens_per_second: int = 4,
        window_size: int = 112,
        out_hidden_size: int = 3584,
        fullatt_block_indexes: Optional[List[int]] = None,
        initializer_range: float = 0.02,
        disable_merger: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        self.out_hidden_size = out_hidden_size
        # `None` sentinel avoids a shared mutable default argument; resolve it
        # to the documented default block indices here.
        self.fullatt_block_indexes = fullatt_block_indexes if fullatt_block_indexes is not None else [7, 15, 23, 31]
        self.initializer_range = initializer_range
        self.disable_merger = disable_merger
# Make this config discoverable via AutoConfig.from_pretrained by mapping its
# `model_type` string to the class. NOTE(review): re-importing this module in
# the same process would raise on duplicate registration — confirm whether
# `exist_ok=True` is desired for the installed transformers version.
AutoConfig.register("hyperclovax_seed_vision_encoder", HyperCLOVAXSeedVisionEncoderConfig)

# Public API of this module.
__all__ = ["HyperCLOVAXSeedVisionEncoderConfig"]