HyperCLOVAX-SEED-CLIP / configuration_hyperclovax_seed_clip.py

Upload folder using huggingface_hub

f2f8be1 verified 7 days ago

6.01 kB

	# coding=utf-8
	# Copyright 2026 NAVER Cloud Corp. and the HuggingFace Inc. team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""HyperCLOVAX SEED CLIP model configuration.

	Config classes for HyperCLOVAX SEED CLIP with vision encoder + SiglipText encoder.
	"""

	from transformers import AutoConfig
	from transformers.configuration_utils import PretrainedConfig
	from transformers.models.siglip.configuration_siglip import SiglipTextConfig
	from transformers.utils import logging

	# Module-level logger from the transformers logging utilities, named after this
	# module; the config classes below use it for their informational messages.
	logger = logging.get_logger(__name__)


	class HyperCLOVAXSeedCLIPVisionConfig(PretrainedConfig):
	    """Configuration of the HyperCLOVAX SEED CLIP vision tower.

	    Stores the hyper-parameters of the vision encoder transformer together
	    with those of the Siglip2-style attention pooling head. Every argument
	    is kept on the instance under an attribute of the same name; unknown
	    keyword arguments are forwarded to `PretrainedConfig`.

	    Args:
	        depth (`int`, optional, defaults to 32):
	            How many transformer blocks the encoder has.
	        hidden_size (`int`, optional, defaults to 1280):
	            Width of the hidden representations.
	        hidden_act (`str`, optional, defaults to `"silu"`):
	            Name of the activation function used in the MLP blocks.
	        intermediate_size (`int`, optional, defaults to 3456):
	            Width of the MLP intermediate representations.
	        num_heads (`int`, optional, defaults to 16):
	            Attention head count.
	        in_channels (`int`, optional, defaults to 3):
	            Channel count of the input images.
	        patch_size (`int`, optional, defaults to 16):
	            Spatial patch size (used for both height and width).
	        spatial_merge_size (`int`, optional, defaults to 2):
	            How many patches are merged spatially.
	        temporal_patch_size (`int`, optional, defaults to 2):
	            How many frames are merged into one temporal patch.
	        tokens_per_second (`int`, optional, defaults to 4):
	            Temporal tokens that represent one second of video.
	        window_size (`int`, optional, defaults to 128):
	            Window size (in pixels) for the window attention blocks.
	        fullatt_block_indexes (`List[int]`, optional, defaults to all blocks):
	            Indices of the transformer blocks that use full (global)
	            attention. When omitted (`None`), it becomes
	            `list(range(depth))`, i.e. full attention in every block.
	        initializer_range (`float`, optional, defaults to 0.02):
	            Standard deviation used for weight initialization.
	        attn_pool_heads (`int`, optional, defaults to 16):
	            Attention head count of the pooling head.
	        attn_pool_mlp_ratio (`float`, optional, defaults to 4.0):
	            MLP expansion ratio of the pooling head.
	        attn_implementation (`str`, optional, defaults to `"eager"`):
	            Attention implementation (`"eager"`, `"sdpa"`, or `"flash_attention_2"`).
	    """

	    model_type = "hyperclovax_seed_clip_vision"

	    def __init__(
	        self,
	        depth=32,
	        hidden_size=1280,
	        hidden_act="silu",
	        intermediate_size=3456,
	        num_heads=16,
	        in_channels=3,
	        patch_size=16,
	        spatial_merge_size=2,
	        temporal_patch_size=2,
	        tokens_per_second=4,
	        window_size=128,
	        fullatt_block_indexes=None,
	        initializer_range=0.02,
	        attn_pool_heads=16,
	        attn_pool_mlp_ratio=4.0,
	        attn_implementation="eager",
	        **kwargs,
	    ):
	        # Let the base class consume the generic keyword arguments first.
	        super().__init__(**kwargs)

	        # Transformer trunk.
	        self.depth = depth
	        self.hidden_size = hidden_size
	        self.hidden_act = hidden_act
	        self.intermediate_size = intermediate_size
	        self.num_heads = num_heads

	        # Patch embedding / tokenization of images and video frames.
	        self.in_channels = in_channels
	        self.patch_size = patch_size
	        self.spatial_merge_size = spatial_merge_size
	        self.temporal_patch_size = temporal_patch_size
	        self.tokens_per_second = tokens_per_second

	        # Attention layout: no explicit index list means every block is listed.
	        self.window_size = window_size
	        if fullatt_block_indexes is None:
	            fullatt_block_indexes = list(range(depth))
	        self.fullatt_block_indexes = fullatt_block_indexes

	        # Weight initialization.
	        self.initializer_range = initializer_range

	        # Attention pooling head.
	        self.attn_pool_heads = attn_pool_heads
	        self.attn_pool_mlp_ratio = attn_pool_mlp_ratio

	        # Attention backend selector, stored verbatim.
	        self.attn_implementation = attn_implementation


	# Register the vision config class with AutoConfig under its `model_type` string.
	# NOTE(review): presumably needed so the nested vision config can be resolved by
	# model type when loading; only the vision config is registered here — confirm
	# how the top-level config is expected to be resolved.
	AutoConfig.register("hyperclovax_seed_clip_vision", HyperCLOVAXSeedCLIPVisionConfig)


	class HyperCLOVAXSeedCLIPConfig(PretrainedConfig):
	    """Composite configuration for HyperCLOVAX SEED CLIP.

	    Bundles a `SiglipTextConfig` (text encoder) with a
	    `HyperCLOVAXSeedCLIPVisionConfig` (vision encoder).

	    Args:
	        text_config (`dict` or `SiglipTextConfig`, optional):
	            Keyword arguments used to build a `SiglipTextConfig`, or an
	            already-built `SiglipTextConfig`, which is stored as-is (not
	            copied). `None` builds one from the class defaults.
	        vision_config (`dict` or `HyperCLOVAXSeedCLIPVisionConfig`, optional):
	            Keyword arguments used to build a
	            `HyperCLOVAXSeedCLIPVisionConfig`, or an already-built instance,
	            which is stored as-is (not copied). `None` builds one from the
	            class defaults.
	        **kwargs:
	            Forwarded to `PretrainedConfig.__init__`.
	    """

	    model_type = "hyperclovax_seed_clip"
	    sub_configs = {
	        "text_config": SiglipTextConfig,
	        "vision_config": HyperCLOVAXSeedCLIPVisionConfig,
	    }

	    def __init__(self, text_config=None, vision_config=None, **kwargs):
	        super().__init__(**kwargs)

	        if text_config is None:
	            text_config = {}
	            logger.info("text_config is None. Using SiglipTextConfig defaults.")
	        if vision_config is None:
	            vision_config = {}
	            logger.info("vision_config is None. Using HyperCLOVAXSeedCLIPVisionConfig defaults.")

	        # Accept ready-made sub-config objects as well as plain mappings:
	        # unpacking a config instance with `**` would raise TypeError, so
	        # instances of the expected class are kept untouched and everything
	        # else goes through the keyword-unpacking path.
	        if not isinstance(text_config, SiglipTextConfig):
	            text_config = SiglipTextConfig(**text_config)
	        if not isinstance(vision_config, HyperCLOVAXSeedCLIPVisionConfig):
	            vision_config = HyperCLOVAXSeedCLIPVisionConfig(**vision_config)

	        self.text_config = text_config
	        self.vision_config = vision_config
	        # Assigned unconditionally after the base-class init, so a freshly
	        # constructed config always ends up with a factor of 1.0.
	        self.initializer_factor = 1.0

	    @classmethod
	    def from_text_vision_configs(
	        cls,
	        text_config: SiglipTextConfig,
	        vision_config: HyperCLOVAXSeedCLIPVisionConfig,
	        **kwargs,
	    ):
	        """Build a composite config from existing text and vision config objects.

	        Both sub-configs are converted with `to_dict()` before being handed
	        to the constructor, so the returned config holds newly built
	        sub-config objects rather than the instances passed in.

	        Args:
	            text_config (`SiglipTextConfig`): Source text configuration.
	            vision_config (`HyperCLOVAXSeedCLIPVisionConfig`): Source vision configuration.
	            **kwargs: Extra keyword arguments forwarded to the constructor.

	        Returns:
	            An instance of `cls` built from the two sub-configs.
	        """
	        return cls(
	            text_config=text_config.to_dict(),
	            vision_config=vision_config.to_dict(),
	            **kwargs,
	        )