# HyperCLOVAX-SEED-CLIP / configuration_hyperclovax_seed_clip.py
# (uploaded by bigshanedogg via huggingface_hub; revision f2f8be1, verified)
# coding=utf-8
# Copyright 2026 NAVER Cloud Corp. and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""HyperCLOVAX SEED CLIP model configuration.
Config classes for HyperCLOVAX SEED CLIP with vision encoder + SiglipText encoder.
"""
from transformers import AutoConfig
from transformers.configuration_utils import PretrainedConfig
from transformers.models.siglip.configuration_siglip import SiglipTextConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class HyperCLOVAXSeedCLIPVisionConfig(PretrainedConfig):
    """Configuration for the HyperCLOVAX SEED CLIP vision tower.

    Holds the hyperparameters of the patch-based vision transformer as well as
    the Siglip2-style attention-pooling head stacked on top of it.

    Args:
        depth (`int`, *optional*, defaults to 32):
            Number of transformer blocks.
        hidden_size (`int`, *optional*, defaults to 1280):
            Dimension of the hidden representations.
        hidden_act (`str`, *optional*, defaults to `"silu"`):
            Activation function used in the MLP blocks.
        intermediate_size (`int`, *optional*, defaults to 3456):
            Dimension of the MLP intermediate representations.
        num_heads (`int`, *optional*, defaults to 16):
            Number of attention heads.
        in_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        patch_size (`int`, *optional*, defaults to 16):
            Spatial patch size (height and width).
        spatial_merge_size (`int`, *optional*, defaults to 2):
            Number of patches to merge spatially.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            Number of frames to merge per temporal patch.
        tokens_per_second (`int`, *optional*, defaults to 4):
            Number of temporal tokens representing one second of video.
        window_size (`int`, *optional*, defaults to 128):
            Window size (in pixels) for window-attention blocks.
        fullatt_block_indexes (`List[int]`, *optional*):
            Indices of transformer blocks that use full (global) attention.
            When `None`, every block uses full attention.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for weight initialization.
        attn_pool_heads (`int`, *optional*, defaults to 16):
            Number of attention heads in the pooling head.
        attn_pool_mlp_ratio (`float`, *optional*, defaults to 4.0):
            MLP expansion ratio in the pooling head.
        attn_implementation (`str`, *optional*, defaults to `"eager"`):
            Attention implementation (`"eager"`, `"sdpa"`, or `"flash_attention_2"`).
            NOTE(review): this is stored as a plain attribute in addition to
            whatever `PretrainedConfig` does with attention kwargs — confirm
            the modeling code reads this attribute rather than
            `_attn_implementation`.
    """

    model_type = "hyperclovax_seed_clip_vision"

    def __init__(
        self,
        depth=32,
        hidden_size=1280,
        hidden_act="silu",
        intermediate_size=3456,
        num_heads=16,
        in_channels=3,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
        tokens_per_second=4,
        window_size=128,
        fullatt_block_indexes=None,
        initializer_range=0.02,
        attn_pool_heads=16,
        attn_pool_mlp_ratio=4.0,
        attn_implementation="eager",
        **kwargs,
    ):
        # Let PretrainedConfig consume the generic kwargs first.
        super().__init__(**kwargs)

        if fullatt_block_indexes is None:
            # Default: every block attends globally (no window attention).
            fullatt_block_indexes = list(range(depth))

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        self.fullatt_block_indexes = fullatt_block_indexes
        self.initializer_range = initializer_range
        self.attn_pool_heads = attn_pool_heads
        self.attn_pool_mlp_ratio = attn_pool_mlp_ratio
        self.attn_implementation = attn_implementation
# Register the vision config under its model_type so AutoConfig can resolve it
# (module-level side effect: runs once on import).
AutoConfig.register("hyperclovax_seed_clip_vision", HyperCLOVAXSeedCLIPVisionConfig)
class HyperCLOVAXSeedCLIPConfig(PretrainedConfig):
    """Top-level configuration for HyperCLOVAX SEED CLIP.

    Combines a `SiglipTextConfig` (text tower) with a
    `HyperCLOVAXSeedCLIPVisionConfig` (vision tower).

    Args:
        text_config (`dict` or `SiglipTextConfig`, *optional*):
            Text-tower configuration. `None` falls back to `SiglipTextConfig`
            defaults.
        vision_config (`dict` or `HyperCLOVAXSeedCLIPVisionConfig`, *optional*):
            Vision-tower configuration. `None` falls back to
            `HyperCLOVAXSeedCLIPVisionConfig` defaults.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            Multiplicative factor applied to weight-initialization std.
    """

    model_type = "hyperclovax_seed_clip"
    sub_configs = {"text_config": SiglipTextConfig, "vision_config": HyperCLOVAXSeedCLIPVisionConfig}

    def __init__(self, text_config=None, vision_config=None, initializer_factor=1.0, **kwargs):
        super().__init__(**kwargs)
        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Using SiglipTextConfig defaults.")
        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. Using HyperCLOVAXSeedCLIPVisionConfig defaults.")

        # Accept either a plain dict (e.g. deserialized JSON) or an
        # already-constructed config object; the original crashed with
        # `TypeError` on `**config_object` for the latter.
        if isinstance(text_config, SiglipTextConfig):
            self.text_config = text_config
        else:
            self.text_config = SiglipTextConfig(**text_config)
        if isinstance(vision_config, HyperCLOVAXSeedCLIPVisionConfig):
            self.vision_config = vision_config
        else:
            self.vision_config = HyperCLOVAXSeedCLIPVisionConfig(**vision_config)

        # Previously hard-coded to 1.0 AFTER super().__init__(**kwargs), which
        # silently clobbered any `initializer_factor` a caller passed through
        # kwargs. Exposing it as an explicit parameter (default 1.0) is
        # backward-compatible and makes the value configurable.
        self.initializer_factor = initializer_factor

    @classmethod
    def from_text_vision_configs(
        cls,
        text_config: SiglipTextConfig,
        vision_config: HyperCLOVAXSeedCLIPVisionConfig,
        **kwargs,
    ):
        """Instantiate a combined config from separate text and vision configs.

        Args:
            text_config: Configured text tower.
            vision_config: Configured vision tower.
            **kwargs: Forwarded to `__init__` / `PretrainedConfig`.

        Returns:
            `HyperCLOVAXSeedCLIPConfig`: the combined configuration.
        """
        return cls(
            text_config=text_config.to_dict(),
            vision_config=vision_config.to_dict(),
            **kwargs,
        )