# HyperCLOVAX-SEED-Think-4B / configuration_hyperclovax_seed_vision_v2.py
# Uploaded by bigshanedogg using huggingface_hub (commit 0c1d6f8, verified)
# coding=utf-8
# Copyright 2026 NAVER Cloud Corp. and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""HyperCLOVAX-Vision-V2 multimodal model configuration"""
from enum import Enum
from typing import Dict, List, Optional, Union
from transformers import (
AutoConfig,
CLIPVisionConfig,
LlamaConfig,
PretrainedConfig,
Qwen2AudioEncoderConfig,
SiglipVisionConfig,
WhisperConfig,
)
try:
from transformers import Qwen2_5_VLVisionConfig
except ImportError:
from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
from .configuration_hyperclovax_seed_audio_encoder import HyperCLOVAXSeedAudioEncoderConfig
from .configuration_hyperclovax_seed_vision_encoder import HyperCLOVAXSeedVisionEncoderConfig
class ProjectorType(str, Enum):
    """Projector (connector) types shared by vision and audio branches.

    Inherits from ``str`` so members compare equal to their plain string
    values (e.g. ``ProjectorType.MLP == "mlp"``) and serialize directly
    into config JSON.
    """

    LINEAR = "linear"  # single linear projection
    MLP = "mlp"  # multi-layer perceptron projector (default for vision/audio)
    INVERTED_MLP = "inverted_mlp"
    CABSTRACTOR = "cabstractor"  # presumably a C-Abstractor-style projector — confirm against model code
    PATCH_MERGER = "patch_merger"
class HyperCLOVAXVisionV2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`HyperCLOVAXVisionV2ForCausalLM`]. It is used to
    instantiate a HyperCLOVAX-Vision-V2 multimodal model according to the specified arguments, defining the model
    architecture including text, vision, and audio components.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict` or [`PretrainedConfig`], *optional*):
            Configuration for the text backbone model. Accepts a `LlamaConfig`.
        vision_config (`dict` or [`PretrainedConfig`], *optional*):
            Configuration for the continuous vision encoder.
        audio_config (`dict` or [`PretrainedConfig`], *optional*):
            Configuration for the continuous audio encoder.
        vision_projector_type (`str`, *optional*, defaults to `"mlp"`):
            Type of the multimodal projector for vision features.
        audio_projector_type (`str`, *optional*, defaults to `"mlp"`):
            Type of the projector for audio features.
        vision_feature_layer (`int`, *optional*, defaults to -2):
            Index of the vision encoder layer to extract features from.
        discrete_image_unit_0_id (`int`, *optional*, defaults to 135166):
            Token id for `<|vision00000|>`, the first discrete vision unit token.
        discrete_audio_unit_0_id (`int`, *optional*, defaults to 128604):
            Token id for `<|audio0000|>`, the first discrete audio unit token.
        anyres (`bool`, *optional*, defaults to `False`):
            Whether to use any-resolution image processing.
        unpad (`bool`, *optional*, defaults to `False`):
            Whether to remove padding from image features.
        max_num_grids (`int`, *optional*, defaults to -1):
            Maximum number of grids for any-resolution processing. -1 means no limit.
        num_queries_vis_abstractor (`int`, *optional*, defaults to -1):
            Number of query tokens for the visual abstractor. -1 means disabled.
        video_num_queries_fast (`int`, *optional*):
            Number of query tokens for fast video frames.
        video_num_queries_slow (`int`, *optional*):
            Number of query tokens for slow video frames.
        video_first_last_frames_slows (`int`, *optional*):
            Number of first/last frames to process as slow frames.
        video_max_num_frames (`int`, *optional*):
            Maximum number of video frames to process.
        ignore_index (`int`, *optional*, defaults to -100):
            The index to ignore in loss computation.
        proj_pos_emb (`bool`, *optional*, defaults to `True`):
            Whether to use positional embeddings in the projector.
        proj_prenorm (`bool`, *optional*, defaults to `False`):
            Whether to apply pre-normalization in the projector.
        use_1x1_grid (`bool`, *optional*, defaults to `False`):
            Whether to use 1x1 grid for single-image processing.
        possible_resolutions (`List[List[int]]`, *optional*):
            List of possible resolutions `[height, width]` for any-resolution processing.
        Additional keyword arguments (e.g. `discrete_vision_config`, `discrete_audio_config`,
        `*_model_name_or_path`, `video_audio_compressor_*`) are forwarded to [`PretrainedConfig`].

    ```python
    >>> from transformers import AutoConfig

    >>> # Initializing a HyperCLOVAX-Vision-V2 configuration from a pretrained checkpoint
    >>> configuration = AutoConfig.from_pretrained("naver-hyperclovax/HyperCLOVAX-SEED-Think-4B")
    ```
    """

    model_type = "hyperclovax_vision_v2"
    keys_to_ignore_at_inference = ["past_key_values"]
    sub_configs = {
        "text_config": AutoConfig,
        "vision_config": AutoConfig,
        "audio_config": AutoConfig,
    }

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs) -> "HyperCLOVAXVisionV2Config":
        output = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
        # With `return_unused_kwargs=True` the parent returns a (config, unused_kwargs)
        # tuple; unwrap so sub-config propagation still happens in that case.
        config = output[0] if isinstance(output, tuple) else output
        # Propagate _name_or_path to sub-configs so that AutoModel.from_config()
        # can resolve dynamic module references (auto_map) from the checkpoint directory.
        for attr in cls.sub_configs:
            sub_config = getattr(config, attr, None)
            if sub_config is not None and hasattr(sub_config, "_name_or_path"):
                sub_config._name_or_path = config._name_or_path
        return output

    @staticmethod
    def _resolve_sub_config(config, class_map, label):
        """Materialize a sub-config given as a plain dict into its config class.

        Non-dict inputs (already-built `PretrainedConfig` instances or `None`)
        are returned untouched. Raises `ValueError` when the dict's
        `model_type` is not present in `class_map`.
        """
        if not isinstance(config, dict):
            return config
        config_cls = class_map.get(config["model_type"])
        if config_cls is None:
            raise ValueError(f'Invalid {label} type: {config["model_type"]}')
        return config_cls(**config)

    def __init__(
        self,
        text_config: Optional[Union[Dict, PretrainedConfig]] = None,
        vision_config: Optional[Union[Dict, PretrainedConfig]] = None,
        audio_config: Optional[Union[Dict, PretrainedConfig]] = None,
        vision_projector_type: str = ProjectorType.MLP,
        audio_projector_type: str = ProjectorType.MLP,
        vision_feature_layer: int = -2,
        discrete_image_unit_0_id: int = 135166,  # <|vision00000|>
        discrete_audio_unit_0_id: int = 128604,  # <|audio0000|>
        anyres: bool = False,
        unpad: bool = False,
        max_num_grids: int = -1,
        num_queries_vis_abstractor: int = -1,
        video_num_queries_fast: Optional[int] = None,
        video_num_queries_slow: Optional[int] = None,
        video_first_last_frames_slows: Optional[int] = None,
        video_max_num_frames: Optional[int] = None,
        ignore_index: int = -100,
        proj_pos_emb: bool = True,
        proj_prenorm: bool = False,
        use_1x1_grid: bool = False,
        possible_resolutions: Optional[List[List[int]]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # text_config: only Llama backbones are supported.
        text_config = self._resolve_sub_config(
            text_config,
            {LlamaConfig.model_type: LlamaConfig},
            "text_config",
        )
        # hidden_size mirrors the text backbone; without one, fall back to an
        # explicit `hidden_size` kwarg or the historical default of 4096.
        if text_config is not None:
            self.hidden_size = text_config.hidden_size
        else:
            self.hidden_size = kwargs.get("hidden_size", 4096)
        self.text_config = text_config

        # audio_config: "whisper_feature_extractor" is a checkpoint-specific
        # model_type alias that maps onto WhisperConfig.
        self.audio_config = self._resolve_sub_config(
            audio_config,
            {
                HyperCLOVAXSeedAudioEncoderConfig.model_type: HyperCLOVAXSeedAudioEncoderConfig,
                Qwen2AudioEncoderConfig.model_type: Qwen2AudioEncoderConfig,
                "whisper_feature_extractor": WhisperConfig,
            },
            "audio_config",
        )

        # vision_config: several continuous vision encoder families are supported.
        self.vision_config = self._resolve_sub_config(
            vision_config,
            {
                CLIPVisionConfig.model_type: CLIPVisionConfig,
                HyperCLOVAXSeedVisionEncoderConfig.model_type: HyperCLOVAXSeedVisionEncoderConfig,
                SiglipVisionConfig.model_type: SiglipVisionConfig,
                Qwen2_5_VLVisionConfig.model_type: Qwen2_5_VLVisionConfig,
            },
            "vision_config",
        )

        # vision-language model config
        self.vision_projector_type = vision_projector_type
        self.audio_projector_type = audio_projector_type
        self.vision_feature_layer = vision_feature_layer
        self.anyres = anyres
        self.unpad = unpad
        self.max_num_grids = max_num_grids
        self.num_queries_vis_abstractor = num_queries_vis_abstractor
        self.video_num_queries_fast = video_num_queries_fast
        self.video_num_queries_slow = video_num_queries_slow
        self.video_first_last_frames_slows = video_first_last_frames_slows
        self.video_max_num_frames = video_max_num_frames
        self.discrete_image_unit_0_id = discrete_image_unit_0_id
        self.discrete_audio_unit_0_id = discrete_audio_unit_0_id
        self.ignore_index = ignore_index
        self.proj_pos_emb = proj_pos_emb
        self.proj_prenorm = proj_prenorm
        self.use_1x1_grid = use_1x1_grid
        self.possible_resolutions = possible_resolutions if possible_resolutions is not None else []

        # needed for HyperCLOVAXVisionForSequenceClassification
        if self.text_config is not None:
            self.pad_token_id = self.text_config.pad_token_id
# Register the custom model_type with AutoConfig so that
# AutoConfig.from_pretrained resolves "hyperclovax_vision_v2" to this class.
AutoConfig.register("hyperclovax_vision_v2", HyperCLOVAXVisionV2Config)
__all__ = ["HyperCLOVAXVisionV2Config", "ProjectorType"]