| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """HyperCLOVAX-Vision-V2 multimodal model configuration""" |
|
|
| from enum import Enum |
| from typing import Dict, List, Optional, Union |
|
|
| from transformers import ( |
| AutoConfig, |
| CLIPVisionConfig, |
| LlamaConfig, |
| PretrainedConfig, |
| Qwen2AudioEncoderConfig, |
| SiglipVisionConfig, |
| WhisperConfig, |
| ) |
| try: |
| from transformers import Qwen2_5_VLVisionConfig |
| except ImportError: |
| from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig |
|
|
| from .configuration_hyperclovax_seed_audio_encoder import HyperCLOVAXSeedAudioEncoderConfig |
| from .configuration_hyperclovax_seed_vision_encoder import HyperCLOVAXSeedVisionEncoderConfig |
|
|
|
|
class ProjectorType(str, Enum):
    """Closed set of projector (connector) architectures shared by the vision and audio branches.

    Subclassing ``str`` makes every member compare equal to — and serialize as —
    its plain string value, so configs remain JSON-friendly and callers may pass
    either the enum member or the raw string.
    """

    LINEAR = "linear"
    MLP = "mlp"
    INVERTED_MLP = "inverted_mlp"
    CABSTRACTOR = "cabstractor"
    PATCH_MERGER = "patch_merger"
|
|
|
|
class HyperCLOVAXVisionV2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`HyperCLOVAXVisionV2ForCausalLM`]. It is used to
    instantiate a HyperCLOVAX-Vision-V2 multimodal model according to the specified arguments, defining the model
    architecture including text, vision, and audio components.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict` or [`PretrainedConfig`], *optional*):
            Configuration for the text backbone model. Accepts a `LlamaConfig`.
        vision_config (`dict` or [`PretrainedConfig`], *optional*):
            Configuration for the continuous vision encoder.
        audio_config (`dict` or [`PretrainedConfig`], *optional*):
            Configuration for the continuous audio encoder.
        vision_projector_type (`str`, *optional*, defaults to `"mlp"`):
            Type of the multimodal projector for vision features. See [`ProjectorType`] for known values.
        audio_projector_type (`str`, *optional*, defaults to `"mlp"`):
            Type of the projector for audio features. See [`ProjectorType`] for known values.
        vision_feature_layer (`int`, *optional*, defaults to -2):
            Index of the vision encoder layer to extract features from.
        discrete_image_unit_0_id (`int`, *optional*, defaults to 135166):
            Token id for `<|vision00000|>`, the first discrete vision unit token.
        discrete_audio_unit_0_id (`int`, *optional*, defaults to 128604):
            Token id for `<|audio0000|>`, the first discrete audio unit token.
        anyres (`bool`, *optional*, defaults to `False`):
            Whether to use any-resolution image processing.
        unpad (`bool`, *optional*, defaults to `False`):
            Whether to remove padding from image features.
        max_num_grids (`int`, *optional*, defaults to -1):
            Maximum number of grids for any-resolution processing. -1 means no limit.
        num_queries_vis_abstractor (`int`, *optional*, defaults to -1):
            Number of query tokens for the visual abstractor. -1 means disabled.
        video_num_queries_fast (`int`, *optional*):
            Number of query tokens for fast video frames.
        video_num_queries_slow (`int`, *optional*):
            Number of query tokens for slow video frames.
        video_first_last_frames_slows (`int`, *optional*):
            Number of first/last frames to process as slow frames.
        video_max_num_frames (`int`, *optional*):
            Maximum number of video frames to process.
        ignore_index (`int`, *optional*, defaults to -100):
            The index to ignore in loss computation.
        proj_pos_emb (`bool`, *optional*, defaults to `True`):
            Whether to use positional embeddings in the projector.
        proj_prenorm (`bool`, *optional*, defaults to `False`):
            Whether to apply pre-normalization in the projector.
        use_1x1_grid (`bool`, *optional*, defaults to `False`):
            Whether to use 1x1 grid for single-image processing.
        possible_resolutions (`List[List[int]]`, *optional*):
            List of possible resolutions `[height, width]` for any-resolution processing.

    <Tip>

    Extra options such as `discrete_vision_config`, `discrete_audio_config`, the various
    `*_model_name_or_path` entries, and `video_audio_compressor_*` are *not* explicit
    `__init__` parameters of this class. When present in a checkpoint's `config.json`
    they travel through `**kwargs` and are stored as plain attributes by
    [`PretrainedConfig`].

    </Tip>

    ```python
    >>> from transformers import AutoConfig

    >>> # Initializing a HyperCLOVAX-Vision-V2 configuration from a pretrained checkpoint
    >>> configuration = AutoConfig.from_pretrained("naver-hyperclovax/HyperCLOVAX-SEED-Think-4B")
    ```
    """

    model_type = "hyperclovax_vision_v2"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Sub-config attributes that `from_pretrained` patches with the checkpoint path.
    sub_configs = {
        "text_config": AutoConfig,
        "vision_config": AutoConfig,
        "audio_config": AutoConfig,
    }

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs) -> "HyperCLOVAXVisionV2Config":
        """Load the config and propagate the checkpoint path into every sub-config.

        Fix over the previous implementation: when callers pass
        `return_unused_kwargs=True`, `PretrainedConfig.from_pretrained` returns a
        `(config, unused_kwargs)` tuple; the old code then silently skipped the
        sub-config fix-up (a tuple has none of the sub-config attributes). We unwrap
        the tuple before patching and return the parent's result unchanged, so both
        call styles keep working.
        """
        output = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
        config = output[0] if isinstance(output, tuple) else output

        # Point each present sub-config at the same checkpoint so per-branch
        # weights can be resolved relative to the top-level model path.
        for attr in cls.sub_configs:
            sub_config = getattr(config, attr, None)
            if sub_config is not None and hasattr(sub_config, "_name_or_path"):
                sub_config._name_or_path = config._name_or_path
        return output

    def __init__(
        self,
        text_config: Optional[Union[Dict, PretrainedConfig]] = None,
        vision_config: Optional[Union[Dict, PretrainedConfig]] = None,
        audio_config: Optional[Union[Dict, PretrainedConfig]] = None,
        vision_projector_type: str = ProjectorType.MLP,
        audio_projector_type: str = ProjectorType.MLP,
        vision_feature_layer: int = -2,
        discrete_image_unit_0_id: int = 135166,
        discrete_audio_unit_0_id: int = 128604,
        anyres: bool = False,
        unpad: bool = False,
        max_num_grids: int = -1,
        num_queries_vis_abstractor: int = -1,
        video_num_queries_fast: Optional[int] = None,
        video_num_queries_slow: Optional[int] = None,
        video_first_last_frames_slows: Optional[int] = None,
        video_max_num_frames: Optional[int] = None,
        ignore_index: int = -100,
        proj_pos_emb: bool = True,
        proj_prenorm: bool = False,
        use_1x1_grid: bool = False,
        possible_resolutions: Optional[List[List[int]]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # --- Text backbone -------------------------------------------------
        # Only Llama-family text configs are supported; dicts are materialized
        # into LlamaConfig, anything else is rejected early.
        if isinstance(text_config, dict):
            if text_config["model_type"] == LlamaConfig.model_type:
                text_config = LlamaConfig(**text_config)
            else:
                raise ValueError(f'Invalid text_config type: {text_config["model_type"]}')
        if text_config is not None:
            self.hidden_size = text_config.hidden_size
        else:
            # No text backbone: fall back to an explicit `hidden_size` kwarg, or 4096.
            self.hidden_size = kwargs.get("hidden_size", 4096)
        self.text_config = text_config

        # --- Audio encoder -------------------------------------------------
        # Dispatch a dict config to the matching encoder config class; the
        # "whisper_feature_extractor" model_type is mapped onto WhisperConfig.
        if isinstance(audio_config, dict):
            if audio_config["model_type"] == HyperCLOVAXSeedAudioEncoderConfig.model_type:
                audio_config = HyperCLOVAXSeedAudioEncoderConfig(**audio_config)
            elif audio_config["model_type"] == Qwen2AudioEncoderConfig.model_type:
                audio_config = Qwen2AudioEncoderConfig(**audio_config)
            elif audio_config["model_type"] == "whisper_feature_extractor":
                audio_config = WhisperConfig(**audio_config)
            else:
                raise ValueError(f'Invalid audio_config type: {audio_config["model_type"]}')
        self.audio_config = audio_config

        # --- Vision encoder ------------------------------------------------
        if isinstance(vision_config, dict):
            if vision_config["model_type"] == CLIPVisionConfig.model_type:
                vision_config = CLIPVisionConfig(**vision_config)
            elif vision_config["model_type"] == HyperCLOVAXSeedVisionEncoderConfig.model_type:
                vision_config = HyperCLOVAXSeedVisionEncoderConfig(**vision_config)
            elif vision_config["model_type"] == SiglipVisionConfig.model_type:
                vision_config = SiglipVisionConfig(**vision_config)
            elif vision_config["model_type"] == Qwen2_5_VLVisionConfig.model_type:
                vision_config = Qwen2_5_VLVisionConfig(**vision_config)
            else:
                raise ValueError(f'Invalid vision_config type: {vision_config["model_type"]}')
        self.vision_config = vision_config

        # --- Projector / feature-extraction options ------------------------
        self.vision_projector_type = vision_projector_type
        self.audio_projector_type = audio_projector_type
        self.vision_feature_layer = vision_feature_layer
        self.anyres = anyres
        self.unpad = unpad
        self.max_num_grids = max_num_grids
        self.num_queries_vis_abstractor = num_queries_vis_abstractor
        self.video_num_queries_fast = video_num_queries_fast
        self.video_num_queries_slow = video_num_queries_slow
        self.video_first_last_frames_slows = video_first_last_frames_slows
        self.video_max_num_frames = video_max_num_frames

        # First token ids of the discrete vision/audio unit vocabularies.
        self.discrete_image_unit_0_id = discrete_image_unit_0_id
        self.discrete_audio_unit_0_id = discrete_audio_unit_0_id

        self.ignore_index = ignore_index
        self.proj_pos_emb = proj_pos_emb
        self.proj_prenorm = proj_prenorm
        self.use_1x1_grid = use_1x1_grid
        self.possible_resolutions = possible_resolutions if possible_resolutions is not None else []

        # Mirror the text backbone's pad token so generation utilities that read
        # `config.pad_token_id` see the right value.
        if self.text_config is not None:
            self.pad_token_id = self.text_config.pad_token_id
|
|
|
|
# Make this config discoverable through `AutoConfig.from_pretrained` for checkpoints
# whose config.json declares `"model_type": "hyperclovax_vision_v2"`.
AutoConfig.register("hyperclovax_vision_v2", HyperCLOVAXVisionV2Config)

# Explicit public API of this module.
__all__ = ["HyperCLOVAXVisionV2Config", "ProjectorType"]
|
|