| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """HyperCLOVAX-Vision-V2 multimodal model configuration""" |
|
|
| from enum import Enum |
| from typing import Dict, List, Optional, Union |
|
|
| from transformers import ( |
| AutoConfig, |
| CLIPVisionConfig, |
| LlamaConfig, |
| PretrainedConfig, |
| Qwen2AudioEncoderConfig, |
| SiglipVisionConfig, |
| WhisperConfig, |
| ) |
| try: |
| from transformers import Qwen2_5_VLVisionConfig |
| except ImportError: |
| from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig |
|
|
| from .configuration_hyperclovax_seed_audio_encoder import HyperCLOVAXSeedAudioEncoderConfig |
| from .configuration_hyperclovax_seed_vision_encoder import HyperCLOVAXSeedVisionEncoderConfig |
|
|
|
|
class ProjectorType(str, Enum):
    """Closed set of projector (connector) architectures shared by the vision and audio branches.

    Subclassing ``str`` makes every member compare equal to — and serialize as —
    its plain string value, so configs remain JSON-friendly and callers may pass
    either the enum member or the raw string.
    """

    LINEAR = "linear"
    MLP = "mlp"
    INVERTED_MLP = "inverted_mlp"
    CABSTRACTOR = "cabstractor"
    PATCH_MERGER = "patch_merger"
|
|
|
|
class HyperCLOVAXVisionV2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`HyperCLOVAXVisionV2ForCausalLM`]. It is used to
    instantiate a HyperCLOVAX-Vision-V2 multimodal model according to the specified arguments, defining the model
    architecture including text, vision, and audio components.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict` or [`PretrainedConfig`], *optional*):
            Configuration for the text backbone model. Accepts a `LlamaConfig`.
        vision_config (`dict` or [`PretrainedConfig`], *optional*):
            Configuration for the continuous vision encoder.
        audio_config (`dict` or [`PretrainedConfig`], *optional*):
            Configuration for the continuous audio encoder.
        vision_projector_type (`str`, *optional*, defaults to `"mlp"`):
            Type of the multimodal projector for vision features. See [`ProjectorType`] for known values.
        audio_projector_type (`str`, *optional*, defaults to `"mlp"`):
            Type of the projector for audio features. See [`ProjectorType`] for known values.
        vision_feature_layer (`int`, *optional*, defaults to -2):
            Index of the vision encoder layer to extract features from.
        discrete_image_unit_0_id (`int`, *optional*, defaults to 135166):
            Token id for `<|vision00000|>`, the first discrete vision unit token.
        discrete_audio_unit_0_id (`int`, *optional*, defaults to 128604):
            Token id for `<|audio0000|>`, the first discrete audio unit token.
        anyres (`bool`, *optional*, defaults to `False`):
            Whether to use any-resolution image processing.
        unpad (`bool`, *optional*, defaults to `False`):
            Whether to remove padding from image features.
        max_num_grids (`int`, *optional*, defaults to -1):
            Maximum number of grids for any-resolution processing. -1 means no limit.
        num_queries_vis_abstractor (`int`, *optional*, defaults to -1):
            Number of query tokens for the visual abstractor. -1 means disabled.
        video_num_queries_fast (`int`, *optional*):
            Number of query tokens for fast video frames.
        video_num_queries_slow (`int`, *optional*):
            Number of query tokens for slow video frames.
        video_first_last_frames_slows (`int`, *optional*):
            Number of first/last frames to process as slow frames.
        video_max_num_frames (`int`, *optional*):
            Maximum number of video frames to process.
        ignore_index (`int`, *optional*, defaults to -100):
            The index to ignore in loss computation.
        proj_pos_emb (`bool`, *optional*, defaults to `True`):
            Whether to use positional embeddings in the projector.
        proj_prenorm (`bool`, *optional*, defaults to `False`):
            Whether to apply pre-normalization in the projector.
        use_1x1_grid (`bool`, *optional*, defaults to `False`):
            Whether to use 1x1 grid for single-image processing.
        possible_resolutions (`List[List[int]]`, *optional*):
            List of possible resolutions `[height, width]` for any-resolution processing.

    <Tip>

    Extra options such as `discrete_vision_config`, `discrete_audio_config`, the various
    `*_model_name_or_path` entries, and `video_audio_compressor_*` are *not* explicit
    `__init__` parameters of this class. When present in a checkpoint's `config.json`
    they travel through `**kwargs` and are stored as plain attributes by
    [`PretrainedConfig`].

    </Tip>

    ```python
    >>> from transformers import AutoConfig

    >>> # Initializing a HyperCLOVAX-Vision-V2 configuration from a pretrained checkpoint
    >>> configuration = AutoConfig.from_pretrained("naver-hyperclovax/HyperCLOVAX-SEED-Think-4B")
    ```
    """

    model_type = "hyperclovax_vision_v2"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Sub-config attributes that `from_pretrained` patches with the checkpoint path.
    sub_configs = {
        "text_config": AutoConfig,
        "vision_config": AutoConfig,
        "audio_config": AutoConfig,
    }

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs) -> "HyperCLOVAXVisionV2Config":
        """Load the config and propagate the checkpoint path into every sub-config.

        Fix over the previous implementation: when callers pass
        `return_unused_kwargs=True`, `PretrainedConfig.from_pretrained` returns a
        `(config, unused_kwargs)` tuple; the old code then silently skipped the
        sub-config fix-up (a tuple has none of the sub-config attributes). We unwrap
        the tuple before patching and return the parent's result unchanged, so both
        call styles keep working.
        """
        output = super().from_pretrained(pretrained_model_name_or_path, **kwargs)
        config = output[0] if isinstance(output, tuple) else output

        # Point each present sub-config at the same checkpoint so per-branch
        # weights can be resolved relative to the top-level model path.
        for attr in cls.sub_configs:
            sub_config = getattr(config, attr, None)
            if sub_config is not None and hasattr(sub_config, "_name_or_path"):
                sub_config._name_or_path = config._name_or_path
        return output

    def __init__(
        self,
        text_config: Optional[Union[Dict, PretrainedConfig]] = None,
        vision_config: Optional[Union[Dict, PretrainedConfig]] = None,
        audio_config: Optional[Union[Dict, PretrainedConfig]] = None,
        vision_projector_type: str = ProjectorType.MLP,
        audio_projector_type: str = ProjectorType.MLP,
        vision_feature_layer: int = -2,
        discrete_image_unit_0_id: int = 135166,
        discrete_audio_unit_0_id: int = 128604,
        anyres: bool = False,
        unpad: bool = False,
        max_num_grids: int = -1,
        num_queries_vis_abstractor: int = -1,
        video_num_queries_fast: Optional[int] = None,
        video_num_queries_slow: Optional[int] = None,
        video_first_last_frames_slows: Optional[int] = None,
        video_max_num_frames: Optional[int] = None,
        ignore_index: int = -100,
        proj_pos_emb: bool = True,
        proj_prenorm: bool = False,
        use_1x1_grid: bool = False,
        possible_resolutions: Optional[List[List[int]]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # --- Text backbone -------------------------------------------------
        # Only Llama-family text configs are supported; dicts are materialized
        # into LlamaConfig, anything else is rejected early.
        if isinstance(text_config, dict):
            if text_config["model_type"] == LlamaConfig.model_type:
                text_config = LlamaConfig(**text_config)
            else:
                raise ValueError(f'Invalid text_config type: {text_config["model_type"]}')
        if text_config is not None:
            self.hidden_size = text_config.hidden_size
        else:
            # No text backbone: fall back to an explicit `hidden_size` kwarg, or 4096.
            self.hidden_size = kwargs.get("hidden_size", 4096)
        self.text_config = text_config

        # --- Audio encoder -------------------------------------------------
        # Dispatch a dict config to the matching encoder config class; the
        # "whisper_feature_extractor" model_type is mapped onto WhisperConfig.
        if isinstance(audio_config, dict):
            if audio_config["model_type"] == HyperCLOVAXSeedAudioEncoderConfig.model_type:
                audio_config = HyperCLOVAXSeedAudioEncoderConfig(**audio_config)
            elif audio_config["model_type"] == Qwen2AudioEncoderConfig.model_type:
                audio_config = Qwen2AudioEncoderConfig(**audio_config)
            elif audio_config["model_type"] == "whisper_feature_extractor":
                audio_config = WhisperConfig(**audio_config)
            else:
                raise ValueError(f'Invalid audio_config type: {audio_config["model_type"]}')
        self.audio_config = audio_config

        # --- Vision encoder ------------------------------------------------
        if isinstance(vision_config, dict):
            if vision_config["model_type"] == CLIPVisionConfig.model_type:
                vision_config = CLIPVisionConfig(**vision_config)
            elif vision_config["model_type"] == HyperCLOVAXSeedVisionEncoderConfig.model_type:
                vision_config = HyperCLOVAXSeedVisionEncoderConfig(**vision_config)
            elif vision_config["model_type"] == SiglipVisionConfig.model_type:
                vision_config = SiglipVisionConfig(**vision_config)
            elif vision_config["model_type"] == Qwen2_5_VLVisionConfig.model_type:
                vision_config = Qwen2_5_VLVisionConfig(**vision_config)
            else:
                raise ValueError(f'Invalid vision_config type: {vision_config["model_type"]}')
        self.vision_config = vision_config

        # --- Projector / feature-extraction options ------------------------
        self.vision_projector_type = vision_projector_type
        self.audio_projector_type = audio_projector_type
        self.vision_feature_layer = vision_feature_layer
        self.anyres = anyres
        self.unpad = unpad
        self.max_num_grids = max_num_grids
        self.num_queries_vis_abstractor = num_queries_vis_abstractor
        self.video_num_queries_fast = video_num_queries_fast
        self.video_num_queries_slow = video_num_queries_slow
        self.video_first_last_frames_slows = video_first_last_frames_slows
        self.video_max_num_frames = video_max_num_frames

        # First token ids of the discrete vision/audio unit vocabularies.
        self.discrete_image_unit_0_id = discrete_image_unit_0_id
        self.discrete_audio_unit_0_id = discrete_audio_unit_0_id

        self.ignore_index = ignore_index
        self.proj_pos_emb = proj_pos_emb
        self.proj_prenorm = proj_prenorm
        self.use_1x1_grid = use_1x1_grid
        self.possible_resolutions = possible_resolutions if possible_resolutions is not None else []

        # Mirror the text backbone's pad token so generation utilities that read
        # `config.pad_token_id` see the right value.
        if self.text_config is not None:
            self.pad_token_id = self.text_config.pad_token_id
|
|
|
|
# Make this config discoverable through `AutoConfig.from_pretrained` for checkpoints
# whose config.json declares `"model_type": "hyperclovax_vision_v2"`.
AutoConfig.register("hyperclovax_vision_v2", HyperCLOVAXVisionV2Config)

# Explicit public API of this module.
__all__ = ["HyperCLOVAXVisionV2Config", "ProjectorType"]
|
|