| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| """HyperCLOVAX-Vision-V2 multimodal model. |
| |
| Integrates a vision encoder, vision projector, causal language model, and |
| optionally an audio encoder. The published model uses: |
| - Language model: HyperCLOVAX or Llama |
| - Vision encoder: HyperCLOVAXSeedVisionEncoder + PatchMerger projector |
| - Audio encoder: HyperCLOVAXSeedAudioEncoder + MLP projector |
| |
| Acknowledgements: |
| - VLM integration pattern adapted from LLaVA |
| (https://github.com/haotian-liu/LLaVA), Apache-2.0 License. |
| - CAbstractor and weight initialization adapted from Honeybee |
| (https://github.com/kakaobrain/honeybee), Apache-2.0 License. |
| - PatchMerger projector adapted from Qwen2.5-VL |
| (https://github.com/QwenLM/Qwen2.5-VL), Apache-2.0 License. |
| """ |
|
|
| from functools import partial |
| from typing import Any, Dict, List, Optional, Tuple, Type, Union |
|
|
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
|
|
| try: |
| from einops import rearrange |
| from timm.layers import LayerNorm, LayerNorm2d |
| from timm.models.regnet import RegStage |
| except ImportError: |
| pass |
|
|
| from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, PretrainedConfig |
| from transformers.modeling_utils import PreTrainedModel |
| from transformers.cache_utils import Cache |
| from transformers.generation import GenerationMixin |
| from transformers.modeling_outputs import ( |
| BaseModelOutputWithPast, |
| CausalLMOutputWithPast, |
| SequenceClassifierOutputWithPast, |
| ) |
|
|
| from .configuration_hyperclovax_seed_vision_v2 import HyperCLOVAXVisionV2Config, ProjectorType |
| from .configuration_hyperclovax_seed_vision_encoder import HyperCLOVAXSeedVisionEncoderConfig |
|
|
| try: |
| from transformers import Qwen2_5_VLVisionConfig |
| except ImportError: |
| from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig |
|
|
|
|
class HyperCLOVAXVisionV2MLP(nn.Module):
    """MLP projector for vision features (standard or inverted-bottleneck).

    For ``ProjectorType.MLP`` the hidden width is ``hidden_features``; for
    ``ProjectorType.INVERTED_MLP`` it is doubled (``2 * hidden_features``).
    Any other projector type raises ``NotImplementedError``.
    """

    def __init__(
        self,
        vision_projector_type: str,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Type[nn.Module] = nn.GELU,
    ) -> None:
        super().__init__()
        if out_features is None:
            out_features = in_features
        if hidden_features is None:
            hidden_features = in_features
        self.vision_projector_type = vision_projector_type

        # The two supported variants differ only in the hidden width.
        if vision_projector_type == ProjectorType.MLP:
            width = hidden_features
        elif vision_projector_type == ProjectorType.INVERTED_MLP:
            width = 2 * hidden_features
        else:
            raise NotImplementedError(f"{vision_projector_type} is not implemented")

        self.fc1 = nn.Linear(in_features, width)
        self.act = act_layer()
        self.fc2 = nn.Linear(width, out_features)

    def forward(
        self,
        x: torch.Tensor,
    ) -> torch.Tensor:
        """Apply fc1 -> activation -> fc2."""
        return self.fc2(self.act(self.fc1(x)))
|
|
|
|
class HyperCLOVAXVisionV2CAbstractor(nn.Module):
    """C-Abstractor: convolutional visual abstractor with adaptive pooling.

    Adapted from the C-Abstractor in Honeybee.

    Encodes a flattened patch sequence ``(B, L, encoder_hidden_size)`` through
    two RegNet stages separated by adaptive average pooling, then projects to
    the LLM hidden size via a small MLP readout.

    Args:
        num_queries: Number of output visual tokens (must be a perfect square).
        num_input_tokens: Number of input patch tokens (used for positional embedding).
        encoder_hidden_size: Hidden size of the vision encoder output.
        hidden_size: Internal channel size of the RegNet stages.
        output_hidden_size: Output size (= LLM hidden size).
        pos_emb: If ``True``, add a learnable positional embedding to the input.
        prenorm: If ``True``, apply LayerNorm before the convolutional stages.
        depth: Number of blocks inside each RegNet stage.
        mlp_depth: Number of Linear layers in the readout MLP.
    """

    def __init__(
        self,
        num_queries: int,
        num_input_tokens: int,
        encoder_hidden_size: int,
        hidden_size: int,
        output_hidden_size: int,
        pos_emb: bool = True,
        prenorm: bool = False,
        depth: int = 3,
        mlp_depth: int = 2,
    ):
        super().__init__()
        # Output tokens are laid out on an hw x hw grid, so num_queries must be square.
        if not (num_queries ** 0.5).is_integer():
            raise ValueError(f"num_queries must be a perfect square, got {num_queries}")
        hw = int(num_queries ** 0.5)

        self.num_input_tokens = num_input_tokens
        self.output_hidden_size = output_hidden_size

        self.pos_emb: Optional[nn.Parameter]
        if pos_emb:
            # Learnable positional embedding, normal init with std=0.02.
            self.pos_emb = nn.Parameter(torch.zeros(1, num_input_tokens, encoder_hidden_size))
            self.pos_emb.data.normal_(mean=0.0, std=0.02)
        else:
            self.pos_emb = None

        # Optional LayerNorm applied to patch features before the conv stages.
        self.prenorm = LayerNorm(encoder_hidden_size) if prenorm else None

        # Two RegNet stages around an adaptive pooling step that downsamples the
        # spatial grid to hw x hw (i.e. exactly num_queries output tokens).
        RegBlock = partial(RegStage, stride=1, dilation=1, act_layer=nn.SiLU, norm_layer=LayerNorm2d)
        self.net = nn.Sequential(
            RegBlock(depth, encoder_hidden_size, hidden_size),
            nn.AdaptiveAvgPool2d((hw, hw)),
            RegBlock(depth, hidden_size, hidden_size),
        )

        # Readout MLP: Linear, then (SiLU + Linear) repeated mlp_depth - 1 times.
        layers = [nn.Linear(hidden_size, output_hidden_size)]
        for _ in range(1, mlp_depth):
            layers.append(nn.SiLU())
            layers.append(nn.Linear(output_hidden_size, output_hidden_size))
        self.readout = nn.Sequential(*layers)

    def forward(
        self,
        x: torch.Tensor,
        num_queries_vis_abstractors: Optional[List[int]] = None,
        num_grids: Optional[List[int]] = None,
    ) -> Union[torch.Tensor, List[torch.Tensor]]:
        """
        Args:
            x: ``(B, L, encoder_hidden_size)`` patch features from the vision backbone.
            num_queries_vis_abstractors: Per-image query counts for adaptive pooling.
                If ``None``, uses the fixed grid size from ``__init__``.
            num_grids: Cumulative grid-boundary indices corresponding to
                ``num_queries_vis_abstractors``. Required when the above is set.

        Returns:
            ``(B, num_queries, output_hidden_size)`` tensor when using the fixed
            grid (``num_queries_vis_abstractors`` is ``None``), or a list of
            per-image tensors when using adaptive pooling.
        """
        if self.prenorm is not None:
            x = self.prenorm(x)
        if self.pos_emb is not None:
            x = x + self.pos_emb

        # Restore the square spatial layout expected by the conv stages.
        # NOTE(review): assumes L is a perfect square (hw * hw) — confirm the
        # vision encoder always emits square grids here.
        hw = int(x.size(1) ** 0.5)
        x = rearrange(x, "b (h w) d -> b d h w", h=hw, w=hw)

        if num_queries_vis_abstractors is not None:
            assert num_grids is not None
            return self._forward_adaptive(x, num_queries_vis_abstractors, num_grids)

        x = self.net(x)
        x = rearrange(x, "b d h w -> b (h w) d")
        return self.readout(x)

    def _forward_adaptive(
        self,
        x: torch.Tensor,
        num_queries_vis_abstractors: List[int],
        num_grids: List[int],
    ) -> List[torch.Tensor]:
        """Adaptive-query forward: replaces the fixed sampler with per-image pooling."""
        # self.net must be [RegStage, AdaptiveAvgPool2d, RegStage]; the fixed
        # pooling stage (net[1]) is replaced by per-image pools below.
        assert len(self.net) == 3
        x = self.net[0](x)

        outputs = []
        for i, num_queries in enumerate(num_queries_vis_abstractors):
            hw = int(num_queries ** 0.5)
            # Slice this image's grids along the batch dim and pool to hw x hw.
            out = nn.AdaptiveAvgPool2d((hw, hw))(x[num_grids[i]: num_grids[i + 1], :])
            out = self.net[2](out)
            out = rearrange(out, "b d h w -> b (h w) d")
            outputs.append(self.readout(out))
        return outputs
|
|
|
|
class HyperCLOVAXVisionV2RMSNorm(nn.Module):
    """RMS normalisation layer used inside HyperCLOVAXVisionV2PatchMerger.

    Normalises by the root-mean-square of the last dimension (computed in
    float32 for stability) and applies a learnable per-channel gain.
    """

    def __init__(
        self,
        hidden_size: int,
        eps: float = 1e-6,
    ) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(
        self,
        hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Return ``weight * x / rms(x)``, preserving the input dtype."""
        orig_dtype = hidden_states.dtype
        upcast = hidden_states.float()
        mean_square = upcast.pow(2).mean(-1, keepdim=True)
        normed = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normed.to(orig_dtype)

    def extra_repr(self) -> str:
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
|
|
|
|
class HyperCLOVAXVisionV2PatchMerger(nn.Module):
    """Patch-merger projector that maps vision tokens to LLM embedding space.

    Adapted from the PatchMerger in Qwen2.5-VL.

    Accepts a tuple ``(hidden_states, window_index)`` from the vision encoder
    (the encoder's built-in merger is bypassed), applies RMSNorm + MLP over the
    spatially-merged window, then restores the original token order.

    Args:
        dim: Output hidden size (= LLM hidden size).
        context_dim: Input hidden size (= vision encoder ``out_hidden_size``).
        spatial_merge_size: Spatial merge factor used in the vision encoder
            (default 2, matching Qwen2.5-VL defaults).
    """

    def __init__(
        self,
        dim: int,
        context_dim: int,
        spatial_merge_size: int = 2,
    ) -> None:
        super().__init__()
        merged_patches = spatial_merge_size ** 2
        self.hidden_size = context_dim * merged_patches
        self.ln_q = HyperCLOVAXVisionV2RMSNorm(context_dim, eps=1e-6)
        self.mlp = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.GELU(),
            nn.Linear(self.hidden_size, dim),
        )

    def forward(
        self,
        inputs: Tuple[torch.Tensor, torch.Tensor],
    ) -> torch.Tensor:
        """
        Args:
            inputs: Tuple of ``(hidden_states, window_index)`` produced by the
                monkey-patched Qwen vision encoder forward.
        Returns:
            Tensor of shape ``(total_tokens, dim)`` in the original token order.
        """
        hidden_states, window_index = inputs

        # Normalise, flatten each merged window into one vector, then project.
        # With fp16 weights the projection runs under a float32 autocast region
        # for numerical stability.
        if self.mlp[0].weight.dtype == torch.float16:
            with torch.amp.autocast(device_type=hidden_states.device.type, dtype=torch.float32):
                projected = self.mlp(self.ln_q(hidden_states).view(-1, self.hidden_size))
        else:
            projected = self.mlp(self.ln_q(hidden_states).view(-1, self.hidden_size))

        # Undo the window reordering applied by the vision encoder.
        restore_order = torch.argsort(window_index)
        return projected[restore_order, :]
|
|
|
|
class HyperCLOVAXVisionV2PreTrainedModel(PreTrainedModel):
    """Base class for all HyperCLOVAX-Vision-V2 models.

    Declares the shared config class, device-placement / attention capability
    flags consumed by ``transformers``, and Honeybee-style weight init.
    """

    config_class = HyperCLOVAXVisionV2Config
    base_model_prefix = "model"
    # Module classes that must not be split across devices by accelerate.
    _no_split_modules = ["HyperCLOVAXSeedVisionBlock", "Qwen2DecoderLayer", "LlamaDecoderLayer"]
    supports_gradient_checkpointing = True
    # Cache tensors follow their own device placement, not the module's.
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True

    def _init_weights(
        self,
        module: nn.Module,
    ) -> None:
        """Initialize weights following Honeybee conventions.

        Conv/Embedding/Linear weights are drawn from N(0, 0.02) and biases are
        zeroed; LayerNorm is reset to identity (weight=1, bias=0).
        """
        if isinstance(module, (nn.Conv2d, nn.Conv3d, nn.Embedding, nn.Linear)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if hasattr(module, "bias") and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            # NOTE(review): assumes affine LayerNorms only — a LayerNorm built
            # with elementwise_affine=False has weight/bias set to None.
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
|
|
|
|
class HyperCLOVAXVisionV2Model(HyperCLOVAXVisionV2PreTrainedModel):
    """Backbone model: vision encoder + multimodal projector + LLM base (no LM head).

    Builds, from ``config``: the vision tower (``vision_model``), a projector
    (``mm_projector``) selected by ``config.vision_projector_type``, the causal
    LM (``language_model``), and optionally an audio tower + projector when
    ``config.audio_config`` is a ``PretrainedConfig``.
    """

    def __init__(
        self,
        config: HyperCLOVAXVisionV2Config,
    ) -> None:
        super().__init__(config)

        # --- vision encoder ---
        vision_config = config.vision_config
        # Propagate top-level multimodal settings down to the vision config.
        vision_config.anyres = config.anyres
        vision_config.max_num_grids = config.max_num_grids
        vision_config.torch_dtype = getattr(config, "dtype", None) or getattr(config, "torch_dtype", None)
        self.vision_config = vision_config

        if config.anyres:
            if not getattr(config, "possible_resolutions", []):
                assert config.max_num_grids > 0
                # Enumerate all (rows, cols) grids with rows*cols <= max_num_grids;
                # the 1x1 grid is included only when config.use_1x1_grid is set.
                possible_resolutions = [
                    [ys * vision_config.image_size, xs * vision_config.image_size]
                    for i in range(1, config.max_num_grids + 1)
                    for j in range(1, config.max_num_grids + 1)
                    for ys, xs in ([(i, j)] if (i != 1 or j != 1 or config.use_1x1_grid) and i * j <= config.max_num_grids else [])
                ]
                self.config.possible_resolutions = possible_resolutions
            else:
                self.config.possible_resolutions = config.possible_resolutions

        # Qwen2.5-VL vision configs manage their own attention implementation.
        if vision_config.model_type != Qwen2_5_VLVisionConfig.model_type:
            vision_config._attn_implementation = config._attn_implementation
        if not vision_config.name_or_path:
            vision_config._name_or_path = config._name_or_path
        self.vision_model = AutoModel.from_config(
            vision_config,
            trust_remote_code=True,
            attn_implementation=config._attn_implementation,
        )

        # --- language model ---
        text_config = config.text_config
        text_config.torch_dtype = getattr(config, "dtype", None) or getattr(config, "torch_dtype", None)
        if text_config.model_type in ["llama", "hyperclovax", "gpt2"]:
            text_config._attn_implementation = config._attn_implementation
        if text_config.model_type != "hyperclovax":
            # Non-HyperCLOVAX LLMs have no logit scaling; use the identity factor.
            text_config.logits_scaling = 1.0
        # Prefer the padded vocab size when the config provides one (Megatron-style).
        text_config.vocab_size = (
            text_config.padded_vocab_size if hasattr(text_config, "padded_vocab_size") else text_config.vocab_size
        )

        self.language_model = AutoModelForCausalLM.from_config(text_config, trust_remote_code=True)

        self.text_config = text_config
        self.num_queries_vis_abstractor = config.num_queries_vis_abstractor

        # --- multimodal projector ---
        # Projector input width: Qwen2.5-VL encoders expose out_hidden_size.
        input_hidden_size = vision_config.hidden_size
        if vision_config.model_type == Qwen2_5_VLVisionConfig.model_type:
            input_hidden_size = vision_config.out_hidden_size

        if config.vision_projector_type == ProjectorType.LINEAR:
            self.mm_projector = nn.Linear(input_hidden_size, text_config.hidden_size)

        elif config.vision_projector_type == ProjectorType.CABSTRACTOR:
            self.mm_projector = HyperCLOVAXVisionV2CAbstractor(
                num_queries=self.num_queries_vis_abstractor,
                num_input_tokens=(vision_config.image_size // vision_config.patch_size) ** 2,
                encoder_hidden_size=input_hidden_size,
                hidden_size=input_hidden_size,
                output_hidden_size=text_config.hidden_size,
                pos_emb=config.proj_pos_emb,
                prenorm=config.proj_prenorm,
            )
            # NOTE(review): Tensor.to is not in-place here and its result is
            # discarded; the whole projector is cast below anyway — confirm
            # this line is intentional (and that pos_emb is not None when
            # proj_pos_emb is False).
            self.mm_projector.pos_emb.to(config.torch_dtype)

        elif config.vision_projector_type == ProjectorType.PATCH_MERGER:
            self.mm_projector = HyperCLOVAXVisionV2PatchMerger(
                dim=text_config.hidden_size,
                context_dim=input_hidden_size,
            )

        else:
            # Remaining projector types are MLP variants.
            self.mm_projector = HyperCLOVAXVisionV2MLP(
                config.vision_projector_type,
                input_hidden_size,
                hidden_features=input_hidden_size,
                out_features=text_config.hidden_size,
            )

        self.mm_projector.to(config.torch_dtype)

        self.vision_feature_layer = config.vision_feature_layer
        self.anyres = config.anyres

        if self.anyres:
            # Separator embedding appended after each image row in anyres layouts.
            self.image_newline = nn.Parameter(torch.empty(text_config.hidden_size, dtype=self.dtype))

        # --- optional audio encoder ---
        self.audio_model = None
        self.audio_projector = None

        if isinstance(getattr(config, "audio_config", None), PretrainedConfig):
            audio_config = config.audio_config
            audio_config.torch_dtype = getattr(config, "torch_dtype", None)
            if not audio_config.name_or_path:
                audio_config._name_or_path = config._name_or_path
            self.audio_model = AutoModel.from_config(
                audio_config,
                trust_remote_code=True,
                attn_implementation=config._attn_implementation,
            )

            if config.audio_projector_type == ProjectorType.LINEAR:
                self.audio_projector = nn.Linear(
                    in_features=audio_config.d_model,
                    out_features=text_config.hidden_size,
                )
            else:
                self.audio_projector = HyperCLOVAXVisionV2MLP(
                    config.audio_projector_type,
                    audio_config.d_model,
                    hidden_features=audio_config.d_model,
                    out_features=text_config.hidden_size,
                )
            self.audio_projector.to(self.audio_model.dtype)

    def process_audio_input(
        self,
        audio_values: torch.Tensor,
        audio_attention_mask: torch.Tensor,
    ) -> List[torch.Tensor]:
        """Encode audio chunks into LLM embedding space.

        Args:
            audio_values: ``(total_chunks, 128, 3000)`` mel spectrogram tensor.
            audio_attention_mask: ``(total_chunks, 3000)`` attention mask.

        Returns:
            List containing one tensor of shape ``(total_chunks * T, hidden_size)``.
        """
        emb = self.audio_model(
            audio_values,
            attention_mask=audio_attention_mask,
        ).last_hidden_state
        # Collapse (chunks, T, d) -> (chunks * T, d) before projection.
        emb = emb.flatten(0, 1)
        emb = self.audio_projector(emb)
        return [emb]

    # --- embedding / decoder accessors delegate to the inner language model ---

    def get_input_embeddings(self) -> nn.Embedding:
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(
        self,
        value: nn.Embedding,
    ) -> None:
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Linear:
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(
        self,
        new_embeddings: nn.Linear,
    ) -> None:
        self.language_model.set_output_embeddings(new_embeddings)

    def get_decoder(self) -> nn.Module:
        return self.language_model.get_decoder()

    def set_decoder(
        self,
        decoder: nn.Module,
    ) -> None:
        self.language_model.set_decoder(decoder)

    def tie_weights(
        self,
        **kwargs,
    ) -> None:
        """Tie input/output embeddings via the language model.

        Before delegating, if tying is enabled and the two embedding matrices
        live on different devices (multi-device sharding), move the output
        embedding onto the input embedding's device so tying can share storage.
        """
        if getattr(self.config.text_config, "tie_word_embeddings", False):
            input_embeddings = self.language_model.get_input_embeddings()
            output_embeddings = self.language_model.get_output_embeddings()
            if (
                input_embeddings is not None
                and output_embeddings is not None
                and input_embeddings.weight.device != output_embeddings.weight.device
            ):
                output_embeddings.weight = nn.Parameter(output_embeddings.weight.to(input_embeddings.weight.device))
        return self.language_model.tie_weights(**kwargs)

    def resize_token_embeddings(
        self,
        new_num_tokens: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
    ) -> nn.Embedding:
        """Resize the LLM vocabulary and keep text_config.vocab_size in sync."""
        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
        self.config.text_config.vocab_size = model_embeds.num_embeddings
        return model_embeds

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # audio inputs
        audio_values: Optional[torch.FloatTensor] = None,
        audio_attention_mask: Optional[torch.FloatTensor] = None,
        audio_masks: Optional[List[torch.Tensor]] = None,
        num_audio_tokens: Optional[torch.LongTensor] = None,
        # image inputs
        image_grid_thw: Optional[torch.LongTensor] = None,
        num_image_tokens: Optional[torch.LongTensor] = None,
        # video inputs (frames + optional video audio track)
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        num_video_tokens: Optional[torch.LongTensor] = None,
        video_audio_values: Optional[torch.FloatTensor] = None,
        video_audio_attention_mask: Optional[torch.FloatTensor] = None,
        video_audio_masks: Optional[List[torch.Tensor]] = None,
        num_video_audio_tokens: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """
        Fuse multimodal inputs into token embeddings and run the language model backbone.

        Image, video, and audio tokens identified by their respective token IDs in
        ``input_ids`` are replaced with the corresponding encoder+projector outputs
        before being passed to the language model.

        Returns:
            ``BaseModelOutputWithPast`` (or tuple when ``return_dict=False``).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if audio_values is not None:
            raise ValueError(
                "Standalone audio input (`audio_values`) is not supported by this model. "
                "Audio is only supported as part of video input (`video_audio_values`)."
            )

        if inputs_embeds is None:
            # Look up token embeddings manually via F.embedding so input_ids can
            # be moved to the embedding weight's device first (multi-device safe).
            embed_module = self.get_input_embeddings()
            inputs_embeds = F.embedding(
                input_ids.to(embed_module.weight.device),
                embed_module.weight,
                embed_module.padding_idx,
            )

        if pixel_values is not None:
            image_features = self.process_image_input(
                pixel_values=pixel_values,
                image_grid_thw=image_grid_thw,
            )
            # Scatter projected image features into the image-token slots.
            # NOTE(review): assumes the number of image-token positions equals
            # the number of projected feature rows — confirm with the processor.
            positions = input_ids.eq(self.config.image_token_id).nonzero(as_tuple=False)
            inputs_embeds[positions[:, 0], positions[:, 1]] = (
                torch.cat(image_features).to(device=inputs_embeds.device, dtype=inputs_embeds.dtype)
            )

        if pixel_values_videos is not None:
            video_features = self.process_video_input(
                pixel_values_videos=pixel_values_videos,
                video_grid_thw=video_grid_thw,
            )
            positions = input_ids.eq(self.config.video_token_id).nonzero(as_tuple=False)
            inputs_embeds[positions[:, 0], positions[:, 1]] = (
                torch.cat(video_features).to(device=inputs_embeds.device, dtype=inputs_embeds.dtype)
            )

        if video_audio_values is not None and self.audio_model is not None:
            # Video audio is only fused when the config defines a dedicated token id.
            video_audio_token_id = getattr(self.config, "video_audio_token_id", None)
            if video_audio_token_id is not None:
                video_audio_features = self.process_audio_input(
                    audio_values=video_audio_values,
                    audio_attention_mask=video_audio_attention_mask,
                )
                positions = input_ids.eq(video_audio_token_id).nonzero(as_tuple=False)
                inputs_embeds[positions[:, 0], positions[:, 1]] = (
                    torch.cat(video_audio_features).to(device=inputs_embeds.device, dtype=inputs_embeds.dtype)
                )

        # Embeddings are fully fused; pass only inputs_embeds downstream.
        input_ids = None

        return self.language_model.base_model(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

    def process_image_input(
        self,
        pixel_values: torch.FloatTensor,
        image_grid_thw: Optional[torch.LongTensor] = None,
    ) -> List[torch.Tensor]:
        """Encode image pixel values into LLM-space feature tensors.

        Args:
            pixel_values: Flat tensor of shape ``(total_patches, channels * patch_size * patch_size)``.
            image_grid_thw: Grid shape ``(num_images, 3)`` with (T, H, W) per image.

        Returns:
            List containing one tensor of shape ``(total_image_tokens, hidden_size)``.
        """
        features = self.vision_model(pixel_values, grid_thw=image_grid_thw)
        features = self.mm_projector(features)
        return [features]

    def process_video_input(
        self,
        pixel_values_videos: torch.FloatTensor,
        video_grid_thw: Optional[torch.LongTensor] = None,
    ) -> List[torch.Tensor]:
        """Encode video pixel values into LLM-space feature tensors.

        Args:
            pixel_values_videos: Flat tensor of shape ``(total_patches, channels * patch_size * patch_size)``.
            video_grid_thw: Grid shape ``(num_videos, 3)`` with (T, H, W) per video.

        Returns:
            List containing one tensor of shape ``(total_video_tokens, hidden_size)``.
        """
        features = self.vision_model(pixel_values_videos, grid_thw=video_grid_thw)
        features = self.mm_projector(features)
        return [features]
|
|
|
|
class HyperCLOVAXVisionV2ForCausalLM(HyperCLOVAXVisionV2PreTrainedModel, GenerationMixin):
    """HyperCLOVAX-Vision-V2 model with a causal language modelling head.

    Wraps :class:`HyperCLOVAXVisionV2Model` and reuses the inner language
    model's ``lm_head`` for logits (no separate head is created here).
    """

    def __init__(
        self,
        config: HyperCLOVAXVisionV2Config,
    ) -> None:
        super().__init__(config)
        self.model = HyperCLOVAXVisionV2Model(config)
        self.post_init()

    # --- thin delegation to the backbone model ---

    def get_input_embeddings(self) -> nn.Embedding:
        return self.model.get_input_embeddings()

    def set_input_embeddings(
        self,
        value: nn.Embedding,
    ) -> None:
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Linear:
        return self.model.get_output_embeddings()

    def set_output_embeddings(
        self,
        new_embeddings: nn.Linear,
    ) -> None:
        self.model.set_output_embeddings(new_embeddings)

    def get_decoder(self) -> nn.Module:
        return self.model.get_decoder()

    def set_decoder(
        self,
        decoder: nn.Module,
    ) -> None:
        self.model.set_decoder(decoder)

    def tie_weights(
        self,
        **kwargs,
    ) -> None:
        return self.model.tie_weights(**kwargs)

    def resize_token_embeddings(
        self,
        new_num_tokens: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
    ) -> nn.Embedding:
        return self.model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)

    # --- read-only views onto the backbone's submodules ---

    @property
    def language_model(self) -> nn.Module:
        return self.model.language_model

    @property
    def vision_model(self) -> nn.Module:
        return self.model.vision_model

    @property
    def mm_projector(self) -> nn.Module:
        return self.model.mm_projector

    @property
    def audio_model(self) -> Optional[nn.Module]:
        return self.model.audio_model

    @property
    def audio_projector(self) -> Optional[nn.Module]:
        return self.model.audio_projector

    @property
    def vision_model_type(self) -> str:
        return self.model.vision_config.model_type

    @property
    def anyres(self) -> bool:
        return self.model.anyres

    @property
    def image_newline(self) -> Optional[nn.Parameter]:
        # NOTE(review): the backbone only defines image_newline when anyres is
        # enabled — accessing this property otherwise raises AttributeError.
        return self.model.image_newline

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # audio inputs
        audio_values: Optional[torch.FloatTensor] = None,
        audio_attention_mask: Optional[torch.FloatTensor] = None,
        audio_masks: Optional[List[torch.Tensor]] = None,
        num_audio_tokens: Optional[torch.LongTensor] = None,
        # image inputs
        image_grid_thw: Optional[torch.LongTensor] = None,
        num_image_tokens: Optional[torch.LongTensor] = None,
        # video inputs (frames + optional video audio track)
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        num_video_tokens: Optional[torch.LongTensor] = None,
        video_audio_values: Optional[torch.FloatTensor] = None,
        video_audio_attention_mask: Optional[torch.FloatTensor] = None,
        video_audio_masks: Optional[List[torch.Tensor]] = None,
        num_video_audio_tokens: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """
        Multimodal causal language model forward pass.

        Calls the backbone model to fuse multimodal inputs, then computes logits
        via the LM head. Loss is computed against ``labels`` when provided.

        Returns:
            ``CausalLMOutputWithPast`` (or tuple when ``return_dict=False``).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model.forward(
            input_ids=input_ids,
            pixel_values=pixel_values,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            token_type_ids=token_type_ids,
            use_cache=use_cache,
            cache_position=cache_position,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            audio_values=audio_values,
            audio_attention_mask=audio_attention_mask,
            image_grid_thw=image_grid_thw,
            num_image_tokens=num_image_tokens,
            pixel_values_videos=pixel_values_videos,
            video_grid_thw=video_grid_thw,
            num_video_tokens=num_video_tokens,
            video_audio_values=video_audio_values,
            video_audio_attention_mask=video_audio_attention_mask,
            video_audio_masks=video_audio_masks,
            num_video_audio_tokens=num_video_audio_tokens,
        )
        hidden_states = outputs[0]
        # Compute logits only for the last `logits_to_keep` positions; 0 keeps
        # everything because slice(-0, None) == slice(0, None).
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.model.language_model.lm_head(hidden_states[:, slice_indices, :]) * getattr(
            self.config.text_config, "logits_scaling", 1.0
        )

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs)

        # NOTE(review): a dataclass output is always returned and the attribute
        # accesses below assume `outputs` is not a plain tuple — confirm callers
        # never pass return_dict=False.
        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        image_grid_thw: Optional[torch.LongTensor] = None,
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        audio_values: Optional[torch.FloatTensor] = None,
        audio_attention_mask: Optional[torch.FloatTensor] = None,
        video_audio_values: Optional[torch.FloatTensor] = None,
        video_audio_attention_mask: Optional[torch.FloatTensor] = None,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Build per-step generation inputs, forwarding multimodal tensors only on prefill."""
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            **kwargs,
        )

        # Multimodal features are fused into the prompt embeddings during the
        # prefill step; later decode steps only consume the KV cache, so the
        # raw pixel/audio tensors must not be re-sent.
        is_prefill = past_key_values is None or past_key_values.get_seq_length() == 0
        if is_prefill:
            model_inputs["pixel_values"] = pixel_values
            model_inputs["image_grid_thw"] = image_grid_thw
            model_inputs["pixel_values_videos"] = pixel_values_videos
            model_inputs["video_grid_thw"] = video_grid_thw
            model_inputs["audio_values"] = audio_values
            model_inputs["audio_attention_mask"] = audio_attention_mask
            model_inputs["video_audio_values"] = video_audio_values
            model_inputs["video_audio_attention_mask"] = video_audio_attention_mask

        return model_inputs
|
|
|
|
class HyperCLOVAXVisionV2ForSequenceClassification(HyperCLOVAXVisionV2PreTrainedModel):
    """HyperCLOVAX-Vision-V2 model with a sequence classification head.

    Pools the hidden state of the last non-padding token and projects it to
    ``num_labels`` classes with a bias-free linear head (``self.score``).
    """

    def __init__(
        self,
        config: HyperCLOVAXVisionV2Config,
    ) -> None:
        super().__init__(config)
        self.num_labels = getattr(config, "num_labels", 2)
        self.model = HyperCLOVAXVisionV2Model(config)
        # NOTE(review): assumes the top-level config exposes hidden_size
        # mirroring text_config.hidden_size — confirm in the config class.
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
        self.post_init()

    def get_input_embeddings(self) -> nn.Embedding:
        return self.model.get_input_embeddings()

    def set_input_embeddings(
        self,
        value: nn.Embedding,
    ) -> None:
        self.model.set_input_embeddings(value)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        # image inputs
        image_grid_thw: Optional[torch.LongTensor] = None,
        num_image_tokens: Optional[torch.LongTensor] = None,
        # video inputs
        pixel_values_videos: Optional[torch.FloatTensor] = None,
        video_grid_thw: Optional[torch.LongTensor] = None,
        num_video_tokens: Optional[torch.LongTensor] = None,
    ) -> SequenceClassifierOutputWithPast:
        """
        Sequence classification forward pass.

        Extracts the last non-padding token's hidden state, projects it via
        ``self.score``, and computes loss against ``labels`` when provided.

        Returns:
            ``SequenceClassifierOutputWithPast``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            token_type_ids=token_type_ids,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            image_grid_thw=image_grid_thw,
            num_image_tokens=num_image_tokens,
            pixel_values_videos=pixel_values_videos,
            video_grid_thw=video_grid_thw,
            num_video_tokens=num_video_tokens,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
        # Without a pad token we cannot locate the last real token per row.
        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

        if self.config.pad_token_id is None or input_ids is None:
            # Fall back to the final position (valid for batch size 1 /
            # embeds-only input).
            last_non_pad_token = -1
        else:
            # argmax over (position_index * is_not_pad) yields the index of the
            # last non-padding token in each row (assumes right padding).
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
|
|
|
|
# Register the config and model classes with the Auto* factories so that
# AutoModel / AutoModelForCausalLM / AutoModelForSequenceClassification can
# resolve "hyperclovax_vision_v2" checkpoints.
AutoConfig.register("hyperclovax_vision_v2", HyperCLOVAXVisionV2Config)
AutoModel.register(HyperCLOVAXVisionV2Config, HyperCLOVAXVisionV2Model)
AutoModelForCausalLM.register(HyperCLOVAXVisionV2Config, HyperCLOVAXVisionV2ForCausalLM)
AutoModelForSequenceClassification.register(HyperCLOVAXVisionV2Config, HyperCLOVAXVisionV2ForSequenceClassification)

# Public API of this module.
__all__ = [
    "HyperCLOVAXVisionV2PreTrainedModel",
    "HyperCLOVAXVisionV2Model",
    "HyperCLOVAXVisionV2ForCausalLM",
    "HyperCLOVAXVisionV2ForSequenceClassification",
]
|
|