HyperCLOVAX-SEED-Think-4B / modeling_hyperclovax_seed_vision_v2.py

Upload folder using huggingface_hub

0c1d6f8 verified 9 days ago

42.1 kB

	# coding=utf-8
	# Copyright 2026 NAVER Cloud Corp. and the HuggingFace Inc. team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""HyperCLOVAX-Vision-V2 multimodal model.

	Integrates a vision encoder, vision projector, causal language model, and
	optionally an audio encoder. The published model uses:
	- Language model: HyperCLOVAX or Llama
	- Vision encoder: HyperCLOVAXSeedVisionEncoder + PatchMerger projector
	- Audio encoder: HyperCLOVAXSeedAudioEncoder + MLP projector

	Acknowledgements:
	- VLM integration pattern adapted from LLaVA
	(https://github.com/haotian-liu/LLaVA), Apache-2.0 License.
	- CAbstractor and weight initialization adapted from Honeybee
	(https://github.com/kakaobrain/honeybee), Apache-2.0 License.
	- PatchMerger projector adapted from Qwen2.5-VL
	(https://github.com/QwenLM/Qwen2.5-VL), Apache-2.0 License.
	"""

	from functools import partial
	from typing import Any, Dict, List, Optional, Tuple, Type, Union

	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	try:
	from einops import rearrange
	from timm.layers import LayerNorm, LayerNorm2d
	from timm.models.regnet import RegStage
	except ImportError:
	pass

	from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, PretrainedConfig
	from transformers.modeling_utils import PreTrainedModel
	from transformers.cache_utils import Cache
	from transformers.generation import GenerationMixin
	from transformers.modeling_outputs import (
	BaseModelOutputWithPast,
	CausalLMOutputWithPast,
	SequenceClassifierOutputWithPast,
	)

	from .configuration_hyperclovax_seed_vision_v2 import HyperCLOVAXVisionV2Config, ProjectorType
	from .configuration_hyperclovax_seed_vision_encoder import HyperCLOVAXSeedVisionEncoderConfig

	try:
	from transformers import Qwen2_5_VLVisionConfig
	except ImportError:
	from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig


	class HyperCLOVAXVisionV2MLP(nn.Module):
	    """Two-layer MLP projector (``fc1 -> activation -> fc2``).

	    Two variants are selected through ``vision_projector_type``:

	    * ``ProjectorType.MLP``: the hidden layer is ``hidden_features`` wide.
	    * ``ProjectorType.INVERTED_MLP``: the hidden layer is ``2 * hidden_features`` wide.

	    Args:
	        vision_projector_type: Projector variant (compared against ``ProjectorType``).
	        in_features: Size of the last dimension of the input.
	        hidden_features: Base hidden width; falls back to ``in_features`` when falsy.
	        out_features: Output width; falls back to ``in_features`` when falsy.
	        act_layer: Activation module class, instantiated without arguments.

	    Raises:
	        NotImplementedError: If ``vision_projector_type`` is neither variant above.
	    """

	    def __init__(
	        self,
	        vision_projector_type: str,
	        in_features: int,
	        hidden_features: Optional[int] = None,
	        out_features: Optional[int] = None,
	        act_layer: Type[nn.Module] = nn.GELU,
	    ) -> None:
	        super().__init__()
	        if not out_features:
	            out_features = in_features
	        if not hidden_features:
	            hidden_features = in_features
	        self.vision_projector_type = vision_projector_type

	        # The two variants differ only in the width of the hidden layer.
	        if vision_projector_type == ProjectorType.MLP:
	            inner_width = hidden_features
	        elif vision_projector_type == ProjectorType.INVERTED_MLP:
	            inner_width = 2 * hidden_features
	        else:
	            raise NotImplementedError(f"{vision_projector_type} is not implemented")

	        self.fc1 = nn.Linear(in_features, inner_width)
	        self.act = act_layer()
	        self.fc2 = nn.Linear(inner_width, out_features)

	    def forward(
	        self,
	        x: torch.Tensor,
	    ) -> torch.Tensor:
	        """Apply ``fc2(act(fc1(x)))`` to ``x``."""
	        return self.fc2(self.act(self.fc1(x)))


	class HyperCLOVAXVisionV2CAbstractor(nn.Module):
	    """C-Abstractor: convolutional visual abstractor with adaptive pooling.

	    Adapted from the C-Abstractor in Honeybee.

	    Encodes a flattened patch sequence ``(B, L, encoder_hidden_size)`` through
	    two RegNet stages separated by adaptive average pooling, then projects to
	    the LLM hidden size via a small MLP readout.

	    Args:
	        num_queries: Number of output visual tokens (must be a perfect square).
	        num_input_tokens: Number of input patch tokens (used for positional embedding).
	        encoder_hidden_size: Hidden size of the vision encoder output.
	        hidden_size: Internal channel size of the RegNet stages.
	        output_hidden_size: Output size (= LLM hidden size).
	        pos_emb: If ``True``, add a learnable positional embedding to the input.
	        prenorm: If ``True``, apply LayerNorm before the convolutional stages.
	        depth: Depth argument forwarded to each ``RegStage``.
	        mlp_depth: Number of ``nn.Linear`` layers in the readout MLP; consecutive
	            layers are separated by ``nn.SiLU``.

	    Raises:
	        ValueError: If ``num_queries`` is not a perfect square.
	    """

	    def __init__(
	        self,
	        num_queries: int,
	        num_input_tokens: int,
	        encoder_hidden_size: int,
	        hidden_size: int,
	        output_hidden_size: int,
	        pos_emb: bool = True,
	        prenorm: bool = False,
	        depth: int = 3,
	        mlp_depth: int = 2,
	    ):
	        super().__init__()
	        if not (num_queries ** 0.5).is_integer():
	            raise ValueError(f"num_queries must be a perfect square, got {num_queries}")
	        hw = int(num_queries ** 0.5)

	        self.num_input_tokens = num_input_tokens
	        self.output_hidden_size = output_hidden_size

	        self.pos_emb: Optional[nn.Parameter]
	        if pos_emb:
	            self.pos_emb = nn.Parameter(torch.zeros(1, num_input_tokens, encoder_hidden_size))
	            self.pos_emb.data.normal_(mean=0.0, std=0.02)
	        else:
	            self.pos_emb = None

	        self.prenorm = LayerNorm(encoder_hidden_size) if prenorm else None

	        # net = (stage 1, fixed-size pooling, stage 2). The adaptive path in
	        # `_forward_adaptive` relies on exactly this three-element layout.
	        RegBlock = partial(RegStage, stride=1, dilation=1, act_layer=nn.SiLU, norm_layer=LayerNorm2d)
	        self.net = nn.Sequential(
	            RegBlock(depth, encoder_hidden_size, hidden_size),
	            nn.AdaptiveAvgPool2d((hw, hw)),
	            RegBlock(depth, hidden_size, hidden_size),
	        )

	        layers = [nn.Linear(hidden_size, output_hidden_size)]
	        for _ in range(1, mlp_depth):
	            layers.append(nn.SiLU())
	            layers.append(nn.Linear(output_hidden_size, output_hidden_size))
	        self.readout = nn.Sequential(*layers)

	    def forward(
	        self,
	        x: torch.Tensor,
	        num_queries_vis_abstractors: Optional[List[int]] = None,
	        num_grids: Optional[List[int]] = None,
	    ) -> Union[torch.Tensor, List[torch.Tensor]]:
	        """
	        Args:
	            x: ``(B, L, encoder_hidden_size)`` patch features from the vision backbone.
	                ``L`` must be a perfect square so it can be folded into a grid.
	            num_queries_vis_abstractors: Per-image query counts for adaptive pooling.
	                If ``None``, uses the fixed grid size from ``__init__``.
	            num_grids: Cumulative grid-boundary indices corresponding to
	                ``num_queries_vis_abstractors``. Required when the above is set.

	        Returns:
	            ``(B, num_queries, output_hidden_size)`` tensor when using the fixed
	            grid (``num_queries_vis_abstractors`` is ``None``), or a list of
	            per-image tensors when using adaptive pooling.

	        Raises:
	            ValueError: If ``num_queries_vis_abstractors`` is given without a
	                matching ``num_grids``.
	        """
	        if self.prenorm is not None:
	            x = self.prenorm(x)
	        if self.pos_emb is not None:
	            x = x + self.pos_emb

	        # Reshape flat patch sequence to spatial grid: [B, L, d] -> [B, d, h, w]
	        hw = int(x.size(1) ** 0.5)
	        x = rearrange(x, "b (h w) d -> b d h w", h=hw, w=hw)

	        if num_queries_vis_abstractors is not None:
	            # Explicit check instead of `assert`, which is stripped under `python -O`.
	            if num_grids is None:
	                raise ValueError("num_grids is required when num_queries_vis_abstractors is given")
	            return self._forward_adaptive(x, num_queries_vis_abstractors, num_grids)

	        x = self.net(x)
	        x = rearrange(x, "b d h w -> b (h w) d")
	        return self.readout(x)

	    def _forward_adaptive(
	        self,
	        x: torch.Tensor,
	        num_queries_vis_abstractors: List[int],
	        num_grids: List[int],
	    ) -> List[torch.Tensor]:
	        """Adaptive-query forward: replaces the fixed sampler with per-image pooling.

	        Image ``i`` owns rows ``num_grids[i]:num_grids[i + 1]`` of ``x`` and is pooled
	        to ``int(sqrt(num_queries_vis_abstractors[i]))`` cells per side.
	        """
	        if len(num_grids) < len(num_queries_vis_abstractors) + 1:
	            raise ValueError(
	                "num_grids must contain one more entry than num_queries_vis_abstractors "
	                f"(got {len(num_grids)} and {len(num_queries_vis_abstractors)})"
	            )

	        # self.net = (s1, fixed_sampler, s2); the fixed sampler is skipped here.
	        # Unpacking also enforces the three-element layout.
	        first_stage, _, second_stage = self.net
	        x = first_stage(x)

	        outputs = []
	        for i, num_queries in enumerate(num_queries_vis_abstractors):
	            side = int(num_queries ** 0.5)
	            grids = x[num_grids[i]: num_grids[i + 1], :]
	            # Functional pooling avoids building a throwaway module per image.
	            pooled = F.adaptive_avg_pool2d(grids, (side, side))
	            pooled = second_stage(pooled)
	            pooled = rearrange(pooled, "b d h w -> b (h w) d")
	            outputs.append(self.readout(pooled))
	        return outputs


	class HyperCLOVAXVisionV2RMSNorm(nn.Module):
	    """Root-mean-square normalisation with a learnable per-channel scale.

	    Used inside HyperCLOVAXVisionV2PatchMerger. Statistics are computed in
	    float32; the normalised tensor is cast back to the input dtype before it
	    is multiplied by ``weight``.

	    Args:
	        hidden_size: Size of the last (normalised) dimension.
	        eps: Constant added to the mean square before the reciprocal square root.
	    """

	    def __init__(
	        self,
	        hidden_size: int,
	        eps: float = 1e-6,
	    ) -> None:
	        super().__init__()
	        self.variance_epsilon = eps
	        self.weight = nn.Parameter(torch.ones(hidden_size))

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	    ) -> torch.Tensor:
	        """Normalise ``hidden_states`` over its last dimension and apply the scale."""
	        original_dtype = hidden_states.dtype
	        upcast = hidden_states.to(torch.float32)
	        mean_square = upcast.pow(2).mean(-1, keepdim=True)
	        normalised = upcast * torch.rsqrt(mean_square + self.variance_epsilon)
	        return self.weight * normalised.to(original_dtype)

	    def extra_repr(self) -> str:
	        """Show the weight shape and epsilon in the module repr."""
	        weight_shape = tuple(self.weight.shape)
	        return f"{weight_shape}, eps={self.variance_epsilon}"


	class HyperCLOVAXVisionV2PatchMerger(nn.Module):
	    """Patch-merger projector that maps vision tokens to LLM embedding space.

	    Adapted from the PatchMerger in Qwen2.5-VL.

	    Takes the ``(hidden_states, window_index)`` tuple emitted by the vision
	    encoder (whose own merger is bypassed), normalises each token with RMSNorm,
	    folds ``spatial_merge_size ** 2`` consecutive tokens into one row, runs a
	    two-layer GELU MLP, and finally reorders rows with ``argsort(window_index)``.

	    Args:
	        dim: Output hidden size (= LLM hidden size).
	        context_dim: Input hidden size (= vision encoder ``out_hidden_size``).
	        spatial_merge_size: Spatial merge factor used in the vision encoder
	            (default 2, matching Qwen2.5-VL defaults).
	    """

	    def __init__(
	        self,
	        dim: int,
	        context_dim: int,
	        spatial_merge_size: int = 2,
	    ) -> None:
	        super().__init__()
	        merged_width = context_dim * (spatial_merge_size ** 2)
	        self.hidden_size = merged_width
	        self.ln_q = HyperCLOVAXVisionV2RMSNorm(context_dim, eps=1e-6)
	        self.mlp = nn.Sequential(
	            nn.Linear(merged_width, merged_width),
	            nn.GELU(),
	            nn.Linear(merged_width, dim),
	        )

	    def _merge(self, tokens: torch.Tensor) -> torch.Tensor:
	        """Normalise ``tokens``, fold them to ``hidden_size``-wide rows, apply the MLP."""
	        folded = self.ln_q(tokens).view(-1, self.hidden_size)
	        return self.mlp(folded)

	    def forward(
	        self,
	        inputs: Tuple[torch.Tensor, torch.Tensor],
	    ) -> torch.Tensor:
	        """
	        Args:
	            inputs: Tuple of ``(hidden_states, window_index)`` produced by the
	                monkey-patched Qwen vision encoder forward.
	        Returns:
	            Tensor of shape ``(total_tokens, dim)`` in the original token order.
	        """
	        tokens, window_index = inputs
	        # fp16 models accumulate rounding error in the linear layers; the merge
	        # step is therefore wrapped in a float32 autocast region (matches vLLM
	        # behaviour). Other dtypes run the merge directly.
	        weights_are_fp16 = self.mlp[0].weight.dtype == torch.float16
	        if weights_are_fp16:
	            with torch.amp.autocast(device_type=tokens.device.type, dtype=torch.float32):
	                merged = self._merge(tokens)
	        else:
	            merged = self._merge(tokens)
	        restore_order = torch.argsort(window_index)
	        return merged[restore_order, :]


	class HyperCLOVAXVisionV2PreTrainedModel(PreTrainedModel):
	    """Base class for all HyperCLOVAX-Vision-V2 models.

	    Declares the config class, the capability flags read by ``PreTrainedModel``
	    (attention backends, cache types, gradient checkpointing) and the weight
	    initialisation scheme shared by every model in this file.
	    """

	    config_class = HyperCLOVAXVisionV2Config
	    base_model_prefix = "model"
	    _no_split_modules = ["HyperCLOVAXSeedVisionBlock", "Qwen2DecoderLayer", "LlamaDecoderLayer"]
	    supports_gradient_checkpointing = True
	    _skip_keys_device_placement = "past_key_values"
	    _supports_flash_attn_2 = True
	    _supports_sdpa = True
	    _supports_flex_attn = True
	    _supports_cache_class = True
	    _supports_quantized_cache = True
	    _supports_static_cache = True
	    _supports_attention_backend = True

	    def _init_weights(
	        self,
	        module: nn.Module,
	    ) -> None:
	        """Initialize weights following Honeybee conventions.

	        Conv2d / Conv3d / Embedding / Linear weights are drawn from
	        ``N(0, 0.02)`` and their biases zeroed; LayerNorm gets unit weight and
	        zero bias. Parameters that are ``None`` (for example a LayerNorm built
	        without affine parameters or without bias) are skipped.

	        Args:
	            module: The sub-module to initialise in place.
	        """
	        # https://github.com/kakaobrain/honeybee/blob/main/honeybee/common_layers.py#L55
	        if isinstance(module, (nn.Conv2d, nn.Conv3d, nn.Embedding, nn.Linear)):
	            module.weight.data.normal_(mean=0.0, std=0.02)
	            bias = getattr(module, "bias", None)
	            if bias is not None:
	                bias.data.zero_()
	        elif isinstance(module, nn.LayerNorm):
	            # Both parameters are optional on nn.LayerNorm; guard against None.
	            bias = getattr(module, "bias", None)
	            if bias is not None:
	                bias.data.zero_()
	            weight = getattr(module, "weight", None)
	            if weight is not None:
	                weight.data.fill_(1.0)


	class HyperCLOVAXVisionV2Model(HyperCLOVAXVisionV2PreTrainedModel):
	    """Backbone model: vision encoder + multimodal projector + LLM base (no LM head).

	    An optional audio encoder and audio projector are built when the config
	    carries an ``audio_config``. ``forward`` replaces placeholder tokens in the
	    text sequence with projected encoder features and runs the language-model
	    backbone on the fused embeddings.
	    """

	    def __init__(
	        self,
	        config: HyperCLOVAXVisionV2Config,
	    ) -> None:
	        super().__init__(config)

	        # Resolve the dtype once so every sub-module agrees; the config may expose
	        # it as `dtype` or as `torch_dtype`.
	        model_dtype = getattr(config, "dtype", None) or getattr(config, "torch_dtype", None)

	        # vision encoder
	        vision_config = config.vision_config
	        vision_config.anyres = config.anyres
	        vision_config.max_num_grids = config.max_num_grids
	        vision_config.torch_dtype = model_dtype
	        self.vision_config = vision_config

	        if config.anyres:
	            configured_resolutions = getattr(config, "possible_resolutions", [])
	            if configured_resolutions:
	                self.config.possible_resolutions = configured_resolutions
	            else:
	                self.config.possible_resolutions = self._enumerate_grid_resolutions(
	                    max_num_grids=config.max_num_grids,
	                    image_size=vision_config.image_size,
	                    use_1x1_grid=config.use_1x1_grid,
	                )

	        if vision_config.model_type != Qwen2_5_VLVisionConfig.model_type:
	            vision_config._attn_implementation = config._attn_implementation
	        if not vision_config.name_or_path:
	            vision_config._name_or_path = config._name_or_path
	        self.vision_model = AutoModel.from_config(
	            vision_config,
	            trust_remote_code=True,
	            attn_implementation=config._attn_implementation,
	        )

	        # language model
	        text_config = config.text_config
	        text_config.torch_dtype = model_dtype
	        if text_config.model_type in ["llama", "hyperclovax", "gpt2"]:
	            text_config._attn_implementation = config._attn_implementation
	        if text_config.model_type != "hyperclovax":
	            text_config.logits_scaling = 1.0
	        text_config.vocab_size = (
	            text_config.padded_vocab_size if hasattr(text_config, "padded_vocab_size") else text_config.vocab_size
	        )

	        self.language_model = AutoModelForCausalLM.from_config(text_config, trust_remote_code=True)

	        self.text_config = text_config
	        self.num_queries_vis_abstractor = config.num_queries_vis_abstractor

	        # vision projector (connector)
	        input_hidden_size = vision_config.hidden_size
	        if vision_config.model_type == Qwen2_5_VLVisionConfig.model_type:
	            input_hidden_size = vision_config.out_hidden_size

	        if config.vision_projector_type == ProjectorType.LINEAR:
	            self.mm_projector = nn.Linear(input_hidden_size, text_config.hidden_size)

	        elif config.vision_projector_type == ProjectorType.CABSTRACTOR:
	            self.mm_projector = HyperCLOVAXVisionV2CAbstractor(
	                num_queries=self.num_queries_vis_abstractor,
	                num_input_tokens=(vision_config.image_size // vision_config.patch_size) ** 2,
	                encoder_hidden_size=input_hidden_size,
	                hidden_size=input_hidden_size,
	                output_hidden_size=text_config.hidden_size,
	                pos_emb=config.proj_pos_emb,
	                prenorm=config.proj_prenorm,
	            )

	        elif config.vision_projector_type == ProjectorType.PATCH_MERGER:
	            # Custom patch-merger with HyperCLOVAX RMSNorm and fp16 autocast.
	            # Requires the Qwen vision encoder to be monkey-patched so it returns
	            # (hidden_states, window_index) instead of applying its built-in merger.
	            self.mm_projector = HyperCLOVAXVisionV2PatchMerger(
	                dim=text_config.hidden_size,
	                context_dim=input_hidden_size,
	            )

	        else:
	            self.mm_projector = HyperCLOVAXVisionV2MLP(
	                config.vision_projector_type,
	                input_hidden_size,
	                hidden_features=input_hidden_size,
	                out_features=text_config.hidden_size,
	            )

	        # Casting the module covers all of its parameters, including the
	        # C-Abstractor positional embedding when one exists.
	        if model_dtype is not None:
	            self.mm_projector.to(model_dtype)

	        self.vision_feature_layer = config.vision_feature_layer
	        self.anyres = config.anyres

	        if self.anyres:
	            # Allocated uninitialised (torch.empty).
	            self.image_newline = nn.Parameter(torch.empty(text_config.hidden_size, dtype=self.dtype))

	        # audio encoder
	        self.audio_model = None
	        self.audio_projector = None

	        if isinstance(getattr(config, "audio_config", None), PretrainedConfig):
	            audio_config = config.audio_config
	            audio_config.torch_dtype = model_dtype
	            if not audio_config.name_or_path:
	                audio_config._name_or_path = config._name_or_path
	            self.audio_model = AutoModel.from_config(
	                audio_config,
	                trust_remote_code=True,
	                attn_implementation=config._attn_implementation,
	            )

	            if config.audio_projector_type == ProjectorType.LINEAR:
	                self.audio_projector = nn.Linear(
	                    in_features=audio_config.d_model,
	                    out_features=text_config.hidden_size,
	                )
	            else:
	                self.audio_projector = HyperCLOVAXVisionV2MLP(
	                    config.audio_projector_type,
	                    audio_config.d_model,
	                    hidden_features=audio_config.d_model,
	                    out_features=text_config.hidden_size,
	                )
	            self.audio_projector.to(self.audio_model.dtype)

	    @staticmethod
	    def _enumerate_grid_resolutions(
	        max_num_grids: int,
	        image_size: int,
	        use_1x1_grid: bool,
	    ) -> List[List[int]]:
	        """List every ``[height, width]`` whose grid has at most ``max_num_grids`` cells.

	        Grids are enumerated row-major over ``(rows, cols)``; the ``1 x 1`` grid is
	        included only when ``use_1x1_grid`` is truthy.

	        Raises:
	            ValueError: If ``max_num_grids`` is not positive.
	        """
	        if max_num_grids <= 0:
	            raise ValueError(f"max_num_grids must be positive when anyres is enabled, got {max_num_grids}")
	        return [
	            [rows * image_size, cols * image_size]
	            for rows in range(1, max_num_grids + 1)
	            for cols in range(1, max_num_grids + 1)
	            if rows * cols <= max_num_grids and (use_1x1_grid or (rows, cols) != (1, 1))
	        ]

	    def process_audio_input(
	        self,
	        audio_values: torch.Tensor,
	        audio_attention_mask: torch.Tensor,
	    ) -> List[torch.Tensor]:
	        """Encode audio chunks into LLM embedding space.

	        Args:
	            audio_values: ``(total_chunks, 128, 3000)`` mel spectrogram tensor.
	            audio_attention_mask: ``(total_chunks, 3000)`` attention mask.

	        Returns:
	            List containing one tensor of shape ``(total_chunks * T, hidden_size)``.
	        """
	        emb = self.audio_model(
	            audio_values,
	            attention_mask=audio_attention_mask,
	        ).last_hidden_state  # (total_chunks, T, d_model)
	        emb = emb.flatten(0, 1)  # (total_chunks * T, d_model)
	        emb = self.audio_projector(emb)
	        return [emb]

	    def get_input_embeddings(self) -> nn.Embedding:
	        """Return the language model's input embedding module."""
	        return self.language_model.get_input_embeddings()

	    def set_input_embeddings(
	        self,
	        value: nn.Embedding,
	    ) -> None:
	        """Replace the language model's input embedding module."""
	        self.language_model.set_input_embeddings(value)

	    def get_output_embeddings(self) -> nn.Linear:
	        """Return the language model's output embedding (LM head) module."""
	        return self.language_model.get_output_embeddings()

	    def set_output_embeddings(
	        self,
	        new_embeddings: nn.Linear,
	    ) -> None:
	        """Replace the language model's output embedding (LM head) module."""
	        self.language_model.set_output_embeddings(new_embeddings)

	    def get_decoder(self) -> nn.Module:
	        """Return the language model's decoder."""
	        return self.language_model.get_decoder()

	    def set_decoder(
	        self,
	        decoder: nn.Module,
	    ) -> None:
	        """Replace the language model's decoder."""
	        self.language_model.set_decoder(decoder)

	    def tie_weights(
	        self,
	        **kwargs,
	    ) -> None:
	        """Tie input and output embeddings via the language model.

	        When the text config requests tied embeddings and the two weights live
	        on different devices, the output weight is first moved next to the
	        input weight.
	        """
	        # Under device_map="auto", embed_tokens and lm_head may land on different
	        # CUDA devices. The new transformers tie_weights() calls torch.equal() on
	        # both tensors before deciding whether to tie them, which raises RuntimeError
	        # when the tensors are on different devices. Move lm_head.weight to the
	        # same device as embed_tokens.weight beforehand so the comparison succeeds.
	        if getattr(self.config.text_config, "tie_word_embeddings", False):
	            input_embeddings = self.language_model.get_input_embeddings()
	            output_embeddings = self.language_model.get_output_embeddings()
	            if (
	                input_embeddings is not None
	                and output_embeddings is not None
	                and input_embeddings.weight.device != output_embeddings.weight.device
	            ):
	                head_weight = output_embeddings.weight
	                # Preserve the trainable flag; nn.Parameter defaults to requires_grad=True.
	                output_embeddings.weight = nn.Parameter(
	                    head_weight.to(input_embeddings.weight.device),
	                    requires_grad=head_weight.requires_grad,
	                )
	        return self.language_model.tie_weights(**kwargs)

	    def resize_token_embeddings(
	        self,
	        new_num_tokens: Optional[int] = None,
	        pad_to_multiple_of: Optional[int] = None,
	    ) -> nn.Embedding:
	        """Resize the language model's token embeddings and sync ``text_config.vocab_size``."""
	        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
	        self.config.text_config.vocab_size = model_embeds.num_embeddings
	        return model_embeds

	    def _scatter_features(
	        self,
	        inputs_embeds: torch.Tensor,
	        input_ids: Optional[torch.LongTensor],
	        token_id: int,
	        features: List[torch.Tensor],
	        modality: str,
	    ) -> torch.Tensor:
	        """Write encoder ``features`` over the ``token_id`` placeholders of ``inputs_embeds``.

	        ``inputs_embeds`` is modified in place and returned.

	        Raises:
	            ValueError: If ``input_ids`` is missing, or if the number of
	                placeholder tokens does not match the number of feature rows.
	        """
	        if input_ids is None:
	            raise ValueError(
	                f"`input_ids` is required to locate the {modality} placeholder tokens "
	                f"when {modality} inputs are provided."
	            )
	        hidden_size = inputs_embeds.shape[-1]
	        stacked = torch.cat(features).to(device=inputs_embeds.device, dtype=inputs_embeds.dtype)
	        # Index on the embeddings' device: with device_map="auto" the ids may live elsewhere.
	        positions = input_ids.to(inputs_embeds.device).eq(token_id).nonzero(as_tuple=False)
	        # Without this check a single feature row would be broadcast silently
	        # over every placeholder.
	        if stacked.numel() != positions.shape[0] * hidden_size:
	            raise ValueError(
	                f"Number of {modality} placeholder tokens ({positions.shape[0]}) does not match the "
	                f"{modality} features of shape {tuple(stacked.shape)} for hidden size {hidden_size}."
	            )
	        inputs_embeds[positions[:, 0], positions[:, 1]] = stacked.reshape(-1, hidden_size)
	        return inputs_embeds

	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        pixel_values: Optional[torch.FloatTensor] = None,
	        past_key_values: Optional[Cache] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        token_type_ids: Optional[torch.LongTensor] = None,
	        use_cache: Optional[bool] = None,
	        cache_position: Optional[torch.LongTensor] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	        # audio inputs (from processor)
	        audio_values: Optional[torch.FloatTensor] = None,
	        audio_attention_mask: Optional[torch.FloatTensor] = None,
	        audio_masks: Optional[List[torch.Tensor]] = None,  # reserved; not used in forward
	        num_audio_tokens: Optional[torch.LongTensor] = None,  # reserved; not used in forward
	        # vision inputs (from processor)
	        image_grid_thw: Optional[torch.LongTensor] = None,
	        num_image_tokens: Optional[torch.LongTensor] = None,  # reserved; not used in forward
	        # video inputs (from processor)
	        pixel_values_videos: Optional[torch.FloatTensor] = None,
	        video_grid_thw: Optional[torch.LongTensor] = None,
	        num_video_tokens: Optional[torch.LongTensor] = None,  # reserved; not used in forward
	        video_audio_values: Optional[torch.FloatTensor] = None,
	        video_audio_attention_mask: Optional[torch.FloatTensor] = None,
	        video_audio_masks: Optional[List[torch.Tensor]] = None,  # reserved; not used in forward
	        num_video_audio_tokens: Optional[torch.LongTensor] = None,  # reserved; not used in forward
	        **kwargs,
	    ) -> Union[Tuple, BaseModelOutputWithPast]:
	        """
	        Fuse multimodal inputs into token embeddings and run the language model backbone.

	        Image, video, and audio tokens identified by their respective token IDs in
	        ``input_ids`` are replaced with the corresponding encoder+projector outputs
	        before being passed to the language model. ``token_type_ids`` and
	        ``**kwargs`` are accepted but not forwarded to the language model.

	        Returns:
	            ``BaseModelOutputWithPast`` (or tuple when ``return_dict=False``).

	        Raises:
	            ValueError: If ``audio_values`` is given (standalone audio is not
	                supported), if neither ``input_ids`` nor ``inputs_embeds`` is
	                given, or if multimodal inputs cannot be matched to placeholder
	                tokens in ``input_ids``.
	        """
	        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	        if audio_values is not None:
	            raise ValueError(
	                "Standalone audio input (`audio_values`) is not supported by this model. "
	                "Audio is only supported as part of video input (`video_audio_values`)."
	            )

	        if inputs_embeds is None:
	            if input_ids is None:
	                raise ValueError("You must specify either `input_ids` or `inputs_embeds`.")
	            # With device_map="auto", accelerate hooks may have a stale execution_device
	            # that differs from the actual weight device (e.g. due to tied embeddings).
	            # Bypass the hook by calling F.embedding directly so that input and weight
	            # are guaranteed to be on the same device.
	            embed_module = self.get_input_embeddings()
	            inputs_embeds = F.embedding(
	                input_ids.to(embed_module.weight.device),
	                embed_module.weight,
	                embed_module.padding_idx,
	            )

	        if pixel_values is not None:
	            image_features = self.process_image_input(
	                pixel_values=pixel_values,
	                image_grid_thw=image_grid_thw,
	            )
	            inputs_embeds = self._scatter_features(
	                inputs_embeds, input_ids, self.config.image_token_id, image_features, "image"
	            )

	        if pixel_values_videos is not None:
	            video_features = self.process_video_input(
	                pixel_values_videos=pixel_values_videos,
	                video_grid_thw=video_grid_thw,
	            )
	            inputs_embeds = self._scatter_features(
	                inputs_embeds, input_ids, self.config.video_token_id, video_features, "video"
	            )

	        if video_audio_values is not None and self.audio_model is not None:
	            video_audio_token_id = getattr(self.config, "video_audio_token_id", None)
	            if video_audio_token_id is not None:
	                video_audio_features = self.process_audio_input(
	                    audio_values=video_audio_values,
	                    audio_attention_mask=video_audio_attention_mask,
	                )
	                inputs_embeds = self._scatter_features(
	                    inputs_embeds, input_ids, video_audio_token_id, video_audio_features, "video-audio"
	                )

	        # The language model consumes the fused embeddings only.
	        return self.language_model.base_model(
	            input_ids=None,
	            inputs_embeds=inputs_embeds,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_values=past_key_values,
	            use_cache=use_cache,
	            cache_position=cache_position,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )

	    def _encode_visual(
	        self,
	        pixels: torch.FloatTensor,
	        grid_thw: Optional[torch.LongTensor],
	    ) -> List[torch.Tensor]:
	        """Run the vision encoder then the projector; shared by images and videos."""
	        features = self.vision_model(pixels, grid_thw=grid_thw)
	        features = self.mm_projector(features)
	        return [features]

	    def process_image_input(
	        self,
	        pixel_values: torch.FloatTensor,
	        image_grid_thw: Optional[torch.LongTensor] = None,
	    ) -> List[torch.Tensor]:
	        """Encode image pixel values into LLM-space feature tensors.

	        Args:
	            pixel_values: Flat tensor of shape ``(total_patches, channels * patch_size * patch_size)``.
	            image_grid_thw: Grid shape ``(num_images, 3)`` with (T, H, W) per image.

	        Returns:
	            List containing one tensor of shape ``(total_image_tokens, hidden_size)``.
	        """
	        return self._encode_visual(pixel_values, image_grid_thw)

	    def process_video_input(
	        self,
	        pixel_values_videos: torch.FloatTensor,
	        video_grid_thw: Optional[torch.LongTensor] = None,
	    ) -> List[torch.Tensor]:
	        """Encode video pixel values into LLM-space feature tensors.

	        Args:
	            pixel_values_videos: Flat tensor of shape ``(total_patches, channels * patch_size * patch_size)``.
	            video_grid_thw: Grid shape ``(num_videos, 3)`` with (T, H, W) per video.

	        Returns:
	            List containing one tensor of shape ``(total_video_tokens, hidden_size)``.
	        """
	        return self._encode_visual(pixel_values_videos, video_grid_thw)


	class HyperCLOVAXVisionV2ForCausalLM(HyperCLOVAXVisionV2PreTrainedModel, GenerationMixin):
	    """HyperCLOVAX-Vision-V2 model with a causal language modelling head.

	    Wraps ``HyperCLOVAXVisionV2Model`` as ``self.model`` and produces logits with
	    the LM head owned by the wrapped language model.
	    """

	    def __init__(
	        self,
	        config: HyperCLOVAXVisionV2Config,
	    ) -> None:
	        super().__init__(config)
	        self.model = HyperCLOVAXVisionV2Model(config)
	        self.post_init()

	    # Delegate embedding / decoder accessors to the inner model
	    def get_input_embeddings(self) -> nn.Embedding:
	        """Return the input embedding module of the inner model."""
	        return self.model.get_input_embeddings()

	    def set_input_embeddings(
	        self,
	        value: nn.Embedding,
	    ) -> None:
	        """Replace the input embedding module of the inner model."""
	        self.model.set_input_embeddings(value)

	    def get_output_embeddings(self) -> nn.Linear:
	        """Return the output embedding (LM head) module of the inner model."""
	        return self.model.get_output_embeddings()

	    def set_output_embeddings(
	        self,
	        new_embeddings: nn.Linear,
	    ) -> None:
	        """Replace the output embedding (LM head) module of the inner model."""
	        self.model.set_output_embeddings(new_embeddings)

	    def get_decoder(self) -> nn.Module:
	        """Return the decoder of the inner model."""
	        return self.model.get_decoder()

	    def set_decoder(
	        self,
	        decoder: nn.Module,
	    ) -> None:
	        """Replace the decoder of the inner model."""
	        self.model.set_decoder(decoder)

	    def tie_weights(
	        self,
	        **kwargs,
	    ) -> None:
	        """Delegate weight tying to the inner model."""
	        return self.model.tie_weights(**kwargs)

	    def resize_token_embeddings(
	        self,
	        new_num_tokens: Optional[int] = None,
	        pad_to_multiple_of: Optional[int] = None,
	    ) -> nn.Embedding:
	        """Delegate token-embedding resizing to the inner model."""
	        return self.model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)

	    # Convenience properties
	    @property
	    def language_model(self) -> nn.Module:
	        return self.model.language_model

	    @property
	    def vision_model(self) -> nn.Module:
	        return self.model.vision_model

	    @property
	    def mm_projector(self) -> nn.Module:
	        return self.model.mm_projector

	    @property
	    def audio_model(self) -> Optional[nn.Module]:
	        return self.model.audio_model

	    @property
	    def audio_projector(self) -> Optional[nn.Module]:
	        return self.model.audio_projector

	    @property
	    def vision_model_type(self) -> str:
	        return self.model.vision_config.model_type

	    @property
	    def anyres(self) -> bool:
	        return self.model.anyres

	    @property
	    def image_newline(self) -> Optional[nn.Parameter]:
	        # The inner model only creates this parameter when anyres is enabled;
	        # return None otherwise instead of raising AttributeError.
	        return getattr(self.model, "image_newline", None)

	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        pixel_values: Optional[torch.FloatTensor] = None,
	        past_key_values: Optional[Cache] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        token_type_ids: Optional[torch.LongTensor] = None,
	        labels: Optional[torch.LongTensor] = None,
	        use_cache: Optional[bool] = None,
	        cache_position: Optional[torch.LongTensor] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	        # audio inputs (from processor)
	        audio_values: Optional[torch.FloatTensor] = None,
	        audio_attention_mask: Optional[torch.FloatTensor] = None,
	        audio_masks: Optional[List[torch.Tensor]] = None,  # reserved; not used in forward
	        num_audio_tokens: Optional[torch.LongTensor] = None,  # reserved; not used in forward
	        # vision inputs (from processor)
	        image_grid_thw: Optional[torch.LongTensor] = None,
	        num_image_tokens: Optional[torch.LongTensor] = None,  # reserved; not used in forward
	        # video inputs (from processor)
	        pixel_values_videos: Optional[torch.FloatTensor] = None,
	        video_grid_thw: Optional[torch.LongTensor] = None,
	        num_video_tokens: Optional[torch.LongTensor] = None,  # reserved; not used in forward
	        video_audio_values: Optional[torch.FloatTensor] = None,
	        video_audio_attention_mask: Optional[torch.FloatTensor] = None,
	        video_audio_masks: Optional[List[torch.Tensor]] = None,  # reserved; not used in forward
	        num_video_audio_tokens: Optional[torch.LongTensor] = None,  # reserved; not used in forward
	        logits_to_keep: Union[int, torch.Tensor] = 0,
	        **kwargs,
	    ) -> Union[Tuple, CausalLMOutputWithPast]:
	        """
	        Multimodal causal language model forward pass.

	        Calls the backbone model to fuse multimodal inputs, then computes logits
	        via the LM head, multiplied by ``text_config.logits_scaling`` (1.0 when
	        absent). Loss is computed against ``labels`` when provided; ``**kwargs``
	        are forwarded to the loss function only.

	        Args:
	            logits_to_keep: If an ``int``, logits are computed for the last
	                ``logits_to_keep`` positions (``0`` keeps all positions). If a
	                tensor, it is used directly to index the sequence dimension.

	        Returns:
	            ``CausalLMOutputWithPast`` (or tuple when ``return_dict=False``).
	        """
	        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	        # Always request a ModelOutput from the backbone so its fields can be read
	        # by name; the tuple form is produced at the end when requested.
	        outputs = self.model(
	            input_ids=input_ids,
	            pixel_values=pixel_values,
	            past_key_values=past_key_values,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            inputs_embeds=inputs_embeds,
	            token_type_ids=token_type_ids,
	            use_cache=use_cache,
	            cache_position=cache_position,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=True,
	            audio_values=audio_values,
	            audio_attention_mask=audio_attention_mask,
	            image_grid_thw=image_grid_thw,
	            num_image_tokens=num_image_tokens,
	            pixel_values_videos=pixel_values_videos,
	            video_grid_thw=video_grid_thw,
	            num_video_tokens=num_video_tokens,
	            video_audio_values=video_audio_values,
	            video_audio_attention_mask=video_audio_attention_mask,
	            video_audio_masks=video_audio_masks,
	            num_video_audio_tokens=num_video_audio_tokens,
	        )
	        hidden_states = outputs[0]
	        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
	        logits_scaling = getattr(self.config.text_config, "logits_scaling", 1.0)
	        logits = self.model.language_model.lm_head(hidden_states[:, slice_indices, :]) * logits_scaling

	        loss = None
	        if labels is not None:
	            loss = self.loss_function(
	                logits=logits,
	                labels=labels,
	                vocab_size=self.config.text_config.vocab_size,
	                **kwargs,
	            )

	        if not return_dict:
	            output = (logits,) + tuple(outputs[1:])
	            return ((loss,) + output) if loss is not None else output

	        return CausalLMOutputWithPast(
	            loss=loss,
	            logits=logits,
	            past_key_values=outputs.past_key_values,
	            hidden_states=outputs.hidden_states,
	            attentions=outputs.attentions,
	        )

	    def prepare_inputs_for_generation(
	        self,
	        input_ids: torch.LongTensor,
	        past_key_values: Optional[Cache] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        cache_position: Optional[torch.LongTensor] = None,
	        pixel_values: Optional[torch.FloatTensor] = None,
	        image_grid_thw: Optional[torch.LongTensor] = None,
	        pixel_values_videos: Optional[torch.FloatTensor] = None,
	        video_grid_thw: Optional[torch.LongTensor] = None,
	        audio_values: Optional[torch.FloatTensor] = None,
	        audio_attention_mask: Optional[torch.FloatTensor] = None,
	        video_audio_values: Optional[torch.FloatTensor] = None,
	        video_audio_attention_mask: Optional[torch.FloatTensor] = None,
	        **kwargs: Any,
	    ) -> Dict[str, Any]:
	        """Build per-step model inputs; multimodal tensors are attached on prefill only.

	        Returns:
	            The dict produced by ``GenerationMixin.prepare_inputs_for_generation``,
	            extended with the multimodal inputs when no KV cache content exists yet.
	        """
	        # Overwritten -- multimodal inputs are declared as explicit named params
	        # so they are naturally excluded from **kwargs and do not leak into super().
	        model_inputs = super().prepare_inputs_for_generation(
	            input_ids,
	            past_key_values=past_key_values,
	            attention_mask=attention_mask,
	            inputs_embeds=inputs_embeds,
	            cache_position=cache_position,
	            **kwargs,
	        )

	        # Prefill detection: no past KV cache yet.
	        # - transformers 4.x: past_key_values is None
	        # - transformers 5.x: pre-creates an empty DynamicCache, so get_seq_length() == 0
	        is_prefill = past_key_values is None or past_key_values.get_seq_length() == 0
	        if is_prefill:
	            model_inputs.update(
	                {
	                    "pixel_values": pixel_values,
	                    "image_grid_thw": image_grid_thw,
	                    "pixel_values_videos": pixel_values_videos,
	                    "video_grid_thw": video_grid_thw,
	                    "audio_values": audio_values,
	                    "audio_attention_mask": audio_attention_mask,
	                    "video_audio_values": video_audio_values,
	                    "video_audio_attention_mask": video_audio_attention_mask,
	                }
	            )

	        return model_inputs


	class HyperCLOVAXVisionV2ForSequenceClassification(HyperCLOVAXVisionV2PreTrainedModel):
	    """HyperCLOVAX-Vision-V2 model with a sequence classification head.

	    Wraps ``HyperCLOVAXVisionV2Model`` as ``self.model`` and applies a bias-free
	    linear ``score`` layer to the backbone hidden states.
	    """

	    def __init__(
	        self,
	        config: HyperCLOVAXVisionV2Config,
	    ) -> None:
	        super().__init__(config)
	        self.num_labels = getattr(config, "num_labels", 2)
	        self.model = HyperCLOVAXVisionV2Model(config)
	        # Prefer a top-level `hidden_size` when the composite config defines one;
	        # otherwise use the text backbone's width, which is what the hidden
	        # states fed to `score` come from.
	        hidden_size = getattr(config, "hidden_size", None)
	        if hidden_size is None:
	            hidden_size = config.text_config.hidden_size
	        self.score = nn.Linear(hidden_size, self.num_labels, bias=False)
	        self.post_init()

	    def get_input_embeddings(self) -> nn.Embedding:
	        """Return the input embedding module of the inner model."""
	        return self.model.get_input_embeddings()

	    def set_input_embeddings(
	        self,
	        value: nn.Embedding,
	    ) -> None:
	        """Replace the input embedding module of the inner model."""
	        self.model.set_input_embeddings(value)

	    def forward(
	        self,
	        input_ids: Optional[torch.LongTensor] = None,
	        pixel_values: Optional[torch.FloatTensor] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_values: Optional[Cache] = None,
	        inputs_embeds: Optional[torch.FloatTensor] = None,
	        token_type_ids: Optional[torch.LongTensor] = None,
	        labels: Optional[torch.LongTensor] = None,
	        use_cache: Optional[bool] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	        # vision inputs (from processor)
	        image_grid_thw: Optional[torch.LongTensor] = None,
	        num_image_tokens: Optional[torch.LongTensor] = None,  # reserved; not used in forward
	        # video inputs (from processor)
	        pixel_values_videos: Optional[torch.FloatTensor] = None,
	        video_grid_thw: Optional[torch.LongTensor] = None,
	        num_video_tokens: Optional[torch.LongTensor] = None,  # reserved; not used in forward
	    ) -> SequenceClassifierOutputWithPast:
	        """
	        Sequence classification forward pass.

	        Scores every position with ``self.score``, then keeps the logits at the
	        last non-padding token of each sequence (the last position when no pad
	        token is configured or ``input_ids`` is ``None``). Loss is computed
	        against ``labels`` when provided.

	        Returns:
	            ``SequenceClassifierOutputWithPast``, or a plain tuple when
	            ``return_dict=False``.

	        Raises:
	            ValueError: If the batch size is not 1 and no ``pad_token_id`` is configured.
	        """
	        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	        # Always request a ModelOutput from the backbone so its fields can be read
	        # by name; the tuple form is produced at the end when requested.
	        transformer_outputs: BaseModelOutputWithPast = self.model(
	            input_ids=input_ids,
	            pixel_values=pixel_values,
	            attention_mask=attention_mask,
	            position_ids=position_ids,
	            past_key_values=past_key_values,
	            inputs_embeds=inputs_embeds,
	            token_type_ids=token_type_ids,
	            use_cache=use_cache,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=True,
	            image_grid_thw=image_grid_thw,
	            num_image_tokens=num_image_tokens,
	            pixel_values_videos=pixel_values_videos,
	            video_grid_thw=video_grid_thw,
	            num_video_tokens=num_video_tokens,
	        )
	        hidden_states = transformer_outputs[0]
	        logits = self.score(hidden_states)

	        batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
	        if self.config.pad_token_id is None and batch_size != 1:
	            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")

	        if self.config.pad_token_id is None or input_ids is None:
	            last_non_pad_token = -1
	        else:
	            # Right-most index whose token is not padding: non-pad positions keep
	            # their index, pad positions become 0, and argmax picks the largest.
	            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
	            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
	            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)

	        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

	        loss = None
	        if labels is not None:
	            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

	        if not return_dict:
	            output = (pooled_logits,) + tuple(transformer_outputs[1:])
	            return ((loss,) + output) if loss is not None else output

	        return SequenceClassifierOutputWithPast(
	            loss=loss,
	            logits=pooled_logits,
	            past_key_values=transformer_outputs.past_key_values,
	            hidden_states=transformer_outputs.hidden_states,
	            attentions=transformer_outputs.attentions,
	        )


	# Register the config under the "hyperclovax_vision_v2" model type and map it
	# to the three model classes in the transformers Auto* factories. These calls
	# execute as a side effect of importing this module.
	AutoConfig.register("hyperclovax_vision_v2", HyperCLOVAXVisionV2Config)
	AutoModel.register(HyperCLOVAXVisionV2Config, HyperCLOVAXVisionV2Model)
	AutoModelForCausalLM.register(HyperCLOVAXVisionV2Config, HyperCLOVAXVisionV2ForCausalLM)
	AutoModelForSequenceClassification.register(HyperCLOVAXVisionV2Config, HyperCLOVAXVisionV2ForSequenceClassification)

	# Names exported by `from <module> import *`; the projector and norm helper
	# classes defined above are not part of this list.
	__all__ = [
	"HyperCLOVAXVisionV2PreTrainedModel",
	"HyperCLOVAXVisionV2Model",
	"HyperCLOVAXVisionV2ForCausalLM",
	"HyperCLOVAXVisionV2ForSequenceClassification",
	]