# reka-edge-2603 / modeling_yasa2.py — uploaded by donovanOng92 (revision 7d24555, verified)
from __future__ import annotations
import dataclasses
import glob
from collections.abc import Callable
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, Union, cast
import numpy as np
import torch
import torch.nn as nn
from einops import rearrange
from safetensors.torch import load_file as safetensors_load
from transformers import PretrainedConfig
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache, DynamicCache
from transformers.generation.utils import GenerationMixin
from transformers.integrations import use_kernel_forward_from_hub
from transformers.masking_utils import create_causal_mask
from transformers.modeling_layers import GradientCheckpointingLayer
from transformers.modeling_outputs import (
BaseModelOutputWithPast,
BaseModelOutputWithPooling,
)
from transformers.modeling_rope_utils import (
ROPE_INIT_FUNCTIONS,
dynamic_rope_update,
)
from transformers.modeling_utils import (
ALL_ATTENTION_FUNCTIONS,
PreTrainedModel,
)
from transformers.processing_utils import Unpack
from transformers.utils import (
ModelOutput,
TransformersKwargs,
auto_docstring,
can_return_tuple,
logging,
)
from transformers.utils.deprecation import deprecate_kwarg
# `check_model_inputs` only exists in recent transformers releases; on older
# versions fall back to a no-op decorator factory with the same call shape.
try:
    from transformers.utils.generic import check_model_inputs
except ImportError:

    def check_model_inputs(*args, **kwargs):
        """No-op stand-in: return a decorator that leaves the function unchanged."""

        def _passthrough(decorated_fn):
            return decorated_fn

        return _passthrough
from .configuration_yasa2 import ConvNextConfig, Yasa2Config, YasaConfig
logger = logging.get_logger(__name__)
# ---- Model outputs ----
@dataclasses.dataclass
class Yasa2ModelOutputWithPast(BaseModelOutputWithPast):
    """
    Base class for Yasa2 model outputs with past key values.
    Args:
        last_hidden_state (`torch.FloatTensor`, *optional*):
            Last hidden state of the model.
        past_key_values (`Cache`, *optional*):
            Cache of key/value tensors for each layer.
        hidden_states (`Tuple[torch.FloatTensor]`, *optional*):
            Tuple of hidden states from the model.
        attentions (`Tuple[torch.FloatTensor]`, *optional*):
            Tuple of attention maps from the model.
        vision_hidden_states (`torch.FloatTensor`, *optional*):
            Vision embeddings associated with this forward pass, if any were
            produced.
    """
    last_hidden_state: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Cache] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    vision_hidden_states: Optional[torch.FloatTensor] = None
@dataclasses.dataclass
class Yasa2ForConditionalGenerationModelOutput(ModelOutput):
    """
    Outputs for Yasa2 conditional generation, aggregating language-model and
    vision results.
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`Cache`, *optional*, returned when `use_cache=True`):
            Cache of key/value tensors for each layer.
        hidden_states (`Tuple[torch.FloatTensor]`, *optional*):
            Tuple of hidden states from the language model.
        attentions (`Tuple[torch.FloatTensor]`, *optional*):
            Tuple of attention maps from the language model.
        vision_hidden_states (`torch.FloatTensor`, *optional*):
            Vision embeddings after projection and pooling.
        language_model_outputs (`Yasa2ModelOutputWithPast`, *optional*):
            The full language model outputs.
    """
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Cache] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    vision_hidden_states: Optional[torch.FloatTensor] = None
    language_model_outputs: Optional[Yasa2ModelOutputWithPast] = None
# ---- Utilities ----
def get_2d_sincos_pos_embed(
    embed_dim: int, image_size: int | tuple[int, int]
) -> np.ndarray:
    """Generate 2D sincos positional embeddings for a vision grid.

    Args:
        embed_dim (int): Embedding dimension (must be even).
        image_size (int | tuple[int, int]): Image size as an int or
            (height, width) tuple.

    Returns:
        np.ndarray: Positional embedding array of shape (H, W, embed_dim)
        (note: not flattened).
    """
    if isinstance(image_size, int):
        height = width = image_size
    else:
        height, width = image_size[0], image_size[1]
    # Coordinate grid over the spatial dimensions; with default 'xy' indexing
    # the stacked array has shape (2, H, W).
    coords = np.stack(
        np.meshgrid(
            np.arange(width, dtype=np.float32),
            np.arange(height, dtype=np.float32),
        ),
        axis=0,
    )
    return get_2d_sincos_pos_embed_from_grid(embed_dim, coords)
def get_2d_sincos_pos_embed_from_grid(
    embed_dim: int, grid: np.ndarray
) -> np.ndarray:
    """Generate 2D sincos positional embeddings from a coordinate grid.

    Args:
        embed_dim (int): Embedding dimension (must be even).
        grid (np.ndarray): Grid array of shape (2, H, W).

    Returns:
        np.ndarray: Positional embedding array of shape (H, W, embed_dim).
    """
    assert embed_dim % 2 == 0
    # Half of the embedding channels encode each spatial axis.
    half = embed_dim // 2
    per_axis = [
        get_1d_sincos_pos_embed_from_grid(half, axis_grid)
        for axis_grid in (grid[0], grid[1])
    ]
    return np.concatenate(per_axis, axis=-1)
def get_1d_sincos_pos_embed_from_grid(
    embed_dim: int, pos: np.ndarray
) -> np.ndarray:
    """Generate 1D sincos positional embeddings from a positional array.

    Args:
        embed_dim (int): Embedding dimension (must be even).
        pos (np.ndarray): 2D position grid for one spatial axis.

    Returns:
        np.ndarray: Array of shape pos.shape + (embed_dim,), with sin features
        in the first half of the channels and cos features in the second half.
    """
    assert embed_dim % 2 == 0
    # Classic transformer frequency schedule: 1 / 10000^(2i / embed_dim).
    freqs = 1.0 / 10000 ** (
        np.arange(embed_dim // 2, dtype=np.float32) / (embed_dim / 2.0)
    )
    angles = np.einsum("hw,d->hwd", pos, freqs)
    return np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
# ---- ConvNeXt V2 backbone ----
def drop_path(
    input: torch.Tensor,
    drop_prob: Optional[float] = 0.0,
    training: bool = False,
) -> torch.Tensor:
    """Apply stochastic depth (drop path) to the input tensor.

    Args:
        input (torch.Tensor): Input tensor to apply drop path to.
        drop_prob (Optional[float]): Probability of dropping a path. ``None``
            is treated as 0.0 (``ConvNextDropPath`` defaults its probability
            to ``None``). Defaults to 0.0.
        training (bool): Whether the model runs in training mode. Defaults to False.

    Returns:
        torch.Tensor: Tensor with drop path applied when enabled; the input
        unchanged when disabled or in eval mode.
    """
    # Fix: ConvNextDropPath's default drop_prob is None, which previously
    # raised TypeError here (`1 - None`) during training; treat None as 0.0.
    if drop_prob is None or drop_prob == 0.0 or not training:
        return input
    keep_prob = 1 - drop_prob
    # One Bernoulli draw per sample, broadcast over all remaining dims.
    shape = (input.shape[0],) + (1,) * (input.ndim - 1)
    random_tensor = keep_prob + torch.rand(
        shape, dtype=input.dtype, device=input.device
    )
    random_tensor.floor_()  # binarize: 1 with prob keep_prob, else 0
    # Scale surviving paths by 1/keep_prob so the expected value is unchanged.
    output = input.div(keep_prob) * random_tensor
    return output
class ConvNextDropPath(nn.Module):
    """Drop paths (stochastic depth) per sample in residual blocks."""

    def __init__(self, drop_prob: Optional[float] = None):
        """Store the per-sample drop probability.

        Args:
            drop_prob (Optional[float]): Probability of dropping a path.
        """
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply stochastic depth to the hidden states.

        Args:
            hidden_states (torch.Tensor): Tensor to apply stochastic depth to.

        Returns:
            torch.Tensor: Tensor after stochastic depth (identity in eval mode).
        """
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        """Describe the configured drop probability for ``repr`` output."""
        return f"p={self.drop_prob}"
class ConvNextLayerNorm(nn.Module):
    r"""LayerNorm supporting channels_last (default) or channels_first layouts.

    channels_last expects inputs shaped (batch_size, height, width, channels);
    channels_first expects (batch_size, channels, height, width).
    """

    def __init__(
        self,
        normalized_shape: int,
        eps: float = 1e-6,
        data_format: str = "channels_last",
    ) -> None:
        """Initialize the affine parameters and layout mode.

        Args:
            normalized_shape (int): Number of channels being normalized.
            eps (float): Small epsilon to avoid division by zero.
            data_format (str): Either 'channels_last' or 'channels_first'.

        Raises:
            NotImplementedError: If data_format is not supported.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ("channels_last", "channels_first"):
            raise NotImplementedError(
                f"Unsupported data format: {self.data_format}"
            )
        self.normalized_shape = (normalized_shape,)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalize ``x`` according to the configured layout.

        Args:
            x (torch.Tensor): (N, H, W, C) for channels_last, or (N, C, H, W)
                for channels_first.

        Returns:
            torch.Tensor: Normalized tensor with the same shape as the input.
        """
        if self.data_format == "channels_last":
            return nn.functional.layer_norm(
                x, self.normalized_shape, self.weight, self.bias, self.eps
            )
        # channels_first: normalize over dim 1 manually, computing statistics
        # in fp32 for stability, then restore the input dtype before affine.
        orig_dtype = x.dtype
        x32 = x.float()
        mean = x32.mean(1, keepdim=True)
        var = (x32 - mean).pow(2).mean(1, keepdim=True)
        normed = ((x32 - mean) / torch.sqrt(var + self.eps)).to(dtype=orig_dtype)
        return self.weight[:, None, None] * normed + self.bias[:, None, None]
class ConvNextV2GRN(nn.Module):
    """Global Response Normalization (GRN) layer for ConvNeXt V2."""

    def __init__(self, dim: int):
        """Create the learnable scale and bias, both initialized to zero.

        Args:
            dim (int): Channel dimension of the input tensor.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.zeros(1, 1, 1, dim))
        self.bias = nn.Parameter(torch.zeros(1, 1, 1, dim))

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        """Apply Global Response Normalization.

        Args:
            hidden_states (torch.FloatTensor): Tensor shaped (batch, height, width, channels).

        Returns:
            torch.FloatTensor: Normalized tensor with the same shape.
        """
        # Per-channel L2 norm over the spatial dims ...
        spatial_l2 = torch.norm(hidden_states, p=2, dim=(1, 2), keepdim=True)
        # ... divided by the mean norm across channels (divisive normalization).
        relative = spatial_l2 / (spatial_l2.mean(dim=-1, keepdim=True) + 1e-6)
        # Residual form: learnable affine on the modulated signal plus identity.
        return self.weight * (hidden_states * relative) + self.bias + hidden_states
class ConvNextEmbeddings(nn.Module):
    """ConvNeXt patch embedding layer."""

    def __init__(
        self, num_channels: int = 3, hidden_size: int = 96, patch_size: int = 4
    ) -> None:
        """Set up the patchifying convolution and its layer norm.

        Args:
            num_channels (int): Number of image channels. Defaults to 3.
            hidden_size (int): Hidden dimension size. Defaults to 96.
            patch_size (int): Side length of the square, non-overlapping
                patches. Defaults to 4.
        """
        super().__init__()
        # stride == kernel_size, so each patch is embedded independently.
        self.patch_embeddings = nn.Conv2d(
            num_channels,
            hidden_size,
            kernel_size=patch_size,
            stride=patch_size,
        )
        self.layernorm = ConvNextLayerNorm(
            hidden_size, eps=1e-6, data_format="channels_first"
        )
        self.num_channels = num_channels

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        """Embed an image batch into patch features.

        Args:
            pixel_values (torch.FloatTensor): Tensor shaped (batch, channels, height, width).

        Returns:
            torch.Tensor: Normalized patch embeddings in channels-first layout.

        Raises:
            ValueError: If the channel dimension does not match the configured count.
        """
        if pixel_values.shape[1] != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        return self.layernorm(self.patch_embeddings(pixel_values))
class ConvNextLayer(nn.Module):
    """ConvNeXt V2 block: depthwise conv -> LN -> MLP with GRN -> residual."""

    def __init__(
        self,
        dim: int,
        drop_path: float = 0,
        layer_scale_init_value: float = 1e-6,
        use_grn: bool = True,
    ) -> None:
        """Construct one ConvNeXt V2 layer.

        Args:
            dim (int): Input/output channel dimension.
            drop_path (float): Drop path probability for stochastic depth.
            layer_scale_init_value (float): Initial residual-branch scale;
                values <= 0 disable layer scaling.
            use_grn (bool): Must be True; V2 blocks always use GRN.

        Raises:
            ValueError: If use_grn is False.
        """
        super().__init__()
        # Submodule creation order preserved (parameter init consumes RNG).
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.layernorm = ConvNextLayerNorm(dim, eps=1e-6)
        self.pwconv1 = nn.Linear(dim, 4 * dim)
        self.act = nn.GELU()
        if not use_grn:
            raise ValueError("ConvNeXt V2 requires use_grn=True.")
        self.grn = ConvNextV2GRN(4 * dim)
        self.pwconv2 = nn.Linear(4 * dim, dim)
        if layer_scale_init_value > 0:
            self.layer_scale_parameter = nn.Parameter(
                layer_scale_init_value * torch.ones((dim)), requires_grad=True
            )
        else:
            self.layer_scale_parameter = None
        self.drop_path = (
            ConvNextDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        )

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        """Run one block and add the residual.

        Args:
            hidden_states (torch.FloatTensor): Tensor shaped (batch, channels, height, width).

        Returns:
            torch.Tensor: Tensor of the same shape after the residual update.
        """
        residual = hidden_states
        # Depthwise spatial mixing, then switch to channels-last for the MLP.
        out = self.dwconv(hidden_states).permute(0, 2, 3, 1)
        out = self.pwconv1(self.layernorm(out))
        out = self.grn(self.act(out))
        out = self.pwconv2(out)
        if self.layer_scale_parameter is not None:
            out = self.layer_scale_parameter * out
        # Back to channels-first before the (optionally dropped) residual add.
        out = out.permute(0, 3, 1, 2)
        return residual + self.drop_path(out)
class ConvNextStage(nn.Module):
    """ConvNeXt V2 stage: optional downsampling followed by residual blocks."""

    def __init__(
        self,
        in_channels: int,
        out_channels: int,
        kernel_size: int = 2,
        stride: int = 2,
        depth: int = 2,
        drop_path_rates: Optional[list[float]] = None,
        layer_scale_init_value: float = 1e-6,
        use_grn: bool = True,
    ) -> None:
        """Build a ConvNeXt stage.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            kernel_size (int): Kernel size of the downsampling convolution.
            stride (int): Stride of the downsampling convolution.
            depth (int): Number of residual layers in the stage.
            drop_path_rates (Optional[list[float]]): Per-layer drop path rates;
                defaults to all zeros.
            layer_scale_init_value (float): Initial residual scaling value.
            use_grn (bool): Whether to enable GRN (required True by ConvNextLayer).
        """
        super().__init__()
        # Downsample whenever the channel count or resolution changes;
        # otherwise pass through untouched.
        if in_channels != out_channels or stride > 1:
            self.downsampling_layer = nn.Sequential(
                ConvNextLayerNorm(
                    in_channels, eps=1e-6, data_format="channels_first"
                ),
                nn.Conv2d(
                    in_channels,
                    out_channels,
                    kernel_size=kernel_size,
                    stride=stride,
                ),
            )
        else:
            self.downsampling_layer = nn.Identity()
        rates = drop_path_rates or [0.0] * depth
        self.layers = nn.Sequential(
            *(
                ConvNextLayer(
                    dim=out_channels,
                    drop_path=rates[idx],
                    layer_scale_init_value=layer_scale_init_value,
                    use_grn=use_grn,
                )
                for idx in range(depth)
            )
        )

    def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor:
        """Run the stage: downsample (if configured) then the residual layers.

        Args:
            hidden_states (torch.FloatTensor): Tensor shaped (batch, channels, height, width).

        Returns:
            torch.Tensor: Output tensor after the stage.
        """
        return self.layers(self.downsampling_layer(hidden_states))
class ConvNextEncoder(nn.Module):
    """ConvNeXt V2 encoder: a sequence of stages with a linear drop-path schedule."""

    def __init__(
        self,
        hidden_sizes: list[int],
        depths: list[int],
        drop_path_rate: float = 0.0,
        layer_scale_init_value: float = 1e-6,
        use_grn: bool = True,
    ) -> None:
        """Construct the encoder stages.

        Args:
            hidden_sizes (list[int]): Hidden dimensions per stage.
            depths (list[int]): Number of layers per stage.
            drop_path_rate (float): Maximum drop path rate; rates increase
                linearly from 0 across all layers.
            layer_scale_init_value (float): Initial residual scaling.
            use_grn (bool): Whether to use GRN within layers.
        """
        super().__init__()
        self.stages = nn.ModuleList()
        self.gradient_checkpointing = False
        # One global linear schedule over the total layer count, sliced into
        # per-stage chunks.
        schedule = np.linspace(
            0.0, float(drop_path_rate), sum(depths)
        ).tolist()
        per_stage_rates = []
        offset = 0
        for stage_depth in depths:
            per_stage_rates.append(schedule[offset : offset + stage_depth])
            offset += stage_depth
        # First stage keeps the input channel count and resolution.
        in_chs = hidden_sizes[0]
        for stage_idx in range(len(hidden_sizes)):
            out_chs = hidden_sizes[stage_idx]
            self.stages.append(
                ConvNextStage(
                    in_channels=in_chs,
                    out_channels=out_chs,
                    stride=1 if stage_idx == 0 else 2,
                    depth=depths[stage_idx],
                    drop_path_rates=per_stage_rates[stage_idx],
                    layer_scale_init_value=layer_scale_init_value,
                    use_grn=use_grn,
                )
            )
            in_chs = out_chs

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Tuple:
        """Propagate through all encoder stages.

        Args:
            hidden_states (torch.FloatTensor): Tensor shaped (batch, channels, height, width).
            output_hidden_states (Optional[bool]): Whether to collect the
                hidden state before each stage and after the last one.
            return_dict (Optional[bool]): When False, drop None entries from
                the returned tuple.

        Returns:
            Tuple: (last_hidden_state, all_hidden_states) — the second entry
            is None (or omitted when return_dict=False) unless requested.
        """
        all_hidden_states = () if output_hidden_states else None
        for stage in self.stages:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)
            if self.gradient_checkpointing and self.training:
                # Trade compute for memory while training.
                hidden_states = torch.utils.checkpoint.checkpoint(
                    stage,
                    hidden_states,
                    use_reentrant=False,
                )
            else:
                hidden_states = stage(hidden_states)
        if output_hidden_states:
            all_hidden_states += (hidden_states,)
        if not return_dict:
            return tuple(
                item
                for item in (hidden_states, all_hidden_states)
                if item is not None
            )
        return (hidden_states, all_hidden_states)
class ConvNextModel(nn.Module):
    """ConvNeXt V2 model: patch embeddings, staged encoder, and a final
    LayerNorm applied to the pooled features."""
    def __init__(
        self,
        hidden_sizes: list[int],
        depths: list[int],
        num_channels: int = 3,
        patch_size: int = 4,
        drop_path_rate: float = 0.0,
        layer_scale_init_value: float = 1e-6,
        use_grn: bool = True,
    ) -> None:
        """Build the ConvNeXt V2 model with embedding, encoder, and pooling.

        Args:
            hidden_sizes (list[int]): Hidden channel sizes per stage.
            depths (list[int]): Layer counts per stage.
            num_channels (int): Number of image channels.
            patch_size (int): Patch size for initial embedding.
            drop_path_rate (float): Drop path rate range for residual blocks.
            layer_scale_init_value (float): Initial scale for residuals.
            use_grn (bool): Whether to enable GRN; must be True for V2.

        Raises:
            ValueError: If use_grn is False.
        """
        super().__init__()
        if not use_grn:
            raise ValueError("ConvNeXt V2 requires use_grn=True.")
        self.embeddings = ConvNextEmbeddings(
            num_channels, hidden_sizes[0], patch_size
        )
        self.encoder = ConvNextEncoder(
            hidden_sizes,
            depths,
            drop_path_rate,
            layer_scale_init_value,
            use_grn,
        )
        # This norm is only used on the pooled (spatial-mean) features in
        # forward(); the per-token last_hidden_state is returned unnormalized.
        self.layernorm = nn.LayerNorm(hidden_sizes[-1], eps=1e-6)
        # Initialize weights
        self.apply(self._init_weights)
    def _init_weights(self, module: nn.Module) -> None:
        """Initialize module weights following standard ConvNeXt heuristics.

        Args:
            module (nn.Module): Module to initialize.
        """
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            # N(0, 0.02) weights with zero bias for linear/conv layers.
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = True,
        return_pooled: bool = True,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """Encode images and optionally return pooled features.

        Args:
            pixel_values (Optional[torch.FloatTensor]): Input tensor shaped (batch, channels, height, width).
            output_hidden_states (Optional[bool]): Whether to return intermediate hidden states.
            return_dict (Optional[bool]): Whether to return output as BaseModelOutput.
            return_pooled (bool): Whether to include pooled output.

        Returns:
            Union[Tuple, BaseModelOutputWithPooling]: Model outputs containing
            last hidden states (channels-first feature map) and optionally the
            pooled output.

        Raises:
            ValueError: If `pixel_values` is None.
        """
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")
        embedding_output = self.embeddings(pixel_values)
        encoder_outputs = self.encoder(
            embedding_output,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = encoder_outputs[0]
        all_hidden_states = (
            encoder_outputs[1] if output_hidden_states else None
        )
        # Skip pooled output when callers only need token features.
        pooled_output = None
        if return_pooled:
            # Global average pooling, (N, C, H, W) -> (N, C), then LayerNorm.
            pooled_output = self.layernorm(last_hidden_state.mean([-2, -1]))
        if not return_dict:
            outputs = [last_hidden_state]
            if return_pooled:
                outputs.append(pooled_output)
            if output_hidden_states:
                outputs.append(all_hidden_states)
            return tuple(outputs)
        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=all_hidden_states,
        )
    @staticmethod
    def from_pretrained(model_path: Path | str) -> "ConvNextModel":
        """Load ConvNeXt model weights from a pretrained checkpoint directory.

        Tries, in order: model.safetensors, pytorch_model.bin, then sharded
        pytorch_model-*.bin files.

        Args:
            model_path (Path | str): Directory path containing the checkpoint files.

        Returns:
            ConvNextModel: Initialized model with weights loaded from checkpoint.

        Raises:
            NotImplementedError: If config.json is missing in the directory.
            FileNotFoundError: If no weight file is found.
            ValueError: If the checkpoint config has use_grn=False.
        """
        model_path_str = str(model_path)
        model_path_obj = Path(model_path_str)
        # Check if this is a HuggingFace model path
        is_ckpt_dir = (
            model_path_obj.is_dir()
            and (model_path_obj / "config.json").exists()
        )
        if not is_ckpt_dir:
            raise NotImplementedError(
                "The checkpoint path should be a directory containing config.json "
                "and model.safetensors or pytorch_model.bin files."
            )
        # Load configuration
        config = ConvNextConfig.from_pretrained(model_path_str)
        checkpoint_dir = model_path_obj
        # Create our model directly
        if not config.use_grn:
            raise ValueError(
                "ConvNeXt V2 requires use_grn=True in the checkpoint config."
            )
        logger.info(
            "Loading ConvNeXt V2 model from checkpoint: %s", checkpoint_dir
        )
        model = ConvNextModel(
            hidden_sizes=config.hidden_sizes,
            depths=config.depths,
            num_channels=config.num_channels,
            patch_size=config.patch_size,
            drop_path_rate=config.drop_path_rate,
            layer_scale_init_value=config.layer_scale_init_value,
            use_grn=config.use_grn,
        )
        # Load state dict from checkpoint files
        state_dict = {}
        # Try to load from safetensors first (preferred)
        safetensors_file = checkpoint_dir / "model.safetensors"
        if safetensors_file.exists():
            logger.info("Loading weights from %s", safetensors_file)
            state_dict = safetensors_load(str(safetensors_file))
        else:
            # Try pytorch_model.bin
            pytorch_file = checkpoint_dir / "pytorch_model.bin"
            if pytorch_file.exists():
                logger.info("Loading weights from %s", pytorch_file)
                # NOTE(review): weights_only=False unpickles arbitrary objects
                # and can execute code — only load trusted checkpoints.
                state_dict = torch.load(
                    str(pytorch_file), map_location="cpu", weights_only=False
                )
            else:
                # Try sharded checkpoints
                shard_files = sorted(
                    glob.glob(str(checkpoint_dir / "pytorch_model-*.bin"))
                )
                if shard_files:
                    logger.info(
                        "Loading weights from %s sharded files",
                        len(shard_files),
                    )
                    for shard_file in shard_files:
                        # NOTE(review): same weights_only=False caveat as above.
                        state_dict.update(
                            torch.load(
                                shard_file,
                                map_location="cpu",
                                weights_only=False,
                            )
                        )
                else:
                    raise FileNotFoundError(
                        f"Could not find model weights in {checkpoint_dir}. "
                        "Expected model.safetensors, pytorch_model.bin, or pytorch_model-*.bin files."
                    )
        # Load the mapped state dict into our model
        # strict=False tolerates partial checkpoints; mismatches are logged below.
        missing_keys, unexpected_keys = model.load_state_dict(
            state_dict, strict=False
        )
        if missing_keys:
            logger.warning(
                "Some weights of the model were not initialized from the checkpoint "
                "and are newly initialized: %s",
                missing_keys,
            )
        if unexpected_keys:
            logger.warning(
                "Some weights of the checkpoint were not used when initializing the model: %s",
                unexpected_keys,
            )
        return model
class ConvNextVisionModel(nn.Module):
    """Vision model wrapper around the ConvNeXt V2 backbone."""

    def __init__(self, config: Optional[ConvNextConfig] = None):
        """Wrap a ConvNeXt backbone for use within the multimodal stack.

        Args:
            config (Optional[ConvNextConfig]): Backbone configuration; defaults
                to `ConvNextConfig.convnext_large()` when omitted.

        Raises:
            ValueError: If the config lacks ConvNeXt attributes or sets
                use_grn=False.
        """
        super().__init__()
        if config is None:
            config = ConvNextConfig.convnext_large()
        self.config = config
        # Guard clause: anything without ConvNeXt attributes is rejected.
        if not hasattr(config, "hidden_sizes"):
            raise ValueError("Config must be a ConvNextConfig")
        hidden_sizes = config.hidden_sizes
        depths = config.depths
        num_channels = config.num_channels
        patch_size = config.patch_size
        drop_path_rate = config.drop_path_rate
        layer_scale_init_value = config.layer_scale_init_value
        use_grn = config.use_grn
        if not use_grn:
            raise ValueError("ConvNeXt V2 requires use_grn=True.")
        self.backbone = ConvNextModel(
            hidden_sizes=hidden_sizes,
            depths=depths,
            num_channels=num_channels,
            patch_size=patch_size,
            drop_path_rate=drop_path_rate,
            layer_scale_init_value=layer_scale_init_value,
            use_grn=use_grn,
        )

    @staticmethod
    def from_pretrained(model_path: Path | str) -> "ConvNextVisionModel":
        """Load a vision wrapper with pretrained ConvNeXt weights.

        Args:
            model_path (Path | str): Directory containing the pretrained weights.

        Returns:
            ConvNextVisionModel: Wrapper whose backbone holds the loaded weights.
        """
        loaded_backbone = ConvNextModel.from_pretrained(model_path)
        loaded_config = ConvNextConfig.from_pretrained(str(model_path))
        vision_model = ConvNextVisionModel(loaded_config)
        # Replace the freshly-initialized backbone with the pretrained one.
        vision_model.backbone = loaded_backbone
        return vision_model

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: bool = True,
        patch_attention_mask: Optional[torch.Tensor] = None,
        return_pooled: bool = True,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """Encode pixel values and flatten the feature map to a token sequence.

        Args:
            pixel_values (torch.FloatTensor): Tensor shaped (batch, channels, height, width).
            output_attentions (Optional[bool]): Accepted for API compatibility; unused.
            output_hidden_states (Optional[bool]): Whether to return staged hidden states.
            return_dict (bool): Whether to return `BaseModelOutputWithPooling`.
            patch_attention_mask (Optional[torch.Tensor]): Accepted for API compatibility; unused.
            return_pooled (bool): Whether to request pooled output from the backbone.

        Returns:
            Union[Tuple, BaseModelOutputWithPooling]: Vision outputs with
            last_hidden_state in (batch, h*w, channels) sequence format.
        """
        # Only request pooled output when the caller needs it.
        backbone_out = cast(
            BaseModelOutputWithPooling,
            self.backbone(
                pixel_values,
                output_hidden_states=output_hidden_states,
                return_dict=True,
                return_pooled=return_pooled,
            ),
        )
        # (b, c, h, w) -> (b, h*w, c): sequence-of-patches layout.
        seq_hidden = rearrange(
            backbone_out.last_hidden_state, "b c h w -> b (h w) c"
        )
        pooled = backbone_out.pooler_output if return_pooled else None
        if return_dict:
            return BaseModelOutputWithPooling(
                last_hidden_state=seq_hidden,
                pooler_output=pooled,
                hidden_states=(
                    backbone_out.hidden_states if output_hidden_states else None
                ),
            )
        # Tuple form: sequence features, then pooled and/or hidden states
        # in that order, each only when requested.
        pieces = [seq_hidden]
        if return_pooled:
            pieces.append(pooled)
        if output_hidden_states:
            pieces.append(backbone_out.hidden_states)
        return tuple(pieces)
# ---- Yasa language model utilities (inlined) ----
@use_kernel_forward_from_hub("RMSNorm")
class YasaRMSNorm(nn.Module):
    """RMS normalization, equivalent to T5LayerNorm: scale by the inverse
    root-mean-square over the last dimension, then apply a learned gain."""

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # Compute statistics in fp32, then cast back to the input dtype.
        orig_dtype = hidden_states.dtype
        states32 = hidden_states.to(torch.float32)
        mean_square = states32.pow(2).mean(-1, keepdim=True)
        states32 = states32 * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * states32.to(orig_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
class YasaRotaryEmbedding(nn.Module):
    """Rotary position embedding (RoPE) provider: produces cos/sin tables for
    the requested positions."""
    inv_freq: torch.Tensor  # fix linting for `register_buffer`
    def __init__(self, config: YasaConfig, device=None):
        """Initialize RoPE inverse frequencies from the model config.

        Args:
            config (YasaConfig): Config supplying `max_position_embeddings` and
                an optional `rope_scaling` dict selecting the RoPE variant.
            device: Optional device for the inverse-frequency buffer.
        """
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and isinstance(
            config.rope_scaling, dict
        ):
            self.rope_type = config.rope_scaling.get(
                "rope_type", config.rope_scaling.get("type")
            )
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings
        self.config = config
        # The init function returns per-dim inverse frequencies plus a scaling
        # factor later multiplied into cos/sin.
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
        inv_freq, self.attention_scaling = self.rope_init_fn(
            self.config, device
        )
        # Non-persistent: recomputed from config rather than checkpointed.
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Kept so dynamic RoPE updates can restore the initial frequencies.
        self.original_inv_freq = self.inv_freq
    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        """Compute cos/sin tables for the given positions.

        Args:
            x (torch.Tensor): Tensor supplying the target device and dtype.
            position_ids (torch.Tensor): Position indices shaped (batch, seq_len).

        Returns:
            tuple[torch.Tensor, torch.Tensor]: (cos, sin), each scaled by
            `attention_scaling` and cast to `x.dtype`.
        """
        inv_freq_expanded = (
            self.inv_freq[None, :, None]
            .float()
            .expand(position_ids.shape[0], -1, 1)
            .to(x.device)
        )
        position_ids_expanded = position_ids[:, None, :].float()
        # NOTE(review): falls back to "cpu" as the autocast device for "mps",
        # presumably because autocast control is unavailable there — confirm.
        device_type = (
            x.device.type
            if isinstance(x.device.type, str) and x.device.type != "mps"
            else "cpu"
        )
        with torch.autocast(
            device_type=device_type, enabled=False
        ):  # Force float32
            freqs = (
                inv_freq_expanded.float() @ position_ids_expanded.float()
            ).transpose(1, 2)
            # Duplicate the frequencies so cos/sin span the full rotary dim.
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling
        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
def rotate_half(x):
    """Rotate half the hidden dims: (x1, x2) -> (-x2, x1), split at the midpoint."""
    half = x.shape[-1] // 2
    first, second = x[..., :half], x[..., half:]
    return torch.cat((-second, first), dim=-1)
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Apply Rotary Position Embedding to query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): Cosine part of the rotary embedding.
        sin (`torch.Tensor`): Sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*): Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1): Dimension along which
            cos and sin are unsqueezed so they broadcast against q and k. Use 1
            for (batch, heads, seq_len, head_dim) layouts and 2 for
            (batch, seq_len, heads, head_dim) layouts.

    Returns:
        `tuple(torch.Tensor)`: The query and key tensors rotated with RoPE.
    """
    cos_b = cos.unsqueeze(unsqueeze_dim)
    sin_b = sin.unsqueeze(unsqueeze_dim)
    rotated_q = q * cos_b + rotate_half(q) * sin_b
    rotated_k = k * cos_b + rotate_half(k) * sin_b
    return rotated_q, rotated_k
class YasaMLP(nn.Module):
    """Gated MLP: down_proj(act(gate_proj(x)) * up_proj(x))."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        use_bias = config.mlp_bias
        self.gate_proj = nn.Linear(
            self.hidden_size, self.intermediate_size, bias=use_bias
        )
        self.up_proj = nn.Linear(
            self.hidden_size, self.intermediate_size, bias=use_bias
        )
        self.down_proj = nn.Linear(
            self.intermediate_size, self.hidden_size, bias=use_bias
        )
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        # The activated gate modulates the up projection element-wise.
        gated = self.act_fn(self.gate_proj(x)) * self.up_proj(x)
        return self.down_proj(gated)
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat each key/value head `n_rep` times along the head dimension.

    Equivalent to `torch.repeat_interleave(x, dim=1, repeats=n_rep)`: the
    hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to
    (batch, num_key_value_heads * n_rep, seqlen, head_dim).
    """
    batch, kv_heads, seq_len, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis, broadcast it, and fold it into the head axis.
    expanded = hidden_states[:, :, None, :, :].expand(
        batch, kv_heads, n_rep, seq_len, head_dim
    )
    return expanded.reshape(batch, kv_heads * n_rep, seq_len, head_dim)
def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    """Reference pure-PyTorch scaled-dot-product attention with GQA head repetition.

    Returns a tuple of (attention output transposed to (batch, seq, heads, head_dim),
    attention probabilities).
    """
    # Broadcast KV heads up to the query head count for grouped-query attention.
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    scores = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        # The mask may cover more positions than the current key length; crop it.
        scores = scores + attention_mask[:, :, :, : key_states.shape[-2]]

    # Softmax in fp32 for numerical stability, then cast back to the query dtype.
    probs = nn.functional.softmax(scores, dim=-1, dtype=torch.float32).to(query.dtype)
    probs = nn.functional.dropout(probs, p=dropout, training=module.training)

    context = torch.matmul(probs, value_states)
    context = context.transpose(1, 2).contiguous()
    return context, probs
class YasaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: YasaConfig, layer_idx: int):
        super().__init__()
        self.config = config
        # Layer index lets the shared Cache object route K/V updates to this layer.
        self.layer_idx = layer_idx
        # Fall back to hidden_size // num_heads when the config has no explicit head_dim.
        self.head_dim = getattr(
            config,
            "head_dim",
            config.hidden_size // config.num_attention_heads,
        )
        # Number of query heads sharing each key/value head (grouped-query attention).
        self.num_key_value_groups = (
            config.num_attention_heads // config.num_key_value_heads
        )
        # 1/sqrt(head_dim) score scaling.
        self.scaling = self.head_dim**-0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = True
        self.q_proj = nn.Linear(
            config.hidden_size,
            config.num_attention_heads * self.head_dim,
            bias=config.attention_bias,
        )
        # K/V projections are smaller than Q when num_key_value_heads < num_attention_heads.
        self.k_proj = nn.Linear(
            config.hidden_size,
            config.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.v_proj = nn.Linear(
            config.hidden_size,
            config.num_key_value_heads * self.head_dim,
            bias=config.attention_bias,
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim,
            config.hidden_size,
            bias=config.attention_bias,
        )

    @deprecate_kwarg(
        "past_key_value", new_name="past_key_values", version="4.58"
    )
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """Project to Q/K/V, apply RoPE, optionally update the KV cache, and attend.

        Args:
            hidden_states: Input of shape (batch, seq_len, hidden_size).
            position_embeddings: Precomputed rotary (cos, sin) tensors.
            attention_mask: Additive mask forwarded to the attention kernel.
            past_key_values: Cache updated in place with this layer's K/V.
            cache_position: Absolute token positions (needed by static caches).

        Returns:
            Tuple of (attention output, attention weights).
        """
        input_shape = hidden_states.shape[:-1]
        # (batch, seq, -1, head_dim): let view() infer the per-projection head count.
        hidden_shape = (*input_shape, -1, self.head_dim)
        query_states = (
            self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        )
        key_states = (
            self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        )
        value_states = (
            self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        )
        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(
            query_states, key_states, cos, sin
        )
        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {
                "sin": sin,
                "cos": cos,
                "cache_position": cache_position,
            }
            key_states, value_states = past_key_values.update(
                key_states, value_states, self.layer_idx, cache_kwargs
            )
        # Dispatch to the configured backend (sdpa/flash/...); eager is the default.
        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[
                self.config._attn_implementation
            ]
        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )
        # Merge heads back into the hidden dimension before the output projection.
        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
class YasaDecoderLayer(GradientCheckpointingLayer):
    """Pre-norm transformer decoder block: self-attention then MLP, each with a residual."""

    def __init__(self, config: YasaConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.self_attn = YasaAttention(config=config, layer_idx=layer_idx)
        self.mlp = YasaMLP(config)
        self.input_layernorm = YasaRMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )
        self.post_attention_layernorm = YasaRMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    @deprecate_kwarg(
        "past_key_value", new_name="past_key_values", version="4.58"
    )
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[
            tuple[torch.Tensor, torch.Tensor]
        ] = None,  # necessary, but kept here for BC
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        """Run one decoder block and return the updated hidden states."""
        # Self-attention sub-block: pre-norm, attend, add residual.
        attn_out, _ = self.self_attn(
            hidden_states=self.input_layernorm(hidden_states),
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = hidden_states + attn_out

        # Feed-forward sub-block: pre-norm, MLP, add residual.
        mlp_out = self.mlp(self.post_attention_layernorm(hidden_states))
        return hidden_states + mlp_out
class YasaPreTrainedModel(PreTrainedModel):
    """Shared PreTrainedModel base declaring capability flags for the Yasa family."""

    config = Yasa2Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    # Never split a decoder layer across devices when sharding.
    _no_split_modules = ["YasaDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    # Attention-backend capabilities advertised to the Transformers runtime.
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True
    _supports_attention_backend = True
    # Module types whose per-layer outputs may be recorded as
    # hidden_states / attentions by the check_model_inputs wrapper.
    _can_record_outputs = {
        "hidden_states": YasaDecoderLayer,
        "attentions": YasaAttention,
    }
@auto_docstring
class YasaModel(YasaPreTrainedModel):
    """Decoder-only text transformer: token embedding -> N decoder layers -> final RMSNorm."""

    def __init__(self, config: YasaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size
        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [
                YasaDecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = YasaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        # Rotary (cos, sin) tables are computed once per forward and shared by all layers.
        self.rotary_emb = YasaRotaryEmbedding(config=config)
        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @check_model_inputs()
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        # Exactly one of input_ids / inputs_embeds must be provided (XOR of presence).
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You must specify exactly one of input_ids or inputs_embeds"
            )
        if inputs_embeds is None:
            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)
        if cache_position is None:
            # New token positions continue from the number of tokens already cached.
            past_seen_tokens = (
                past_key_values.get_seq_length()
                if past_key_values is not None
                else 0
            )
            cache_position: torch.Tensor = (
                torch.arange(
                    inputs_embeds.shape[1], device=inputs_embeds.device
                )
                + past_seen_tokens
            )
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)
        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )
        hidden_states = inputs_embeds
        # One (cos, sin) pair shared by every decoder layer.
        position_embeddings = self.rotary_emb(
            hidden_states, position_ids=position_ids
        )
        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_embeddings=position_embeddings,
                position_ids=position_ids,
                past_key_values=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )
        hidden_states = self.norm(hidden_states)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )
class Yasa2Model(YasaPreTrainedModel):
    """Pretrained base class that holds the full Yasa2 multimodal stack."""

    config_class: PretrainedConfig = Yasa2Config
    base_model_prefix: str = ""
    _checkpoint_conversion_mapping: Dict[str, str] = {}
    # ConvNext vision blocks must not be sharded either.
    _no_split_modules = ["YasaDecoderLayer", "ConvNextVisionModel"]
    config: Yasa2Config

    def __init__(
        self,
        config: Yasa2Config,
    ):
        """Initialize the full Yasa2 multimodal stack.

        Args:
            config (Yasa2Config): Configuration for the multimodal model.
        """
        super().__init__(config)
        self.vision_pooling = config.vision_pooling
        if self.vision_pooling != "adaptive_avg":
            raise ValueError(
                f"Yasa2 only supports adaptive_avg vision pooling, got {self.vision_pooling}"
            )
        # Pool the patch grid down to sqrt(num_query_tokens) x sqrt(num_query_tokens).
        self.adaptive_pooling = nn.AdaptiveAvgPool2d(
            int(config.num_query_tokens**0.5)
        )
        if not (config.num_query_tokens**0.5).is_integer():
            raise ValueError(
                f"num_query_tokens {config.num_query_tokens} must be a "
                "square number for adaptive_avg pooling"
            )
        # Set up vision backbone
        vision_config = config.vision_config
        if isinstance(vision_config, dict):
            vision_config = ConvNextConfig(**vision_config)
        self.vision_model = ConvNextVisionModel(vision_config)
        # Two-layer MLP projecting vision features into the text hidden size.
        self.language_projection = nn.Sequential(
            nn.Linear(
                config.vision_config.hidden_size,
                config.text_config.hidden_size,
            ),
            nn.GELU(),
            nn.Linear(
                config.text_config.hidden_size,
                config.text_config.hidden_size,
            ),
        )
        # Set up language model
        self.language_model = YasaModel(config.text_config)
        # Store only the raw non-learned vision positional embedding data.
        # Build device/dtype-specific tensors lazily in forward.
        self.add_vision_pos_embed = config.use_vision_pos_embed
        self._vision_pos_embed_np = get_2d_sincos_pos_embed(
            config.vision_config.hidden_size,
            image_size=50,
        )
        self._vision_pos_embed_cache: Dict[str, torch.Tensor] = {}
        self.post_init()

    def get_input_embeddings(self) -> torch.nn.Module:
        """Return the multimodal head's input embeddings.

        Returns:
            torch.nn.Module: Embedding module used by the language model.
        """
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value: torch.nn.Module) -> None:
        """Override the multimodal head's input embeddings.

        Args:
            value (torch.nn.Module): Embedding module to register.
        """
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder: YasaModel) -> None:
        """Proxy to set the multimodal model decoder.

        Args:
            decoder: Decoder to register with the multimodal model.
        """
        self.language_model = decoder

    def get_decoder(self) -> YasaModel:
        """Return the decoder component.

        Returns:
            YasaModel: Registered decoder module.
        """
        return self.language_model

    def state_dict(self, *args: Any, **kwargs: Any) -> Dict[str, torch.Tensor]:
        """Return a filtered state dict that omits derived or non-persistent buffers.

        Args:
            *args: Positional arguments forwarded to the superclass.
            **kwargs: Keyword arguments forwarded to the superclass.

        Returns:
            Dict[str, torch.Tensor]: Filtered parameter mapping.
        """
        state_dict = super().state_dict(*args, **kwargs)
        # Iterate over a snapshot of the keys since we pop while iterating.
        for key in list(state_dict.keys()):
            # masked_bias is a constant non-persistent attention buffer (-1e9).
            if "attention.masked_bias" in key:
                state_dict.pop(key, None)
                continue
            # rotary_emb.inv_freq is derived from rotary dims/base and rebuilt at init.
            if "rotary_emb.inv_freq" in key:
                state_dict.pop(key, None)
        return state_dict

    def _encode_vision_adaptive_2d_avg_pooling(
        self,
        pixel_values: torch.Tensor,
        patch_attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Encode vision inputs via the ConvNeXt backbone and adaptive avg pooling.

        Args:
            pixel_values (torch.Tensor): Vision input tensor.
            patch_attention_mask (Optional[torch.Tensor]): Optional patch mask.

        Returns:
            torch.Tensor: Vision embeddings projected into text hidden size.
        """
        # Vision prefill only needs patch tokens; skip pooled output.
        image_embeds = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=False,
            patch_attention_mask=patch_attention_mask,
            return_pooled=False,
        )[0]
        img_num, seq_length, vision_hidden_size = image_embeds.size()
        # Assumes the patch grid is square (seq_length is a perfect square) —
        # TODO confirm against the ConvNext backbone output.
        height, width = int(seq_length**0.5), int(seq_length**0.5)
        if self.add_vision_pos_embed:
            vision_pos_embed = self._get_vision_pos_embed(
                device=image_embeds.device,
                dtype=image_embeds.dtype,
                seq_len=image_embeds.size(1),
            )
            image_embeds = image_embeds + vision_pos_embed
        # (img, seq, hidden) -> (img, hidden, height, width) for 2D pooling.
        image_embeds = image_embeds.permute(0, 2, 1).contiguous()
        image_embeds = image_embeds.reshape(
            img_num, vision_hidden_size, height, width
        )
        if (
            self.config.apply_patch_attention_mask
            and patch_attention_mask is not None
            and patch_attention_mask.numel() > 0
        ):
            # Zero out padded patches so they don't contribute to the pooled average.
            patch_attention_mask = patch_attention_mask.reshape(
                img_num, height, width
            )
            image_embeds = image_embeds * patch_attention_mask.unsqueeze(1).to(
                dtype=image_embeds.dtype
            )
        # Force pooling in fp32 with autocast disabled; bf16 pooling can produce NaNs.
        pooled_dtype = image_embeds.dtype
        with torch.autocast(device_type="cuda", enabled=False):
            image_embeds = torch.nn.functional.adaptive_avg_pool2d(
                image_embeds.float(), self.adaptive_pooling.output_size
            )
        image_embeds = image_embeds.to(dtype=pooled_dtype)
        # Back to (img, num_query_tokens, hidden) before the projection MLP.
        image_embeds = image_embeds.flatten(2)
        image_embeds = image_embeds.permute(0, 2, 1).contiguous()
        vision_embeds = self.language_projection(image_embeds)
        return vision_embeds

    def _get_vision_pos_embed(
        self,
        device: torch.device,
        dtype: torch.dtype,
        seq_len: int,
    ) -> torch.Tensor:
        """Return cached/runtime-built vision positional embeddings."""
        # One cached tensor per (device, dtype) pair; built lazily on first use.
        cache_key = f"{device}:{dtype}"
        cached = self._vision_pos_embed_cache.get(cache_key)
        if cached is None:
            cached = (
                torch.from_numpy(self._vision_pos_embed_np)
                .view(-1, self.config.vision_config.hidden_size)
                .to(device=device, dtype=dtype)
                .unsqueeze(0)
            )
            self._vision_pos_embed_cache[cache_key] = cached
        return cached[:, :seq_len, :]

    def get_image_features(
        self, pixel_values: torch.Tensor, **kwargs: Any
    ) -> torch.Tensor:
        """Return vision features for vLLM compatibility."""
        patch_attention_mask = kwargs.get("patch_attention_mask")
        return self._encode_vision_adaptive_2d_avg_pooling(
            pixel_values, patch_attention_mask=patch_attention_mask
        )

    @classmethod
    def scatter_embeddings_to_target_special_id(
        cls,
        target_tensor: torch.Tensor,
        target_input_ids: torch.Tensor,
        src_embeddings: torch.Tensor,
        special_token_id: int,
    ) -> torch.Tensor:
        """Scatter vision embeddings into the language embedding buffer at special tokens.

        Note: the scatter writes through a flattened view, so `target_tensor`
        is modified in place as well as returned.

        Args:
            target_tensor (torch.Tensor): Target embedding buffer to update.
            target_input_ids (torch.Tensor): Input IDs aligned with the target tensor.
            src_embeddings (torch.Tensor): Source embeddings to scatter from vision outputs.
            special_token_id (int): Token ID used to locate insertion positions.

        Returns:
            torch.Tensor: Updated target tensor with vision embeddings placed at special IDs.
        """
        b_source, n_source, d_embedding = src_embeddings.shape
        b_target, n_target, d_target = target_tensor.shape
        if b_target != target_input_ids.size(0):
            raise ValueError(
                "Batch size mismatch: target_input_ids "
                f"{target_input_ids.size(0)} vs target_tensor {b_target}"
            )
        if n_target != target_input_ids.size(1):
            raise ValueError(
                "Sequence length mismatch: target_input_ids "
                f"{target_input_ids.size(1)} vs target_tensor {n_target}"
            )
        if d_embedding != d_target:
            raise ValueError(
                "Embedding dimension mismatch: src_embeddings "
                f"{d_embedding} vs target_tensor {d_target}"
            )
        # Flat positions of every image-placeholder token across the batch.
        special_token_mask = target_input_ids.view(-1) == special_token_id
        special_token_indices = torch.nonzero(special_token_mask).squeeze(-1)
        if len(special_token_indices) != b_source * n_source:
            raise ValueError(
                "Special token count mismatch: found "
                f"{len(special_token_indices)}, expected {b_source * n_source}"
            )
        target_tensor = target_tensor.view(-1, d_embedding)
        src_embeddings = src_embeddings.view(-1, d_embedding)
        target_tensor[special_token_indices] = src_embeddings
        target_tensor = target_tensor.view(b_target, n_target, d_embedding)
        return target_tensor

    def _interleave_scatter(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        inputs_embeds: torch.Tensor,
        vision_embeds: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Scatter vision embeddings into language embeddings at the image token positions.

        Args:
            input_ids (torch.Tensor): Token IDs containing image placeholders.
            attention_mask (torch.Tensor): Attention mask for text tokens.
            inputs_embeds (torch.Tensor): Language model input embeddings.
            vision_embeds (torch.Tensor): Vision embeddings to be inserted.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Updated inputs_embeds and attention_mask.
        """
        inputs_embeds = Yasa2Model.scatter_embeddings_to_target_special_id(
            target_tensor=inputs_embeds,
            target_input_ids=input_ids,
            src_embeddings=vision_embeds,
            special_token_id=self.config.image_token_id,
        )
        # attention_mask is returned unchanged; placeholders are already attended to.
        return inputs_embeds, attention_mask

    @can_return_tuple
    def forward(
        self,
        input_ids: Optional[torch.LongTensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[
            Union[Cache, Tuple[Tuple[torch.FloatTensor]]]
        ] = None,
        cache_position: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        pixel_values: Optional[torch.Tensor] = None,
        patch_attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        mm_token_type_ids: Optional[torch.Tensor] = None,
        **kwargs: Any,
    ) -> Union[Tuple[torch.Tensor, ...], "Yasa2ModelOutputWithPast"]:
        """Forward pass combining language and vision inputs for Yasa2.

        Args:
            input_ids (Optional[torch.LongTensor]): Token IDs for the language model.
            attention_mask (Optional[torch.Tensor]): Attention mask aligned with `input_ids`.
            position_ids (Optional[torch.LongTensor]): Position indices feeding the language model.
            inputs_embeds (Optional[torch.FloatTensor]): Precomputed token embeddings.
            past_key_values (Optional[Union[Cache, Tuple[Tuple[torch.FloatTensor]]]]): Cached decoder key/value tensors.
            cache_position (Optional[torch.LongTensor]): Positions used for cache alignment.
            use_cache (Optional[bool]): Whether to request cached key/values.
            output_attentions (Optional[bool]): Whether to return attention weights.
            output_hidden_states (Optional[bool]): Whether to return hidden states for each layer.
            return_dict (Optional[bool]): Whether to return a `ModelOutput`.
            pixel_values (Optional[torch.Tensor]): Vision inputs providing image context.
            patch_attention_mask (Optional[torch.Tensor]): Optional patch mask for vision tokens.
            token_type_ids (Optional[torch.Tensor]): Unused token type ids for compatibility.
            mm_token_type_ids (Optional[torch.Tensor]): Unused multimodal token type ids.

        Returns:
            Union[Tuple[torch.Tensor, ...], Yasa2ModelOutputWithPast]: Combined multimodal outputs.
        """
        return_dict = (
            return_dict
            if return_dict is not None
            else self.config.use_return_dict
        )
        use_cache = (
            use_cache if use_cache is not None else self.config.use_cache
        )
        if input_ids is None and inputs_embeds is None:
            raise ValueError(
                "You must provide either input_ids or inputs_embeds."
            )
        if inputs_embeds is not None and pixel_values is not None:
            raise ValueError(
                "pixel_values cannot be used when inputs_embeds is provided."
            )
        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(
                input_ids
            )
        if attention_mask is None:
            # Derive a mask from padding only when the batch actually contains pad tokens.
            pad_token_id = self.config.text_config.pad_token_id
            if input_ids is not None and pad_token_id is not None:
                if (input_ids == pad_token_id).any():
                    attention_mask = input_ids.ne(pad_token_id)
        if attention_mask is not None:
            # An empty mask tensor is treated the same as no mask.
            if attention_mask.numel() == 0:
                attention_mask = None
        if cache_position is not None:
            expected_len = inputs_embeds.shape[1]
            if cache_position.shape[-1] != expected_len:
                raise ValueError(
                    "cache_position length must match input sequence length: "
                    f"{cache_position.shape[-1]} vs {expected_len}"
                )
        vision_embeds = None
        if pixel_values is not None and len(pixel_values) > 0:
            if input_ids is None:
                raise ValueError(
                    "input_ids is required when pixel_values is provided."
                )
            # Encode images and splice the result in at image-placeholder tokens.
            vision_embeds = self._encode_vision_adaptive_2d_avg_pooling(
                pixel_values,
                patch_attention_mask=patch_attention_mask,
            )
            inputs_embeds, attention_mask = self._interleave_scatter(
                input_ids,
                attention_mask,
                inputs_embeds,
                vision_embeds,
            )
        outputs = self.language_model(
            input_ids=None,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            head_mask=None,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            return_dict=True,
            **kwargs,
        )
        return Yasa2ModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            vision_hidden_states=vision_embeds,
        )
class Yasa2ForConditionalGeneration(YasaPreTrainedModel, GenerationMixin):
    """Yasa2 multimodal conditional generation model (vision + text)."""

    config_class = Yasa2Config
    _checkpoint_conversion_mapping = {}
    _tied_weights_keys = []  # Weights are not tied
    config: Yasa2Config

    def __init__(self, config: Yasa2Config):
        """Initialize the Yasa2 conditional generation model.

        Args:
            config: Yasa2 configuration object.
        """
        super().__init__(config)
        self.model = Yasa2Model(config)
        # NOTE(review): reads hidden_size/vocab_size off the top-level Yasa2Config —
        # confirm these mirror config.text_config for this model family.
        self.lm_head = nn.Linear(
            config.hidden_size, config.vocab_size, bias=False
        )
        self.vocab_size = config.vocab_size
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> torch.nn.Module:
        """Return the multimodal head's input embeddings.

        Returns:
            torch.nn.Module: Embedding module used by the language model.
        """
        return self.model.language_model.get_input_embeddings()

    def set_input_embeddings(self, value: torch.nn.Module) -> None:
        """Override the multimodal head's input embeddings.

        Args:
            value (torch.nn.Module): Embedding module to register.
        """
        self.model.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        """Proxy to set the multimodal model decoder.

        Args:
            decoder: Decoder to register with the multimodal model.
        """
        self.model.set_decoder(decoder)

    def get_decoder(self):
        """Proxy to return the multimodal decoder."""
        return self.model.get_decoder()

    # Make modules available through conditional class for BC
    @property
    def language_model(self) -> torch.nn.Module:
        """Expose the language model component.

        Returns:
            torch.nn.Module: Language model module.
        """
        return self.model.language_model

    @property
    def vision_backbone(self) -> torch.nn.Module:
        """Expose the vision encoder backbone.

        Returns:
            torch.nn.Module: Vision backbone module.
        """
        return self.model.vision_model

    @can_return_tuple
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[
            Union[Cache, Tuple[Tuple[torch.FloatTensor]]]
        ] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.Tensor] = None,
        patch_attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        mm_token_type_ids: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        **kwargs: Any,
    ) -> Union[
        Tuple[torch.Tensor, ...], "Yasa2ForConditionalGenerationModelOutput"
    ]:
        """Run the multimodal model, project outputs to logits, and compute loss if needed.

        Args:
            input_ids (Optional[torch.LongTensor]): Language token IDs.
            attention_mask (Optional[torch.Tensor]): Attention mask for language tokens.
            position_ids (Optional[torch.LongTensor]): Position indices.
            past_key_values (Optional[Union[Cache, Tuple[Tuple[torch.FloatTensor]]]]): Cached decoder states.
            inputs_embeds (Optional[torch.FloatTensor]): Input embeddings instead of token IDs.
            use_cache (Optional[bool]): Whether to cache key/value pairs.
            output_attentions (Optional[bool]): Whether to return attention weights.
            output_hidden_states (Optional[bool]): Whether to return hidden states.
            cache_position (Optional[torch.LongTensor]): Positions used for caching.
            pixel_values (Optional[torch.Tensor]): Vision inputs.
            patch_attention_mask (Optional[torch.Tensor]): Optional mask for vision patches.
            token_type_ids (Optional[torch.Tensor]): Unused token type ids for compatibility.
            mm_token_type_ids (Optional[torch.Tensor]): Unused multimodal token type ids.
            labels (Optional[torch.LongTensor]): Labels for computing cross-entropy loss.
            return_dict (Optional[bool]): Whether to return a dict-like output.

        Returns:
            Union[Tuple[torch.Tensor, ...], Yasa2ForConditionalGenerationModelOutput]: Model logits, caches, and optional loss.
        """
        return_dict = (
            return_dict
            if return_dict is not None
            else self.config.use_return_dict
        )
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            pixel_values=pixel_values,
            patch_attention_mask=patch_attention_mask,
            return_dict=True,
            **kwargs,
        )
        hidden_states = outputs.last_hidden_state
        logits = self.lm_head(hidden_states)
        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            # Standard causal LM shift: logits at position t predict the token at t+1.
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:]
            loss_fct = nn.CrossEntropyLoss(
                ignore_index=self.config.label_ignore_index
            )
            loss = loss_fct(
                shift_logits.reshape(-1, shift_logits.size(-1)),
                shift_labels.reshape(-1),
            )
        return Yasa2ForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            vision_hidden_states=outputs.vision_hidden_states,
            language_model_outputs=outputs,
        )

    def generate(
        self,
        input_ids: Optional[torch.LongTensor],
        attention_mask: Optional[torch.Tensor] = None,
        pixel_values: Optional[torch.Tensor] = None,
        patch_attention_mask: Optional[torch.Tensor] = None,
        **generate_kwargs,
    ) -> torch.LongTensor:
        """Generate text tokens conditioned on vision and/or language inputs.

        Args:
            input_ids (Optional[torch.LongTensor]): Seed language tokens.
            attention_mask (Optional[torch.Tensor]): Language attention mask.
            pixel_values (Optional[torch.Tensor]): Vision inputs appended to prompts.
            patch_attention_mask (Optional[torch.Tensor]): Mask for vision patches.
            **generate_kwargs: Additional generation options forwarded to the `super().generate`.

        Returns:
            torch.LongTensor: Generated token IDs.
        """
        # Thin wrapper: only exists to surface the vision kwargs in the signature.
        return super().generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            patch_attention_mask=patch_attention_mask,
            **generate_kwargs,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[
            Union[Cache, Tuple[Tuple[torch.FloatTensor]]]
        ] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.Tensor] = None,
        patch_attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Any,
    ) -> Dict[str, Any]:
        """Prepare multimodal inputs for generation bookkeeping.

        Args:
            input_ids (torch.LongTensor): Current token IDs for generation.
            past_key_values (Optional[Union[Cache, Tuple[Tuple[torch.FloatTensor]]]]): Cached past key/value tensors.
            inputs_embeds (Optional[torch.FloatTensor]): Optional token embeddings.
            attention_mask (Optional[torch.Tensor]): Language attention mask.
            cache_position (Optional[torch.LongTensor]): Cache alignment positions.
            pixel_values (Optional[torch.Tensor]): Vision inputs that should be reused.
            patch_attention_mask (Optional[torch.Tensor]): Vision patch mask for the prefill step.
            **kwargs: Additional arguments forwarded to the base implementation.

        Returns:
            Dict[str, Any]: Prepared inputs for the next generation step.
        """
        model_inputs = super().prepare_inputs_for_generation(
            input_ids=input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            **kwargs,
        )
        # Vision inputs are consumed only on the prefill step; decode steps
        # reuse the KV cache and must not re-encode the images.
        is_prefill = past_key_values is None or (
            cache_position is not None and cache_position[0] == 0
        )
        if is_prefill:
            model_inputs["pixel_values"] = pixel_values
            model_inputs["patch_attention_mask"] = patch_attention_mask
        return model_inputs
# Register the model so AutoModelForImageTextToText can resolve it from this
# repository's custom code (trust_remote_code loading).
Yasa2ForConditionalGeneration.register_for_auto_class(
    "AutoModelForImageTextToText"
)