Buckets:

leideng
/

QCFuse

Files

xet

leideng/QCFuse / srt /models /idefics2.py

leideng

17 days ago

download

raw

12.1 kB

	# Copyright 2023 The SGLang team.
	# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
	#
	# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
	# and OPT implementations in this library. It has been modified from its
	# original forms to accommodate minor architectural differences compared
	# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from typing import Optional

	import torch
	from torch import nn
	from transformers import PretrainedConfig

	from sglang.srt.layers.activation import get_act_fn
	from sglang.srt.layers.attention.vision import VisionAttention
	from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
	from sglang.srt.layers.quantization.base_config import QuantizationConfig
	from sglang.srt.utils import add_prefix


	class Idefics2VisionMLP(nn.Module):

	def __init__(
	self,
	config: PretrainedConfig,
	quant_config: Optional[QuantizationConfig] = None,
	prefix: str = "",
	) -> None:
	super().__init__()
	self.config = config
	self.activation_fn = get_act_fn(config.hidden_act)
	self.fc1 = ColumnParallelLinear(
	config.hidden_size,
	config.intermediate_size,
	bias=True,
	quant_config=quant_config,
	prefix=add_prefix("fc1", prefix),
	)
	self.fc2 = RowParallelLinear(
	config.intermediate_size,
	config.hidden_size,
	bias=True,
	quant_config=quant_config,
	prefix=add_prefix("fc2", prefix),
	)

	def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
	hidden_states, _ = self.fc1(hidden_states)
	hidden_states = self.activation_fn(hidden_states)
	hidden_states, _ = self.fc2(hidden_states)
	return hidden_states


	class Idefics2EncoderLayer(nn.Module):

	def __init__(
	self,
	config: PretrainedConfig,
	quant_config: Optional[QuantizationConfig] = None,
	prefix: str = "",
	) -> None:
	super().__init__()
	self.embed_dim = config.hidden_size
	self.num_heads = config.num_attention_heads
	self.self_attn = VisionAttention(
	embed_dim=config.hidden_size,
	num_heads=self.num_heads,
	projection_size=config.intermediate_size,
	use_qkv_parallel=True,
	quant_config=quant_config,
	dropout=config.attention_dropout,
	qkv_backend="sdpa",
	softmax_in_single_precision=True,
	flatten_batch=False,
	prefix=add_prefix("self_attn", prefix),
	)
	self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
	self.mlp = Idefics2VisionMLP(
	config,
	quant_config=quant_config,
	prefix=add_prefix("mlp", prefix),
	)
	self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

	def forward(
	self,
	hidden_states: torch.Tensor,
	cu_seqlens: torch.Tensor,
	) -> torch.Tensor:
	"""
	Args:
	hidden_states (`torch.FloatTensor`):
	Input to the layer of shape `(batch, seq_len, embed_dim)`.

	"""
	residual = hidden_states
	hidden_states = self.layer_norm1(hidden_states)
	hidden_states = self.self_attn(hidden_states, cu_seqlens=cu_seqlens)

	hidden_states = residual + hidden_states
	residual = hidden_states
	hidden_states = self.layer_norm2(hidden_states)
	hidden_states = self.mlp(hidden_states)
	hidden_states = residual + hidden_states
	return hidden_states


	class Idefics2Encoder(nn.Module):
	"""
	Transformer encoder consisting of `config.num_hidden_layers` self attention
	layers. Each layer is a
	[`Idefics2EncoderLayer`].

	Args:
	config: Idefics2Config
	"""

	def __init__(
	self,
	config: PretrainedConfig,
	quant_config: Optional[QuantizationConfig] = None,
	prefix: str = "",
	) -> None:
	super().__init__()

	self.config = config
	self.layers = nn.ModuleList(
	[
	Idefics2EncoderLayer(
	config,
	quant_config=quant_config,
	prefix=add_prefix(f"layers.{i}", prefix),
	)
	for i in range(config.num_hidden_layers)
	]
	)

	def forward(
	self,
	inputs_embeds: torch.Tensor,
	cu_seqlens: torch.Tensor,
	) -> torch.Tensor:
	r"""
	Args:
	inputs_embeds (torch.Tensor):
	Optionally, instead of passing `input_ids` you can choose to
	directly pass an embedded representation.
	This is useful if you want more control over how to convert
	`input_ids` indices into associated vectorsthan the model's
	internal embedding lookup matrix.
	"""
	hidden_states = inputs_embeds
	for encoder_layer in self.layers:
	layer_outputs = encoder_layer(
	hidden_states,
	cu_seqlens=cu_seqlens,
	)
	hidden_states = layer_outputs
	return hidden_states


	class Idefics2VisionEmbeddings(nn.Module):
	"""
	This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings
	` to enable images of variable
	resolution.

	The modifications are adapted from [Patch n' Pack: NaViT, a Vision
	Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
	which allows treating images in their native aspect ratio and without the
	need to resize them to the same fixed size. In particular, we start from the
	original pre-trained SigLIP model(which uses images of fixed-size square
	images) and adapt it by training on images of variable resolutions.
	"""

	def __init__(self, config: PretrainedConfig):
	super().__init__()
	self.embed_dim = config.hidden_size
	self.image_size = config.image_size
	self.patch_size = config.patch_size
	self.patch_embedding = nn.Conv2d(
	in_channels=config.num_channels,
	out_channels=self.embed_dim,
	kernel_size=self.patch_size,
	stride=self.patch_size,
	padding="valid",
	)
	self.num_patches_per_side = self.image_size // self.patch_size
	self.num_patches = self.num_patches_per_side**2
	self.num_positions = self.num_patches
	self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)

	def get_position_ids(
	self,
	pixel_values: torch.FloatTensor,
	patch_attention_mask: torch.BoolTensor,
	tgt_sizes: Optional[torch.IntTensor] = None,
	):
	batch_size, _, max_im_h, max_im_w = pixel_values.shape

	max_nb_patches_h, max_nb_patches_w = (
	max_im_h // self.patch_size,
	max_im_w // self.patch_size,
	)
	boundaries = torch.arange(
	1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side
	)
	position_ids = torch.full(
	size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0
	)

	for batch_idx, p_attn_mask in enumerate(patch_attention_mask):

	if tgt_sizes is not None:
	nb_patches_h = tgt_sizes[batch_idx][0]
	nb_patches_w = tgt_sizes[batch_idx][1]
	else:
	nb_patches_h = p_attn_mask[:, 0].sum()
	nb_patches_w = p_attn_mask[0].sum()
	fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
	fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
	bucket_coords_h = torch.bucketize(
	fractional_coords_h, boundaries, right=True
	)
	bucket_coords_w = torch.bucketize(
	fractional_coords_w, boundaries, right=True
	)
	pos_ids = (
	bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w
	).flatten()
	position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
	position_ids = position_ids.to(self.position_embedding.weight.device)
	return position_ids

	def forward(
	self,
	pixel_values: torch.FloatTensor,
	patch_attention_mask: torch.BoolTensor,
	tgt_sizes: Optional[torch.IntTensor] = None,
	) -> torch.Tensor:
	target_dtype = self.patch_embedding.weight.dtype
	pixel_values = pixel_values.to(
	device=self.patch_embedding.weight.device, dtype=target_dtype
	)
	patch_embeds = self.patch_embedding(pixel_values)
	embeddings = patch_embeds.flatten(2).transpose(1, 2)
	position_ids = self.get_position_ids(
	pixel_values, patch_attention_mask, tgt_sizes
	)

	embeddings = embeddings + self.position_embedding(position_ids)
	return embeddings


	class Idefics2VisionTransformer(nn.Module):

	def __init__(
	self,
	config: PretrainedConfig,
	quant_config: Optional[QuantizationConfig] = None,
	require_post_norm: bool = True,
	prefix: str = "",
	) -> None:
	super().__init__()

	embed_dim = config.hidden_size
	self.config = config
	self.embeddings = Idefics2VisionEmbeddings(config)
	self.encoder = Idefics2Encoder(
	config=config,
	quant_config=quant_config,
	prefix=add_prefix("encoder", prefix),
	)
	self.post_layernorm = (
	nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
	if require_post_norm
	else nn.Identity()
	)

	def get_input_embeddings(self) -> nn.Embedding:
	return self.embeddings

	def compute_cu_seqlens(
	self,
	tgt_sizes: Optional[torch.Tensor] = None,
	input_embeds: Optional[torch.Tensor] = None,
	) -> torch.Tensor:
	# shape: (batch_size,)
	if tgt_sizes is not None:
	seqlen = tgt_sizes[:, 0] * tgt_sizes[:, 1]
	elif input_embeds is not None:
	seqlen = torch.full(
	size=(input_embeds.shape[0],),
	fill_value=input_embeds.shape[1],
	dtype=torch.int32,
	device=input_embeds.device,
	)
	else:
	raise ValueError(
	"Either `tgt_sizes` or `input_embeds` must be provided to compute cu_seqlens."
	)

	cu_seqlens = torch.cat(
	[
	torch.tensor([0], device=seqlen.device, dtype=torch.int32),
	torch.cumsum(seqlen, dim=0, dtype=torch.int32),
	],
	dim=0,
	).to(seqlen.device)
	return cu_seqlens

	def forward(
	self,
	pixel_values,
	patch_attention_mask: Optional[torch.BoolTensor] = None,
	tgt_sizes: Optional[torch.IntTensor] = None,
	) -> torch.Tensor:
	hidden_states = self.embeddings(
	pixel_values=pixel_values,
	patch_attention_mask=patch_attention_mask,
	tgt_sizes=tgt_sizes,
	)
	cu_seqlens = self.compute_cu_seqlens(tgt_sizes, hidden_states)
	encoder_outputs = self.encoder(
	hidden_states,
	cu_seqlens=cu_seqlens,
	)
	last_hidden_state = self.post_layernorm(encoder_outputs)
	return last_hidden_state

Xet Storage Details

Size:: 12.1 kB
Xet hash:: 7673570187b566bc25180e4859c66b972dbd2c5ad23b13c4f8c841e1523e1147

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.