update

d1941eb 28 days ago

17.1 kB

	import math
	from typing import Optional

	import torch
	import torch.nn.functional as F
	from torch import nn
	from torch.nn.attention.flex_attention import (
	BlockMask,
	create_block_mask,
	flex_attention,
	)


	class MLP(nn.Module):
	def __init__(
	self,
	in_features: int,
	hidden_features: int,
	out_features: Optional[int] = None,
	bias: bool = True,
	) -> None:
	super().__init__()
	out_features = out_features or in_features
	self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
	self.act = nn.GELU()
	self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)

	def forward(self, x: torch.Tensor) -> torch.Tensor:
	return self.fc2(self.act(self.fc1(x)))


	class SwiGLUFFN(nn.Module):
	def __init__(
	self,
	in_features: int,
	hidden_features: int,
	out_features: Optional[int] = None,
	bias: bool = True,
	) -> None:
	super().__init__()
	out_features = out_features or in_features
	self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
	self.w3 = nn.Linear(hidden_features, out_features, bias=bias)

	def forward(self, x: torch.Tensor) -> torch.Tensor:
	x1, x2 = self.w12(x).chunk(2, dim=-1)
	return self.w3(F.silu(x1) * x2)


	class PatchEmbed(nn.Module):
	"""
	Image to patch embedding.

	Input:
	(B, C, H, W)
	Output:
	(B, N, D)
	"""

	def __init__(
	self,
	img_size: int = 224,
	patch_size: int = 16,
	in_chans: int = 3,
	embed_dim: int = 768,
	) -> None:
	super().__init__()
	self.img_size = img_size
	self.patch_size = patch_size
	self.grid_size = (img_size // patch_size, img_size // patch_size)
	self.num_patches = self.grid_size[0] * self.grid_size[1]

	self.proj = nn.Conv2d(
	in_chans,
	embed_dim,
	kernel_size=patch_size,
	stride=patch_size,
	bias=True,
	)

	def forward(self, x: torch.Tensor) -> torch.Tensor:
	_, _, h, w = x.shape
	if h % self.patch_size != 0 or w % self.patch_size != 0:
	raise ValueError(
	f"Input size {(h, w)} must be divisible by patch_size={self.patch_size}."
	)

	x = self.proj(x) # (B, D, H', W')
	x = x.flatten(2).transpose(1, 2) # (B, N, D)
	return x


	class LayerScale(nn.Module):
	def __init__(self, dim: int, init_values: Optional[float]) -> None:
	super().__init__()
	if init_values is None:
	self.gamma = None
	else:
	self.gamma = nn.Parameter(torch.full((dim,), float(init_values)))

	def forward(self, x: torch.Tensor) -> torch.Tensor:
	if self.gamma is None:
	return x
	return x * self.gamma


	class Attention(nn.Module):
	"""
	Multi-head self-attention using PyTorch FlexAttention.
	"""

	def __init__(
	self,
	dim: int,
	num_heads: int = 8,
	qkv_bias: bool = True,
	proj_bias: bool = True,
	) -> None:
	super().__init__()
	if dim % num_heads != 0:
	raise ValueError(f"dim={dim} must be divisible by num_heads={num_heads}")

	self.num_heads = num_heads
	self.head_dim = dim // num_heads
	self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
	self.proj = nn.Linear(dim, dim, bias=proj_bias)

	def forward(
	self,
	x: torch.Tensor,
	block_mask: Optional[BlockMask] = None,
	) -> torch.Tensor:
	seq_len, dim = x.shape

	qkv = self.qkv(x)
	qkv = qkv.view(seq_len, 3, self.num_heads, self.head_dim)
	qkv = qkv.permute(1, 2, 0, 3).unsqueeze(1) # (3, 1, H, N, Dh)
	q, k, v = qkv.unbind(dim=0)

	x = flex_attention(
	q,
	k,
	v,
	block_mask=block_mask,
	)
	x = x.transpose(1, 2).contiguous().view(seq_len, dim)
	x = self.proj(x)
	return x


	def build_ffn(
	ffn_layer: str,
	dim: int,
	mlp_ratio: float,
	bias: bool = True,
	) -> nn.Module:
	hidden_dim = int(dim * mlp_ratio)

	if ffn_layer == "mlp":
	return MLP(
	in_features=dim,
	hidden_features=hidden_dim,
	out_features=dim,
	bias=bias,
	)
	if ffn_layer in {"swiglu", "swiglufused"}:
	return SwiGLUFFN(
	in_features=dim,
	hidden_features=hidden_dim,
	out_features=dim,
	bias=bias,
	)

	raise ValueError(f"Unsupported ffn_layer: {ffn_layer}")


	class Block(nn.Module):
	def __init__(
	self,
	dim: int,
	num_heads: int,
	mlp_ratio: float = 4.0,
	qkv_bias: bool = True,
	proj_bias: bool = True,
	ffn_bias: bool = True,
	init_values: Optional[float] = None,
	ffn_layer: str = "mlp",
	norm_eps: float = 1e-6,
	) -> None:
	super().__init__()
	self.norm1 = nn.LayerNorm(dim, eps=norm_eps)
	self.attn = Attention(
	dim=dim,
	num_heads=num_heads,
	qkv_bias=qkv_bias,
	proj_bias=proj_bias,
	)
	self.ls1 = LayerScale(dim, init_values)

	self.norm2 = nn.LayerNorm(dim, eps=norm_eps)
	self.mlp = build_ffn(
	ffn_layer=ffn_layer,
	dim=dim,
	mlp_ratio=mlp_ratio,
	bias=ffn_bias,
	)
	self.ls2 = LayerScale(dim, init_values)

	def forward(
	self,
	x: torch.Tensor,
	block_mask: Optional[BlockMask] = None,
	) -> torch.Tensor:
	x = x + self.ls1(self.attn(self.norm1(x), block_mask=block_mask))
	x = x + self.ls2(self.mlp(self.norm2(x)))
	return x


	class VisionTransformer(nn.Module):
	def __init__(
	self,
	image_size: int = 224,
	patch_size: int = 16,
	in_chans: int = 3,
	hidden_size: int = 768,
	num_layers: int = 12,
	num_heads: int = 12,
	mlp_ratio: float = 4.0,
	qkv_bias: bool = True,
	ffn_bias: bool = True,
	proj_bias: bool = True,
	init_values: Optional[float] = None,
	ffn_layer: str = "mlp",
	num_register_tokens: int = 0,
	norm_eps: float = 1e-6,
	) -> None:
	super().__init__()
	self.embed_dim = hidden_size
	self.patch_size = patch_size
	self.num_register_tokens = num_register_tokens
	self.num_tokens = 1 # cls token

	self.patch_embed = PatchEmbed(
	img_size=image_size,
	patch_size=patch_size,
	in_chans=in_chans,
	embed_dim=hidden_size,
	)
	num_patches = self.patch_embed.num_patches

	self.cls_token = nn.Parameter(torch.zeros(1, 1, hidden_size))
	self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, hidden_size))

	self.register_tokens = (
	nn.Parameter(torch.zeros(1, num_register_tokens, hidden_size))
	if num_register_tokens > 0
	else None
	)
	self.mask_token = nn.Parameter(torch.zeros(1, hidden_size))
	self.blocks = nn.ModuleList(
	[
	Block(
	dim=hidden_size,
	num_heads=num_heads,
	mlp_ratio=mlp_ratio,
	qkv_bias=qkv_bias,
	proj_bias=proj_bias,
	ffn_bias=ffn_bias,
	init_values=init_values,
	ffn_layer=ffn_layer,
	norm_eps=norm_eps,
	)
	for _ in range(num_layers)
	]
	)
	self.norm = nn.LayerNorm(hidden_size, eps=norm_eps)
	self.head = nn.Identity()

	self.reset_parameters()

	def reset_parameters(self) -> None:
	nn.init.trunc_normal_(self.pos_embed, std=0.02)
	nn.init.normal_(self.cls_token, std=1e-6)
	nn.init.normal_(self.mask_token, std=1e-6)

	if self.register_tokens is not None:
	nn.init.normal_(self.register_tokens, std=1e-6)

	self.apply(self._init_module)

	@staticmethod
	def _init_module(module: nn.Module) -> None:
	if isinstance(module, nn.Linear):
	nn.init.trunc_normal_(module.weight, std=0.02)
	if module.bias is not None:
	nn.init.zeros_(module.bias)
	elif isinstance(module, nn.Conv2d):
	nn.init.trunc_normal_(module.weight, std=0.02)
	if module.bias is not None:
	nn.init.zeros_(module.bias)
	elif isinstance(module, nn.LayerNorm):
	nn.init.ones_(module.weight)
	nn.init.zeros_(module.bias)

	def interpolate_patch_pos_encoding(
	self,
	position_ids: torch.Tensor,
	grid_sizes: torch.Tensor,
	dtype: torch.dtype,
	) -> torch.Tensor:
	"""
	Sample patch positional embeddings for packed variable-size grids.
	"""
	num_ref_tokens = self.pos_embed.shape[1] - 1

	patch_pos = self.pos_embed[:, 1:]

	ref_size = int(math.sqrt(num_ref_tokens))
	if ref_size * ref_size != num_ref_tokens:
	raise ValueError("Reference positional embedding is not a square grid.")

	patch_pos_grid = patch_pos.view(1, ref_size, ref_size, self.embed_dim).permute(
	0, 3, 1, 2
	)

	position_ids = position_ids.to(device=patch_pos_grid.device)
	grid_sizes = grid_sizes.to(device=patch_pos_grid.device)

	row = position_ids[:, 0].to(dtype=torch.float32)
	col = position_ids[:, 1].to(dtype=torch.float32)
	grid_h = grid_sizes[:, 0].clamp_min(1).to(dtype=torch.float32)
	grid_w = grid_sizes[:, 1].clamp_min(1).to(dtype=torch.float32)

	y = ((row + 0.5) / grid_h) * 2.0 - 1.0
	x = ((col + 0.5) / grid_w) * 2.0 - 1.0
	sample_grid = torch.stack([x, y], dim=-1).view(1, -1, 1, 2)

	patch_pos = F.grid_sample(
	patch_pos_grid.to(dtype=torch.float32),
	sample_grid,
	mode="bicubic",
	padding_mode="border",
	align_corners=False,
	)
	return patch_pos.squeeze(0).squeeze(-1).transpose(0, 1).to(dtype=dtype)

	def prepare_packed_tokens(
	self,
	pixel_values: torch.Tensor,
	input_ids: torch.Tensor,
	position_ids: torch.Tensor,
	grid_sizes: torch.Tensor,
	document_ids: torch.Tensor,
	) -> torch.Tensor:
	if pixel_values.ndim != 4:
	raise ValueError(
	f"pixel_values must have shape (S, C, P, P), got {pixel_values.shape}"
	)
	if pixel_values.shape[-2:] != (self.patch_size, self.patch_size):
	raise ValueError(
	"packed pixel_values patches must have spatial shape "
	f"({self.patch_size}, {self.patch_size}), got {pixel_values.shape[-2:]}"
	)

	seq_len = pixel_values.shape[0]
	for name, tensor, trailing_shape in (
	("input_ids", input_ids, ()),
	("position_ids", position_ids, (2,)),
	("grid_sizes", grid_sizes, (2,)),
	("document_ids", document_ids, ()),
	):
	expected_shape = (seq_len, *trailing_shape)
	if tuple(tensor.shape) != expected_shape:
	raise ValueError(
	f"{name} must have shape {expected_shape}, got {tensor.shape}"
	)

	input_ids = input_ids.to(device=pixel_values.device)
	position_ids = position_ids.to(device=pixel_values.device)
	grid_sizes = grid_sizes.to(device=pixel_values.device)
	document_ids = document_ids.to(device=pixel_values.device)

	x = self.patch_embed(pixel_values).squeeze(1) # (S, D)
	valid_mask = document_ids >= 0

	cls_mask = (input_ids == 1) & valid_mask
	if cls_mask.any():
	cls = self.cls_token[0, 0].to(dtype=x.dtype)
	x = torch.where(cls_mask.unsqueeze(-1), cls.unsqueeze(0), x)

	register_mask = (input_ids == 2) & valid_mask

	if self.register_tokens is not None:
	register_rank = torch.cumsum(register_mask.to(torch.long), dim=0) - 1
	register_rank = register_rank.remainder(self.num_register_tokens)
	register_values = self.register_tokens[0].to(dtype=x.dtype)[register_rank]
	x = torch.where(register_mask.unsqueeze(-1), register_values, x)

	cls_pos = self.pos_embed[:, :1].to(dtype=x.dtype).squeeze(0).squeeze(0)
	x = torch.where(cls_mask.unsqueeze(-1), x + cls_pos.unsqueeze(0), x)

	patch_mask = (input_ids == 0) & valid_mask

	patch_pos = self.interpolate_patch_pos_encoding(
	position_ids=position_ids,
	grid_sizes=grid_sizes,
	dtype=x.dtype,
	)

	x = x + torch.where(
	patch_mask.unsqueeze(-1),
	patch_pos,
	torch.zeros_like(patch_pos),
	)

	return x

	@staticmethod
	def build_document_block_mask(
	document_ids: torch.Tensor,
	) -> BlockMask:
	document_ids = document_ids.contiguous()
	seq_len = document_ids.shape[0]

	def mask_mod(
	batch_idx: torch.Tensor,
	head_idx: torch.Tensor,
	query_idx: torch.Tensor,
	key_value_idx: torch.Tensor,
	) -> torch.Tensor:
	del batch_idx, head_idx
	query_doc = document_ids[query_idx]
	key_value_doc = document_ids[key_value_idx]
	return (query_doc >= 0) & (query_doc == key_value_doc)

	return create_block_mask(
	mask_mod,
	B=1,
	H=None,
	Q_LEN=seq_len,
	KV_LEN=seq_len,
	device=document_ids.device,
	)

	@torch.compiler.disable
	def forward_head(
	self,
	x_norm: torch.Tensor,
	cls_mask: torch.Tensor,
	register_mask: torch.Tensor,
	patch_mask: torch.Tensor,
	):
	cls_token = x_norm[cls_mask]
	register_tokens = x_norm[register_mask]
	patch_tokens = x_norm[patch_mask]

	return self.head(cls_token), self.head(register_tokens), patch_tokens

	def forward(
	self,
	pixel_values: torch.Tensor,
	input_ids: torch.Tensor,
	position_ids: torch.Tensor,
	grid_sizes: torch.Tensor,
	document_ids: torch.Tensor,
	block_mask: Optional[BlockMask] = None,
	) -> dict[str, torch.Tensor]:
	x = self.prepare_packed_tokens(
	pixel_values=pixel_values,
	input_ids=input_ids,
	position_ids=position_ids,
	grid_sizes=grid_sizes,
	document_ids=document_ids,
	)

	document_ids = document_ids.to(device=x.device)
	if block_mask is None:
	block_mask = self.build_document_block_mask(document_ids)
	valid_mask = document_ids >= 0

	for block in self.blocks:
	x = block(x, block_mask=block_mask)

	x_norm = self.norm(x)

	cls_mask = (input_ids.to(device=x.device) == 1) & valid_mask
	register_mask = (input_ids.to(device=x.device) == 2) & valid_mask
	patch_mask = (input_ids.to(device=x.device) == 0) & valid_mask

	cls_token, register_tokens, patch_tokens = self.forward_head(
	x_norm=x_norm,
	cls_mask=cls_mask,
	register_mask=register_mask,
	patch_mask=patch_mask,
	)
	return cls_token, register_tokens, patch_tokens, x_norm


	def vit_small(patch_size: int = 14, **kwargs) -> VisionTransformer:
	kwargs.setdefault("num_register_tokens", 1)
	return VisionTransformer(
	patch_size=patch_size,
	hidden_size=384,
	num_layers=12,
	num_heads=6,
	mlp_ratio=4.0,
	**kwargs,
	)


	def vit_base(patch_size: int = 14, **kwargs) -> VisionTransformer:
	kwargs.setdefault("num_register_tokens", 1)
	return VisionTransformer(
	patch_size=patch_size,
	hidden_size=768,
	num_layers=12,
	num_heads=12,
	mlp_ratio=4.0,
	**kwargs,
	)


	def vit_large(patch_size: int = 14, **kwargs) -> VisionTransformer:
	kwargs.setdefault("num_register_tokens", 1)
	return VisionTransformer(
	patch_size=patch_size,
	hidden_size=1024,
	num_layers=24,
	num_heads=16,
	mlp_ratio=4.0,
	**kwargs,
	)


	def vit_so400m(patch_size: int = 14, **kwargs) -> VisionTransformer:
	kwargs.setdefault("num_register_tokens", 1)
	return VisionTransformer(
	patch_size=patch_size,
	hidden_size=1152,
	num_layers=27,
	num_heads=16,
	mlp_ratio=4304 / 1152,
	**kwargs,
	)


	def vit_giant2(patch_size: int = 14, **kwargs) -> VisionTransformer:
	kwargs.setdefault("num_register_tokens", 1)
	return VisionTransformer(
	patch_size=patch_size,
	hidden_size=1536,
	num_layers=40,
	num_heads=24,
	mlp_ratio=4.0,
	**kwargs,
	)