Spaces:

omnipart
/

OmniPart

Running on Zero

App Files Files Community

OmniPart / modules /part_synthesis /models /structured_latent_vae /base.py

omnipart

init

491eded 5 months ago

raw

history blame

7.37 kB

	"""
	Base Sparse Transformer Implementation for TRELLIS Framework

	This file implements the base architecture for sparse transformers used in structured latent variable models.
	It provides a configurable foundation with multiple attention mechanisms (full, windowed, shifted window)
	and supports different positional encoding strategies. The sparse implementation allows for efficient
	processing of data with varying density patterns.

	The main class SparseTransformerBase serves as the foundation for encoder and decoder implementations
	in the structured latent VAE models.
	"""

	from typing import *
	import torch
	import torch.nn as nn
	from ...modules.utils import convert_module_to_f16, convert_module_to_f32
	from ...modules import sparse as sp
	from ...modules.transformer import AbsolutePositionEmbedder
	from ...modules.sparse.transformer import SparseTransformerBlock


	def block_attn_config(self):
	    """
	    Yield the attention configuration for each transformer block.

	    Reads ``self.attn_mode``, ``self.num_blocks`` and ``self.window_size`` and
	    produces one 5-tuple per block, laid out as
	    ``(attn_mode, window_size, shift_sequence, shift_window, serialize_mode)``:

	    - shift_window: serialized attention, window shift alternating between
	      ``(0, 0, 0)`` and ``(16, 16, 16)`` on even/odd blocks
	    - shift_sequence: serialized attention, sequence shift alternating between
	      ``0`` and ``window_size // 2`` on even/odd blocks
	    - shift_order: serialized attention, serialization order cycling through
	      ``sp.SerializeModes[0..3]``
	    - full: full attention, every other field is ``None``
	    - swin: windowed attention, window shift alternating between ``0`` and
	      ``window_size // 2`` on even/odd blocks

	    Yields:
	        Tuple containing the attention mode and its parameters.

	    Raises:
	        ValueError: If ``self.attn_mode`` is not one of the modes listed above.
	            Raised when the generator is first advanced.
	    """
	    mode = self.attn_mode
	    known_modes = ("shift_window", "shift_sequence", "shift_order", "full", "swin")
	    if mode not in known_modes:
	        # Without this check an unknown mode matched no branch, the generator
	        # yielded nothing, and the caller ended up with zero blocks.
	        raise ValueError(
	            f"Unknown attn_mode {mode!r}; expected one of {known_modes}"
	        )

	    for i in range(self.num_blocks):
	        odd = i % 2  # 0 on even blocks, 1 on odd blocks: toggles the shift
	        if mode == "shift_window":
	            yield "serialized", self.window_size, 0, (16 * odd,) * 3, sp.SerializeMode.Z_ORDER
	        elif mode == "shift_sequence":
	            yield "serialized", self.window_size, self.window_size // 2 * odd, (0, 0, 0), sp.SerializeMode.Z_ORDER
	        elif mode == "shift_order":
	            # NOTE(review): assumes sp.SerializeModes has at least four entries
	            # indexable by 0..3 -- confirm against the sparse module.
	            yield "serialized", self.window_size, 0, (0, 0, 0), sp.SerializeModes[i % 4]
	        elif mode == "full":
	            yield "full", None, None, None, None
	        else:  # mode == "swin"
	            yield "windowed", self.window_size, None, self.window_size // 2 * odd, None


	class SparseTransformerBase(nn.Module):
	    """
	    Sparse Transformer without output layers.
	    Serves as the base class for encoder and decoder.

	    Owns the input projection, an optional absolute positional embedder and a
	    stack of ``SparseTransformerBlock`` modules whose per-block attention
	    settings come from ``block_attn_config``.
	    """

	    def __init__(
	        self,
	        in_channels: int,
	        model_channels: int,
	        num_blocks: int,
	        num_heads: Optional[int] = None,
	        num_head_channels: Optional[int] = 64,
	        mlp_ratio: float = 4.0,
	        attn_mode: Literal["full", "shift_window", "shift_sequence", "shift_order", "swin"] = "full",
	        window_size: Optional[int] = None,
	        pe_mode: Literal["ape", "rope"] = "ape",
	        use_fp16: bool = False,
	        use_checkpoint: bool = False,
	        qk_rms_norm: bool = False,
	    ):
	        """
	        Initialize the sparse transformer base model.

	        Args:
	            in_channels: Number of input channels
	            model_channels: Hidden dimension size
	            num_blocks: Number of transformer blocks
	            num_heads: Number of attention heads; when falsy it is derived as
	                ``model_channels // num_head_channels``
	            num_head_channels: Number of channels per attention head
	            mlp_ratio: Ratio for MLP hidden dimension
	            attn_mode: Attention mechanism type
	            window_size: Size of attention window for windowed modes
	            pe_mode: Positional encoding mode; "ape" creates an
	                ``AbsolutePositionEmbedder``, "rope" sets ``use_rope=True``
	                on every block
	            use_fp16: Whether activations are cast to float16 in ``forward``
	            use_checkpoint: Forwarded to every block as ``use_checkpoint``
	            qk_rms_norm: Forwarded to every block as ``qk_rms_norm``
	        """
	        super().__init__()
	        self.in_channels = in_channels
	        self.model_channels = model_channels
	        self.num_blocks = num_blocks
	        self.window_size = window_size
	        self.num_heads = num_heads or model_channels // num_head_channels
	        self.mlp_ratio = mlp_ratio
	        self.attn_mode = attn_mode
	        self.pe_mode = pe_mode
	        self.use_fp16 = use_fp16
	        self.use_checkpoint = use_checkpoint
	        self.qk_rms_norm = qk_rms_norm
	        # Precision that forward() casts activations to before the blocks.
	        self.dtype = torch.float16 if use_fp16 else torch.float32

	        # Create positional embedder if using absolute positional encoding
	        if pe_mode == "ape":
	            self.pos_embedder = AbsolutePositionEmbedder(model_channels)

	        # Input projection layer
	        self.input_layer = sp.SparseLinear(in_channels, model_channels)

	        # Build transformer blocks with configurations from block_attn_config.
	        # The per-block names are prefixed so they do not shadow the
	        # constructor arguments `attn_mode` and `window_size`.
	        self.blocks = nn.ModuleList([
	            SparseTransformerBlock(
	                model_channels,
	                num_heads=self.num_heads,
	                mlp_ratio=self.mlp_ratio,
	                attn_mode=block_attn_mode,
	                window_size=block_window_size,
	                shift_sequence=shift_sequence,
	                shift_window=shift_window,
	                serialize_mode=serialize_mode,
	                use_checkpoint=self.use_checkpoint,
	                use_rope=(pe_mode == "rope"),
	                qk_rms_norm=self.qk_rms_norm,
	            )
	            for (
	                block_attn_mode,
	                block_window_size,
	                shift_sequence,
	                shift_window,
	                serialize_mode,
	            ) in block_attn_config(self)
	        ])

	    @property
	    def device(self) -> torch.device:
	        """
	        Return the device of the model, taken from its first parameter.
	        """
	        return next(self.parameters()).device

	    def convert_to_fp16(self) -> None:
	        """
	        Convert the torso of the model to float16 precision.

	        Only ``self.blocks`` is converted; the input layer and positional
	        embedder are left untouched. ``self.dtype`` and ``self.use_fp16`` are
	        updated too, so ``forward`` casts activations to the precision the
	        blocks now use.
	        """
	        self.use_fp16 = True
	        self.dtype = torch.float16
	        self.blocks.apply(convert_module_to_f16)

	    def convert_to_fp32(self) -> None:
	        """
	        Convert the torso of the model back to float32 precision.

	        Only ``self.blocks`` is converted. ``self.dtype`` and ``self.use_fp16``
	        are updated too, so ``forward`` stops casting activations to float16.
	        """
	        self.use_fp16 = False
	        self.dtype = torch.float32
	        self.blocks.apply(convert_module_to_f32)

	    def initialize_weights(self) -> None:
	        """
	        Initialize linear layers with Xavier uniform weights and zero biases.

	        Applied recursively to every submodule that is an ``nn.Linear``.
	        NOTE(review): whether ``sp.SparseLinear`` is matched depends on it
	        subclassing ``nn.Linear`` -- confirm in the sparse module.
	        """
	        def _basic_init(module):
	            if isinstance(module, nn.Linear):
	                torch.nn.init.xavier_uniform_(module.weight)
	                if module.bias is not None:
	                    nn.init.constant_(module.bias, 0)

	        self.apply(_basic_init)

	    def forward(self, x: sp.SparseTensor) -> sp.SparseTensor:
	        """
	        Forward pass through the sparse transformer.

	        Args:
	            x: Input sparse tensor

	        Returns:
	            Processed sparse tensor after passing through all transformer blocks
	        """
	        # Project input to model dimension
	        h = self.input_layer(x)

	        # Add positional embeddings if using absolute positional encoding.
	        # The first coords column is dropped -- presumably the batch index,
	        # TODO confirm against sp.SparseTensor.
	        if self.pe_mode == "ape":
	            h = h + self.pos_embedder(x.coords[:, 1:])

	        # Convert to target precision
	        h = h.type(self.dtype)

	        # Pass through transformer blocks sequentially
	        for block in self.blocks:
	            h = block(h)

	        return h