import torch
import torch.nn as nn
from typing import Optional, Union
from einops import repeat
import math
from torch_cluster import fps
import random
import time
import numpy as np

from ..modules import checkpoint
from ..modules.embedder import FourierEmbedder
from ..modules.transformer_blocks import (
    ResidualCrossAttentionBlock,
    Transformer
)

from .tsal_base import ShapeAsLatentModule


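# CrossAttentionEncoder: embeds an input point cloud with Fourier features, cross-attends
# a set of latent query tokens to it, and refines the tokens with a self-attention Transformer.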
class CrossAttentionEncoder(nn.Module):

    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[torch.dtype],
                 num_latents: int,
                 fourier_embedder: FourierEmbedder,
                 point_feats: int,
                 width: int,
                 heads: int,
                 layers: int,
                 init_scale: float = 0.25,
                 qkv_bias: bool = True,
                 flash: bool = False,
                 use_ln_post: bool = False,
                 use_checkpoint: bool = False,
                 query_method: bool = False,
                 use_full_input: bool = True,
                 token_num: int = 256,
                 no_query: bool = False):

        super().__init__()

        self.query_method = query_method
        self.token_num = token_num
        self.use_full_input = use_full_input

        self.use_checkpoint = use_checkpoint
        self.num_latents = num_latents

        if no_query:
            self.query = None
        else:
            self.query = nn.Parameter(torch.randn((num_latents, width), device=device, dtype=dtype) * 0.02)

        self.fourier_embedder = fourier_embedder
        self.input_proj = nn.Linear(self.fourier_embedder.out_dim + point_feats, width, device=device, dtype=dtype)
        self.cross_attn = ResidualCrossAttentionBlock(
            device=device,
            dtype=dtype,
            width=width,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
        )

        self.self_attn = Transformer(
            device=device,
            dtype=dtype,
            n_ctx=num_latents,
            width=width,
            layers=layers,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            use_checkpoint=False
        )

        if use_ln_post:
            self.ln_post = nn.LayerNorm(width, dtype=dtype, device=device)
        else:
            self.ln_post = None

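    # _forward has two modes: with query_method, a fixed set of learned query tokens
    # cross-attends to the embedded point cloud; otherwise the query tokens are built
    # from a randomly pre-sampled, then farthest-point-sampled, subset of the input points.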
    def _forward(self, pc, feats):
        """

        Args:
            pc (torch.FloatTensor): [B, N, 3]
            feats (torch.FloatTensor or None): [B, N, C]

        Returns:
            latents (torch.FloatTensor): [B, M, width]
            pc (torch.FloatTensor): [B, N, 3]
            token_num (int): number of latent tokens used in this pass
            pre_pc (torch.FloatTensor or None): pre-sampled points concatenated with their
                features, or None when query_method is used
        """
        if self.query_method:
            token_num = self.num_latents
            bs = pc.shape[0]
            data = self.fourier_embedder(pc)
            if feats is not None:
                data = torch.cat([data, feats], dim=-1)
            data = self.input_proj(data)

            query = repeat(self.query, "m c -> b m c", b=bs)

            latents = self.cross_attn(query, data)
            latents = self.self_attn(latents)

            if self.ln_post is not None:
                latents = self.ln_post(latents)

            pre_pc = None
        else:
            if isinstance(self.token_num, int):
                token_num = self.token_num
            else:
                token_num = random.choice(self.token_num)

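            # Randomly pre-sample 4x the target token count (with replacement if the cloud
            # is smaller); use a fixed seed at eval time so the sampled queries are deterministic.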
            if self.training:
                rng = np.random.default_rng()
            else:
                rng = np.random.default_rng(seed=0)
            ind = rng.choice(pc.shape[1], token_num * 4, replace=token_num * 4 > pc.shape[1])

            pre_pc = pc[:, ind, :]
            pre_feats = feats[:, ind, :]

            B, N, D = pre_pc.shape
            C = pre_feats.shape[-1]

            pos = pre_pc.view(B * N, D)
            pos_feats = pre_feats.view(B * N, C)
            batch = torch.arange(B).to(pc.device)
            batch = torch.repeat_interleave(batch, N)

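            # Farthest point sampling keeps 1/4 of the pre-sampled points,
            # i.e. token_num query points per shape.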
            idx = fps(pos, batch, ratio=1. / 4, random_start=self.training)

            sampled_pc = pos[idx]
            sampled_pc = sampled_pc.view(B, -1, 3)

            sampled_feats = pos_feats[idx]
            sampled_feats = sampled_feats.view(B, -1, C)

            if self.use_full_input:
                data = self.fourier_embedder(pc)
            else:
                data = self.fourier_embedder(pre_pc)

            if feats is not None:
                if not self.use_full_input:
                    feats = pre_feats
                data = torch.cat([data, feats], dim=-1)
            data = self.input_proj(data)

            sampled_data = self.fourier_embedder(sampled_pc)
            if feats is not None:
                sampled_data = torch.cat([sampled_data, sampled_feats], dim=-1)
            sampled_data = self.input_proj(sampled_data)

            latents = self.cross_attn(sampled_data, data)
            latents = self.self_attn(latents)

            if self.ln_post is not None:
                latents = self.ln_post(latents)

            pre_pc = torch.cat([pre_pc, pre_feats], dim=-1)

        return latents, pc, token_num, pre_pc

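    # forward wraps _forward with (optional) gradient checkpointing.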
    def forward(self, pc: torch.FloatTensor, feats: Optional[torch.FloatTensor] = None):
        """

        Args:
            pc (torch.FloatTensor): [B, N, 3]
            feats (torch.FloatTensor or None): [B, N, C]

        Returns:
            see `_forward`: (latents, pc, token_num, pre_pc)
        """

        return checkpoint(self._forward, (pc, feats), self.parameters(), self.use_checkpoint)


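# CrossAttentionDecoder: predicts occupancy (and optionally SDF) logits at query points
# by cross-attending Fourier-embedded queries into the latent set.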
class CrossAttentionDecoder(nn.Module):

    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[torch.dtype],
                 num_latents: int,
                 out_channels: int,
                 fourier_embedder: FourierEmbedder,
                 width: int,
                 heads: int,
                 init_scale: float = 0.25,
                 qkv_bias: bool = True,
                 flash: bool = False,
                 use_checkpoint: bool = False,
                 mlp_width_scale: int = 4,
                 supervision_type: str = 'occupancy'):

        super().__init__()

        self.use_checkpoint = use_checkpoint
        self.fourier_embedder = fourier_embedder
        self.supervision_type = supervision_type

        self.query_proj = nn.Linear(self.fourier_embedder.out_dim, width, device=device, dtype=dtype)

        self.cross_attn_decoder = ResidualCrossAttentionBlock(
            device=device,
            dtype=dtype,
            n_data=num_latents,
            width=width,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            mlp_width_scale=mlp_width_scale,
        )

        self.ln_post = nn.LayerNorm(width, device=device, dtype=dtype)
        self.output_proj = nn.Linear(width, out_channels, device=device, dtype=dtype)
        if self.supervision_type == 'occupancy-sdf':
            self.output_proj_sdf = nn.Linear(width, out_channels, device=device, dtype=dtype)

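    # _forward matches the query/latent dtype to the module weights under fp16, projects the
    # embedded queries, and cross-attends into the latents; with 'occupancy-sdf' supervision
    # a second head also predicts an SDF value.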
    def _forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
        if next(self.query_proj.parameters()).dtype == torch.float16:
            queries = queries.half()
            latents = latents.half()

        queries = self.query_proj(self.fourier_embedder(queries))
        x = self.cross_attn_decoder(queries, latents)
        x = self.ln_post(x)
        x_1 = self.output_proj(x)
        if self.supervision_type == 'occupancy-sdf':
            x_2 = self.output_proj_sdf(x)
            return x_1, x_2
        else:
            return x_1

    def forward(self, queries: torch.FloatTensor, latents: torch.FloatTensor):
        return checkpoint(self._forward, (queries, latents), self.parameters(), self.use_checkpoint)


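# ShapeAsLatentPerceiver: full shape autoencoder (CrossAttentionEncoder, optional KL
# bottleneck, latent Transformer, and a CrossAttentionDecoder over query points).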
class ShapeAsLatentPerceiver(ShapeAsLatentModule):
    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[torch.dtype],
                 num_latents: int,
                 point_feats: int = 0,
                 embed_dim: int = 0,
                 num_freqs: int = 8,
                 include_pi: bool = True,
                 width: int,
                 heads: int,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 decoder_width: Optional[int] = None,
                 init_scale: float = 0.25,
                 qkv_bias: bool = True,
                 flash: bool = False,
                 use_ln_post: bool = False,
                 use_checkpoint: bool = False,
                 supervision_type: str = 'occupancy',
                 query_method: bool = False,
                 token_num: int = 256,
                 grad_type: str = "numerical",
                 grad_interval: float = 0.005,
                 use_full_input: bool = True,
                 freeze_encoder: bool = False,
                 decoder_mlp_width_scale: int = 4,
                 residual_kl: bool = False,
                 ):

        super().__init__()

        self.use_checkpoint = use_checkpoint

        self.num_latents = num_latents
        assert grad_type in ["numerical", "analytical"]
        self.grad_type = grad_type
        self.grad_interval = grad_interval
        self.supervision_type = supervision_type
        self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)

        init_scale = init_scale * math.sqrt(1.0 / width)
        self.encoder = CrossAttentionEncoder(
            device=device,
            dtype=dtype,
            fourier_embedder=self.fourier_embedder,
            num_latents=num_latents,
            point_feats=point_feats,
            width=width,
            heads=heads,
            layers=num_encoder_layers,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            use_ln_post=use_ln_post,
            use_checkpoint=use_checkpoint,
            query_method=query_method,
            use_full_input=use_full_input,
            token_num=token_num
        )

        self.embed_dim = embed_dim
        self.residual_kl = residual_kl
        if decoder_width is None:
            decoder_width = width

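        # KL bottleneck: pre_kl projects latents to (mean, logvar); post_kl maps sampled
        # codes back to the decoder width. residual_kl requires the widths to divide evenly.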
        if embed_dim > 0:
            self.pre_kl = nn.Linear(width, embed_dim * 2, device=device, dtype=dtype)
            self.post_kl = nn.Linear(embed_dim, decoder_width, device=device, dtype=dtype)
            self.latent_shape = (num_latents, embed_dim)
            if self.residual_kl:
                assert self.post_kl.out_features % self.post_kl.in_features == 0
                assert self.pre_kl.in_features % self.pre_kl.out_features == 0
        else:
            self.latent_shape = (num_latents, width)

        self.transformer = Transformer(
            device=device,
            dtype=dtype,
            n_ctx=num_latents,
            width=decoder_width,
            layers=num_decoder_layers,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            use_checkpoint=use_checkpoint
        )

        self.geo_decoder = CrossAttentionDecoder(
            device=device,
            dtype=dtype,
            fourier_embedder=self.fourier_embedder,
            out_channels=1,
            num_latents=num_latents,
            width=decoder_width,
            heads=heads,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            use_checkpoint=use_checkpoint,
            supervision_type=supervision_type,
            mlp_width_scale=decoder_mlp_width_scale
        )

        if freeze_encoder:
            for p in self.encoder.parameters():
                p.requires_grad = False
            # pre_kl only exists when embed_dim > 0; guard so freezing does not fail otherwise.
            if self.embed_dim > 0:
                for p in self.pre_kl.parameters():
                    p.requires_grad = False
            print("freeze encoder and pre kl")

    def forward(self,
                pc: torch.FloatTensor,
                feats: torch.FloatTensor,
                volume_queries: torch.FloatTensor,
                sample_posterior: bool = True):
        """

        Args:
            pc (torch.FloatTensor): [B, N, 3]
            feats (torch.FloatTensor or None): [B, N, C]
            volume_queries (torch.FloatTensor): [B, P, 3]
            sample_posterior (bool): whether to sample from the latent posterior

        Returns:
            logits (torch.FloatTensor): [B, P]
            center_pos (torch.FloatTensor): [B, M, 3]
            posterior (DiagonalGaussianDistribution or None):
        """

        latents, center_pos, posterior = self.encode(pc, feats, sample_posterior=sample_posterior)

        latents = self.decode(latents)
        logits = self.query_geometry(volume_queries, latents)

        return logits, center_pos, posterior


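# AlignedShapeLatentPerceiver: reserves one extra latent token (1 + num_latents in total)
# whose output is returned as a global shape embedding; dtype may be given as a string name.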
class AlignedShapeLatentPerceiver(ShapeAsLatentPerceiver):

    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[str],
                 num_latents: int,
                 point_feats: int = 0,
                 embed_dim: int = 0,
                 num_freqs: int = 8,
                 include_pi: bool = True,
                 width: int,
                 heads: int,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 decoder_width: Optional[int] = None,
                 init_scale: float = 0.25,
                 qkv_bias: bool = True,
                 flash: bool = False,
                 use_ln_post: bool = False,
                 use_checkpoint: bool = False,
                 supervision_type: str = 'occupancy',
                 grad_type: str = "numerical",
                 grad_interval: float = 0.005,
                 query_method: bool = False,
                 use_full_input: bool = True,
                 token_num: int = 256,
                 freeze_encoder: bool = False,
                 decoder_mlp_width_scale: int = 4,
                 residual_kl: bool = False,
                 ):

        MAP_DTYPE = {
            'float32': torch.float32,
            'float16': torch.float16,
            'bfloat16': torch.bfloat16,
        }
        if dtype is not None:
            dtype = MAP_DTYPE[dtype]
        super().__init__(
            device=device,
            dtype=dtype,
            num_latents=1 + num_latents,
            point_feats=point_feats,
            embed_dim=embed_dim,
            num_freqs=num_freqs,
            include_pi=include_pi,
            width=width,
            decoder_width=decoder_width,
            heads=heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            use_ln_post=use_ln_post,
            use_checkpoint=use_checkpoint,
            supervision_type=supervision_type,
            grad_type=grad_type,
            grad_interval=grad_interval,
            query_method=query_method,
            token_num=token_num,
            use_full_input=use_full_input,
            freeze_encoder=freeze_encoder,
            decoder_mlp_width_scale=decoder_mlp_width_scale,
            residual_kl=residual_kl,
        )

        self.width = width

    def encode(self,
               pc: torch.FloatTensor,
               feats: Optional[torch.FloatTensor] = None,
               sample_posterior: bool = True,
               only_shape: bool = False):
        """

        Args:
            pc (torch.FloatTensor): [B, N, 3]
            feats (torch.FloatTensor or None): [B, N, c]
            sample_posterior (bool): whether to sample from the latent posterior
            only_shape (bool): if True, return only the global shape embedding

        Returns:
            shape_embed (torch.FloatTensor):
            kl_embed (torch.FloatTensor):
            posterior (DiagonalGaussianDistribution or None):
            token_num (int):
            pre_pc (torch.FloatTensor or None):
        """

        shape_embed, latents, token_num, pre_pc = self.encode_latents(pc, feats)
        if only_shape:
            return shape_embed
        kl_embed, posterior = self.encode_kl_embed(latents, sample_posterior)

        return shape_embed, kl_embed, posterior, token_num, pre_pc

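    # encode_latents runs the encoder and splits off the first token as the global
    # shape embedding; the full token sequence is kept as the latent set.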
    def encode_latents(self,
                       pc: torch.FloatTensor,
                       feats: Optional[torch.FloatTensor] = None):

        x, _, token_num, pre_pc = self.encoder(pc, feats)

        shape_embed = x[:, 0]
        latents = x

        return shape_embed, latents, token_num, pre_pc

    def forward(self,
                pc: torch.FloatTensor,
                feats: torch.FloatTensor,
                volume_queries: torch.FloatTensor,
                sample_posterior: bool = True):
        raise NotImplementedError()


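# ShapeAsLatentPerceiverEncoder: encoder-only variant (no learned query tokens, no KL
# projection layers, no decoder), for cases where only point-cloud latents are needed.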
class ShapeAsLatentPerceiverEncoder(ShapeAsLatentModule):
    def __init__(self, *,
                 device: Optional[torch.device],
                 dtype: Optional[Union[torch.dtype, str]],
                 num_latents: int,
                 point_feats: int = 0,
                 embed_dim: int = 0,
                 num_freqs: int = 8,
                 include_pi: bool = True,
                 width: int,
                 heads: int,
                 num_encoder_layers: int,
                 init_scale: float = 0.25,
                 qkv_bias: bool = True,
                 flash: bool = False,
                 use_ln_post: bool = False,
                 use_checkpoint: bool = False,
                 supervision_type: str = 'occupancy',
                 query_method: bool = False,
                 token_num: int = 256,
                 grad_type: str = "numerical",
                 grad_interval: float = 0.005,
                 use_full_input: bool = True,
                 freeze_encoder: bool = False,
                 residual_kl: bool = False,
                 ):

        super().__init__()

        MAP_DTYPE = {
            'float32': torch.float32,
            'float16': torch.float16,
            'bfloat16': torch.bfloat16,
        }

        if dtype is not None and isinstance(dtype, str):
            dtype = MAP_DTYPE[dtype]

        self.use_checkpoint = use_checkpoint

        self.num_latents = num_latents
        assert grad_type in ["numerical", "analytical"]
        self.grad_type = grad_type
        self.grad_interval = grad_interval
        self.supervision_type = supervision_type
        self.fourier_embedder = FourierEmbedder(num_freqs=num_freqs, include_pi=include_pi)

        init_scale = init_scale * math.sqrt(1.0 / width)
        self.encoder = CrossAttentionEncoder(
            device=device,
            dtype=dtype,
            fourier_embedder=self.fourier_embedder,
            num_latents=num_latents,
            point_feats=point_feats,
            width=width,
            heads=heads,
            layers=num_encoder_layers,
            init_scale=init_scale,
            qkv_bias=qkv_bias,
            flash=flash,
            use_ln_post=use_ln_post,
            use_checkpoint=use_checkpoint,
            query_method=query_method,
            use_full_input=use_full_input,
            token_num=token_num,
            no_query=True,
        )

        self.embed_dim = embed_dim
        self.residual_kl = residual_kl
        if freeze_encoder:
            for p in self.encoder.parameters():
                p.requires_grad = False
            print("freeze encoder")
        self.width = width

    def encode_latents(self,
                       pc: torch.FloatTensor,
                       feats: Optional[torch.FloatTensor] = None):

        x, _, token_num, pre_pc = self.encoder(pc, feats)

        shape_embed = x[:, 0]
        latents = x

        return shape_embed, latents, token_num, pre_pc

    def forward(self):
        raise NotImplementedError()
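

# Usage sketch (hedged: assumes the surrounding package layout and the base-class helpers
# such as encode_kl_embed are available; shapes and hyperparameters below are illustrative,
# not the values used in training):
#
#   model = AlignedShapeLatentPerceiver(
#       device=None, dtype=None, num_latents=256, point_feats=3, embed_dim=64,
#       width=768, heads=12, num_encoder_layers=8, num_decoder_layers=16,
#   )
#   pc = torch.rand(2, 4096, 3) * 2 - 1      # [B, N, 3] query point cloud
#   feats = torch.rand(2, 4096, 3)           # [B, N, C] per-point features (C == point_feats)
#   shape_embed, kl_embed, posterior, token_num, pre_pc = model.encode(pc, feats)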