Spaces:

multimodalart
/

PersonaLive

Running on Zero

App Files Files Community

PersonaLive / src /models /motion_module.py

apolinario

ZeroGPU backend self-test: PersonaLive pipeline on Blackwell

d1d91bf 16 days ago

Raw

History Blame Contribute Delete

17 kB

	import math
	from dataclasses import dataclass
	from typing import Callable, Optional

	import torch
	from diffusers.models.attention import FeedForward
	from diffusers.models.attention_processor import Attention, AttnProcessor
	from diffusers.utils import BaseOutput
	from diffusers.utils.import_utils import is_xformers_available
	from einops import rearrange, repeat
	from torch import nn


	def zero_module(module):
	    """Zero every parameter of ``module`` in place and return the same module.

	    Only ``nn.Conv2d`` and ``nn.Linear`` instances are accepted; any other
	    type trips the assertion, whose message is the offending type.
	    """
	    assert isinstance(module, (nn.Conv2d, nn.Linear)), type(module)
	    # Disable autograd tracking so the in-place writes on leaf parameters
	    # are permitted and leave no graph history.
	    with torch.no_grad():
	        for param in module.parameters():
	            param.zero_()
	    return module

	def random_module(m):
	    """Re-initialise ``m`` in place and return it.

	    Weights are drawn from a zero-mean normal with ``std = sqrt(2 / n)``
	    (He initialisation), where ``n`` is ``kernel_h * kernel_w * in_channels``
	    for ``nn.Conv2d`` and ``in_features`` for ``nn.Linear``. A bias, when
	    present, is set to zero. Any other module type trips the assertion.
	    """
	    assert isinstance(m, (nn.Conv2d, nn.Linear)), type(m)
	    if isinstance(m, nn.Conv2d):
	        kernel_h, kernel_w = m.kernel_size[0], m.kernel_size[1]
	        fan_in = kernel_h * kernel_w * m.in_channels
	    else:
	        fan_in = m.in_features
	    nn.init.normal_(m.weight, mean=0.0, std=math.sqrt(2. / fan_in))
	    if m.bias is not None:
	        nn.init.zeros_(m.bias)
	    return m


	# Output container in the diffusers BaseOutput style: a dataclass with a
	# single `sample` tensor field.
	# NOTE(review): nothing in the visible code instantiates this class;
	# TemporalTransformer3DModel.forward returns a bare tensor instead.
	@dataclass
	class TemporalTransformer3DModelOutput(BaseOutput):
	sample: torch.FloatTensor


	# Import xformers only when diffusers reports it as available. Otherwise
	# bind the name to None so that later references to `xformers` resolve to
	# None instead of raising NameError.
	if is_xformers_available():
	import xformers
	import xformers.ops
	else:
	xformers = None


	def get_motion_module(in_channels, motion_module_type: str, motion_module_kwargs: dict):
	    """Build the temporal module selected by ``motion_module_type``.

	    Every supported type maps to ``VanillaTemporalModule``; the type only
	    decides which reference-image flag is switched on:

	    * ``"Vanilla"``              - no extra flag
	    * ``"RefImage_Vanilla"``     - ``skip_ref_image=True``
	    * ``"RefImageCond_Vanilla"`` - ``cond_ref_image=True``

	    Args:
	        in_channels: forwarded as ``in_channels`` to the module.
	        motion_module_type: one of the names listed above.
	        motion_module_kwargs: extra keyword arguments forwarded to the
	            module constructor.

	    Returns:
	        The constructed ``VanillaTemporalModule``.

	    Raises:
	        ValueError: if ``motion_module_type`` is not a supported name.
	    """
	    flags_by_type = {
	        "Vanilla": {},
	        "RefImage_Vanilla": {"skip_ref_image": True},
	        "RefImageCond_Vanilla": {"cond_ref_image": True},
	    }
	    if motion_module_type not in flags_by_type:
	        # Name the offending value and the accepted ones so a typo in a
	        # config is easy to diagnose.
	        raise ValueError(
	            f"Unknown motion_module_type {motion_module_type!r}; "
	            f"expected one of {list(flags_by_type)}"
	        )
	    return VanillaTemporalModule(
	        in_channels=in_channels,
	        **flags_by_type[motion_module_type],
	        **motion_module_kwargs,
	    )


	class VanillaTemporalModule(nn.Module):
	    """Wrap a ``TemporalTransformer3DModel`` with optional reference-frame handling.

	    The frame axis is dim 2 of the input. Two flags control how the last
	    frame along that axis is treated in ``forward``:

	    * ``skip_ref_image``: the last frame is split off before the transformer
	      and concatenated back, untouched, afterwards.
	    * ``cond_ref_image``: the last frame goes through the transformer with the
	      others, but the last frame of the output is replaced by the last frame
	      of the input.

	    When ``zero_initialize`` is true the transformer's ``proj_out`` layer is
	    zeroed via ``zero_module``.
	    """

	    def __init__(
	        self,
	        in_channels,
	        num_attention_heads=8,
	        num_transformer_block=2,
	        attention_block_types=("Temporal_Self", "Temporal_Self"),
	        cross_attention_dim=768,
	        cross_frame_attention_mode=None,
	        temporal_position_encoding=False,
	        temporal_position_encoding_max_len=24,
	        temporal_attention_dim_div=1,
	        zero_initialize=True,
	        skip_ref_image=False,
	        cond_ref_image=False,
	    ):
	        super().__init__()
	        self.skip_ref_image = skip_ref_image
	        self.cond_ref_image = cond_ref_image

	        # Per-head width: channels split across heads, then optionally shrunk.
	        head_dim = in_channels // num_attention_heads // temporal_attention_dim_div
	        self.temporal_transformer = TemporalTransformer3DModel(
	            in_channels=in_channels,
	            num_attention_heads=num_attention_heads,
	            attention_head_dim=head_dim,
	            num_layers=num_transformer_block,
	            attention_block_types=attention_block_types,
	            cross_attention_dim=cross_attention_dim,
	            cross_frame_attention_mode=cross_frame_attention_mode,
	            temporal_position_encoding=temporal_position_encoding,
	            temporal_position_encoding_max_len=temporal_position_encoding_max_len,
	        )

	        if zero_initialize:
	            out_projection = self.temporal_transformer.proj_out
	            self.temporal_transformer.proj_out = zero_module(out_projection)

	    def set_use_cross_frame_attention(self, value):
	        # NOTE(review): despite its name, this setter toggles `skip_ref_image`.
	        self.skip_ref_image = value

	    def forward(
	        self,
	        input_tensor,
	        temb,
	        encoder_hidden_states,
	        attention_mask=None,
	        anchor_frame_idx=None,
	        debug=False
	    ):
	        """Run the temporal transformer over ``input_tensor``.

	        ``temb`` and ``anchor_frame_idx`` are accepted but not read here.
	        """
	        if self.skip_ref_image:
	            # Keep the last frame out of the temporal transformer.
	            frames = input_tensor[:, :, :-1]
	            ref_hidden_states = input_tensor[:, :, -1:]
	        else:
	            frames = input_tensor

	        transformed = self.temporal_transformer(
	            frames, encoder_hidden_states, attention_mask, debug=debug
	        )

	        if self.skip_ref_image:
	            return torch.cat([transformed, ref_hidden_states], dim=2)
	        if self.cond_ref_image:
	            # Discard the transformed last frame and restore the input's one.
	            return torch.cat([transformed[:, :, :-1], input_tensor[:, :, -1:]], dim=2)
	        return transformed


	class TemporalTransformer3DModel(nn.Module):
	    """Stack of ``TemporalTransformerBlock`` layers with a residual connection.

	    ``forward`` expects a 5-D ``(b, c, f, h, w)`` tensor. Frames are folded
	    into the batch axis, the result is group-normalised, flattened to
	    ``(b*f, h*w, c)`` tokens, projected to
	    ``num_attention_heads * attention_head_dim`` features, run through
	    ``num_layers`` blocks, projected back to ``in_channels`` and added to the
	    un-normalised input.
	    """

	    def __init__(
	        self,
	        in_channels,
	        num_attention_heads,
	        attention_head_dim,
	        num_layers,
	        attention_block_types=(
	            "Temporal_Self",
	            "Temporal_Self",
	        ),
	        dropout=0.0,
	        norm_num_groups=32,
	        cross_attention_dim=768,
	        activation_fn="geglu",
	        attention_bias=False,
	        upcast_attention=False,
	        cross_frame_attention_mode=None,
	        temporal_position_encoding=False,
	        temporal_position_encoding_max_len=24,
	    ):
	        super().__init__()

	        inner_dim = num_attention_heads * attention_head_dim

	        self.norm = torch.nn.GroupNorm(
	            num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True
	        )
	        self.proj_in = nn.Linear(in_channels, inner_dim)

	        self.transformer_blocks = nn.ModuleList(
	            [
	                TemporalTransformerBlock(
	                    dim=inner_dim,
	                    num_attention_heads=num_attention_heads,
	                    attention_head_dim=attention_head_dim,
	                    attention_block_types=attention_block_types,
	                    dropout=dropout,
	                    norm_num_groups=norm_num_groups,
	                    cross_attention_dim=cross_attention_dim,
	                    activation_fn=activation_fn,
	                    attention_bias=attention_bias,
	                    upcast_attention=upcast_attention,
	                    cross_frame_attention_mode=cross_frame_attention_mode,
	                    temporal_position_encoding=temporal_position_encoding,
	                    temporal_position_encoding_max_len=temporal_position_encoding_max_len,
	                )
	                for _ in range(num_layers)
	            ]
	        )
	        self.proj_out = nn.Linear(inner_dim, in_channels)

	    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, debug=False):
	        """Apply the temporal blocks and return a tensor shaped like the input.

	        ``attention_mask`` and ``debug`` are accepted for call compatibility
	        but are not read by this method.
	        """
	        assert (
	            hidden_states.dim() == 5
	        ), f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
	        video_length = hidden_states.shape[2]
	        hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")

	        if encoder_hidden_states is not None and encoder_hidden_states.ndim == 4:
	            # 4-D encoder states carry one entry per frame on dim 1; fold that
	            # axis into the batch to match the folded hidden states.
	            assert encoder_hidden_states.shape[1] == video_length, (video_length, encoder_hidden_states.shape)
	            encoder_hidden_states = rearrange(encoder_hidden_states, "b d n c -> (b d) n c")

	        batch, channels, height, width = hidden_states.shape
	        residual = hidden_states

	        hidden_states = self.norm(hidden_states)
	        # (b*f, c, h, w) -> (b*f, h*w, c): one token per spatial location.
	        hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(
	            batch, height * width, channels
	        )
	        hidden_states = self.proj_in(hidden_states)

	        # Transformer Blocks
	        for block in self.transformer_blocks:
	            hidden_states = block(
	                hidden_states,
	                encoder_hidden_states=encoder_hidden_states,
	                video_length=video_length,
	            )

	        # proj_out maps back to `in_channels`, so the token grid can be
	        # restored to (b*f, c, h, w) using the input's channel count.
	        hidden_states = self.proj_out(hidden_states)
	        hidden_states = (
	            hidden_states.reshape(batch, height, width, channels)
	            .permute(0, 3, 1, 2)
	            .contiguous()
	        )

	        output = hidden_states + residual
	        return rearrange(output, "(b f) c h w -> b c f h w", f=video_length)


	class TemporalTransformerBlock(nn.Module):
	    """Pre-norm transformer block: N attention layers followed by a feed-forward.

	    One ``VersatileAttention`` (plus its own ``LayerNorm``) is created per
	    entry of ``attention_block_types``. The text before the first ``_`` in
	    each entry selects the attention mode, and an entry ending in ``_Cross``
	    gets ``cross_attention_dim``; all others get ``None``. Every sub-layer
	    is wrapped in a residual connection. When ``proj_out_dim`` is given, a
	    final linear layer maps the result to that width.
	    """

	    def __init__(
	        self,
	        dim,
	        num_attention_heads,
	        attention_head_dim,
	        attention_block_types=(
	            "Temporal_Self",
	            "Temporal_Self",
	        ),
	        dropout=0.0,
	        norm_num_groups=32,
	        cross_attention_dim=768,
	        activation_fn="geglu",
	        attention_bias=False,
	        upcast_attention=False,
	        cross_frame_attention_mode=None,
	        temporal_position_encoding=False,
	        temporal_position_encoding_max_len=24,
	        proj_out_dim=None,
	    ):
	        super().__init__()

	        self.attention_blocks = nn.ModuleList()
	        self.norms = nn.ModuleList()
	        for block_name in attention_block_types:
	            is_cross = block_name.endswith("_Cross")
	            attention = VersatileAttention(
	                attention_mode=block_name.split("_")[0],
	                cross_attention_dim=cross_attention_dim if is_cross else None,
	                query_dim=dim,
	                heads=num_attention_heads,
	                dim_head=attention_head_dim,
	                dropout=dropout,
	                bias=attention_bias,
	                upcast_attention=upcast_attention,
	                cross_frame_attention_mode=cross_frame_attention_mode,
	                temporal_position_encoding=temporal_position_encoding,
	                temporal_position_encoding_max_len=temporal_position_encoding_max_len,
	            )
	            self.attention_blocks.append(attention)
	            self.norms.append(nn.LayerNorm(dim))

	        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
	        self.ff_norm = nn.LayerNorm(dim)

	        if proj_out_dim is not None:
	            self.proj_out = nn.Linear(dim, proj_out_dim)
	        else:
	            self.proj_out = None

	    def forward(
	        self,
	        hidden_states,
	        encoder_hidden_states=None,
	        attention_mask=None,
	        video_length=None,
	        att_flag=False
	    ):
	        """Apply each attention layer, then the feed-forward, with residuals.

	        ``attention_mask`` is accepted but not read here. ``att_flag`` turns
	        on debug printing of mean absolute activations and is forwarded to
	        the attention layers.
	        """
	        for attention_block, norm in zip(self.attention_blocks, self.norms):
	            normed = norm(hidden_states)
	            if att_flag:
	                print(
	                    'block',
	                    round(torch.abs(hidden_states).mean().item(), 6),
	                    round(torch.abs(normed).mean().item(), 6),
	                )
	            # Only cross-attention layers are given the encoder states.
	            context = encoder_hidden_states if attention_block.is_cross_attention else None
	            attended = attention_block(
	                normed,
	                encoder_hidden_states=context,
	                video_length=video_length,
	                att_flag=att_flag
	            )
	            hidden_states = attended + hidden_states

	        hidden_states = self.ff(self.ff_norm(hidden_states)) + hidden_states

	        if self.proj_out is None:
	            return hidden_states
	        return self.proj_out(hidden_states)


	class PositionalEncoding(nn.Module):
	    """Add fixed sinusoidal position encodings along dim 1 of the input.

	    A ``(1, max_len, d_model)`` table is built once and stored as the buffer
	    ``pe``. Even feature columns hold ``sin(position * div_term)`` and odd
	    feature columns hold the matching cosine.

	    Args:
	        d_model: size of the last (feature) dimension. Odd values are
	            supported: the final sine column then has no cosine partner.
	        dropout: probability of the dropout applied after the addition.
	        max_len: number of positions stored in the table.
	    """

	    def __init__(self, d_model, dropout=0.0, max_len=24):
	        super().__init__()
	        self.dropout = nn.Dropout(p=dropout)
	        position = torch.arange(max_len).unsqueeze(1)
	        div_term = torch.exp(
	            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
	        )
	        angles = position * div_term
	        pe = torch.zeros(1, max_len, d_model)
	        pe[0, :, 0::2] = torch.sin(angles)
	        # With an odd d_model there is one fewer odd column than even
	        # columns, so trim the angles to fit; for even d_model the slice is
	        # the whole tensor and nothing changes.
	        pe[0, :, 1::2] = torch.cos(angles[:, : d_model // 2])
	        self.register_buffer("pe", pe)

	    def forward(self, x):
	        """Return ``dropout(x + pe[:, :x.size(1)])``."""
	        x = x + self.pe[:, : x.size(1)]
	        return self.dropout(x)


	class VersatileAttention(Attention):
	    """diffusers ``Attention`` subclass that can attend along the frame axis.

	    In ``"Temporal"`` mode ``forward`` regroups ``((b f), d, c)`` tokens into
	    ``((b d), f, c)`` so that attention runs across frames for each spatial
	    location, then restores the original layout. ``"Spatial"`` mode leaves
	    the layout untouched.

	    Args:
	        attention_mode: ``"Temporal"`` or ``"Spatial"``.
	        cross_frame_attention_mode: accepted but not read by this class.
	        temporal_position_encoding: when true (and the mode is Temporal) a
	            ``PositionalEncoding`` over ``query_dim`` features is created.
	        temporal_position_encoding_max_len: table length for that encoding.
	        *args, **kwargs: forwarded to ``Attention.__init__``. ``query_dim``
	            must be passed by keyword when position encoding is enabled.
	    """

	    def __init__(
	        self,
	        attention_mode=None,
	        cross_frame_attention_mode=None,
	        temporal_position_encoding=False,
	        temporal_position_encoding_max_len=24,
	        *args,
	        **kwargs,
	    ):
	        # Forward the remaining arguments unpacked. Passing `args` as one
	        # tuple and star-unpacking `kwargs` would hand the base class the
	        # tuple and the dictionary *keys* as positional arguments.
	        super().__init__(*args, **kwargs)
	        assert attention_mode in ["Temporal", "Spatial"], attention_mode

	        self.attention_mode = attention_mode
	        # A missing `cross_attention_dim` means self-attention, same as None.
	        self.is_cross_attention = kwargs.get("cross_attention_dim") is not None

	        self.pos_encoder = (
	            PositionalEncoding(
	                kwargs["query_dim"],
	                dropout=0.0,
	                max_len=temporal_position_encoding_max_len,
	            )
	            if (temporal_position_encoding and attention_mode == "Temporal")
	            else None
	        )

	    def extra_repr(self):
	        return f"(Module Info) Attention_Mode: {self.attention_mode}, Is_Cross_Attention: {self.is_cross_attention}"

	    def set_use_memory_efficient_attention_xformers(
	        self,
	        use_memory_efficient_attention_xformers: bool,
	        attention_op: Optional[Callable] = None,
	    ):
	        """Validate the xformers request, then install the plain ``AttnProcessor``.

	        ``attention_op`` is accepted but not used: the default processor is
	        installed whether or not xformers was requested.

	        Raises:
	            ModuleNotFoundError: xformers was requested but is not available.
	            ValueError: xformers was requested but CUDA is not available.
	        """
	        if use_memory_efficient_attention_xformers:
	            if not is_xformers_available():
	                raise ModuleNotFoundError(
	                    (
	                        "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
	                        " xformers"
	                    ),
	                    name="xformers",
	                )
	            if not torch.cuda.is_available():
	                raise ValueError(
	                    "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is"
	                    " only available for GPU "
	                )
	            # Make sure we can run the memory efficient attention; any
	            # failure propagates to the caller unchanged.
	            _ = xformers.ops.memory_efficient_attention(
	                torch.randn((1, 2, 40), device="cuda"),
	                torch.randn((1, 2, 40), device="cuda"),
	                torch.randn((1, 2, 40), device="cuda"),
	            )

	        # XFormersAttnProcessor corrupts video generation and work with Pytorch 1.13.
	        # Pytorch 2.0.1 AttnProcessor works the same as XFormersAttnProcessor in Pytorch 1.13.
	        # You don't need XFormersAttnProcessor here.
	        self.set_processor(AttnProcessor())

	    def forward(
	        self,
	        hidden_states,
	        encoder_hidden_states=None,
	        attention_mask=None,
	        video_length=None,
	        bank=None,
	        att_flag=False,
	        **cross_attention_kwargs,
	    ):
	        """Run attention, regrouping tokens by frame in Temporal mode.

	        Args:
	            hidden_states: ``((b f), d, c)`` tokens, as implied by the
	                rearrange pattern used in Temporal mode.
	            encoder_hidden_states: optional context. In Temporal mode it is
	                repeated ``d`` times along the batch axis when its batch
	                size does not match the regrouped hidden states.
	            attention_mask: forwarded to the processor.
	            video_length: number of frames ``f``.
	            bank: optional list of tensors concatenated before the hidden
	                states along dim 1 and used as keys/values for temporal
	                self-attention.
	            att_flag: print mean absolute activations for debugging.
	            **cross_attention_kwargs: forwarded to the processor.
	        """
	        if self.attention_mode == "Temporal":
	            d = hidden_states.shape[1]  # d means HxW
	            hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)

	            # `d` only exists in Temporal mode, so the batch repeat lives here.
	            if encoder_hidden_states is not None:
	                if not encoder_hidden_states.shape[0] == hidden_states.shape[0]:
	                    encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b d) n c", d=d)

	        if bank is not None and self.attention_mode == "Temporal" and not self.is_cross_attention:
	            # The banked motion frames act as preceding frames: prepend them
	            # so the motion module is conditioned on them.
	            modify_norm_hidden_states = torch.cat(bank + [hidden_states], dim=1)

	            if self.pos_encoder is not None:
	                modify_norm_hidden_states = self.pos_encoder(modify_norm_hidden_states)

	            # Self-attention becomes cross-attention over bank + current frames.
	            hidden_states = self.processor(
	                self,
	                hidden_states,
	                encoder_hidden_states=modify_norm_hidden_states,
	                attention_mask=attention_mask,
	                **cross_attention_kwargs,
	            )

	        else:
	            if self.pos_encoder is not None:
	                hidden_states = self.pos_encoder(hidden_states)
	            inp = hidden_states
	            hidden_states = self.processor(
	                self,
	                hidden_states,
	                encoder_hidden_states=encoder_hidden_states,
	                attention_mask=attention_mask,
	                **cross_attention_kwargs,
	            )
	            if att_flag:
	                # Self-attention calls carry no encoder states; print None
	                # for that slot instead of failing on torch.abs(None).
	                encoder_mean = (
	                    None
	                    if encoder_hidden_states is None
	                    else round(torch.abs(encoder_hidden_states).mean().item(), 6)
	                )
	                print(
	                    'ver_att',
	                    round(torch.abs(inp).mean().item(), 6),
	                    encoder_mean,
	                    round(torch.abs(hidden_states).mean().item(), 6),
	                )

	        if self.attention_mode == "Temporal":
	            hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)

	        return hidden_states