Spaces:

Bai-YT
/

ConsistencyTTA

Runtime error

ConsistencyTTA / diffusers /models /transformer_2d.py

Bai-YT

Gradio App for ConsistencyTTA V1

66982e9 over 1 year ago

16.6 kB

	# Copyright 2023 The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from dataclasses import dataclass
	from typing import Any, Dict, Optional

	import torch
	import torch.nn.functional as F
	from torch import nn

	from ..utils.configuration_utils import ConfigMixin, register_to_config
	from ..utils.outputs import BaseOutput
	from ..utils.deprecation_utils import deprecate
	from ..models.embeddings import ImagePositionalEmbeddings
	from .attention import BasicTransformerBlock
	from .embeddings import PatchEmbed
	from .modeling_utils import ModelMixin


	@dataclass
	class Transformer2DModelOutput(BaseOutput):
	    """
	    Output container returned by [`Transformer2DModel`].

	    Args:
	        sample (`torch.FloatTensor`):
	            Hidden states conditioned on the `encoder_hidden_states` input.
	            For continuous input the shape is `(batch_size, num_channels, height, width)`.
	            For discrete input the shape is `(batch_size, num_vector_embeds - 1, num_latent_pixels)`
	            and the values are probability distributions for the unnoised latent pixels.
	    """

	    sample: torch.FloatTensor


	class Transformer2DModel(ModelMixin, ConfigMixin):
	    """
	    Transformer model for image-like data. Takes either discrete (classes of vector embeddings), continuous (actual
	    embeddings) or patched inputs.

	    When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard
	    transformer action. Finally, reshape to image.

	    When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional
	    embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict
	    classes of unnoised image.

	    When input is patched: First, the input is embedded with `PatchEmbed`. Then apply standard transformer action.
	    Finally, modulate, project and "unpatchify" the tokens back into an image-shaped tensor.

	    Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised
	    image do not contain a prediction for the masked pixel as the unnoised image cannot be masked.

	    Exactly one input mode must be selected through the configuration:
	    `in_channels` alone (continuous), `num_vector_embeds` (discrete), or `in_channels` together with `patch_size`
	    (patched). Any other combination raises a `ValueError`.

	    Parameters:
	        num_attention_heads (`int`, optional, defaults to 16): The number of heads to use for multi-head attention.
	        attention_head_dim (`int`, optional, defaults to 88): The number of channels in each head.
	        in_channels (`int`, optional):
	            Pass if the input is continuous or patched. The number of channels in the input (and, for continuous
	            input, in the output).
	        out_channels (`int`, optional):
	            The number of output channels. Defaults to `in_channels` when not given. Currently only used by the
	            output projection of the patched mode; the continuous projections always use `in_channels`.
	        num_layers (`int`, optional, defaults to 1): The number of layers of Transformer blocks to use.
	        dropout (`float`, optional, defaults to 0.0): The dropout probability to use.
	        norm_num_groups (`int`, optional, defaults to 32):
	            The number of groups of the `GroupNorm` applied to continuous input.
	        cross_attention_dim (`int`, optional): The number of encoder_hidden_states dimensions to use.
	        attention_bias (`bool`, optional):
	            Configure if the TransformerBlocks' attention should contain a bias parameter.
	        sample_size (`int`, optional): Pass if the input is discrete or patched. The width of the latent images.
	            Note that this is fixed at training time as it is used for learning a number of position embeddings.
	            See `ImagePositionalEmbeddings`.
	        num_vector_embeds (`int`, optional):
	            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
	            Includes the class for the masked latent pixel.
	        patch_size (`int`, optional):
	            Pass (together with `in_channels`) if the input is patched. The patch size handed to `PatchEmbed` and
	            used to unpatchify the output.
	        activation_fn (`str`, optional, defaults to `"geglu"`): Activation function to be used in feed-forward.
	        num_embeds_ada_norm ( `int`, optional): Pass if at least one of the norm_layers is `AdaLayerNorm`.
	            The number of diffusion steps used during training. Note that this is fixed at training time as it is
	            used to learn a number of embeddings that are added to the hidden states. During inference, you can
	            denoise for up to but not more than `num_embeds_ada_norm` steps.
	        use_linear_projection (`bool`, optional, defaults to `False`):
	            For continuous input, use `nn.Linear` instead of a 1x1 `nn.Conv2d` for the input/output projections.
	        only_cross_attention (`bool`, optional, defaults to `False`):
	            Forwarded unchanged to every `BasicTransformerBlock`.
	        upcast_attention (`bool`, optional, defaults to `False`):
	            Forwarded unchanged to every `BasicTransformerBlock`.
	        norm_type (`str`, optional, defaults to `"layer_norm"`):
	            Normalization type forwarded to every `BasicTransformerBlock`. If left at `"layer_norm"` while
	            `num_embeds_ada_norm` is set, a deprecation warning is issued and `"ada_norm"` is used instead.
	        norm_elementwise_affine (`bool`, optional, defaults to `True`):
	            Forwarded unchanged to every `BasicTransformerBlock`.
	    """

	    @register_to_config
	    def __init__(
	        self,
	        num_attention_heads: int = 16,
	        attention_head_dim: int = 88,
	        in_channels: Optional[int] = None,
	        out_channels: Optional[int] = None,
	        num_layers: int = 1,
	        dropout: float = 0.0,
	        norm_num_groups: int = 32,
	        cross_attention_dim: Optional[int] = None,
	        attention_bias: bool = False,
	        sample_size: Optional[int] = None,
	        num_vector_embeds: Optional[int] = None,
	        patch_size: Optional[int] = None,
	        activation_fn: str = "geglu",
	        num_embeds_ada_norm: Optional[int] = None,
	        use_linear_projection: bool = False,
	        only_cross_attention: bool = False,
	        upcast_attention: bool = False,
	        norm_type: str = "layer_norm",
	        norm_elementwise_affine: bool = True,
	    ):
	        super().__init__()
	        self.use_linear_projection = use_linear_projection
	        self.num_attention_heads = num_attention_heads
	        self.attention_head_dim = attention_head_dim
	        inner_dim = num_attention_heads * attention_head_dim

	        # 1. Transformer2DModel can process both standard continuous images of
	        # shape `(batch_size, num_channels, width, height)` as well as
	        # quantized image embeddings of shape `(batch_size, num_image_vectors)`
	        # Define whether input is continuous or discrete depending on configuration
	        self.is_input_continuous = (in_channels is not None) and (patch_size is None)
	        self.is_input_vectorized = num_vector_embeds is not None
	        self.is_input_patches = in_channels is not None and patch_size is not None

	        if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
	            deprecation_message = (
	                f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
	                " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config."
	                " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
	                " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
	                " would be very nice if you could open a Pull request for the `transformer/config.json` file"
	            )
	            deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
	            # Outdated configs are silently upgraded so that old checkpoints keep working.
	            norm_type = "ada_norm"

	        # Exactly one input mode may be active; the messages name the actual constructor arguments.
	        if self.is_input_continuous and self.is_input_vectorized:
	            raise ValueError(
	                f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
	                " sure that either `in_channels` or `num_vector_embeds` is None."
	            )
	        elif self.is_input_vectorized and self.is_input_patches:
	            raise ValueError(
	                f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
	                " sure that either `num_vector_embeds` or `patch_size` is None."
	            )
	        elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
	            raise ValueError(
	                f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
	                f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `patch_size` is not None."
	            )

	        # 2. Define input layers
	        if self.is_input_continuous:
	            self.in_channels = in_channels

	            self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
	            if use_linear_projection:
	                self.proj_in = nn.Linear(in_channels, inner_dim)
	            else:
	                self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
	        elif self.is_input_vectorized:
	            assert sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
	            assert num_vector_embeds is not None, "Transformer2DModel over discrete input must provide num_embed"

	            self.height = sample_size
	            self.width = sample_size
	            self.num_vector_embeds = num_vector_embeds
	            self.num_latent_pixels = self.height * self.width

	            self.latent_image_embedding = ImagePositionalEmbeddings(
	                num_embed=num_vector_embeds, embed_dim=inner_dim, height=self.height, width=self.width
	            )
	        elif self.is_input_patches:
	            assert sample_size is not None, "Transformer2DModel over patched input must provide sample_size"

	            self.height = sample_size
	            self.width = sample_size

	            self.patch_size = patch_size
	            self.pos_embed = PatchEmbed(
	                height=sample_size,
	                width=sample_size,
	                patch_size=patch_size,
	                in_channels=in_channels,
	                embed_dim=inner_dim,
	            )

	        # 3. Define transformers blocks
	        self.transformer_blocks = nn.ModuleList(
	            [
	                BasicTransformerBlock(
	                    inner_dim,
	                    num_attention_heads,
	                    attention_head_dim,
	                    dropout=dropout,
	                    cross_attention_dim=cross_attention_dim,
	                    activation_fn=activation_fn,
	                    num_embeds_ada_norm=num_embeds_ada_norm,
	                    attention_bias=attention_bias,
	                    only_cross_attention=only_cross_attention,
	                    upcast_attention=upcast_attention,
	                    norm_type=norm_type,
	                    norm_elementwise_affine=norm_elementwise_affine,
	                )
	                for _ in range(num_layers)
	            ]
	        )

	        # 4. Define output layers
	        self.out_channels = in_channels if out_channels is None else out_channels
	        if self.is_input_continuous:
	            # TODO: should use out_channels for continuous projections
	            # (the projection maps back to `in_channels` so the residual can be added in `forward`)
	            if use_linear_projection:
	                self.proj_out = nn.Linear(inner_dim, in_channels)
	            else:
	                self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
	        elif self.is_input_vectorized:
	            self.norm_out = nn.LayerNorm(inner_dim)
	            # One logit per class except the masked-pixel class.
	            self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
	        elif self.is_input_patches:
	            self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=False, eps=1e-6)
	            # proj_out_1 produces the (shift, scale) pair, proj_out_2 the per-patch pixel values.
	            self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
	            self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        encoder_hidden_states: Optional[torch.Tensor] = None,
	        timestep: Optional[torch.LongTensor] = None,
	        class_labels: Optional[torch.LongTensor] = None,
	        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
	        attention_mask: Optional[torch.Tensor] = None,
	        encoder_attention_mask: Optional[torch.Tensor] = None,
	        return_dict: bool = True,
	    ):
	        """
	        Run the input projection, the transformer blocks and the output projection of the configured input mode.

	        Args:
	            hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`.
	                When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input
	                hidden_states
	            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, optional):
	                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
	                self-attention.
	            timestep ( `torch.LongTensor`, optional):
	                Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
	            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, optional):
	                Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class
	                labels conditioning.
	            cross_attention_kwargs (`Dict[str, Any]`, optional):
	                Passed through unchanged to every transformer block.
	            attention_mask ( `torch.Tensor` of shape (batch size, num latent pixels), optional ).
	                Bias to add to attention scores.
	            encoder_attention_mask ( `torch.Tensor` of shape (batch size, num encoder tokens), optional ).
	                Bias to add to cross-attention scores.
	            return_dict (`bool`, optional, defaults to `True`):
	                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
	                tuple.

	        Returns:
	            [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`:
	            [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`.
	            When returning a tuple, the first element is the sample tensor.
	        """
	        # 1. Input
	        if self.is_input_continuous:
	            batch, _, height, width = hidden_states.shape
	            # Kept for the skip connection added after the output projection.
	            residual = hidden_states

	            hidden_states = self.norm(hidden_states)
	            if not self.use_linear_projection:
	                hidden_states = self.proj_in(hidden_states)
	                # Conv path: `inner_dim` is the channel count AFTER the projection.
	                inner_dim = hidden_states.shape[1]
	                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
	            else:
	                # Linear path: `inner_dim` is the channel count BEFORE the projection (the input channels).
	                # The same value is needed again below, because `proj_out` maps back to that width
	                # before the tokens are reshaped into an image.
	                inner_dim = hidden_states.shape[1]
	                hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
	                hidden_states = self.proj_in(hidden_states)

	        elif self.is_input_vectorized:
	            hidden_states = self.latent_image_embedding(hidden_states)

	        elif self.is_input_patches:
	            hidden_states = self.pos_embed(hidden_states)

	        # 2. Blocks
	        for block in self.transformer_blocks:
	            hidden_states = block(
	                hidden_states,
	                attention_mask=attention_mask,
	                encoder_hidden_states=encoder_hidden_states,
	                encoder_attention_mask=encoder_attention_mask,
	                timestep=timestep,
	                cross_attention_kwargs=cross_attention_kwargs,
	                class_labels=class_labels,
	            )

	        # 3. Output
	        if self.is_input_continuous:
	            if not self.use_linear_projection:
	                # Tokens -> image layout first, then the 1x1 conv projection.
	                hidden_states = hidden_states.reshape(
	                    batch, height, width, inner_dim
	                ).permute(0, 3, 1, 2).contiguous()
	                hidden_states = self.proj_out(hidden_states)
	            else:
	                # Linear projection first (back to the input channel count), then tokens -> image layout.
	                hidden_states = self.proj_out(hidden_states)
	                hidden_states = hidden_states.reshape(
	                    batch, height, width, inner_dim
	                ).permute(0, 3, 1, 2).contiguous()
	            output = hidden_states + residual

	        elif self.is_input_vectorized:
	            hidden_states = self.norm_out(hidden_states)
	            logits = self.out(hidden_states)
	            # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
	            logits = logits.permute(0, 2, 1)

	            # log(p(x_0)); the softmax is computed in float64 and cast back to float32.
	            output = F.log_softmax(logits.double(), dim=1).float()

	        elif self.is_input_patches:
	            # TODO: cleanup!
	            # Re-uses the conditioning embedding that lives on the first block's `norm1`.
	            # NOTE(review): presumably this requires a norm layer exposing `.emb(timestep, class_labels, ...)`
	            # (e.g. an ada-norm-zero style layer) - confirm against `BasicTransformerBlock`.
	            conditioning = self.transformer_blocks[0].norm1.emb(
	                timestep, class_labels, hidden_dtype=hidden_states.dtype
	            )
	            shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
	            hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
	            hidden_states = self.proj_out_2(hidden_states)

	            # unpatchify (the token sequence is treated as a square grid of patches)
	            height = width = int(hidden_states.shape[1] ** 0.5)
	            hidden_states = hidden_states.reshape(
	                shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
	            )
	            hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
	            output = hidden_states.reshape(
	                shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
	            )

	        if not return_dict:
	            return (output,)

	        return Transformer2DModelOutput(sample=output)