from abc import abstractmethod
import math

import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from refnet.ldm.util import (
    conv_nd,
    linear,
    avg_pool_nd,
    zero_module,
    normalization,
    timestep_embedding,
)
from refnet.util import checkpoint_wrapper


# dummy replace
def convert_module_to_f16(x):
    pass


def convert_module_to_f32(x):
    pass


## go
class AttentionPool2d(nn.Module):
    """
    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
    """

    def __init__(
        self,
        spacial_dim: int,
        embed_dim: int,
        num_heads_channels: int,
        output_dim: int = None,
    ):
        super().__init__()
        self.positional_embedding = nn.Parameter(th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5)
        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
        self.num_heads = embed_dim // num_heads_channels
        self.attention = QKVAttention(self.num_heads)

    def forward(self, x):
        b, c, *_spatial = x.shape
        x = x.reshape(b, c, -1)  # NC(HW)
        x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1)  # NC(HW+1)
        x = x + self.positional_embedding[None, :, :].to(x.dtype)  # NC(HW+1)
        x = self.qkv_proj(x)
        x = self.attention(x)
        x = self.c_proj(x)
        return x[:, :, 0]
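
# Usage sketch (not part of the original module; the shapes below are
# illustrative assumptions): AttentionPool2d collapses a 2D feature map into a
# single vector by attending from a mean-pooled "CLS-like" token over all
# spatial positions and returning that token after projection.
#
#   pool = AttentionPool2d(spacial_dim=8, embed_dim=512, num_heads_channels=64, output_dim=1024)
#   feats = th.randn(2, 512, 8, 8)   # [N, C, H, W]
#   vec = pool(feats)                # [N, 1024]: position 0 of the attended sequence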
class TimestepBlock(nn.Module):
    """
    Any module where forward() takes timestep embeddings as a second argument.
    """

    @abstractmethod
    def forward(self, x, emb):
        """
        Apply the module to `x` given `emb` timestep embeddings.
        """
class Upsample(nn.Module):
    """
    An upsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
        upsampling occurs in the inner-two dimensions.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        if use_conv:
            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding)

    def forward(self, x):
        assert x.shape[1] == self.channels
        if self.dims == 3:
            x = F.interpolate(
                x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
            )
        else:
            x = F.interpolate(x, scale_factor=2, mode="nearest")
        if self.use_conv:
            x = self.conv(x)
        return x
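
# Usage sketch (illustrative only, with assumed channel counts): nearest-neighbour
# 2x upsampling, optionally followed by a 3x3 convolution that can change the
# channel count. For dims=3, only the inner two (spatial) dimensions are scaled.
#
#   up = Upsample(channels=64, use_conv=True, dims=2, out_channels=32)
#   x = th.randn(1, 64, 16, 16)
#   y = up(x)                        # [1, 32, 32, 32]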
class TransposedUpsample(nn.Module):
    'Learned 2x upsampling without padding'

    def __init__(self, channels, out_channels=None, ks=5):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.up = nn.ConvTranspose2d(self.channels, self.out_channels, kernel_size=ks, stride=2)

    def forward(self, x):
        return self.up(x)


class Downsample(nn.Module):
    """
    A downsampling layer with an optional convolution.
    :param channels: channels in the inputs and outputs.
    :param use_conv: a bool determining if a convolution is applied.
    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
        downsampling occurs in the inner-two dimensions.
    """

    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
        super().__init__()
        self.channels = channels
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.dims = dims
        stride = 2 if dims != 3 else (1, 2, 2)
        if use_conv:
            self.op = conv_nd(
                dims, self.channels, self.out_channels, 3, stride=stride, padding=padding
            )
        else:
            assert self.channels == self.out_channels
            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)

    def forward(self, x):
        assert x.shape[1] == self.channels
        return self.op(x)
class ResBlock(TimestepBlock):
    """
    A residual block that can optionally change the number of channels.
    :param channels: the number of input channels.
    :param emb_channels: the number of timestep embedding channels.
    :param dropout: the rate of dropout.
    :param out_channels: if specified, the number of out channels.
    :param use_conv: if True and out_channels is specified, use a spatial
        convolution instead of a smaller 1x1 convolution to change the
        channels in the skip connection.
    :param dims: determines if the signal is 1D, 2D, or 3D.
    :param use_checkpoint: if True, use gradient checkpointing on this module.
    :param up: if True, use this block for upsampling.
    :param down: if True, use this block for downsampling.
    """

    def __init__(
        self,
        channels,
        emb_channels,
        dropout,
        out_channels=None,
        use_conv=False,
        use_scale_shift_norm=False,
        dims=2,
        use_checkpoint=False,
        up=False,
        down=False,
    ):
        super().__init__()
        self.channels = channels
        self.emb_channels = emb_channels
        self.dropout = dropout
        self.out_channels = out_channels or channels
        self.use_conv = use_conv
        self.checkpoint = use_checkpoint
        self.use_scale_shift_norm = use_scale_shift_norm

        self.in_layers = nn.Sequential(
            normalization(channels),
            nn.SiLU(),
            conv_nd(dims, channels, self.out_channels, 3, padding=1),
        )

        self.updown = up or down

        if up:
            self.h_upd = Upsample(channels, False, dims)
            self.x_upd = Upsample(channels, False, dims)
        elif down:
            self.h_upd = Downsample(channels, False, dims)
            self.x_upd = Downsample(channels, False, dims)
        else:
            self.h_upd = self.x_upd = nn.Identity()

        self.emb_layers = nn.Sequential(
            nn.SiLU(),
            linear(
                emb_channels,
                2 * self.out_channels if use_scale_shift_norm else self.out_channels,
            ),
        )
        self.out_layers = nn.Sequential(
            normalization(self.out_channels),
            nn.SiLU(),
            nn.Dropout(p=dropout),
            zero_module(
                conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
            ),
        )

        if self.out_channels == channels:
            self.skip_connection = nn.Identity()
        elif use_conv:
            self.skip_connection = conv_nd(
                dims, channels, self.out_channels, 3, padding=1
            )
        else:
            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)

    def forward(self, x, emb):
        if self.updown:
            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
            h = in_rest(x)
            h = self.h_upd(h)
            x = self.x_upd(x)
            h = in_conv(h)
        else:
            h = self.in_layers(x)
        emb_out = self.emb_layers(emb).type(h.dtype)
        while len(emb_out.shape) < len(h.shape):
            emb_out = emb_out[..., None]
        if self.use_scale_shift_norm:
            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
            scale, shift = th.chunk(emb_out, 2, dim=1)
            h = out_norm(h) * (1 + scale) + shift
            h = out_rest(h)
        else:
            h = h + emb_out
            h = self.out_layers(h)
        return self.skip_connection(x) + h
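
# Usage sketch (illustrative only, with assumed sizes): a ResBlock consumes the
# feature map together with a timestep embedding. With use_scale_shift_norm=True
# the embedding is projected to a (scale, shift) pair applied after the output
# GroupNorm (FiLM-style); otherwise the projected embedding is simply added to
# the hidden state.
#
#   block = ResBlock(channels=64, emb_channels=256, dropout=0.0,
#                    out_channels=128, use_scale_shift_norm=True)
#   x = th.randn(2, 64, 32, 32)
#   emb = th.randn(2, 256)
#   h = block(x, emb)                # [2, 128, 32, 32]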
class AttentionBlock(nn.Module):
    """
    An attention block that allows spatial positions to attend to each other.
    Originally ported from here, but adapted to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
    """

    def __init__(
        self,
        channels,
        num_heads=1,
        num_head_channels=-1,
        use_checkpoint=False,
        use_new_attention_order=False,
    ):
        super().__init__()
        self.channels = channels
        if num_head_channels == -1:
            self.num_heads = num_heads
        else:
            assert (
                channels % num_head_channels == 0
            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
            self.num_heads = channels // num_head_channels
        self.use_checkpoint = use_checkpoint
        self.norm = normalization(channels)
        self.qkv = conv_nd(1, channels, channels * 3, 1)
        if use_new_attention_order:
            # split qkv before split heads
            self.attention = QKVAttention(self.num_heads)
        else:
            # split heads before split qkv
            self.attention = QKVAttentionLegacy(self.num_heads)

        self.proj_out = zero_module(conv_nd(1, channels, channels, 1))

    def forward(self, x):
        b, c, *spatial = x.shape
        x = x.reshape(b, c, -1)
        qkv = self.qkv(self.norm(x))
        h = self.attention(qkv)
        h = self.proj_out(h)
        return (x + h).reshape(b, c, *spatial)
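
# Usage sketch (illustrative only, with assumed sizes): self-attention over the
# flattened spatial positions with a residual connection; the block is
# shape-preserving.
#
#   attn = AttentionBlock(channels=64, num_head_channels=32)
#   x = th.randn(2, 64, 16, 16)
#   y = attn(x)                      # [2, 64, 16, 16]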
def count_flops_attn(model, _x, y):
    """
    A counter for the `thop` package to count the operations in an
    attention operation.
    Meant to be used like:
        macs, params = thop.profile(
            model,
            inputs=(inputs, timestamps),
            custom_ops={QKVAttention: QKVAttention.count_flops},
        )
    """
    b, c, *spatial = y[0].shape
    num_spatial = int(np.prod(spatial))
    # We perform two matmuls with the same number of ops.
    # The first computes the weight matrix, the second computes
    # the combination of the value vectors.
    matmul_ops = 2 * b * (num_spatial ** 2) * c
    model.total_ops += th.DoubleTensor([matmul_ops])
class QKVAttentionLegacy(nn.Module):
    """
    A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping.
    """

    def __init__(self, n_heads):
        super().__init__()
        self.n_heads = n_heads

    def forward(self, qkv):
        """
        Apply QKV attention.
        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        """
        bs, width, length = qkv.shape
        assert width % (3 * self.n_heads) == 0
        ch = width // (3 * self.n_heads)
        q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
        scale = 1 / math.sqrt(math.sqrt(ch))
        weight = th.einsum(
            "bct,bcs->bts", q * scale, k * scale
        )  # More stable with f16 than dividing afterwards
        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
        a = th.einsum("bts,bcs->bct", weight, v)
        return a.reshape(bs, -1, length)

    @staticmethod
    def count_flops(model, _x, y):
        return count_flops_attn(model, _x, y)
class QKVAttention(nn.Module):
    """
    A module which performs QKV attention and splits in a different order.
    """

    def __init__(self, n_heads):
        super().__init__()
        self.n_heads = n_heads

    def forward(self, qkv):
        """
        Apply QKV attention.
        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
        :return: an [N x (H * C) x T] tensor after attention.
        """
        bs, width, length = qkv.shape
        assert width % (3 * self.n_heads) == 0
        ch = width // (3 * self.n_heads)
        q, k, v = qkv.chunk(3, dim=1)
        scale = 1 / math.sqrt(math.sqrt(ch))
        weight = th.einsum(
            "bct,bcs->bts",
            (q * scale).view(bs * self.n_heads, ch, length),
            (k * scale).view(bs * self.n_heads, ch, length),
        )  # More stable with f16 than dividing afterwards
        weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
        a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
        return a.reshape(bs, -1, length)

    @staticmethod
    def count_flops(model, _x, y):
        return count_flops_attn(model, _x, y)
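
# Shape walk-through (a sketch, assuming H heads of C channels over T positions):
# both attention variants take a packed qkv tensor of width 3*H*C and return an
# [N, H*C, T] tensor; they differ only in whether heads or q/k/v are split
# first, so their weights are not interchangeable when loading checkpoints.
#
#   attn = QKVAttention(n_heads=4)
#   qkv = th.randn(2, 3 * 4 * 32, 100)   # [N, 3*H*C, T]
#   out = attn(qkv)                      # [2, 128, 100]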
class Timestep(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, t):
        return timestep_embedding(t, self.dim)
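
# Usage sketch (illustrative only): Timestep wraps the imported
# timestep_embedding helper as an nn.Module, so the sinusoidal embedding can be
# composed with other layers. Assuming the helper maps an integer tensor of
# timesteps to a [N, dim] embedding:
#
#   te = Timestep(dim=256)
#   t = th.tensor([0, 10, 999])
#   e = te(t)                        # [3, 256] sinusoidal embedding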